|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.09224203490028636, |
|
"eval_steps": 1000, |
|
"global_step": 630000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 490.5409240722656, |
|
"learning_rate": 1.4641588553791733e-08, |
|
"loss": 11.6484, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 8.838762283325195, |
|
"eval_runtime": 75.1115, |
|
"eval_samples_per_second": 10.984, |
|
"eval_steps_per_second": 5.498, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 313.693603515625, |
|
"learning_rate": 2.9283177107583466e-08, |
|
"loss": 6.1041, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 4.824796199798584, |
|
"eval_runtime": 75.0269, |
|
"eval_samples_per_second": 10.996, |
|
"eval_steps_per_second": 5.505, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 273.5600280761719, |
|
"learning_rate": 4.39247656613752e-08, |
|
"loss": 4.6414, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 4.450352191925049, |
|
"eval_runtime": 95.6828, |
|
"eval_samples_per_second": 8.622, |
|
"eval_steps_per_second": 4.316, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 250.93833923339844, |
|
"learning_rate": 5.856635421516693e-08, |
|
"loss": 4.373, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 4.234192848205566, |
|
"eval_runtime": 75.0756, |
|
"eval_samples_per_second": 10.989, |
|
"eval_steps_per_second": 5.501, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 204.94479370117188, |
|
"learning_rate": 7.320794276895867e-08, |
|
"loss": 4.1452, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.9963951110839844, |
|
"eval_runtime": 95.7015, |
|
"eval_samples_per_second": 8.621, |
|
"eval_steps_per_second": 4.316, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 167.37818908691406, |
|
"learning_rate": 8.78495313227504e-08, |
|
"loss": 4.0086, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.9861972332000732, |
|
"eval_runtime": 75.3345, |
|
"eval_samples_per_second": 10.951, |
|
"eval_steps_per_second": 5.482, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 166.5489959716797, |
|
"learning_rate": 1.0249111987654213e-07, |
|
"loss": 3.9908, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.9468181133270264, |
|
"eval_runtime": 75.5485, |
|
"eval_samples_per_second": 10.92, |
|
"eval_steps_per_second": 5.467, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 182.9307861328125, |
|
"learning_rate": 1.1713270843033386e-07, |
|
"loss": 3.9448, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.8646774291992188, |
|
"eval_runtime": 96.5287, |
|
"eval_samples_per_second": 8.547, |
|
"eval_steps_per_second": 4.279, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 139.25579833984375, |
|
"learning_rate": 1.3177429698412561e-07, |
|
"loss": 3.9426, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.868206739425659, |
|
"eval_runtime": 75.2358, |
|
"eval_samples_per_second": 10.966, |
|
"eval_steps_per_second": 5.489, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 134.8657684326172, |
|
"learning_rate": 1.4641588553791734e-07, |
|
"loss": 3.8819, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.861171245574951, |
|
"eval_runtime": 96.0503, |
|
"eval_samples_per_second": 8.589, |
|
"eval_steps_per_second": 4.3, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 115.51847076416016, |
|
"learning_rate": 1.6105747409170906e-07, |
|
"loss": 3.8613, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.767599105834961, |
|
"eval_runtime": 75.225, |
|
"eval_samples_per_second": 10.967, |
|
"eval_steps_per_second": 5.49, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 109.70362854003906, |
|
"learning_rate": 1.756990626455008e-07, |
|
"loss": 3.8343, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.7472667694091797, |
|
"eval_runtime": 75.0741, |
|
"eval_samples_per_second": 10.989, |
|
"eval_steps_per_second": 5.501, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 103.31382751464844, |
|
"learning_rate": 1.9034065119929256e-07, |
|
"loss": 3.7918, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.759472131729126, |
|
"eval_runtime": 96.0111, |
|
"eval_samples_per_second": 8.593, |
|
"eval_steps_per_second": 4.302, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 116.83769226074219, |
|
"learning_rate": 2.0498223975308426e-07, |
|
"loss": 3.7645, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.653149366378784, |
|
"eval_runtime": 75.074, |
|
"eval_samples_per_second": 10.989, |
|
"eval_steps_per_second": 5.501, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 102.68473052978516, |
|
"learning_rate": 2.19623828306876e-07, |
|
"loss": 3.7331, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.789041519165039, |
|
"eval_runtime": 96.0471, |
|
"eval_samples_per_second": 8.59, |
|
"eval_steps_per_second": 4.3, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 116.87753295898438, |
|
"learning_rate": 2.3426541686066773e-07, |
|
"loss": 3.7257, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.6204185485839844, |
|
"eval_runtime": 75.0683, |
|
"eval_samples_per_second": 10.99, |
|
"eval_steps_per_second": 5.502, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 95.3843002319336, |
|
"learning_rate": 2.489070054144595e-07, |
|
"loss": 3.6836, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.65740966796875, |
|
"eval_runtime": 95.9655, |
|
"eval_samples_per_second": 8.597, |
|
"eval_steps_per_second": 4.304, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 113.17263793945312, |
|
"learning_rate": 2.6354859396825123e-07, |
|
"loss": 3.6574, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.6075878143310547, |
|
"eval_runtime": 75.2673, |
|
"eval_samples_per_second": 10.961, |
|
"eval_steps_per_second": 5.487, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 115.56730651855469, |
|
"learning_rate": 2.781901825220429e-07, |
|
"loss": 3.6319, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.502887725830078, |
|
"eval_runtime": 75.054, |
|
"eval_samples_per_second": 10.992, |
|
"eval_steps_per_second": 5.503, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 91.19989776611328, |
|
"learning_rate": 2.928317710758347e-07, |
|
"loss": 3.5851, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.496408224105835, |
|
"eval_runtime": 96.1511, |
|
"eval_samples_per_second": 8.58, |
|
"eval_steps_per_second": 4.295, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 98.3852310180664, |
|
"learning_rate": 3.0747335962962637e-07, |
|
"loss": 3.5755, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.501962900161743, |
|
"eval_runtime": 75.0818, |
|
"eval_samples_per_second": 10.988, |
|
"eval_steps_per_second": 5.501, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 87.96759796142578, |
|
"learning_rate": 3.221149481834181e-07, |
|
"loss": 3.5306, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.4395623207092285, |
|
"eval_runtime": 95.7612, |
|
"eval_samples_per_second": 8.615, |
|
"eval_steps_per_second": 4.313, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 117.33857727050781, |
|
"learning_rate": 3.367565367372098e-07, |
|
"loss": 3.5465, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.4450230598449707, |
|
"eval_runtime": 75.1763, |
|
"eval_samples_per_second": 10.974, |
|
"eval_steps_per_second": 5.494, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 84.39014434814453, |
|
"learning_rate": 3.513981252910016e-07, |
|
"loss": 3.5045, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.398813247680664, |
|
"eval_runtime": 75.038, |
|
"eval_samples_per_second": 10.994, |
|
"eval_steps_per_second": 5.504, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 91.13440704345703, |
|
"learning_rate": 3.660397138447933e-07, |
|
"loss": 3.4932, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.4234652519226074, |
|
"eval_runtime": 96.7065, |
|
"eval_samples_per_second": 8.531, |
|
"eval_steps_per_second": 4.271, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 79.27528381347656, |
|
"learning_rate": 3.806813023985851e-07, |
|
"loss": 3.4615, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.360214948654175, |
|
"eval_runtime": 75.1229, |
|
"eval_samples_per_second": 10.982, |
|
"eval_steps_per_second": 5.498, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 106.40711975097656, |
|
"learning_rate": 3.953228909523768e-07, |
|
"loss": 3.4459, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.4171066284179688, |
|
"eval_runtime": 95.7756, |
|
"eval_samples_per_second": 8.614, |
|
"eval_steps_per_second": 4.312, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 101.22091674804688, |
|
"learning_rate": 4.099644795061685e-07, |
|
"loss": 3.4484, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.440232515335083, |
|
"eval_runtime": 75.1872, |
|
"eval_samples_per_second": 10.973, |
|
"eval_steps_per_second": 5.493, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 85.2155990600586, |
|
"learning_rate": 4.2460606805996026e-07, |
|
"loss": 3.4244, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.295353651046753, |
|
"eval_runtime": 75.0816, |
|
"eval_samples_per_second": 10.988, |
|
"eval_steps_per_second": 5.501, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 123.67411041259766, |
|
"learning_rate": 4.39247656613752e-07, |
|
"loss": 3.4244, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.3576502799987793, |
|
"eval_runtime": 95.7774, |
|
"eval_samples_per_second": 8.614, |
|
"eval_steps_per_second": 4.312, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 265.1561584472656, |
|
"learning_rate": 4.5388924516754376e-07, |
|
"loss": 3.3943, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.247262954711914, |
|
"eval_runtime": 75.0879, |
|
"eval_samples_per_second": 10.987, |
|
"eval_steps_per_second": 5.5, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 78.62483978271484, |
|
"learning_rate": 4.6853083372133546e-07, |
|
"loss": 3.3792, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.255089044570923, |
|
"eval_runtime": 95.9793, |
|
"eval_samples_per_second": 8.596, |
|
"eval_steps_per_second": 4.303, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 77.11738586425781, |
|
"learning_rate": 4.831724222751272e-07, |
|
"loss": 3.3503, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.1970388889312744, |
|
"eval_runtime": 75.0707, |
|
"eval_samples_per_second": 10.99, |
|
"eval_steps_per_second": 5.501, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 105.60401153564453, |
|
"learning_rate": 4.97814010828919e-07, |
|
"loss": 3.3165, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.2812023162841797, |
|
"eval_runtime": 95.8908, |
|
"eval_samples_per_second": 8.604, |
|
"eval_steps_per_second": 4.307, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 73.19610595703125, |
|
"learning_rate": 5.124555993827106e-07, |
|
"loss": 3.3296, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.2438435554504395, |
|
"eval_runtime": 75.2106, |
|
"eval_samples_per_second": 10.969, |
|
"eval_steps_per_second": 5.491, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 92.63853454589844, |
|
"learning_rate": 5.270971879365025e-07, |
|
"loss": 3.3123, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.1740996837615967, |
|
"eval_runtime": 75.0587, |
|
"eval_samples_per_second": 10.991, |
|
"eval_steps_per_second": 5.502, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 76.13302612304688, |
|
"learning_rate": 5.417387764902941e-07, |
|
"loss": 3.3046, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.2372987270355225, |
|
"eval_runtime": 95.751, |
|
"eval_samples_per_second": 8.616, |
|
"eval_steps_per_second": 4.313, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 92.608642578125, |
|
"learning_rate": 5.563803650440859e-07, |
|
"loss": 3.2746, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.2454912662506104, |
|
"eval_runtime": 75.0278, |
|
"eval_samples_per_second": 10.996, |
|
"eval_steps_per_second": 5.505, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 102.28934478759766, |
|
"learning_rate": 5.710219535978776e-07, |
|
"loss": 3.2682, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.196023941040039, |
|
"eval_runtime": 95.7842, |
|
"eval_samples_per_second": 8.613, |
|
"eval_steps_per_second": 4.312, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 108.18222045898438, |
|
"learning_rate": 5.856635421516694e-07, |
|
"loss": 3.2708, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.215292453765869, |
|
"eval_runtime": 75.1865, |
|
"eval_samples_per_second": 10.973, |
|
"eval_steps_per_second": 5.493, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 96.638427734375, |
|
"learning_rate": 6.003051307054611e-07, |
|
"loss": 3.2426, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.331897497177124, |
|
"eval_runtime": 75.005, |
|
"eval_samples_per_second": 10.999, |
|
"eval_steps_per_second": 5.506, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 103.09781646728516, |
|
"learning_rate": 6.149467192592527e-07, |
|
"loss": 3.2235, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.207122325897217, |
|
"eval_runtime": 95.9639, |
|
"eval_samples_per_second": 8.597, |
|
"eval_steps_per_second": 4.304, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 119.6423110961914, |
|
"learning_rate": 6.295883078130445e-07, |
|
"loss": 3.2281, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.1870551109313965, |
|
"eval_runtime": 74.9929, |
|
"eval_samples_per_second": 11.001, |
|
"eval_steps_per_second": 5.507, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 113.07027435302734, |
|
"learning_rate": 6.442298963668362e-07, |
|
"loss": 3.219, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.046499490737915, |
|
"eval_runtime": 95.8481, |
|
"eval_samples_per_second": 8.607, |
|
"eval_steps_per_second": 4.309, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 78.94286346435547, |
|
"learning_rate": 6.588714849206281e-07, |
|
"loss": 3.2089, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.217085599899292, |
|
"eval_runtime": 75.0114, |
|
"eval_samples_per_second": 10.998, |
|
"eval_steps_per_second": 5.506, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 97.2042007446289, |
|
"learning_rate": 6.735130734744196e-07, |
|
"loss": 3.1798, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.0830495357513428, |
|
"eval_runtime": 75.0234, |
|
"eval_samples_per_second": 10.997, |
|
"eval_steps_per_second": 5.505, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 119.12738037109375, |
|
"learning_rate": 6.881546620282115e-07, |
|
"loss": 3.1809, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.1285533905029297, |
|
"eval_runtime": 95.8319, |
|
"eval_samples_per_second": 8.609, |
|
"eval_steps_per_second": 4.31, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 65.7905502319336, |
|
"learning_rate": 7.027962505820032e-07, |
|
"loss": 3.1605, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.175480604171753, |
|
"eval_runtime": 75.0379, |
|
"eval_samples_per_second": 10.994, |
|
"eval_steps_per_second": 5.504, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 119.85379791259766, |
|
"learning_rate": 7.174378391357949e-07, |
|
"loss": 3.1485, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.1343507766723633, |
|
"eval_runtime": 95.7086, |
|
"eval_samples_per_second": 8.62, |
|
"eval_steps_per_second": 4.315, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 72.57598114013672, |
|
"learning_rate": 7.320794276895866e-07, |
|
"loss": 3.144, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.1388068199157715, |
|
"eval_runtime": 75.0332, |
|
"eval_samples_per_second": 10.995, |
|
"eval_steps_per_second": 5.504, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 101.29464721679688, |
|
"learning_rate": 7.467210162433784e-07, |
|
"loss": 3.1287, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.132037878036499, |
|
"eval_runtime": 95.9227, |
|
"eval_samples_per_second": 8.601, |
|
"eval_steps_per_second": 4.306, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 110.46088409423828, |
|
"learning_rate": 7.613626047971702e-07, |
|
"loss": 3.1234, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.025822162628174, |
|
"eval_runtime": 75.1384, |
|
"eval_samples_per_second": 10.98, |
|
"eval_steps_per_second": 5.497, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 88.59445190429688, |
|
"learning_rate": 7.760041933509619e-07, |
|
"loss": 3.1065, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.075002908706665, |
|
"eval_runtime": 75.005, |
|
"eval_samples_per_second": 10.999, |
|
"eval_steps_per_second": 5.506, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 71.0448989868164, |
|
"learning_rate": 7.906457819047536e-07, |
|
"loss": 3.1247, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.009894371032715, |
|
"eval_runtime": 103.8658, |
|
"eval_samples_per_second": 7.943, |
|
"eval_steps_per_second": 3.976, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 71.39112854003906, |
|
"learning_rate": 8.052873704585454e-07, |
|
"loss": 3.1066, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.0463435649871826, |
|
"eval_runtime": 75.0114, |
|
"eval_samples_per_second": 10.998, |
|
"eval_steps_per_second": 5.506, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 65.42823028564453, |
|
"learning_rate": 8.19928959012337e-07, |
|
"loss": 3.0938, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.003645181655884, |
|
"eval_runtime": 95.848, |
|
"eval_samples_per_second": 8.607, |
|
"eval_steps_per_second": 4.309, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 49.959877014160156, |
|
"learning_rate": 8.345705475661288e-07, |
|
"loss": 3.0987, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.0445501804351807, |
|
"eval_runtime": 75.3866, |
|
"eval_samples_per_second": 10.944, |
|
"eval_steps_per_second": 5.478, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 73.73646545410156, |
|
"learning_rate": 8.492121361199205e-07, |
|
"loss": 3.0884, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.1307623386383057, |
|
"eval_runtime": 75.0205, |
|
"eval_samples_per_second": 10.997, |
|
"eval_steps_per_second": 5.505, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 70.4371566772461, |
|
"learning_rate": 8.638537246737122e-07, |
|
"loss": 3.0493, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.093940496444702, |
|
"eval_runtime": 96.0292, |
|
"eval_samples_per_second": 8.591, |
|
"eval_steps_per_second": 4.301, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 105.82594299316406, |
|
"learning_rate": 8.78495313227504e-07, |
|
"loss": 3.039, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.0275042057037354, |
|
"eval_runtime": 75.0337, |
|
"eval_samples_per_second": 10.995, |
|
"eval_steps_per_second": 5.504, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 102.49614715576172, |
|
"learning_rate": 8.931369017812958e-07, |
|
"loss": 3.062, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.973200798034668, |
|
"eval_runtime": 95.9411, |
|
"eval_samples_per_second": 8.599, |
|
"eval_steps_per_second": 4.305, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 75.26753234863281, |
|
"learning_rate": 9.077784903350875e-07, |
|
"loss": 3.039, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.0531740188598633, |
|
"eval_runtime": 75.0299, |
|
"eval_samples_per_second": 10.996, |
|
"eval_steps_per_second": 5.504, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 80.71424102783203, |
|
"learning_rate": 9.224200788888792e-07, |
|
"loss": 3.0477, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.993370294570923, |
|
"eval_runtime": 95.8376, |
|
"eval_samples_per_second": 8.608, |
|
"eval_steps_per_second": 4.309, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 166.16552734375, |
|
"learning_rate": 9.370616674426709e-07, |
|
"loss": 3.0393, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.027719020843506, |
|
"eval_runtime": 75.1115, |
|
"eval_samples_per_second": 10.984, |
|
"eval_steps_per_second": 5.498, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 86.70442199707031, |
|
"learning_rate": 9.517032559964627e-07, |
|
"loss": 3.0254, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9751687049865723, |
|
"eval_runtime": 75.0016, |
|
"eval_samples_per_second": 11.0, |
|
"eval_steps_per_second": 5.507, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 86.4237289428711, |
|
"learning_rate": 9.663448445502544e-07, |
|
"loss": 3.0057, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.0547940731048584, |
|
"eval_runtime": 95.8012, |
|
"eval_samples_per_second": 8.612, |
|
"eval_steps_per_second": 4.311, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 69.26152801513672, |
|
"learning_rate": 9.809864331040462e-07, |
|
"loss": 3.0044, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9488420486450195, |
|
"eval_runtime": 75.0827, |
|
"eval_samples_per_second": 10.988, |
|
"eval_steps_per_second": 5.501, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 85.0170669555664, |
|
"learning_rate": 9.95628021657838e-07, |
|
"loss": 2.9895, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9108245372772217, |
|
"eval_runtime": 95.6907, |
|
"eval_samples_per_second": 8.622, |
|
"eval_steps_per_second": 4.316, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 105.4040756225586, |
|
"learning_rate": 1.0102696102116297e-06, |
|
"loss": 2.9791, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9387757778167725, |
|
"eval_runtime": 75.1792, |
|
"eval_samples_per_second": 10.974, |
|
"eval_steps_per_second": 5.494, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 63.04122543334961, |
|
"learning_rate": 1.0249111987654212e-06, |
|
"loss": 2.9898, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.011551856994629, |
|
"eval_runtime": 75.0439, |
|
"eval_samples_per_second": 10.994, |
|
"eval_steps_per_second": 5.503, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 125.3316650390625, |
|
"learning_rate": 1.0395527873192132e-06, |
|
"loss": 2.9872, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9619126319885254, |
|
"eval_runtime": 95.8269, |
|
"eval_samples_per_second": 8.609, |
|
"eval_steps_per_second": 4.31, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 93.01313781738281, |
|
"learning_rate": 1.054194375873005e-06, |
|
"loss": 2.9775, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.0116209983825684, |
|
"eval_runtime": 75.0043, |
|
"eval_samples_per_second": 10.999, |
|
"eval_steps_per_second": 5.506, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 70.66936492919922, |
|
"learning_rate": 1.0688359644267965e-06, |
|
"loss": 2.9808, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.014275550842285, |
|
"eval_runtime": 95.804, |
|
"eval_samples_per_second": 8.611, |
|
"eval_steps_per_second": 4.311, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 70.04747009277344, |
|
"learning_rate": 1.0834775529805882e-06, |
|
"loss": 2.9661, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.061206102371216, |
|
"eval_runtime": 75.0519, |
|
"eval_samples_per_second": 10.992, |
|
"eval_steps_per_second": 5.503, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 66.63554382324219, |
|
"learning_rate": 1.09811914153438e-06, |
|
"loss": 2.9643, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9404237270355225, |
|
"eval_runtime": 75.0388, |
|
"eval_samples_per_second": 10.994, |
|
"eval_steps_per_second": 5.504, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 65.95869445800781, |
|
"learning_rate": 1.1127607300881717e-06, |
|
"loss": 2.9639, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9306540489196777, |
|
"eval_runtime": 95.942, |
|
"eval_samples_per_second": 8.599, |
|
"eval_steps_per_second": 4.305, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 96.67823028564453, |
|
"learning_rate": 1.1274023186419635e-06, |
|
"loss": 2.9388, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9036476612091064, |
|
"eval_runtime": 75.2879, |
|
"eval_samples_per_second": 10.958, |
|
"eval_steps_per_second": 5.486, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 61.048980712890625, |
|
"learning_rate": 1.1420439071957552e-06, |
|
"loss": 2.9723, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9100253582000732, |
|
"eval_runtime": 95.6634, |
|
"eval_samples_per_second": 8.624, |
|
"eval_steps_per_second": 4.317, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 84.32980346679688, |
|
"learning_rate": 1.156685495749547e-06, |
|
"loss": 2.9403, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.951533555984497, |
|
"eval_runtime": 75.2112, |
|
"eval_samples_per_second": 10.969, |
|
"eval_steps_per_second": 5.491, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 78.58013916015625, |
|
"learning_rate": 1.1713270843033387e-06, |
|
"loss": 2.9315, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9593875408172607, |
|
"eval_runtime": 95.9075, |
|
"eval_samples_per_second": 8.602, |
|
"eval_steps_per_second": 4.306, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 111.2406005859375, |
|
"learning_rate": 1.1859686728571305e-06, |
|
"loss": 2.943, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9718501567840576, |
|
"eval_runtime": 75.2332, |
|
"eval_samples_per_second": 10.966, |
|
"eval_steps_per_second": 5.49, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 97.3222427368164, |
|
"learning_rate": 1.2006102614109222e-06, |
|
"loss": 2.9236, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9419431686401367, |
|
"eval_runtime": 74.9884, |
|
"eval_samples_per_second": 11.002, |
|
"eval_steps_per_second": 5.508, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 272.546630859375, |
|
"learning_rate": 1.2152518499647137e-06, |
|
"loss": 2.9088, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.906754970550537, |
|
"eval_runtime": 96.1313, |
|
"eval_samples_per_second": 8.582, |
|
"eval_steps_per_second": 4.296, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 86.16685485839844, |
|
"learning_rate": 1.2298934385185055e-06, |
|
"loss": 2.9249, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.927184581756592, |
|
"eval_runtime": 74.9953, |
|
"eval_samples_per_second": 11.001, |
|
"eval_steps_per_second": 5.507, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 48.68775177001953, |
|
"learning_rate": 1.2445350270722975e-06, |
|
"loss": 2.9086, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.993474006652832, |
|
"eval_runtime": 95.786, |
|
"eval_samples_per_second": 8.613, |
|
"eval_steps_per_second": 4.312, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 73.69239807128906, |
|
"learning_rate": 1.259176615626089e-06, |
|
"loss": 2.916, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.879518747329712, |
|
"eval_runtime": 75.0661, |
|
"eval_samples_per_second": 10.99, |
|
"eval_steps_per_second": 5.502, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 81.12676239013672, |
|
"learning_rate": 1.2738182041798807e-06, |
|
"loss": 2.963, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9365947246551514, |
|
"eval_runtime": 74.9513, |
|
"eval_samples_per_second": 11.007, |
|
"eval_steps_per_second": 5.51, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 73.390380859375, |
|
"learning_rate": 1.2884597927336725e-06, |
|
"loss": 2.8899, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 3.00091290473938, |
|
"eval_runtime": 103.6343, |
|
"eval_samples_per_second": 7.961, |
|
"eval_steps_per_second": 3.985, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 63.2940788269043, |
|
"learning_rate": 1.3031013812874642e-06, |
|
"loss": 2.9109, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.857473611831665, |
|
"eval_runtime": 75.1506, |
|
"eval_samples_per_second": 10.978, |
|
"eval_steps_per_second": 5.496, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 58.969947814941406, |
|
"learning_rate": 1.3177429698412562e-06, |
|
"loss": 2.8997, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.8435328006744385, |
|
"eval_runtime": 95.6436, |
|
"eval_samples_per_second": 8.626, |
|
"eval_steps_per_second": 4.318, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 85.3018569946289, |
|
"learning_rate": 1.3323845583950475e-06, |
|
"loss": 2.8939, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.8604986667633057, |
|
"eval_runtime": 75.0321, |
|
"eval_samples_per_second": 10.995, |
|
"eval_steps_per_second": 5.504, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 77.29539489746094, |
|
"learning_rate": 1.3470261469488393e-06, |
|
"loss": 2.9039, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9332242012023926, |
|
"eval_runtime": 75.0204, |
|
"eval_samples_per_second": 10.997, |
|
"eval_steps_per_second": 5.505, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 84.76435852050781, |
|
"learning_rate": 1.3616677355026312e-06, |
|
"loss": 2.8846, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.8225491046905518, |
|
"eval_runtime": 95.901, |
|
"eval_samples_per_second": 8.603, |
|
"eval_steps_per_second": 4.307, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 64.92876434326172, |
|
"learning_rate": 1.376309324056423e-06, |
|
"loss": 2.8728, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.7463650703430176, |
|
"eval_runtime": 74.9691, |
|
"eval_samples_per_second": 11.005, |
|
"eval_steps_per_second": 5.509, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 78.40308380126953, |
|
"learning_rate": 1.3909509126102147e-06, |
|
"loss": 2.8671, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.727618932723999, |
|
"eval_runtime": 95.9006, |
|
"eval_samples_per_second": 8.603, |
|
"eval_steps_per_second": 4.307, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 74.5372314453125, |
|
"learning_rate": 1.4055925011640065e-06, |
|
"loss": 2.893, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.779794692993164, |
|
"eval_runtime": 75.0188, |
|
"eval_samples_per_second": 10.997, |
|
"eval_steps_per_second": 5.505, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 110.36092376708984, |
|
"learning_rate": 1.4202340897177982e-06, |
|
"loss": 2.8587, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.7951598167419434, |
|
"eval_runtime": 103.6888, |
|
"eval_samples_per_second": 7.956, |
|
"eval_steps_per_second": 3.983, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 77.4548110961914, |
|
"learning_rate": 1.4348756782715898e-06, |
|
"loss": 2.8774, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.9320247173309326, |
|
"eval_runtime": 75.1206, |
|
"eval_samples_per_second": 10.982, |
|
"eval_steps_per_second": 5.498, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 56.37371063232422, |
|
"learning_rate": 1.4495172668253815e-06, |
|
"loss": 2.8618, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.8696322441101074, |
|
"eval_runtime": 74.9633, |
|
"eval_samples_per_second": 11.005, |
|
"eval_steps_per_second": 5.509, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 79.53195190429688, |
|
"learning_rate": 1.4641588553791733e-06, |
|
"loss": 2.8607, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.8158628940582275, |
|
"eval_runtime": 95.9613, |
|
"eval_samples_per_second": 8.597, |
|
"eval_steps_per_second": 4.304, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 64.08206939697266, |
|
"learning_rate": 1.478800443932965e-06, |
|
"loss": 2.8482, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.781191349029541, |
|
"eval_runtime": 75.3986, |
|
"eval_samples_per_second": 10.942, |
|
"eval_steps_per_second": 5.478, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 64.09700012207031, |
|
"learning_rate": 1.4934420324867568e-06, |
|
"loss": 2.856, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 2.8596715927124023, |
|
"eval_runtime": 95.6436, |
|
"eval_samples_per_second": 8.626, |
|
"eval_steps_per_second": 4.318, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 60.58209991455078, |
|
"learning_rate": 1.5080836210405485e-06, |
|
"loss": 2.8449, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8443682193756104, |
|
"eval_runtime": 75.0915, |
|
"eval_samples_per_second": 10.987, |
|
"eval_steps_per_second": 5.5, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 64.66394805908203, |
|
"learning_rate": 1.5227252095943405e-06, |
|
"loss": 2.8547, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8253190517425537, |
|
"eval_runtime": 74.9511, |
|
"eval_samples_per_second": 11.007, |
|
"eval_steps_per_second": 5.51, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 72.93558502197266, |
|
"learning_rate": 1.5373667981481318e-06, |
|
"loss": 2.8508, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8918566703796387, |
|
"eval_runtime": 95.8796, |
|
"eval_samples_per_second": 8.605, |
|
"eval_steps_per_second": 4.307, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 76.92417907714844, |
|
"learning_rate": 1.5520083867019238e-06, |
|
"loss": 2.852, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8885250091552734, |
|
"eval_runtime": 75.0272, |
|
"eval_samples_per_second": 10.996, |
|
"eval_steps_per_second": 5.505, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 61.644500732421875, |
|
"learning_rate": 1.5666499752557155e-06, |
|
"loss": 2.8452, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.766432046890259, |
|
"eval_runtime": 97.2331, |
|
"eval_samples_per_second": 8.485, |
|
"eval_steps_per_second": 4.248, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 76.87417602539062, |
|
"learning_rate": 1.5812915638095073e-06, |
|
"loss": 2.8166, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7734806537628174, |
|
"eval_runtime": 76.1146, |
|
"eval_samples_per_second": 10.839, |
|
"eval_steps_per_second": 5.426, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 71.81310272216797, |
|
"learning_rate": 1.595933152363299e-06, |
|
"loss": 2.8396, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.792924404144287, |
|
"eval_runtime": 97.4645, |
|
"eval_samples_per_second": 8.465, |
|
"eval_steps_per_second": 4.237, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 59.734291076660156, |
|
"learning_rate": 1.6105747409170908e-06, |
|
"loss": 2.8095, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7745141983032227, |
|
"eval_runtime": 76.6193, |
|
"eval_samples_per_second": 10.768, |
|
"eval_steps_per_second": 5.39, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 76.67354583740234, |
|
"learning_rate": 1.6252163294708825e-06, |
|
"loss": 2.8317, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.776149034500122, |
|
"eval_runtime": 76.58, |
|
"eval_samples_per_second": 10.773, |
|
"eval_steps_per_second": 5.393, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 124.78726959228516, |
|
"learning_rate": 1.639857918024674e-06, |
|
"loss": 2.8099, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8111283779144287, |
|
"eval_runtime": 95.6948, |
|
"eval_samples_per_second": 8.621, |
|
"eval_steps_per_second": 4.316, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 62.90610885620117, |
|
"learning_rate": 1.6544995065784658e-06, |
|
"loss": 2.8286, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.858586549758911, |
|
"eval_runtime": 74.9833, |
|
"eval_samples_per_second": 11.002, |
|
"eval_steps_per_second": 5.508, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 60.4992561340332, |
|
"learning_rate": 1.6691410951322576e-06, |
|
"loss": 2.8343, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.857048749923706, |
|
"eval_runtime": 95.7153, |
|
"eval_samples_per_second": 8.619, |
|
"eval_steps_per_second": 4.315, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 54.388065338134766, |
|
"learning_rate": 1.6837826836860493e-06, |
|
"loss": 2.8369, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.802098512649536, |
|
"eval_runtime": 75.1891, |
|
"eval_samples_per_second": 10.972, |
|
"eval_steps_per_second": 5.493, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 53.773902893066406, |
|
"learning_rate": 1.698424272239841e-06, |
|
"loss": 2.8027, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7923638820648193, |
|
"eval_runtime": 75.0022, |
|
"eval_samples_per_second": 11.0, |
|
"eval_steps_per_second": 5.507, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 80.11760711669922, |
|
"learning_rate": 1.7130658607936328e-06, |
|
"loss": 2.8181, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7276864051818848, |
|
"eval_runtime": 95.8648, |
|
"eval_samples_per_second": 8.606, |
|
"eval_steps_per_second": 4.308, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 93.17306518554688, |
|
"learning_rate": 1.7277074493474243e-06, |
|
"loss": 2.8024, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8029587268829346, |
|
"eval_runtime": 74.9841, |
|
"eval_samples_per_second": 11.002, |
|
"eval_steps_per_second": 5.508, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 63.65019989013672, |
|
"learning_rate": 1.742349037901216e-06, |
|
"loss": 2.8031, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8437631130218506, |
|
"eval_runtime": 95.9819, |
|
"eval_samples_per_second": 8.595, |
|
"eval_steps_per_second": 4.303, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 63.70858383178711, |
|
"learning_rate": 1.756990626455008e-06, |
|
"loss": 2.8067, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7934324741363525, |
|
"eval_runtime": 75.1472, |
|
"eval_samples_per_second": 10.978, |
|
"eval_steps_per_second": 5.496, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 56.11475372314453, |
|
"learning_rate": 1.7716322150087998e-06, |
|
"loss": 2.8209, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.808687210083008, |
|
"eval_runtime": 74.9779, |
|
"eval_samples_per_second": 11.003, |
|
"eval_steps_per_second": 5.508, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 68.00253295898438, |
|
"learning_rate": 1.7862738035625916e-06, |
|
"loss": 2.8113, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.973874092102051, |
|
"eval_runtime": 95.8935, |
|
"eval_samples_per_second": 8.603, |
|
"eval_steps_per_second": 4.307, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 67.90374755859375, |
|
"learning_rate": 1.8009153921163833e-06, |
|
"loss": 2.8085, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8363542556762695, |
|
"eval_runtime": 74.9744, |
|
"eval_samples_per_second": 11.004, |
|
"eval_steps_per_second": 5.509, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 54.21482467651367, |
|
"learning_rate": 1.815556980670175e-06, |
|
"loss": 2.8024, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.862858295440674, |
|
"eval_runtime": 95.7463, |
|
"eval_samples_per_second": 8.617, |
|
"eval_steps_per_second": 4.313, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 73.39325714111328, |
|
"learning_rate": 1.8301985692239666e-06, |
|
"loss": 2.7906, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.76910400390625, |
|
"eval_runtime": 75.0224, |
|
"eval_samples_per_second": 10.997, |
|
"eval_steps_per_second": 5.505, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 59.4721794128418, |
|
"learning_rate": 1.8448401577777583e-06, |
|
"loss": 2.8233, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7822389602661133, |
|
"eval_runtime": 95.6608, |
|
"eval_samples_per_second": 8.624, |
|
"eval_steps_per_second": 4.317, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 69.88265228271484, |
|
"learning_rate": 1.85948174633155e-06, |
|
"loss": 2.7905, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.810521364212036, |
|
"eval_runtime": 75.1741, |
|
"eval_samples_per_second": 10.975, |
|
"eval_steps_per_second": 5.494, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 57.1430549621582, |
|
"learning_rate": 1.8741233348853418e-06, |
|
"loss": 2.8324, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7673442363739014, |
|
"eval_runtime": 75.0005, |
|
"eval_samples_per_second": 11.0, |
|
"eval_steps_per_second": 5.507, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 60.84886169433594, |
|
"learning_rate": 1.8887649234391336e-06, |
|
"loss": 2.7855, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.761514902114868, |
|
"eval_runtime": 95.9983, |
|
"eval_samples_per_second": 8.594, |
|
"eval_steps_per_second": 4.302, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 44.72441482543945, |
|
"learning_rate": 1.9034065119929253e-06, |
|
"loss": 2.7882, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.704122304916382, |
|
"eval_runtime": 74.9992, |
|
"eval_samples_per_second": 11.0, |
|
"eval_steps_per_second": 5.507, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 62.372440338134766, |
|
"learning_rate": 1.918048100546717e-06, |
|
"loss": 2.7787, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.721027374267578, |
|
"eval_runtime": 95.7358, |
|
"eval_samples_per_second": 8.617, |
|
"eval_steps_per_second": 4.314, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 66.38520812988281, |
|
"learning_rate": 1.932689689100509e-06, |
|
"loss": 2.7689, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.728421211242676, |
|
"eval_runtime": 75.1015, |
|
"eval_samples_per_second": 10.985, |
|
"eval_steps_per_second": 5.499, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 95.22692108154297, |
|
"learning_rate": 1.9473312776543006e-06, |
|
"loss": 2.7857, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.66815447807312, |
|
"eval_runtime": 75.0123, |
|
"eval_samples_per_second": 10.998, |
|
"eval_steps_per_second": 5.506, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 89.40083312988281, |
|
"learning_rate": 1.9619728662080923e-06, |
|
"loss": 2.7883, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7113566398620605, |
|
"eval_runtime": 95.7697, |
|
"eval_samples_per_second": 8.614, |
|
"eval_steps_per_second": 4.312, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 81.26580047607422, |
|
"learning_rate": 1.976614454761884e-06, |
|
"loss": 2.7819, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7667925357818604, |
|
"eval_runtime": 74.9113, |
|
"eval_samples_per_second": 11.013, |
|
"eval_steps_per_second": 5.513, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 70.14020538330078, |
|
"learning_rate": 1.991256043315676e-06, |
|
"loss": 2.7797, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.767988920211792, |
|
"eval_runtime": 95.7035, |
|
"eval_samples_per_second": 8.62, |
|
"eval_steps_per_second": 4.315, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 54.9051513671875, |
|
"learning_rate": 2.0058976318694676e-06, |
|
"loss": 2.7491, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8340063095092773, |
|
"eval_runtime": 75.0666, |
|
"eval_samples_per_second": 10.99, |
|
"eval_steps_per_second": 5.502, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 77.1761474609375, |
|
"learning_rate": 2.0205392204232593e-06, |
|
"loss": 2.758, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.757105827331543, |
|
"eval_runtime": 74.953, |
|
"eval_samples_per_second": 11.007, |
|
"eval_steps_per_second": 5.51, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 79.28630065917969, |
|
"learning_rate": 2.0351808089770507e-06, |
|
"loss": 2.7981, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.856295585632324, |
|
"eval_runtime": 96.1389, |
|
"eval_samples_per_second": 8.581, |
|
"eval_steps_per_second": 4.296, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 70.3088607788086, |
|
"learning_rate": 2.0498223975308424e-06, |
|
"loss": 2.7733, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.756226062774658, |
|
"eval_runtime": 74.9334, |
|
"eval_samples_per_second": 11.01, |
|
"eval_steps_per_second": 5.512, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 42.964439392089844, |
|
"learning_rate": 2.0644639860846346e-06, |
|
"loss": 2.7643, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7820403575897217, |
|
"eval_runtime": 95.7318, |
|
"eval_samples_per_second": 8.618, |
|
"eval_steps_per_second": 4.314, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 48.390541076660156, |
|
"learning_rate": 2.0791055746384263e-06, |
|
"loss": 2.7708, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8036890029907227, |
|
"eval_runtime": 74.9876, |
|
"eval_samples_per_second": 11.002, |
|
"eval_steps_per_second": 5.508, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 53.37461853027344, |
|
"learning_rate": 2.093747163192218e-06, |
|
"loss": 2.7535, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.692978858947754, |
|
"eval_runtime": 95.9397, |
|
"eval_samples_per_second": 8.599, |
|
"eval_steps_per_second": 4.305, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 60.01034164428711, |
|
"learning_rate": 2.10838875174601e-06, |
|
"loss": 2.7677, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.740020751953125, |
|
"eval_runtime": 75.2482, |
|
"eval_samples_per_second": 10.964, |
|
"eval_steps_per_second": 5.489, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 65.03019714355469, |
|
"learning_rate": 2.123030340299801e-06, |
|
"loss": 2.7438, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.6849639415740967, |
|
"eval_runtime": 74.9511, |
|
"eval_samples_per_second": 11.007, |
|
"eval_steps_per_second": 5.51, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 66.77202606201172, |
|
"learning_rate": 2.137671928853593e-06, |
|
"loss": 2.7816, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.6878929138183594, |
|
"eval_runtime": 95.6358, |
|
"eval_samples_per_second": 8.626, |
|
"eval_steps_per_second": 4.318, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 38.58710479736328, |
|
"learning_rate": 2.1523135174073847e-06, |
|
"loss": 2.7873, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.6576905250549316, |
|
"eval_runtime": 74.9871, |
|
"eval_samples_per_second": 11.002, |
|
"eval_steps_per_second": 5.508, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 59.993080139160156, |
|
"learning_rate": 2.1669551059611764e-06, |
|
"loss": 2.7334, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.703524589538574, |
|
"eval_runtime": 95.5679, |
|
"eval_samples_per_second": 8.633, |
|
"eval_steps_per_second": 4.322, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 58.45066833496094, |
|
"learning_rate": 2.181596694514968e-06, |
|
"loss": 2.7596, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7554209232330322, |
|
"eval_runtime": 74.9888, |
|
"eval_samples_per_second": 11.002, |
|
"eval_steps_per_second": 5.507, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 64.96820068359375, |
|
"learning_rate": 2.19623828306876e-06, |
|
"loss": 2.7615, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.6536672115325928, |
|
"eval_runtime": 74.9475, |
|
"eval_samples_per_second": 11.008, |
|
"eval_steps_per_second": 5.511, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 47.30535888671875, |
|
"learning_rate": 2.2108798716225517e-06, |
|
"loss": 2.7354, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.73598051071167, |
|
"eval_runtime": 95.8251, |
|
"eval_samples_per_second": 8.609, |
|
"eval_steps_per_second": 4.31, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 63.874759674072266, |
|
"learning_rate": 2.2255214601763434e-06, |
|
"loss": 2.7469, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.720163345336914, |
|
"eval_runtime": 74.9546, |
|
"eval_samples_per_second": 11.007, |
|
"eval_steps_per_second": 5.51, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 60.05183029174805, |
|
"learning_rate": 2.240163048730135e-06, |
|
"loss": 2.7431, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.6975433826446533, |
|
"eval_runtime": 95.9027, |
|
"eval_samples_per_second": 8.602, |
|
"eval_steps_per_second": 4.306, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 67.57109069824219, |
|
"learning_rate": 2.254804637283927e-06, |
|
"loss": 2.729, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.697463274002075, |
|
"eval_runtime": 74.9668, |
|
"eval_samples_per_second": 11.005, |
|
"eval_steps_per_second": 5.509, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 61.8995361328125, |
|
"learning_rate": 2.2694462258377187e-06, |
|
"loss": 2.7624, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.702453136444092, |
|
"eval_runtime": 95.857, |
|
"eval_samples_per_second": 8.607, |
|
"eval_steps_per_second": 4.308, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 60.828304290771484, |
|
"learning_rate": 2.2840878143915104e-06, |
|
"loss": 2.7486, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.703162908554077, |
|
"eval_runtime": 75.085, |
|
"eval_samples_per_second": 10.988, |
|
"eval_steps_per_second": 5.5, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 52.02991485595703, |
|
"learning_rate": 2.298729402945302e-06, |
|
"loss": 2.7346, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7533764839172363, |
|
"eval_runtime": 74.9101, |
|
"eval_samples_per_second": 11.013, |
|
"eval_steps_per_second": 5.513, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 46.363529205322266, |
|
"learning_rate": 2.313370991499094e-06, |
|
"loss": 2.7476, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.612370491027832, |
|
"eval_runtime": 95.7878, |
|
"eval_samples_per_second": 8.613, |
|
"eval_steps_per_second": 4.312, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 59.908451080322266, |
|
"learning_rate": 2.3280125800528857e-06, |
|
"loss": 2.7345, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8132786750793457, |
|
"eval_runtime": 74.9641, |
|
"eval_samples_per_second": 11.005, |
|
"eval_steps_per_second": 5.509, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 48.26115798950195, |
|
"learning_rate": 2.3426541686066774e-06, |
|
"loss": 2.7304, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7734522819519043, |
|
"eval_runtime": 95.7078, |
|
"eval_samples_per_second": 8.62, |
|
"eval_steps_per_second": 4.315, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 55.67764663696289, |
|
"learning_rate": 2.357295757160469e-06, |
|
"loss": 2.7316, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7094366550445557, |
|
"eval_runtime": 75.1541, |
|
"eval_samples_per_second": 10.977, |
|
"eval_steps_per_second": 5.495, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 74.25537872314453, |
|
"learning_rate": 2.371937345714261e-06, |
|
"loss": 2.7367, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.636099100112915, |
|
"eval_runtime": 75.2637, |
|
"eval_samples_per_second": 10.961, |
|
"eval_steps_per_second": 5.487, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 73.27501678466797, |
|
"learning_rate": 2.3865789342680527e-06, |
|
"loss": 2.7381, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.8240554332733154, |
|
"eval_runtime": 96.1397, |
|
"eval_samples_per_second": 8.581, |
|
"eval_steps_per_second": 4.296, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 79.56513977050781, |
|
"learning_rate": 2.4012205228218444e-06, |
|
"loss": 2.7252, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.670012950897217, |
|
"eval_runtime": 74.948, |
|
"eval_samples_per_second": 11.008, |
|
"eval_steps_per_second": 5.51, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 75.19647979736328, |
|
"learning_rate": 2.415862111375636e-06, |
|
"loss": 2.7358, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7422537803649902, |
|
"eval_runtime": 95.7799, |
|
"eval_samples_per_second": 8.614, |
|
"eval_steps_per_second": 4.312, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 73.22459411621094, |
|
"learning_rate": 2.4305036999294275e-06, |
|
"loss": 2.7316, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.705203056335449, |
|
"eval_runtime": 75.0933, |
|
"eval_samples_per_second": 10.986, |
|
"eval_steps_per_second": 5.5, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 57.047245025634766, |
|
"learning_rate": 2.4451452884832192e-06, |
|
"loss": 2.7239, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7708864212036133, |
|
"eval_runtime": 74.9497, |
|
"eval_samples_per_second": 11.007, |
|
"eval_steps_per_second": 5.51, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 79.20235443115234, |
|
"learning_rate": 2.459786877037011e-06, |
|
"loss": 2.7118, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.7727091312408447, |
|
"eval_runtime": 96.2784, |
|
"eval_samples_per_second": 8.569, |
|
"eval_steps_per_second": 4.29, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 55.22106170654297, |
|
"learning_rate": 2.474428465590803e-06, |
|
"loss": 2.7325, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.647690534591675, |
|
"eval_runtime": 74.9199, |
|
"eval_samples_per_second": 11.012, |
|
"eval_steps_per_second": 5.513, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 64.41899871826172, |
|
"learning_rate": 2.489070054144595e-06, |
|
"loss": 2.7135, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 2.6558783054351807, |
|
"eval_runtime": 103.5276, |
|
"eval_samples_per_second": 7.969, |
|
"eval_steps_per_second": 3.989, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 57.375343322753906, |
|
"learning_rate": 2.5037116426983867e-06, |
|
"loss": 2.7279, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.5775904655456543, |
|
"eval_runtime": 74.9254, |
|
"eval_samples_per_second": 11.011, |
|
"eval_steps_per_second": 5.512, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 70.312255859375, |
|
"learning_rate": 2.518353231252178e-06, |
|
"loss": 2.7104, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.61749267578125, |
|
"eval_runtime": 95.7541, |
|
"eval_samples_per_second": 8.616, |
|
"eval_steps_per_second": 4.313, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 51.519561767578125, |
|
"learning_rate": 2.53299481980597e-06, |
|
"loss": 2.7306, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.7134809494018555, |
|
"eval_runtime": 75.1688, |
|
"eval_samples_per_second": 10.975, |
|
"eval_steps_per_second": 5.494, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 51.85505294799805, |
|
"learning_rate": 2.5476364083597615e-06, |
|
"loss": 2.7344, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6391634941101074, |
|
"eval_runtime": 74.8776, |
|
"eval_samples_per_second": 11.018, |
|
"eval_steps_per_second": 5.516, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 49.310359954833984, |
|
"learning_rate": 2.5622779969135536e-06, |
|
"loss": 2.7166, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.644277334213257, |
|
"eval_runtime": 95.9839, |
|
"eval_samples_per_second": 8.595, |
|
"eval_steps_per_second": 4.303, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 55.3402214050293, |
|
"learning_rate": 2.576919585467345e-06, |
|
"loss": 2.7325, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6863160133361816, |
|
"eval_runtime": 74.8934, |
|
"eval_samples_per_second": 11.016, |
|
"eval_steps_per_second": 5.515, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 88.28662109375, |
|
"learning_rate": 2.5915611740211367e-06, |
|
"loss": 2.7363, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.635762929916382, |
|
"eval_runtime": 95.8689, |
|
"eval_samples_per_second": 8.606, |
|
"eval_steps_per_second": 4.308, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 54.152061462402344, |
|
"learning_rate": 2.6062027625749285e-06, |
|
"loss": 2.7308, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6707890033721924, |
|
"eval_runtime": 75.0171, |
|
"eval_samples_per_second": 10.997, |
|
"eval_steps_per_second": 5.505, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 57.176002502441406, |
|
"learning_rate": 2.6208443511287202e-06, |
|
"loss": 2.7079, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.698444128036499, |
|
"eval_runtime": 74.8872, |
|
"eval_samples_per_second": 11.017, |
|
"eval_steps_per_second": 5.515, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 48.09297180175781, |
|
"learning_rate": 2.6354859396825124e-06, |
|
"loss": 2.7096, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6643295288085938, |
|
"eval_runtime": 95.8171, |
|
"eval_samples_per_second": 8.61, |
|
"eval_steps_per_second": 4.31, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 53.34381866455078, |
|
"learning_rate": 2.6501275282363037e-06, |
|
"loss": 2.7171, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.65159273147583, |
|
"eval_runtime": 74.9201, |
|
"eval_samples_per_second": 11.012, |
|
"eval_steps_per_second": 5.513, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 53.656951904296875, |
|
"learning_rate": 2.664769116790095e-06, |
|
"loss": 2.7048, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6368868350982666, |
|
"eval_runtime": 95.7287, |
|
"eval_samples_per_second": 8.618, |
|
"eval_steps_per_second": 4.314, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 71.44660186767578, |
|
"learning_rate": 2.6794107053438872e-06, |
|
"loss": 2.7236, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.7588698863983154, |
|
"eval_runtime": 74.9883, |
|
"eval_samples_per_second": 11.002, |
|
"eval_steps_per_second": 5.508, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 62.191505432128906, |
|
"learning_rate": 2.6940522938976785e-06, |
|
"loss": 2.7078, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.7348010540008545, |
|
"eval_runtime": 74.9125, |
|
"eval_samples_per_second": 11.013, |
|
"eval_steps_per_second": 5.513, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 55.455528259277344, |
|
"learning_rate": 2.7086938824514707e-06, |
|
"loss": 2.7133, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.7168068885803223, |
|
"eval_runtime": 95.9132, |
|
"eval_samples_per_second": 8.602, |
|
"eval_steps_per_second": 4.306, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 49.198707580566406, |
|
"learning_rate": 2.7233354710052625e-06, |
|
"loss": 2.7181, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6445090770721436, |
|
"eval_runtime": 74.8503, |
|
"eval_samples_per_second": 11.022, |
|
"eval_steps_per_second": 5.518, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 69.39476013183594, |
|
"learning_rate": 2.7379770595590542e-06, |
|
"loss": 2.694, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.668306589126587, |
|
"eval_runtime": 95.7168, |
|
"eval_samples_per_second": 8.619, |
|
"eval_steps_per_second": 4.315, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 49.924476623535156, |
|
"learning_rate": 2.752618648112846e-06, |
|
"loss": 2.7187, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.689119338989258, |
|
"eval_runtime": 74.8676, |
|
"eval_samples_per_second": 11.019, |
|
"eval_steps_per_second": 5.516, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 46.9813346862793, |
|
"learning_rate": 2.7672602366666373e-06, |
|
"loss": 2.7244, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6233108043670654, |
|
"eval_runtime": 95.9325, |
|
"eval_samples_per_second": 8.6, |
|
"eval_steps_per_second": 4.305, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 55.19742965698242, |
|
"learning_rate": 2.7819018252204295e-06, |
|
"loss": 2.7045, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6683340072631836, |
|
"eval_runtime": 75.0381, |
|
"eval_samples_per_second": 10.994, |
|
"eval_steps_per_second": 5.504, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 40.58945846557617, |
|
"learning_rate": 2.796543413774221e-06, |
|
"loss": 2.7182, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.5416364669799805, |
|
"eval_runtime": 74.9499, |
|
"eval_samples_per_second": 11.007, |
|
"eval_steps_per_second": 5.51, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 59.873573303222656, |
|
"learning_rate": 2.811185002328013e-06, |
|
"loss": 2.694, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.7568399906158447, |
|
"eval_runtime": 95.8753, |
|
"eval_samples_per_second": 8.605, |
|
"eval_steps_per_second": 4.308, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 49.662471771240234, |
|
"learning_rate": 2.8258265908818043e-06, |
|
"loss": 2.7121, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6501502990722656, |
|
"eval_runtime": 74.9192, |
|
"eval_samples_per_second": 11.012, |
|
"eval_steps_per_second": 5.513, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 42.26154327392578, |
|
"learning_rate": 2.8404681794355965e-06, |
|
"loss": 2.6814, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.660658359527588, |
|
"eval_runtime": 95.7557, |
|
"eval_samples_per_second": 8.616, |
|
"eval_steps_per_second": 4.313, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 56.657474517822266, |
|
"learning_rate": 2.855109767989388e-06, |
|
"loss": 2.699, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.631808042526245, |
|
"eval_runtime": 75.0945, |
|
"eval_samples_per_second": 10.986, |
|
"eval_steps_per_second": 5.5, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 52.05348587036133, |
|
"learning_rate": 2.8697513565431795e-06, |
|
"loss": 2.6997, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.626267433166504, |
|
"eval_runtime": 76.2341, |
|
"eval_samples_per_second": 10.822, |
|
"eval_steps_per_second": 5.418, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 49.89586639404297, |
|
"learning_rate": 2.8843929450969717e-06, |
|
"loss": 2.6881, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6203720569610596, |
|
"eval_runtime": 96.0809, |
|
"eval_samples_per_second": 8.587, |
|
"eval_steps_per_second": 4.298, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 49.57963562011719, |
|
"learning_rate": 2.899034533650763e-06, |
|
"loss": 2.7143, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.601503372192383, |
|
"eval_runtime": 74.8871, |
|
"eval_samples_per_second": 11.017, |
|
"eval_steps_per_second": 5.515, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 57.84087371826172, |
|
"learning_rate": 2.9136761222045552e-06, |
|
"loss": 2.6896, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.655690908432007, |
|
"eval_runtime": 95.448, |
|
"eval_samples_per_second": 8.643, |
|
"eval_steps_per_second": 4.327, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 51.44375228881836, |
|
"learning_rate": 2.9283177107583465e-06, |
|
"loss": 2.6857, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.590942144393921, |
|
"eval_runtime": 74.9631, |
|
"eval_samples_per_second": 11.005, |
|
"eval_steps_per_second": 5.509, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 54.060970306396484, |
|
"learning_rate": 2.9429592993121387e-06, |
|
"loss": 2.6882, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.717146873474121, |
|
"eval_runtime": 95.7384, |
|
"eval_samples_per_second": 8.617, |
|
"eval_steps_per_second": 4.314, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 43.254676818847656, |
|
"learning_rate": 2.95760088786593e-06, |
|
"loss": 2.6999, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6694703102111816, |
|
"eval_runtime": 75.1451, |
|
"eval_samples_per_second": 10.979, |
|
"eval_steps_per_second": 5.496, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 48.996421813964844, |
|
"learning_rate": 2.972242476419722e-06, |
|
"loss": 2.6919, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.65441632270813, |
|
"eval_runtime": 74.8418, |
|
"eval_samples_per_second": 11.023, |
|
"eval_steps_per_second": 5.518, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 64.13751220703125, |
|
"learning_rate": 2.9868840649735135e-06, |
|
"loss": 2.6926, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.673851251602173, |
|
"eval_runtime": 95.6289, |
|
"eval_samples_per_second": 8.627, |
|
"eval_steps_per_second": 4.319, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 51.9734001159668, |
|
"learning_rate": 3.0015256535273053e-06, |
|
"loss": 2.6686, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6541659832000732, |
|
"eval_runtime": 74.8434, |
|
"eval_samples_per_second": 11.023, |
|
"eval_steps_per_second": 5.518, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 40.7435302734375, |
|
"learning_rate": 3.016167242081097e-06, |
|
"loss": 2.6833, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6632096767425537, |
|
"eval_runtime": 95.7779, |
|
"eval_samples_per_second": 8.614, |
|
"eval_steps_per_second": 4.312, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 57.3190803527832, |
|
"learning_rate": 3.030808830634889e-06, |
|
"loss": 2.6855, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.565784454345703, |
|
"eval_runtime": 76.5683, |
|
"eval_samples_per_second": 10.775, |
|
"eval_steps_per_second": 5.394, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 49.840736389160156, |
|
"learning_rate": 3.045450419188681e-06, |
|
"loss": 2.6655, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6578309535980225, |
|
"eval_runtime": 76.0025, |
|
"eval_samples_per_second": 10.855, |
|
"eval_steps_per_second": 5.434, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 46.31571578979492, |
|
"learning_rate": 3.0600920077424723e-06, |
|
"loss": 2.7052, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.7100558280944824, |
|
"eval_runtime": 97.7041, |
|
"eval_samples_per_second": 8.444, |
|
"eval_steps_per_second": 4.227, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 55.01320266723633, |
|
"learning_rate": 3.0747335962962636e-06, |
|
"loss": 2.7003, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6395130157470703, |
|
"eval_runtime": 75.3617, |
|
"eval_samples_per_second": 10.947, |
|
"eval_steps_per_second": 5.48, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 60.59393310546875, |
|
"learning_rate": 3.089375184850056e-06, |
|
"loss": 2.68, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.625278949737549, |
|
"eval_runtime": 96.3447, |
|
"eval_samples_per_second": 8.563, |
|
"eval_steps_per_second": 4.287, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 45.25587463378906, |
|
"learning_rate": 3.1040167734038475e-06, |
|
"loss": 2.6903, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.598447561264038, |
|
"eval_runtime": 75.8018, |
|
"eval_samples_per_second": 10.884, |
|
"eval_steps_per_second": 5.448, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 51.60664749145508, |
|
"learning_rate": 3.1186583619576393e-06, |
|
"loss": 2.6726, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6069204807281494, |
|
"eval_runtime": 75.3029, |
|
"eval_samples_per_second": 10.956, |
|
"eval_steps_per_second": 5.485, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 48.45505905151367, |
|
"learning_rate": 3.133299950511431e-06, |
|
"loss": 2.6935, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.618152141571045, |
|
"eval_runtime": 96.5915, |
|
"eval_samples_per_second": 8.541, |
|
"eval_steps_per_second": 4.276, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 46.21786880493164, |
|
"learning_rate": 3.147941539065223e-06, |
|
"loss": 2.688, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.586667776107788, |
|
"eval_runtime": 75.391, |
|
"eval_samples_per_second": 10.943, |
|
"eval_steps_per_second": 5.478, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 79.90039825439453, |
|
"learning_rate": 3.1625831276190145e-06, |
|
"loss": 2.6829, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.563133955001831, |
|
"eval_runtime": 96.2319, |
|
"eval_samples_per_second": 8.573, |
|
"eval_steps_per_second": 4.292, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 49.727264404296875, |
|
"learning_rate": 3.177224716172806e-06, |
|
"loss": 2.6738, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.619941473007202, |
|
"eval_runtime": 74.8649, |
|
"eval_samples_per_second": 11.02, |
|
"eval_steps_per_second": 5.517, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 68.52893829345703, |
|
"learning_rate": 3.191866304726598e-06, |
|
"loss": 2.6622, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6587114334106445, |
|
"eval_runtime": 95.7442, |
|
"eval_samples_per_second": 8.617, |
|
"eval_steps_per_second": 4.314, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 63.71025466918945, |
|
"learning_rate": 3.2065078932803894e-06, |
|
"loss": 2.6832, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.598963737487793, |
|
"eval_runtime": 75.1499, |
|
"eval_samples_per_second": 10.978, |
|
"eval_steps_per_second": 5.496, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 54.349361419677734, |
|
"learning_rate": 3.2211494818341815e-06, |
|
"loss": 2.6613, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.629615306854248, |
|
"eval_runtime": 74.8449, |
|
"eval_samples_per_second": 11.023, |
|
"eval_steps_per_second": 5.518, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 54.800376892089844, |
|
"learning_rate": 3.235791070387973e-06, |
|
"loss": 2.685, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.5904054641723633, |
|
"eval_runtime": 95.7217, |
|
"eval_samples_per_second": 8.619, |
|
"eval_steps_per_second": 4.315, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 53.165672302246094, |
|
"learning_rate": 3.250432658941765e-06, |
|
"loss": 2.6736, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6516149044036865, |
|
"eval_runtime": 74.8894, |
|
"eval_samples_per_second": 11.016, |
|
"eval_steps_per_second": 5.515, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 81.56725311279297, |
|
"learning_rate": 3.2650742474955564e-06, |
|
"loss": 2.6636, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.7000627517700195, |
|
"eval_runtime": 95.7364, |
|
"eval_samples_per_second": 8.617, |
|
"eval_steps_per_second": 4.314, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 42.29210662841797, |
|
"learning_rate": 3.279715836049348e-06, |
|
"loss": 2.6812, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.664086103439331, |
|
"eval_runtime": 74.9971, |
|
"eval_samples_per_second": 11.0, |
|
"eval_steps_per_second": 5.507, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 41.341575622558594, |
|
"learning_rate": 3.2943574246031403e-06, |
|
"loss": 2.681, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.7260851860046387, |
|
"eval_runtime": 74.8808, |
|
"eval_samples_per_second": 11.018, |
|
"eval_steps_per_second": 5.515, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 56.54241943359375, |
|
"learning_rate": 3.3089990131569316e-06, |
|
"loss": 2.6515, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.631478786468506, |
|
"eval_runtime": 95.9423, |
|
"eval_samples_per_second": 8.599, |
|
"eval_steps_per_second": 4.305, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 46.822330474853516, |
|
"learning_rate": 3.3236406017107238e-06, |
|
"loss": 2.668, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.711561918258667, |
|
"eval_runtime": 74.8607, |
|
"eval_samples_per_second": 11.02, |
|
"eval_steps_per_second": 5.517, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 43.4654655456543, |
|
"learning_rate": 3.338282190264515e-06, |
|
"loss": 2.6744, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6422581672668457, |
|
"eval_runtime": 95.6494, |
|
"eval_samples_per_second": 8.625, |
|
"eval_steps_per_second": 4.318, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 43.6857795715332, |
|
"learning_rate": 3.3529237788183073e-06, |
|
"loss": 2.6626, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.695249557495117, |
|
"eval_runtime": 74.8743, |
|
"eval_samples_per_second": 11.018, |
|
"eval_steps_per_second": 5.516, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 58.39930725097656, |
|
"learning_rate": 3.3675653673720986e-06, |
|
"loss": 2.6679, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6393015384674072, |
|
"eval_runtime": 74.8495, |
|
"eval_samples_per_second": 11.022, |
|
"eval_steps_per_second": 5.518, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 44.445709228515625, |
|
"learning_rate": 3.3822069559258904e-06, |
|
"loss": 2.6668, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.7629072666168213, |
|
"eval_runtime": 95.8302, |
|
"eval_samples_per_second": 8.609, |
|
"eval_steps_per_second": 4.31, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 40.28990173339844, |
|
"learning_rate": 3.396848544479682e-06, |
|
"loss": 2.6773, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.695685863494873, |
|
"eval_runtime": 74.7742, |
|
"eval_samples_per_second": 11.033, |
|
"eval_steps_per_second": 5.523, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 50.713233947753906, |
|
"learning_rate": 3.411490133033474e-06, |
|
"loss": 2.646, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.659492254257202, |
|
"eval_runtime": 95.7767, |
|
"eval_samples_per_second": 8.614, |
|
"eval_steps_per_second": 4.312, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 79.29195404052734, |
|
"learning_rate": 3.4261317215872656e-06, |
|
"loss": 2.6608, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.702548027038574, |
|
"eval_runtime": 74.8568, |
|
"eval_samples_per_second": 11.021, |
|
"eval_steps_per_second": 5.517, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 64.88552856445312, |
|
"learning_rate": 3.4407733101410574e-06, |
|
"loss": 2.6629, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.6832387447357178, |
|
"eval_runtime": 95.7323, |
|
"eval_samples_per_second": 8.618, |
|
"eval_steps_per_second": 4.314, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 54.41705322265625, |
|
"learning_rate": 3.4554148986948487e-06, |
|
"loss": 2.6726, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.638201951980591, |
|
"eval_runtime": 75.0538, |
|
"eval_samples_per_second": 10.992, |
|
"eval_steps_per_second": 5.503, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 47.16751480102539, |
|
"learning_rate": 3.470056487248641e-06, |
|
"loss": 2.661, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.7070186138153076, |
|
"eval_runtime": 74.8317, |
|
"eval_samples_per_second": 11.025, |
|
"eval_steps_per_second": 5.519, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 44.628997802734375, |
|
"learning_rate": 3.484698075802432e-06, |
|
"loss": 2.6687, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.645284652709961, |
|
"eval_runtime": 95.6957, |
|
"eval_samples_per_second": 8.621, |
|
"eval_steps_per_second": 4.316, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 41.38969039916992, |
|
"learning_rate": 3.4993396643562244e-06, |
|
"loss": 2.6702, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 2.644817590713501, |
|
"eval_runtime": 74.7903, |
|
"eval_samples_per_second": 11.031, |
|
"eval_steps_per_second": 5.522, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 64.58867645263672, |
|
"learning_rate": 3.513981252910016e-06, |
|
"loss": 2.6417, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6666617393493652, |
|
"eval_runtime": 95.5852, |
|
"eval_samples_per_second": 8.631, |
|
"eval_steps_per_second": 4.321, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 44.54737854003906, |
|
"learning_rate": 3.528622841463808e-06, |
|
"loss": 2.6597, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.606701612472534, |
|
"eval_runtime": 74.9721, |
|
"eval_samples_per_second": 11.004, |
|
"eval_steps_per_second": 5.509, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 63.05255126953125, |
|
"learning_rate": 3.5432644300175996e-06, |
|
"loss": 2.639, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.721276044845581, |
|
"eval_runtime": 74.8, |
|
"eval_samples_per_second": 11.029, |
|
"eval_steps_per_second": 5.521, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 63.65214920043945, |
|
"learning_rate": 3.557906018571391e-06, |
|
"loss": 2.6419, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6757595539093018, |
|
"eval_runtime": 95.5642, |
|
"eval_samples_per_second": 8.633, |
|
"eval_steps_per_second": 4.322, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 60.95127487182617, |
|
"learning_rate": 3.572547607125183e-06, |
|
"loss": 2.6346, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5521128177642822, |
|
"eval_runtime": 74.8449, |
|
"eval_samples_per_second": 11.023, |
|
"eval_steps_per_second": 5.518, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 34.42070007324219, |
|
"learning_rate": 3.5871891956789744e-06, |
|
"loss": 2.6748, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5943429470062256, |
|
"eval_runtime": 95.5809, |
|
"eval_samples_per_second": 8.631, |
|
"eval_steps_per_second": 4.321, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 50.500213623046875, |
|
"learning_rate": 3.6018307842327666e-06, |
|
"loss": 2.6669, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6257340908050537, |
|
"eval_runtime": 74.8799, |
|
"eval_samples_per_second": 11.018, |
|
"eval_steps_per_second": 5.516, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 40.18623733520508, |
|
"learning_rate": 3.616472372786558e-06, |
|
"loss": 2.64, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5401906967163086, |
|
"eval_runtime": 95.7899, |
|
"eval_samples_per_second": 8.613, |
|
"eval_steps_per_second": 4.312, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 54.12990951538086, |
|
"learning_rate": 3.63111396134035e-06, |
|
"loss": 2.6398, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5751349925994873, |
|
"eval_runtime": 75.2498, |
|
"eval_samples_per_second": 10.963, |
|
"eval_steps_per_second": 5.488, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 54.11725616455078, |
|
"learning_rate": 3.6457555498941414e-06, |
|
"loss": 2.6261, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.578989028930664, |
|
"eval_runtime": 74.8168, |
|
"eval_samples_per_second": 11.027, |
|
"eval_steps_per_second": 5.52, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 49.80808639526367, |
|
"learning_rate": 3.660397138447933e-06, |
|
"loss": 2.6424, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5562169551849365, |
|
"eval_runtime": 95.8177, |
|
"eval_samples_per_second": 8.61, |
|
"eval_steps_per_second": 4.31, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 61.51396560668945, |
|
"learning_rate": 3.6750387270017254e-06, |
|
"loss": 2.6531, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.537543773651123, |
|
"eval_runtime": 74.9434, |
|
"eval_samples_per_second": 11.008, |
|
"eval_steps_per_second": 5.511, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 69.51727294921875, |
|
"learning_rate": 3.6896803155555167e-06, |
|
"loss": 2.6588, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.599710464477539, |
|
"eval_runtime": 102.9802, |
|
"eval_samples_per_second": 8.011, |
|
"eval_steps_per_second": 4.01, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 55.579795837402344, |
|
"learning_rate": 3.704321904109309e-06, |
|
"loss": 2.6438, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5952088832855225, |
|
"eval_runtime": 75.3086, |
|
"eval_samples_per_second": 10.955, |
|
"eval_steps_per_second": 5.484, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 45.27565383911133, |
|
"learning_rate": 3.7189634926631e-06, |
|
"loss": 2.6215, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5829055309295654, |
|
"eval_runtime": 74.7515, |
|
"eval_samples_per_second": 11.037, |
|
"eval_steps_per_second": 5.525, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 44.96015548706055, |
|
"learning_rate": 3.7336050812168924e-06, |
|
"loss": 2.6283, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6275458335876465, |
|
"eval_runtime": 95.5562, |
|
"eval_samples_per_second": 8.634, |
|
"eval_steps_per_second": 4.322, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 38.68233871459961, |
|
"learning_rate": 3.7482466697706837e-06, |
|
"loss": 2.641, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5264251232147217, |
|
"eval_runtime": 74.8122, |
|
"eval_samples_per_second": 11.028, |
|
"eval_steps_per_second": 5.52, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 42.972618103027344, |
|
"learning_rate": 3.7628882583244754e-06, |
|
"loss": 2.6546, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.525606632232666, |
|
"eval_runtime": 95.3847, |
|
"eval_samples_per_second": 8.649, |
|
"eval_steps_per_second": 4.33, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 50.30706024169922, |
|
"learning_rate": 3.777529846878267e-06, |
|
"loss": 2.6542, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.575317859649658, |
|
"eval_runtime": 74.8866, |
|
"eval_samples_per_second": 11.017, |
|
"eval_steps_per_second": 5.515, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 65.63154602050781, |
|
"learning_rate": 3.792171435432059e-06, |
|
"loss": 2.6473, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6257107257843018, |
|
"eval_runtime": 74.8156, |
|
"eval_samples_per_second": 11.027, |
|
"eval_steps_per_second": 5.52, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 47.780609130859375, |
|
"learning_rate": 3.8068130239858507e-06, |
|
"loss": 2.6523, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6211636066436768, |
|
"eval_runtime": 95.7469, |
|
"eval_samples_per_second": 8.616, |
|
"eval_steps_per_second": 4.313, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 55.50344467163086, |
|
"learning_rate": 3.821454612539642e-06, |
|
"loss": 2.6307, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6138713359832764, |
|
"eval_runtime": 74.8053, |
|
"eval_samples_per_second": 11.029, |
|
"eval_steps_per_second": 5.521, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 59.76543426513672, |
|
"learning_rate": 3.836096201093434e-06, |
|
"loss": 2.662, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.666452407836914, |
|
"eval_runtime": 95.67, |
|
"eval_samples_per_second": 8.623, |
|
"eval_steps_per_second": 4.317, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 76.32318878173828, |
|
"learning_rate": 3.8507377896472255e-06, |
|
"loss": 2.6497, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.642090320587158, |
|
"eval_runtime": 74.8226, |
|
"eval_samples_per_second": 11.026, |
|
"eval_steps_per_second": 5.52, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 33.338584899902344, |
|
"learning_rate": 3.865379378201018e-06, |
|
"loss": 2.6369, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6893866062164307, |
|
"eval_runtime": 95.5516, |
|
"eval_samples_per_second": 8.634, |
|
"eval_steps_per_second": 4.322, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 66.35092163085938, |
|
"learning_rate": 3.88002096675481e-06, |
|
"loss": 2.6394, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6854238510131836, |
|
"eval_runtime": 74.9093, |
|
"eval_samples_per_second": 11.013, |
|
"eval_steps_per_second": 5.513, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 49.542320251464844, |
|
"learning_rate": 3.894662555308601e-06, |
|
"loss": 2.6358, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.639975070953369, |
|
"eval_runtime": 74.7199, |
|
"eval_samples_per_second": 11.041, |
|
"eval_steps_per_second": 5.527, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 47.11073303222656, |
|
"learning_rate": 3.909304143862393e-06, |
|
"loss": 2.6567, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6232967376708984, |
|
"eval_runtime": 96.0037, |
|
"eval_samples_per_second": 8.593, |
|
"eval_steps_per_second": 4.302, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 43.28693389892578, |
|
"learning_rate": 3.923945732416185e-06, |
|
"loss": 2.6497, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6127660274505615, |
|
"eval_runtime": 74.7657, |
|
"eval_samples_per_second": 11.034, |
|
"eval_steps_per_second": 5.524, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 36.58399963378906, |
|
"learning_rate": 3.938587320969977e-06, |
|
"loss": 2.6327, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5707271099090576, |
|
"eval_runtime": 95.7149, |
|
"eval_samples_per_second": 8.619, |
|
"eval_steps_per_second": 4.315, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 34.05764389038086, |
|
"learning_rate": 3.953228909523768e-06, |
|
"loss": 2.5982, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5980732440948486, |
|
"eval_runtime": 74.9457, |
|
"eval_samples_per_second": 11.008, |
|
"eval_steps_per_second": 5.511, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 46.130401611328125, |
|
"learning_rate": 3.9678704980775595e-06, |
|
"loss": 2.6492, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5685958862304688, |
|
"eval_runtime": 79.9423, |
|
"eval_samples_per_second": 10.32, |
|
"eval_steps_per_second": 5.166, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 45.223052978515625, |
|
"learning_rate": 3.982512086631352e-06, |
|
"loss": 2.6187, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.686868190765381, |
|
"eval_runtime": 97.0067, |
|
"eval_samples_per_second": 8.505, |
|
"eval_steps_per_second": 4.257, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 68.07273864746094, |
|
"learning_rate": 3.997153675185143e-06, |
|
"loss": 2.6348, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.598745822906494, |
|
"eval_runtime": 74.7772, |
|
"eval_samples_per_second": 11.033, |
|
"eval_steps_per_second": 5.523, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 55.39958190917969, |
|
"learning_rate": 4.011795263738935e-06, |
|
"loss": 2.6339, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.793588399887085, |
|
"eval_runtime": 95.6218, |
|
"eval_samples_per_second": 8.628, |
|
"eval_steps_per_second": 4.319, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 46.7374382019043, |
|
"learning_rate": 4.0264368522927265e-06, |
|
"loss": 2.6154, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.607264995574951, |
|
"eval_runtime": 74.9463, |
|
"eval_samples_per_second": 11.008, |
|
"eval_steps_per_second": 5.511, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 45.974037170410156, |
|
"learning_rate": 4.041078440846519e-06, |
|
"loss": 2.6517, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.603538751602173, |
|
"eval_runtime": 74.8767, |
|
"eval_samples_per_second": 11.018, |
|
"eval_steps_per_second": 5.516, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 46.00688171386719, |
|
"learning_rate": 4.05572002940031e-06, |
|
"loss": 2.6418, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6350746154785156, |
|
"eval_runtime": 96.1011, |
|
"eval_samples_per_second": 8.585, |
|
"eval_steps_per_second": 4.298, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 73.03669738769531, |
|
"learning_rate": 4.070361617954101e-06, |
|
"loss": 2.6235, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5305094718933105, |
|
"eval_runtime": 74.8288, |
|
"eval_samples_per_second": 11.025, |
|
"eval_steps_per_second": 5.519, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 41.87487030029297, |
|
"learning_rate": 4.0850032065078935e-06, |
|
"loss": 2.6279, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5468451976776123, |
|
"eval_runtime": 95.6834, |
|
"eval_samples_per_second": 8.622, |
|
"eval_steps_per_second": 4.316, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 49.32594299316406, |
|
"learning_rate": 4.099644795061685e-06, |
|
"loss": 2.6161, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6616108417510986, |
|
"eval_runtime": 74.9047, |
|
"eval_samples_per_second": 11.014, |
|
"eval_steps_per_second": 5.514, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 53.41552734375, |
|
"learning_rate": 4.114286383615477e-06, |
|
"loss": 2.6354, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.677363872528076, |
|
"eval_runtime": 95.9109, |
|
"eval_samples_per_second": 8.602, |
|
"eval_steps_per_second": 4.306, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 39.76441955566406, |
|
"learning_rate": 4.128927972169269e-06, |
|
"loss": 2.6431, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5935537815093994, |
|
"eval_runtime": 75.0858, |
|
"eval_samples_per_second": 10.987, |
|
"eval_steps_per_second": 5.5, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 37.35220718383789, |
|
"learning_rate": 4.1435695607230605e-06, |
|
"loss": 2.6335, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5552804470062256, |
|
"eval_runtime": 74.727, |
|
"eval_samples_per_second": 11.04, |
|
"eval_steps_per_second": 5.527, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 82.63373565673828, |
|
"learning_rate": 4.158211149276853e-06, |
|
"loss": 2.6379, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6075127124786377, |
|
"eval_runtime": 95.7342, |
|
"eval_samples_per_second": 8.618, |
|
"eval_steps_per_second": 4.314, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 43.660396575927734, |
|
"learning_rate": 4.172852737830644e-06, |
|
"loss": 2.6288, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.646052360534668, |
|
"eval_runtime": 74.7867, |
|
"eval_samples_per_second": 11.031, |
|
"eval_steps_per_second": 5.522, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 84.4545669555664, |
|
"learning_rate": 4.187494326384436e-06, |
|
"loss": 2.6322, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6136434078216553, |
|
"eval_runtime": 95.9788, |
|
"eval_samples_per_second": 8.596, |
|
"eval_steps_per_second": 4.303, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 63.179813385009766, |
|
"learning_rate": 4.2021359149382275e-06, |
|
"loss": 2.6312, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6379587650299072, |
|
"eval_runtime": 74.9615, |
|
"eval_samples_per_second": 11.006, |
|
"eval_steps_per_second": 5.509, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 59.3049201965332, |
|
"learning_rate": 4.21677750349202e-06, |
|
"loss": 2.6047, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.60385799407959, |
|
"eval_runtime": 74.7494, |
|
"eval_samples_per_second": 11.037, |
|
"eval_steps_per_second": 5.525, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 43.60575866699219, |
|
"learning_rate": 4.231419092045811e-06, |
|
"loss": 2.6223, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.672245502471924, |
|
"eval_runtime": 95.4279, |
|
"eval_samples_per_second": 8.645, |
|
"eval_steps_per_second": 4.328, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 38.0888786315918, |
|
"learning_rate": 4.246060680599602e-06, |
|
"loss": 2.6231, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.523240089416504, |
|
"eval_runtime": 74.7272, |
|
"eval_samples_per_second": 11.04, |
|
"eval_steps_per_second": 5.527, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 47.68942642211914, |
|
"learning_rate": 4.2607022691533945e-06, |
|
"loss": 2.6282, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6057043075561523, |
|
"eval_runtime": 95.5262, |
|
"eval_samples_per_second": 8.636, |
|
"eval_steps_per_second": 4.323, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 44.74800491333008, |
|
"learning_rate": 4.275343857707186e-06, |
|
"loss": 2.6191, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.541307210922241, |
|
"eval_runtime": 74.9271, |
|
"eval_samples_per_second": 11.011, |
|
"eval_steps_per_second": 5.512, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 42.66278839111328, |
|
"learning_rate": 4.289985446260978e-06, |
|
"loss": 2.6013, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.644083023071289, |
|
"eval_runtime": 96.0242, |
|
"eval_samples_per_second": 8.592, |
|
"eval_steps_per_second": 4.301, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 35.30427551269531, |
|
"learning_rate": 4.304627034814769e-06, |
|
"loss": 2.6046, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.644864559173584, |
|
"eval_runtime": 74.8348, |
|
"eval_samples_per_second": 11.024, |
|
"eval_steps_per_second": 5.519, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 55.9124870300293, |
|
"learning_rate": 4.3192686233685615e-06, |
|
"loss": 2.6076, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.56697154045105, |
|
"eval_runtime": 75.651, |
|
"eval_samples_per_second": 10.905, |
|
"eval_steps_per_second": 5.459, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 44.417022705078125, |
|
"learning_rate": 4.333910211922353e-06, |
|
"loss": 2.6097, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5623719692230225, |
|
"eval_runtime": 95.6054, |
|
"eval_samples_per_second": 8.629, |
|
"eval_steps_per_second": 4.32, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 45.69809341430664, |
|
"learning_rate": 4.348551800476144e-06, |
|
"loss": 2.6227, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.730081558227539, |
|
"eval_runtime": 74.6867, |
|
"eval_samples_per_second": 11.046, |
|
"eval_steps_per_second": 5.53, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 39.38439178466797, |
|
"learning_rate": 4.363193389029936e-06, |
|
"loss": 2.6253, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.595525026321411, |
|
"eval_runtime": 95.8343, |
|
"eval_samples_per_second": 8.609, |
|
"eval_steps_per_second": 4.31, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 39.93308639526367, |
|
"learning_rate": 4.3778349775837285e-06, |
|
"loss": 2.6177, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6082019805908203, |
|
"eval_runtime": 74.7352, |
|
"eval_samples_per_second": 11.039, |
|
"eval_steps_per_second": 5.526, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 63.840782165527344, |
|
"learning_rate": 4.39247656613752e-06, |
|
"loss": 2.6275, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.5048768520355225, |
|
"eval_runtime": 75.2695, |
|
"eval_samples_per_second": 10.961, |
|
"eval_steps_per_second": 5.487, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 39.90363311767578, |
|
"learning_rate": 4.407118154691312e-06, |
|
"loss": 2.6198, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.526975631713867, |
|
"eval_runtime": 95.5077, |
|
"eval_samples_per_second": 8.638, |
|
"eval_steps_per_second": 4.324, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 44.53697204589844, |
|
"learning_rate": 4.421759743245103e-06, |
|
"loss": 2.6176, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.582653045654297, |
|
"eval_runtime": 74.761, |
|
"eval_samples_per_second": 11.035, |
|
"eval_steps_per_second": 5.524, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 67.8877944946289, |
|
"learning_rate": 4.4364013317988955e-06, |
|
"loss": 2.6304, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.6658573150634766, |
|
"eval_runtime": 95.8482, |
|
"eval_samples_per_second": 8.607, |
|
"eval_steps_per_second": 4.309, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 38.599971771240234, |
|
"learning_rate": 4.451042920352687e-06, |
|
"loss": 2.5968, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.568730354309082, |
|
"eval_runtime": 74.8521, |
|
"eval_samples_per_second": 11.022, |
|
"eval_steps_per_second": 5.518, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 51.99705505371094, |
|
"learning_rate": 4.465684508906479e-06, |
|
"loss": 2.6146, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.642608880996704, |
|
"eval_runtime": 74.7133, |
|
"eval_samples_per_second": 11.042, |
|
"eval_steps_per_second": 5.528, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 58.74104690551758, |
|
"learning_rate": 4.48032609746027e-06, |
|
"loss": 2.6105, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.552302837371826, |
|
"eval_runtime": 95.4477, |
|
"eval_samples_per_second": 8.643, |
|
"eval_steps_per_second": 4.327, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 37.7462158203125, |
|
"learning_rate": 4.4949676860140625e-06, |
|
"loss": 2.5963, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 2.533169746398926, |
|
"eval_runtime": 74.8923, |
|
"eval_samples_per_second": 11.016, |
|
"eval_steps_per_second": 5.515, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 39.184391021728516, |
|
"learning_rate": 4.509609274567854e-06, |
|
"loss": 2.6195, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5383059978485107, |
|
"eval_runtime": 95.6484, |
|
"eval_samples_per_second": 8.625, |
|
"eval_steps_per_second": 4.318, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 40.16402053833008, |
|
"learning_rate": 4.524250863121646e-06, |
|
"loss": 2.616, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.618685483932495, |
|
"eval_runtime": 74.7258, |
|
"eval_samples_per_second": 11.04, |
|
"eval_steps_per_second": 5.527, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 37.97475814819336, |
|
"learning_rate": 4.538892451675437e-06, |
|
"loss": 2.6022, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5952703952789307, |
|
"eval_runtime": 95.6397, |
|
"eval_samples_per_second": 8.626, |
|
"eval_steps_per_second": 4.318, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 53.99909210205078, |
|
"learning_rate": 4.553534040229229e-06, |
|
"loss": 2.5939, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5446667671203613, |
|
"eval_runtime": 74.682, |
|
"eval_samples_per_second": 11.047, |
|
"eval_steps_per_second": 5.53, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 45.01235580444336, |
|
"learning_rate": 4.568175628783021e-06, |
|
"loss": 2.6115, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.615330934524536, |
|
"eval_runtime": 74.6971, |
|
"eval_samples_per_second": 11.045, |
|
"eval_steps_per_second": 5.529, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 46.55107498168945, |
|
"learning_rate": 4.582817217336812e-06, |
|
"loss": 2.6223, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5805556774139404, |
|
"eval_runtime": 95.6057, |
|
"eval_samples_per_second": 8.629, |
|
"eval_steps_per_second": 4.32, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 51.727203369140625, |
|
"learning_rate": 4.597458805890604e-06, |
|
"loss": 2.6132, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.566413164138794, |
|
"eval_runtime": 74.7772, |
|
"eval_samples_per_second": 11.033, |
|
"eval_steps_per_second": 5.523, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 41.56552505493164, |
|
"learning_rate": 4.612100394444396e-06, |
|
"loss": 2.6014, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.597524881362915, |
|
"eval_runtime": 95.7861, |
|
"eval_samples_per_second": 8.613, |
|
"eval_steps_per_second": 4.312, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 35.12276840209961, |
|
"learning_rate": 4.626741982998188e-06, |
|
"loss": 2.6105, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5419981479644775, |
|
"eval_runtime": 74.7674, |
|
"eval_samples_per_second": 11.034, |
|
"eval_steps_per_second": 5.524, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 32.75632095336914, |
|
"learning_rate": 4.641383571551979e-06, |
|
"loss": 2.6076, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.614522695541382, |
|
"eval_runtime": 74.713, |
|
"eval_samples_per_second": 11.042, |
|
"eval_steps_per_second": 5.528, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 72.66175842285156, |
|
"learning_rate": 4.656025160105771e-06, |
|
"loss": 2.6031, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.636115312576294, |
|
"eval_runtime": 95.4957, |
|
"eval_samples_per_second": 8.639, |
|
"eval_steps_per_second": 4.325, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 56.245540618896484, |
|
"learning_rate": 4.670666748659563e-06, |
|
"loss": 2.5956, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.656207323074341, |
|
"eval_runtime": 74.7116, |
|
"eval_samples_per_second": 11.042, |
|
"eval_steps_per_second": 5.528, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 40.125789642333984, |
|
"learning_rate": 4.685308337213355e-06, |
|
"loss": 2.6136, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.6252167224884033, |
|
"eval_runtime": 95.5473, |
|
"eval_samples_per_second": 8.634, |
|
"eval_steps_per_second": 4.322, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 30.78213119506836, |
|
"learning_rate": 4.699949925767147e-06, |
|
"loss": 2.6112, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5026865005493164, |
|
"eval_runtime": 74.9699, |
|
"eval_samples_per_second": 11.004, |
|
"eval_steps_per_second": 5.509, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 43.604705810546875, |
|
"learning_rate": 4.714591514320938e-06, |
|
"loss": 2.6137, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5527615547180176, |
|
"eval_runtime": 74.7501, |
|
"eval_samples_per_second": 11.037, |
|
"eval_steps_per_second": 5.525, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 51.488155364990234, |
|
"learning_rate": 4.7292331028747305e-06, |
|
"loss": 2.6012, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.573512077331543, |
|
"eval_runtime": 96.1372, |
|
"eval_samples_per_second": 8.581, |
|
"eval_steps_per_second": 4.296, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 42.72104263305664, |
|
"learning_rate": 4.743874691428522e-06, |
|
"loss": 2.6057, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.625537633895874, |
|
"eval_runtime": 74.7491, |
|
"eval_samples_per_second": 11.037, |
|
"eval_steps_per_second": 5.525, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 45.53932571411133, |
|
"learning_rate": 4.758516279982313e-06, |
|
"loss": 2.6134, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.558391809463501, |
|
"eval_runtime": 95.5834, |
|
"eval_samples_per_second": 8.631, |
|
"eval_steps_per_second": 4.321, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 48.866275787353516, |
|
"learning_rate": 4.773157868536105e-06, |
|
"loss": 2.6041, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5378012657165527, |
|
"eval_runtime": 74.7093, |
|
"eval_samples_per_second": 11.043, |
|
"eval_steps_per_second": 5.528, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 35.03339385986328, |
|
"learning_rate": 4.787799457089897e-06, |
|
"loss": 2.5948, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.563570737838745, |
|
"eval_runtime": 95.4651, |
|
"eval_samples_per_second": 8.642, |
|
"eval_steps_per_second": 4.326, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 42.4345817565918, |
|
"learning_rate": 4.802441045643689e-06, |
|
"loss": 2.604, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.592179536819458, |
|
"eval_runtime": 74.6574, |
|
"eval_samples_per_second": 11.05, |
|
"eval_steps_per_second": 5.532, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 56.61566925048828, |
|
"learning_rate": 4.81708263419748e-06, |
|
"loss": 2.6091, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5877678394317627, |
|
"eval_runtime": 74.6766, |
|
"eval_samples_per_second": 11.048, |
|
"eval_steps_per_second": 5.531, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 43.33776092529297, |
|
"learning_rate": 4.831724222751272e-06, |
|
"loss": 2.6022, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.6100189685821533, |
|
"eval_runtime": 95.502, |
|
"eval_samples_per_second": 8.639, |
|
"eval_steps_per_second": 4.325, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 36.602630615234375, |
|
"learning_rate": 4.846365811305064e-06, |
|
"loss": 2.5769, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5708513259887695, |
|
"eval_runtime": 74.7367, |
|
"eval_samples_per_second": 11.039, |
|
"eval_steps_per_second": 5.526, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 43.569435119628906, |
|
"learning_rate": 4.861007399858855e-06, |
|
"loss": 2.5931, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.564967632293701, |
|
"eval_runtime": 95.3119, |
|
"eval_samples_per_second": 8.656, |
|
"eval_steps_per_second": 4.333, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 32.60426712036133, |
|
"learning_rate": 4.875648988412647e-06, |
|
"loss": 2.5972, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.572070360183716, |
|
"eval_runtime": 74.6662, |
|
"eval_samples_per_second": 11.049, |
|
"eval_steps_per_second": 5.531, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 29.82908821105957, |
|
"learning_rate": 4.8902905769664385e-06, |
|
"loss": 2.5879, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.556232452392578, |
|
"eval_runtime": 74.7002, |
|
"eval_samples_per_second": 11.044, |
|
"eval_steps_per_second": 5.529, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 42.57914352416992, |
|
"learning_rate": 4.904932165520231e-06, |
|
"loss": 2.6053, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5889346599578857, |
|
"eval_runtime": 95.415, |
|
"eval_samples_per_second": 8.646, |
|
"eval_steps_per_second": 4.328, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 41.65904235839844, |
|
"learning_rate": 4.919573754074022e-06, |
|
"loss": 2.5853, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.564220428466797, |
|
"eval_runtime": 75.0285, |
|
"eval_samples_per_second": 10.996, |
|
"eval_steps_per_second": 5.505, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 36.339805603027344, |
|
"learning_rate": 4.934215342627814e-06, |
|
"loss": 2.601, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.616079568862915, |
|
"eval_runtime": 95.5747, |
|
"eval_samples_per_second": 8.632, |
|
"eval_steps_per_second": 4.321, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 45.61557388305664, |
|
"learning_rate": 4.948856931181606e-06, |
|
"loss": 2.5803, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.6592965126037598, |
|
"eval_runtime": 74.8749, |
|
"eval_samples_per_second": 11.018, |
|
"eval_steps_per_second": 5.516, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 52.03131103515625, |
|
"learning_rate": 4.963498519735398e-06, |
|
"loss": 2.593, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.596527099609375, |
|
"eval_runtime": 95.738, |
|
"eval_samples_per_second": 8.617, |
|
"eval_steps_per_second": 4.314, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 35.49449157714844, |
|
"learning_rate": 4.97814010828919e-06, |
|
"loss": 2.5997, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.633748769760132, |
|
"eval_runtime": 74.5989, |
|
"eval_samples_per_second": 11.059, |
|
"eval_steps_per_second": 5.536, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 54.275081634521484, |
|
"learning_rate": 4.992781696842981e-06, |
|
"loss": 2.5905, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.56416392326355, |
|
"eval_runtime": 74.6523, |
|
"eval_samples_per_second": 11.051, |
|
"eval_steps_per_second": 5.532, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 41.0302619934082, |
|
"learning_rate": 5.007423285396773e-06, |
|
"loss": 2.5862, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.618197202682495, |
|
"eval_runtime": 95.5993, |
|
"eval_samples_per_second": 8.63, |
|
"eval_steps_per_second": 4.32, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 63.483768463134766, |
|
"learning_rate": 5.0220648739505655e-06, |
|
"loss": 2.5804, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5387542247772217, |
|
"eval_runtime": 74.7839, |
|
"eval_samples_per_second": 11.032, |
|
"eval_steps_per_second": 5.523, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 44.93665313720703, |
|
"learning_rate": 5.036706462504356e-06, |
|
"loss": 2.5952, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.513009548187256, |
|
"eval_runtime": 96.0331, |
|
"eval_samples_per_second": 8.591, |
|
"eval_steps_per_second": 4.301, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 32.70128631591797, |
|
"learning_rate": 5.051348051058148e-06, |
|
"loss": 2.5809, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.621497631072998, |
|
"eval_runtime": 74.6905, |
|
"eval_samples_per_second": 11.046, |
|
"eval_steps_per_second": 5.529, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 48.31204605102539, |
|
"learning_rate": 5.06598963961194e-06, |
|
"loss": 2.5855, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.7315196990966797, |
|
"eval_runtime": 74.6218, |
|
"eval_samples_per_second": 11.056, |
|
"eval_steps_per_second": 5.535, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 35.62407302856445, |
|
"learning_rate": 5.080631228165731e-06, |
|
"loss": 2.6069, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.50812029838562, |
|
"eval_runtime": 95.4592, |
|
"eval_samples_per_second": 8.642, |
|
"eval_steps_per_second": 4.326, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 42.37205505371094, |
|
"learning_rate": 5.095272816719523e-06, |
|
"loss": 2.5716, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.4942855834960938, |
|
"eval_runtime": 74.6742, |
|
"eval_samples_per_second": 11.048, |
|
"eval_steps_per_second": 5.531, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 47.90432357788086, |
|
"learning_rate": 5.109914405273315e-06, |
|
"loss": 2.5962, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.545471668243408, |
|
"eval_runtime": 95.481, |
|
"eval_samples_per_second": 8.64, |
|
"eval_steps_per_second": 4.325, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 57.815704345703125, |
|
"learning_rate": 5.124555993827107e-06, |
|
"loss": 2.5763, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.4934654235839844, |
|
"eval_runtime": 74.8205, |
|
"eval_samples_per_second": 11.026, |
|
"eval_steps_per_second": 5.52, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 46.640071868896484, |
|
"learning_rate": 5.139197582380898e-06, |
|
"loss": 2.5946, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5673718452453613, |
|
"eval_runtime": 74.6438, |
|
"eval_samples_per_second": 11.052, |
|
"eval_steps_per_second": 5.533, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 39.76240539550781, |
|
"learning_rate": 5.15383917093469e-06, |
|
"loss": 2.5928, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5679984092712402, |
|
"eval_runtime": 74.6621, |
|
"eval_samples_per_second": 11.05, |
|
"eval_steps_per_second": 5.532, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 78.83277893066406, |
|
"learning_rate": 5.168480759488482e-06, |
|
"loss": 2.5809, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5745747089385986, |
|
"eval_runtime": 88.8716, |
|
"eval_samples_per_second": 9.283, |
|
"eval_steps_per_second": 4.647, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 31.719282150268555, |
|
"learning_rate": 5.1831223480422734e-06, |
|
"loss": 2.5817, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.542511224746704, |
|
"eval_runtime": 74.7387, |
|
"eval_samples_per_second": 11.038, |
|
"eval_steps_per_second": 5.526, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 45.4350471496582, |
|
"learning_rate": 5.197763936596066e-06, |
|
"loss": 2.5853, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.6151788234710693, |
|
"eval_runtime": 88.7294, |
|
"eval_samples_per_second": 9.298, |
|
"eval_steps_per_second": 4.655, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 43.207645416259766, |
|
"learning_rate": 5.212405525149857e-06, |
|
"loss": 2.5861, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5360398292541504, |
|
"eval_runtime": 74.7605, |
|
"eval_samples_per_second": 11.035, |
|
"eval_steps_per_second": 5.524, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 32.29416275024414, |
|
"learning_rate": 5.227047113703649e-06, |
|
"loss": 2.5792, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.475013494491577, |
|
"eval_runtime": 74.695, |
|
"eval_samples_per_second": 11.045, |
|
"eval_steps_per_second": 5.529, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 41.070499420166016, |
|
"learning_rate": 5.2416887022574404e-06, |
|
"loss": 2.5846, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.6203396320343018, |
|
"eval_runtime": 88.6304, |
|
"eval_samples_per_second": 9.308, |
|
"eval_steps_per_second": 4.66, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 38.02358627319336, |
|
"learning_rate": 5.256330290811233e-06, |
|
"loss": 2.5994, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.52642560005188, |
|
"eval_runtime": 74.6571, |
|
"eval_samples_per_second": 11.051, |
|
"eval_steps_per_second": 5.532, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 34.377044677734375, |
|
"learning_rate": 5.270971879365025e-06, |
|
"loss": 2.5857, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5763497352600098, |
|
"eval_runtime": 88.7937, |
|
"eval_samples_per_second": 9.291, |
|
"eval_steps_per_second": 4.651, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 37.10789489746094, |
|
"learning_rate": 5.285613467918815e-06, |
|
"loss": 2.5775, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.560091257095337, |
|
"eval_runtime": 74.7099, |
|
"eval_samples_per_second": 11.043, |
|
"eval_steps_per_second": 5.528, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 35.48027420043945, |
|
"learning_rate": 5.3002550564726074e-06, |
|
"loss": 2.5951, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.540125846862793, |
|
"eval_runtime": 74.6487, |
|
"eval_samples_per_second": 11.052, |
|
"eval_steps_per_second": 5.533, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 53.1136589050293, |
|
"learning_rate": 5.3148966450264e-06, |
|
"loss": 2.5838, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5976662635803223, |
|
"eval_runtime": 88.7661, |
|
"eval_samples_per_second": 9.294, |
|
"eval_steps_per_second": 4.653, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 47.28069305419922, |
|
"learning_rate": 5.32953823358019e-06, |
|
"loss": 2.6007, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.577742338180542, |
|
"eval_runtime": 74.7236, |
|
"eval_samples_per_second": 11.041, |
|
"eval_steps_per_second": 5.527, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 37.87571716308594, |
|
"learning_rate": 5.344179822133982e-06, |
|
"loss": 2.5998, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5661630630493164, |
|
"eval_runtime": 94.3883, |
|
"eval_samples_per_second": 8.74, |
|
"eval_steps_per_second": 4.376, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 50.52229309082031, |
|
"learning_rate": 5.3588214106877744e-06, |
|
"loss": 2.5764, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5430634021759033, |
|
"eval_runtime": 74.7841, |
|
"eval_samples_per_second": 11.032, |
|
"eval_steps_per_second": 5.523, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 33.59963607788086, |
|
"learning_rate": 5.373462999241567e-06, |
|
"loss": 2.5736, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5755114555358887, |
|
"eval_runtime": 88.7229, |
|
"eval_samples_per_second": 9.299, |
|
"eval_steps_per_second": 4.655, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 39.84903335571289, |
|
"learning_rate": 5.388104587795357e-06, |
|
"loss": 2.5516, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.574408769607544, |
|
"eval_runtime": 74.6854, |
|
"eval_samples_per_second": 11.046, |
|
"eval_steps_per_second": 5.53, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 36.43976593017578, |
|
"learning_rate": 5.402746176349149e-06, |
|
"loss": 2.5917, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5506107807159424, |
|
"eval_runtime": 74.6219, |
|
"eval_samples_per_second": 11.056, |
|
"eval_steps_per_second": 5.535, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 32.55131149291992, |
|
"learning_rate": 5.4173877649029414e-06, |
|
"loss": 2.5869, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5814473628997803, |
|
"eval_runtime": 88.8501, |
|
"eval_samples_per_second": 9.285, |
|
"eval_steps_per_second": 4.648, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 58.352046966552734, |
|
"learning_rate": 5.432029353456733e-06, |
|
"loss": 2.5719, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.650160312652588, |
|
"eval_runtime": 74.7626, |
|
"eval_samples_per_second": 11.035, |
|
"eval_steps_per_second": 5.524, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 35.47008514404297, |
|
"learning_rate": 5.446670942010525e-06, |
|
"loss": 2.5916, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.493919610977173, |
|
"eval_runtime": 88.7051, |
|
"eval_samples_per_second": 9.3, |
|
"eval_steps_per_second": 4.656, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 52.115318298339844, |
|
"learning_rate": 5.461312530564316e-06, |
|
"loss": 2.5711, |
|
"step": 373000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.626227855682373, |
|
"eval_runtime": 74.7738, |
|
"eval_samples_per_second": 11.033, |
|
"eval_steps_per_second": 5.523, |
|
"step": 373000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 55.728118896484375, |
|
"learning_rate": 5.4759541191181084e-06, |
|
"loss": 2.5898, |
|
"step": 374000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.645984172821045, |
|
"eval_runtime": 74.6433, |
|
"eval_samples_per_second": 11.053, |
|
"eval_steps_per_second": 5.533, |
|
"step": 374000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 37.5969123840332, |
|
"learning_rate": 5.4905957076719e-06, |
|
"loss": 2.613, |
|
"step": 375000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 2.5514795780181885, |
|
"eval_runtime": 89.1041, |
|
"eval_samples_per_second": 9.259, |
|
"eval_steps_per_second": 4.635, |
|
"step": 375000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 50.98681640625, |
|
"learning_rate": 5.505237296225692e-06, |
|
"loss": 2.5843, |
|
"step": 376000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5230400562286377, |
|
"eval_runtime": 74.666, |
|
"eval_samples_per_second": 11.049, |
|
"eval_steps_per_second": 5.531, |
|
"step": 376000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 64.90864562988281, |
|
"learning_rate": 5.519878884779484e-06, |
|
"loss": 2.5803, |
|
"step": 377000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.566574811935425, |
|
"eval_runtime": 88.7387, |
|
"eval_samples_per_second": 9.297, |
|
"eval_steps_per_second": 4.654, |
|
"step": 377000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 31.084749221801758, |
|
"learning_rate": 5.534520473333275e-06, |
|
"loss": 2.5823, |
|
"step": 378000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.599928140640259, |
|
"eval_runtime": 74.8013, |
|
"eval_samples_per_second": 11.029, |
|
"eval_steps_per_second": 5.521, |
|
"step": 378000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 37.018863677978516, |
|
"learning_rate": 5.549162061887067e-06, |
|
"loss": 2.5735, |
|
"step": 379000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5852713584899902, |
|
"eval_runtime": 74.637, |
|
"eval_samples_per_second": 11.053, |
|
"eval_steps_per_second": 5.533, |
|
"step": 379000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 47.78419876098633, |
|
"learning_rate": 5.563803650440859e-06, |
|
"loss": 2.5962, |
|
"step": 380000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5050272941589355, |
|
"eval_runtime": 89.0068, |
|
"eval_samples_per_second": 9.269, |
|
"eval_steps_per_second": 4.64, |
|
"step": 380000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 49.62971115112305, |
|
"learning_rate": 5.578445238994651e-06, |
|
"loss": 2.572, |
|
"step": 381000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5525805950164795, |
|
"eval_runtime": 74.676, |
|
"eval_samples_per_second": 11.048, |
|
"eval_steps_per_second": 5.531, |
|
"step": 381000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 41.620235443115234, |
|
"learning_rate": 5.593086827548442e-06, |
|
"loss": 2.583, |
|
"step": 382000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5529444217681885, |
|
"eval_runtime": 88.7249, |
|
"eval_samples_per_second": 9.298, |
|
"eval_steps_per_second": 4.655, |
|
"step": 382000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 32.2930793762207, |
|
"learning_rate": 5.607728416102234e-06, |
|
"loss": 2.5632, |
|
"step": 383000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5800976753234863, |
|
"eval_runtime": 74.7218, |
|
"eval_samples_per_second": 11.041, |
|
"eval_steps_per_second": 5.527, |
|
"step": 383000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 48.742530822753906, |
|
"learning_rate": 5.622370004656026e-06, |
|
"loss": 2.5652, |
|
"step": 384000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5238680839538574, |
|
"eval_runtime": 88.7907, |
|
"eval_samples_per_second": 9.292, |
|
"eval_steps_per_second": 4.651, |
|
"step": 384000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 35.834388732910156, |
|
"learning_rate": 5.637011593209816e-06, |
|
"loss": 2.5829, |
|
"step": 385000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.4672865867614746, |
|
"eval_runtime": 74.6529, |
|
"eval_samples_per_second": 11.051, |
|
"eval_steps_per_second": 5.532, |
|
"step": 385000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 68.92035675048828, |
|
"learning_rate": 5.651653181763609e-06, |
|
"loss": 2.5608, |
|
"step": 386000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5575194358825684, |
|
"eval_runtime": 74.5939, |
|
"eval_samples_per_second": 11.06, |
|
"eval_steps_per_second": 5.537, |
|
"step": 386000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 50.190208435058594, |
|
"learning_rate": 5.666294770317401e-06, |
|
"loss": 2.5675, |
|
"step": 387000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5356907844543457, |
|
"eval_runtime": 88.6507, |
|
"eval_samples_per_second": 9.306, |
|
"eval_steps_per_second": 4.659, |
|
"step": 387000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 44.1893424987793, |
|
"learning_rate": 5.680936358871193e-06, |
|
"loss": 2.5636, |
|
"step": 388000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.586111545562744, |
|
"eval_runtime": 74.6289, |
|
"eval_samples_per_second": 11.055, |
|
"eval_steps_per_second": 5.534, |
|
"step": 388000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 40.74356460571289, |
|
"learning_rate": 5.695577947424984e-06, |
|
"loss": 2.556, |
|
"step": 389000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5448148250579834, |
|
"eval_runtime": 88.7042, |
|
"eval_samples_per_second": 9.301, |
|
"eval_steps_per_second": 4.656, |
|
"step": 389000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 50.917518615722656, |
|
"learning_rate": 5.710219535978776e-06, |
|
"loss": 2.5693, |
|
"step": 390000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.6050689220428467, |
|
"eval_runtime": 74.8093, |
|
"eval_samples_per_second": 11.028, |
|
"eval_steps_per_second": 5.521, |
|
"step": 390000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 30.194284439086914, |
|
"learning_rate": 5.724861124532568e-06, |
|
"loss": 2.58, |
|
"step": 391000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5194711685180664, |
|
"eval_runtime": 74.7688, |
|
"eval_samples_per_second": 11.034, |
|
"eval_steps_per_second": 5.524, |
|
"step": 391000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 35.72700119018555, |
|
"learning_rate": 5.739502713086359e-06, |
|
"loss": 2.5755, |
|
"step": 392000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.473149538040161, |
|
"eval_runtime": 88.7279, |
|
"eval_samples_per_second": 9.298, |
|
"eval_steps_per_second": 4.655, |
|
"step": 392000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 40.50430679321289, |
|
"learning_rate": 5.754144301640151e-06, |
|
"loss": 2.5705, |
|
"step": 393000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.583292245864868, |
|
"eval_runtime": 74.6506, |
|
"eval_samples_per_second": 11.051, |
|
"eval_steps_per_second": 5.532, |
|
"step": 393000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 38.33802032470703, |
|
"learning_rate": 5.7687858901939434e-06, |
|
"loss": 2.5743, |
|
"step": 394000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.6046974658966064, |
|
"eval_runtime": 88.8354, |
|
"eval_samples_per_second": 9.287, |
|
"eval_steps_per_second": 4.649, |
|
"step": 394000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 36.271080017089844, |
|
"learning_rate": 5.783427478747735e-06, |
|
"loss": 2.5732, |
|
"step": 395000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5364186763763428, |
|
"eval_runtime": 74.7791, |
|
"eval_samples_per_second": 11.032, |
|
"eval_steps_per_second": 5.523, |
|
"step": 395000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 41.960426330566406, |
|
"learning_rate": 5.798069067301526e-06, |
|
"loss": 2.573, |
|
"step": 396000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.4924538135528564, |
|
"eval_runtime": 74.6168, |
|
"eval_samples_per_second": 11.056, |
|
"eval_steps_per_second": 5.535, |
|
"step": 396000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 34.67692565917969, |
|
"learning_rate": 5.812710655855318e-06, |
|
"loss": 2.5909, |
|
"step": 397000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.6636910438537598, |
|
"eval_runtime": 88.6833, |
|
"eval_samples_per_second": 9.303, |
|
"eval_steps_per_second": 4.657, |
|
"step": 397000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 35.99394989013672, |
|
"learning_rate": 5.8273522444091104e-06, |
|
"loss": 2.5664, |
|
"step": 398000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.502495288848877, |
|
"eval_runtime": 74.5822, |
|
"eval_samples_per_second": 11.062, |
|
"eval_steps_per_second": 5.538, |
|
"step": 398000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 33.0992546081543, |
|
"learning_rate": 5.841993832962901e-06, |
|
"loss": 2.5634, |
|
"step": 399000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.4681994915008545, |
|
"eval_runtime": 88.6562, |
|
"eval_samples_per_second": 9.306, |
|
"eval_steps_per_second": 4.658, |
|
"step": 399000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 48.14725875854492, |
|
"learning_rate": 5.856635421516693e-06, |
|
"loss": 2.5751, |
|
"step": 400000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.547281503677368, |
|
"eval_runtime": 74.6694, |
|
"eval_samples_per_second": 11.049, |
|
"eval_steps_per_second": 5.531, |
|
"step": 400000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 27.271160125732422, |
|
"learning_rate": 5.871277010070485e-06, |
|
"loss": 2.5798, |
|
"step": 401000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.4551727771759033, |
|
"eval_runtime": 88.5982, |
|
"eval_samples_per_second": 9.312, |
|
"eval_steps_per_second": 4.661, |
|
"step": 401000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 42.731956481933594, |
|
"learning_rate": 5.8859185986242774e-06, |
|
"loss": 2.5643, |
|
"step": 402000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.467008113861084, |
|
"eval_runtime": 74.7978, |
|
"eval_samples_per_second": 11.03, |
|
"eval_steps_per_second": 5.522, |
|
"step": 402000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 37.62224578857422, |
|
"learning_rate": 5.900560187178068e-06, |
|
"loss": 2.5783, |
|
"step": 403000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.4793624877929688, |
|
"eval_runtime": 74.6072, |
|
"eval_samples_per_second": 11.058, |
|
"eval_steps_per_second": 5.536, |
|
"step": 403000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 33.82321548461914, |
|
"learning_rate": 5.91520177573186e-06, |
|
"loss": 2.5599, |
|
"step": 404000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5051112174987793, |
|
"eval_runtime": 88.887, |
|
"eval_samples_per_second": 9.281, |
|
"eval_steps_per_second": 4.646, |
|
"step": 404000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 38.417205810546875, |
|
"learning_rate": 5.929843364285652e-06, |
|
"loss": 2.5591, |
|
"step": 405000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.4766504764556885, |
|
"eval_runtime": 74.6288, |
|
"eval_samples_per_second": 11.055, |
|
"eval_steps_per_second": 5.534, |
|
"step": 405000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 37.80118179321289, |
|
"learning_rate": 5.944484952839444e-06, |
|
"loss": 2.5938, |
|
"step": 406000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.46614146232605, |
|
"eval_runtime": 88.8603, |
|
"eval_samples_per_second": 9.284, |
|
"eval_steps_per_second": 4.648, |
|
"step": 406000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 31.764890670776367, |
|
"learning_rate": 5.959126541393235e-06, |
|
"loss": 2.5676, |
|
"step": 407000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.558773994445801, |
|
"eval_runtime": 74.7445, |
|
"eval_samples_per_second": 11.038, |
|
"eval_steps_per_second": 5.525, |
|
"step": 407000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 26.849288940429688, |
|
"learning_rate": 5.973768129947027e-06, |
|
"loss": 2.5805, |
|
"step": 408000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.568739652633667, |
|
"eval_runtime": 74.605, |
|
"eval_samples_per_second": 11.058, |
|
"eval_steps_per_second": 5.536, |
|
"step": 408000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 34.30747604370117, |
|
"learning_rate": 5.988409718500819e-06, |
|
"loss": 2.5582, |
|
"step": 409000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5398943424224854, |
|
"eval_runtime": 89.3871, |
|
"eval_samples_per_second": 9.23, |
|
"eval_steps_per_second": 4.62, |
|
"step": 409000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 28.676551818847656, |
|
"learning_rate": 6.003051307054611e-06, |
|
"loss": 2.5675, |
|
"step": 410000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5639803409576416, |
|
"eval_runtime": 74.7489, |
|
"eval_samples_per_second": 11.037, |
|
"eval_steps_per_second": 5.525, |
|
"step": 410000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 31.36862564086914, |
|
"learning_rate": 6.017692895608403e-06, |
|
"loss": 2.558, |
|
"step": 411000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.550550937652588, |
|
"eval_runtime": 89.1468, |
|
"eval_samples_per_second": 9.254, |
|
"eval_steps_per_second": 4.633, |
|
"step": 411000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 53.808528900146484, |
|
"learning_rate": 6.032334484162194e-06, |
|
"loss": 2.5706, |
|
"step": 412000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5042293071746826, |
|
"eval_runtime": 74.77, |
|
"eval_samples_per_second": 11.034, |
|
"eval_steps_per_second": 5.524, |
|
"step": 412000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 33.744991302490234, |
|
"learning_rate": 6.046976072715985e-06, |
|
"loss": 2.5493, |
|
"step": 413000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.567261219024658, |
|
"eval_runtime": 88.8853, |
|
"eval_samples_per_second": 9.282, |
|
"eval_steps_per_second": 4.646, |
|
"step": 413000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 35.459835052490234, |
|
"learning_rate": 6.061617661269778e-06, |
|
"loss": 2.5576, |
|
"step": 414000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.517771005630493, |
|
"eval_runtime": 74.6445, |
|
"eval_samples_per_second": 11.052, |
|
"eval_steps_per_second": 5.533, |
|
"step": 414000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 29.082275390625, |
|
"learning_rate": 6.07625924982357e-06, |
|
"loss": 2.5644, |
|
"step": 415000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.6622846126556396, |
|
"eval_runtime": 74.6381, |
|
"eval_samples_per_second": 11.053, |
|
"eval_steps_per_second": 5.533, |
|
"step": 415000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 33.123802185058594, |
|
"learning_rate": 6.090900838377362e-06, |
|
"loss": 2.5636, |
|
"step": 416000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.468601942062378, |
|
"eval_runtime": 88.6295, |
|
"eval_samples_per_second": 9.308, |
|
"eval_steps_per_second": 4.66, |
|
"step": 416000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 29.530460357666016, |
|
"learning_rate": 6.105542426931152e-06, |
|
"loss": 2.5537, |
|
"step": 417000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.502995014190674, |
|
"eval_runtime": 74.6451, |
|
"eval_samples_per_second": 11.052, |
|
"eval_steps_per_second": 5.533, |
|
"step": 417000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 34.51593780517578, |
|
"learning_rate": 6.120184015484945e-06, |
|
"loss": 2.5498, |
|
"step": 418000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.4599108695983887, |
|
"eval_runtime": 89.0393, |
|
"eval_samples_per_second": 9.266, |
|
"eval_steps_per_second": 4.638, |
|
"step": 418000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 69.78622436523438, |
|
"learning_rate": 6.134825604038737e-06, |
|
"loss": 2.5641, |
|
"step": 419000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.533233880996704, |
|
"eval_runtime": 74.7606, |
|
"eval_samples_per_second": 11.035, |
|
"eval_steps_per_second": 5.524, |
|
"step": 419000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 37.08543014526367, |
|
"learning_rate": 6.149467192592527e-06, |
|
"loss": 2.5594, |
|
"step": 420000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.632356882095337, |
|
"eval_runtime": 74.6365, |
|
"eval_samples_per_second": 11.054, |
|
"eval_steps_per_second": 5.533, |
|
"step": 420000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 36.292423248291016, |
|
"learning_rate": 6.164108781146319e-06, |
|
"loss": 2.5503, |
|
"step": 421000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5228469371795654, |
|
"eval_runtime": 88.5975, |
|
"eval_samples_per_second": 9.312, |
|
"eval_steps_per_second": 4.662, |
|
"step": 421000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 34.91773986816406, |
|
"learning_rate": 6.178750369700112e-06, |
|
"loss": 2.556, |
|
"step": 422000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5192246437072754, |
|
"eval_runtime": 74.6474, |
|
"eval_samples_per_second": 11.052, |
|
"eval_steps_per_second": 5.533, |
|
"step": 422000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 28.986318588256836, |
|
"learning_rate": 6.193391958253904e-06, |
|
"loss": 2.5706, |
|
"step": 423000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.4696004390716553, |
|
"eval_runtime": 88.7451, |
|
"eval_samples_per_second": 9.296, |
|
"eval_steps_per_second": 4.654, |
|
"step": 423000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 29.763254165649414, |
|
"learning_rate": 6.208033546807695e-06, |
|
"loss": 2.5512, |
|
"step": 424000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.575294256210327, |
|
"eval_runtime": 74.8028, |
|
"eval_samples_per_second": 11.029, |
|
"eval_steps_per_second": 5.521, |
|
"step": 424000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 36.24395751953125, |
|
"learning_rate": 6.222675135361486e-06, |
|
"loss": 2.5634, |
|
"step": 425000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.550208806991577, |
|
"eval_runtime": 74.6793, |
|
"eval_samples_per_second": 11.047, |
|
"eval_steps_per_second": 5.53, |
|
"step": 425000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 29.8460636138916, |
|
"learning_rate": 6.237316723915279e-06, |
|
"loss": 2.5697, |
|
"step": 426000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5694708824157715, |
|
"eval_runtime": 88.7935, |
|
"eval_samples_per_second": 9.291, |
|
"eval_steps_per_second": 4.651, |
|
"step": 426000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 50.25217819213867, |
|
"learning_rate": 6.25195831246907e-06, |
|
"loss": 2.551, |
|
"step": 427000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.550706624984741, |
|
"eval_runtime": 74.6395, |
|
"eval_samples_per_second": 11.053, |
|
"eval_steps_per_second": 5.533, |
|
"step": 427000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 32.787391662597656, |
|
"learning_rate": 6.266599901022862e-06, |
|
"loss": 2.5591, |
|
"step": 428000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5466034412384033, |
|
"eval_runtime": 88.7357, |
|
"eval_samples_per_second": 9.297, |
|
"eval_steps_per_second": 4.654, |
|
"step": 428000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 39.36776351928711, |
|
"learning_rate": 6.281241489576653e-06, |
|
"loss": 2.5547, |
|
"step": 429000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.4820621013641357, |
|
"eval_runtime": 74.7807, |
|
"eval_samples_per_second": 11.032, |
|
"eval_steps_per_second": 5.523, |
|
"step": 429000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 39.31442642211914, |
|
"learning_rate": 6.295883078130446e-06, |
|
"loss": 2.5622, |
|
"step": 430000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.617039918899536, |
|
"eval_runtime": 89.1322, |
|
"eval_samples_per_second": 9.256, |
|
"eval_steps_per_second": 4.634, |
|
"step": 430000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 46.105777740478516, |
|
"learning_rate": 6.310524666684237e-06, |
|
"loss": 2.5665, |
|
"step": 431000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.545649290084839, |
|
"eval_runtime": 74.5958, |
|
"eval_samples_per_second": 11.06, |
|
"eval_steps_per_second": 5.537, |
|
"step": 431000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 36.128662109375, |
|
"learning_rate": 6.325166255238029e-06, |
|
"loss": 2.5821, |
|
"step": 432000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5234498977661133, |
|
"eval_runtime": 74.6321, |
|
"eval_samples_per_second": 11.054, |
|
"eval_steps_per_second": 5.534, |
|
"step": 432000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 25.52347183227539, |
|
"learning_rate": 6.339807843791821e-06, |
|
"loss": 2.5569, |
|
"step": 433000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5337672233581543, |
|
"eval_runtime": 88.6702, |
|
"eval_samples_per_second": 9.304, |
|
"eval_steps_per_second": 4.658, |
|
"step": 433000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 41.64098358154297, |
|
"learning_rate": 6.354449432345612e-06, |
|
"loss": 2.5632, |
|
"step": 434000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5711653232574463, |
|
"eval_runtime": 74.6523, |
|
"eval_samples_per_second": 11.051, |
|
"eval_steps_per_second": 5.532, |
|
"step": 434000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 41.7122688293457, |
|
"learning_rate": 6.369091020899404e-06, |
|
"loss": 2.588, |
|
"step": 435000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.548222541809082, |
|
"eval_runtime": 89.057, |
|
"eval_samples_per_second": 9.264, |
|
"eval_steps_per_second": 4.637, |
|
"step": 435000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 40.96338653564453, |
|
"learning_rate": 6.383732609453196e-06, |
|
"loss": 2.5794, |
|
"step": 436000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.4672553539276123, |
|
"eval_runtime": 74.8012, |
|
"eval_samples_per_second": 11.029, |
|
"eval_steps_per_second": 5.521, |
|
"step": 436000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 30.040346145629883, |
|
"learning_rate": 6.398374198006988e-06, |
|
"loss": 2.5489, |
|
"step": 437000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5354793071746826, |
|
"eval_runtime": 74.6068, |
|
"eval_samples_per_second": 11.058, |
|
"eval_steps_per_second": 5.536, |
|
"step": 437000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 42.75343322753906, |
|
"learning_rate": 6.413015786560779e-06, |
|
"loss": 2.5638, |
|
"step": 438000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5378284454345703, |
|
"eval_runtime": 89.2076, |
|
"eval_samples_per_second": 9.248, |
|
"eval_steps_per_second": 4.63, |
|
"step": 438000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 43.05962371826172, |
|
"learning_rate": 6.427657375114571e-06, |
|
"loss": 2.5384, |
|
"step": 439000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5371294021606445, |
|
"eval_runtime": 74.6255, |
|
"eval_samples_per_second": 11.055, |
|
"eval_steps_per_second": 5.534, |
|
"step": 439000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 36.83192443847656, |
|
"learning_rate": 6.442298963668363e-06, |
|
"loss": 2.5481, |
|
"step": 440000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5858242511749268, |
|
"eval_runtime": 88.6555, |
|
"eval_samples_per_second": 9.306, |
|
"eval_steps_per_second": 4.658, |
|
"step": 440000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 33.47575759887695, |
|
"learning_rate": 6.456940552222154e-06, |
|
"loss": 2.5498, |
|
"step": 441000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5402233600616455, |
|
"eval_runtime": 74.7407, |
|
"eval_samples_per_second": 11.038, |
|
"eval_steps_per_second": 5.526, |
|
"step": 441000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 41.94609451293945, |
|
"learning_rate": 6.471582140775946e-06, |
|
"loss": 2.561, |
|
"step": 442000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.605259895324707, |
|
"eval_runtime": 74.9512, |
|
"eval_samples_per_second": 11.007, |
|
"eval_steps_per_second": 5.51, |
|
"step": 442000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 19.982908248901367, |
|
"learning_rate": 6.486223729329738e-06, |
|
"loss": 2.5588, |
|
"step": 443000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 2.5843842029571533, |
|
"eval_runtime": 88.7552, |
|
"eval_samples_per_second": 9.295, |
|
"eval_steps_per_second": 4.653, |
|
"step": 443000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 57.40386199951172, |
|
"learning_rate": 6.50086531788353e-06, |
|
"loss": 2.5735, |
|
"step": 444000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.6279358863830566, |
|
"eval_runtime": 74.6147, |
|
"eval_samples_per_second": 11.057, |
|
"eval_steps_per_second": 5.535, |
|
"step": 444000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 41.1192741394043, |
|
"learning_rate": 6.515506906437321e-06, |
|
"loss": 2.5533, |
|
"step": 445000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5092525482177734, |
|
"eval_runtime": 89.3712, |
|
"eval_samples_per_second": 9.231, |
|
"eval_steps_per_second": 4.621, |
|
"step": 445000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 32.98606491088867, |
|
"learning_rate": 6.530148494991113e-06, |
|
"loss": 2.5607, |
|
"step": 446000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5889837741851807, |
|
"eval_runtime": 74.6817, |
|
"eval_samples_per_second": 11.047, |
|
"eval_steps_per_second": 5.53, |
|
"step": 446000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 41.232357025146484, |
|
"learning_rate": 6.544790083544905e-06, |
|
"loss": 2.5588, |
|
"step": 447000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.467031717300415, |
|
"eval_runtime": 88.7084, |
|
"eval_samples_per_second": 9.3, |
|
"eval_steps_per_second": 4.656, |
|
"step": 447000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 72.1984634399414, |
|
"learning_rate": 6.559431672098696e-06, |
|
"loss": 2.5572, |
|
"step": 448000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5372323989868164, |
|
"eval_runtime": 74.6878, |
|
"eval_samples_per_second": 11.046, |
|
"eval_steps_per_second": 5.53, |
|
"step": 448000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 36.4392204284668, |
|
"learning_rate": 6.574073260652488e-06, |
|
"loss": 2.5337, |
|
"step": 449000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5478267669677734, |
|
"eval_runtime": 74.5728, |
|
"eval_samples_per_second": 11.063, |
|
"eval_steps_per_second": 5.538, |
|
"step": 449000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 34.9202766418457, |
|
"learning_rate": 6.5887148492062806e-06, |
|
"loss": 2.5564, |
|
"step": 450000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.465947151184082, |
|
"eval_runtime": 88.7189, |
|
"eval_samples_per_second": 9.299, |
|
"eval_steps_per_second": 4.655, |
|
"step": 450000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 39.647132873535156, |
|
"learning_rate": 6.603356437760072e-06, |
|
"loss": 2.5621, |
|
"step": 451000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5845820903778076, |
|
"eval_runtime": 74.6226, |
|
"eval_samples_per_second": 11.056, |
|
"eval_steps_per_second": 5.535, |
|
"step": 451000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 33.027671813964844, |
|
"learning_rate": 6.617998026313863e-06, |
|
"loss": 2.5541, |
|
"step": 452000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5217037200927734, |
|
"eval_runtime": 88.777, |
|
"eval_samples_per_second": 9.293, |
|
"eval_steps_per_second": 4.652, |
|
"step": 452000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 30.724512100219727, |
|
"learning_rate": 6.632639614867655e-06, |
|
"loss": 2.5594, |
|
"step": 453000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.509258508682251, |
|
"eval_runtime": 74.7622, |
|
"eval_samples_per_second": 11.035, |
|
"eval_steps_per_second": 5.524, |
|
"step": 453000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 29.195533752441406, |
|
"learning_rate": 6.6472812034214476e-06, |
|
"loss": 2.5462, |
|
"step": 454000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.488044500350952, |
|
"eval_runtime": 74.612, |
|
"eval_samples_per_second": 11.057, |
|
"eval_steps_per_second": 5.535, |
|
"step": 454000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 41.07986068725586, |
|
"learning_rate": 6.661922791975238e-06, |
|
"loss": 2.5504, |
|
"step": 455000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5311644077301025, |
|
"eval_runtime": 88.9117, |
|
"eval_samples_per_second": 9.279, |
|
"eval_steps_per_second": 4.645, |
|
"step": 455000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 33.23712158203125, |
|
"learning_rate": 6.67656438052903e-06, |
|
"loss": 2.538, |
|
"step": 456000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.492626428604126, |
|
"eval_runtime": 74.5715, |
|
"eval_samples_per_second": 11.063, |
|
"eval_steps_per_second": 5.538, |
|
"step": 456000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 42.68756103515625, |
|
"learning_rate": 6.691205969082822e-06, |
|
"loss": 2.5204, |
|
"step": 457000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.58370304107666, |
|
"eval_runtime": 88.7612, |
|
"eval_samples_per_second": 9.295, |
|
"eval_steps_per_second": 4.653, |
|
"step": 457000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 64.60575866699219, |
|
"learning_rate": 6.7058475576366146e-06, |
|
"loss": 2.5611, |
|
"step": 458000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.6022603511810303, |
|
"eval_runtime": 74.7033, |
|
"eval_samples_per_second": 11.044, |
|
"eval_steps_per_second": 5.529, |
|
"step": 458000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 52.22064208984375, |
|
"learning_rate": 6.720489146190405e-06, |
|
"loss": 2.5452, |
|
"step": 459000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.6357836723327637, |
|
"eval_runtime": 88.7513, |
|
"eval_samples_per_second": 9.296, |
|
"eval_steps_per_second": 4.653, |
|
"step": 459000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 34.85495376586914, |
|
"learning_rate": 6.735130734744197e-06, |
|
"loss": 2.5756, |
|
"step": 460000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5200676918029785, |
|
"eval_runtime": 74.5868, |
|
"eval_samples_per_second": 11.061, |
|
"eval_steps_per_second": 5.537, |
|
"step": 460000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 33.7723503112793, |
|
"learning_rate": 6.749772323297989e-06, |
|
"loss": 2.5329, |
|
"step": 461000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4919097423553467, |
|
"eval_runtime": 74.6394, |
|
"eval_samples_per_second": 11.053, |
|
"eval_steps_per_second": 5.533, |
|
"step": 461000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 37.637420654296875, |
|
"learning_rate": 6.764413911851781e-06, |
|
"loss": 2.551, |
|
"step": 462000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.452228307723999, |
|
"eval_runtime": 88.7249, |
|
"eval_samples_per_second": 9.298, |
|
"eval_steps_per_second": 4.655, |
|
"step": 462000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 24.536401748657227, |
|
"learning_rate": 6.779055500405573e-06, |
|
"loss": 2.5563, |
|
"step": 463000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.494086742401123, |
|
"eval_runtime": 74.6768, |
|
"eval_samples_per_second": 11.048, |
|
"eval_steps_per_second": 5.53, |
|
"step": 463000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 36.97923278808594, |
|
"learning_rate": 6.793697088959364e-06, |
|
"loss": 2.553, |
|
"step": 464000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5208866596221924, |
|
"eval_runtime": 88.7626, |
|
"eval_samples_per_second": 9.294, |
|
"eval_steps_per_second": 4.653, |
|
"step": 464000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 41.09996032714844, |
|
"learning_rate": 6.8083386775131555e-06, |
|
"loss": 2.5181, |
|
"step": 465000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4860949516296387, |
|
"eval_runtime": 74.724, |
|
"eval_samples_per_second": 11.041, |
|
"eval_steps_per_second": 5.527, |
|
"step": 465000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 32.66045379638672, |
|
"learning_rate": 6.822980266066948e-06, |
|
"loss": 2.5581, |
|
"step": 466000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.568377733230591, |
|
"eval_runtime": 75.6998, |
|
"eval_samples_per_second": 10.898, |
|
"eval_steps_per_second": 5.456, |
|
"step": 466000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 32.50819396972656, |
|
"learning_rate": 6.83762185462074e-06, |
|
"loss": 2.5579, |
|
"step": 467000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5605528354644775, |
|
"eval_runtime": 88.7915, |
|
"eval_samples_per_second": 9.291, |
|
"eval_steps_per_second": 4.651, |
|
"step": 467000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 28.087833404541016, |
|
"learning_rate": 6.852263443174531e-06, |
|
"loss": 2.5367, |
|
"step": 468000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4373321533203125, |
|
"eval_runtime": 74.6151, |
|
"eval_samples_per_second": 11.057, |
|
"eval_steps_per_second": 5.535, |
|
"step": 468000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 24.929182052612305, |
|
"learning_rate": 6.8669050317283225e-06, |
|
"loss": 2.525, |
|
"step": 469000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5279572010040283, |
|
"eval_runtime": 88.694, |
|
"eval_samples_per_second": 9.302, |
|
"eval_steps_per_second": 4.656, |
|
"step": 469000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 27.20203399658203, |
|
"learning_rate": 6.881546620282115e-06, |
|
"loss": 2.5362, |
|
"step": 470000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5852034091949463, |
|
"eval_runtime": 74.7835, |
|
"eval_samples_per_second": 11.032, |
|
"eval_steps_per_second": 5.523, |
|
"step": 470000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 25.984107971191406, |
|
"learning_rate": 6.896188208835907e-06, |
|
"loss": 2.5573, |
|
"step": 471000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5982773303985596, |
|
"eval_runtime": 74.6024, |
|
"eval_samples_per_second": 11.059, |
|
"eval_steps_per_second": 5.536, |
|
"step": 471000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 34.37070846557617, |
|
"learning_rate": 6.910829797389697e-06, |
|
"loss": 2.5387, |
|
"step": 472000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5567188262939453, |
|
"eval_runtime": 88.7033, |
|
"eval_samples_per_second": 9.301, |
|
"eval_steps_per_second": 4.656, |
|
"step": 472000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 35.77717971801758, |
|
"learning_rate": 6.9254713859434895e-06, |
|
"loss": 2.5461, |
|
"step": 473000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.500488758087158, |
|
"eval_runtime": 74.5895, |
|
"eval_samples_per_second": 11.061, |
|
"eval_steps_per_second": 5.537, |
|
"step": 473000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 39.12084197998047, |
|
"learning_rate": 6.940112974497282e-06, |
|
"loss": 2.5294, |
|
"step": 474000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.567323923110962, |
|
"eval_runtime": 88.7207, |
|
"eval_samples_per_second": 9.299, |
|
"eval_steps_per_second": 4.655, |
|
"step": 474000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 39.28538513183594, |
|
"learning_rate": 6.954754563051074e-06, |
|
"loss": 2.552, |
|
"step": 475000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4914865493774414, |
|
"eval_runtime": 74.6134, |
|
"eval_samples_per_second": 11.057, |
|
"eval_steps_per_second": 5.535, |
|
"step": 475000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 28.785789489746094, |
|
"learning_rate": 6.969396151604864e-06, |
|
"loss": 2.5472, |
|
"step": 476000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5251002311706543, |
|
"eval_runtime": 88.9687, |
|
"eval_samples_per_second": 9.273, |
|
"eval_steps_per_second": 4.642, |
|
"step": 476000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 29.67142677307129, |
|
"learning_rate": 6.9840377401586565e-06, |
|
"loss": 2.5435, |
|
"step": 477000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5729432106018066, |
|
"eval_runtime": 74.5981, |
|
"eval_samples_per_second": 11.059, |
|
"eval_steps_per_second": 5.536, |
|
"step": 477000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 29.99863052368164, |
|
"learning_rate": 6.998679328712449e-06, |
|
"loss": 2.5487, |
|
"step": 478000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.535391330718994, |
|
"eval_runtime": 74.581, |
|
"eval_samples_per_second": 11.062, |
|
"eval_steps_per_second": 5.538, |
|
"step": 478000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 41.351619720458984, |
|
"learning_rate": 7.01332091726624e-06, |
|
"loss": 2.5679, |
|
"step": 479000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.525803804397583, |
|
"eval_runtime": 88.6559, |
|
"eval_samples_per_second": 9.306, |
|
"eval_steps_per_second": 4.658, |
|
"step": 479000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 38.389102935791016, |
|
"learning_rate": 7.027962505820032e-06, |
|
"loss": 2.5501, |
|
"step": 480000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.555030107498169, |
|
"eval_runtime": 74.6654, |
|
"eval_samples_per_second": 11.049, |
|
"eval_steps_per_second": 5.531, |
|
"step": 480000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 25.928979873657227, |
|
"learning_rate": 7.0426040943738235e-06, |
|
"loss": 2.5245, |
|
"step": 481000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.562833786010742, |
|
"eval_runtime": 88.8709, |
|
"eval_samples_per_second": 9.283, |
|
"eval_steps_per_second": 4.647, |
|
"step": 481000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 26.164514541625977, |
|
"learning_rate": 7.057245682927616e-06, |
|
"loss": 2.5322, |
|
"step": 482000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5449910163879395, |
|
"eval_runtime": 74.7155, |
|
"eval_samples_per_second": 11.042, |
|
"eval_steps_per_second": 5.528, |
|
"step": 482000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 32.9320182800293, |
|
"learning_rate": 7.071887271481407e-06, |
|
"loss": 2.522, |
|
"step": 483000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.516918897628784, |
|
"eval_runtime": 74.5959, |
|
"eval_samples_per_second": 11.06, |
|
"eval_steps_per_second": 5.536, |
|
"step": 483000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 24.265871047973633, |
|
"learning_rate": 7.086528860035199e-06, |
|
"loss": 2.5404, |
|
"step": 484000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5522592067718506, |
|
"eval_runtime": 88.5094, |
|
"eval_samples_per_second": 9.321, |
|
"eval_steps_per_second": 4.666, |
|
"step": 484000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 34.12384033203125, |
|
"learning_rate": 7.101170448588991e-06, |
|
"loss": 2.5504, |
|
"step": 485000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5303263664245605, |
|
"eval_runtime": 74.586, |
|
"eval_samples_per_second": 11.061, |
|
"eval_steps_per_second": 5.537, |
|
"step": 485000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 61.21946716308594, |
|
"learning_rate": 7.115812037142782e-06, |
|
"loss": 2.5445, |
|
"step": 486000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5049057006835938, |
|
"eval_runtime": 88.9345, |
|
"eval_samples_per_second": 9.276, |
|
"eval_steps_per_second": 4.644, |
|
"step": 486000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 70.74236297607422, |
|
"learning_rate": 7.130453625696574e-06, |
|
"loss": 2.5292, |
|
"step": 487000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4680702686309814, |
|
"eval_runtime": 74.7359, |
|
"eval_samples_per_second": 11.039, |
|
"eval_steps_per_second": 5.526, |
|
"step": 487000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 25.825979232788086, |
|
"learning_rate": 7.145095214250366e-06, |
|
"loss": 2.5356, |
|
"step": 488000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4915049076080322, |
|
"eval_runtime": 74.585, |
|
"eval_samples_per_second": 11.061, |
|
"eval_steps_per_second": 5.537, |
|
"step": 488000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 35.21977233886719, |
|
"learning_rate": 7.159736802804158e-06, |
|
"loss": 2.5506, |
|
"step": 489000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.475767135620117, |
|
"eval_runtime": 88.6633, |
|
"eval_samples_per_second": 9.305, |
|
"eval_steps_per_second": 4.658, |
|
"step": 489000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 27.019886016845703, |
|
"learning_rate": 7.174378391357949e-06, |
|
"loss": 2.5399, |
|
"step": 490000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5398621559143066, |
|
"eval_runtime": 74.544, |
|
"eval_samples_per_second": 11.067, |
|
"eval_steps_per_second": 5.54, |
|
"step": 490000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 27.912006378173828, |
|
"learning_rate": 7.189019979911741e-06, |
|
"loss": 2.5384, |
|
"step": 491000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.525010108947754, |
|
"eval_runtime": 88.5171, |
|
"eval_samples_per_second": 9.32, |
|
"eval_steps_per_second": 4.666, |
|
"step": 491000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 30.21108055114746, |
|
"learning_rate": 7.203661568465533e-06, |
|
"loss": 2.5517, |
|
"step": 492000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4748849868774414, |
|
"eval_runtime": 74.7588, |
|
"eval_samples_per_second": 11.035, |
|
"eval_steps_per_second": 5.524, |
|
"step": 492000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 29.793901443481445, |
|
"learning_rate": 7.218303157019324e-06, |
|
"loss": 2.5446, |
|
"step": 493000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.467487096786499, |
|
"eval_runtime": 88.7576, |
|
"eval_samples_per_second": 9.295, |
|
"eval_steps_per_second": 4.653, |
|
"step": 493000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 28.16090202331543, |
|
"learning_rate": 7.232944745573116e-06, |
|
"loss": 2.559, |
|
"step": 494000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5055534839630127, |
|
"eval_runtime": 74.7138, |
|
"eval_samples_per_second": 11.042, |
|
"eval_steps_per_second": 5.528, |
|
"step": 494000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 32.094730377197266, |
|
"learning_rate": 7.247586334126908e-06, |
|
"loss": 2.5501, |
|
"step": 495000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5964102745056152, |
|
"eval_runtime": 74.9562, |
|
"eval_samples_per_second": 11.006, |
|
"eval_steps_per_second": 5.51, |
|
"step": 495000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 33.27901840209961, |
|
"learning_rate": 7.2622279226807e-06, |
|
"loss": 2.5366, |
|
"step": 496000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.439891815185547, |
|
"eval_runtime": 88.794, |
|
"eval_samples_per_second": 9.291, |
|
"eval_steps_per_second": 4.651, |
|
"step": 496000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 43.62042999267578, |
|
"learning_rate": 7.2768695112344915e-06, |
|
"loss": 2.5349, |
|
"step": 497000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.524001359939575, |
|
"eval_runtime": 74.6053, |
|
"eval_samples_per_second": 11.058, |
|
"eval_steps_per_second": 5.536, |
|
"step": 497000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 60.96129608154297, |
|
"learning_rate": 7.291511099788283e-06, |
|
"loss": 2.5448, |
|
"step": 498000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.521202564239502, |
|
"eval_runtime": 88.6666, |
|
"eval_samples_per_second": 9.305, |
|
"eval_steps_per_second": 4.658, |
|
"step": 498000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 35.02910232543945, |
|
"learning_rate": 7.306152688342075e-06, |
|
"loss": 2.537, |
|
"step": 499000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.505171298980713, |
|
"eval_runtime": 74.7312, |
|
"eval_samples_per_second": 11.04, |
|
"eval_steps_per_second": 5.526, |
|
"step": 499000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 31.013050079345703, |
|
"learning_rate": 7.320794276895866e-06, |
|
"loss": 2.5467, |
|
"step": 500000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4867501258850098, |
|
"eval_runtime": 74.6206, |
|
"eval_samples_per_second": 11.056, |
|
"eval_steps_per_second": 5.535, |
|
"step": 500000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 37.47949981689453, |
|
"learning_rate": 7.3354358654496585e-06, |
|
"loss": 2.5407, |
|
"step": 501000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5458617210388184, |
|
"eval_runtime": 88.9434, |
|
"eval_samples_per_second": 9.276, |
|
"eval_steps_per_second": 4.643, |
|
"step": 501000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 47.31396484375, |
|
"learning_rate": 7.350077454003451e-06, |
|
"loss": 2.5313, |
|
"step": 502000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.411341667175293, |
|
"eval_runtime": 74.6183, |
|
"eval_samples_per_second": 11.056, |
|
"eval_steps_per_second": 5.535, |
|
"step": 502000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 31.15127944946289, |
|
"learning_rate": 7.364719042557242e-06, |
|
"loss": 2.5366, |
|
"step": 503000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4842355251312256, |
|
"eval_runtime": 88.7591, |
|
"eval_samples_per_second": 9.295, |
|
"eval_steps_per_second": 4.653, |
|
"step": 503000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 27.516429901123047, |
|
"learning_rate": 7.379360631111033e-06, |
|
"loss": 2.5214, |
|
"step": 504000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5037267208099365, |
|
"eval_runtime": 74.6844, |
|
"eval_samples_per_second": 11.046, |
|
"eval_steps_per_second": 5.53, |
|
"step": 504000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 25.75136947631836, |
|
"learning_rate": 7.3940022196648255e-06, |
|
"loss": 2.5469, |
|
"step": 505000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4592418670654297, |
|
"eval_runtime": 88.8258, |
|
"eval_samples_per_second": 9.288, |
|
"eval_steps_per_second": 4.65, |
|
"step": 505000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 30.610952377319336, |
|
"learning_rate": 7.408643808218618e-06, |
|
"loss": 2.5315, |
|
"step": 506000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.5430240631103516, |
|
"eval_runtime": 74.5293, |
|
"eval_samples_per_second": 11.069, |
|
"eval_steps_per_second": 5.541, |
|
"step": 506000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 26.383150100708008, |
|
"learning_rate": 7.423285396772408e-06, |
|
"loss": 2.5414, |
|
"step": 507000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4935925006866455, |
|
"eval_runtime": 74.608, |
|
"eval_samples_per_second": 11.058, |
|
"eval_steps_per_second": 5.536, |
|
"step": 507000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 41.32442855834961, |
|
"learning_rate": 7.4379269853262e-06, |
|
"loss": 2.5358, |
|
"step": 508000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4858062267303467, |
|
"eval_runtime": 88.6146, |
|
"eval_samples_per_second": 9.31, |
|
"eval_steps_per_second": 4.661, |
|
"step": 508000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 27.008840560913086, |
|
"learning_rate": 7.4525685738799925e-06, |
|
"loss": 2.5192, |
|
"step": 509000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.46337890625, |
|
"eval_runtime": 74.6729, |
|
"eval_samples_per_second": 11.048, |
|
"eval_steps_per_second": 5.531, |
|
"step": 509000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 31.72559928894043, |
|
"learning_rate": 7.467210162433785e-06, |
|
"loss": 2.5431, |
|
"step": 510000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.615093469619751, |
|
"eval_runtime": 88.9599, |
|
"eval_samples_per_second": 9.274, |
|
"eval_steps_per_second": 4.643, |
|
"step": 510000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 41.91428756713867, |
|
"learning_rate": 7.481851750987575e-06, |
|
"loss": 2.546, |
|
"step": 511000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.4940452575683594, |
|
"eval_runtime": 74.6792, |
|
"eval_samples_per_second": 11.047, |
|
"eval_steps_per_second": 5.53, |
|
"step": 511000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 33.08389663696289, |
|
"learning_rate": 7.496493339541367e-06, |
|
"loss": 2.523, |
|
"step": 512000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 2.472357988357544, |
|
"eval_runtime": 74.5358, |
|
"eval_samples_per_second": 11.069, |
|
"eval_steps_per_second": 5.541, |
|
"step": 512000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 30.30219841003418, |
|
"learning_rate": 7.5111349280951595e-06, |
|
"loss": 2.5399, |
|
"step": 513000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5458850860595703, |
|
"eval_runtime": 88.6531, |
|
"eval_samples_per_second": 9.306, |
|
"eval_steps_per_second": 4.659, |
|
"step": 513000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 30.642229080200195, |
|
"learning_rate": 7.525776516648951e-06, |
|
"loss": 2.5253, |
|
"step": 514000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5789878368377686, |
|
"eval_runtime": 74.5705, |
|
"eval_samples_per_second": 11.063, |
|
"eval_steps_per_second": 5.538, |
|
"step": 514000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 39.65602493286133, |
|
"learning_rate": 7.540418105202742e-06, |
|
"loss": 2.5315, |
|
"step": 515000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.501187801361084, |
|
"eval_runtime": 88.5681, |
|
"eval_samples_per_second": 9.315, |
|
"eval_steps_per_second": 4.663, |
|
"step": 515000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 29.685272216796875, |
|
"learning_rate": 7.555059693756534e-06, |
|
"loss": 2.5209, |
|
"step": 516000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5158517360687256, |
|
"eval_runtime": 74.732, |
|
"eval_samples_per_second": 11.039, |
|
"eval_steps_per_second": 5.526, |
|
"step": 516000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 26.941364288330078, |
|
"learning_rate": 7.5697012823103265e-06, |
|
"loss": 2.5267, |
|
"step": 517000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5034916400909424, |
|
"eval_runtime": 74.5657, |
|
"eval_samples_per_second": 11.064, |
|
"eval_steps_per_second": 5.539, |
|
"step": 517000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 30.716243743896484, |
|
"learning_rate": 7.584342870864118e-06, |
|
"loss": 2.5341, |
|
"step": 518000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.466562032699585, |
|
"eval_runtime": 88.7982, |
|
"eval_samples_per_second": 9.291, |
|
"eval_steps_per_second": 4.651, |
|
"step": 518000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 51.75357437133789, |
|
"learning_rate": 7.59898445941791e-06, |
|
"loss": 2.5457, |
|
"step": 519000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5445785522460938, |
|
"eval_runtime": 74.53, |
|
"eval_samples_per_second": 11.069, |
|
"eval_steps_per_second": 5.541, |
|
"step": 519000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 34.443023681640625, |
|
"learning_rate": 7.613626047971701e-06, |
|
"loss": 2.5313, |
|
"step": 520000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.506009101867676, |
|
"eval_runtime": 88.555, |
|
"eval_samples_per_second": 9.316, |
|
"eval_steps_per_second": 4.664, |
|
"step": 520000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 31.235965728759766, |
|
"learning_rate": 7.628267636525493e-06, |
|
"loss": 2.5275, |
|
"step": 521000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5120317935943604, |
|
"eval_runtime": 74.6094, |
|
"eval_samples_per_second": 11.058, |
|
"eval_steps_per_second": 5.535, |
|
"step": 521000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 29.263076782226562, |
|
"learning_rate": 7.642909225079284e-06, |
|
"loss": 2.5309, |
|
"step": 522000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5825254917144775, |
|
"eval_runtime": 88.7861, |
|
"eval_samples_per_second": 9.292, |
|
"eval_steps_per_second": 4.652, |
|
"step": 522000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 22.8249454498291, |
|
"learning_rate": 7.657550813633077e-06, |
|
"loss": 2.534, |
|
"step": 523000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4425950050354004, |
|
"eval_runtime": 74.5795, |
|
"eval_samples_per_second": 11.062, |
|
"eval_steps_per_second": 5.538, |
|
"step": 523000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 26.066471099853516, |
|
"learning_rate": 7.672192402186868e-06, |
|
"loss": 2.5071, |
|
"step": 524000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.509237051010132, |
|
"eval_runtime": 74.8845, |
|
"eval_samples_per_second": 11.017, |
|
"eval_steps_per_second": 5.515, |
|
"step": 524000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 46.68310546875, |
|
"learning_rate": 7.68683399074066e-06, |
|
"loss": 2.5324, |
|
"step": 525000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.531177282333374, |
|
"eval_runtime": 89.0215, |
|
"eval_samples_per_second": 9.267, |
|
"eval_steps_per_second": 4.639, |
|
"step": 525000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 31.326271057128906, |
|
"learning_rate": 7.701475579294451e-06, |
|
"loss": 2.5266, |
|
"step": 526000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.495670795440674, |
|
"eval_runtime": 74.673, |
|
"eval_samples_per_second": 11.048, |
|
"eval_steps_per_second": 5.531, |
|
"step": 526000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 41.735721588134766, |
|
"learning_rate": 7.716117167848244e-06, |
|
"loss": 2.5443, |
|
"step": 527000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.530324935913086, |
|
"eval_runtime": 89.1796, |
|
"eval_samples_per_second": 9.251, |
|
"eval_steps_per_second": 4.631, |
|
"step": 527000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 26.984756469726562, |
|
"learning_rate": 7.730758756402035e-06, |
|
"loss": 2.5231, |
|
"step": 528000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.526041269302368, |
|
"eval_runtime": 74.8795, |
|
"eval_samples_per_second": 11.018, |
|
"eval_steps_per_second": 5.516, |
|
"step": 528000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 40.371009826660156, |
|
"learning_rate": 7.745400344955827e-06, |
|
"loss": 2.5324, |
|
"step": 529000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5326287746429443, |
|
"eval_runtime": 74.6843, |
|
"eval_samples_per_second": 11.046, |
|
"eval_steps_per_second": 5.53, |
|
"step": 529000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 25.49151039123535, |
|
"learning_rate": 7.76004193350962e-06, |
|
"loss": 2.531, |
|
"step": 530000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5524654388427734, |
|
"eval_runtime": 88.777, |
|
"eval_samples_per_second": 9.293, |
|
"eval_steps_per_second": 4.652, |
|
"step": 530000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 24.5642032623291, |
|
"learning_rate": 7.774683522063411e-06, |
|
"loss": 2.5473, |
|
"step": 531000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5063366889953613, |
|
"eval_runtime": 74.7154, |
|
"eval_samples_per_second": 11.042, |
|
"eval_steps_per_second": 5.528, |
|
"step": 531000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 29.164342880249023, |
|
"learning_rate": 7.789325110617202e-06, |
|
"loss": 2.5579, |
|
"step": 532000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5238850116729736, |
|
"eval_runtime": 88.8674, |
|
"eval_samples_per_second": 9.283, |
|
"eval_steps_per_second": 4.647, |
|
"step": 532000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 31.728139877319336, |
|
"learning_rate": 7.803966699170994e-06, |
|
"loss": 2.5455, |
|
"step": 533000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.530641555786133, |
|
"eval_runtime": 74.6942, |
|
"eval_samples_per_second": 11.045, |
|
"eval_steps_per_second": 5.529, |
|
"step": 533000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 30.494529724121094, |
|
"learning_rate": 7.818608287724787e-06, |
|
"loss": 2.5345, |
|
"step": 534000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.532782793045044, |
|
"eval_runtime": 74.8909, |
|
"eval_samples_per_second": 11.016, |
|
"eval_steps_per_second": 5.515, |
|
"step": 534000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 31.76959800720215, |
|
"learning_rate": 7.833249876278576e-06, |
|
"loss": 2.5307, |
|
"step": 535000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5115082263946533, |
|
"eval_runtime": 88.5627, |
|
"eval_samples_per_second": 9.315, |
|
"eval_steps_per_second": 4.663, |
|
"step": 535000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 37.1898307800293, |
|
"learning_rate": 7.84789146483237e-06, |
|
"loss": 2.5271, |
|
"step": 536000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.613095998764038, |
|
"eval_runtime": 74.8201, |
|
"eval_samples_per_second": 11.026, |
|
"eval_steps_per_second": 5.52, |
|
"step": 536000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 46.06565475463867, |
|
"learning_rate": 7.86253305338616e-06, |
|
"loss": 2.5351, |
|
"step": 537000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5240702629089355, |
|
"eval_runtime": 88.8498, |
|
"eval_samples_per_second": 9.285, |
|
"eval_steps_per_second": 4.648, |
|
"step": 537000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 48.30146408081055, |
|
"learning_rate": 7.877174641939954e-06, |
|
"loss": 2.5067, |
|
"step": 538000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.445352077484131, |
|
"eval_runtime": 81.7531, |
|
"eval_samples_per_second": 10.091, |
|
"eval_steps_per_second": 5.052, |
|
"step": 538000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 26.811046600341797, |
|
"learning_rate": 7.891816230493743e-06, |
|
"loss": 2.508, |
|
"step": 539000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4756014347076416, |
|
"eval_runtime": 89.498, |
|
"eval_samples_per_second": 9.218, |
|
"eval_steps_per_second": 4.615, |
|
"step": 539000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 38.2576789855957, |
|
"learning_rate": 7.906457819047536e-06, |
|
"loss": 2.5324, |
|
"step": 540000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.415710926055908, |
|
"eval_runtime": 75.1208, |
|
"eval_samples_per_second": 10.982, |
|
"eval_steps_per_second": 5.498, |
|
"step": 540000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 36.25407409667969, |
|
"learning_rate": 7.921099407601328e-06, |
|
"loss": 2.5211, |
|
"step": 541000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5010883808135986, |
|
"eval_runtime": 74.6035, |
|
"eval_samples_per_second": 11.058, |
|
"eval_steps_per_second": 5.536, |
|
"step": 541000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 27.875274658203125, |
|
"learning_rate": 7.935740996155119e-06, |
|
"loss": 2.5319, |
|
"step": 542000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4954843521118164, |
|
"eval_runtime": 89.0529, |
|
"eval_samples_per_second": 9.264, |
|
"eval_steps_per_second": 4.638, |
|
"step": 542000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 30.717681884765625, |
|
"learning_rate": 7.95038258470891e-06, |
|
"loss": 2.5361, |
|
"step": 543000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5567915439605713, |
|
"eval_runtime": 76.6726, |
|
"eval_samples_per_second": 10.76, |
|
"eval_steps_per_second": 5.387, |
|
"step": 543000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 27.22251319885254, |
|
"learning_rate": 7.965024173262703e-06, |
|
"loss": 2.548, |
|
"step": 544000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.518052101135254, |
|
"eval_runtime": 91.1463, |
|
"eval_samples_per_second": 9.051, |
|
"eval_steps_per_second": 4.531, |
|
"step": 544000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 24.037654876708984, |
|
"learning_rate": 7.979665761816495e-06, |
|
"loss": 2.5258, |
|
"step": 545000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.6096935272216797, |
|
"eval_runtime": 76.5086, |
|
"eval_samples_per_second": 10.783, |
|
"eval_steps_per_second": 5.398, |
|
"step": 545000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 40.470664978027344, |
|
"learning_rate": 7.994307350370286e-06, |
|
"loss": 2.5173, |
|
"step": 546000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.543060302734375, |
|
"eval_runtime": 75.9324, |
|
"eval_samples_per_second": 10.865, |
|
"eval_steps_per_second": 5.439, |
|
"step": 546000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 29.762210845947266, |
|
"learning_rate": 8.008948938924079e-06, |
|
"loss": 2.5249, |
|
"step": 547000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4692766666412354, |
|
"eval_runtime": 90.7767, |
|
"eval_samples_per_second": 9.088, |
|
"eval_steps_per_second": 4.55, |
|
"step": 547000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 27.77049446105957, |
|
"learning_rate": 8.02359052747787e-06, |
|
"loss": 2.5087, |
|
"step": 548000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.480736017227173, |
|
"eval_runtime": 76.3978, |
|
"eval_samples_per_second": 10.799, |
|
"eval_steps_per_second": 5.406, |
|
"step": 548000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 29.757713317871094, |
|
"learning_rate": 8.038232116031662e-06, |
|
"loss": 2.5288, |
|
"step": 549000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4918410778045654, |
|
"eval_runtime": 92.7712, |
|
"eval_samples_per_second": 8.893, |
|
"eval_steps_per_second": 4.452, |
|
"step": 549000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 36.95774841308594, |
|
"learning_rate": 8.052873704585453e-06, |
|
"loss": 2.5337, |
|
"step": 550000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.440648317337036, |
|
"eval_runtime": 75.8485, |
|
"eval_samples_per_second": 10.877, |
|
"eval_steps_per_second": 5.445, |
|
"step": 550000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 42.54120635986328, |
|
"learning_rate": 8.067515293139246e-06, |
|
"loss": 2.5361, |
|
"step": 551000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4498894214630127, |
|
"eval_runtime": 90.2465, |
|
"eval_samples_per_second": 9.142, |
|
"eval_steps_per_second": 4.576, |
|
"step": 551000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 32.604347229003906, |
|
"learning_rate": 8.082156881693037e-06, |
|
"loss": 2.5226, |
|
"step": 552000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.461261034011841, |
|
"eval_runtime": 76.3392, |
|
"eval_samples_per_second": 10.807, |
|
"eval_steps_per_second": 5.41, |
|
"step": 552000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 38.40324783325195, |
|
"learning_rate": 8.096798470246829e-06, |
|
"loss": 2.5331, |
|
"step": 553000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.507526397705078, |
|
"eval_runtime": 75.972, |
|
"eval_samples_per_second": 10.859, |
|
"eval_steps_per_second": 5.436, |
|
"step": 553000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 34.65354919433594, |
|
"learning_rate": 8.11144005880062e-06, |
|
"loss": 2.5268, |
|
"step": 554000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4727320671081543, |
|
"eval_runtime": 89.1458, |
|
"eval_samples_per_second": 9.255, |
|
"eval_steps_per_second": 4.633, |
|
"step": 554000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 36.14072799682617, |
|
"learning_rate": 8.126081647354413e-06, |
|
"loss": 2.5297, |
|
"step": 555000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5081801414489746, |
|
"eval_runtime": 75.1359, |
|
"eval_samples_per_second": 10.98, |
|
"eval_steps_per_second": 5.497, |
|
"step": 555000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 21.90192413330078, |
|
"learning_rate": 8.140723235908203e-06, |
|
"loss": 2.5182, |
|
"step": 556000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.542210340499878, |
|
"eval_runtime": 89.2503, |
|
"eval_samples_per_second": 9.244, |
|
"eval_steps_per_second": 4.627, |
|
"step": 556000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 29.731470108032227, |
|
"learning_rate": 8.155364824461996e-06, |
|
"loss": 2.5396, |
|
"step": 557000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5540778636932373, |
|
"eval_runtime": 74.5655, |
|
"eval_samples_per_second": 11.064, |
|
"eval_steps_per_second": 5.539, |
|
"step": 557000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 36.37150955200195, |
|
"learning_rate": 8.170006413015787e-06, |
|
"loss": 2.5462, |
|
"step": 558000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5844576358795166, |
|
"eval_runtime": 74.52, |
|
"eval_samples_per_second": 11.071, |
|
"eval_steps_per_second": 5.542, |
|
"step": 558000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 25.697357177734375, |
|
"learning_rate": 8.18464800156958e-06, |
|
"loss": 2.5018, |
|
"step": 559000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4566192626953125, |
|
"eval_runtime": 91.6957, |
|
"eval_samples_per_second": 8.997, |
|
"eval_steps_per_second": 4.504, |
|
"step": 559000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 48.198944091796875, |
|
"learning_rate": 8.19928959012337e-06, |
|
"loss": 2.5175, |
|
"step": 560000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5511298179626465, |
|
"eval_runtime": 77.1596, |
|
"eval_samples_per_second": 10.692, |
|
"eval_steps_per_second": 5.353, |
|
"step": 560000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 35.16044235229492, |
|
"learning_rate": 8.213931178677163e-06, |
|
"loss": 2.5186, |
|
"step": 561000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5079469680786133, |
|
"eval_runtime": 94.2533, |
|
"eval_samples_per_second": 8.753, |
|
"eval_steps_per_second": 4.382, |
|
"step": 561000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 24.49595832824707, |
|
"learning_rate": 8.228572767230954e-06, |
|
"loss": 2.5321, |
|
"step": 562000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4904894828796387, |
|
"eval_runtime": 74.9718, |
|
"eval_samples_per_second": 11.004, |
|
"eval_steps_per_second": 5.509, |
|
"step": 562000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 33.582340240478516, |
|
"learning_rate": 8.243214355784745e-06, |
|
"loss": 2.531, |
|
"step": 563000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.6183409690856934, |
|
"eval_runtime": 74.5558, |
|
"eval_samples_per_second": 11.066, |
|
"eval_steps_per_second": 5.539, |
|
"step": 563000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 24.678638458251953, |
|
"learning_rate": 8.257855944338538e-06, |
|
"loss": 2.5162, |
|
"step": 564000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.525707244873047, |
|
"eval_runtime": 88.6578, |
|
"eval_samples_per_second": 9.305, |
|
"eval_steps_per_second": 4.658, |
|
"step": 564000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 33.279022216796875, |
|
"learning_rate": 8.27249753289233e-06, |
|
"loss": 2.5074, |
|
"step": 565000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5521037578582764, |
|
"eval_runtime": 74.5573, |
|
"eval_samples_per_second": 11.065, |
|
"eval_steps_per_second": 5.539, |
|
"step": 565000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 37.84657287597656, |
|
"learning_rate": 8.287139121446121e-06, |
|
"loss": 2.5216, |
|
"step": 566000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.581077814102173, |
|
"eval_runtime": 88.7044, |
|
"eval_samples_per_second": 9.301, |
|
"eval_steps_per_second": 4.656, |
|
"step": 566000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 34.67258834838867, |
|
"learning_rate": 8.301780709999912e-06, |
|
"loss": 2.5553, |
|
"step": 567000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5216948986053467, |
|
"eval_runtime": 74.6784, |
|
"eval_samples_per_second": 11.047, |
|
"eval_steps_per_second": 5.53, |
|
"step": 567000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 31.939085006713867, |
|
"learning_rate": 8.316422298553705e-06, |
|
"loss": 2.5188, |
|
"step": 568000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.483720302581787, |
|
"eval_runtime": 88.8032, |
|
"eval_samples_per_second": 9.29, |
|
"eval_steps_per_second": 4.651, |
|
"step": 568000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 70.68443298339844, |
|
"learning_rate": 8.331063887107497e-06, |
|
"loss": 2.521, |
|
"step": 569000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.486253499984741, |
|
"eval_runtime": 74.511, |
|
"eval_samples_per_second": 11.072, |
|
"eval_steps_per_second": 5.543, |
|
"step": 569000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 36.54584503173828, |
|
"learning_rate": 8.345705475661288e-06, |
|
"loss": 2.5131, |
|
"step": 570000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.49752140045166, |
|
"eval_runtime": 74.6321, |
|
"eval_samples_per_second": 11.054, |
|
"eval_steps_per_second": 5.534, |
|
"step": 570000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 33.440093994140625, |
|
"learning_rate": 8.36034706421508e-06, |
|
"loss": 2.522, |
|
"step": 571000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4143617153167725, |
|
"eval_runtime": 88.6944, |
|
"eval_samples_per_second": 9.302, |
|
"eval_steps_per_second": 4.656, |
|
"step": 571000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 34.93145751953125, |
|
"learning_rate": 8.374988652768872e-06, |
|
"loss": 2.5145, |
|
"step": 572000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.463707447052002, |
|
"eval_runtime": 74.5869, |
|
"eval_samples_per_second": 11.061, |
|
"eval_steps_per_second": 5.537, |
|
"step": 572000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 31.742279052734375, |
|
"learning_rate": 8.389630241322662e-06, |
|
"loss": 2.5006, |
|
"step": 573000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5146420001983643, |
|
"eval_runtime": 88.6696, |
|
"eval_samples_per_second": 9.304, |
|
"eval_steps_per_second": 4.658, |
|
"step": 573000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 53.028282165527344, |
|
"learning_rate": 8.404271829876455e-06, |
|
"loss": 2.5507, |
|
"step": 574000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.531402587890625, |
|
"eval_runtime": 74.5529, |
|
"eval_samples_per_second": 11.066, |
|
"eval_steps_per_second": 5.54, |
|
"step": 574000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 57.27425003051758, |
|
"learning_rate": 8.418913418430246e-06, |
|
"loss": 2.5357, |
|
"step": 575000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.6296398639678955, |
|
"eval_runtime": 74.7514, |
|
"eval_samples_per_second": 11.037, |
|
"eval_steps_per_second": 5.525, |
|
"step": 575000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 22.733678817749023, |
|
"learning_rate": 8.43355500698404e-06, |
|
"loss": 2.5193, |
|
"step": 576000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.570042133331299, |
|
"eval_runtime": 88.802, |
|
"eval_samples_per_second": 9.29, |
|
"eval_steps_per_second": 4.651, |
|
"step": 576000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 35.834075927734375, |
|
"learning_rate": 8.448196595537829e-06, |
|
"loss": 2.5336, |
|
"step": 577000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5378034114837646, |
|
"eval_runtime": 74.6827, |
|
"eval_samples_per_second": 11.047, |
|
"eval_steps_per_second": 5.53, |
|
"step": 577000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 36.09931945800781, |
|
"learning_rate": 8.462838184091622e-06, |
|
"loss": 2.505, |
|
"step": 578000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4723126888275146, |
|
"eval_runtime": 88.9314, |
|
"eval_samples_per_second": 9.277, |
|
"eval_steps_per_second": 4.644, |
|
"step": 578000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 29.487991333007812, |
|
"learning_rate": 8.477479772645413e-06, |
|
"loss": 2.5118, |
|
"step": 579000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.4950342178344727, |
|
"eval_runtime": 74.6665, |
|
"eval_samples_per_second": 11.049, |
|
"eval_steps_per_second": 5.531, |
|
"step": 579000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 35.32073974609375, |
|
"learning_rate": 8.492121361199205e-06, |
|
"loss": 2.527, |
|
"step": 580000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 2.5424089431762695, |
|
"eval_runtime": 74.5322, |
|
"eval_samples_per_second": 11.069, |
|
"eval_steps_per_second": 5.541, |
|
"step": 580000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 40.19975280761719, |
|
"learning_rate": 8.506762949752998e-06, |
|
"loss": 2.531, |
|
"step": 581000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5420637130737305, |
|
"eval_runtime": 88.8561, |
|
"eval_samples_per_second": 9.285, |
|
"eval_steps_per_second": 4.648, |
|
"step": 581000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 27.63242530822754, |
|
"learning_rate": 8.521404538306789e-06, |
|
"loss": 2.5215, |
|
"step": 582000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.4698994159698486, |
|
"eval_runtime": 74.5737, |
|
"eval_samples_per_second": 11.063, |
|
"eval_steps_per_second": 5.538, |
|
"step": 582000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 45.67693328857422, |
|
"learning_rate": 8.53604612686058e-06, |
|
"loss": 2.5103, |
|
"step": 583000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5570404529571533, |
|
"eval_runtime": 88.6527, |
|
"eval_samples_per_second": 9.306, |
|
"eval_steps_per_second": 4.659, |
|
"step": 583000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 33.481082916259766, |
|
"learning_rate": 8.550687715414372e-06, |
|
"loss": 2.5258, |
|
"step": 584000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.529367208480835, |
|
"eval_runtime": 78.065, |
|
"eval_samples_per_second": 10.568, |
|
"eval_steps_per_second": 5.29, |
|
"step": 584000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 20.300424575805664, |
|
"learning_rate": 8.565329303968165e-06, |
|
"loss": 2.4886, |
|
"step": 585000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5173099040985107, |
|
"eval_runtime": 88.6295, |
|
"eval_samples_per_second": 9.308, |
|
"eval_steps_per_second": 4.66, |
|
"step": 585000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 30.857240676879883, |
|
"learning_rate": 8.579970892521956e-06, |
|
"loss": 2.5094, |
|
"step": 586000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.6584701538085938, |
|
"eval_runtime": 74.5434, |
|
"eval_samples_per_second": 11.067, |
|
"eval_steps_per_second": 5.54, |
|
"step": 586000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 25.00079917907715, |
|
"learning_rate": 8.594612481075747e-06, |
|
"loss": 2.5294, |
|
"step": 587000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.546011209487915, |
|
"eval_runtime": 74.5637, |
|
"eval_samples_per_second": 11.064, |
|
"eval_steps_per_second": 5.539, |
|
"step": 587000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 30.173933029174805, |
|
"learning_rate": 8.609254069629539e-06, |
|
"loss": 2.5104, |
|
"step": 588000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.46292781829834, |
|
"eval_runtime": 88.6384, |
|
"eval_samples_per_second": 9.307, |
|
"eval_steps_per_second": 4.659, |
|
"step": 588000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 34.49174118041992, |
|
"learning_rate": 8.623895658183332e-06, |
|
"loss": 2.4978, |
|
"step": 589000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.4769954681396484, |
|
"eval_runtime": 74.6204, |
|
"eval_samples_per_second": 11.056, |
|
"eval_steps_per_second": 5.535, |
|
"step": 589000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 31.314254760742188, |
|
"learning_rate": 8.638537246737123e-06, |
|
"loss": 2.5209, |
|
"step": 590000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.4981024265289307, |
|
"eval_runtime": 88.6067, |
|
"eval_samples_per_second": 9.311, |
|
"eval_steps_per_second": 4.661, |
|
"step": 590000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 44.28144454956055, |
|
"learning_rate": 8.653178835290914e-06, |
|
"loss": 2.5353, |
|
"step": 591000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.546908140182495, |
|
"eval_runtime": 74.6528, |
|
"eval_samples_per_second": 11.051, |
|
"eval_steps_per_second": 5.532, |
|
"step": 591000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 22.053375244140625, |
|
"learning_rate": 8.667820423844706e-06, |
|
"loss": 2.5199, |
|
"step": 592000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.51155161857605, |
|
"eval_runtime": 74.5424, |
|
"eval_samples_per_second": 11.068, |
|
"eval_steps_per_second": 5.54, |
|
"step": 592000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 31.350740432739258, |
|
"learning_rate": 8.682462012398499e-06, |
|
"loss": 2.4976, |
|
"step": 593000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5021212100982666, |
|
"eval_runtime": 88.8346, |
|
"eval_samples_per_second": 9.287, |
|
"eval_steps_per_second": 4.649, |
|
"step": 593000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 31.255252838134766, |
|
"learning_rate": 8.697103600952288e-06, |
|
"loss": 2.5163, |
|
"step": 594000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.4919252395629883, |
|
"eval_runtime": 74.5758, |
|
"eval_samples_per_second": 11.063, |
|
"eval_steps_per_second": 5.538, |
|
"step": 594000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 32.16548156738281, |
|
"learning_rate": 8.711745189506081e-06, |
|
"loss": 2.4954, |
|
"step": 595000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.508427381515503, |
|
"eval_runtime": 88.8381, |
|
"eval_samples_per_second": 9.287, |
|
"eval_steps_per_second": 4.649, |
|
"step": 595000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 38.885704040527344, |
|
"learning_rate": 8.726386778059873e-06, |
|
"loss": 2.5091, |
|
"step": 596000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5161032676696777, |
|
"eval_runtime": 74.6717, |
|
"eval_samples_per_second": 11.048, |
|
"eval_steps_per_second": 5.531, |
|
"step": 596000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 26.605031967163086, |
|
"learning_rate": 8.741028366613666e-06, |
|
"loss": 2.5106, |
|
"step": 597000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.480731248855591, |
|
"eval_runtime": 88.9373, |
|
"eval_samples_per_second": 9.276, |
|
"eval_steps_per_second": 4.644, |
|
"step": 597000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 31.664562225341797, |
|
"learning_rate": 8.755669955167457e-06, |
|
"loss": 2.4986, |
|
"step": 598000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.454038619995117, |
|
"eval_runtime": 74.4456, |
|
"eval_samples_per_second": 11.082, |
|
"eval_steps_per_second": 5.548, |
|
"step": 598000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 30.46480369567871, |
|
"learning_rate": 8.770311543721248e-06, |
|
"loss": 2.5167, |
|
"step": 599000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.481914758682251, |
|
"eval_runtime": 74.5377, |
|
"eval_samples_per_second": 11.068, |
|
"eval_steps_per_second": 5.541, |
|
"step": 599000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 66.7420654296875, |
|
"learning_rate": 8.78495313227504e-06, |
|
"loss": 2.5257, |
|
"step": 600000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.4934213161468506, |
|
"eval_runtime": 88.8231, |
|
"eval_samples_per_second": 9.288, |
|
"eval_steps_per_second": 4.65, |
|
"step": 600000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 25.715648651123047, |
|
"learning_rate": 8.799594720828831e-06, |
|
"loss": 2.517, |
|
"step": 601000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.441848039627075, |
|
"eval_runtime": 74.6384, |
|
"eval_samples_per_second": 11.053, |
|
"eval_steps_per_second": 5.533, |
|
"step": 601000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 40.46959686279297, |
|
"learning_rate": 8.814236309382624e-06, |
|
"loss": 2.5514, |
|
"step": 602000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.470050096511841, |
|
"eval_runtime": 88.7602, |
|
"eval_samples_per_second": 9.295, |
|
"eval_steps_per_second": 4.653, |
|
"step": 602000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 32.37881088256836, |
|
"learning_rate": 8.828877897936415e-06, |
|
"loss": 2.5062, |
|
"step": 603000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5241613388061523, |
|
"eval_runtime": 74.6312, |
|
"eval_samples_per_second": 11.054, |
|
"eval_steps_per_second": 5.534, |
|
"step": 603000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 48.49308395385742, |
|
"learning_rate": 8.843519486490207e-06, |
|
"loss": 2.5172, |
|
"step": 604000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5067596435546875, |
|
"eval_runtime": 74.543, |
|
"eval_samples_per_second": 11.067, |
|
"eval_steps_per_second": 5.54, |
|
"step": 604000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 27.66318130493164, |
|
"learning_rate": 8.858161075043998e-06, |
|
"loss": 2.5241, |
|
"step": 605000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.4602746963500977, |
|
"eval_runtime": 88.5077, |
|
"eval_samples_per_second": 9.321, |
|
"eval_steps_per_second": 4.666, |
|
"step": 605000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 48.21938705444336, |
|
"learning_rate": 8.872802663597791e-06, |
|
"loss": 2.5155, |
|
"step": 606000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.481112480163574, |
|
"eval_runtime": 74.5722, |
|
"eval_samples_per_second": 11.063, |
|
"eval_steps_per_second": 5.538, |
|
"step": 606000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 29.03439712524414, |
|
"learning_rate": 8.887444252151582e-06, |
|
"loss": 2.5271, |
|
"step": 607000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.524092197418213, |
|
"eval_runtime": 88.7735, |
|
"eval_samples_per_second": 9.293, |
|
"eval_steps_per_second": 4.652, |
|
"step": 607000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 19.625823974609375, |
|
"learning_rate": 8.902085840705374e-06, |
|
"loss": 2.4987, |
|
"step": 608000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5538923740386963, |
|
"eval_runtime": 74.6497, |
|
"eval_samples_per_second": 11.052, |
|
"eval_steps_per_second": 5.533, |
|
"step": 608000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 63.533599853515625, |
|
"learning_rate": 8.916727429259165e-06, |
|
"loss": 2.5101, |
|
"step": 609000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.502268075942993, |
|
"eval_runtime": 74.5661, |
|
"eval_samples_per_second": 11.064, |
|
"eval_steps_per_second": 5.539, |
|
"step": 609000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 21.927827835083008, |
|
"learning_rate": 8.931369017812958e-06, |
|
"loss": 2.5147, |
|
"step": 610000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5037426948547363, |
|
"eval_runtime": 89.0991, |
|
"eval_samples_per_second": 9.259, |
|
"eval_steps_per_second": 4.635, |
|
"step": 610000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 27.656044006347656, |
|
"learning_rate": 8.94601060636675e-06, |
|
"loss": 2.5236, |
|
"step": 611000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.487699508666992, |
|
"eval_runtime": 74.5421, |
|
"eval_samples_per_second": 11.068, |
|
"eval_steps_per_second": 5.54, |
|
"step": 611000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 19.147541046142578, |
|
"learning_rate": 8.96065219492054e-06, |
|
"loss": 2.5209, |
|
"step": 612000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.465284585952759, |
|
"eval_runtime": 88.8409, |
|
"eval_samples_per_second": 9.286, |
|
"eval_steps_per_second": 4.649, |
|
"step": 612000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 38.874507904052734, |
|
"learning_rate": 8.975293783474332e-06, |
|
"loss": 2.4915, |
|
"step": 613000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.500398874282837, |
|
"eval_runtime": 74.6884, |
|
"eval_samples_per_second": 11.046, |
|
"eval_steps_per_second": 5.53, |
|
"step": 613000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 91.2654037475586, |
|
"learning_rate": 8.989935372028125e-06, |
|
"loss": 2.5087, |
|
"step": 614000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.511294364929199, |
|
"eval_runtime": 88.9508, |
|
"eval_samples_per_second": 9.275, |
|
"eval_steps_per_second": 4.643, |
|
"step": 614000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 25.057418823242188, |
|
"learning_rate": 9.004576960581916e-06, |
|
"loss": 2.518, |
|
"step": 615000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5428690910339355, |
|
"eval_runtime": 74.509, |
|
"eval_samples_per_second": 11.072, |
|
"eval_steps_per_second": 5.543, |
|
"step": 615000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 47.45814514160156, |
|
"learning_rate": 9.019218549135708e-06, |
|
"loss": 2.5108, |
|
"step": 616000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.516467809677124, |
|
"eval_runtime": 74.5781, |
|
"eval_samples_per_second": 11.062, |
|
"eval_steps_per_second": 5.538, |
|
"step": 616000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 38.216609954833984, |
|
"learning_rate": 9.033860137689499e-06, |
|
"loss": 2.501, |
|
"step": 617000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.532259464263916, |
|
"eval_runtime": 88.6384, |
|
"eval_samples_per_second": 9.307, |
|
"eval_steps_per_second": 4.659, |
|
"step": 617000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 22.716007232666016, |
|
"learning_rate": 9.048501726243292e-06, |
|
"loss": 2.4959, |
|
"step": 618000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.4635117053985596, |
|
"eval_runtime": 74.6568, |
|
"eval_samples_per_second": 11.051, |
|
"eval_steps_per_second": 5.532, |
|
"step": 618000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 30.824867248535156, |
|
"learning_rate": 9.063143314797083e-06, |
|
"loss": 2.5062, |
|
"step": 619000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.512939214706421, |
|
"eval_runtime": 89.075, |
|
"eval_samples_per_second": 9.262, |
|
"eval_steps_per_second": 4.637, |
|
"step": 619000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 58.68292999267578, |
|
"learning_rate": 9.077784903350875e-06, |
|
"loss": 2.5045, |
|
"step": 620000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5159168243408203, |
|
"eval_runtime": 74.6453, |
|
"eval_samples_per_second": 11.052, |
|
"eval_steps_per_second": 5.533, |
|
"step": 620000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 34.04876708984375, |
|
"learning_rate": 9.092426491904666e-06, |
|
"loss": 2.5157, |
|
"step": 621000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.50462007522583, |
|
"eval_runtime": 74.5748, |
|
"eval_samples_per_second": 11.063, |
|
"eval_steps_per_second": 5.538, |
|
"step": 621000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 21.843368530273438, |
|
"learning_rate": 9.107068080458457e-06, |
|
"loss": 2.5001, |
|
"step": 622000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.6030919551849365, |
|
"eval_runtime": 88.9981, |
|
"eval_samples_per_second": 9.27, |
|
"eval_steps_per_second": 4.641, |
|
"step": 622000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 80.11573028564453, |
|
"learning_rate": 9.12170966901225e-06, |
|
"loss": 2.5089, |
|
"step": 623000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5459847450256348, |
|
"eval_runtime": 79.1187, |
|
"eval_samples_per_second": 10.427, |
|
"eval_steps_per_second": 5.22, |
|
"step": 623000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 33.59737014770508, |
|
"learning_rate": 9.136351257566042e-06, |
|
"loss": 2.5009, |
|
"step": 624000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5228402614593506, |
|
"eval_runtime": 88.6038, |
|
"eval_samples_per_second": 9.311, |
|
"eval_steps_per_second": 4.661, |
|
"step": 624000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 87.70807647705078, |
|
"learning_rate": 9.150992846119835e-06, |
|
"loss": 2.5444, |
|
"step": 625000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.4817819595336914, |
|
"eval_runtime": 74.6477, |
|
"eval_samples_per_second": 11.052, |
|
"eval_steps_per_second": 5.533, |
|
"step": 625000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 36.25197219848633, |
|
"learning_rate": 9.165634434673624e-06, |
|
"loss": 2.5222, |
|
"step": 626000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.5177390575408936, |
|
"eval_runtime": 74.5677, |
|
"eval_samples_per_second": 11.064, |
|
"eval_steps_per_second": 5.539, |
|
"step": 626000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 27.811765670776367, |
|
"learning_rate": 9.180276023227417e-06, |
|
"loss": 2.5234, |
|
"step": 627000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.4546000957489014, |
|
"eval_runtime": 89.1778, |
|
"eval_samples_per_second": 9.251, |
|
"eval_steps_per_second": 4.631, |
|
"step": 627000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 23.489614486694336, |
|
"learning_rate": 9.194917611781209e-06, |
|
"loss": 2.5004, |
|
"step": 628000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.509915351867676, |
|
"eval_runtime": 74.8877, |
|
"eval_samples_per_second": 11.016, |
|
"eval_steps_per_second": 5.515, |
|
"step": 628000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 27.403156280517578, |
|
"learning_rate": 9.209559200335e-06, |
|
"loss": 2.5133, |
|
"step": 629000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.4226651191711426, |
|
"eval_runtime": 88.7661, |
|
"eval_samples_per_second": 9.294, |
|
"eval_steps_per_second": 4.653, |
|
"step": 629000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 30.951892852783203, |
|
"learning_rate": 9.224200788888791e-06, |
|
"loss": 2.5104, |
|
"step": 630000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 2.4871816635131836, |
|
"eval_runtime": 74.6946, |
|
"eval_samples_per_second": 11.045, |
|
"eval_steps_per_second": 5.529, |
|
"step": 630000 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 6829858, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 70000, |
|
"total_flos": 2.0736382681586196e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|