|
{ |
|
"best_metric": 0.6895740032196045, |
|
"best_model_checkpoint": "./checkpoints/llava-v1.5-13b/checkpoint-224", |
|
"epoch": 7.0, |
|
"eval_steps": 1.0, |
|
"global_step": 224, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 0.2380081706918525, |
|
"learning_rate": 0.0, |
|
"loss": 1.2458, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"eval_loss": 1.3161638975143433, |
|
"eval_runtime": 50.8995, |
|
"eval_samples_per_second": 3.929, |
|
"eval_steps_per_second": 0.255, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 0.20429495268987705, |
|
"learning_rate": 8.613531161467863e-06, |
|
"loss": 1.2003, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"eval_loss": 1.3161638975143433, |
|
"eval_runtime": 47.4818, |
|
"eval_samples_per_second": 4.212, |
|
"eval_steps_per_second": 0.274, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 0.20616215800420787, |
|
"learning_rate": 1.3652123889719709e-05, |
|
"loss": 1.2622, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"eval_loss": 1.309991478919983, |
|
"eval_runtime": 47.4152, |
|
"eval_samples_per_second": 4.218, |
|
"eval_steps_per_second": 0.274, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.20155595022101944, |
|
"learning_rate": 1.7227062322935725e-05, |
|
"loss": 1.2845, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"eval_loss": 1.3013781309127808, |
|
"eval_runtime": 47.4814, |
|
"eval_samples_per_second": 4.212, |
|
"eval_steps_per_second": 0.274, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 0.21113117474989132, |
|
"learning_rate": 2e-05, |
|
"loss": 1.246, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"eval_loss": 1.2892160415649414, |
|
"eval_runtime": 47.7209, |
|
"eval_samples_per_second": 4.191, |
|
"eval_steps_per_second": 0.272, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 0.21377946631015488, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2684, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"eval_loss": 1.2754532098770142, |
|
"eval_runtime": 47.5781, |
|
"eval_samples_per_second": 4.204, |
|
"eval_steps_per_second": 0.273, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 0.2284268997618767, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2681, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"eval_loss": 1.2605774402618408, |
|
"eval_runtime": 47.5326, |
|
"eval_samples_per_second": 4.208, |
|
"eval_steps_per_second": 0.273, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.23585343568544442, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2407, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.244718313217163, |
|
"eval_runtime": 47.5001, |
|
"eval_samples_per_second": 4.211, |
|
"eval_steps_per_second": 0.274, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 0.23051191992462533, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2766, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"eval_loss": 1.2285138368606567, |
|
"eval_runtime": 47.4631, |
|
"eval_samples_per_second": 4.214, |
|
"eval_steps_per_second": 0.274, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.22726394327484983, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2024, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"eval_loss": 1.2118008136749268, |
|
"eval_runtime": 47.4991, |
|
"eval_samples_per_second": 4.211, |
|
"eval_steps_per_second": 0.274, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 0.25404890894461285, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2742, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"eval_loss": 1.1942989826202393, |
|
"eval_runtime": 49.2609, |
|
"eval_samples_per_second": 4.06, |
|
"eval_steps_per_second": 0.264, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.26336210916526287, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2258, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"eval_loss": 1.176426649093628, |
|
"eval_runtime": 49.0639, |
|
"eval_samples_per_second": 4.076, |
|
"eval_steps_per_second": 0.265, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 0.29637148470746666, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2345, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"eval_loss": 1.1577811241149902, |
|
"eval_runtime": 49.1352, |
|
"eval_samples_per_second": 4.07, |
|
"eval_steps_per_second": 0.265, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 0.2841880377627424, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0765, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"eval_loss": 1.1381279230117798, |
|
"eval_runtime": 49.25, |
|
"eval_samples_per_second": 4.061, |
|
"eval_steps_per_second": 0.264, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 0.2773140636191091, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1812, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"eval_loss": 1.1178216934204102, |
|
"eval_runtime": 49.0879, |
|
"eval_samples_per_second": 4.074, |
|
"eval_steps_per_second": 0.265, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.3568607365552051, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1327, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.0954149961471558, |
|
"eval_runtime": 48.6546, |
|
"eval_samples_per_second": 4.111, |
|
"eval_steps_per_second": 0.267, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 0.32574391414112897, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1162, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"eval_loss": 1.071275234222412, |
|
"eval_runtime": 48.5618, |
|
"eval_samples_per_second": 4.118, |
|
"eval_steps_per_second": 0.268, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.4256864144638081, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1138, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"eval_loss": 1.0455905199050903, |
|
"eval_runtime": 48.4981, |
|
"eval_samples_per_second": 4.124, |
|
"eval_steps_per_second": 0.268, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 0.31230014132112643, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0011, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"eval_loss": 1.0208789110183716, |
|
"eval_runtime": 48.4675, |
|
"eval_samples_per_second": 4.126, |
|
"eval_steps_per_second": 0.268, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.3025724039243594, |
|
"learning_rate": 2e-05, |
|
"loss": 1.109, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"eval_loss": 1.002480149269104, |
|
"eval_runtime": 48.5265, |
|
"eval_samples_per_second": 4.121, |
|
"eval_steps_per_second": 0.268, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 0.27787879590501874, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0291, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"eval_loss": 0.9933492541313171, |
|
"eval_runtime": 50.0369, |
|
"eval_samples_per_second": 3.997, |
|
"eval_steps_per_second": 0.26, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 0.4231294067130801, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0779, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"eval_loss": 0.9850385785102844, |
|
"eval_runtime": 50.0062, |
|
"eval_samples_per_second": 4.0, |
|
"eval_steps_per_second": 0.26, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 0.42130097437373987, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0897, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"eval_loss": 0.9758670330047607, |
|
"eval_runtime": 50.1031, |
|
"eval_samples_per_second": 3.992, |
|
"eval_steps_per_second": 0.259, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.27711808063263893, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0739, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.9674506187438965, |
|
"eval_runtime": 50.0337, |
|
"eval_samples_per_second": 3.997, |
|
"eval_steps_per_second": 0.26, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 0.2879649409281791, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0182, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"eval_loss": 0.9592065215110779, |
|
"eval_runtime": 50.0709, |
|
"eval_samples_per_second": 3.994, |
|
"eval_steps_per_second": 0.26, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.19327450826076825, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0413, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"eval_loss": 0.9518552422523499, |
|
"eval_runtime": 50.0572, |
|
"eval_samples_per_second": 3.995, |
|
"eval_steps_per_second": 0.26, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 0.19707021382445633, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9525, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"eval_loss": 0.9449941515922546, |
|
"eval_runtime": 50.0515, |
|
"eval_samples_per_second": 3.996, |
|
"eval_steps_per_second": 0.26, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.2420270757641518, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9658, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"eval_loss": 0.9378474354743958, |
|
"eval_runtime": 49.9299, |
|
"eval_samples_per_second": 4.006, |
|
"eval_steps_per_second": 0.26, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 0.18074632782127534, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9866, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"eval_loss": 0.93099045753479, |
|
"eval_runtime": 50.0096, |
|
"eval_samples_per_second": 3.999, |
|
"eval_steps_per_second": 0.26, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.1936051126921734, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0128, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"eval_loss": 0.9244199991226196, |
|
"eval_runtime": 50.2469, |
|
"eval_samples_per_second": 3.98, |
|
"eval_steps_per_second": 0.259, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 0.26164254459782943, |
|
"learning_rate": 2e-05, |
|
"loss": 0.88, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"eval_loss": 0.9175177216529846, |
|
"eval_runtime": 50.1695, |
|
"eval_samples_per_second": 3.986, |
|
"eval_steps_per_second": 0.259, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.18677152741688485, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9569, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.9108598828315735, |
|
"eval_runtime": 50.0387, |
|
"eval_samples_per_second": 3.997, |
|
"eval_steps_per_second": 0.26, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"grad_norm": 0.20486279036126417, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0208, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"eval_loss": 0.9042049646377563, |
|
"eval_runtime": 50.1472, |
|
"eval_samples_per_second": 3.988, |
|
"eval_steps_per_second": 0.259, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 0.2004946169291112, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9931, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"eval_loss": 0.8980298042297363, |
|
"eval_runtime": 50.245, |
|
"eval_samples_per_second": 3.98, |
|
"eval_steps_per_second": 0.259, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 0.1645872432258401, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0184, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"eval_loss": 0.8924428820610046, |
|
"eval_runtime": 50.3703, |
|
"eval_samples_per_second": 3.971, |
|
"eval_steps_per_second": 0.258, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.18293519304435016, |
|
"learning_rate": 2e-05, |
|
"loss": 1.0026, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"eval_loss": 0.8870412707328796, |
|
"eval_runtime": 50.0483, |
|
"eval_samples_per_second": 3.996, |
|
"eval_steps_per_second": 0.26, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"grad_norm": 0.17712548516246762, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9387, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"eval_loss": 0.881915271282196, |
|
"eval_runtime": 49.9751, |
|
"eval_samples_per_second": 4.002, |
|
"eval_steps_per_second": 0.26, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 0.21472689311609464, |
|
"learning_rate": 2e-05, |
|
"loss": 0.958, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"eval_loss": 0.8768754601478577, |
|
"eval_runtime": 50.1204, |
|
"eval_samples_per_second": 3.99, |
|
"eval_steps_per_second": 0.259, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"grad_norm": 0.21117297910005806, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9922, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"eval_loss": 0.8718628883361816, |
|
"eval_runtime": 50.1732, |
|
"eval_samples_per_second": 3.986, |
|
"eval_steps_per_second": 0.259, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.17835587003909165, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9776, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8669865131378174, |
|
"eval_runtime": 50.1148, |
|
"eval_samples_per_second": 3.991, |
|
"eval_steps_per_second": 0.259, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"grad_norm": 0.2092736372483734, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9731, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"eval_loss": 0.8619834780693054, |
|
"eval_runtime": 50.052, |
|
"eval_samples_per_second": 3.996, |
|
"eval_steps_per_second": 0.26, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 0.2338857391910308, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9319, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"eval_loss": 0.8572126030921936, |
|
"eval_runtime": 50.1212, |
|
"eval_samples_per_second": 3.99, |
|
"eval_steps_per_second": 0.259, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"grad_norm": 0.19168719284572813, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9083, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"eval_loss": 0.8525611758232117, |
|
"eval_runtime": 50.1733, |
|
"eval_samples_per_second": 3.986, |
|
"eval_steps_per_second": 0.259, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 0.20004868138433377, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9118, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"eval_loss": 0.8483461141586304, |
|
"eval_runtime": 50.1083, |
|
"eval_samples_per_second": 3.991, |
|
"eval_steps_per_second": 0.259, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 0.19012965506122342, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8888, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"eval_loss": 0.8446614742279053, |
|
"eval_runtime": 50.1171, |
|
"eval_samples_per_second": 3.991, |
|
"eval_steps_per_second": 0.259, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 0.21187005706805245, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9319, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"eval_loss": 0.8412036299705505, |
|
"eval_runtime": 50.0918, |
|
"eval_samples_per_second": 3.993, |
|
"eval_steps_per_second": 0.26, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"grad_norm": 0.19673832205926584, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9359, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"eval_loss": 0.8380417823791504, |
|
"eval_runtime": 50.2214, |
|
"eval_samples_per_second": 3.982, |
|
"eval_steps_per_second": 0.259, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.21712294106174318, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8511, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.8353021740913391, |
|
"eval_runtime": 50.1617, |
|
"eval_samples_per_second": 3.987, |
|
"eval_steps_per_second": 0.259, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"grad_norm": 0.2138924779700934, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8695, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"eval_loss": 0.8327407836914062, |
|
"eval_runtime": 50.1442, |
|
"eval_samples_per_second": 3.988, |
|
"eval_steps_per_second": 0.259, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.22387442384578618, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8518, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"eval_loss": 0.8301742076873779, |
|
"eval_runtime": 50.1867, |
|
"eval_samples_per_second": 3.985, |
|
"eval_steps_per_second": 0.259, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"grad_norm": 0.1975577146517192, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8868, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"eval_loss": 0.8275265693664551, |
|
"eval_runtime": 51.2257, |
|
"eval_samples_per_second": 3.904, |
|
"eval_steps_per_second": 0.254, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.21474817057286624, |
|
"learning_rate": 2e-05, |
|
"loss": 0.767, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"eval_loss": 0.824796736240387, |
|
"eval_runtime": 51.276, |
|
"eval_samples_per_second": 3.9, |
|
"eval_steps_per_second": 0.254, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"grad_norm": 0.21105651676755652, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9219, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"eval_loss": 0.8221166729927063, |
|
"eval_runtime": 51.141, |
|
"eval_samples_per_second": 3.911, |
|
"eval_steps_per_second": 0.254, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 0.20706475184742085, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8873, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"eval_loss": 0.819589376449585, |
|
"eval_runtime": 51.0045, |
|
"eval_samples_per_second": 3.921, |
|
"eval_steps_per_second": 0.255, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 0.21722220033855957, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8956, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"eval_loss": 0.8176340460777283, |
|
"eval_runtime": 51.1941, |
|
"eval_samples_per_second": 3.907, |
|
"eval_steps_per_second": 0.254, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.20669001221665667, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9506, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 0.8158826231956482, |
|
"eval_runtime": 52.1162, |
|
"eval_samples_per_second": 3.838, |
|
"eval_steps_per_second": 0.249, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"grad_norm": 0.22189732090066341, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8955, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"eval_loss": 0.814656674861908, |
|
"eval_runtime": 52.1361, |
|
"eval_samples_per_second": 3.836, |
|
"eval_steps_per_second": 0.249, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 0.2030113892848459, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9108, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"eval_loss": 0.813343346118927, |
|
"eval_runtime": 52.2552, |
|
"eval_samples_per_second": 3.827, |
|
"eval_steps_per_second": 0.249, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"grad_norm": 0.2123201057569791, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8779, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"eval_loss": 0.8116877675056458, |
|
"eval_runtime": 52.1233, |
|
"eval_samples_per_second": 3.837, |
|
"eval_steps_per_second": 0.249, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.211551126937912, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9294, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"eval_loss": 0.8098442554473877, |
|
"eval_runtime": 52.1091, |
|
"eval_samples_per_second": 3.838, |
|
"eval_steps_per_second": 0.249, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"grad_norm": 0.24981344981629752, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8409, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"eval_loss": 0.8070770502090454, |
|
"eval_runtime": 53.4187, |
|
"eval_samples_per_second": 3.744, |
|
"eval_steps_per_second": 0.243, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 0.2341550589775159, |
|
"learning_rate": 2e-05, |
|
"loss": 0.888, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"eval_loss": 0.8040286898612976, |
|
"eval_runtime": 53.2197, |
|
"eval_samples_per_second": 3.758, |
|
"eval_steps_per_second": 0.244, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"grad_norm": 0.2336241775649256, |
|
"learning_rate": 2e-05, |
|
"loss": 0.913, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"eval_loss": 0.8013430833816528, |
|
"eval_runtime": 53.1784, |
|
"eval_samples_per_second": 3.761, |
|
"eval_steps_per_second": 0.244, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.2414390628081758, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8754, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.7985894680023193, |
|
"eval_runtime": 53.2454, |
|
"eval_samples_per_second": 3.756, |
|
"eval_steps_per_second": 0.244, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 0.2484104465653703, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8497, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"eval_loss": 0.7954932451248169, |
|
"eval_runtime": 53.3794, |
|
"eval_samples_per_second": 3.747, |
|
"eval_steps_per_second": 0.244, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 0.23859744120942086, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8567, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"eval_loss": 0.7929843068122864, |
|
"eval_runtime": 55.517, |
|
"eval_samples_per_second": 3.602, |
|
"eval_steps_per_second": 0.234, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"grad_norm": 0.24584758647855462, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8489, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"eval_loss": 0.7903321981430054, |
|
"eval_runtime": 55.4151, |
|
"eval_samples_per_second": 3.609, |
|
"eval_steps_per_second": 0.235, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 0.2484917818304153, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9122, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"eval_loss": 0.7877185344696045, |
|
"eval_runtime": 55.4069, |
|
"eval_samples_per_second": 3.61, |
|
"eval_steps_per_second": 0.235, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"grad_norm": 0.2184614083026819, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8355, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"eval_loss": 0.7852210998535156, |
|
"eval_runtime": 55.3381, |
|
"eval_samples_per_second": 3.614, |
|
"eval_steps_per_second": 0.235, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 0.24978410070800153, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7968, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"eval_loss": 0.7827157378196716, |
|
"eval_runtime": 55.3708, |
|
"eval_samples_per_second": 3.612, |
|
"eval_steps_per_second": 0.235, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"grad_norm": 0.23059883325890385, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8783, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"eval_loss": 0.7805906534194946, |
|
"eval_runtime": 55.6033, |
|
"eval_samples_per_second": 3.597, |
|
"eval_steps_per_second": 0.234, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.23261007334915096, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7956, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_loss": 0.7786691784858704, |
|
"eval_runtime": 55.0913, |
|
"eval_samples_per_second": 3.63, |
|
"eval_steps_per_second": 0.236, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"grad_norm": 0.25779598356574085, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8426, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"eval_loss": 0.7771151661872864, |
|
"eval_runtime": 55.0698, |
|
"eval_samples_per_second": 3.632, |
|
"eval_steps_per_second": 0.236, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 0.2288243335971112, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8381, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"eval_loss": 0.7756838202476501, |
|
"eval_runtime": 54.8412, |
|
"eval_samples_per_second": 3.647, |
|
"eval_steps_per_second": 0.237, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 0.24235644907977733, |
|
"learning_rate": 2e-05, |
|
"loss": 0.887, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"eval_loss": 0.7739972472190857, |
|
"eval_runtime": 54.9718, |
|
"eval_samples_per_second": 3.638, |
|
"eval_steps_per_second": 0.236, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 0.23666820017867402, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8007, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"eval_loss": 0.7724328637123108, |
|
"eval_runtime": 55.0225, |
|
"eval_samples_per_second": 3.635, |
|
"eval_steps_per_second": 0.236, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"grad_norm": 0.22815737396609181, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8529, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"eval_loss": 0.7710004448890686, |
|
"eval_runtime": 55.321, |
|
"eval_samples_per_second": 3.615, |
|
"eval_steps_per_second": 0.235, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 0.2701264871470739, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8515, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"eval_loss": 0.7695322632789612, |
|
"eval_runtime": 55.3045, |
|
"eval_samples_per_second": 3.616, |
|
"eval_steps_per_second": 0.235, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"grad_norm": 0.24363813951328234, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8587, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"eval_loss": 0.7689024209976196, |
|
"eval_runtime": 55.3009, |
|
"eval_samples_per_second": 3.617, |
|
"eval_steps_per_second": 0.235, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.30924701355253065, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9076, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.7676254510879517, |
|
"eval_runtime": 55.2365, |
|
"eval_samples_per_second": 3.621, |
|
"eval_steps_per_second": 0.235, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"grad_norm": 0.2665188280221636, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8445, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"eval_loss": 0.7661146521568298, |
|
"eval_runtime": 55.2775, |
|
"eval_samples_per_second": 3.618, |
|
"eval_steps_per_second": 0.235, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 0.24674191720675534, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8882, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"eval_loss": 0.76513671875, |
|
"eval_runtime": 55.0857, |
|
"eval_samples_per_second": 3.631, |
|
"eval_steps_per_second": 0.236, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"grad_norm": 0.2736689405531704, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8336, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"eval_loss": 0.764373779296875, |
|
"eval_runtime": 55.2069, |
|
"eval_samples_per_second": 3.623, |
|
"eval_steps_per_second": 0.235, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 0.290841287198557, |
|
"learning_rate": 2e-05, |
|
"loss": 0.795, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"eval_loss": 0.7632084488868713, |
|
"eval_runtime": 55.1009, |
|
"eval_samples_per_second": 3.63, |
|
"eval_steps_per_second": 0.236, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 0.2912051076836381, |
|
"learning_rate": 2e-05, |
|
"loss": 0.772, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"eval_loss": 0.7618446350097656, |
|
"eval_runtime": 55.3717, |
|
"eval_samples_per_second": 3.612, |
|
"eval_steps_per_second": 0.235, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 0.3169908538809109, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8148, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"eval_loss": 0.7599577307701111, |
|
"eval_runtime": 55.3931, |
|
"eval_samples_per_second": 3.611, |
|
"eval_steps_per_second": 0.235, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"grad_norm": 0.28780549186847426, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8154, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"eval_loss": 0.7583369612693787, |
|
"eval_runtime": 55.1679, |
|
"eval_samples_per_second": 3.625, |
|
"eval_steps_per_second": 0.236, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.30695250620091474, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9032, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_loss": 0.7571613192558289, |
|
"eval_runtime": 55.1779, |
|
"eval_samples_per_second": 3.625, |
|
"eval_steps_per_second": 0.236, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"grad_norm": 0.2693887416759828, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8106, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"eval_loss": 0.7566004991531372, |
|
"eval_runtime": 55.1107, |
|
"eval_samples_per_second": 3.629, |
|
"eval_steps_per_second": 0.236, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 0.2887583627563198, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8518, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"eval_loss": 0.7558963298797607, |
|
"eval_runtime": 55.2153, |
|
"eval_samples_per_second": 3.622, |
|
"eval_steps_per_second": 0.235, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"grad_norm": 0.3059402168979351, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7727, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"eval_loss": 0.7545350790023804, |
|
"eval_runtime": 55.3225, |
|
"eval_samples_per_second": 3.615, |
|
"eval_steps_per_second": 0.235, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 0.3096260477909968, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8477, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"eval_loss": 0.7526452541351318, |
|
"eval_runtime": 55.4311, |
|
"eval_samples_per_second": 3.608, |
|
"eval_steps_per_second": 0.235, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"grad_norm": 0.31498884686525297, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7982, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"eval_loss": 0.7510760426521301, |
|
"eval_runtime": 55.4361, |
|
"eval_samples_per_second": 3.608, |
|
"eval_steps_per_second": 0.235, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 0.31302830623184313, |
|
"learning_rate": 2e-05, |
|
"loss": 0.871, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"eval_loss": 0.7500898838043213, |
|
"eval_runtime": 55.3025, |
|
"eval_samples_per_second": 3.616, |
|
"eval_steps_per_second": 0.235, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 0.3132608568779145, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8094, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"eval_loss": 0.7498895525932312, |
|
"eval_runtime": 55.2402, |
|
"eval_samples_per_second": 3.621, |
|
"eval_steps_per_second": 0.235, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.298645350091386, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7673, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.7493192553520203, |
|
"eval_runtime": 54.8718, |
|
"eval_samples_per_second": 3.645, |
|
"eval_steps_per_second": 0.237, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"grad_norm": 0.34042584783125357, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7336, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"eval_loss": 0.7476670742034912, |
|
"eval_runtime": 54.9305, |
|
"eval_samples_per_second": 3.641, |
|
"eval_steps_per_second": 0.237, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"grad_norm": 0.293099043801068, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8088, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"eval_loss": 0.745802640914917, |
|
"eval_runtime": 55.2051, |
|
"eval_samples_per_second": 3.623, |
|
"eval_steps_per_second": 0.235, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"grad_norm": 0.3042839507858426, |
|
"learning_rate": 2e-05, |
|
"loss": 0.787, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"eval_loss": 0.7439618110656738, |
|
"eval_runtime": 55.0065, |
|
"eval_samples_per_second": 3.636, |
|
"eval_steps_per_second": 0.236, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 0.32992077073227005, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8296, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"eval_loss": 0.7424842715263367, |
|
"eval_runtime": 55.1254, |
|
"eval_samples_per_second": 3.628, |
|
"eval_steps_per_second": 0.236, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"grad_norm": 0.2798839747424062, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7642, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"eval_loss": 0.7414796948432922, |
|
"eval_runtime": 49.183, |
|
"eval_samples_per_second": 4.066, |
|
"eval_steps_per_second": 0.264, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"grad_norm": 0.3046631191964983, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8203, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"eval_loss": 0.7410265207290649, |
|
"eval_runtime": 48.1541, |
|
"eval_samples_per_second": 4.153, |
|
"eval_steps_per_second": 0.27, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"grad_norm": 0.3117517214859861, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8222, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"eval_loss": 0.7405675649642944, |
|
"eval_runtime": 47.7145, |
|
"eval_samples_per_second": 4.192, |
|
"eval_steps_per_second": 0.272, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.3412709249466801, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7459, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"eval_loss": 0.7395681738853455, |
|
"eval_runtime": 47.5855, |
|
"eval_samples_per_second": 4.203, |
|
"eval_steps_per_second": 0.273, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"grad_norm": 0.2917443566507923, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7849, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"eval_loss": 0.7387100458145142, |
|
"eval_runtime": 47.6344, |
|
"eval_samples_per_second": 4.199, |
|
"eval_steps_per_second": 0.273, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"grad_norm": 0.3054484743574741, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8354, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"eval_loss": 0.7384718060493469, |
|
"eval_runtime": 47.8373, |
|
"eval_samples_per_second": 4.181, |
|
"eval_steps_per_second": 0.272, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"grad_norm": 0.34986630381114014, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7069, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"eval_loss": 0.737342357635498, |
|
"eval_runtime": 47.5763, |
|
"eval_samples_per_second": 4.204, |
|
"eval_steps_per_second": 0.273, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 0.32324403145716496, |
|
"learning_rate": 2e-05, |
|
"loss": 0.767, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"eval_loss": 0.7360101938247681, |
|
"eval_runtime": 47.5774, |
|
"eval_samples_per_second": 4.204, |
|
"eval_steps_per_second": 0.273, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"grad_norm": 0.3795969851258545, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7556, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"eval_loss": 0.7339167594909668, |
|
"eval_runtime": 47.5818, |
|
"eval_samples_per_second": 4.203, |
|
"eval_steps_per_second": 0.273, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 0.34401062275458993, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7494, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"eval_loss": 0.7321068644523621, |
|
"eval_runtime": 47.7643, |
|
"eval_samples_per_second": 4.187, |
|
"eval_steps_per_second": 0.272, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"grad_norm": 0.3248480010385237, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8103, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"eval_loss": 0.7309197783470154, |
|
"eval_runtime": 49.5841, |
|
"eval_samples_per_second": 4.034, |
|
"eval_steps_per_second": 0.262, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.3572409124813593, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7972, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_loss": 0.7301727533340454, |
|
"eval_runtime": 49.3728, |
|
"eval_samples_per_second": 4.051, |
|
"eval_steps_per_second": 0.263, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"grad_norm": 0.37348522775103665, |
|
"learning_rate": 2e-05, |
|
"loss": 0.88, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"eval_loss": 0.7292957305908203, |
|
"eval_runtime": 49.2192, |
|
"eval_samples_per_second": 4.063, |
|
"eval_steps_per_second": 0.264, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"grad_norm": 0.37667450960329546, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7518, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"eval_loss": 0.728556215763092, |
|
"eval_runtime": 49.0971, |
|
"eval_samples_per_second": 4.074, |
|
"eval_steps_per_second": 0.265, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"grad_norm": 0.3163628607304638, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7948, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"eval_loss": 0.7287828326225281, |
|
"eval_runtime": 49.0213, |
|
"eval_samples_per_second": 4.08, |
|
"eval_steps_per_second": 0.265, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 0.3038899302084592, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7791, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"eval_loss": 0.7294514179229736, |
|
"eval_runtime": 51.9137, |
|
"eval_samples_per_second": 3.853, |
|
"eval_steps_per_second": 0.25, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"grad_norm": 0.3746448663122327, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7863, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"eval_loss": 0.7289304137229919, |
|
"eval_runtime": 51.3023, |
|
"eval_samples_per_second": 3.898, |
|
"eval_steps_per_second": 0.253, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"grad_norm": 0.4058937381299434, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7907, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"eval_loss": 0.7281011343002319, |
|
"eval_runtime": 50.8635, |
|
"eval_samples_per_second": 3.932, |
|
"eval_steps_per_second": 0.256, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"grad_norm": 0.31608065583227885, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8348, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"eval_loss": 0.7280247211456299, |
|
"eval_runtime": 50.4903, |
|
"eval_samples_per_second": 3.961, |
|
"eval_steps_per_second": 0.257, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.3375768031046084, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7783, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 0.7281913757324219, |
|
"eval_runtime": 50.5906, |
|
"eval_samples_per_second": 3.953, |
|
"eval_steps_per_second": 0.257, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"grad_norm": 0.36047493494859845, |
|
"learning_rate": 2e-05, |
|
"loss": 0.765, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"eval_loss": 0.7269737124443054, |
|
"eval_runtime": 53.4722, |
|
"eval_samples_per_second": 3.74, |
|
"eval_steps_per_second": 0.243, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"grad_norm": 0.389743860171921, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8269, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"eval_loss": 0.7251996397972107, |
|
"eval_runtime": 53.4986, |
|
"eval_samples_per_second": 3.738, |
|
"eval_steps_per_second": 0.243, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"grad_norm": 0.33850935145960215, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7497, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"eval_loss": 0.723595142364502, |
|
"eval_runtime": 53.4196, |
|
"eval_samples_per_second": 3.744, |
|
"eval_steps_per_second": 0.243, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 0.3166770012114478, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7648, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"eval_loss": 0.7223578095436096, |
|
"eval_runtime": 52.6143, |
|
"eval_samples_per_second": 3.801, |
|
"eval_steps_per_second": 0.247, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"grad_norm": 0.41948670305268276, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8306, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"eval_loss": 0.7206680774688721, |
|
"eval_runtime": 52.3885, |
|
"eval_samples_per_second": 3.818, |
|
"eval_steps_per_second": 0.248, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"grad_norm": 0.35580041105853477, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7945, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"eval_loss": 0.7196171283721924, |
|
"eval_runtime": 55.1225, |
|
"eval_samples_per_second": 3.628, |
|
"eval_steps_per_second": 0.236, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"grad_norm": 0.38411890663257114, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7466, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"eval_loss": 0.7188088297843933, |
|
"eval_runtime": 55.3068, |
|
"eval_samples_per_second": 3.616, |
|
"eval_steps_per_second": 0.235, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.3682220575203032, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6752, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.7181470990180969, |
|
"eval_runtime": 53.9116, |
|
"eval_samples_per_second": 3.71, |
|
"eval_steps_per_second": 0.241, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"grad_norm": 0.34160763542661665, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7788, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"eval_loss": 0.717949390411377, |
|
"eval_runtime": 53.8446, |
|
"eval_samples_per_second": 3.714, |
|
"eval_steps_per_second": 0.241, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"grad_norm": 0.35709301353799944, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8002, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"eval_loss": 0.7179380655288696, |
|
"eval_runtime": 53.9299, |
|
"eval_samples_per_second": 3.709, |
|
"eval_steps_per_second": 0.241, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"grad_norm": 0.3503147340749238, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7789, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"eval_loss": 0.7180312871932983, |
|
"eval_runtime": 53.4091, |
|
"eval_samples_per_second": 3.745, |
|
"eval_steps_per_second": 0.243, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"grad_norm": 0.3931715546229069, |
|
"learning_rate": 2e-05, |
|
"loss": 0.762, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"eval_loss": 0.717825710773468, |
|
"eval_runtime": 53.6366, |
|
"eval_samples_per_second": 3.729, |
|
"eval_steps_per_second": 0.242, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"grad_norm": 0.36864033862644363, |
|
"learning_rate": 2e-05, |
|
"loss": 0.829, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"eval_loss": 0.7178698182106018, |
|
"eval_runtime": 53.4891, |
|
"eval_samples_per_second": 3.739, |
|
"eval_steps_per_second": 0.243, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"grad_norm": 0.41393587587462155, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7624, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"eval_loss": 0.7181968092918396, |
|
"eval_runtime": 53.5395, |
|
"eval_samples_per_second": 3.736, |
|
"eval_steps_per_second": 0.243, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"grad_norm": 0.36727603900023204, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7572, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"eval_loss": 0.7187527418136597, |
|
"eval_runtime": 53.4818, |
|
"eval_samples_per_second": 3.74, |
|
"eval_steps_per_second": 0.243, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.3684078795455007, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7352, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"eval_loss": 0.7194793820381165, |
|
"eval_runtime": 53.4694, |
|
"eval_samples_per_second": 3.74, |
|
"eval_steps_per_second": 0.243, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"grad_norm": 0.42414766562621153, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7433, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"eval_loss": 0.7189603447914124, |
|
"eval_runtime": 53.8049, |
|
"eval_samples_per_second": 3.717, |
|
"eval_steps_per_second": 0.242, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"grad_norm": 0.40420796619211563, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7466, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"eval_loss": 0.7173956036567688, |
|
"eval_runtime": 53.4014, |
|
"eval_samples_per_second": 3.745, |
|
"eval_steps_per_second": 0.243, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"grad_norm": 0.36419740641344456, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7045, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"eval_loss": 0.7153105139732361, |
|
"eval_runtime": 53.285, |
|
"eval_samples_per_second": 3.753, |
|
"eval_steps_per_second": 0.244, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"grad_norm": 0.384927357409491, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7437, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"eval_loss": 0.7135314345359802, |
|
"eval_runtime": 53.4056, |
|
"eval_samples_per_second": 3.745, |
|
"eval_steps_per_second": 0.243, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"grad_norm": 0.37218579680263697, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7693, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"eval_loss": 0.7120725512504578, |
|
"eval_runtime": 53.5467, |
|
"eval_samples_per_second": 3.735, |
|
"eval_steps_per_second": 0.243, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"grad_norm": 0.38541382926033946, |
|
"learning_rate": 2e-05, |
|
"loss": 0.708, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"eval_loss": 0.7110380530357361, |
|
"eval_runtime": 53.4119, |
|
"eval_samples_per_second": 3.744, |
|
"eval_steps_per_second": 0.243, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"grad_norm": 0.4028726453247759, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7263, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"eval_loss": 0.7100683450698853, |
|
"eval_runtime": 53.4337, |
|
"eval_samples_per_second": 3.743, |
|
"eval_steps_per_second": 0.243, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.3736204162232246, |
|
"learning_rate": 2e-05, |
|
"loss": 0.698, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"eval_loss": 0.7093971371650696, |
|
"eval_runtime": 53.4582, |
|
"eval_samples_per_second": 3.741, |
|
"eval_steps_per_second": 0.243, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"grad_norm": 0.4179284798304916, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7611, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"eval_loss": 0.7089446783065796, |
|
"eval_runtime": 53.4752, |
|
"eval_samples_per_second": 3.74, |
|
"eval_steps_per_second": 0.243, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"grad_norm": 0.4038858950888911, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6652, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"eval_loss": 0.7089542150497437, |
|
"eval_runtime": 53.4741, |
|
"eval_samples_per_second": 3.74, |
|
"eval_steps_per_second": 0.243, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 4.59375, |
|
"grad_norm": 0.41740068710674544, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7319, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.59375, |
|
"eval_loss": 0.7090431451797485, |
|
"eval_runtime": 53.2419, |
|
"eval_samples_per_second": 3.756, |
|
"eval_steps_per_second": 0.244, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"grad_norm": 0.4288335811568808, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6837, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"eval_loss": 0.7088204026222229, |
|
"eval_runtime": 53.3614, |
|
"eval_samples_per_second": 3.748, |
|
"eval_steps_per_second": 0.244, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 4.65625, |
|
"grad_norm": 0.399955010119186, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7989, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.65625, |
|
"eval_loss": 0.7084855437278748, |
|
"eval_runtime": 53.4923, |
|
"eval_samples_per_second": 3.739, |
|
"eval_steps_per_second": 0.243, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"grad_norm": 0.41794643164255846, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7194, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"eval_loss": 0.7080708146095276, |
|
"eval_runtime": 53.639, |
|
"eval_samples_per_second": 3.729, |
|
"eval_steps_per_second": 0.242, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.71875, |
|
"grad_norm": 0.40953367303148197, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7354, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.71875, |
|
"eval_loss": 0.7077429890632629, |
|
"eval_runtime": 53.3837, |
|
"eval_samples_per_second": 3.746, |
|
"eval_steps_per_second": 0.244, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.5012282841513718, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7662, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"eval_loss": 0.7064151167869568, |
|
"eval_runtime": 53.3549, |
|
"eval_samples_per_second": 3.748, |
|
"eval_steps_per_second": 0.244, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 4.78125, |
|
"grad_norm": 0.4210784420989087, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7133, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.78125, |
|
"eval_loss": 0.7052726745605469, |
|
"eval_runtime": 53.5059, |
|
"eval_samples_per_second": 3.738, |
|
"eval_steps_per_second": 0.243, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"grad_norm": 0.43520348530514996, |
|
"learning_rate": 2e-05, |
|
"loss": 0.729, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"eval_loss": 0.7045274972915649, |
|
"eval_runtime": 53.8352, |
|
"eval_samples_per_second": 3.715, |
|
"eval_steps_per_second": 0.241, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"grad_norm": 0.4287647569802656, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6727, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"eval_loss": 0.7041358947753906, |
|
"eval_runtime": 53.7435, |
|
"eval_samples_per_second": 3.721, |
|
"eval_steps_per_second": 0.242, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"grad_norm": 0.41883715320456333, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7755, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"eval_loss": 0.7037128210067749, |
|
"eval_runtime": 53.8035, |
|
"eval_samples_per_second": 3.717, |
|
"eval_steps_per_second": 0.242, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 4.90625, |
|
"grad_norm": 0.40617584505395354, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7776, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 4.90625, |
|
"eval_loss": 0.703965425491333, |
|
"eval_runtime": 53.8731, |
|
"eval_samples_per_second": 3.712, |
|
"eval_steps_per_second": 0.241, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"grad_norm": 0.4085802225532245, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7628, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"eval_loss": 0.7040860056877136, |
|
"eval_runtime": 53.9059, |
|
"eval_samples_per_second": 3.71, |
|
"eval_steps_per_second": 0.241, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.96875, |
|
"grad_norm": 0.418039298119887, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7221, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 4.96875, |
|
"eval_loss": 0.7039948105812073, |
|
"eval_runtime": 53.7323, |
|
"eval_samples_per_second": 3.722, |
|
"eval_steps_per_second": 0.242, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.46118870048713073, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7029, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.703814685344696, |
|
"eval_runtime": 53.8975, |
|
"eval_samples_per_second": 3.711, |
|
"eval_steps_per_second": 0.241, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.03125, |
|
"grad_norm": 0.431474386110294, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6772, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.03125, |
|
"eval_loss": 0.7034456133842468, |
|
"eval_runtime": 51.1105, |
|
"eval_samples_per_second": 3.913, |
|
"eval_steps_per_second": 0.254, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"grad_norm": 0.39618929325750435, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8219, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"eval_loss": 0.7042189240455627, |
|
"eval_runtime": 47.2927, |
|
"eval_samples_per_second": 4.229, |
|
"eval_steps_per_second": 0.275, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 5.09375, |
|
"grad_norm": 0.4489132713249424, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6387, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.09375, |
|
"eval_loss": 0.7061256170272827, |
|
"eval_runtime": 47.387, |
|
"eval_samples_per_second": 4.221, |
|
"eval_steps_per_second": 0.274, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"grad_norm": 0.5100329637159183, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7677, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"eval_loss": 0.708121657371521, |
|
"eval_runtime": 47.3311, |
|
"eval_samples_per_second": 4.226, |
|
"eval_steps_per_second": 0.275, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"grad_norm": 0.525511631981176, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5956, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"eval_loss": 0.7091134786605835, |
|
"eval_runtime": 47.2978, |
|
"eval_samples_per_second": 4.229, |
|
"eval_steps_per_second": 0.275, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"grad_norm": 0.534675354231597, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7097, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"eval_loss": 0.7097848653793335, |
|
"eval_runtime": 47.4095, |
|
"eval_samples_per_second": 4.219, |
|
"eval_steps_per_second": 0.274, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 5.21875, |
|
"grad_norm": 0.47286903698857446, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7371, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.21875, |
|
"eval_loss": 0.7090296745300293, |
|
"eval_runtime": 47.4487, |
|
"eval_samples_per_second": 4.215, |
|
"eval_steps_per_second": 0.274, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.4734705066820788, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7652, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"eval_loss": 0.7079525589942932, |
|
"eval_runtime": 47.4101, |
|
"eval_samples_per_second": 4.219, |
|
"eval_steps_per_second": 0.274, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 5.28125, |
|
"grad_norm": 0.46209764763985184, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6852, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.28125, |
|
"eval_loss": 0.7072803974151611, |
|
"eval_runtime": 47.3704, |
|
"eval_samples_per_second": 4.222, |
|
"eval_steps_per_second": 0.274, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"grad_norm": 0.4828284708486433, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6609, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"eval_loss": 0.7068901062011719, |
|
"eval_runtime": 47.425, |
|
"eval_samples_per_second": 4.217, |
|
"eval_steps_per_second": 0.274, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.34375, |
|
"grad_norm": 0.5230116179180577, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6872, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.34375, |
|
"eval_loss": 0.7058187127113342, |
|
"eval_runtime": 47.5711, |
|
"eval_samples_per_second": 4.204, |
|
"eval_steps_per_second": 0.273, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"grad_norm": 0.48081340678536255, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7694, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"eval_loss": 0.7044984698295593, |
|
"eval_runtime": 47.4233, |
|
"eval_samples_per_second": 4.217, |
|
"eval_steps_per_second": 0.274, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 5.40625, |
|
"grad_norm": 0.4787525602476421, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7342, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.40625, |
|
"eval_loss": 0.7032212018966675, |
|
"eval_runtime": 47.3534, |
|
"eval_samples_per_second": 4.224, |
|
"eval_steps_per_second": 0.275, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"grad_norm": 0.4871847582306217, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7562, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"eval_loss": 0.7019696235656738, |
|
"eval_runtime": 47.382, |
|
"eval_samples_per_second": 4.221, |
|
"eval_steps_per_second": 0.274, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"grad_norm": 0.47999745025553603, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7534, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"eval_loss": 0.7014529705047607, |
|
"eval_runtime": 47.4435, |
|
"eval_samples_per_second": 4.216, |
|
"eval_steps_per_second": 0.274, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.5168030891996357, |
|
"learning_rate": 2e-05, |
|
"loss": 0.707, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"eval_loss": 0.6993884444236755, |
|
"eval_runtime": 47.4943, |
|
"eval_samples_per_second": 4.211, |
|
"eval_steps_per_second": 0.274, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 5.53125, |
|
"grad_norm": 0.536450206978984, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7318, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.53125, |
|
"eval_loss": 0.6971662640571594, |
|
"eval_runtime": 47.4193, |
|
"eval_samples_per_second": 4.218, |
|
"eval_steps_per_second": 0.274, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"grad_norm": 0.45352543205020696, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7421, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"eval_loss": 0.6962605118751526, |
|
"eval_runtime": 47.3798, |
|
"eval_samples_per_second": 4.221, |
|
"eval_steps_per_second": 0.274, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 5.59375, |
|
"grad_norm": 0.5054883443109318, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6668, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.59375, |
|
"eval_loss": 0.6970357298851013, |
|
"eval_runtime": 47.3311, |
|
"eval_samples_per_second": 4.226, |
|
"eval_steps_per_second": 0.275, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"grad_norm": 0.49584660418833293, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6548, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"eval_loss": 0.6980059146881104, |
|
"eval_runtime": 47.299, |
|
"eval_samples_per_second": 4.228, |
|
"eval_steps_per_second": 0.275, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.65625, |
|
"grad_norm": 0.5114381326491793, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6691, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.65625, |
|
"eval_loss": 0.6995040774345398, |
|
"eval_runtime": 47.3887, |
|
"eval_samples_per_second": 4.22, |
|
"eval_steps_per_second": 0.274, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"grad_norm": 0.48550125668870825, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6525, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"eval_loss": 0.7020326256752014, |
|
"eval_runtime": 47.3838, |
|
"eval_samples_per_second": 4.221, |
|
"eval_steps_per_second": 0.274, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 5.71875, |
|
"grad_norm": 0.5860847796671736, |
|
"learning_rate": 2e-05, |
|
"loss": 0.674, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.71875, |
|
"eval_loss": 0.7027825713157654, |
|
"eval_runtime": 47.3875, |
|
"eval_samples_per_second": 4.221, |
|
"eval_steps_per_second": 0.274, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.5535582209035479, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6643, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"eval_loss": 0.7025408148765564, |
|
"eval_runtime": 47.5534, |
|
"eval_samples_per_second": 4.206, |
|
"eval_steps_per_second": 0.273, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"grad_norm": 0.5443574176405931, |
|
"learning_rate": 2e-05, |
|
"loss": 0.709, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"eval_loss": 0.7007840871810913, |
|
"eval_runtime": 47.4469, |
|
"eval_samples_per_second": 4.215, |
|
"eval_steps_per_second": 0.274, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"grad_norm": 0.563830259704143, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6884, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"eval_loss": 0.6979361176490784, |
|
"eval_runtime": 49.1203, |
|
"eval_samples_per_second": 4.072, |
|
"eval_steps_per_second": 0.265, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 5.84375, |
|
"grad_norm": 0.5094956892765212, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7318, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 5.84375, |
|
"eval_loss": 0.6962587237358093, |
|
"eval_runtime": 49.1831, |
|
"eval_samples_per_second": 4.066, |
|
"eval_steps_per_second": 0.264, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"grad_norm": 0.5264819980742595, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6746, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"eval_loss": 0.694776713848114, |
|
"eval_runtime": 49.1994, |
|
"eval_samples_per_second": 4.065, |
|
"eval_steps_per_second": 0.264, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 5.90625, |
|
"grad_norm": 0.4737429304023209, |
|
"learning_rate": 2e-05, |
|
"loss": 0.664, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 5.90625, |
|
"eval_loss": 0.6939517855644226, |
|
"eval_runtime": 49.2438, |
|
"eval_samples_per_second": 4.061, |
|
"eval_steps_per_second": 0.264, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"grad_norm": 0.494163934813738, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6978, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"eval_loss": 0.6933834552764893, |
|
"eval_runtime": 49.3494, |
|
"eval_samples_per_second": 4.053, |
|
"eval_steps_per_second": 0.263, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.96875, |
|
"grad_norm": 0.4945972278087299, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6909, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 5.96875, |
|
"eval_loss": 0.6924250721931458, |
|
"eval_runtime": 50.3255, |
|
"eval_samples_per_second": 3.974, |
|
"eval_steps_per_second": 0.258, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.48872556688745233, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6622, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.6922193765640259, |
|
"eval_runtime": 50.4561, |
|
"eval_samples_per_second": 3.964, |
|
"eval_steps_per_second": 0.258, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 6.03125, |
|
"grad_norm": 0.5013452255378538, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7458, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.03125, |
|
"eval_loss": 0.6931161284446716, |
|
"eval_runtime": 50.5049, |
|
"eval_samples_per_second": 3.96, |
|
"eval_steps_per_second": 0.257, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"grad_norm": 0.48271161232093784, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7171, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"eval_loss": 0.6959040760993958, |
|
"eval_runtime": 50.2441, |
|
"eval_samples_per_second": 3.981, |
|
"eval_steps_per_second": 0.259, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"grad_norm": 0.5414562703154852, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6419, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"eval_loss": 0.7000604271888733, |
|
"eval_runtime": 50.4261, |
|
"eval_samples_per_second": 3.966, |
|
"eval_steps_per_second": 0.258, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"grad_norm": 0.5074661247335385, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6881, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"eval_loss": 0.7039622664451599, |
|
"eval_runtime": 51.5214, |
|
"eval_samples_per_second": 3.882, |
|
"eval_steps_per_second": 0.252, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 6.15625, |
|
"grad_norm": 0.5603468534764365, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7085, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.15625, |
|
"eval_loss": 0.7055023312568665, |
|
"eval_runtime": 51.7102, |
|
"eval_samples_per_second": 3.868, |
|
"eval_steps_per_second": 0.251, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"grad_norm": 0.5992190802422799, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7614, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"eval_loss": 0.7046856880187988, |
|
"eval_runtime": 51.5464, |
|
"eval_samples_per_second": 3.88, |
|
"eval_steps_per_second": 0.252, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 6.21875, |
|
"grad_norm": 0.6293684167527106, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6435, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.21875, |
|
"eval_loss": 0.7021151781082153, |
|
"eval_runtime": 51.5328, |
|
"eval_samples_per_second": 3.881, |
|
"eval_steps_per_second": 0.252, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.591265449241434, |
|
"learning_rate": 2e-05, |
|
"loss": 0.688, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"eval_loss": 0.7002359628677368, |
|
"eval_runtime": 51.5812, |
|
"eval_samples_per_second": 3.877, |
|
"eval_steps_per_second": 0.252, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.28125, |
|
"grad_norm": 0.543141536526749, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7027, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.28125, |
|
"eval_loss": 0.6986366510391235, |
|
"eval_runtime": 52.6956, |
|
"eval_samples_per_second": 3.795, |
|
"eval_steps_per_second": 0.247, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"grad_norm": 0.5679656300203245, |
|
"learning_rate": 2e-05, |
|
"loss": 0.625, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"eval_loss": 0.698679506778717, |
|
"eval_runtime": 52.5102, |
|
"eval_samples_per_second": 3.809, |
|
"eval_steps_per_second": 0.248, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 6.34375, |
|
"grad_norm": 0.5285839896523021, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7687, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.34375, |
|
"eval_loss": 0.7005956768989563, |
|
"eval_runtime": 52.6067, |
|
"eval_samples_per_second": 3.802, |
|
"eval_steps_per_second": 0.247, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"grad_norm": 0.6512964945211068, |
|
"learning_rate": 2e-05, |
|
"loss": 0.623, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"eval_loss": 0.7013595104217529, |
|
"eval_runtime": 52.5428, |
|
"eval_samples_per_second": 3.806, |
|
"eval_steps_per_second": 0.247, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"grad_norm": 0.5295248631519638, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5941, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"eval_loss": 0.7016547322273254, |
|
"eval_runtime": 52.6142, |
|
"eval_samples_per_second": 3.801, |
|
"eval_steps_per_second": 0.247, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"grad_norm": 0.6134157701434021, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6506, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"eval_loss": 0.7009623646736145, |
|
"eval_runtime": 52.1942, |
|
"eval_samples_per_second": 3.832, |
|
"eval_steps_per_second": 0.249, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 6.46875, |
|
"grad_norm": 0.57886797614996, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6983, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.46875, |
|
"eval_loss": 0.6988092064857483, |
|
"eval_runtime": 52.2577, |
|
"eval_samples_per_second": 3.827, |
|
"eval_steps_per_second": 0.249, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.5593482836944472, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6348, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"eval_loss": 0.698823094367981, |
|
"eval_runtime": 52.2296, |
|
"eval_samples_per_second": 3.829, |
|
"eval_steps_per_second": 0.249, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 6.53125, |
|
"grad_norm": 0.662802162179718, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6206, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.53125, |
|
"eval_loss": 0.6990167498588562, |
|
"eval_runtime": 52.4316, |
|
"eval_samples_per_second": 3.814, |
|
"eval_steps_per_second": 0.248, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"grad_norm": 0.6874374231122908, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6033, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"eval_loss": 0.699796736240387, |
|
"eval_runtime": 52.3193, |
|
"eval_samples_per_second": 3.823, |
|
"eval_steps_per_second": 0.248, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.59375, |
|
"grad_norm": 0.6625766736772473, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6398, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.59375, |
|
"eval_loss": 0.6989737153053284, |
|
"eval_runtime": 52.1885, |
|
"eval_samples_per_second": 3.832, |
|
"eval_steps_per_second": 0.249, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"grad_norm": 0.6563419096027812, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6119, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"eval_loss": 0.6973609924316406, |
|
"eval_runtime": 52.1628, |
|
"eval_samples_per_second": 3.834, |
|
"eval_steps_per_second": 0.249, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 6.65625, |
|
"grad_norm": 0.5796353226697397, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7041, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.65625, |
|
"eval_loss": 0.6957942247390747, |
|
"eval_runtime": 52.2028, |
|
"eval_samples_per_second": 3.831, |
|
"eval_steps_per_second": 0.249, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"grad_norm": 0.5711947110504899, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6465, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"eval_loss": 0.696739673614502, |
|
"eval_runtime": 52.1849, |
|
"eval_samples_per_second": 3.833, |
|
"eval_steps_per_second": 0.249, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"grad_norm": 0.6619502413653232, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6563, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"eval_loss": 0.6960940361022949, |
|
"eval_runtime": 52.0996, |
|
"eval_samples_per_second": 3.839, |
|
"eval_steps_per_second": 0.25, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.6587126256919645, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6505, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"eval_loss": 0.6959022283554077, |
|
"eval_runtime": 52.1062, |
|
"eval_samples_per_second": 3.838, |
|
"eval_steps_per_second": 0.249, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 6.78125, |
|
"grad_norm": 0.648164277941964, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5969, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 6.78125, |
|
"eval_loss": 0.6999121308326721, |
|
"eval_runtime": 51.9356, |
|
"eval_samples_per_second": 3.851, |
|
"eval_steps_per_second": 0.25, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"grad_norm": 0.6595860789738482, |
|
"learning_rate": 2e-05, |
|
"loss": 0.5945, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"eval_loss": 0.7028067111968994, |
|
"eval_runtime": 52.2232, |
|
"eval_samples_per_second": 3.83, |
|
"eval_steps_per_second": 0.249, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 6.84375, |
|
"grad_norm": 0.7116894779822719, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7027, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 6.84375, |
|
"eval_loss": 0.7035638689994812, |
|
"eval_runtime": 52.1471, |
|
"eval_samples_per_second": 3.835, |
|
"eval_steps_per_second": 0.249, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"grad_norm": 0.7581142336087988, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7171, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"eval_loss": 0.6981176733970642, |
|
"eval_runtime": 52.1366, |
|
"eval_samples_per_second": 3.836, |
|
"eval_steps_per_second": 0.249, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.90625, |
|
"grad_norm": 0.6261292745909233, |
|
"learning_rate": 2e-05, |
|
"loss": 0.658, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 6.90625, |
|
"eval_loss": 0.6939045786857605, |
|
"eval_runtime": 52.2211, |
|
"eval_samples_per_second": 3.83, |
|
"eval_steps_per_second": 0.249, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"grad_norm": 0.7256427809370966, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6576, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"eval_loss": 0.6904327273368835, |
|
"eval_runtime": 52.1829, |
|
"eval_samples_per_second": 3.833, |
|
"eval_steps_per_second": 0.249, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 6.96875, |
|
"grad_norm": 0.6653711103404113, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6938, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 6.96875, |
|
"eval_loss": 0.6893274188041687, |
|
"eval_runtime": 51.899, |
|
"eval_samples_per_second": 3.854, |
|
"eval_steps_per_second": 0.25, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.6730688267524797, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7397, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.6895740032196045, |
|
"eval_runtime": 52.1977, |
|
"eval_samples_per_second": 3.832, |
|
"eval_steps_per_second": 0.249, |
|
"step": 224 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 224, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 5, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 322567586447360.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|