|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 11.152748037116346, |
|
"global_step": 500000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 5.999999999999999e-06, |
|
"loss": 0.9165, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 1.1999999999999999e-05, |
|
"loss": 0.7051, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 0.6818633079528809, |
|
"eval_runtime": 1.9324, |
|
"eval_samples_per_second": 1188.697, |
|
"eval_steps_per_second": 18.63, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1.7999999999999997e-05, |
|
"loss": 0.6818, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 2.3999999999999997e-05, |
|
"loss": 0.6808, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 0.6788448691368103, |
|
"eval_runtime": 1.762, |
|
"eval_samples_per_second": 1303.62, |
|
"eval_steps_per_second": 20.431, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 2.9999999999999997e-05, |
|
"loss": 0.6804, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 3.5999999999999994e-05, |
|
"loss": 0.6803, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 0.678871750831604, |
|
"eval_runtime": 1.9035, |
|
"eval_samples_per_second": 1206.735, |
|
"eval_steps_per_second": 18.913, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.6801, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 4.7999999999999994e-05, |
|
"loss": 0.68, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 0.6781899333000183, |
|
"eval_runtime": 1.9139, |
|
"eval_samples_per_second": 1200.176, |
|
"eval_steps_per_second": 18.81, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 5.399999999999999e-05, |
|
"loss": 0.6798, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 0.6796, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_loss": 0.6784412860870361, |
|
"eval_runtime": 1.8311, |
|
"eval_samples_per_second": 1254.405, |
|
"eval_steps_per_second": 19.66, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 6.599999999999999e-05, |
|
"loss": 0.6792, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 7.199999999999999e-05, |
|
"loss": 0.6788, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_loss": 0.67804354429245, |
|
"eval_runtime": 1.8509, |
|
"eval_samples_per_second": 1240.986, |
|
"eval_steps_per_second": 19.449, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 7.8e-05, |
|
"loss": 0.6783, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.678, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 0.6776260733604431, |
|
"eval_runtime": 1.8893, |
|
"eval_samples_per_second": 1215.825, |
|
"eval_steps_per_second": 19.055, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 0.6778, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 9.599999999999999e-05, |
|
"loss": 0.678, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 0.6782737374305725, |
|
"eval_runtime": 1.8707, |
|
"eval_samples_per_second": 1227.889, |
|
"eval_steps_per_second": 19.244, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.000102, |
|
"loss": 0.6783, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00010799999999999998, |
|
"loss": 0.6672, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.6526092290878296, |
|
"eval_runtime": 1.8141, |
|
"eval_samples_per_second": 1266.219, |
|
"eval_steps_per_second": 19.845, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 0.6453, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 0.632, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"eval_loss": 0.6212196350097656, |
|
"eval_runtime": 1.9366, |
|
"eval_samples_per_second": 1186.109, |
|
"eval_steps_per_second": 18.589, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.00012599999999999997, |
|
"loss": 0.6235, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 0.00013199999999999998, |
|
"loss": 0.6146, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 0.6032013893127441, |
|
"eval_runtime": 1.9441, |
|
"eval_samples_per_second": 1181.504, |
|
"eval_steps_per_second": 18.517, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 0.000138, |
|
"loss": 0.604, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 0.00014399999999999998, |
|
"loss": 0.5936, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"eval_loss": 0.5801298022270203, |
|
"eval_runtime": 1.9051, |
|
"eval_samples_per_second": 1205.71, |
|
"eval_steps_per_second": 18.897, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.00015, |
|
"loss": 0.583, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 0.000156, |
|
"loss": 0.573, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_loss": 0.5577847361564636, |
|
"eval_runtime": 1.8487, |
|
"eval_samples_per_second": 1242.481, |
|
"eval_steps_per_second": 19.473, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 0.000162, |
|
"loss": 0.5636, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 0.000168, |
|
"loss": 0.5544, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_loss": 0.5371876955032349, |
|
"eval_runtime": 1.956, |
|
"eval_samples_per_second": 1174.317, |
|
"eval_steps_per_second": 18.405, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 0.00017399999999999997, |
|
"loss": 0.546, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 0.5382, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_loss": 0.5219064354896545, |
|
"eval_runtime": 1.9007, |
|
"eval_samples_per_second": 1208.524, |
|
"eval_steps_per_second": 18.941, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 0.000186, |
|
"loss": 0.5291, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 0.00019199999999999998, |
|
"loss": 0.5214, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 0.5052428841590881, |
|
"eval_runtime": 1.9463, |
|
"eval_samples_per_second": 1180.202, |
|
"eval_steps_per_second": 18.497, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 0.000198, |
|
"loss": 0.5141, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.000204, |
|
"loss": 0.5074, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 0.4891608655452728, |
|
"eval_runtime": 1.9578, |
|
"eval_samples_per_second": 1173.284, |
|
"eval_steps_per_second": 18.388, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 0.00020999999999999998, |
|
"loss": 0.5006, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 0.00021599999999999996, |
|
"loss": 0.4943, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.4745289087295532, |
|
"eval_runtime": 1.9643, |
|
"eval_samples_per_second": 1169.354, |
|
"eval_steps_per_second": 18.327, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 0.00022199999999999998, |
|
"loss": 0.4886, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 0.00022799999999999999, |
|
"loss": 0.4834, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_loss": 0.4653691351413727, |
|
"eval_runtime": 1.8946, |
|
"eval_samples_per_second": 1212.382, |
|
"eval_steps_per_second": 19.001, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 0.000234, |
|
"loss": 0.4781, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 0.4727, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 0.4524119198322296, |
|
"eval_runtime": 1.8731, |
|
"eval_samples_per_second": 1226.293, |
|
"eval_steps_per_second": 19.219, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 0.00024599999999999996, |
|
"loss": 0.4662, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 0.00025199999999999995, |
|
"loss": 0.4596, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_loss": 0.4388866424560547, |
|
"eval_runtime": 1.9513, |
|
"eval_samples_per_second": 1177.152, |
|
"eval_steps_per_second": 18.449, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.000258, |
|
"loss": 0.4536, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 0.00026399999999999997, |
|
"loss": 0.448, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"eval_loss": 0.42909005284309387, |
|
"eval_runtime": 1.8841, |
|
"eval_samples_per_second": 1219.142, |
|
"eval_steps_per_second": 19.107, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 0.00027, |
|
"loss": 0.4428, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 0.000276, |
|
"loss": 0.4378, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_loss": 0.41723570227622986, |
|
"eval_runtime": 1.8673, |
|
"eval_samples_per_second": 1230.107, |
|
"eval_steps_per_second": 19.279, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 0.00028199999999999997, |
|
"loss": 0.4336, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 0.00028799999999999995, |
|
"loss": 0.4294, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 0.4092707931995392, |
|
"eval_runtime": 1.8847, |
|
"eval_samples_per_second": 1218.745, |
|
"eval_steps_per_second": 19.101, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 0.000294, |
|
"loss": 0.4253, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 0.0003, |
|
"loss": 0.4218, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 0.40236151218414307, |
|
"eval_runtime": 1.9002, |
|
"eval_samples_per_second": 1208.849, |
|
"eval_steps_per_second": 18.946, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 0.00029999920715161553, |
|
"loss": 0.4181, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 0.0002999968286151326, |
|
"loss": 0.4146, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_loss": 0.3960426449775696, |
|
"eval_runtime": 1.8399, |
|
"eval_samples_per_second": 1248.469, |
|
"eval_steps_per_second": 19.567, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.0002999928644165624, |
|
"loss": 0.4113, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 0.0002999873145992569, |
|
"loss": 0.408, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.38567298650741577, |
|
"eval_runtime": 1.8013, |
|
"eval_samples_per_second": 1275.179, |
|
"eval_steps_per_second": 19.985, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 0.000299980179223908, |
|
"loss": 0.4048, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 0.0002999714583685469, |
|
"loss": 0.4018, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"eval_loss": 0.3801218569278717, |
|
"eval_runtime": 1.9159, |
|
"eval_samples_per_second": 1198.935, |
|
"eval_steps_per_second": 18.79, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 0.00029996115212854366, |
|
"loss": 0.3991, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.00029994926061660554, |
|
"loss": 0.3963, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 0.37746793031692505, |
|
"eval_runtime": 1.9561, |
|
"eval_samples_per_second": 1174.286, |
|
"eval_steps_per_second": 18.404, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 0.0002999357839627762, |
|
"loss": 0.394, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 0.00029992072231443425, |
|
"loss": 0.3917, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_loss": 0.3722357749938965, |
|
"eval_runtime": 1.841, |
|
"eval_samples_per_second": 1247.702, |
|
"eval_steps_per_second": 19.555, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 0.0002999040758362914, |
|
"loss": 0.3894, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 0.00029988584471039094, |
|
"loss": 0.3869, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_loss": 0.3689954876899719, |
|
"eval_runtime": 1.9129, |
|
"eval_samples_per_second": 1200.807, |
|
"eval_steps_per_second": 18.82, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.0002998660291361054, |
|
"loss": 0.3851, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 0.0002998446293301349, |
|
"loss": 0.3831, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_loss": 0.3645932674407959, |
|
"eval_runtime": 1.8679, |
|
"eval_samples_per_second": 1229.739, |
|
"eval_steps_per_second": 19.273, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 0.0002998216455265042, |
|
"loss": 0.3813, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 0.00029979707797656046, |
|
"loss": 0.3794, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 0.35992059111595154, |
|
"eval_runtime": 1.8813, |
|
"eval_samples_per_second": 1220.942, |
|
"eval_steps_per_second": 19.135, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 0.00029977092694897053, |
|
"loss": 0.3773, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 0.0002997431927297178, |
|
"loss": 0.3756, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 0.3545241057872772, |
|
"eval_runtime": 1.8945, |
|
"eval_samples_per_second": 1212.449, |
|
"eval_steps_per_second": 19.002, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 0.00029971387562209936, |
|
"loss": 0.3739, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 0.00029968297594672226, |
|
"loss": 0.3723, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 0.3539046347141266, |
|
"eval_runtime": 1.8855, |
|
"eval_samples_per_second": 1218.214, |
|
"eval_steps_per_second": 19.093, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 0.0002996504940415005, |
|
"loss": 0.3712, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 0.00029961643026165096, |
|
"loss": 0.3693, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.352187842130661, |
|
"eval_runtime": 1.9629, |
|
"eval_samples_per_second": 1170.187, |
|
"eval_steps_per_second": 18.34, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 0.00029958078497968973, |
|
"loss": 0.3681, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 0.0002995435585854278, |
|
"loss": 0.3668, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_loss": 0.3482549488544464, |
|
"eval_runtime": 1.9671, |
|
"eval_samples_per_second": 1167.724, |
|
"eval_steps_per_second": 18.301, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 0.0002995047514859671, |
|
"loss": 0.365, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 0.0002994643641056959, |
|
"loss": 0.3635, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_loss": 0.3442147970199585, |
|
"eval_runtime": 1.9101, |
|
"eval_samples_per_second": 1202.583, |
|
"eval_steps_per_second": 18.848, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 0.000299422396886284, |
|
"loss": 0.3627, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 0.0002993788502866783, |
|
"loss": 0.3613, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_loss": 0.3413570523262024, |
|
"eval_runtime": 1.8505, |
|
"eval_samples_per_second": 1241.299, |
|
"eval_steps_per_second": 19.454, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 0.00029933372478309746, |
|
"loss": 0.3602, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 0.00029928702086902664, |
|
"loss": 0.3589, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"eval_loss": 0.3388674557209015, |
|
"eval_runtime": 1.8714, |
|
"eval_samples_per_second": 1227.432, |
|
"eval_steps_per_second": 19.237, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 0.00029923873905521244, |
|
"loss": 0.3579, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 0.000299188879869657, |
|
"loss": 0.3568, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"eval_loss": 0.33952972292900085, |
|
"eval_runtime": 2.0164, |
|
"eval_samples_per_second": 1139.183, |
|
"eval_steps_per_second": 17.854, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 0.00029913744385761244, |
|
"loss": 0.3554, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 0.00029908443158157465, |
|
"loss": 0.3549, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_loss": 0.334180623292923, |
|
"eval_runtime": 1.9257, |
|
"eval_samples_per_second": 1192.785, |
|
"eval_steps_per_second": 18.694, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 0.0002990298436212775, |
|
"loss": 0.3538, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 0.0002989736805736861, |
|
"loss": 0.3524, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 0.3344336450099945, |
|
"eval_runtime": 1.9242, |
|
"eval_samples_per_second": 1193.754, |
|
"eval_steps_per_second": 18.709, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 0.00029891594305299065, |
|
"loss": 0.3519, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 0.00029885663169059926, |
|
"loss": 0.3504, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"eval_loss": 0.3332378566265106, |
|
"eval_runtime": 1.9034, |
|
"eval_samples_per_second": 1206.779, |
|
"eval_steps_per_second": 18.913, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 0.0002987957471351316, |
|
"loss": 0.3499, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 0.00029873329005241137, |
|
"loss": 0.349, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.3305933475494385, |
|
"eval_runtime": 2.0165, |
|
"eval_samples_per_second": 1139.102, |
|
"eval_steps_per_second": 17.853, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"learning_rate": 0.00029866926112545925, |
|
"loss": 0.3482, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"learning_rate": 0.00029860366105448534, |
|
"loss": 0.3473, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"eval_loss": 0.32961174845695496, |
|
"eval_runtime": 2.0634, |
|
"eval_samples_per_second": 1113.205, |
|
"eval_steps_per_second": 17.447, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"learning_rate": 0.00029853649055688143, |
|
"loss": 0.3464, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"learning_rate": 0.00029846775036721337, |
|
"loss": 0.3459, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"eval_loss": 0.32834696769714355, |
|
"eval_runtime": 1.9893, |
|
"eval_samples_per_second": 1154.665, |
|
"eval_steps_per_second": 18.097, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"learning_rate": 0.0002983974412372129, |
|
"loss": 0.3446, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"learning_rate": 0.00029832556393576934, |
|
"loss": 0.3441, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"eval_loss": 0.3241877555847168, |
|
"eval_runtime": 2.051, |
|
"eval_samples_per_second": 1119.916, |
|
"eval_steps_per_second": 17.552, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"learning_rate": 0.0002982521192489214, |
|
"loss": 0.3434, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"learning_rate": 0.0002981771079798483, |
|
"loss": 0.3427, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"eval_loss": 0.3232521414756775, |
|
"eval_runtime": 1.9619, |
|
"eval_samples_per_second": 1170.819, |
|
"eval_steps_per_second": 18.35, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"learning_rate": 0.00029810053094886136, |
|
"loss": 0.3417, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"learning_rate": 0.00029802238899339473, |
|
"loss": 0.3411, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 0.322670042514801, |
|
"eval_runtime": 2.0139, |
|
"eval_samples_per_second": 1140.566, |
|
"eval_steps_per_second": 17.876, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"learning_rate": 0.0002979426829679962, |
|
"loss": 0.3406, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"learning_rate": 0.0002978614137443183, |
|
"loss": 0.3398, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"eval_loss": 0.32256901264190674, |
|
"eval_runtime": 1.9699, |
|
"eval_samples_per_second": 1166.043, |
|
"eval_steps_per_second": 18.275, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"learning_rate": 0.000297778582211108, |
|
"loss": 0.3391, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"learning_rate": 0.00029769418927419786, |
|
"loss": 0.3385, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_loss": 0.3191610276699066, |
|
"eval_runtime": 2.0164, |
|
"eval_samples_per_second": 1139.174, |
|
"eval_steps_per_second": 17.854, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"learning_rate": 0.0002976082358564954, |
|
"loss": 0.3381, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 0.00029752072289797353, |
|
"loss": 0.3372, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"eval_loss": 0.31954050064086914, |
|
"eval_runtime": 1.913, |
|
"eval_samples_per_second": 1200.727, |
|
"eval_steps_per_second": 18.819, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"learning_rate": 0.00029743165135565986, |
|
"loss": 0.3368, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"learning_rate": 0.00029734102220362654, |
|
"loss": 0.3359, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.31627157330513, |
|
"eval_runtime": 2.0014, |
|
"eval_samples_per_second": 1147.684, |
|
"eval_steps_per_second": 17.987, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"learning_rate": 0.00029724883643297937, |
|
"loss": 0.3356, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"learning_rate": 0.0002971550950518473, |
|
"loss": 0.3348, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"eval_loss": 0.3175150454044342, |
|
"eval_runtime": 2.0666, |
|
"eval_samples_per_second": 1111.49, |
|
"eval_steps_per_second": 17.42, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"learning_rate": 0.000297059799085371, |
|
"loss": 0.3343, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"learning_rate": 0.00029696294957569196, |
|
"loss": 0.3338, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.3165690302848816, |
|
"eval_runtime": 2.0136, |
|
"eval_samples_per_second": 1140.732, |
|
"eval_steps_per_second": 17.878, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"learning_rate": 0.00029686454758194076, |
|
"loss": 0.3331, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"learning_rate": 0.00029676459418022594, |
|
"loss": 0.3327, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"eval_loss": 0.316009521484375, |
|
"eval_runtime": 1.9665, |
|
"eval_samples_per_second": 1168.059, |
|
"eval_steps_per_second": 18.307, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"learning_rate": 0.0002966630904636219, |
|
"loss": 0.3321, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"learning_rate": 0.0002965600375421569, |
|
"loss": 0.3315, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"eval_loss": 0.31569892168045044, |
|
"eval_runtime": 2.026, |
|
"eval_samples_per_second": 1133.75, |
|
"eval_steps_per_second": 17.769, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"learning_rate": 0.0002964554365428013, |
|
"loss": 0.3313, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"learning_rate": 0.00029634928860945486, |
|
"loss": 0.3301, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"eval_loss": 0.310556024312973, |
|
"eval_runtime": 2.0156, |
|
"eval_samples_per_second": 1139.586, |
|
"eval_steps_per_second": 17.86, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"learning_rate": 0.0002962415949029343, |
|
"loss": 0.33, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"learning_rate": 0.00029613235660096084, |
|
"loss": 0.3294, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"eval_loss": 0.3107939064502716, |
|
"eval_runtime": 1.9893, |
|
"eval_samples_per_second": 1154.672, |
|
"eval_steps_per_second": 18.097, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"learning_rate": 0.00029602157489814693, |
|
"loss": 0.3291, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"learning_rate": 0.00029590925100598365, |
|
"loss": 0.3285, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"eval_loss": 0.31057360768318176, |
|
"eval_runtime": 2.0555, |
|
"eval_samples_per_second": 1117.488, |
|
"eval_steps_per_second": 17.514, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"learning_rate": 0.000295795386152827, |
|
"loss": 0.3284, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"learning_rate": 0.0002956799815838848, |
|
"loss": 0.3285, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"eval_loss": 0.3091903328895569, |
|
"eval_runtime": 2.0051, |
|
"eval_samples_per_second": 1145.598, |
|
"eval_steps_per_second": 17.955, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"learning_rate": 0.0002955630385612029, |
|
"loss": 0.3268, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"learning_rate": 0.0002954445583636515, |
|
"loss": 0.3266, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"eval_loss": 0.309052050113678, |
|
"eval_runtime": 1.9663, |
|
"eval_samples_per_second": 1168.204, |
|
"eval_steps_per_second": 18.309, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"learning_rate": 0.00029532454228691103, |
|
"loss": 0.3263, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"learning_rate": 0.0002952029916434581, |
|
"loss": 0.3259, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"eval_loss": 0.3066043555736542, |
|
"eval_runtime": 2.0287, |
|
"eval_samples_per_second": 1132.238, |
|
"eval_steps_per_second": 17.745, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"learning_rate": 0.00029507990776255107, |
|
"loss": 0.3257, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"learning_rate": 0.00029495529199021555, |
|
"loss": 0.3251, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"eval_loss": 0.3062935769557953, |
|
"eval_runtime": 2.0366, |
|
"eval_samples_per_second": 1127.874, |
|
"eval_steps_per_second": 17.677, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"learning_rate": 0.0002948291456892296, |
|
"loss": 0.325, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"learning_rate": 0.00029470147023910907, |
|
"loss": 0.3245, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"eval_loss": 0.3044787049293518, |
|
"eval_runtime": 1.9965, |
|
"eval_samples_per_second": 1150.53, |
|
"eval_steps_per_second": 18.032, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"learning_rate": 0.0002945722670360921, |
|
"loss": 0.3261, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"learning_rate": 0.0002944415374931243, |
|
"loss": 0.3233, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"eval_loss": 0.3051415681838989, |
|
"eval_runtime": 2.0098, |
|
"eval_samples_per_second": 1142.892, |
|
"eval_steps_per_second": 17.912, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"learning_rate": 0.00029430928303984295, |
|
"loss": 0.3231, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"learning_rate": 0.0002941755051225616, |
|
"loss": 0.3227, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"eval_loss": 0.303621768951416, |
|
"eval_runtime": 2.0068, |
|
"eval_samples_per_second": 1144.62, |
|
"eval_steps_per_second": 17.939, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"learning_rate": 0.00029404020520425417, |
|
"loss": 0.3226, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"learning_rate": 0.0002939033847645388, |
|
"loss": 0.3221, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"eval_loss": 0.30241796374320984, |
|
"eval_runtime": 1.989, |
|
"eval_samples_per_second": 1154.871, |
|
"eval_steps_per_second": 18.1, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"learning_rate": 0.00029376504529966195, |
|
"loss": 0.3216, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"learning_rate": 0.00029362518832248184, |
|
"loss": 0.3216, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"eval_loss": 0.3033766746520996, |
|
"eval_runtime": 1.9913, |
|
"eval_samples_per_second": 1153.526, |
|
"eval_steps_per_second": 18.079, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"learning_rate": 0.0002934838153624519, |
|
"loss": 0.3212, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"learning_rate": 0.00029334092796560427, |
|
"loss": 0.3206, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_loss": 0.3010749816894531, |
|
"eval_runtime": 1.9979, |
|
"eval_samples_per_second": 1149.696, |
|
"eval_steps_per_second": 18.019, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"learning_rate": 0.0002931965276945326, |
|
"loss": 0.3203, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"learning_rate": 0.0002930506161283751, |
|
"loss": 0.3202, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"eval_loss": 0.30220872163772583, |
|
"eval_runtime": 2.0424, |
|
"eval_samples_per_second": 1124.632, |
|
"eval_steps_per_second": 17.626, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"learning_rate": 0.00029290319486279724, |
|
"loss": 0.3195, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"learning_rate": 0.0002927542655099744, |
|
"loss": 0.3194, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"eval_loss": 0.30245041847229004, |
|
"eval_runtime": 2.0617, |
|
"eval_samples_per_second": 1114.154, |
|
"eval_steps_per_second": 17.462, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"learning_rate": 0.00029260382969857417, |
|
"loss": 0.319, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"learning_rate": 0.00029245188907373845, |
|
"loss": 0.3188, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"eval_loss": 0.2999315857887268, |
|
"eval_runtime": 2.0046, |
|
"eval_samples_per_second": 1145.863, |
|
"eval_steps_per_second": 17.959, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"learning_rate": 0.0002922984452970655, |
|
"loss": 0.3185, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"learning_rate": 0.000292143500046592, |
|
"loss": 0.3184, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"eval_loss": 0.2986513078212738, |
|
"eval_runtime": 1.898, |
|
"eval_samples_per_second": 1210.225, |
|
"eval_steps_per_second": 18.967, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"learning_rate": 0.0002919870550167743, |
|
"loss": 0.3178, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"learning_rate": 0.0002918291119184702, |
|
"loss": 0.3184, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"eval_loss": 0.30084192752838135, |
|
"eval_runtime": 2.0127, |
|
"eval_samples_per_second": 1141.227, |
|
"eval_steps_per_second": 17.886, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"learning_rate": 0.0002916696724789201, |
|
"loss": 0.3171, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"learning_rate": 0.00029150873844172823, |
|
"loss": 0.3168, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"eval_loss": 0.2981663942337036, |
|
"eval_runtime": 2.1151, |
|
"eval_samples_per_second": 1085.979, |
|
"eval_steps_per_second": 17.02, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"learning_rate": 0.00029134631156684334, |
|
"loss": 0.3167, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"learning_rate": 0.0002911823936305398, |
|
"loss": 0.3162, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"eval_loss": 0.2964055836200714, |
|
"eval_runtime": 2.046, |
|
"eval_samples_per_second": 1122.697, |
|
"eval_steps_per_second": 17.596, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"learning_rate": 0.0002910169864253979, |
|
"loss": 0.3162, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"learning_rate": 0.0002908500917602842, |
|
"loss": 0.3157, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_loss": 0.2958185076713562, |
|
"eval_runtime": 1.9809, |
|
"eval_samples_per_second": 1159.565, |
|
"eval_steps_per_second": 18.173, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"learning_rate": 0.00029068171146033226, |
|
"loss": 0.3154, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"learning_rate": 0.0002905118473669218, |
|
"loss": 0.3149, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"eval_loss": 0.2947300374507904, |
|
"eval_runtime": 1.9379, |
|
"eval_samples_per_second": 1185.314, |
|
"eval_steps_per_second": 18.577, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"learning_rate": 0.00029034050133765947, |
|
"loss": 0.3149, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"learning_rate": 0.00029016767524635804, |
|
"loss": 0.3145, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"eval_loss": 0.2963857054710388, |
|
"eval_runtime": 1.9198, |
|
"eval_samples_per_second": 1196.488, |
|
"eval_steps_per_second": 18.752, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"learning_rate": 0.00028999337098301585, |
|
"loss": 0.3144, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"learning_rate": 0.0002898175904537964, |
|
"loss": 0.314, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"eval_loss": 0.295415997505188, |
|
"eval_runtime": 1.9531, |
|
"eval_samples_per_second": 1176.079, |
|
"eval_steps_per_second": 18.432, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"learning_rate": 0.0002896403355810075, |
|
"loss": 0.3135, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"learning_rate": 0.00028946160830307997, |
|
"loss": 0.3135, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"eval_loss": 0.2971150577068329, |
|
"eval_runtime": 2.0357, |
|
"eval_samples_per_second": 1128.362, |
|
"eval_steps_per_second": 17.684, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"learning_rate": 0.00028928141057454665, |
|
"loss": 0.3136, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"learning_rate": 0.0002890997443660211, |
|
"loss": 0.3127, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"eval_loss": 0.2954028248786926, |
|
"eval_runtime": 2.0203, |
|
"eval_samples_per_second": 1136.969, |
|
"eval_steps_per_second": 17.819, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 0.00028891661166417586, |
|
"loss": 0.3126, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"learning_rate": 0.00028873201447172074, |
|
"loss": 0.3125, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_loss": 0.2963060438632965, |
|
"eval_runtime": 1.9991, |
|
"eval_samples_per_second": 1149.017, |
|
"eval_steps_per_second": 18.008, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"learning_rate": 0.0002885459548073812, |
|
"loss": 0.3124, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"learning_rate": 0.0002883584347058758, |
|
"loss": 0.3122, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"eval_loss": 0.2932322025299072, |
|
"eval_runtime": 1.9426, |
|
"eval_samples_per_second": 1182.434, |
|
"eval_steps_per_second": 18.532, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"learning_rate": 0.00028816945621789437, |
|
"loss": 0.3121, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"learning_rate": 0.0002879790214100753, |
|
"loss": 0.3119, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"eval_loss": 0.29384109377861023, |
|
"eval_runtime": 1.9978, |
|
"eval_samples_per_second": 1149.741, |
|
"eval_steps_per_second": 18.019, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"learning_rate": 0.0002877871323649833, |
|
"loss": 0.3115, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"learning_rate": 0.0002875937911810861, |
|
"loss": 0.3111, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"eval_loss": 0.29092082381248474, |
|
"eval_runtime": 1.9165, |
|
"eval_samples_per_second": 1198.512, |
|
"eval_steps_per_second": 18.784, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"learning_rate": 0.000287398999972732, |
|
"loss": 0.3107, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 0.00028720276087012636, |
|
"loss": 0.3107, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"eval_loss": 0.2925170063972473, |
|
"eval_runtime": 1.9863, |
|
"eval_samples_per_second": 1156.436, |
|
"eval_steps_per_second": 18.124, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 0.0002870050760193086, |
|
"loss": 0.3103, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"learning_rate": 0.00028680594758212854, |
|
"loss": 0.3105, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"eval_loss": 0.2921919524669647, |
|
"eval_runtime": 2.0098, |
|
"eval_samples_per_second": 1142.913, |
|
"eval_steps_per_second": 17.912, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"learning_rate": 0.00028660537773622294, |
|
"loss": 0.3098, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"learning_rate": 0.00028640336867499143, |
|
"loss": 0.3098, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"eval_loss": 0.2905424237251282, |
|
"eval_runtime": 1.9993, |
|
"eval_samples_per_second": 1148.881, |
|
"eval_steps_per_second": 18.006, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"learning_rate": 0.0002861999226075728, |
|
"loss": 0.3095, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"learning_rate": 0.0002859950417588206, |
|
"loss": 0.309, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"eval_loss": 0.2886371612548828, |
|
"eval_runtime": 1.9517, |
|
"eval_samples_per_second": 1176.934, |
|
"eval_steps_per_second": 18.446, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"learning_rate": 0.00028578872836927904, |
|
"loss": 0.4773, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"learning_rate": 0.0002855809846951582, |
|
"loss": 0.6769, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"eval_loss": 0.6724876165390015, |
|
"eval_runtime": 1.9377, |
|
"eval_samples_per_second": 1185.409, |
|
"eval_steps_per_second": 18.578, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"learning_rate": 0.00028537181300830963, |
|
"loss": 0.4512, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"learning_rate": 0.0002851612155962014, |
|
"loss": 0.3124, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"eval_loss": 0.29317304491996765, |
|
"eval_runtime": 2.3779, |
|
"eval_samples_per_second": 965.986, |
|
"eval_steps_per_second": 15.14, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"learning_rate": 0.0002849491947618932, |
|
"loss": 0.3108, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"learning_rate": 0.0002847357528240107, |
|
"loss": 0.3099, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"eval_loss": 0.2914526164531708, |
|
"eval_runtime": 2.0271, |
|
"eval_samples_per_second": 1133.13, |
|
"eval_steps_per_second": 17.759, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"learning_rate": 0.0002845208921167208, |
|
"loss": 0.3093, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"learning_rate": 0.00028430461498970584, |
|
"loss": 0.3088, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"eval_loss": 0.2922168970108032, |
|
"eval_runtime": 2.0567, |
|
"eval_samples_per_second": 1116.857, |
|
"eval_steps_per_second": 17.504, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"learning_rate": 0.00028408692380813775, |
|
"loss": 0.3082, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"learning_rate": 0.00028386782095265247, |
|
"loss": 0.3081, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"eval_loss": 0.2883111238479614, |
|
"eval_runtime": 2.0553, |
|
"eval_samples_per_second": 1117.577, |
|
"eval_steps_per_second": 17.515, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"learning_rate": 0.0002836473088193237, |
|
"loss": 0.308, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"learning_rate": 0.00028342538981963677, |
|
"loss": 0.3074, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"eval_loss": 0.28875553607940674, |
|
"eval_runtime": 2.0294, |
|
"eval_samples_per_second": 1131.888, |
|
"eval_steps_per_second": 17.74, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"learning_rate": 0.0002832020663804624, |
|
"loss": 0.3074, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"learning_rate": 0.00028297734094402986, |
|
"loss": 0.3067, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"eval_loss": 0.2898538112640381, |
|
"eval_runtime": 1.9557, |
|
"eval_samples_per_second": 1174.502, |
|
"eval_steps_per_second": 18.408, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"learning_rate": 0.0002827512159679005, |
|
"loss": 0.3072, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"learning_rate": 0.00028252369392494086, |
|
"loss": 0.3069, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"eval_loss": 0.2872167229652405, |
|
"eval_runtime": 2.0341, |
|
"eval_samples_per_second": 1129.255, |
|
"eval_steps_per_second": 17.698, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"learning_rate": 0.0002822947773032956, |
|
"loss": 0.3066, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"learning_rate": 0.0002820644686063602, |
|
"loss": 0.3061, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_loss": 0.28897979855537415, |
|
"eval_runtime": 1.9983, |
|
"eval_samples_per_second": 1149.501, |
|
"eval_steps_per_second": 18.016, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"learning_rate": 0.00028183277035275363, |
|
"loss": 0.3058, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"learning_rate": 0.0002815996850762909, |
|
"loss": 0.3059, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"eval_loss": 0.2893044054508209, |
|
"eval_runtime": 2.0103, |
|
"eval_samples_per_second": 1142.629, |
|
"eval_steps_per_second": 17.908, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"learning_rate": 0.00028136521532595515, |
|
"loss": 0.3059, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"learning_rate": 0.00028112936366587023, |
|
"loss": 0.3053, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"eval_loss": 0.2896111011505127, |
|
"eval_runtime": 2.0298, |
|
"eval_samples_per_second": 1131.621, |
|
"eval_steps_per_second": 17.735, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 0.00028089213267527184, |
|
"loss": 0.305, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"learning_rate": 0.0002806535249484803, |
|
"loss": 0.3052, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"eval_loss": 0.28629741072654724, |
|
"eval_runtime": 1.9678, |
|
"eval_samples_per_second": 1167.305, |
|
"eval_steps_per_second": 18.295, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"learning_rate": 0.00028041354309487135, |
|
"loss": 0.3049, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"learning_rate": 0.0002801721897388482, |
|
"loss": 0.3046, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"eval_loss": 0.288398802280426, |
|
"eval_runtime": 2.0315, |
|
"eval_samples_per_second": 1130.7, |
|
"eval_steps_per_second": 17.721, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"learning_rate": 0.0002799294675198124, |
|
"loss": 0.3062, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"learning_rate": 0.00027968537909213524, |
|
"loss": 0.3041, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"eval_loss": 0.2860032618045807, |
|
"eval_runtime": 1.9922, |
|
"eval_samples_per_second": 1152.987, |
|
"eval_steps_per_second": 18.07, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"learning_rate": 0.0002794399271251287, |
|
"loss": 0.3038, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"learning_rate": 0.0002791931143030162, |
|
"loss": 0.3046, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"eval_loss": 0.28703776001930237, |
|
"eval_runtime": 1.8567, |
|
"eval_samples_per_second": 1237.118, |
|
"eval_steps_per_second": 19.389, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"learning_rate": 0.00027894494332490315, |
|
"loss": 0.3034, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"learning_rate": 0.0002786954169047476, |
|
"loss": 0.3034, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"eval_loss": 0.2861557602882385, |
|
"eval_runtime": 1.9458, |
|
"eval_samples_per_second": 1180.495, |
|
"eval_steps_per_second": 18.501, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"learning_rate": 0.0002784445377713306, |
|
"loss": 0.3037, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"learning_rate": 0.0002781923086682261, |
|
"loss": 0.3034, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"eval_loss": 0.2837345004081726, |
|
"eval_runtime": 2.0145, |
|
"eval_samples_per_second": 1140.255, |
|
"eval_steps_per_second": 17.871, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"learning_rate": 0.0002779387323537711, |
|
"loss": 0.3033, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"learning_rate": 0.0002776838116010356, |
|
"loss": 0.3029, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"eval_loss": 0.2857857942581177, |
|
"eval_runtime": 1.9616, |
|
"eval_samples_per_second": 1170.988, |
|
"eval_steps_per_second": 18.352, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"learning_rate": 0.0002774275491977922, |
|
"loss": 0.3028, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"learning_rate": 0.0002771699479464853, |
|
"loss": 0.3026, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"eval_loss": 0.284712016582489, |
|
"eval_runtime": 2.0468, |
|
"eval_samples_per_second": 1122.254, |
|
"eval_steps_per_second": 17.589, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"learning_rate": 0.00027691101066420104, |
|
"loss": 0.3024, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"learning_rate": 0.0002766507401826361, |
|
"loss": 0.3022, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.2855146527290344, |
|
"eval_runtime": 1.9866, |
|
"eval_samples_per_second": 1156.261, |
|
"eval_steps_per_second": 18.122, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"learning_rate": 0.0002763891393480666, |
|
"loss": 0.3019, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"learning_rate": 0.0002761262110213175, |
|
"loss": 0.3021, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"eval_loss": 0.2834793031215668, |
|
"eval_runtime": 1.9281, |
|
"eval_samples_per_second": 1191.346, |
|
"eval_steps_per_second": 18.672, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"learning_rate": 0.00027586195807773083, |
|
"loss": 0.3017, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"learning_rate": 0.00027559638340713435, |
|
"loss": 0.3014, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"eval_loss": 0.28450024127960205, |
|
"eval_runtime": 2.0139, |
|
"eval_samples_per_second": 1140.591, |
|
"eval_steps_per_second": 17.876, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"learning_rate": 0.00027532948991381025, |
|
"loss": 0.3016, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"learning_rate": 0.00027506128051646287, |
|
"loss": 0.3012, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"eval_loss": 0.2832774817943573, |
|
"eval_runtime": 1.9735, |
|
"eval_samples_per_second": 1163.926, |
|
"eval_steps_per_second": 18.242, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"learning_rate": 0.00027479175814818733, |
|
"loss": 0.3041, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"learning_rate": 0.000274520925756437, |
|
"loss": 0.306, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"eval_loss": 0.2821986675262451, |
|
"eval_runtime": 1.9791, |
|
"eval_samples_per_second": 1160.624, |
|
"eval_steps_per_second": 18.19, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"learning_rate": 0.00027424878630299157, |
|
"loss": 0.301, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"learning_rate": 0.0002739753427639244, |
|
"loss": 0.3004, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"eval_loss": 0.28125685453414917, |
|
"eval_runtime": 2.0653, |
|
"eval_samples_per_second": 1112.189, |
|
"eval_steps_per_second": 17.431, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"learning_rate": 0.0002737005981295704, |
|
"loss": 0.3005, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"learning_rate": 0.0002734245554044927, |
|
"loss": 0.3, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"eval_loss": 0.28397423028945923, |
|
"eval_runtime": 2.0248, |
|
"eval_samples_per_second": 1134.431, |
|
"eval_steps_per_second": 17.78, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"learning_rate": 0.0002731472176074504, |
|
"loss": 0.3002, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"learning_rate": 0.0002728685877713653, |
|
"loss": 0.3, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"eval_loss": 0.28364112973213196, |
|
"eval_runtime": 1.9818, |
|
"eval_samples_per_second": 1159.029, |
|
"eval_steps_per_second": 18.165, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"learning_rate": 0.0002725886689432884, |
|
"loss": 0.3007, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"learning_rate": 0.0002723074641843674, |
|
"loss": 0.2996, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"eval_loss": 0.280828058719635, |
|
"eval_runtime": 2.0656, |
|
"eval_samples_per_second": 1112.04, |
|
"eval_steps_per_second": 17.429, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"learning_rate": 0.0002720249765698123, |
|
"loss": 0.2999, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"learning_rate": 0.0002717412091888626, |
|
"loss": 0.3004, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"eval_loss": 0.28047671914100647, |
|
"eval_runtime": 1.9731, |
|
"eval_samples_per_second": 1164.181, |
|
"eval_steps_per_second": 18.246, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"learning_rate": 0.00027145616514475274, |
|
"loss": 0.2996, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"learning_rate": 0.0002711698475546788, |
|
"loss": 0.299, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"eval_loss": 0.2800908386707306, |
|
"eval_runtime": 2.05, |
|
"eval_samples_per_second": 1120.501, |
|
"eval_steps_per_second": 17.561, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"learning_rate": 0.00027088225954976407, |
|
"loss": 0.2995, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 0.0002705934042750249, |
|
"loss": 0.2991, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"eval_loss": 0.27821099758148193, |
|
"eval_runtime": 2.0199, |
|
"eval_samples_per_second": 1137.17, |
|
"eval_steps_per_second": 17.822, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"learning_rate": 0.00027030328488933625, |
|
"loss": 0.2989, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"learning_rate": 0.00027001190456539726, |
|
"loss": 0.2987, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"eval_loss": 0.2801131308078766, |
|
"eval_runtime": 1.9909, |
|
"eval_samples_per_second": 1153.743, |
|
"eval_steps_per_second": 18.082, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"learning_rate": 0.0002697192664896965, |
|
"loss": 0.2987, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"learning_rate": 0.00026942537386247706, |
|
"loss": 0.2987, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"eval_loss": 0.2810814380645752, |
|
"eval_runtime": 1.998, |
|
"eval_samples_per_second": 1149.64, |
|
"eval_steps_per_second": 18.018, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"learning_rate": 0.0002691302298977016, |
|
"loss": 0.2985, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"learning_rate": 0.0002688338378230173, |
|
"loss": 0.298, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"eval_loss": 0.27943727374076843, |
|
"eval_runtime": 2.0072, |
|
"eval_samples_per_second": 1144.392, |
|
"eval_steps_per_second": 17.936, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"learning_rate": 0.00026853620087972035, |
|
"loss": 0.2983, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"learning_rate": 0.00026823732232272065, |
|
"loss": 0.2981, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"eval_loss": 0.27999699115753174, |
|
"eval_runtime": 2.0113, |
|
"eval_samples_per_second": 1142.034, |
|
"eval_steps_per_second": 17.899, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"learning_rate": 0.0002679372054205063, |
|
"loss": 0.2985, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"learning_rate": 0.0002676358534551076, |
|
"loss": 0.2976, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"eval_loss": 0.2797393500804901, |
|
"eval_runtime": 2.072, |
|
"eval_samples_per_second": 1108.589, |
|
"eval_steps_per_second": 17.374, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"learning_rate": 0.00026733326972206133, |
|
"loss": 0.2981, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"learning_rate": 0.0002670294575303748, |
|
"loss": 0.2977, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"eval_loss": 0.2784021496772766, |
|
"eval_runtime": 2.0212, |
|
"eval_samples_per_second": 1136.428, |
|
"eval_steps_per_second": 17.811, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"learning_rate": 0.0002667244202024894, |
|
"loss": 0.2975, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"learning_rate": 0.00026641816107424453, |
|
"loss": 0.297, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"eval_loss": 0.2790050208568573, |
|
"eval_runtime": 1.9904, |
|
"eval_samples_per_second": 1154.041, |
|
"eval_steps_per_second": 18.087, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"learning_rate": 0.0002661106834948409, |
|
"loss": 0.2971, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"learning_rate": 0.0002658019908268041, |
|
"loss": 0.2968, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"eval_loss": 0.27784037590026855, |
|
"eval_runtime": 1.9858, |
|
"eval_samples_per_second": 1156.736, |
|
"eval_steps_per_second": 18.129, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"learning_rate": 0.00026549208644594766, |
|
"loss": 0.2965, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"learning_rate": 0.00026518097374133627, |
|
"loss": 0.2965, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"eval_loss": 0.27897170186042786, |
|
"eval_runtime": 1.9786, |
|
"eval_samples_per_second": 1160.938, |
|
"eval_steps_per_second": 18.195, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"learning_rate": 0.00026486865611524853, |
|
"loss": 0.2964, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"learning_rate": 0.00026455513698314003, |
|
"loss": 0.2968, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"eval_loss": 0.27647751569747925, |
|
"eval_runtime": 1.9935, |
|
"eval_samples_per_second": 1152.23, |
|
"eval_steps_per_second": 18.058, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"learning_rate": 0.0002642404197736058, |
|
"loss": 0.2959, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"learning_rate": 0.0002639245079283428, |
|
"loss": 0.2962, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"eval_loss": 0.2772797644138336, |
|
"eval_runtime": 1.997, |
|
"eval_samples_per_second": 1150.213, |
|
"eval_steps_per_second": 18.027, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"learning_rate": 0.00026360740490211234, |
|
"loss": 0.2957, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"learning_rate": 0.0002632891141627023, |
|
"loss": 0.2961, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"eval_loss": 0.27765029668807983, |
|
"eval_runtime": 1.9854, |
|
"eval_samples_per_second": 1156.944, |
|
"eval_steps_per_second": 18.132, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"learning_rate": 0.00026296963919088923, |
|
"loss": 0.2956, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"learning_rate": 0.00026264898348040024, |
|
"loss": 0.2955, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"eval_loss": 0.27664855122566223, |
|
"eval_runtime": 1.9862, |
|
"eval_samples_per_second": 1156.498, |
|
"eval_steps_per_second": 18.125, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"learning_rate": 0.0002623271505378748, |
|
"loss": 0.2952, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"learning_rate": 0.00026200414388282637, |
|
"loss": 0.2951, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"eval_loss": 0.2762826979160309, |
|
"eval_runtime": 1.9933, |
|
"eval_samples_per_second": 1152.379, |
|
"eval_steps_per_second": 18.061, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"learning_rate": 0.00026167996704760406, |
|
"loss": 0.2952, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"learning_rate": 0.00026135462357735375, |
|
"loss": 0.295, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"eval_loss": 0.27805039286613464, |
|
"eval_runtime": 2.0209, |
|
"eval_samples_per_second": 1136.598, |
|
"eval_steps_per_second": 17.813, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"learning_rate": 0.0002610281170299795, |
|
"loss": 0.2948, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"learning_rate": 0.00026070045097610465, |
|
"loss": 0.2948, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"eval_loss": 0.27585625648498535, |
|
"eval_runtime": 2.0475, |
|
"eval_samples_per_second": 1121.859, |
|
"eval_steps_per_second": 17.582, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"learning_rate": 0.0002603716289990326, |
|
"loss": 0.2944, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"learning_rate": 0.00026004165469470787, |
|
"loss": 0.2945, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"eval_loss": 0.28066232800483704, |
|
"eval_runtime": 1.9112, |
|
"eval_samples_per_second": 1201.849, |
|
"eval_steps_per_second": 18.836, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"learning_rate": 0.0002597105316716766, |
|
"loss": 0.2948, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"learning_rate": 0.000259378263551047, |
|
"loss": 0.2945, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"eval_loss": 0.27713122963905334, |
|
"eval_runtime": 1.9955, |
|
"eval_samples_per_second": 1151.101, |
|
"eval_steps_per_second": 18.041, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"learning_rate": 0.0002590448539664501, |
|
"loss": 0.3036, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 0.00025871030656399966, |
|
"loss": 0.2997, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"eval_loss": 0.2775452733039856, |
|
"eval_runtime": 1.9855, |
|
"eval_samples_per_second": 1156.879, |
|
"eval_steps_per_second": 18.131, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"learning_rate": 0.00025837462500225255, |
|
"loss": 0.297, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"learning_rate": 0.0002580378129521685, |
|
"loss": 0.2998, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"eval_loss": 0.2764818072319031, |
|
"eval_runtime": 1.9519, |
|
"eval_samples_per_second": 1176.822, |
|
"eval_steps_per_second": 18.444, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"learning_rate": 0.0002576998740970701, |
|
"loss": 0.2949, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"learning_rate": 0.00025736081213260253, |
|
"loss": 0.2945, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"eval_loss": 0.27885541319847107, |
|
"eval_runtime": 1.9999, |
|
"eval_samples_per_second": 1148.549, |
|
"eval_steps_per_second": 18.001, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"learning_rate": 0.0002570206307666931, |
|
"loss": 0.2937, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"learning_rate": 0.0002566793337195108, |
|
"loss": 0.2935, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"eval_loss": 0.2775752544403076, |
|
"eval_runtime": 1.9798, |
|
"eval_samples_per_second": 1160.196, |
|
"eval_steps_per_second": 18.183, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"learning_rate": 0.0002563369247234254, |
|
"loss": 0.2935, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"learning_rate": 0.0002559934075229669, |
|
"loss": 0.2941, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"eval_loss": 0.2758236229419708, |
|
"eval_runtime": 1.9855, |
|
"eval_samples_per_second": 1156.897, |
|
"eval_steps_per_second": 18.132, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"learning_rate": 0.0002556487858747843, |
|
"loss": 0.2931, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"learning_rate": 0.00025530306354760464, |
|
"loss": 0.2934, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"eval_loss": 0.2756287157535553, |
|
"eval_runtime": 1.9891, |
|
"eval_samples_per_second": 1154.8, |
|
"eval_steps_per_second": 18.099, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"learning_rate": 0.000254956244322192, |
|
"loss": 0.2929, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"learning_rate": 0.00025460833199130595, |
|
"loss": 0.2931, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"eval_loss": 0.2749306857585907, |
|
"eval_runtime": 1.9885, |
|
"eval_samples_per_second": 1155.115, |
|
"eval_steps_per_second": 18.104, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"learning_rate": 0.00025425933035965983, |
|
"loss": 0.293, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"learning_rate": 0.00025390924324387965, |
|
"loss": 0.2929, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"eval_loss": 0.2756015658378601, |
|
"eval_runtime": 2.0342, |
|
"eval_samples_per_second": 1129.181, |
|
"eval_steps_per_second": 17.697, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"learning_rate": 0.0002535580744724621, |
|
"loss": 0.2928, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"learning_rate": 0.00025320582788573246, |
|
"loss": 0.2927, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"eval_loss": 0.27484118938446045, |
|
"eval_runtime": 1.9877, |
|
"eval_samples_per_second": 1155.618, |
|
"eval_steps_per_second": 18.112, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"learning_rate": 0.000252852507335803, |
|
"loss": 0.2923, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"learning_rate": 0.0002524981166865307, |
|
"loss": 0.2924, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"eval_loss": 0.2738962769508362, |
|
"eval_runtime": 1.9816, |
|
"eval_samples_per_second": 1159.146, |
|
"eval_steps_per_second": 18.167, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"learning_rate": 0.00025214265981347487, |
|
"loss": 0.2921, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"learning_rate": 0.000251786140603855, |
|
"loss": 0.292, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"eval_loss": 0.27476462721824646, |
|
"eval_runtime": 1.9539, |
|
"eval_samples_per_second": 1175.601, |
|
"eval_steps_per_second": 18.425, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"learning_rate": 0.00025142856295650795, |
|
"loss": 0.2919, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"learning_rate": 0.0002510699307818457, |
|
"loss": 0.2917, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"eval_loss": 0.2740522623062134, |
|
"eval_runtime": 1.9887, |
|
"eval_samples_per_second": 1155.036, |
|
"eval_steps_per_second": 18.102, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"learning_rate": 0.00025071024800181214, |
|
"loss": 0.2913, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"learning_rate": 0.0002503495185498405, |
|
"loss": 0.2913, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"eval_loss": 0.27570095658302307, |
|
"eval_runtime": 1.9561, |
|
"eval_samples_per_second": 1174.293, |
|
"eval_steps_per_second": 18.404, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"learning_rate": 0.00024998774637081044, |
|
"loss": 0.2914, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"learning_rate": 0.00024962493542100443, |
|
"loss": 0.2911, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"eval_loss": 0.27467048168182373, |
|
"eval_runtime": 1.9751, |
|
"eval_samples_per_second": 1163.003, |
|
"eval_steps_per_second": 18.227, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"learning_rate": 0.0002492610896680649, |
|
"loss": 0.2911, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"learning_rate": 0.00024889621309095067, |
|
"loss": 0.2911, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"eval_loss": 0.27558836340904236, |
|
"eval_runtime": 1.9879, |
|
"eval_samples_per_second": 1155.504, |
|
"eval_steps_per_second": 18.11, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"learning_rate": 0.00024853030967989366, |
|
"loss": 0.2913, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"learning_rate": 0.00024816338343635485, |
|
"loss": 0.2909, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_loss": 0.27131345868110657, |
|
"eval_runtime": 1.9424, |
|
"eval_samples_per_second": 1182.575, |
|
"eval_steps_per_second": 18.534, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"learning_rate": 0.0002477954383729809, |
|
"loss": 0.2908, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"learning_rate": 0.00024742647851355997, |
|
"loss": 0.2908, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"eval_loss": 0.2729162871837616, |
|
"eval_runtime": 2.0222, |
|
"eval_samples_per_second": 1135.884, |
|
"eval_steps_per_second": 17.802, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"learning_rate": 0.0002470565078929781, |
|
"loss": 0.2909, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"learning_rate": 0.00024668553055717465, |
|
"loss": 0.2905, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"eval_loss": 0.2731362581253052, |
|
"eval_runtime": 1.928, |
|
"eval_samples_per_second": 1191.362, |
|
"eval_steps_per_second": 18.672, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"learning_rate": 0.0002463135505630984, |
|
"loss": 0.2905, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"learning_rate": 0.00024594057197866283, |
|
"loss": 0.2902, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"eval_loss": 0.27327215671539307, |
|
"eval_runtime": 1.9424, |
|
"eval_samples_per_second": 1182.586, |
|
"eval_steps_per_second": 18.534, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"learning_rate": 0.0002455665988827021, |
|
"loss": 0.2899, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"learning_rate": 0.0002451916353649261, |
|
"loss": 0.2903, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"eval_loss": 0.2730531692504883, |
|
"eval_runtime": 1.9636, |
|
"eval_samples_per_second": 1169.786, |
|
"eval_steps_per_second": 18.334, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"learning_rate": 0.00024481568552587566, |
|
"loss": 0.2901, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"learning_rate": 0.0002444387534768781, |
|
"loss": 0.2895, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"eval_loss": 0.2728096544742584, |
|
"eval_runtime": 1.9828, |
|
"eval_samples_per_second": 1158.488, |
|
"eval_steps_per_second": 18.157, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"learning_rate": 0.0002440608433400018, |
|
"loss": 0.2938, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"learning_rate": 0.00024368195924801158, |
|
"loss": 0.2904, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"eval_loss": 0.27427318692207336, |
|
"eval_runtime": 1.9845, |
|
"eval_samples_per_second": 1157.485, |
|
"eval_steps_per_second": 18.141, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"learning_rate": 0.00024330210534432314, |
|
"loss": 0.29, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"learning_rate": 0.0002429212857829579, |
|
"loss": 0.2891, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"eval_loss": 0.2683263421058655, |
|
"eval_runtime": 1.9558, |
|
"eval_samples_per_second": 1174.46, |
|
"eval_steps_per_second": 18.407, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"learning_rate": 0.00024253950472849758, |
|
"loss": 0.2893, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"learning_rate": 0.0002421567663560386, |
|
"loss": 0.2888, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"eval_loss": 0.27206218242645264, |
|
"eval_runtime": 1.9344, |
|
"eval_samples_per_second": 1187.465, |
|
"eval_steps_per_second": 18.611, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"learning_rate": 0.00024177307485114653, |
|
"loss": 0.2891, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"learning_rate": 0.0002413884344098101, |
|
"loss": 0.2889, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"eval_loss": 0.2694759666919708, |
|
"eval_runtime": 1.9374, |
|
"eval_samples_per_second": 1185.599, |
|
"eval_steps_per_second": 18.581, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"learning_rate": 0.00024100284923839568, |
|
"loss": 0.2892, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"learning_rate": 0.0002406163235536008, |
|
"loss": 0.289, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"eval_loss": 0.27140599489212036, |
|
"eval_runtime": 2.0107, |
|
"eval_samples_per_second": 1142.373, |
|
"eval_steps_per_second": 17.904, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"learning_rate": 0.00024022886158240857, |
|
"loss": 0.2887, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"learning_rate": 0.0002398404675620409, |
|
"loss": 0.6252, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 0.6785263419151306, |
|
"eval_runtime": 1.9867, |
|
"eval_samples_per_second": 1156.175, |
|
"eval_steps_per_second": 18.12, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"learning_rate": 0.0002394511457399126, |
|
"loss": 0.6774, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"learning_rate": 0.00023906090037358478, |
|
"loss": 0.6773, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"eval_loss": 0.6770340800285339, |
|
"eval_runtime": 1.9319, |
|
"eval_samples_per_second": 1188.977, |
|
"eval_steps_per_second": 18.634, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"learning_rate": 0.0002386697357307182, |
|
"loss": 0.6769, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"learning_rate": 0.00023827765608902676, |
|
"loss": 0.6547, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"eval_loss": 0.6202901601791382, |
|
"eval_runtime": 1.9551, |
|
"eval_samples_per_second": 1174.884, |
|
"eval_steps_per_second": 18.414, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"learning_rate": 0.0002378846657362306, |
|
"loss": 0.3248, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"learning_rate": 0.00023749076897000928, |
|
"loss": 0.2956, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"eval_loss": 0.27639877796173096, |
|
"eval_runtime": 1.9923, |
|
"eval_samples_per_second": 1152.936, |
|
"eval_steps_per_second": 18.07, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"learning_rate": 0.00023709597009795465, |
|
"loss": 0.2929, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"learning_rate": 0.000236700273437524, |
|
"loss": 0.2914, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"eval_loss": 0.27524837851524353, |
|
"eval_runtime": 1.9856, |
|
"eval_samples_per_second": 1156.853, |
|
"eval_steps_per_second": 18.131, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"learning_rate": 0.0002363036833159925, |
|
"loss": 0.2913, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"learning_rate": 0.00023590620407040633, |
|
"loss": 0.2901, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"eval_loss": 0.27291467785835266, |
|
"eval_runtime": 2.0289, |
|
"eval_samples_per_second": 1132.125, |
|
"eval_steps_per_second": 17.743, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"learning_rate": 0.00023550784004753471, |
|
"loss": 0.2896, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"learning_rate": 0.0002351085956038229, |
|
"loss": 0.2891, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"eval_loss": 0.2731077969074249, |
|
"eval_runtime": 1.9499, |
|
"eval_samples_per_second": 1177.996, |
|
"eval_steps_per_second": 18.462, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"learning_rate": 0.00023470847510534407, |
|
"loss": 0.2891, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"learning_rate": 0.00023430748292775188, |
|
"loss": 0.2886, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"eval_loss": 0.2711014151573181, |
|
"eval_runtime": 1.958, |
|
"eval_samples_per_second": 1173.126, |
|
"eval_steps_per_second": 18.386, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"learning_rate": 0.0002339056234562326, |
|
"loss": 0.2887, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"learning_rate": 0.00023350290108545694, |
|
"loss": 0.2882, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"eval_loss": 0.27429357171058655, |
|
"eval_runtime": 1.9855, |
|
"eval_samples_per_second": 1156.89, |
|
"eval_steps_per_second": 18.131, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"learning_rate": 0.00023309932021953238, |
|
"loss": 0.2883, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"learning_rate": 0.00023269488527195446, |
|
"loss": 0.2881, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"eval_loss": 0.26987290382385254, |
|
"eval_runtime": 1.9002, |
|
"eval_samples_per_second": 1208.791, |
|
"eval_steps_per_second": 18.945, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"learning_rate": 0.00023228960066555907, |
|
"loss": 0.2878, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"learning_rate": 0.00023188347083247365, |
|
"loss": 0.2875, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"eval_loss": 0.2710895836353302, |
|
"eval_runtime": 1.9126, |
|
"eval_samples_per_second": 1200.973, |
|
"eval_steps_per_second": 18.822, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"learning_rate": 0.00023147650021406905, |
|
"loss": 0.2875, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"learning_rate": 0.00023106869326091075, |
|
"loss": 0.2873, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"eval_loss": 0.27097710967063904, |
|
"eval_runtime": 2.0543, |
|
"eval_samples_per_second": 1118.139, |
|
"eval_steps_per_second": 17.524, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"learning_rate": 0.00023066005443271017, |
|
"loss": 0.2877, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"learning_rate": 0.00023025058819827618, |
|
"loss": 0.287, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"eval_loss": 0.2711479663848877, |
|
"eval_runtime": 2.0281, |
|
"eval_samples_per_second": 1132.606, |
|
"eval_steps_per_second": 17.751, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"learning_rate": 0.00022984029903546578, |
|
"loss": 0.2873, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"learning_rate": 0.00022942919143113572, |
|
"loss": 0.2873, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"eval_loss": 0.2718214690685272, |
|
"eval_runtime": 1.9128, |
|
"eval_samples_per_second": 1200.871, |
|
"eval_steps_per_second": 18.821, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"learning_rate": 0.0002290172698810927, |
|
"loss": 0.525, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"learning_rate": 0.00022860453889004493, |
|
"loss": 0.6785, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"eval_loss": 0.6782656311988831, |
|
"eval_runtime": 1.9047, |
|
"eval_samples_per_second": 1205.991, |
|
"eval_steps_per_second": 18.901, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"learning_rate": 0.00022819100297155235, |
|
"loss": 0.6774, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"learning_rate": 0.0002277766666479774, |
|
"loss": 0.6773, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"eval_loss": 0.6771561503410339, |
|
"eval_runtime": 2.0046, |
|
"eval_samples_per_second": 1145.885, |
|
"eval_steps_per_second": 17.959, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"learning_rate": 0.00022736153445043595, |
|
"loss": 0.6595, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"learning_rate": 0.00022694561091874706, |
|
"loss": 0.3007, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"eval_loss": 0.2753528952598572, |
|
"eval_runtime": 2.0143, |
|
"eval_samples_per_second": 1140.359, |
|
"eval_steps_per_second": 17.872, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"learning_rate": 0.00022652890060138387, |
|
"loss": 0.2927, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"learning_rate": 0.00022611140805542366, |
|
"loss": 0.2905, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"eval_loss": 0.27167341113090515, |
|
"eval_runtime": 1.9686, |
|
"eval_samples_per_second": 1166.834, |
|
"eval_steps_per_second": 18.287, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"learning_rate": 0.00022569313784649798, |
|
"loss": 0.2891, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"learning_rate": 0.0002252740945487429, |
|
"loss": 0.288, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"eval_loss": 0.272190660238266, |
|
"eval_runtime": 1.9619, |
|
"eval_samples_per_second": 1170.775, |
|
"eval_steps_per_second": 18.349, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"learning_rate": 0.00022485428274474867, |
|
"loss": 0.2881, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"learning_rate": 0.00022443370702551, |
|
"loss": 0.2873, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"eval_loss": 0.27140477299690247, |
|
"eval_runtime": 1.9557, |
|
"eval_samples_per_second": 1174.514, |
|
"eval_steps_per_second": 18.408, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"learning_rate": 0.00022401237199037565, |
|
"loss": 0.287, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"learning_rate": 0.0002235902822469979, |
|
"loss": 0.2866, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"eval_loss": 0.2686367928981781, |
|
"eval_runtime": 1.9954, |
|
"eval_samples_per_second": 1151.13, |
|
"eval_steps_per_second": 18.041, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"learning_rate": 0.00022316744241128268, |
|
"loss": 0.2864, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"learning_rate": 0.00022274385710733855, |
|
"loss": 0.2862, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"eval_loss": 0.2684060335159302, |
|
"eval_runtime": 2.0107, |
|
"eval_samples_per_second": 1142.364, |
|
"eval_steps_per_second": 17.904, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"learning_rate": 0.00022231953096742672, |
|
"loss": 0.286, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"learning_rate": 0.00022189446863190974, |
|
"loss": 0.2862, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"eval_loss": 0.2695789337158203, |
|
"eval_runtime": 1.9916, |
|
"eval_samples_per_second": 1153.341, |
|
"eval_steps_per_second": 18.076, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"learning_rate": 0.00022146867474920118, |
|
"loss": 0.2859, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"learning_rate": 0.00022104215397571484, |
|
"loss": 0.2858, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"eval_loss": 0.270698606967926, |
|
"eval_runtime": 2.0195, |
|
"eval_samples_per_second": 1137.436, |
|
"eval_steps_per_second": 17.827, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"learning_rate": 0.0002206149109758135, |
|
"loss": 0.2855, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"learning_rate": 0.00022018695042175818, |
|
"loss": 0.2865, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"eval_loss": 0.2688451111316681, |
|
"eval_runtime": 2.0, |
|
"eval_samples_per_second": 1148.488, |
|
"eval_steps_per_second": 18.0, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"learning_rate": 0.00021975827699365693, |
|
"loss": 0.2849, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"learning_rate": 0.00021932889537941365, |
|
"loss": 0.2851, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"eval_loss": 0.26711875200271606, |
|
"eval_runtime": 1.9779, |
|
"eval_samples_per_second": 1161.305, |
|
"eval_steps_per_second": 18.201, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"learning_rate": 0.0002188988102746769, |
|
"loss": 0.2871, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"learning_rate": 0.0002184680263827885, |
|
"loss": 0.2853, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"eval_loss": 0.26893720030784607, |
|
"eval_runtime": 1.9488, |
|
"eval_samples_per_second": 1178.699, |
|
"eval_steps_per_second": 18.473, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"learning_rate": 0.00021803654841473204, |
|
"loss": 0.286, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"learning_rate": 0.00021760438108908142, |
|
"loss": 0.2853, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"eval_loss": 0.2694932520389557, |
|
"eval_runtime": 1.9477, |
|
"eval_samples_per_second": 1179.331, |
|
"eval_steps_per_second": 18.483, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"learning_rate": 0.0002171715291319494, |
|
"loss": 0.2845, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"learning_rate": 0.0002167379972769355, |
|
"loss": 0.2853, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"eval_loss": 0.2688761353492737, |
|
"eval_runtime": 1.976, |
|
"eval_samples_per_second": 1162.467, |
|
"eval_steps_per_second": 18.219, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"learning_rate": 0.0002163037902650747, |
|
"loss": 0.2845, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"learning_rate": 0.0002158689128447853, |
|
"loss": 0.2844, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"eval_loss": 0.2665814757347107, |
|
"eval_runtime": 1.9647, |
|
"eval_samples_per_second": 1169.149, |
|
"eval_steps_per_second": 18.324, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"learning_rate": 0.00021543336977181704, |
|
"loss": 0.2841, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"learning_rate": 0.00021499716580919933, |
|
"loss": 0.2837, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"eval_loss": 0.2675575613975525, |
|
"eval_runtime": 2.0149, |
|
"eval_samples_per_second": 1140.003, |
|
"eval_steps_per_second": 17.867, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"learning_rate": 0.00021456030572718866, |
|
"loss": 0.2841, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"learning_rate": 0.000214122794303217, |
|
"loss": 0.2837, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"eval_loss": 0.2669568657875061, |
|
"eval_runtime": 1.9768, |
|
"eval_samples_per_second": 1161.955, |
|
"eval_steps_per_second": 18.211, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"learning_rate": 0.00021368463632183912, |
|
"loss": 0.2838, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"learning_rate": 0.00021324583657468055, |
|
"loss": 0.2838, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"eval_loss": 0.2675861120223999, |
|
"eval_runtime": 2.0013, |
|
"eval_samples_per_second": 1147.731, |
|
"eval_steps_per_second": 17.988, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"learning_rate": 0.000212806399860385, |
|
"loss": 0.2837, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"learning_rate": 0.00021236633098456196, |
|
"loss": 0.2835, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"eval_loss": 0.26529645919799805, |
|
"eval_runtime": 2.0487, |
|
"eval_samples_per_second": 1121.175, |
|
"eval_steps_per_second": 17.572, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"learning_rate": 0.0002119256347597342, |
|
"loss": 0.2834, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"learning_rate": 0.000211484316005285, |
|
"loss": 0.2835, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"eval_loss": 0.2684980034828186, |
|
"eval_runtime": 2.049, |
|
"eval_samples_per_second": 1121.029, |
|
"eval_steps_per_second": 17.569, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"learning_rate": 0.00021104237954740554, |
|
"loss": 0.2833, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"learning_rate": 0.00021059983021904215, |
|
"loss": 0.2834, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"eval_loss": 0.26848986744880676, |
|
"eval_runtime": 2.068, |
|
"eval_samples_per_second": 1110.727, |
|
"eval_steps_per_second": 17.408, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"learning_rate": 0.00021015667285984336, |
|
"loss": 0.2834, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"learning_rate": 0.00020971291231610707, |
|
"loss": 0.283, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"eval_loss": 0.26503807306289673, |
|
"eval_runtime": 1.9718, |
|
"eval_samples_per_second": 1164.914, |
|
"eval_steps_per_second": 18.257, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"learning_rate": 0.0002092685534407274, |
|
"loss": 0.2829, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"learning_rate": 0.00020882360109314197, |
|
"loss": 0.2828, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"eval_loss": 0.2652031183242798, |
|
"eval_runtime": 2.0127, |
|
"eval_samples_per_second": 1141.23, |
|
"eval_steps_per_second": 17.886, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"learning_rate": 0.0002083780601392783, |
|
"loss": 0.2832, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"learning_rate": 0.0002079319354515008, |
|
"loss": 0.2827, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"eval_loss": 0.26393648982048035, |
|
"eval_runtime": 2.0365, |
|
"eval_samples_per_second": 1127.908, |
|
"eval_steps_per_second": 17.677, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"learning_rate": 0.00020748523190855772, |
|
"loss": 0.2827, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"learning_rate": 0.0002070379543955273, |
|
"loss": 0.2826, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"eval_loss": 0.26564860343933105, |
|
"eval_runtime": 1.9676, |
|
"eval_samples_per_second": 1167.423, |
|
"eval_steps_per_second": 18.297, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"learning_rate": 0.00020659010780376487, |
|
"loss": 0.2825, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"learning_rate": 0.00020614169703084896, |
|
"loss": 0.2824, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"eval_loss": 0.26574334502220154, |
|
"eval_runtime": 2.0204, |
|
"eval_samples_per_second": 1136.893, |
|
"eval_steps_per_second": 17.818, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"learning_rate": 0.0002056927269805279, |
|
"loss": 0.2822, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"learning_rate": 0.00020524320256266635, |
|
"loss": 0.2823, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"eval_loss": 0.26812705397605896, |
|
"eval_runtime": 2.0173, |
|
"eval_samples_per_second": 1138.645, |
|
"eval_steps_per_second": 17.846, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"learning_rate": 0.0002047931286931912, |
|
"loss": 0.2821, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"learning_rate": 0.00020434251029403824, |
|
"loss": 0.2819, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"eval_loss": 0.2622651159763336, |
|
"eval_runtime": 1.9772, |
|
"eval_samples_per_second": 1161.716, |
|
"eval_steps_per_second": 18.207, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"learning_rate": 0.00020389135229309803, |
|
"loss": 0.2819, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"learning_rate": 0.00020343965962416229, |
|
"loss": 0.2818, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"eval_loss": 0.26698386669158936, |
|
"eval_runtime": 1.9801, |
|
"eval_samples_per_second": 1160.056, |
|
"eval_steps_per_second": 18.181, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"learning_rate": 0.00020298743722686958, |
|
"loss": 0.2818, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"learning_rate": 0.0002025346900466516, |
|
"loss": 0.2818, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"eval_loss": 0.2666832506656647, |
|
"eval_runtime": 2.0181, |
|
"eval_samples_per_second": 1138.224, |
|
"eval_steps_per_second": 17.839, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"learning_rate": 0.0002020814230346791, |
|
"loss": 0.2815, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"learning_rate": 0.00020162764114780733, |
|
"loss": 0.2816, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"eval_loss": 0.2648046016693115, |
|
"eval_runtime": 1.9978, |
|
"eval_samples_per_second": 1149.778, |
|
"eval_steps_per_second": 18.02, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"learning_rate": 0.0002011733493485224, |
|
"loss": 0.2817, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"learning_rate": 0.00020071855260488664, |
|
"loss": 0.2815, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"eval_loss": 0.2653491497039795, |
|
"eval_runtime": 1.9416, |
|
"eval_samples_per_second": 1183.015, |
|
"eval_steps_per_second": 18.541, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"learning_rate": 0.0002002632558904843, |
|
"loss": 0.2813, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"learning_rate": 0.00019980746418436736, |
|
"loss": 0.2814, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"eval_loss": 0.2634630799293518, |
|
"eval_runtime": 1.9884, |
|
"eval_samples_per_second": 1155.227, |
|
"eval_steps_per_second": 18.105, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"learning_rate": 0.00019935118247100088, |
|
"loss": 0.6253, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"learning_rate": 0.00019889441574020864, |
|
"loss": 0.6773, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"eval_loss": 0.6766608357429504, |
|
"eval_runtime": 2.0373, |
|
"eval_samples_per_second": 1127.446, |
|
"eval_steps_per_second": 17.67, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"learning_rate": 0.0001984371689871183, |
|
"loss": 0.6773, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"learning_rate": 0.00019797944721210725, |
|
"loss": 0.6251, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"eval_loss": 0.29495057463645935, |
|
"eval_runtime": 1.9016, |
|
"eval_samples_per_second": 1207.906, |
|
"eval_steps_per_second": 18.931, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"learning_rate": 0.00019752125542074736, |
|
"loss": 0.2897, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"learning_rate": 0.00019706259862375074, |
|
"loss": 0.285, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"eval_loss": 0.2679499685764313, |
|
"eval_runtime": 1.9722, |
|
"eval_samples_per_second": 1164.712, |
|
"eval_steps_per_second": 18.254, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"learning_rate": 0.00019660348183691453, |
|
"loss": 0.2835, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"learning_rate": 0.0001961439100810664, |
|
"loss": 0.2825, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"eval_loss": 0.26572567224502563, |
|
"eval_runtime": 2.0053, |
|
"eval_samples_per_second": 1145.441, |
|
"eval_steps_per_second": 17.952, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"learning_rate": 0.00019568388838200952, |
|
"loss": 0.2819, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"learning_rate": 0.00019522342177046744, |
|
"loss": 0.2818, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"eval_loss": 0.26373621821403503, |
|
"eval_runtime": 1.9567, |
|
"eval_samples_per_second": 1173.894, |
|
"eval_steps_per_second": 18.398, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"learning_rate": 0.00019476251528202922, |
|
"loss": 0.2815, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"learning_rate": 0.0001943011739570944, |
|
"loss": 0.2812, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"eval_loss": 0.26436519622802734, |
|
"eval_runtime": 2.0008, |
|
"eval_samples_per_second": 1148.012, |
|
"eval_steps_per_second": 17.992, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"learning_rate": 0.00019383940284081774, |
|
"loss": 0.2808, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"learning_rate": 0.00019337720698305431, |
|
"loss": 0.2808, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"eval_loss": 0.26223430037498474, |
|
"eval_runtime": 1.9438, |
|
"eval_samples_per_second": 1181.718, |
|
"eval_steps_per_second": 18.521, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"learning_rate": 0.0001929145914383038, |
|
"loss": 0.2805, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"learning_rate": 0.00019245156126565586, |
|
"loss": 0.2804, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"eval_loss": 0.26395902037620544, |
|
"eval_runtime": 1.9723, |
|
"eval_samples_per_second": 1164.61, |
|
"eval_steps_per_second": 18.252, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"learning_rate": 0.00019198812152873416, |
|
"loss": 0.2805, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"learning_rate": 0.00019152427729564144, |
|
"loss": 0.2803, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.2627279460430145, |
|
"eval_runtime": 1.9464, |
|
"eval_samples_per_second": 1180.147, |
|
"eval_steps_per_second": 18.496, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 5.01, |
|
"learning_rate": 0.00019106003363890395, |
|
"loss": 0.2801, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 5.02, |
|
"learning_rate": 0.00019059539563541584, |
|
"loss": 0.28, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 5.02, |
|
"eval_loss": 0.2637809216976166, |
|
"eval_runtime": 1.9918, |
|
"eval_samples_per_second": 1153.2, |
|
"eval_steps_per_second": 18.074, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 5.03, |
|
"learning_rate": 0.000190130368366384, |
|
"loss": 0.2797, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"learning_rate": 0.00018966495691727207, |
|
"loss": 0.2797, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"eval_loss": 0.26251962780952454, |
|
"eval_runtime": 2.0094, |
|
"eval_samples_per_second": 1143.149, |
|
"eval_steps_per_second": 17.916, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"learning_rate": 0.0001891991663777451, |
|
"loss": 0.2797, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"learning_rate": 0.00018873300184161387, |
|
"loss": 0.2795, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"eval_loss": 0.26239344477653503, |
|
"eval_runtime": 2.0026, |
|
"eval_samples_per_second": 1146.993, |
|
"eval_steps_per_second": 17.976, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 5.07, |
|
"learning_rate": 0.00018826646840677894, |
|
"loss": 0.2798, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 5.09, |
|
"learning_rate": 0.00018779957117517532, |
|
"loss": 0.2825, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 5.09, |
|
"eval_loss": 0.2686799168586731, |
|
"eval_runtime": 2.0437, |
|
"eval_samples_per_second": 1123.939, |
|
"eval_steps_per_second": 17.615, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"learning_rate": 0.00018733231525271625, |
|
"loss": 0.2845, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"learning_rate": 0.00018686470574923766, |
|
"loss": 0.283, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"eval_loss": 0.2658364176750183, |
|
"eval_runtime": 1.9961, |
|
"eval_samples_per_second": 1150.721, |
|
"eval_steps_per_second": 18.035, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"learning_rate": 0.00018639674777844224, |
|
"loss": 0.2821, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 5.13, |
|
"learning_rate": 0.00018592844645784327, |
|
"loss": 0.2791, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 5.13, |
|
"eval_loss": 0.2603645920753479, |
|
"eval_runtime": 2.047, |
|
"eval_samples_per_second": 1122.145, |
|
"eval_steps_per_second": 17.587, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 5.14, |
|
"learning_rate": 0.00018545980690870903, |
|
"loss": 0.2799, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 5.15, |
|
"learning_rate": 0.00018499083425600648, |
|
"loss": 0.2791, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 5.15, |
|
"eval_loss": 0.26100829243659973, |
|
"eval_runtime": 1.9738, |
|
"eval_samples_per_second": 1163.773, |
|
"eval_steps_per_second": 18.239, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"learning_rate": 0.00018452153362834552, |
|
"loss": 0.2791, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 5.17, |
|
"learning_rate": 0.00018405191015792254, |
|
"loss": 0.2829, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 5.17, |
|
"eval_loss": 0.26664701104164124, |
|
"eval_runtime": 2.018, |
|
"eval_samples_per_second": 1138.267, |
|
"eval_steps_per_second": 17.84, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"learning_rate": 0.0001835819689804646, |
|
"loss": 0.2818, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"learning_rate": 0.0001831117152351732, |
|
"loss": 0.2789, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"eval_loss": 0.2605225443840027, |
|
"eval_runtime": 1.994, |
|
"eval_samples_per_second": 1151.94, |
|
"eval_steps_per_second": 18.054, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 5.21, |
|
"learning_rate": 0.00018264115406466778, |
|
"loss": 0.2784, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"learning_rate": 0.00018217029061493007, |
|
"loss": 0.2816, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"eval_loss": 0.26246902346611023, |
|
"eval_runtime": 2.0381, |
|
"eval_samples_per_second": 1127.052, |
|
"eval_steps_per_second": 17.664, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 5.23, |
|
"learning_rate": 0.00018169913003524717, |
|
"loss": 0.2818, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"learning_rate": 0.00018122767747815594, |
|
"loss": 0.2791, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"eval_loss": 0.26137277483940125, |
|
"eval_runtime": 2.0611, |
|
"eval_samples_per_second": 1114.437, |
|
"eval_steps_per_second": 17.466, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"learning_rate": 0.00018075593809938574, |
|
"loss": 0.2783, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 5.26, |
|
"learning_rate": 0.00018028391705780295, |
|
"loss": 0.2782, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 5.26, |
|
"eval_loss": 0.26220422983169556, |
|
"eval_runtime": 2.0409, |
|
"eval_samples_per_second": 1125.474, |
|
"eval_steps_per_second": 17.639, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"learning_rate": 0.0001798116195153541, |
|
"loss": 0.2782, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 5.29, |
|
"learning_rate": 0.0001793390506370094, |
|
"loss": 0.2816, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 5.29, |
|
"eval_loss": 0.2643316388130188, |
|
"eval_runtime": 2.0022, |
|
"eval_samples_per_second": 1147.236, |
|
"eval_steps_per_second": 17.98, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 5.3, |
|
"learning_rate": 0.00017886621559070638, |
|
"loss": 0.281, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 5.31, |
|
"learning_rate": 0.00017839311954729337, |
|
"loss": 0.2799, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 5.31, |
|
"eval_loss": 0.2624029219150543, |
|
"eval_runtime": 1.9308, |
|
"eval_samples_per_second": 1189.661, |
|
"eval_steps_per_second": 18.645, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"learning_rate": 0.00017791976768047292, |
|
"loss": 0.28, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"learning_rate": 0.00017744616516674518, |
|
"loss": 0.2792, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"eval_loss": 0.2625776529312134, |
|
"eval_runtime": 1.9068, |
|
"eval_samples_per_second": 1204.635, |
|
"eval_steps_per_second": 18.88, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 5.34, |
|
"learning_rate": 0.00017697231718535132, |
|
"loss": 0.2807, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 5.35, |
|
"learning_rate": 0.00017649822891821707, |
|
"loss": 0.2794, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 5.35, |
|
"eval_loss": 0.26089775562286377, |
|
"eval_runtime": 2.0641, |
|
"eval_samples_per_second": 1112.856, |
|
"eval_steps_per_second": 17.441, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 5.36, |
|
"learning_rate": 0.00017602390554989563, |
|
"loss": 0.2782, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"learning_rate": 0.0001755493522675115, |
|
"loss": 0.2778, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"eval_loss": 0.26353833079338074, |
|
"eval_runtime": 2.0451, |
|
"eval_samples_per_second": 1123.162, |
|
"eval_steps_per_second": 17.603, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 5.39, |
|
"learning_rate": 0.00017507457426070317, |
|
"loss": 0.2778, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"learning_rate": 0.00017459957672156704, |
|
"loss": 0.2843, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"eval_loss": 0.2617889940738678, |
|
"eval_runtime": 2.1044, |
|
"eval_samples_per_second": 1091.504, |
|
"eval_steps_per_second": 17.107, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 5.41, |
|
"learning_rate": 0.00017412436484459998, |
|
"loss": 0.2799, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"learning_rate": 0.00017364894382664297, |
|
"loss": 0.2778, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"eval_loss": 0.261848509311676, |
|
"eval_runtime": 2.0011, |
|
"eval_samples_per_second": 1147.894, |
|
"eval_steps_per_second": 17.991, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"learning_rate": 0.00017317331886682408, |
|
"loss": 0.2781, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"learning_rate": 0.0001726974951665017, |
|
"loss": 0.2777, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"eval_loss": 0.2593342363834381, |
|
"eval_runtime": 2.0597, |
|
"eval_samples_per_second": 1115.201, |
|
"eval_steps_per_second": 17.478, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"learning_rate": 0.0001722214779292076, |
|
"loss": 0.2771, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"learning_rate": 0.00017174527236058998, |
|
"loss": 0.2769, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"eval_loss": 0.2584320306777954, |
|
"eval_runtime": 2.0154, |
|
"eval_samples_per_second": 1139.721, |
|
"eval_steps_per_second": 17.862, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"learning_rate": 0.00017126888366835662, |
|
"loss": 0.278, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"learning_rate": 0.000170792317062218, |
|
"loss": 0.2793, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"eval_loss": 0.2609806954860687, |
|
"eval_runtime": 2.0172, |
|
"eval_samples_per_second": 1138.717, |
|
"eval_steps_per_second": 17.847, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"learning_rate": 0.00017031557775383011, |
|
"loss": 0.277, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 5.51, |
|
"learning_rate": 0.0001698386709567377, |
|
"loss": 0.2775, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 5.51, |
|
"eval_loss": 0.25816255807876587, |
|
"eval_runtime": 2.0194, |
|
"eval_samples_per_second": 1137.488, |
|
"eval_steps_per_second": 17.827, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"learning_rate": 0.0001693616018863171, |
|
"loss": 0.2768, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 5.53, |
|
"learning_rate": 0.00016888437575971913, |
|
"loss": 0.278, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 5.53, |
|
"eval_loss": 0.2611255645751953, |
|
"eval_runtime": 2.0505, |
|
"eval_samples_per_second": 1120.189, |
|
"eval_steps_per_second": 17.556, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"learning_rate": 0.00016840699779581238, |
|
"loss": 0.2768, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"learning_rate": 0.00016792947321512573, |
|
"loss": 0.2765, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"eval_loss": 0.25990262627601624, |
|
"eval_runtime": 2.0037, |
|
"eval_samples_per_second": 1146.355, |
|
"eval_steps_per_second": 17.966, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"learning_rate": 0.00016745180723979144, |
|
"loss": 0.2796, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 5.58, |
|
"learning_rate": 0.00016697400509348818, |
|
"loss": 0.2767, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 5.58, |
|
"eval_loss": 0.25914427638053894, |
|
"eval_runtime": 2.0331, |
|
"eval_samples_per_second": 1129.79, |
|
"eval_steps_per_second": 17.707, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"learning_rate": 0.00016649607200138356, |
|
"loss": 0.2772, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"learning_rate": 0.00016601801319007743, |
|
"loss": 0.2766, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"eval_loss": 0.25976353883743286, |
|
"eval_runtime": 2.0645, |
|
"eval_samples_per_second": 1112.603, |
|
"eval_steps_per_second": 17.437, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 5.61, |
|
"learning_rate": 0.00016553983388754428, |
|
"loss": 0.276, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"learning_rate": 0.00016506153932307636, |
|
"loss": 0.2776, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"eval_loss": 0.2604348659515381, |
|
"eval_runtime": 1.9875, |
|
"eval_samples_per_second": 1155.746, |
|
"eval_steps_per_second": 18.114, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 5.63, |
|
"learning_rate": 0.00016458313472722638, |
|
"loss": 0.2761, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"learning_rate": 0.00016410462533175045, |
|
"loss": 0.2759, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"eval_loss": 0.25958681106567383, |
|
"eval_runtime": 2.0685, |
|
"eval_samples_per_second": 1110.487, |
|
"eval_steps_per_second": 17.404, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 5.65, |
|
"learning_rate": 0.00016362601636955049, |
|
"loss": 0.2757, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"learning_rate": 0.00016314731307461754, |
|
"loss": 0.2762, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"eval_loss": 0.2618536949157715, |
|
"eval_runtime": 2.0252, |
|
"eval_samples_per_second": 1134.221, |
|
"eval_steps_per_second": 17.776, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 5.68, |
|
"learning_rate": 0.0001626685206819742, |
|
"loss": 0.2775, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 5.69, |
|
"learning_rate": 0.0001621896444276172, |
|
"loss": 0.2762, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 5.69, |
|
"eval_loss": 0.26094868779182434, |
|
"eval_runtime": 2.0113, |
|
"eval_samples_per_second": 1142.074, |
|
"eval_steps_per_second": 17.899, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 0.00016171068954846067, |
|
"loss": 0.276, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 5.71, |
|
"learning_rate": 0.00016123166128227835, |
|
"loss": 0.2754, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 5.71, |
|
"eval_loss": 0.26099157333374023, |
|
"eval_runtime": 1.9698, |
|
"eval_samples_per_second": 1166.087, |
|
"eval_steps_per_second": 18.276, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 5.72, |
|
"learning_rate": 0.0001607525648676467, |
|
"loss": 0.2756, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"learning_rate": 0.0001602734055438873, |
|
"loss": 0.2757, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"eval_loss": 0.2581404149532318, |
|
"eval_runtime": 1.9737, |
|
"eval_samples_per_second": 1163.806, |
|
"eval_steps_per_second": 18.24, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 5.74, |
|
"learning_rate": 0.00015979418855100963, |
|
"loss": 0.2757, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"learning_rate": 0.00015931491912965417, |
|
"loss": 0.2751, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"eval_loss": 0.2564995288848877, |
|
"eval_runtime": 1.9044, |
|
"eval_samples_per_second": 1206.164, |
|
"eval_steps_per_second": 18.904, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 5.77, |
|
"learning_rate": 0.0001588356025210344, |
|
"loss": 0.2752, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 5.78, |
|
"learning_rate": 0.00015835624396688, |
|
"loss": 0.275, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 5.78, |
|
"eval_loss": 0.2554072439670563, |
|
"eval_runtime": 1.984, |
|
"eval_samples_per_second": 1157.747, |
|
"eval_steps_per_second": 18.145, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"learning_rate": 0.00015787684870937924, |
|
"loss": 0.2747, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"learning_rate": 0.00015739742199112196, |
|
"loss": 0.2748, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"eval_loss": 0.2579441964626312, |
|
"eval_runtime": 1.8886, |
|
"eval_samples_per_second": 1216.255, |
|
"eval_steps_per_second": 19.062, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"learning_rate": 0.00015691796905504187, |
|
"loss": 0.2748, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"learning_rate": 0.00015643849514435944, |
|
"loss": 0.2747, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"eval_loss": 0.2571495771408081, |
|
"eval_runtime": 1.9462, |
|
"eval_samples_per_second": 1180.236, |
|
"eval_steps_per_second": 18.497, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 5.83, |
|
"learning_rate": 0.00015595900550252463, |
|
"loss": 0.2746, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"learning_rate": 0.00015547950537315926, |
|
"loss": 0.2747, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"eval_loss": 0.25983506441116333, |
|
"eval_runtime": 1.9787, |
|
"eval_samples_per_second": 1160.871, |
|
"eval_steps_per_second": 18.194, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"learning_rate": 0.00015499999999999997, |
|
"loss": 0.2744, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 5.87, |
|
"learning_rate": 0.00015452049462684068, |
|
"loss": 0.2744, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 5.87, |
|
"eval_loss": 0.2598707377910614, |
|
"eval_runtime": 2.0257, |
|
"eval_samples_per_second": 1133.91, |
|
"eval_steps_per_second": 17.771, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"learning_rate": 0.00015404099449747535, |
|
"loss": 0.2742, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 5.89, |
|
"learning_rate": 0.0001535615048556405, |
|
"loss": 0.2746, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 5.89, |
|
"eval_loss": 0.2571488916873932, |
|
"eval_runtime": 1.9862, |
|
"eval_samples_per_second": 1156.504, |
|
"eval_steps_per_second": 18.125, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 5.9, |
|
"learning_rate": 0.0001530820309449581, |
|
"loss": 0.274, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"learning_rate": 0.00015260257800887798, |
|
"loss": 0.2743, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"eval_loss": 0.2586234211921692, |
|
"eval_runtime": 1.9966, |
|
"eval_samples_per_second": 1150.434, |
|
"eval_steps_per_second": 18.03, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"learning_rate": 0.0001521231512906207, |
|
"loss": 0.274, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 5.93, |
|
"learning_rate": 0.00015164375603311998, |
|
"loss": 0.274, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 5.93, |
|
"eval_loss": 0.2586924433708191, |
|
"eval_runtime": 1.9388, |
|
"eval_samples_per_second": 1184.767, |
|
"eval_steps_per_second": 18.568, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 5.94, |
|
"learning_rate": 0.00015116439747896553, |
|
"loss": 0.2739, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"learning_rate": 0.00015068508087034578, |
|
"loss": 0.2741, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"eval_loss": 0.25905266404151917, |
|
"eval_runtime": 1.9258, |
|
"eval_samples_per_second": 1192.738, |
|
"eval_steps_per_second": 18.693, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"learning_rate": 0.00015020581144899027, |
|
"loss": 0.2737, |
|
"step": 267500 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"learning_rate": 0.0001497265944561127, |
|
"loss": 0.2735, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"eval_loss": 0.2568492293357849, |
|
"eval_runtime": 2.0451, |
|
"eval_samples_per_second": 1123.172, |
|
"eval_steps_per_second": 17.603, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 5.99, |
|
"learning_rate": 0.00014924743513235327, |
|
"loss": 0.2735, |
|
"step": 268500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"learning_rate": 0.0001487683387177216, |
|
"loss": 0.2736, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.25593629479408264, |
|
"eval_runtime": 2.0056, |
|
"eval_samples_per_second": 1145.295, |
|
"eval_steps_per_second": 17.95, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"learning_rate": 0.00014828931045153928, |
|
"loss": 0.2736, |
|
"step": 269500 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"learning_rate": 0.00014781035557238272, |
|
"loss": 0.2735, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"eval_loss": 0.2567874789237976, |
|
"eval_runtime": 1.9639, |
|
"eval_samples_per_second": 1169.619, |
|
"eval_steps_per_second": 18.331, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 6.03, |
|
"learning_rate": 0.00014733147931802578, |
|
"loss": 0.2733, |
|
"step": 270500 |
|
}, |
|
{ |
|
"epoch": 6.04, |
|
"learning_rate": 0.00014685268692538238, |
|
"loss": 0.273, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 6.04, |
|
"eval_loss": 0.2563098967075348, |
|
"eval_runtime": 1.9603, |
|
"eval_samples_per_second": 1171.784, |
|
"eval_steps_per_second": 18.365, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 6.06, |
|
"learning_rate": 0.00014637398363044946, |
|
"loss": 0.2732, |
|
"step": 271500 |
|
}, |
|
{ |
|
"epoch": 6.07, |
|
"learning_rate": 0.00014589537466824955, |
|
"loss": 0.2732, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 6.07, |
|
"eval_loss": 0.2550007402896881, |
|
"eval_runtime": 1.9875, |
|
"eval_samples_per_second": 1155.744, |
|
"eval_steps_per_second": 18.114, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"learning_rate": 0.00014541686527277356, |
|
"loss": 0.2729, |
|
"step": 272500 |
|
}, |
|
{ |
|
"epoch": 6.09, |
|
"learning_rate": 0.00014493846067692358, |
|
"loss": 0.2759, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 6.09, |
|
"eval_loss": 0.2564890682697296, |
|
"eval_runtime": 1.9877, |
|
"eval_samples_per_second": 1155.615, |
|
"eval_steps_per_second": 18.112, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"learning_rate": 0.00014446016611245567, |
|
"loss": 0.279, |
|
"step": 273500 |
|
}, |
|
{ |
|
"epoch": 6.11, |
|
"learning_rate": 0.00014398198680992252, |
|
"loss": 0.2778, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 6.11, |
|
"eval_loss": 0.25714361667633057, |
|
"eval_runtime": 2.0307, |
|
"eval_samples_per_second": 1131.141, |
|
"eval_steps_per_second": 17.728, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"learning_rate": 0.00014350392799861636, |
|
"loss": 0.2758, |
|
"step": 274500 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"learning_rate": 0.0001430259949065118, |
|
"loss": 0.275, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"eval_loss": 0.258448988199234, |
|
"eval_runtime": 1.9657, |
|
"eval_samples_per_second": 1168.538, |
|
"eval_steps_per_second": 18.314, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"learning_rate": 0.0001425481927602085, |
|
"loss": 0.2749, |
|
"step": 275500 |
|
}, |
|
{ |
|
"epoch": 6.16, |
|
"learning_rate": 0.0001420705267848743, |
|
"loss": 0.2742, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 6.16, |
|
"eval_loss": 0.25592052936553955, |
|
"eval_runtime": 1.9618, |
|
"eval_samples_per_second": 1170.879, |
|
"eval_steps_per_second": 18.351, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 6.17, |
|
"learning_rate": 0.00014159300220418757, |
|
"loss": 0.2742, |
|
"step": 276500 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"learning_rate": 0.0001411156242402808, |
|
"loss": 0.2741, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"eval_loss": 0.2545378506183624, |
|
"eval_runtime": 1.9657, |
|
"eval_samples_per_second": 1168.513, |
|
"eval_steps_per_second": 18.314, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"learning_rate": 0.0001406383981136829, |
|
"loss": 0.2726, |
|
"step": 277500 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"learning_rate": 0.00014016132904326226, |
|
"loss": 0.2723, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"eval_loss": 0.25342997908592224, |
|
"eval_runtime": 1.92, |
|
"eval_samples_per_second": 1196.358, |
|
"eval_steps_per_second": 18.75, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 6.21, |
|
"learning_rate": 0.00013968442224616989, |
|
"loss": 0.2721, |
|
"step": 278500 |
|
}, |
|
{ |
|
"epoch": 6.22, |
|
"learning_rate": 0.00013920768293778195, |
|
"loss": 0.2722, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 6.22, |
|
"eval_loss": 0.25338229537010193, |
|
"eval_runtime": 1.9729, |
|
"eval_samples_per_second": 1164.267, |
|
"eval_steps_per_second": 18.247, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 6.23, |
|
"learning_rate": 0.00013873111633164336, |
|
"loss": 0.2722, |
|
"step": 279500 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"learning_rate": 0.00013825472763941, |
|
"loss": 0.2722, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"eval_loss": 0.25419288873672485, |
|
"eval_runtime": 1.9659, |
|
"eval_samples_per_second": 1168.404, |
|
"eval_steps_per_second": 18.312, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 6.26, |
|
"learning_rate": 0.00013777852207079235, |
|
"loss": 0.2716, |
|
"step": 280500 |
|
}, |
|
{ |
|
"epoch": 6.27, |
|
"learning_rate": 0.00013730250483349825, |
|
"loss": 0.2718, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 6.27, |
|
"eval_loss": 0.25364091992378235, |
|
"eval_runtime": 2.0184, |
|
"eval_samples_per_second": 1138.006, |
|
"eval_steps_per_second": 17.836, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"learning_rate": 0.00013682668113317584, |
|
"loss": 0.2718, |
|
"step": 281500 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"learning_rate": 0.00013635105617335703, |
|
"loss": 0.2717, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"eval_loss": 0.2546054422855377, |
|
"eval_runtime": 1.9353, |
|
"eval_samples_per_second": 1186.875, |
|
"eval_steps_per_second": 18.601, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"learning_rate": 0.00013587563515539996, |
|
"loss": 0.2718, |
|
"step": 282500 |
|
}, |
|
{ |
|
"epoch": 6.31, |
|
"learning_rate": 0.00013540042327843296, |
|
"loss": 0.2716, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 6.31, |
|
"eval_loss": 0.25594207644462585, |
|
"eval_runtime": 1.9903, |
|
"eval_samples_per_second": 1154.11, |
|
"eval_steps_per_second": 18.088, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 6.32, |
|
"learning_rate": 0.00013492542573929678, |
|
"loss": 0.2718, |
|
"step": 283500 |
|
}, |
|
{ |
|
"epoch": 6.33, |
|
"learning_rate": 0.00013445064773248846, |
|
"loss": 0.2718, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 6.33, |
|
"eval_loss": 0.25390535593032837, |
|
"eval_runtime": 1.9996, |
|
"eval_samples_per_second": 1148.745, |
|
"eval_steps_per_second": 18.004, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"learning_rate": 0.00013397609445010432, |
|
"loss": 0.2711, |
|
"step": 284500 |
|
}, |
|
{ |
|
"epoch": 6.36, |
|
"learning_rate": 0.00013350177108178288, |
|
"loss": 0.2715, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 6.36, |
|
"eval_loss": 0.2538110613822937, |
|
"eval_runtime": 2.0098, |
|
"eval_samples_per_second": 1142.879, |
|
"eval_steps_per_second": 17.912, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"learning_rate": 0.00013302768281464863, |
|
"loss": 0.2713, |
|
"step": 285500 |
|
}, |
|
{ |
|
"epoch": 6.38, |
|
"learning_rate": 0.0001325538348332548, |
|
"loss": 0.2713, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 6.38, |
|
"eval_loss": 0.2533688545227051, |
|
"eval_runtime": 1.9973, |
|
"eval_samples_per_second": 1150.057, |
|
"eval_steps_per_second": 18.024, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 6.39, |
|
"learning_rate": 0.00013208023231952706, |
|
"loss": 0.2711, |
|
"step": 286500 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"learning_rate": 0.0001316068804527066, |
|
"loss": 0.271, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"eval_loss": 0.25663408637046814, |
|
"eval_runtime": 2.0033, |
|
"eval_samples_per_second": 1146.595, |
|
"eval_steps_per_second": 17.97, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 6.41, |
|
"learning_rate": 0.00013113378440929353, |
|
"loss": 0.2709, |
|
"step": 287500 |
|
}, |
|
{ |
|
"epoch": 6.42, |
|
"learning_rate": 0.00013066094936299056, |
|
"loss": 0.2707, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 6.42, |
|
"eval_loss": 0.2554568946361542, |
|
"eval_runtime": 1.9451, |
|
"eval_samples_per_second": 1180.915, |
|
"eval_steps_per_second": 18.508, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 6.44, |
|
"learning_rate": 0.00013018838048464582, |
|
"loss": 0.271, |
|
"step": 288500 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"learning_rate": 0.00012971608294219702, |
|
"loss": 0.2709, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"eval_loss": 0.2539980113506317, |
|
"eval_runtime": 2.0049, |
|
"eval_samples_per_second": 1145.67, |
|
"eval_steps_per_second": 17.956, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"learning_rate": 0.00012924406190061423, |
|
"loss": 0.2707, |
|
"step": 289500 |
|
}, |
|
{ |
|
"epoch": 6.47, |
|
"learning_rate": 0.0001287723225218441, |
|
"loss": 0.2706, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 6.47, |
|
"eval_loss": 0.2525833547115326, |
|
"eval_runtime": 1.9728, |
|
"eval_samples_per_second": 1164.356, |
|
"eval_steps_per_second": 18.249, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 6.48, |
|
"learning_rate": 0.00012830086996475274, |
|
"loss": 0.2707, |
|
"step": 290500 |
|
}, |
|
{ |
|
"epoch": 6.49, |
|
"learning_rate": 0.00012782970938506988, |
|
"loss": 0.2707, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 6.49, |
|
"eval_loss": 0.2514384388923645, |
|
"eval_runtime": 2.0121, |
|
"eval_samples_per_second": 1141.574, |
|
"eval_steps_per_second": 17.891, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"learning_rate": 0.00012735884593533222, |
|
"loss": 0.2706, |
|
"step": 291500 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"learning_rate": 0.0001268882847648268, |
|
"loss": 0.2702, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"eval_loss": 0.2527428865432739, |
|
"eval_runtime": 2.1423, |
|
"eval_samples_per_second": 1072.197, |
|
"eval_steps_per_second": 16.804, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 6.52, |
|
"learning_rate": 0.00012641803101953535, |
|
"loss": 0.271, |
|
"step": 292500 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"learning_rate": 0.00012594808984207743, |
|
"loss": 0.2702, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"eval_loss": 0.2542712688446045, |
|
"eval_runtime": 2.0253, |
|
"eval_samples_per_second": 1134.154, |
|
"eval_steps_per_second": 17.775, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"learning_rate": 0.00012547846637165445, |
|
"loss": 0.2698, |
|
"step": 293500 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"learning_rate": 0.00012500916574399346, |
|
"loss": 0.27, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"eval_loss": 0.25084540247917175, |
|
"eval_runtime": 1.9928, |
|
"eval_samples_per_second": 1152.639, |
|
"eval_steps_per_second": 18.065, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 6.57, |
|
"learning_rate": 0.00012454019309129095, |
|
"loss": 0.2698, |
|
"step": 294500 |
|
}, |
|
{ |
|
"epoch": 6.58, |
|
"learning_rate": 0.0001240715535421567, |
|
"loss": 0.27, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 6.58, |
|
"eval_loss": 0.252863347530365, |
|
"eval_runtime": 1.9718, |
|
"eval_samples_per_second": 1164.945, |
|
"eval_steps_per_second": 18.258, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 6.59, |
|
"learning_rate": 0.00012360325222155773, |
|
"loss": 0.2701, |
|
"step": 295500 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"learning_rate": 0.00012313529425076228, |
|
"loss": 0.2697, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"eval_loss": 0.25461703538894653, |
|
"eval_runtime": 1.9521, |
|
"eval_samples_per_second": 1176.688, |
|
"eval_steps_per_second": 18.442, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"learning_rate": 0.00012266768474728372, |
|
"loss": 0.2698, |
|
"step": 296500 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"learning_rate": 0.00012220042882482468, |
|
"loss": 0.2703, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"eval_loss": 0.2524978220462799, |
|
"eval_runtime": 2.026, |
|
"eval_samples_per_second": 1133.755, |
|
"eval_steps_per_second": 17.769, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"learning_rate": 0.00012173353159322102, |
|
"loss": 0.2695, |
|
"step": 297500 |
|
}, |
|
{ |
|
"epoch": 6.65, |
|
"learning_rate": 0.00012126699815838609, |
|
"loss": 0.2697, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 6.65, |
|
"eval_loss": 0.2543928623199463, |
|
"eval_runtime": 1.9625, |
|
"eval_samples_per_second": 1170.438, |
|
"eval_steps_per_second": 18.344, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 6.66, |
|
"learning_rate": 0.00012080083362225484, |
|
"loss": 0.2696, |
|
"step": 298500 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"learning_rate": 0.00012033504308272786, |
|
"loss": 0.2698, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"eval_loss": 0.24954737722873688, |
|
"eval_runtime": 1.9913, |
|
"eval_samples_per_second": 1153.505, |
|
"eval_steps_per_second": 18.078, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 6.68, |
|
"learning_rate": 0.00011986963163361598, |
|
"loss": 0.2715, |
|
"step": 299500 |
|
}, |
|
{ |
|
"epoch": 6.69, |
|
"learning_rate": 0.0001194046043645841, |
|
"loss": 0.274, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 6.69, |
|
"eval_loss": 0.2508510649204254, |
|
"eval_runtime": 1.967, |
|
"eval_samples_per_second": 1167.773, |
|
"eval_steps_per_second": 18.302, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 6.7, |
|
"learning_rate": 0.00011893996636109606, |
|
"loss": 0.2728, |
|
"step": 300500 |
|
}, |
|
{ |
|
"epoch": 6.71, |
|
"learning_rate": 0.00011847572270435852, |
|
"loss": 0.2726, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 6.71, |
|
"eval_loss": 0.2552023231983185, |
|
"eval_runtime": 5.3426, |
|
"eval_samples_per_second": 429.941, |
|
"eval_steps_per_second": 6.738, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"learning_rate": 0.00011801187847126579, |
|
"loss": 0.2696, |
|
"step": 301500 |
|
}, |
|
{ |
|
"epoch": 6.74, |
|
"learning_rate": 0.00011754843873434411, |
|
"loss": 0.2691, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 6.74, |
|
"eval_loss": 0.25297147035598755, |
|
"eval_runtime": 1.9927, |
|
"eval_samples_per_second": 1152.684, |
|
"eval_steps_per_second": 18.066, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"learning_rate": 0.00011708540856169612, |
|
"loss": 0.2702, |
|
"step": 302500 |
|
}, |
|
{ |
|
"epoch": 6.76, |
|
"learning_rate": 0.00011662279301694567, |
|
"loss": 0.2698, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 6.76, |
|
"eval_loss": 0.253802090883255, |
|
"eval_runtime": 1.9502, |
|
"eval_samples_per_second": 1177.813, |
|
"eval_steps_per_second": 18.459, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 6.77, |
|
"learning_rate": 0.0001161605971591822, |
|
"loss": 0.2697, |
|
"step": 303500 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"learning_rate": 0.00011569882604290559, |
|
"loss": 0.2706, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"eval_loss": 0.252233624458313, |
|
"eval_runtime": 2.0068, |
|
"eval_samples_per_second": 1144.633, |
|
"eval_steps_per_second": 17.939, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 6.79, |
|
"learning_rate": 0.00011523748471797075, |
|
"loss": 0.2704, |
|
"step": 304500 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"learning_rate": 0.00011477657822953255, |
|
"loss": 0.2705, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"eval_loss": 0.25286948680877686, |
|
"eval_runtime": 2.0237, |
|
"eval_samples_per_second": 1135.028, |
|
"eval_steps_per_second": 17.789, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 6.81, |
|
"learning_rate": 0.00011431611161799043, |
|
"loss": 0.2732, |
|
"step": 305500 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"learning_rate": 0.0001138560899189335, |
|
"loss": 0.2707, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"eval_loss": 0.2536955773830414, |
|
"eval_runtime": 2.005, |
|
"eval_samples_per_second": 1145.645, |
|
"eval_steps_per_second": 17.955, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 6.84, |
|
"learning_rate": 0.00011339651816308543, |
|
"loss": 0.271, |
|
"step": 306500 |
|
}, |
|
{ |
|
"epoch": 6.85, |
|
"learning_rate": 0.00011293740137624925, |
|
"loss": 0.2713, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 6.85, |
|
"eval_loss": 0.25167515873908997, |
|
"eval_runtime": 1.9694, |
|
"eval_samples_per_second": 1166.352, |
|
"eval_steps_per_second": 18.28, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 6.86, |
|
"learning_rate": 0.00011247874457925261, |
|
"loss": 0.2716, |
|
"step": 307500 |
|
}, |
|
{ |
|
"epoch": 6.87, |
|
"learning_rate": 0.0001120205527878927, |
|
"loss": 0.2696, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 6.87, |
|
"eval_loss": 0.25594934821128845, |
|
"eval_runtime": 1.9543, |
|
"eval_samples_per_second": 1175.377, |
|
"eval_steps_per_second": 18.421, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"learning_rate": 0.00011156283101288165, |
|
"loss": 0.2701, |
|
"step": 308500 |
|
}, |
|
{ |
|
"epoch": 6.89, |
|
"learning_rate": 0.00011110558425979132, |
|
"loss": 0.2702, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 6.89, |
|
"eval_loss": 0.2543644607067108, |
|
"eval_runtime": 1.9412, |
|
"eval_samples_per_second": 1183.27, |
|
"eval_steps_per_second": 18.545, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 6.9, |
|
"learning_rate": 0.00011064881752899906, |
|
"loss": 0.2698, |
|
"step": 309500 |
|
}, |
|
{ |
|
"epoch": 6.91, |
|
"learning_rate": 0.00011019253581563262, |
|
"loss": 0.2695, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 6.91, |
|
"eval_loss": 0.2494657188653946, |
|
"eval_runtime": 2.0069, |
|
"eval_samples_per_second": 1144.556, |
|
"eval_steps_per_second": 17.938, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 6.93, |
|
"learning_rate": 0.00010973674410951567, |
|
"loss": 0.2688, |
|
"step": 310500 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"learning_rate": 0.00010928144739511337, |
|
"loss": 0.2688, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"eval_loss": 0.2559347450733185, |
|
"eval_runtime": 1.9953, |
|
"eval_samples_per_second": 1151.191, |
|
"eval_steps_per_second": 18.042, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 6.95, |
|
"learning_rate": 0.00010882665065147757, |
|
"loss": 0.269, |
|
"step": 311500 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"learning_rate": 0.00010837235885219267, |
|
"loss": 0.2681, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"eval_loss": 0.2514026165008545, |
|
"eval_runtime": 2.0268, |
|
"eval_samples_per_second": 1133.324, |
|
"eval_steps_per_second": 17.762, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 6.97, |
|
"learning_rate": 0.00010791857696532089, |
|
"loss": 0.2679, |
|
"step": 312500 |
|
}, |
|
{ |
|
"epoch": 6.98, |
|
"learning_rate": 0.00010746530995334832, |
|
"loss": 0.2678, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 6.98, |
|
"eval_loss": 0.25468307733535767, |
|
"eval_runtime": 1.9525, |
|
"eval_samples_per_second": 1176.424, |
|
"eval_steps_per_second": 18.438, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"learning_rate": 0.0001070125627731304, |
|
"loss": 0.268, |
|
"step": 313500 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"learning_rate": 0.0001065603403758377, |
|
"loss": 0.2682, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.2510579824447632, |
|
"eval_runtime": 2.0207, |
|
"eval_samples_per_second": 1136.748, |
|
"eval_steps_per_second": 17.816, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"learning_rate": 0.00010610864770690196, |
|
"loss": 0.2676, |
|
"step": 314500 |
|
}, |
|
{ |
|
"epoch": 7.03, |
|
"learning_rate": 0.00010565748970596172, |
|
"loss": 0.2675, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 7.03, |
|
"eval_loss": 0.25261634588241577, |
|
"eval_runtime": 2.04, |
|
"eval_samples_per_second": 1125.977, |
|
"eval_steps_per_second": 17.647, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"learning_rate": 0.00010520687130680884, |
|
"loss": 0.2676, |
|
"step": 315500 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"learning_rate": 0.00010475679743733364, |
|
"loss": 0.2672, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"eval_loss": 0.25101763010025024, |
|
"eval_runtime": 1.9764, |
|
"eval_samples_per_second": 1162.216, |
|
"eval_steps_per_second": 18.215, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 7.06, |
|
"learning_rate": 0.00010430727301947202, |
|
"loss": 0.2673, |
|
"step": 316500 |
|
}, |
|
{ |
|
"epoch": 7.07, |
|
"learning_rate": 0.00010385830296915104, |
|
"loss": 0.2674, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 7.07, |
|
"eval_loss": 0.2475864142179489, |
|
"eval_runtime": 2.013, |
|
"eval_samples_per_second": 1141.106, |
|
"eval_steps_per_second": 17.884, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 7.08, |
|
"learning_rate": 0.00010340989219623508, |
|
"loss": 0.2674, |
|
"step": 317500 |
|
}, |
|
{ |
|
"epoch": 7.09, |
|
"learning_rate": 0.0001029620456044727, |
|
"loss": 0.2672, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 7.09, |
|
"eval_loss": 0.2495851069688797, |
|
"eval_runtime": 2.0072, |
|
"eval_samples_per_second": 1144.404, |
|
"eval_steps_per_second": 17.936, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"learning_rate": 0.00010251476809144226, |
|
"loss": 0.267, |
|
"step": 318500 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"learning_rate": 0.00010206806454849917, |
|
"loss": 0.2668, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"eval_loss": 0.24910998344421387, |
|
"eval_runtime": 2.0019, |
|
"eval_samples_per_second": 1147.411, |
|
"eval_steps_per_second": 17.983, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 7.13, |
|
"learning_rate": 0.00010162193986072167, |
|
"loss": 0.267, |
|
"step": 319500 |
|
}, |
|
{ |
|
"epoch": 7.14, |
|
"learning_rate": 0.00010117639890685795, |
|
"loss": 0.2671, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 7.14, |
|
"eval_loss": 0.2508242130279541, |
|
"eval_runtime": 1.9741, |
|
"eval_samples_per_second": 1163.57, |
|
"eval_steps_per_second": 18.236, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"learning_rate": 0.00010073144655927253, |
|
"loss": 0.2668, |
|
"step": 320500 |
|
}, |
|
{ |
|
"epoch": 7.16, |
|
"learning_rate": 0.0001002870876838929, |
|
"loss": 0.2666, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 7.16, |
|
"eval_loss": 0.24993054568767548, |
|
"eval_runtime": 2.0461, |
|
"eval_samples_per_second": 1122.605, |
|
"eval_steps_per_second": 17.594, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 7.17, |
|
"learning_rate": 9.984332714015662e-05, |
|
"loss": 0.2668, |
|
"step": 321500 |
|
}, |
|
{ |
|
"epoch": 7.18, |
|
"learning_rate": 9.94001697809578e-05, |
|
"loss": 0.2666, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 7.18, |
|
"eval_loss": 0.2504477798938751, |
|
"eval_runtime": 1.9968, |
|
"eval_samples_per_second": 1150.317, |
|
"eval_steps_per_second": 18.028, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"learning_rate": 9.895762045259445e-05, |
|
"loss": 0.2667, |
|
"step": 322500 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"learning_rate": 9.851568399471498e-05, |
|
"loss": 0.2667, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"eval_loss": 0.2511838674545288, |
|
"eval_runtime": 1.9771, |
|
"eval_samples_per_second": 1161.828, |
|
"eval_steps_per_second": 18.209, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 7.22, |
|
"learning_rate": 9.807436524026574e-05, |
|
"loss": 0.2665, |
|
"step": 323500 |
|
}, |
|
{ |
|
"epoch": 7.23, |
|
"learning_rate": 9.763366901543801e-05, |
|
"loss": 0.2665, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 7.23, |
|
"eval_loss": 0.2511294484138489, |
|
"eval_runtime": 2.0259, |
|
"eval_samples_per_second": 1133.828, |
|
"eval_steps_per_second": 17.77, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 7.24, |
|
"learning_rate": 9.719360013961495e-05, |
|
"loss": 0.2658, |
|
"step": 324500 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"learning_rate": 9.675416342531944e-05, |
|
"loss": 0.2661, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"eval_loss": 0.25169792771339417, |
|
"eval_runtime": 1.9684, |
|
"eval_samples_per_second": 1166.965, |
|
"eval_steps_per_second": 18.289, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"learning_rate": 9.631536367816086e-05, |
|
"loss": 0.2661, |
|
"step": 325500 |
|
}, |
|
{ |
|
"epoch": 7.27, |
|
"learning_rate": 9.587720569678299e-05, |
|
"loss": 0.2661, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 7.27, |
|
"eval_loss": 0.2492215633392334, |
|
"eval_runtime": 1.9633, |
|
"eval_samples_per_second": 1169.965, |
|
"eval_steps_per_second": 18.336, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"learning_rate": 9.543969427281131e-05, |
|
"loss": 0.2662, |
|
"step": 326500 |
|
}, |
|
{ |
|
"epoch": 7.29, |
|
"learning_rate": 9.500283419080062e-05, |
|
"loss": 0.2665, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 7.29, |
|
"eval_loss": 0.252391517162323, |
|
"eval_runtime": 2.0026, |
|
"eval_samples_per_second": 1147.012, |
|
"eval_steps_per_second": 17.977, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"learning_rate": 9.45666302281829e-05, |
|
"loss": 0.2657, |
|
"step": 327500 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"learning_rate": 9.413108715521467e-05, |
|
"loss": 0.266, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"eval_loss": 0.24788686633110046, |
|
"eval_runtime": 1.9452, |
|
"eval_samples_per_second": 1180.833, |
|
"eval_steps_per_second": 18.507, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 7.33, |
|
"learning_rate": 9.369620973492525e-05, |
|
"loss": 0.2658, |
|
"step": 328500 |
|
}, |
|
{ |
|
"epoch": 7.34, |
|
"learning_rate": 9.326200272306445e-05, |
|
"loss": 0.2654, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 7.34, |
|
"eval_loss": 0.2492542862892151, |
|
"eval_runtime": 2.0015, |
|
"eval_samples_per_second": 1147.652, |
|
"eval_steps_per_second": 17.987, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 7.35, |
|
"learning_rate": 9.282847086805059e-05, |
|
"loss": 0.2657, |
|
"step": 329500 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"learning_rate": 9.239561891091853e-05, |
|
"loss": 0.2656, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"eval_loss": 0.2490701973438263, |
|
"eval_runtime": 2.0564, |
|
"eval_samples_per_second": 1117.004, |
|
"eval_steps_per_second": 17.506, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"learning_rate": 9.196345158526793e-05, |
|
"loss": 0.2655, |
|
"step": 330500 |
|
}, |
|
{ |
|
"epoch": 7.38, |
|
"learning_rate": 9.153197361721149e-05, |
|
"loss": 0.2653, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 7.38, |
|
"eval_loss": 0.24651852250099182, |
|
"eval_runtime": 1.9849, |
|
"eval_samples_per_second": 1157.249, |
|
"eval_steps_per_second": 18.137, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 7.39, |
|
"learning_rate": 9.110118972532302e-05, |
|
"loss": 0.2652, |
|
"step": 331500 |
|
}, |
|
{ |
|
"epoch": 7.41, |
|
"learning_rate": 9.067110462058634e-05, |
|
"loss": 0.2656, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 7.41, |
|
"eval_loss": 0.24983063340187073, |
|
"eval_runtime": 1.9337, |
|
"eval_samples_per_second": 1187.894, |
|
"eval_steps_per_second": 18.617, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"learning_rate": 9.024172300634305e-05, |
|
"loss": 0.2655, |
|
"step": 332500 |
|
}, |
|
{ |
|
"epoch": 7.43, |
|
"learning_rate": 8.981304957824182e-05, |
|
"loss": 0.2655, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 7.43, |
|
"eval_loss": 0.2502962350845337, |
|
"eval_runtime": 2.0089, |
|
"eval_samples_per_second": 1143.431, |
|
"eval_steps_per_second": 17.921, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 7.44, |
|
"learning_rate": 8.938508902418643e-05, |
|
"loss": 0.2651, |
|
"step": 333500 |
|
}, |
|
{ |
|
"epoch": 7.45, |
|
"learning_rate": 8.89578460242851e-05, |
|
"loss": 0.265, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 7.45, |
|
"eval_loss": 0.2484452873468399, |
|
"eval_runtime": 1.9509, |
|
"eval_samples_per_second": 1177.421, |
|
"eval_steps_per_second": 18.453, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"learning_rate": 8.85313252507988e-05, |
|
"loss": 0.265, |
|
"step": 334500 |
|
}, |
|
{ |
|
"epoch": 7.47, |
|
"learning_rate": 8.810553136809027e-05, |
|
"loss": 0.2651, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 7.47, |
|
"eval_loss": 0.24745772778987885, |
|
"eval_runtime": 2.0372, |
|
"eval_samples_per_second": 1127.544, |
|
"eval_steps_per_second": 17.672, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"learning_rate": 8.76804690325733e-05, |
|
"loss": 0.2649, |
|
"step": 335500 |
|
}, |
|
{ |
|
"epoch": 7.49, |
|
"learning_rate": 8.725614289266137e-05, |
|
"loss": 0.2647, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 7.49, |
|
"eval_loss": 0.24857033789157867, |
|
"eval_runtime": 2.0353, |
|
"eval_samples_per_second": 1128.559, |
|
"eval_steps_per_second": 17.687, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 7.51, |
|
"learning_rate": 8.683255758871734e-05, |
|
"loss": 0.2649, |
|
"step": 336500 |
|
}, |
|
{ |
|
"epoch": 7.52, |
|
"learning_rate": 8.640971775300207e-05, |
|
"loss": 0.2654, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 7.52, |
|
"eval_loss": 0.24552865326404572, |
|
"eval_runtime": 2.0136, |
|
"eval_samples_per_second": 1140.751, |
|
"eval_steps_per_second": 17.879, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"learning_rate": 8.598762800962431e-05, |
|
"loss": 0.2767, |
|
"step": 337500 |
|
}, |
|
{ |
|
"epoch": 7.54, |
|
"learning_rate": 8.55662929744899e-05, |
|
"loss": 0.2655, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 7.54, |
|
"eval_loss": 0.24788717925548553, |
|
"eval_runtime": 2.0497, |
|
"eval_samples_per_second": 1120.627, |
|
"eval_steps_per_second": 17.563, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 7.55, |
|
"learning_rate": 8.514571725525124e-05, |
|
"loss": 0.2666, |
|
"step": 338500 |
|
}, |
|
{ |
|
"epoch": 7.56, |
|
"learning_rate": 8.47259054512571e-05, |
|
"loss": 0.2678, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 7.56, |
|
"eval_loss": 0.24549083411693573, |
|
"eval_runtime": 2.0079, |
|
"eval_samples_per_second": 1143.957, |
|
"eval_steps_per_second": 17.929, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 7.57, |
|
"learning_rate": 8.430686215350198e-05, |
|
"loss": 0.265, |
|
"step": 339500 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"learning_rate": 8.388859194457636e-05, |
|
"loss": 0.265, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"eval_loss": 0.24893943965435028, |
|
"eval_runtime": 2.0535, |
|
"eval_samples_per_second": 1118.576, |
|
"eval_steps_per_second": 17.531, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"learning_rate": 8.347109939861605e-05, |
|
"loss": 0.2675, |
|
"step": 340500 |
|
}, |
|
{ |
|
"epoch": 7.61, |
|
"learning_rate": 8.305438908125285e-05, |
|
"loss": 0.2649, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 7.61, |
|
"eval_loss": 0.2475811243057251, |
|
"eval_runtime": 2.0091, |
|
"eval_samples_per_second": 1143.275, |
|
"eval_steps_per_second": 17.918, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 7.62, |
|
"learning_rate": 8.263846554956402e-05, |
|
"loss": 0.2659, |
|
"step": 341500 |
|
}, |
|
{ |
|
"epoch": 7.63, |
|
"learning_rate": 8.222333335202254e-05, |
|
"loss": 0.2649, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 7.63, |
|
"eval_loss": 0.2466391623020172, |
|
"eval_runtime": 2.0704, |
|
"eval_samples_per_second": 1109.463, |
|
"eval_steps_per_second": 17.388, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"learning_rate": 8.18089970284477e-05, |
|
"loss": 0.2646, |
|
"step": 342500 |
|
}, |
|
{ |
|
"epoch": 7.65, |
|
"learning_rate": 8.1395461109955e-05, |
|
"loss": 0.2645, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 7.65, |
|
"eval_loss": 0.24718651175498962, |
|
"eval_runtime": 2.0893, |
|
"eval_samples_per_second": 1099.419, |
|
"eval_steps_per_second": 17.231, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 7.66, |
|
"learning_rate": 8.098273011890726e-05, |
|
"loss": 0.2654, |
|
"step": 343500 |
|
}, |
|
{ |
|
"epoch": 7.67, |
|
"learning_rate": 8.057080856886426e-05, |
|
"loss": 0.2676, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 7.67, |
|
"eval_loss": 0.2473945915699005, |
|
"eval_runtime": 1.9984, |
|
"eval_samples_per_second": 1149.403, |
|
"eval_steps_per_second": 18.014, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 7.68, |
|
"learning_rate": 8.015970096453414e-05, |
|
"loss": 0.2655, |
|
"step": 344500 |
|
}, |
|
{ |
|
"epoch": 7.7, |
|
"learning_rate": 7.974941180172382e-05, |
|
"loss": 0.265, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 7.7, |
|
"eval_loss": 0.24670176208019257, |
|
"eval_runtime": 2.0516, |
|
"eval_samples_per_second": 1119.641, |
|
"eval_steps_per_second": 17.548, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 7.71, |
|
"learning_rate": 7.933994556728976e-05, |
|
"loss": 0.2648, |
|
"step": 345500 |
|
}, |
|
{ |
|
"epoch": 7.72, |
|
"learning_rate": 7.893130673908927e-05, |
|
"loss": 0.2666, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 7.72, |
|
"eval_loss": 0.24692493677139282, |
|
"eval_runtime": 2.0097, |
|
"eval_samples_per_second": 1142.931, |
|
"eval_steps_per_second": 17.913, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 7.73, |
|
"learning_rate": 7.852349978593091e-05, |
|
"loss": 0.2645, |
|
"step": 346500 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"learning_rate": 7.811652916752633e-05, |
|
"loss": 0.2645, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"eval_loss": 0.24660193920135498, |
|
"eval_runtime": 2.0554, |
|
"eval_samples_per_second": 1117.529, |
|
"eval_steps_per_second": 17.515, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"learning_rate": 7.771039933444092e-05, |
|
"loss": 0.2644, |
|
"step": 347500 |
|
}, |
|
{ |
|
"epoch": 7.76, |
|
"learning_rate": 7.730511472804544e-05, |
|
"loss": 0.2636, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 7.76, |
|
"eval_loss": 0.248375803232193, |
|
"eval_runtime": 1.9897, |
|
"eval_samples_per_second": 1154.427, |
|
"eval_steps_per_second": 18.093, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 7.77, |
|
"learning_rate": 7.690067978046758e-05, |
|
"loss": 0.2637, |
|
"step": 348500 |
|
}, |
|
{ |
|
"epoch": 7.78, |
|
"learning_rate": 7.649709891454298e-05, |
|
"loss": 0.2632, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 7.78, |
|
"eval_loss": 0.24848568439483643, |
|
"eval_runtime": 2.0056, |
|
"eval_samples_per_second": 1145.314, |
|
"eval_steps_per_second": 17.95, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"learning_rate": 7.609437654376742e-05, |
|
"loss": 0.2636, |
|
"step": 349500 |
|
}, |
|
{ |
|
"epoch": 7.81, |
|
"learning_rate": 7.569251707224812e-05, |
|
"loss": 0.2635, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 7.81, |
|
"eval_loss": 0.24438388645648956, |
|
"eval_runtime": 2.0031, |
|
"eval_samples_per_second": 1146.703, |
|
"eval_steps_per_second": 17.972, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 7.82, |
|
"learning_rate": 7.529152489465592e-05, |
|
"loss": 0.2638, |
|
"step": 350500 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"learning_rate": 7.489140439617708e-05, |
|
"loss": 0.2632, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"eval_loss": 0.2466663420200348, |
|
"eval_runtime": 1.9675, |
|
"eval_samples_per_second": 1167.492, |
|
"eval_steps_per_second": 18.298, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 7.84, |
|
"learning_rate": 7.449215995246522e-05, |
|
"loss": 0.263, |
|
"step": 351500 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"learning_rate": 7.409379592959367e-05, |
|
"loss": 0.2631, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"eval_loss": 0.24644367396831512, |
|
"eval_runtime": 1.9998, |
|
"eval_samples_per_second": 1148.612, |
|
"eval_steps_per_second": 18.002, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 7.86, |
|
"learning_rate": 7.369631668400746e-05, |
|
"loss": 0.2632, |
|
"step": 352500 |
|
}, |
|
{ |
|
"epoch": 7.87, |
|
"learning_rate": 7.3299726562476e-05, |
|
"loss": 0.2629, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 7.87, |
|
"eval_loss": 0.24639040231704712, |
|
"eval_runtime": 1.996, |
|
"eval_samples_per_second": 1150.775, |
|
"eval_steps_per_second": 18.036, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"learning_rate": 7.290402990204531e-05, |
|
"loss": 0.2628, |
|
"step": 353500 |
|
}, |
|
{ |
|
"epoch": 7.9, |
|
"learning_rate": 7.250923102999073e-05, |
|
"loss": 0.2629, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 7.9, |
|
"eval_loss": 0.24618536233901978, |
|
"eval_runtime": 1.9784, |
|
"eval_samples_per_second": 1161.045, |
|
"eval_steps_per_second": 18.197, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"learning_rate": 7.211533426376934e-05, |
|
"loss": 0.2629, |
|
"step": 354500 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"learning_rate": 7.172234391097317e-05, |
|
"loss": 0.2625, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"eval_loss": 0.24593985080718994, |
|
"eval_runtime": 2.059, |
|
"eval_samples_per_second": 1115.596, |
|
"eval_steps_per_second": 17.484, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 7.93, |
|
"learning_rate": 7.133026426928173e-05, |
|
"loss": 0.2626, |
|
"step": 355500 |
|
}, |
|
{ |
|
"epoch": 7.94, |
|
"learning_rate": 7.093909962641514e-05, |
|
"loss": 0.2626, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 7.94, |
|
"eval_loss": 0.24694356322288513, |
|
"eval_runtime": 1.9849, |
|
"eval_samples_per_second": 1157.222, |
|
"eval_steps_per_second": 18.137, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 7.95, |
|
"learning_rate": 7.054885426008737e-05, |
|
"loss": 0.2624, |
|
"step": 356500 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"learning_rate": 7.015953243795907e-05, |
|
"loss": 0.2625, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"eval_loss": 0.24489082396030426, |
|
"eval_runtime": 2.0439, |
|
"eval_samples_per_second": 1123.842, |
|
"eval_steps_per_second": 17.614, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 7.97, |
|
"learning_rate": 6.97711384175914e-05, |
|
"loss": 0.2623, |
|
"step": 357500 |
|
}, |
|
{ |
|
"epoch": 7.99, |
|
"learning_rate": 6.938367644639911e-05, |
|
"loss": 0.4432, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 7.99, |
|
"eval_loss": 0.6783205270767212, |
|
"eval_runtime": 2.0172, |
|
"eval_samples_per_second": 1138.683, |
|
"eval_steps_per_second": 17.846, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"learning_rate": 6.899715076160425e-05, |
|
"loss": 0.6785, |
|
"step": 358500 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"learning_rate": 6.861156559018986e-05, |
|
"loss": 0.6774, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"eval_loss": 0.6766601204872131, |
|
"eval_runtime": 2.0317, |
|
"eval_samples_per_second": 1130.554, |
|
"eval_steps_per_second": 17.719, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 8.02, |
|
"learning_rate": 6.822692514885346e-05, |
|
"loss": 0.6773, |
|
"step": 359500 |
|
}, |
|
{ |
|
"epoch": 8.03, |
|
"learning_rate": 6.784323364396135e-05, |
|
"loss": 0.6773, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 8.03, |
|
"eval_loss": 0.6772929430007935, |
|
"eval_runtime": 2.0141, |
|
"eval_samples_per_second": 1140.463, |
|
"eval_steps_per_second": 17.874, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 8.04, |
|
"learning_rate": 6.746049527150238e-05, |
|
"loss": 0.6774, |
|
"step": 360500 |
|
}, |
|
{ |
|
"epoch": 8.05, |
|
"learning_rate": 6.707871421704209e-05, |
|
"loss": 0.4655, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 8.05, |
|
"eval_loss": 0.2545197010040283, |
|
"eval_runtime": 2.0031, |
|
"eval_samples_per_second": 1146.74, |
|
"eval_steps_per_second": 17.972, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 8.06, |
|
"learning_rate": 6.669789465567683e-05, |
|
"loss": 0.2697, |
|
"step": 361500 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"learning_rate": 6.631804075198838e-05, |
|
"loss": 0.2677, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"eval_loss": 0.2466827929019928, |
|
"eval_runtime": 1.9817, |
|
"eval_samples_per_second": 1159.082, |
|
"eval_steps_per_second": 18.166, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 8.09, |
|
"learning_rate": 6.593915665999816e-05, |
|
"loss": 0.2646, |
|
"step": 362500 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"learning_rate": 6.55612465231219e-05, |
|
"loss": 0.2638, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"eval_loss": 0.24644041061401367, |
|
"eval_runtime": 1.9701, |
|
"eval_samples_per_second": 1165.949, |
|
"eval_steps_per_second": 18.273, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 8.11, |
|
"learning_rate": 6.518431447412434e-05, |
|
"loss": 0.2633, |
|
"step": 363500 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"learning_rate": 6.480836463507392e-05, |
|
"loss": 0.263, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"eval_loss": 0.24640262126922607, |
|
"eval_runtime": 1.9937, |
|
"eval_samples_per_second": 1152.148, |
|
"eval_steps_per_second": 18.057, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 8.13, |
|
"learning_rate": 6.443340111729786e-05, |
|
"loss": 0.2626, |
|
"step": 364500 |
|
}, |
|
{ |
|
"epoch": 8.14, |
|
"learning_rate": 6.405942802133713e-05, |
|
"loss": 0.2624, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 8.14, |
|
"eval_loss": 0.2466081976890564, |
|
"eval_runtime": 1.9386, |
|
"eval_samples_per_second": 1184.893, |
|
"eval_steps_per_second": 18.57, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 8.15, |
|
"learning_rate": 6.36864494369016e-05, |
|
"loss": 0.2622, |
|
"step": 365500 |
|
}, |
|
{ |
|
"epoch": 8.16, |
|
"learning_rate": 6.331446944282534e-05, |
|
"loss": 0.2615, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 8.16, |
|
"eval_loss": 0.24654769897460938, |
|
"eval_runtime": 1.9437, |
|
"eval_samples_per_second": 1181.759, |
|
"eval_steps_per_second": 18.521, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"learning_rate": 6.294349210702188e-05, |
|
"loss": 0.2617, |
|
"step": 366500 |
|
}, |
|
{ |
|
"epoch": 8.19, |
|
"learning_rate": 6.257352148643998e-05, |
|
"loss": 0.262, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 8.19, |
|
"eval_loss": 0.2468617558479309, |
|
"eval_runtime": 1.8987, |
|
"eval_samples_per_second": 1209.803, |
|
"eval_steps_per_second": 18.961, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"learning_rate": 6.220456162701908e-05, |
|
"loss": 0.2617, |
|
"step": 367500 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"learning_rate": 6.183661656364515e-05, |
|
"loss": 0.2615, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"eval_loss": 0.24478143453598022, |
|
"eval_runtime": 1.9857, |
|
"eval_samples_per_second": 1156.78, |
|
"eval_steps_per_second": 18.13, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 8.22, |
|
"learning_rate": 6.146969032010631e-05, |
|
"loss": 0.2616, |
|
"step": 368500 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"learning_rate": 6.110378690904928e-05, |
|
"loss": 0.2617, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"eval_loss": 0.2461312711238861, |
|
"eval_runtime": 2.0304, |
|
"eval_samples_per_second": 1131.313, |
|
"eval_steps_per_second": 17.731, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 8.24, |
|
"learning_rate": 6.073891033193507e-05, |
|
"loss": 0.2613, |
|
"step": 369500 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"learning_rate": 6.037506457899553e-05, |
|
"loss": 0.2611, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"eval_loss": 0.24196158349514008, |
|
"eval_runtime": 1.9951, |
|
"eval_samples_per_second": 1151.317, |
|
"eval_steps_per_second": 18.044, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 8.26, |
|
"learning_rate": 6.0012253629189544e-05, |
|
"loss": 0.261, |
|
"step": 370500 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"learning_rate": 5.965048145015944e-05, |
|
"loss": 0.2611, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"eval_loss": 0.24591800570487976, |
|
"eval_runtime": 1.9624, |
|
"eval_samples_per_second": 1170.514, |
|
"eval_steps_per_second": 18.345, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 8.29, |
|
"learning_rate": 5.928975199818785e-05, |
|
"loss": 0.2611, |
|
"step": 371500 |
|
}, |
|
{ |
|
"epoch": 8.3, |
|
"learning_rate": 5.893006921815428e-05, |
|
"loss": 0.2608, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 8.3, |
|
"eval_loss": 0.2432386726140976, |
|
"eval_runtime": 2.0093, |
|
"eval_samples_per_second": 1143.209, |
|
"eval_steps_per_second": 17.917, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 8.31, |
|
"learning_rate": 5.857143704349198e-05, |
|
"loss": 0.2608, |
|
"step": 372500 |
|
}, |
|
{ |
|
"epoch": 8.32, |
|
"learning_rate": 5.8213859396144986e-05, |
|
"loss": 0.2605, |
|
"step": 373000 |
|
}, |
|
{ |
|
"epoch": 8.32, |
|
"eval_loss": 0.2445555180311203, |
|
"eval_runtime": 1.9525, |
|
"eval_samples_per_second": 1176.451, |
|
"eval_steps_per_second": 18.438, |
|
"step": 373000 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"learning_rate": 5.785734018652507e-05, |
|
"loss": 0.2609, |
|
"step": 373500 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"learning_rate": 5.750188331346927e-05, |
|
"loss": 0.2609, |
|
"step": 374000 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"eval_loss": 0.24143685400485992, |
|
"eval_runtime": 1.9945, |
|
"eval_samples_per_second": 1151.665, |
|
"eval_steps_per_second": 18.05, |
|
"step": 374000 |
|
}, |
|
{ |
|
"epoch": 8.35, |
|
"learning_rate": 5.714749266419695e-05, |
|
"loss": 0.2605, |
|
"step": 374500 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"learning_rate": 5.6794172114267566e-05, |
|
"loss": 0.2614, |
|
"step": 375000 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"eval_loss": 0.2436528205871582, |
|
"eval_runtime": 1.9628, |
|
"eval_samples_per_second": 1170.242, |
|
"eval_steps_per_second": 18.341, |
|
"step": 375000 |
|
}, |
|
{ |
|
"epoch": 8.38, |
|
"learning_rate": 5.6441925527537914e-05, |
|
"loss": 0.2614, |
|
"step": 375500 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"learning_rate": 5.60907567561203e-05, |
|
"loss": 0.2624, |
|
"step": 376000 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"eval_loss": 0.24615313112735748, |
|
"eval_runtime": 1.9915, |
|
"eval_samples_per_second": 1153.415, |
|
"eval_steps_per_second": 18.077, |
|
"step": 376000 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"learning_rate": 5.574066964034012e-05, |
|
"loss": 0.2614, |
|
"step": 376500 |
|
}, |
|
{ |
|
"epoch": 8.41, |
|
"learning_rate": 5.539166800869402e-05, |
|
"loss": 0.2611, |
|
"step": 377000 |
|
}, |
|
{ |
|
"epoch": 8.41, |
|
"eval_loss": 0.24158482253551483, |
|
"eval_runtime": 1.9896, |
|
"eval_samples_per_second": 1154.526, |
|
"eval_steps_per_second": 18.094, |
|
"step": 377000 |
|
}, |
|
{ |
|
"epoch": 8.42, |
|
"learning_rate": 5.5043755677807955e-05, |
|
"loss": 0.261, |
|
"step": 377500 |
|
}, |
|
{ |
|
"epoch": 8.43, |
|
"learning_rate": 5.4696936452395344e-05, |
|
"loss": 0.2604, |
|
"step": 378000 |
|
}, |
|
{ |
|
"epoch": 8.43, |
|
"eval_loss": 0.2432401180267334, |
|
"eval_runtime": 1.9707, |
|
"eval_samples_per_second": 1165.6, |
|
"eval_steps_per_second": 18.268, |
|
"step": 378000 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"learning_rate": 5.435121412521576e-05, |
|
"loss": 0.2604, |
|
"step": 378500 |
|
}, |
|
{ |
|
"epoch": 8.45, |
|
"learning_rate": 5.400659247703307e-05, |
|
"loss": 0.2605, |
|
"step": 379000 |
|
}, |
|
{ |
|
"epoch": 8.45, |
|
"eval_loss": 0.24426017701625824, |
|
"eval_runtime": 2.017, |
|
"eval_samples_per_second": 1138.839, |
|
"eval_steps_per_second": 17.849, |
|
"step": 379000 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"learning_rate": 5.36630752765745e-05, |
|
"loss": 0.2605, |
|
"step": 379500 |
|
}, |
|
{ |
|
"epoch": 8.48, |
|
"learning_rate": 5.3320666280489146e-05, |
|
"loss": 0.26, |
|
"step": 380000 |
|
}, |
|
{ |
|
"epoch": 8.48, |
|
"eval_loss": 0.24248941242694855, |
|
"eval_runtime": 2.0075, |
|
"eval_samples_per_second": 1144.188, |
|
"eval_steps_per_second": 17.932, |
|
"step": 380000 |
|
}, |
|
{ |
|
"epoch": 8.49, |
|
"learning_rate": 5.2979369233306834e-05, |
|
"loss": 0.2597, |
|
"step": 380500 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"learning_rate": 5.26391878673975e-05, |
|
"loss": 0.2598, |
|
"step": 381000 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"eval_loss": 0.24129272997379303, |
|
"eval_runtime": 1.9892, |
|
"eval_samples_per_second": 1154.716, |
|
"eval_steps_per_second": 18.097, |
|
"step": 381000 |
|
}, |
|
{ |
|
"epoch": 8.51, |
|
"learning_rate": 5.230012590292987e-05, |
|
"loss": 0.26, |
|
"step": 381500 |
|
}, |
|
{ |
|
"epoch": 8.52, |
|
"learning_rate": 5.1962187047831517e-05, |
|
"loss": 0.2597, |
|
"step": 382000 |
|
}, |
|
{ |
|
"epoch": 8.52, |
|
"eval_loss": 0.2438046783208847, |
|
"eval_runtime": 2.0783, |
|
"eval_samples_per_second": 1105.244, |
|
"eval_steps_per_second": 17.322, |
|
"step": 382000 |
|
}, |
|
{ |
|
"epoch": 8.53, |
|
"learning_rate": 5.162537499774743e-05, |
|
"loss": 0.2603, |
|
"step": 382500 |
|
}, |
|
{ |
|
"epoch": 8.54, |
|
"learning_rate": 5.128969343600032e-05, |
|
"loss": 0.2602, |
|
"step": 383000 |
|
}, |
|
{ |
|
"epoch": 8.54, |
|
"eval_loss": 0.24265888333320618, |
|
"eval_runtime": 1.9571, |
|
"eval_samples_per_second": 1173.666, |
|
"eval_steps_per_second": 18.394, |
|
"step": 383000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"learning_rate": 5.09551460335499e-05, |
|
"loss": 0.2599, |
|
"step": 383500 |
|
}, |
|
{ |
|
"epoch": 8.57, |
|
"learning_rate": 5.062173644895296e-05, |
|
"loss": 0.2599, |
|
"step": 384000 |
|
}, |
|
{ |
|
"epoch": 8.57, |
|
"eval_loss": 0.24097691476345062, |
|
"eval_runtime": 1.9672, |
|
"eval_samples_per_second": 1167.657, |
|
"eval_steps_per_second": 18.3, |
|
"step": 384000 |
|
}, |
|
{ |
|
"epoch": 8.58, |
|
"learning_rate": 5.0289468328323434e-05, |
|
"loss": 0.2591, |
|
"step": 384500 |
|
}, |
|
{ |
|
"epoch": 8.59, |
|
"learning_rate": 4.995834530529208e-05, |
|
"loss": 0.2596, |
|
"step": 385000 |
|
}, |
|
{ |
|
"epoch": 8.59, |
|
"eval_loss": 0.24366655945777893, |
|
"eval_runtime": 2.0603, |
|
"eval_samples_per_second": 1114.876, |
|
"eval_steps_per_second": 17.473, |
|
"step": 385000 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"learning_rate": 4.9628371000967394e-05, |
|
"loss": 0.2628, |
|
"step": 385500 |
|
}, |
|
{ |
|
"epoch": 8.61, |
|
"learning_rate": 4.929954902389534e-05, |
|
"loss": 0.2605, |
|
"step": 386000 |
|
}, |
|
{ |
|
"epoch": 8.61, |
|
"eval_loss": 0.2400692254304886, |
|
"eval_runtime": 2.0808, |
|
"eval_samples_per_second": 1103.885, |
|
"eval_steps_per_second": 17.301, |
|
"step": 386000 |
|
}, |
|
{ |
|
"epoch": 8.62, |
|
"learning_rate": 4.897188297002046e-05, |
|
"loss": 0.2606, |
|
"step": 386500 |
|
}, |
|
{ |
|
"epoch": 8.63, |
|
"learning_rate": 4.8645376422646226e-05, |
|
"loss": 0.2595, |
|
"step": 387000 |
|
}, |
|
{ |
|
"epoch": 8.63, |
|
"eval_loss": 0.24330011010169983, |
|
"eval_runtime": 2.0153, |
|
"eval_samples_per_second": 1139.773, |
|
"eval_steps_per_second": 17.863, |
|
"step": 387000 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"learning_rate": 4.832003295239591e-05, |
|
"loss": 0.2601, |
|
"step": 387500 |
|
}, |
|
{ |
|
"epoch": 8.65, |
|
"learning_rate": 4.7995856117173624e-05, |
|
"loss": 0.2597, |
|
"step": 388000 |
|
}, |
|
{ |
|
"epoch": 8.65, |
|
"eval_loss": 0.24292755126953125, |
|
"eval_runtime": 1.9613, |
|
"eval_samples_per_second": 1171.185, |
|
"eval_steps_per_second": 18.356, |
|
"step": 388000 |
|
}, |
|
{ |
|
"epoch": 8.67, |
|
"learning_rate": 4.767284946212521e-05, |
|
"loss": 0.2604, |
|
"step": 388500 |
|
}, |
|
{ |
|
"epoch": 8.68, |
|
"learning_rate": 4.735101651959977e-05, |
|
"loss": 0.2598, |
|
"step": 389000 |
|
}, |
|
{ |
|
"epoch": 8.68, |
|
"eval_loss": 0.24331249296665192, |
|
"eval_runtime": 1.9683, |
|
"eval_samples_per_second": 1166.976, |
|
"eval_steps_per_second": 18.29, |
|
"step": 389000 |
|
}, |
|
{ |
|
"epoch": 8.69, |
|
"learning_rate": 4.7030360809110754e-05, |
|
"loss": 0.2593, |
|
"step": 389500 |
|
}, |
|
{ |
|
"epoch": 8.7, |
|
"learning_rate": 4.6710885837297726e-05, |
|
"loss": 0.2594, |
|
"step": 390000 |
|
}, |
|
{ |
|
"epoch": 8.7, |
|
"eval_loss": 0.24239999055862427, |
|
"eval_runtime": 1.9699, |
|
"eval_samples_per_second": 1166.029, |
|
"eval_steps_per_second": 18.275, |
|
"step": 390000 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"learning_rate": 4.639259509788768e-05, |
|
"loss": 0.26, |
|
"step": 390500 |
|
}, |
|
{ |
|
"epoch": 8.72, |
|
"learning_rate": 4.60754920716572e-05, |
|
"loss": 0.259, |
|
"step": 391000 |
|
}, |
|
{ |
|
"epoch": 8.72, |
|
"eval_loss": 0.2440621256828308, |
|
"eval_runtime": 2.0127, |
|
"eval_samples_per_second": 1141.226, |
|
"eval_steps_per_second": 17.886, |
|
"step": 391000 |
|
}, |
|
{ |
|
"epoch": 8.73, |
|
"learning_rate": 4.5759580226394167e-05, |
|
"loss": 0.2593, |
|
"step": 391500 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"learning_rate": 4.544486301685993e-05, |
|
"loss": 0.2594, |
|
"step": 392000 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"eval_loss": 0.23992255330085754, |
|
"eval_runtime": 1.971, |
|
"eval_samples_per_second": 1165.424, |
|
"eval_steps_per_second": 18.265, |
|
"step": 392000 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"learning_rate": 4.5131343884751484e-05, |
|
"loss": 0.2591, |
|
"step": 392500 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"learning_rate": 4.4819026258663774e-05, |
|
"loss": 0.2591, |
|
"step": 393000 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"eval_loss": 0.2458277940750122, |
|
"eval_runtime": 1.9966, |
|
"eval_samples_per_second": 1150.457, |
|
"eval_steps_per_second": 18.031, |
|
"step": 393000 |
|
}, |
|
{ |
|
"epoch": 8.78, |
|
"learning_rate": 4.450791355405234e-05, |
|
"loss": 0.2592, |
|
"step": 393500 |
|
}, |
|
{ |
|
"epoch": 8.79, |
|
"learning_rate": 4.419800917319588e-05, |
|
"loss": 0.2591, |
|
"step": 394000 |
|
}, |
|
{ |
|
"epoch": 8.79, |
|
"eval_loss": 0.2452658861875534, |
|
"eval_runtime": 1.9635, |
|
"eval_samples_per_second": 1169.821, |
|
"eval_steps_per_second": 18.334, |
|
"step": 394000 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"learning_rate": 4.3889316505159056e-05, |
|
"loss": 0.2601, |
|
"step": 394500 |
|
}, |
|
{ |
|
"epoch": 8.81, |
|
"learning_rate": 4.3581838925755465e-05, |
|
"loss": 0.2591, |
|
"step": 395000 |
|
}, |
|
{ |
|
"epoch": 8.81, |
|
"eval_loss": 0.24037905037403107, |
|
"eval_runtime": 2.0778, |
|
"eval_samples_per_second": 1105.5, |
|
"eval_steps_per_second": 17.326, |
|
"step": 395000 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"learning_rate": 4.327557979751057e-05, |
|
"loss": 0.2589, |
|
"step": 395500 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"learning_rate": 4.297054246962517e-05, |
|
"loss": 0.259, |
|
"step": 396000 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"eval_loss": 0.2426002323627472, |
|
"eval_runtime": 1.9871, |
|
"eval_samples_per_second": 1155.977, |
|
"eval_steps_per_second": 18.117, |
|
"step": 396000 |
|
}, |
|
{ |
|
"epoch": 8.84, |
|
"learning_rate": 4.266673027793864e-05, |
|
"loss": 0.2583, |
|
"step": 396500 |
|
}, |
|
{ |
|
"epoch": 8.86, |
|
"learning_rate": 4.236414654489242e-05, |
|
"loss": 0.2583, |
|
"step": 397000 |
|
}, |
|
{ |
|
"epoch": 8.86, |
|
"eval_loss": 0.24136394262313843, |
|
"eval_runtime": 2.043, |
|
"eval_samples_per_second": 1124.309, |
|
"eval_steps_per_second": 17.621, |
|
"step": 397000 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"learning_rate": 4.206279457949371e-05, |
|
"loss": 0.2581, |
|
"step": 397500 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"learning_rate": 4.1762677677279335e-05, |
|
"loss": 0.2584, |
|
"step": 398000 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"eval_loss": 0.24172720313072205, |
|
"eval_runtime": 1.9971, |
|
"eval_samples_per_second": 1150.177, |
|
"eval_steps_per_second": 18.026, |
|
"step": 398000 |
|
}, |
|
{ |
|
"epoch": 8.89, |
|
"learning_rate": 4.146379912027964e-05, |
|
"loss": 0.2582, |
|
"step": 398500 |
|
}, |
|
{ |
|
"epoch": 8.9, |
|
"learning_rate": 4.1166162176982664e-05, |
|
"loss": 0.2584, |
|
"step": 399000 |
|
}, |
|
{ |
|
"epoch": 8.9, |
|
"eval_loss": 0.2435595542192459, |
|
"eval_runtime": 2.0606, |
|
"eval_samples_per_second": 1114.743, |
|
"eval_steps_per_second": 17.471, |
|
"step": 399000 |
|
}, |
|
{ |
|
"epoch": 8.91, |
|
"learning_rate": 4.086977010229838e-05, |
|
"loss": 0.2583, |
|
"step": 399500 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"learning_rate": 4.057462613752294e-05, |
|
"loss": 0.2615, |
|
"step": 400000 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"eval_loss": 0.24273519217967987, |
|
"eval_runtime": 1.9831, |
|
"eval_samples_per_second": 1158.313, |
|
"eval_steps_per_second": 18.154, |
|
"step": 400000 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"learning_rate": 4.0280733510303475e-05, |
|
"loss": 0.2627, |
|
"step": 400500 |
|
}, |
|
{ |
|
"epoch": 8.94, |
|
"learning_rate": 3.9988095434602716e-05, |
|
"loss": 0.2591, |
|
"step": 401000 |
|
}, |
|
{ |
|
"epoch": 8.94, |
|
"eval_loss": 0.24216598272323608, |
|
"eval_runtime": 1.9765, |
|
"eval_samples_per_second": 1162.139, |
|
"eval_steps_per_second": 18.214, |
|
"step": 401000 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"learning_rate": 3.9696715110663726e-05, |
|
"loss": 0.2588, |
|
"step": 401500 |
|
}, |
|
{ |
|
"epoch": 8.97, |
|
"learning_rate": 3.9406595724975116e-05, |
|
"loss": 0.2585, |
|
"step": 402000 |
|
}, |
|
{ |
|
"epoch": 8.97, |
|
"eval_loss": 0.245298832654953, |
|
"eval_runtime": 2.023, |
|
"eval_samples_per_second": 1135.425, |
|
"eval_steps_per_second": 17.795, |
|
"step": 402000 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"learning_rate": 3.9117740450235914e-05, |
|
"loss": 0.2586, |
|
"step": 402500 |
|
}, |
|
{ |
|
"epoch": 8.99, |
|
"learning_rate": 3.8830152445321163e-05, |
|
"loss": 0.258, |
|
"step": 403000 |
|
}, |
|
{ |
|
"epoch": 8.99, |
|
"eval_loss": 0.24233770370483398, |
|
"eval_runtime": 2.0488, |
|
"eval_samples_per_second": 1121.145, |
|
"eval_steps_per_second": 17.571, |
|
"step": 403000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"learning_rate": 3.854383485524724e-05, |
|
"loss": 0.2583, |
|
"step": 403500 |
|
}, |
|
{ |
|
"epoch": 9.01, |
|
"learning_rate": 3.8258790811137425e-05, |
|
"loss": 0.2577, |
|
"step": 404000 |
|
}, |
|
{ |
|
"epoch": 9.01, |
|
"eval_loss": 0.24036923050880432, |
|
"eval_runtime": 2.0024, |
|
"eval_samples_per_second": 1147.117, |
|
"eval_steps_per_second": 17.978, |
|
"step": 404000 |
|
}, |
|
{ |
|
"epoch": 9.02, |
|
"learning_rate": 3.7975023430187676e-05, |
|
"loss": 0.2581, |
|
"step": 404500 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"learning_rate": 3.7692535815632624e-05, |
|
"loss": 0.2578, |
|
"step": 405000 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"eval_loss": 0.24242642521858215, |
|
"eval_runtime": 2.0428, |
|
"eval_samples_per_second": 1124.43, |
|
"eval_steps_per_second": 17.623, |
|
"step": 405000 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"learning_rate": 3.741133105671159e-05, |
|
"loss": 0.2575, |
|
"step": 405500 |
|
}, |
|
{ |
|
"epoch": 9.06, |
|
"learning_rate": 3.713141222863474e-05, |
|
"loss": 0.2576, |
|
"step": 406000 |
|
}, |
|
{ |
|
"epoch": 9.06, |
|
"eval_loss": 0.2440737634897232, |
|
"eval_runtime": 2.0157, |
|
"eval_samples_per_second": 1139.547, |
|
"eval_steps_per_second": 17.86, |
|
"step": 406000 |
|
}, |
|
{ |
|
"epoch": 9.07, |
|
"learning_rate": 3.6852782392549584e-05, |
|
"loss": 0.2575, |
|
"step": 406500 |
|
}, |
|
{ |
|
"epoch": 9.08, |
|
"learning_rate": 3.657544459550729e-05, |
|
"loss": 0.2574, |
|
"step": 407000 |
|
}, |
|
{ |
|
"epoch": 9.08, |
|
"eval_loss": 0.2409050464630127, |
|
"eval_runtime": 2.0313, |
|
"eval_samples_per_second": 1130.817, |
|
"eval_steps_per_second": 17.723, |
|
"step": 407000 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"learning_rate": 3.6299401870429606e-05, |
|
"loss": 0.2574, |
|
"step": 407500 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"learning_rate": 3.6024657236075546e-05, |
|
"loss": 0.2573, |
|
"step": 408000 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"eval_loss": 0.24264054000377655, |
|
"eval_runtime": 2.0373, |
|
"eval_samples_per_second": 1127.471, |
|
"eval_steps_per_second": 17.67, |
|
"step": 408000 |
|
}, |
|
{ |
|
"epoch": 9.11, |
|
"learning_rate": 3.575121369700841e-05, |
|
"loss": 0.2576, |
|
"step": 408500 |
|
}, |
|
{ |
|
"epoch": 9.12, |
|
"learning_rate": 3.5479074243562995e-05, |
|
"loss": 0.2576, |
|
"step": 409000 |
|
}, |
|
{ |
|
"epoch": 9.12, |
|
"eval_loss": 0.24062800407409668, |
|
"eval_runtime": 2.0012, |
|
"eval_samples_per_second": 1147.804, |
|
"eval_steps_per_second": 17.989, |
|
"step": 409000 |
|
}, |
|
{ |
|
"epoch": 9.13, |
|
"learning_rate": 3.5208241851812644e-05, |
|
"loss": 0.2572, |
|
"step": 409500 |
|
}, |
|
{ |
|
"epoch": 9.15, |
|
"learning_rate": 3.493871948353709e-05, |
|
"loss": 0.2574, |
|
"step": 410000 |
|
}, |
|
{ |
|
"epoch": 9.15, |
|
"eval_loss": 0.23946772515773773, |
|
"eval_runtime": 2.1141, |
|
"eval_samples_per_second": 1086.515, |
|
"eval_steps_per_second": 17.029, |
|
"step": 410000 |
|
}, |
|
{ |
|
"epoch": 9.16, |
|
"learning_rate": 3.4670510086189736e-05, |
|
"loss": 0.2575, |
|
"step": 410500 |
|
}, |
|
{ |
|
"epoch": 9.17, |
|
"learning_rate": 3.440361659286563e-05, |
|
"loss": 0.2571, |
|
"step": 411000 |
|
}, |
|
{ |
|
"epoch": 9.17, |
|
"eval_loss": 0.23894140124320984, |
|
"eval_runtime": 2.0061, |
|
"eval_samples_per_second": 1145.029, |
|
"eval_steps_per_second": 17.946, |
|
"step": 411000 |
|
}, |
|
{ |
|
"epoch": 9.18, |
|
"learning_rate": 3.413804192226918e-05, |
|
"loss": 0.2573, |
|
"step": 411500 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"learning_rate": 3.387378897868246e-05, |
|
"loss": 0.257, |
|
"step": 412000 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"eval_loss": 0.23886139690876007, |
|
"eval_runtime": 2.0352, |
|
"eval_samples_per_second": 1128.628, |
|
"eval_steps_per_second": 17.689, |
|
"step": 412000 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"learning_rate": 3.361086065193336e-05, |
|
"loss": 0.257, |
|
"step": 412500 |
|
}, |
|
{ |
|
"epoch": 9.21, |
|
"learning_rate": 3.334925981736389e-05, |
|
"loss": 0.257, |
|
"step": 413000 |
|
}, |
|
{ |
|
"epoch": 9.21, |
|
"eval_loss": 0.23928523063659668, |
|
"eval_runtime": 2.0712, |
|
"eval_samples_per_second": 1109.008, |
|
"eval_steps_per_second": 17.381, |
|
"step": 413000 |
|
}, |
|
{ |
|
"epoch": 9.22, |
|
"learning_rate": 3.3088989335798925e-05, |
|
"loss": 0.2572, |
|
"step": 413500 |
|
}, |
|
{ |
|
"epoch": 9.23, |
|
"learning_rate": 3.283005205351467e-05, |
|
"loss": 0.2568, |
|
"step": 414000 |
|
}, |
|
{ |
|
"epoch": 9.23, |
|
"eval_loss": 0.24012628197669983, |
|
"eval_runtime": 1.9771, |
|
"eval_samples_per_second": 1161.82, |
|
"eval_steps_per_second": 18.209, |
|
"step": 414000 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"learning_rate": 3.2572450802207845e-05, |
|
"loss": 0.2567, |
|
"step": 414500 |
|
}, |
|
{ |
|
"epoch": 9.26, |
|
"learning_rate": 3.2316188398964344e-05, |
|
"loss": 0.2568, |
|
"step": 415000 |
|
}, |
|
{ |
|
"epoch": 9.26, |
|
"eval_loss": 0.24090546369552612, |
|
"eval_runtime": 1.9932, |
|
"eval_samples_per_second": 1152.439, |
|
"eval_steps_per_second": 18.062, |
|
"step": 415000 |
|
}, |
|
{ |
|
"epoch": 9.27, |
|
"learning_rate": 3.206126764622888e-05, |
|
"loss": 0.2571, |
|
"step": 415500 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"learning_rate": 3.180769133177392e-05, |
|
"loss": 0.2567, |
|
"step": 416000 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"eval_loss": 0.24101045727729797, |
|
"eval_runtime": 2.0158, |
|
"eval_samples_per_second": 1139.515, |
|
"eval_steps_per_second": 17.859, |
|
"step": 416000 |
|
}, |
|
{ |
|
"epoch": 9.29, |
|
"learning_rate": 3.155546222866939e-05, |
|
"loss": 0.2567, |
|
"step": 416500 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"learning_rate": 3.130458309525239e-05, |
|
"loss": 0.2564, |
|
"step": 417000 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"eval_loss": 0.2400181144475937, |
|
"eval_runtime": 2.0304, |
|
"eval_samples_per_second": 1131.292, |
|
"eval_steps_per_second": 17.73, |
|
"step": 417000 |
|
}, |
|
{ |
|
"epoch": 9.31, |
|
"learning_rate": 3.1055056675096826e-05, |
|
"loss": 0.2567, |
|
"step": 417500 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"learning_rate": 3.0806885696983816e-05, |
|
"loss": 0.2563, |
|
"step": 418000 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"eval_loss": 0.24112293124198914, |
|
"eval_runtime": 2.0404, |
|
"eval_samples_per_second": 1125.741, |
|
"eval_steps_per_second": 17.643, |
|
"step": 418000 |
|
}, |
|
{ |
|
"epoch": 9.33, |
|
"learning_rate": 3.056007287487128e-05, |
|
"loss": 0.2564, |
|
"step": 418500 |
|
}, |
|
{ |
|
"epoch": 9.35, |
|
"learning_rate": 3.0314620907864744e-05, |
|
"loss": 0.2561, |
|
"step": 419000 |
|
}, |
|
{ |
|
"epoch": 9.35, |
|
"eval_loss": 0.2410934865474701, |
|
"eval_runtime": 2.0663, |
|
"eval_samples_per_second": 1111.669, |
|
"eval_steps_per_second": 17.423, |
|
"step": 419000 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"learning_rate": 3.0070532480187637e-05, |
|
"loss": 0.2564, |
|
"step": 419500 |
|
}, |
|
{ |
|
"epoch": 9.37, |
|
"learning_rate": 2.9827810261151784e-05, |
|
"loss": 0.2562, |
|
"step": 420000 |
|
}, |
|
{ |
|
"epoch": 9.37, |
|
"eval_loss": 0.23920150101184845, |
|
"eval_runtime": 2.0818, |
|
"eval_samples_per_second": 1103.391, |
|
"eval_steps_per_second": 17.293, |
|
"step": 420000 |
|
}, |
|
{ |
|
"epoch": 9.38, |
|
"learning_rate": 2.9586456905128618e-05, |
|
"loss": 0.2562, |
|
"step": 420500 |
|
}, |
|
{ |
|
"epoch": 9.39, |
|
"learning_rate": 2.9346475051519687e-05, |
|
"loss": 0.2583, |
|
"step": 421000 |
|
}, |
|
{ |
|
"epoch": 9.39, |
|
"eval_loss": 0.2419823408126831, |
|
"eval_runtime": 2.1088, |
|
"eval_samples_per_second": 1089.249, |
|
"eval_steps_per_second": 17.071, |
|
"step": 421000 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"learning_rate": 2.910786732472815e-05, |
|
"loss": 0.257, |
|
"step": 421500 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"learning_rate": 2.887063633412981e-05, |
|
"loss": 0.2565, |
|
"step": 422000 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"eval_loss": 0.240878626704216, |
|
"eval_runtime": 2.008, |
|
"eval_samples_per_second": 1143.9, |
|
"eval_steps_per_second": 17.928, |
|
"step": 422000 |
|
}, |
|
{ |
|
"epoch": 9.42, |
|
"learning_rate": 2.863478467404478e-05, |
|
"loss": 0.2563, |
|
"step": 422500 |
|
}, |
|
{ |
|
"epoch": 9.44, |
|
"learning_rate": 2.8400314923709112e-05, |
|
"loss": 0.2562, |
|
"step": 423000 |
|
}, |
|
{ |
|
"epoch": 9.44, |
|
"eval_loss": 0.2374911606311798, |
|
"eval_runtime": 2.1002, |
|
"eval_samples_per_second": 1093.715, |
|
"eval_steps_per_second": 17.141, |
|
"step": 423000 |
|
}, |
|
{ |
|
"epoch": 9.45, |
|
"learning_rate": 2.816722964724636e-05, |
|
"loss": 0.256, |
|
"step": 423500 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"learning_rate": 2.793553139363981e-05, |
|
"loss": 0.2556, |
|
"step": 424000 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"eval_loss": 0.2397317737340927, |
|
"eval_runtime": 2.1055, |
|
"eval_samples_per_second": 1090.975, |
|
"eval_steps_per_second": 17.098, |
|
"step": 424000 |
|
}, |
|
{ |
|
"epoch": 9.47, |
|
"learning_rate": 2.7705222696704366e-05, |
|
"loss": 0.256, |
|
"step": 424500 |
|
}, |
|
{ |
|
"epoch": 9.48, |
|
"learning_rate": 2.7476306075059096e-05, |
|
"loss": 0.2562, |
|
"step": 425000 |
|
}, |
|
{ |
|
"epoch": 9.48, |
|
"eval_loss": 0.23977774381637573, |
|
"eval_runtime": 2.06, |
|
"eval_samples_per_second": 1115.059, |
|
"eval_steps_per_second": 17.476, |
|
"step": 425000 |
|
}, |
|
{ |
|
"epoch": 9.49, |
|
"learning_rate": 2.7248784032099478e-05, |
|
"loss": 0.2574, |
|
"step": 425500 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"learning_rate": 2.7022659055970144e-05, |
|
"loss": 0.2584, |
|
"step": 426000 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"eval_loss": 0.2388191670179367, |
|
"eval_runtime": 2.1089, |
|
"eval_samples_per_second": 1089.195, |
|
"eval_steps_per_second": 17.071, |
|
"step": 426000 |
|
}, |
|
{ |
|
"epoch": 9.51, |
|
"learning_rate": 2.6797933619537604e-05, |
|
"loss": 0.2572, |
|
"step": 426500 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"learning_rate": 2.6574610180363166e-05, |
|
"loss": 0.2566, |
|
"step": 427000 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"eval_loss": 0.24121782183647156, |
|
"eval_runtime": 2.0738, |
|
"eval_samples_per_second": 1107.617, |
|
"eval_steps_per_second": 17.359, |
|
"step": 427000 |
|
}, |
|
{ |
|
"epoch": 9.54, |
|
"learning_rate": 2.6352691180676286e-05, |
|
"loss": 0.2568, |
|
"step": 427500 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"learning_rate": 2.6132179047347505e-05, |
|
"loss": 0.256, |
|
"step": 428000 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"eval_loss": 0.24006181955337524, |
|
"eval_runtime": 1.9904, |
|
"eval_samples_per_second": 1154.041, |
|
"eval_steps_per_second": 18.087, |
|
"step": 428000 |
|
}, |
|
{ |
|
"epoch": 9.56, |
|
"learning_rate": 2.5913076191862238e-05, |
|
"loss": 0.2564, |
|
"step": 428500 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"learning_rate": 2.5695385010294165e-05, |
|
"loss": 0.2564, |
|
"step": 429000 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"eval_loss": 0.24071797728538513, |
|
"eval_runtime": 1.9595, |
|
"eval_samples_per_second": 1172.235, |
|
"eval_steps_per_second": 18.372, |
|
"step": 429000 |
|
}, |
|
{ |
|
"epoch": 9.58, |
|
"learning_rate": 2.5479107883279144e-05, |
|
"loss": 0.2564, |
|
"step": 429500 |
|
}, |
|
{ |
|
"epoch": 9.59, |
|
"learning_rate": 2.5264247175989292e-05, |
|
"loss": 0.2564, |
|
"step": 430000 |
|
}, |
|
{ |
|
"epoch": 9.59, |
|
"eval_loss": 0.23974178731441498, |
|
"eval_runtime": 2.0589, |
|
"eval_samples_per_second": 1115.617, |
|
"eval_steps_per_second": 17.485, |
|
"step": 430000 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"learning_rate": 2.5050805238106804e-05, |
|
"loss": 0.2561, |
|
"step": 430500 |
|
}, |
|
{ |
|
"epoch": 9.61, |
|
"learning_rate": 2.4838784403798542e-05, |
|
"loss": 0.256, |
|
"step": 431000 |
|
}, |
|
{ |
|
"epoch": 9.61, |
|
"eval_loss": 0.23836444318294525, |
|
"eval_runtime": 2.1211, |
|
"eval_samples_per_second": 1082.938, |
|
"eval_steps_per_second": 16.972, |
|
"step": 431000 |
|
}, |
|
{ |
|
"epoch": 9.62, |
|
"learning_rate": 2.4628186991690346e-05, |
|
"loss": 0.256, |
|
"step": 431500 |
|
}, |
|
{ |
|
"epoch": 9.64, |
|
"learning_rate": 2.4419015304841797e-05, |
|
"loss": 0.2562, |
|
"step": 432000 |
|
}, |
|
{ |
|
"epoch": 9.64, |
|
"eval_loss": 0.2407396286725998, |
|
"eval_runtime": 2.084, |
|
"eval_samples_per_second": 1102.19, |
|
"eval_steps_per_second": 17.274, |
|
"step": 432000 |
|
}, |
|
{ |
|
"epoch": 9.65, |
|
"learning_rate": 2.4211271630720957e-05, |
|
"loss": 0.2559, |
|
"step": 432500 |
|
}, |
|
{ |
|
"epoch": 9.66, |
|
"learning_rate": 2.4004958241179347e-05, |
|
"loss": 0.2558, |
|
"step": 433000 |
|
}, |
|
{ |
|
"epoch": 9.66, |
|
"eval_loss": 0.23972494900226593, |
|
"eval_runtime": 2.0364, |
|
"eval_samples_per_second": 1127.958, |
|
"eval_steps_per_second": 17.678, |
|
"step": 433000 |
|
}, |
|
{ |
|
"epoch": 9.67, |
|
"learning_rate": 2.3800077392427193e-05, |
|
"loss": 0.2559, |
|
"step": 433500 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"learning_rate": 2.3596631325008536e-05, |
|
"loss": 0.256, |
|
"step": 434000 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"eval_loss": 0.24173137545585632, |
|
"eval_runtime": 2.0453, |
|
"eval_samples_per_second": 1123.05, |
|
"eval_steps_per_second": 17.601, |
|
"step": 434000 |
|
}, |
|
{ |
|
"epoch": 9.69, |
|
"learning_rate": 2.3394622263777042e-05, |
|
"loss": 0.2566, |
|
"step": 434500 |
|
}, |
|
{ |
|
"epoch": 9.7, |
|
"learning_rate": 2.3194052417871433e-05, |
|
"loss": 0.2558, |
|
"step": 435000 |
|
}, |
|
{ |
|
"epoch": 9.7, |
|
"eval_loss": 0.23999714851379395, |
|
"eval_runtime": 2.0454, |
|
"eval_samples_per_second": 1123.023, |
|
"eval_steps_per_second": 17.601, |
|
"step": 435000 |
|
}, |
|
{ |
|
"epoch": 9.71, |
|
"learning_rate": 2.2994923980691425e-05, |
|
"loss": 0.2556, |
|
"step": 435500 |
|
}, |
|
{ |
|
"epoch": 9.73, |
|
"learning_rate": 2.279723912987365e-05, |
|
"loss": 0.2552, |
|
"step": 436000 |
|
}, |
|
{ |
|
"epoch": 9.73, |
|
"eval_loss": 0.23865634202957153, |
|
"eval_runtime": 2.0599, |
|
"eval_samples_per_second": 1115.091, |
|
"eval_steps_per_second": 17.476, |
|
"step": 436000 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"learning_rate": 2.2601000027268006e-05, |
|
"loss": 0.2555, |
|
"step": 436500 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"learning_rate": 2.2406208818913857e-05, |
|
"loss": 0.2556, |
|
"step": 437000 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"eval_loss": 0.2411411553621292, |
|
"eval_runtime": 2.0137, |
|
"eval_samples_per_second": 1140.659, |
|
"eval_steps_per_second": 17.877, |
|
"step": 437000 |
|
}, |
|
{ |
|
"epoch": 9.76, |
|
"learning_rate": 2.221286763501666e-05, |
|
"loss": 0.2571, |
|
"step": 437500 |
|
}, |
|
{ |
|
"epoch": 9.77, |
|
"learning_rate": 2.2020978589924673e-05, |
|
"loss": 0.258, |
|
"step": 438000 |
|
}, |
|
{ |
|
"epoch": 9.77, |
|
"eval_loss": 0.24090658128261566, |
|
"eval_runtime": 2.0077, |
|
"eval_samples_per_second": 1144.123, |
|
"eval_steps_per_second": 17.931, |
|
"step": 438000 |
|
}, |
|
{ |
|
"epoch": 9.78, |
|
"learning_rate": 2.1830543782105647e-05, |
|
"loss": 0.2566, |
|
"step": 438500 |
|
}, |
|
{ |
|
"epoch": 9.79, |
|
"learning_rate": 2.1641565294124206e-05, |
|
"loss": 0.2565, |
|
"step": 439000 |
|
}, |
|
{ |
|
"epoch": 9.79, |
|
"eval_loss": 0.23793531954288483, |
|
"eval_runtime": 1.9748, |
|
"eval_samples_per_second": 1163.127, |
|
"eval_steps_per_second": 18.229, |
|
"step": 439000 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"learning_rate": 2.1454045192618794e-05, |
|
"loss": 0.2564, |
|
"step": 439500 |
|
}, |
|
{ |
|
"epoch": 9.81, |
|
"learning_rate": 2.1267985528279212e-05, |
|
"loss": 0.2569, |
|
"step": 440000 |
|
}, |
|
{ |
|
"epoch": 9.81, |
|
"eval_loss": 0.23971830308437347, |
|
"eval_runtime": 1.9911, |
|
"eval_samples_per_second": 1153.611, |
|
"eval_steps_per_second": 18.08, |
|
"step": 440000 |
|
}, |
|
{ |
|
"epoch": 9.83, |
|
"learning_rate": 2.1083388335824145e-05, |
|
"loss": 0.2568, |
|
"step": 440500 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"learning_rate": 2.0900255633978873e-05, |
|
"loss": 0.257, |
|
"step": 441000 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"eval_loss": 0.23978090286254883, |
|
"eval_runtime": 2.0085, |
|
"eval_samples_per_second": 1143.638, |
|
"eval_steps_per_second": 17.924, |
|
"step": 441000 |
|
}, |
|
{ |
|
"epoch": 9.85, |
|
"learning_rate": 2.0718589425453314e-05, |
|
"loss": 0.2559, |
|
"step": 441500 |
|
}, |
|
{ |
|
"epoch": 9.86, |
|
"learning_rate": 2.0538391696920015e-05, |
|
"loss": 0.2559, |
|
"step": 442000 |
|
}, |
|
{ |
|
"epoch": 9.86, |
|
"eval_loss": 0.23992407321929932, |
|
"eval_runtime": 1.9856, |
|
"eval_samples_per_second": 1156.855, |
|
"eval_steps_per_second": 18.131, |
|
"step": 442000 |
|
}, |
|
{ |
|
"epoch": 9.87, |
|
"learning_rate": 2.035966441899249e-05, |
|
"loss": 0.2557, |
|
"step": 442500 |
|
}, |
|
{ |
|
"epoch": 9.88, |
|
"learning_rate": 2.0182409546203555e-05, |
|
"loss": 0.2556, |
|
"step": 443000 |
|
}, |
|
{ |
|
"epoch": 9.88, |
|
"eval_loss": 0.23771096765995026, |
|
"eval_runtime": 1.9588, |
|
"eval_samples_per_second": 1172.656, |
|
"eval_steps_per_second": 18.379, |
|
"step": 443000 |
|
}, |
|
{ |
|
"epoch": 9.89, |
|
"learning_rate": 2.000662901698415e-05, |
|
"loss": 0.2562, |
|
"step": 443500 |
|
}, |
|
{ |
|
"epoch": 9.9, |
|
"learning_rate": 1.983232475364195e-05, |
|
"loss": 0.2565, |
|
"step": 444000 |
|
}, |
|
{ |
|
"epoch": 9.9, |
|
"eval_loss": 0.24137504398822784, |
|
"eval_runtime": 1.9524, |
|
"eval_samples_per_second": 1176.51, |
|
"eval_steps_per_second": 18.439, |
|
"step": 444000 |
|
}, |
|
{ |
|
"epoch": 9.91, |
|
"learning_rate": 1.9659498662340474e-05, |
|
"loss": 0.2563, |
|
"step": 444500 |
|
}, |
|
{ |
|
"epoch": 9.93, |
|
"learning_rate": 1.948815263307819e-05, |
|
"loss": 0.2562, |
|
"step": 445000 |
|
}, |
|
{ |
|
"epoch": 9.93, |
|
"eval_loss": 0.23801474273204803, |
|
"eval_runtime": 2.0266, |
|
"eval_samples_per_second": 1133.424, |
|
"eval_steps_per_second": 17.764, |
|
"step": 445000 |
|
}, |
|
{ |
|
"epoch": 9.94, |
|
"learning_rate": 1.9318288539667765e-05, |
|
"loss": 0.2562, |
|
"step": 445500 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"learning_rate": 1.914990823971574e-05, |
|
"loss": 0.2558, |
|
"step": 446000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"eval_loss": 0.23844870924949646, |
|
"eval_runtime": 2.0906, |
|
"eval_samples_per_second": 1098.726, |
|
"eval_steps_per_second": 17.22, |
|
"step": 446000 |
|
}, |
|
{ |
|
"epoch": 9.96, |
|
"learning_rate": 1.8983013574602096e-05, |
|
"loss": 0.2559, |
|
"step": 446500 |
|
}, |
|
{ |
|
"epoch": 9.97, |
|
"learning_rate": 1.8817606369460156e-05, |
|
"loss": 0.2555, |
|
"step": 447000 |
|
}, |
|
{ |
|
"epoch": 9.97, |
|
"eval_loss": 0.23887751996517181, |
|
"eval_runtime": 2.059, |
|
"eval_samples_per_second": 1115.606, |
|
"eval_steps_per_second": 17.484, |
|
"step": 447000 |
|
}, |
|
{ |
|
"epoch": 9.98, |
|
"learning_rate": 1.865368843315663e-05, |
|
"loss": 0.2561, |
|
"step": 447500 |
|
}, |
|
{ |
|
"epoch": 9.99, |
|
"learning_rate": 1.8491261558271762e-05, |
|
"loss": 0.2558, |
|
"step": 448000 |
|
}, |
|
{ |
|
"epoch": 9.99, |
|
"eval_loss": 0.2371719628572464, |
|
"eval_runtime": 1.9897, |
|
"eval_samples_per_second": 1154.434, |
|
"eval_steps_per_second": 18.093, |
|
"step": 448000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"learning_rate": 1.833032752107986e-05, |
|
"loss": 0.256, |
|
"step": 448500 |
|
}, |
|
{ |
|
"epoch": 10.02, |
|
"learning_rate": 1.817088808152978e-05, |
|
"loss": 0.2563, |
|
"step": 449000 |
|
}, |
|
{ |
|
"epoch": 10.02, |
|
"eval_loss": 0.23607970774173737, |
|
"eval_runtime": 2.0104, |
|
"eval_samples_per_second": 1142.561, |
|
"eval_steps_per_second": 17.907, |
|
"step": 449000 |
|
}, |
|
{ |
|
"epoch": 10.03, |
|
"learning_rate": 1.801294498322569e-05, |
|
"loss": 0.2559, |
|
"step": 449500 |
|
}, |
|
{ |
|
"epoch": 10.04, |
|
"learning_rate": 1.7856499953407978e-05, |
|
"loss": 0.2555, |
|
"step": 450000 |
|
}, |
|
{ |
|
"epoch": 10.04, |
|
"eval_loss": 0.238793283700943, |
|
"eval_runtime": 2.0158, |
|
"eval_samples_per_second": 1139.483, |
|
"eval_steps_per_second": 17.859, |
|
"step": 450000 |
|
}, |
|
{ |
|
"epoch": 10.05, |
|
"learning_rate": 1.770155470293445e-05, |
|
"loss": 0.2555, |
|
"step": 450500 |
|
}, |
|
{ |
|
"epoch": 10.06, |
|
"learning_rate": 1.7548110926261522e-05, |
|
"loss": 0.2557, |
|
"step": 451000 |
|
}, |
|
{ |
|
"epoch": 10.06, |
|
"eval_loss": 0.23789702355861664, |
|
"eval_runtime": 2.0142, |
|
"eval_samples_per_second": 1140.419, |
|
"eval_steps_per_second": 17.873, |
|
"step": 451000 |
|
}, |
|
{ |
|
"epoch": 10.07, |
|
"learning_rate": 1.7396170301425777e-05, |
|
"loss": 0.2558, |
|
"step": 451500 |
|
}, |
|
{ |
|
"epoch": 10.08, |
|
"learning_rate": 1.7245734490025544e-05, |
|
"loss": 0.2556, |
|
"step": 452000 |
|
}, |
|
{ |
|
"epoch": 10.08, |
|
"eval_loss": 0.2400251030921936, |
|
"eval_runtime": 1.9794, |
|
"eval_samples_per_second": 1160.467, |
|
"eval_steps_per_second": 18.188, |
|
"step": 452000 |
|
}, |
|
{ |
|
"epoch": 10.09, |
|
"learning_rate": 1.7096805137202738e-05, |
|
"loss": 0.2559, |
|
"step": 452500 |
|
}, |
|
{ |
|
"epoch": 10.1, |
|
"learning_rate": 1.6949383871624917e-05, |
|
"loss": 0.2556, |
|
"step": 453000 |
|
}, |
|
{ |
|
"epoch": 10.1, |
|
"eval_loss": 0.24092479050159454, |
|
"eval_runtime": 1.9634, |
|
"eval_samples_per_second": 1169.917, |
|
"eval_steps_per_second": 18.336, |
|
"step": 453000 |
|
}, |
|
{ |
|
"epoch": 10.12, |
|
"learning_rate": 1.6803472305467368e-05, |
|
"loss": 0.2557, |
|
"step": 453500 |
|
}, |
|
{ |
|
"epoch": 10.13, |
|
"learning_rate": 1.665907203439568e-05, |
|
"loss": 0.2557, |
|
"step": 454000 |
|
}, |
|
{ |
|
"epoch": 10.13, |
|
"eval_loss": 0.23987002670764923, |
|
"eval_runtime": 1.9519, |
|
"eval_samples_per_second": 1176.785, |
|
"eval_steps_per_second": 18.443, |
|
"step": 454000 |
|
}, |
|
{ |
|
"epoch": 10.14, |
|
"learning_rate": 1.6516184637548058e-05, |
|
"loss": 0.257, |
|
"step": 454500 |
|
}, |
|
{ |
|
"epoch": 10.15, |
|
"learning_rate": 1.6374811677518142e-05, |
|
"loss": 0.2555, |
|
"step": 455000 |
|
}, |
|
{ |
|
"epoch": 10.15, |
|
"eval_loss": 0.2387952208518982, |
|
"eval_runtime": 2.0231, |
|
"eval_samples_per_second": 1135.362, |
|
"eval_steps_per_second": 17.794, |
|
"step": 455000 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"learning_rate": 1.6234954700338025e-05, |
|
"loss": 0.2555, |
|
"step": 455500 |
|
}, |
|
{ |
|
"epoch": 10.17, |
|
"learning_rate": 1.6096615235461148e-05, |
|
"loss": 0.2549, |
|
"step": 456000 |
|
}, |
|
{ |
|
"epoch": 10.17, |
|
"eval_loss": 0.23763985931873322, |
|
"eval_runtime": 2.0435, |
|
"eval_samples_per_second": 1124.037, |
|
"eval_steps_per_second": 17.617, |
|
"step": 456000 |
|
}, |
|
{ |
|
"epoch": 10.18, |
|
"learning_rate": 1.59597947957458e-05, |
|
"loss": 0.2557, |
|
"step": 456500 |
|
}, |
|
{ |
|
"epoch": 10.19, |
|
"learning_rate": 1.5824494877438344e-05, |
|
"loss": 0.2551, |
|
"step": 457000 |
|
}, |
|
{ |
|
"epoch": 10.19, |
|
"eval_loss": 0.23833227157592773, |
|
"eval_runtime": 2.01, |
|
"eval_samples_per_second": 1142.798, |
|
"eval_steps_per_second": 17.911, |
|
"step": 457000 |
|
}, |
|
{ |
|
"epoch": 10.2, |
|
"learning_rate": 1.569071696015702e-05, |
|
"loss": 0.2549, |
|
"step": 457500 |
|
}, |
|
{ |
|
"epoch": 10.22, |
|
"learning_rate": 1.555846250687569e-05, |
|
"loss": 0.2551, |
|
"step": 458000 |
|
}, |
|
{ |
|
"epoch": 10.22, |
|
"eval_loss": 0.23883755505084991, |
|
"eval_runtime": 1.9432, |
|
"eval_samples_per_second": 1182.062, |
|
"eval_steps_per_second": 18.526, |
|
"step": 458000 |
|
}, |
|
{ |
|
"epoch": 10.23, |
|
"learning_rate": 1.542773296390789e-05, |
|
"loss": 0.2555, |
|
"step": 458500 |
|
}, |
|
{ |
|
"epoch": 10.24, |
|
"learning_rate": 1.5298529760890945e-05, |
|
"loss": 0.2559, |
|
"step": 459000 |
|
}, |
|
{ |
|
"epoch": 10.24, |
|
"eval_loss": 0.23782269656658173, |
|
"eval_runtime": 1.9879, |
|
"eval_samples_per_second": 1155.497, |
|
"eval_steps_per_second": 18.11, |
|
"step": 459000 |
|
}, |
|
{ |
|
"epoch": 10.25, |
|
"learning_rate": 1.5170854310770376e-05, |
|
"loss": 0.2557, |
|
"step": 459500 |
|
}, |
|
{ |
|
"epoch": 10.26, |
|
"learning_rate": 1.5044708009784457e-05, |
|
"loss": 0.2555, |
|
"step": 460000 |
|
}, |
|
{ |
|
"epoch": 10.26, |
|
"eval_loss": 0.23906496167182922, |
|
"eval_runtime": 1.9573, |
|
"eval_samples_per_second": 1173.539, |
|
"eval_steps_per_second": 18.392, |
|
"step": 460000 |
|
}, |
|
{ |
|
"epoch": 10.27, |
|
"learning_rate": 1.4920092237448903e-05, |
|
"loss": 0.2554, |
|
"step": 460500 |
|
}, |
|
{ |
|
"epoch": 10.28, |
|
"learning_rate": 1.4797008356541874e-05, |
|
"loss": 0.2548, |
|
"step": 461000 |
|
}, |
|
{ |
|
"epoch": 10.28, |
|
"eval_loss": 0.2381051927804947, |
|
"eval_runtime": 1.9705, |
|
"eval_samples_per_second": 1165.71, |
|
"eval_steps_per_second": 18.27, |
|
"step": 461000 |
|
}, |
|
{ |
|
"epoch": 10.29, |
|
"learning_rate": 1.4675457713088947e-05, |
|
"loss": 0.2547, |
|
"step": 461500 |
|
}, |
|
{ |
|
"epoch": 10.31, |
|
"learning_rate": 1.4555441636348494e-05, |
|
"loss": 0.2549, |
|
"step": 462000 |
|
}, |
|
{ |
|
"epoch": 10.31, |
|
"eval_loss": 0.23718567192554474, |
|
"eval_runtime": 2.0107, |
|
"eval_samples_per_second": 1142.393, |
|
"eval_steps_per_second": 17.904, |
|
"step": 462000 |
|
}, |
|
{ |
|
"epoch": 10.32, |
|
"learning_rate": 1.4436961438797095e-05, |
|
"loss": 0.2547, |
|
"step": 462500 |
|
}, |
|
{ |
|
"epoch": 10.33, |
|
"learning_rate": 1.4320018416115206e-05, |
|
"loss": 0.2548, |
|
"step": 463000 |
|
}, |
|
{ |
|
"epoch": 10.33, |
|
"eval_loss": 0.23715750873088837, |
|
"eval_runtime": 2.0434, |
|
"eval_samples_per_second": 1124.132, |
|
"eval_steps_per_second": 17.618, |
|
"step": 463000 |
|
}, |
|
{ |
|
"epoch": 10.34, |
|
"learning_rate": 1.4204613847173003e-05, |
|
"loss": 0.2547, |
|
"step": 463500 |
|
}, |
|
{ |
|
"epoch": 10.35, |
|
"learning_rate": 1.4090748994016354e-05, |
|
"loss": 0.2547, |
|
"step": 464000 |
|
}, |
|
{ |
|
"epoch": 10.35, |
|
"eval_loss": 0.23930229246616364, |
|
"eval_runtime": 2.02, |
|
"eval_samples_per_second": 1137.119, |
|
"eval_steps_per_second": 17.822, |
|
"step": 464000 |
|
}, |
|
{ |
|
"epoch": 10.36, |
|
"learning_rate": 1.3978425101853049e-05, |
|
"loss": 0.2545, |
|
"step": 464500 |
|
}, |
|
{ |
|
"epoch": 10.37, |
|
"learning_rate": 1.3867643399039165e-05, |
|
"loss": 0.2546, |
|
"step": 465000 |
|
}, |
|
{ |
|
"epoch": 10.37, |
|
"eval_loss": 0.24023665487766266, |
|
"eval_runtime": 2.0161, |
|
"eval_samples_per_second": 1139.319, |
|
"eval_steps_per_second": 17.856, |
|
"step": 465000 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"learning_rate": 1.3758405097065648e-05, |
|
"loss": 0.2547, |
|
"step": 465500 |
|
}, |
|
{ |
|
"epoch": 10.39, |
|
"learning_rate": 1.3650711390545131e-05, |
|
"loss": 0.2549, |
|
"step": 466000 |
|
}, |
|
{ |
|
"epoch": 10.39, |
|
"eval_loss": 0.2383406162261963, |
|
"eval_runtime": 2.0372, |
|
"eval_samples_per_second": 1127.524, |
|
"eval_steps_per_second": 17.671, |
|
"step": 466000 |
|
}, |
|
{ |
|
"epoch": 10.41, |
|
"learning_rate": 1.3544563457198657e-05, |
|
"loss": 0.2546, |
|
"step": 466500 |
|
}, |
|
{ |
|
"epoch": 10.42, |
|
"learning_rate": 1.343996245784307e-05, |
|
"loss": 0.2545, |
|
"step": 467000 |
|
}, |
|
{ |
|
"epoch": 10.42, |
|
"eval_loss": 0.23841743171215057, |
|
"eval_runtime": 1.9888, |
|
"eval_samples_per_second": 1154.948, |
|
"eval_steps_per_second": 18.101, |
|
"step": 467000 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"learning_rate": 1.3336909536378107e-05, |
|
"loss": 0.2549, |
|
"step": 467500 |
|
}, |
|
{ |
|
"epoch": 10.44, |
|
"learning_rate": 1.3235405819774022e-05, |
|
"loss": 0.2544, |
|
"step": 468000 |
|
}, |
|
{ |
|
"epoch": 10.44, |
|
"eval_loss": 0.2374790459871292, |
|
"eval_runtime": 2.0476, |
|
"eval_samples_per_second": 1121.805, |
|
"eval_steps_per_second": 17.582, |
|
"step": 468000 |
|
}, |
|
{ |
|
"epoch": 10.45, |
|
"learning_rate": 1.3135452418059208e-05, |
|
"loss": 0.2543, |
|
"step": 468500 |
|
}, |
|
{ |
|
"epoch": 10.46, |
|
"learning_rate": 1.3037050424308027e-05, |
|
"loss": 0.2544, |
|
"step": 469000 |
|
}, |
|
{ |
|
"epoch": 10.46, |
|
"eval_loss": 0.2366662174463272, |
|
"eval_runtime": 2.0397, |
|
"eval_samples_per_second": 1126.142, |
|
"eval_steps_per_second": 17.65, |
|
"step": 469000 |
|
}, |
|
{ |
|
"epoch": 10.47, |
|
"learning_rate": 1.2940200914628945e-05, |
|
"loss": 0.2547, |
|
"step": 469500 |
|
}, |
|
{ |
|
"epoch": 10.48, |
|
"learning_rate": 1.2844904948152644e-05, |
|
"loss": 0.255, |
|
"step": 470000 |
|
}, |
|
{ |
|
"epoch": 10.48, |
|
"eval_loss": 0.23585031926631927, |
|
"eval_runtime": 1.91, |
|
"eval_samples_per_second": 1202.649, |
|
"eval_steps_per_second": 18.849, |
|
"step": 470000 |
|
}, |
|
{ |
|
"epoch": 10.49, |
|
"learning_rate": 1.2751163567020592e-05, |
|
"loss": 0.2544, |
|
"step": 470500 |
|
}, |
|
{ |
|
"epoch": 10.51, |
|
"learning_rate": 1.2658977796373478e-05, |
|
"loss": 0.2546, |
|
"step": 471000 |
|
}, |
|
{ |
|
"epoch": 10.51, |
|
"eval_loss": 0.23835012316703796, |
|
"eval_runtime": 2.0021, |
|
"eval_samples_per_second": 1147.276, |
|
"eval_steps_per_second": 17.981, |
|
"step": 471000 |
|
}, |
|
{ |
|
"epoch": 10.52, |
|
"learning_rate": 1.2568348644340153e-05, |
|
"loss": 0.2545, |
|
"step": 471500 |
|
}, |
|
{ |
|
"epoch": 10.53, |
|
"learning_rate": 1.2479277102026465e-05, |
|
"loss": 0.2544, |
|
"step": 472000 |
|
}, |
|
{ |
|
"epoch": 10.53, |
|
"eval_loss": 0.23700165748596191, |
|
"eval_runtime": 2.0158, |
|
"eval_samples_per_second": 1139.473, |
|
"eval_steps_per_second": 17.859, |
|
"step": 472000 |
|
}, |
|
{ |
|
"epoch": 10.54, |
|
"learning_rate": 1.2391764143504556e-05, |
|
"loss": 0.2542, |
|
"step": 472500 |
|
}, |
|
{ |
|
"epoch": 10.55, |
|
"learning_rate": 1.2305810725802118e-05, |
|
"loss": 0.254, |
|
"step": 473000 |
|
}, |
|
{ |
|
"epoch": 10.55, |
|
"eval_loss": 0.23847134411334991, |
|
"eval_runtime": 1.9758, |
|
"eval_samples_per_second": 1162.557, |
|
"eval_steps_per_second": 18.22, |
|
"step": 473000 |
|
}, |
|
{ |
|
"epoch": 10.56, |
|
"learning_rate": 1.222141778889195e-05, |
|
"loss": 0.2538, |
|
"step": 473500 |
|
}, |
|
{ |
|
"epoch": 10.57, |
|
"learning_rate": 1.2138586255681707e-05, |
|
"loss": 0.2539, |
|
"step": 474000 |
|
}, |
|
{ |
|
"epoch": 10.57, |
|
"eval_loss": 0.23491570353507996, |
|
"eval_runtime": 1.99, |
|
"eval_samples_per_second": 1154.271, |
|
"eval_steps_per_second": 18.09, |
|
"step": 474000 |
|
}, |
|
{ |
|
"epoch": 10.58, |
|
"learning_rate": 1.2057317032003731e-05, |
|
"loss": 0.2542, |
|
"step": 474500 |
|
}, |
|
{ |
|
"epoch": 10.6, |
|
"learning_rate": 1.1977611006605263e-05, |
|
"loss": 0.2549, |
|
"step": 475000 |
|
}, |
|
{ |
|
"epoch": 10.6, |
|
"eval_loss": 0.2358667552471161, |
|
"eval_runtime": 2.0083, |
|
"eval_samples_per_second": 1143.743, |
|
"eval_steps_per_second": 17.925, |
|
"step": 475000 |
|
}, |
|
{ |
|
"epoch": 10.61, |
|
"learning_rate": 1.1899469051138602e-05, |
|
"loss": 0.2543, |
|
"step": 475500 |
|
}, |
|
{ |
|
"epoch": 10.62, |
|
"learning_rate": 1.1822892020151667e-05, |
|
"loss": 0.2549, |
|
"step": 476000 |
|
}, |
|
{ |
|
"epoch": 10.62, |
|
"eval_loss": 0.2398059368133545, |
|
"eval_runtime": 2.007, |
|
"eval_samples_per_second": 1144.474, |
|
"eval_steps_per_second": 17.937, |
|
"step": 476000 |
|
}, |
|
{ |
|
"epoch": 10.63, |
|
"learning_rate": 1.1747880751078614e-05, |
|
"loss": 0.2554, |
|
"step": 476500 |
|
}, |
|
{ |
|
"epoch": 10.64, |
|
"learning_rate": 1.1674436064230637e-05, |
|
"loss": 0.2548, |
|
"step": 477000 |
|
}, |
|
{ |
|
"epoch": 10.64, |
|
"eval_loss": 0.23931536078453064, |
|
"eval_runtime": 1.9895, |
|
"eval_samples_per_second": 1154.534, |
|
"eval_steps_per_second": 18.095, |
|
"step": 477000 |
|
}, |
|
{ |
|
"epoch": 10.65, |
|
"learning_rate": 1.1602558762787069e-05, |
|
"loss": 0.2547, |
|
"step": 477500 |
|
}, |
|
{ |
|
"epoch": 10.66, |
|
"learning_rate": 1.1532249632786582e-05, |
|
"loss": 0.2542, |
|
"step": 478000 |
|
}, |
|
{ |
|
"epoch": 10.66, |
|
"eval_loss": 0.237422376871109, |
|
"eval_runtime": 2.0815, |
|
"eval_samples_per_second": 1103.524, |
|
"eval_steps_per_second": 17.295, |
|
"step": 478000 |
|
}, |
|
{ |
|
"epoch": 10.67, |
|
"learning_rate": 1.1463509443118552e-05, |
|
"loss": 0.2541, |
|
"step": 478500 |
|
}, |
|
{ |
|
"epoch": 10.68, |
|
"learning_rate": 1.1396338945514663e-05, |
|
"loss": 0.2543, |
|
"step": 479000 |
|
}, |
|
{ |
|
"epoch": 10.68, |
|
"eval_loss": 0.2378796935081482, |
|
"eval_runtime": 2.0025, |
|
"eval_samples_per_second": 1147.061, |
|
"eval_steps_per_second": 17.977, |
|
"step": 479000 |
|
}, |
|
{ |
|
"epoch": 10.7, |
|
"learning_rate": 1.133073887454072e-05, |
|
"loss": 0.2538, |
|
"step": 479500 |
|
}, |
|
{ |
|
"epoch": 10.71, |
|
"learning_rate": 1.1266709947588599e-05, |
|
"loss": 0.2539, |
|
"step": 480000 |
|
}, |
|
{ |
|
"epoch": 10.71, |
|
"eval_loss": 0.2366316020488739, |
|
"eval_runtime": 1.9626, |
|
"eval_samples_per_second": 1170.393, |
|
"eval_steps_per_second": 18.343, |
|
"step": 480000 |
|
}, |
|
{ |
|
"epoch": 10.72, |
|
"learning_rate": 1.1204252864868377e-05, |
|
"loss": 0.2538, |
|
"step": 480500 |
|
}, |
|
{ |
|
"epoch": 10.73, |
|
"learning_rate": 1.1143368309400725e-05, |
|
"loss": 0.2539, |
|
"step": 481000 |
|
}, |
|
{ |
|
"epoch": 10.73, |
|
"eval_loss": 0.23603801429271698, |
|
"eval_runtime": 1.9718, |
|
"eval_samples_per_second": 1164.912, |
|
"eval_steps_per_second": 18.257, |
|
"step": 481000 |
|
}, |
|
{ |
|
"epoch": 10.74, |
|
"learning_rate": 1.1084056947009348e-05, |
|
"loss": 0.2538, |
|
"step": 481500 |
|
}, |
|
{ |
|
"epoch": 10.75, |
|
"learning_rate": 1.1026319426313837e-05, |
|
"loss": 0.2538, |
|
"step": 482000 |
|
}, |
|
{ |
|
"epoch": 10.75, |
|
"eval_loss": 0.23858527839183807, |
|
"eval_runtime": 1.961, |
|
"eval_samples_per_second": 1171.312, |
|
"eval_steps_per_second": 18.358, |
|
"step": 482000 |
|
}, |
|
{ |
|
"epoch": 10.76, |
|
"learning_rate": 1.097015637872247e-05, |
|
"loss": 0.2538, |
|
"step": 482500 |
|
}, |
|
{ |
|
"epoch": 10.77, |
|
"learning_rate": 1.0915568418425301e-05, |
|
"loss": 0.2537, |
|
"step": 483000 |
|
}, |
|
{ |
|
"epoch": 10.77, |
|
"eval_loss": 0.23714858293533325, |
|
"eval_runtime": 2.009, |
|
"eval_samples_per_second": 1143.375, |
|
"eval_steps_per_second": 17.92, |
|
"step": 483000 |
|
}, |
|
{ |
|
"epoch": 10.78, |
|
"learning_rate": 1.0862556142387571e-05, |
|
"loss": 0.2539, |
|
"step": 483500 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"learning_rate": 1.081112013034298e-05, |
|
"loss": 0.2537, |
|
"step": 484000 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"eval_loss": 0.23877692222595215, |
|
"eval_runtime": 1.9856, |
|
"eval_samples_per_second": 1156.824, |
|
"eval_steps_per_second": 18.13, |
|
"step": 484000 |
|
}, |
|
{ |
|
"epoch": 10.81, |
|
"learning_rate": 1.0761260944787561e-05, |
|
"loss": 0.2551, |
|
"step": 484500 |
|
}, |
|
{ |
|
"epoch": 10.82, |
|
"learning_rate": 1.0712979130973347e-05, |
|
"loss": 0.2542, |
|
"step": 485000 |
|
}, |
|
{ |
|
"epoch": 10.82, |
|
"eval_loss": 0.23765695095062256, |
|
"eval_runtime": 1.9888, |
|
"eval_samples_per_second": 1154.965, |
|
"eval_steps_per_second": 18.101, |
|
"step": 485000 |
|
}, |
|
{ |
|
"epoch": 10.83, |
|
"learning_rate": 1.0666275216902535e-05, |
|
"loss": 0.2539, |
|
"step": 485500 |
|
}, |
|
{ |
|
"epoch": 10.84, |
|
"learning_rate": 1.0621149713321656e-05, |
|
"loss": 0.2539, |
|
"step": 486000 |
|
}, |
|
{ |
|
"epoch": 10.84, |
|
"eval_loss": 0.23621481657028198, |
|
"eval_runtime": 1.9428, |
|
"eval_samples_per_second": 1182.329, |
|
"eval_steps_per_second": 18.53, |
|
"step": 486000 |
|
}, |
|
{ |
|
"epoch": 10.85, |
|
"learning_rate": 1.0577603113715964e-05, |
|
"loss": 0.2539, |
|
"step": 486500 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"learning_rate": 1.0535635894304106e-05, |
|
"loss": 0.2535, |
|
"step": 487000 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"eval_loss": 0.23733575642108917, |
|
"eval_runtime": 1.9603, |
|
"eval_samples_per_second": 1171.773, |
|
"eval_steps_per_second": 18.365, |
|
"step": 487000 |
|
}, |
|
{ |
|
"epoch": 10.87, |
|
"learning_rate": 1.0495248514032875e-05, |
|
"loss": 0.2539, |
|
"step": 487500 |
|
}, |
|
{ |
|
"epoch": 10.89, |
|
"learning_rate": 1.045644141457218e-05, |
|
"loss": 0.2533, |
|
"step": 488000 |
|
}, |
|
{ |
|
"epoch": 10.89, |
|
"eval_loss": 0.23612964153289795, |
|
"eval_runtime": 1.9923, |
|
"eval_samples_per_second": 1152.93, |
|
"eval_steps_per_second": 18.069, |
|
"step": 488000 |
|
}, |
|
{ |
|
"epoch": 10.9, |
|
"learning_rate": 1.0419215020310254e-05, |
|
"loss": 0.2534, |
|
"step": 488500 |
|
}, |
|
{ |
|
"epoch": 10.91, |
|
"learning_rate": 1.0383569738348988e-05, |
|
"loss": 0.2533, |
|
"step": 489000 |
|
}, |
|
{ |
|
"epoch": 10.91, |
|
"eval_loss": 0.2368190884590149, |
|
"eval_runtime": 1.9507, |
|
"eval_samples_per_second": 1177.524, |
|
"eval_steps_per_second": 18.455, |
|
"step": 489000 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"learning_rate": 1.0349505958499436e-05, |
|
"loss": 0.2534, |
|
"step": 489500 |
|
}, |
|
{ |
|
"epoch": 10.93, |
|
"learning_rate": 1.0317024053277693e-05, |
|
"loss": 0.2535, |
|
"step": 490000 |
|
}, |
|
{ |
|
"epoch": 10.93, |
|
"eval_loss": 0.23948417603969574, |
|
"eval_runtime": 2.0351, |
|
"eval_samples_per_second": 1128.67, |
|
"eval_steps_per_second": 17.689, |
|
"step": 490000 |
|
}, |
|
{ |
|
"epoch": 10.94, |
|
"learning_rate": 1.0286124377900624e-05, |
|
"loss": 0.2541, |
|
"step": 490500 |
|
}, |
|
{ |
|
"epoch": 10.95, |
|
"learning_rate": 1.0256807270282153e-05, |
|
"loss": 0.2537, |
|
"step": 491000 |
|
}, |
|
{ |
|
"epoch": 10.95, |
|
"eval_loss": 0.23797546327114105, |
|
"eval_runtime": 1.9751, |
|
"eval_samples_per_second": 1162.982, |
|
"eval_steps_per_second": 18.227, |
|
"step": 491000 |
|
}, |
|
{ |
|
"epoch": 10.96, |
|
"learning_rate": 1.0229073051029455e-05, |
|
"loss": 0.2539, |
|
"step": 491500 |
|
}, |
|
{ |
|
"epoch": 10.97, |
|
"learning_rate": 1.020292202343952e-05, |
|
"loss": 0.254, |
|
"step": 492000 |
|
}, |
|
{ |
|
"epoch": 10.97, |
|
"eval_loss": 0.23647533357143402, |
|
"eval_runtime": 2.0099, |
|
"eval_samples_per_second": 1142.868, |
|
"eval_steps_per_second": 17.912, |
|
"step": 492000 |
|
}, |
|
{ |
|
"epoch": 10.99, |
|
"learning_rate": 1.0178354473495813e-05, |
|
"loss": 0.2539, |
|
"step": 492500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"learning_rate": 1.0155370669865077e-05, |
|
"loss": 0.254, |
|
"step": 493000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 0.2374097853899002, |
|
"eval_runtime": 1.9968, |
|
"eval_samples_per_second": 1150.347, |
|
"eval_steps_per_second": 18.029, |
|
"step": 493000 |
|
}, |
|
{ |
|
"epoch": 11.01, |
|
"learning_rate": 1.0133970863894557e-05, |
|
"loss": 0.2537, |
|
"step": 493500 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"learning_rate": 1.0114155289609061e-05, |
|
"loss": 0.2535, |
|
"step": 494000 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"eval_loss": 0.23701806366443634, |
|
"eval_runtime": 2.0405, |
|
"eval_samples_per_second": 1125.682, |
|
"eval_steps_per_second": 17.642, |
|
"step": 494000 |
|
}, |
|
{ |
|
"epoch": 11.03, |
|
"learning_rate": 1.0095924163708572e-05, |
|
"loss": 0.2542, |
|
"step": 494500 |
|
}, |
|
{ |
|
"epoch": 11.04, |
|
"learning_rate": 1.0079277685565724e-05, |
|
"loss": 0.2538, |
|
"step": 495000 |
|
}, |
|
{ |
|
"epoch": 11.04, |
|
"eval_loss": 0.23448336124420166, |
|
"eval_runtime": 1.9969, |
|
"eval_samples_per_second": 1150.279, |
|
"eval_steps_per_second": 18.028, |
|
"step": 495000 |
|
}, |
|
{ |
|
"epoch": 11.05, |
|
"learning_rate": 1.0064216037223772e-05, |
|
"loss": 0.2536, |
|
"step": 495500 |
|
}, |
|
{ |
|
"epoch": 11.06, |
|
"learning_rate": 1.0050739383394454e-05, |
|
"loss": 0.2539, |
|
"step": 496000 |
|
}, |
|
{ |
|
"epoch": 11.06, |
|
"eval_loss": 0.23559238016605377, |
|
"eval_runtime": 2.0311, |
|
"eval_samples_per_second": 1130.902, |
|
"eval_steps_per_second": 17.724, |
|
"step": 496000 |
|
}, |
|
{ |
|
"epoch": 11.07, |
|
"learning_rate": 1.003884787145633e-05, |
|
"loss": 0.2532, |
|
"step": 496500 |
|
}, |
|
{ |
|
"epoch": 11.09, |
|
"learning_rate": 1.002854163145305e-05, |
|
"loss": 0.2533, |
|
"step": 497000 |
|
}, |
|
{ |
|
"epoch": 11.09, |
|
"eval_loss": 0.23612073063850403, |
|
"eval_runtime": 2.0421, |
|
"eval_samples_per_second": 1124.803, |
|
"eval_steps_per_second": 17.629, |
|
"step": 497000 |
|
}, |
|
{ |
|
"epoch": 11.1, |
|
"learning_rate": 1.0019820776091995e-05, |
|
"loss": 0.2531, |
|
"step": 497500 |
|
}, |
|
{ |
|
"epoch": 11.11, |
|
"learning_rate": 1.0012685400743077e-05, |
|
"loss": 0.2533, |
|
"step": 498000 |
|
}, |
|
{ |
|
"epoch": 11.11, |
|
"eval_loss": 0.23812700808048248, |
|
"eval_runtime": 2.036, |
|
"eval_samples_per_second": 1128.176, |
|
"eval_steps_per_second": 17.681, |
|
"step": 498000 |
|
}, |
|
{ |
|
"epoch": 11.12, |
|
"learning_rate": 1.0007135583437572e-05, |
|
"loss": 0.2531, |
|
"step": 498500 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"learning_rate": 1.0003171384867436e-05, |
|
"loss": 0.2534, |
|
"step": 499000 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"eval_loss": 0.23714645206928253, |
|
"eval_runtime": 2.0313, |
|
"eval_samples_per_second": 1130.821, |
|
"eval_steps_per_second": 17.723, |
|
"step": 499000 |
|
}, |
|
{ |
|
"epoch": 11.14, |
|
"learning_rate": 1.0000792848384467e-05, |
|
"loss": 0.2535, |
|
"step": 499500 |
|
}, |
|
{ |
|
"epoch": 11.15, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2535, |
|
"step": 500000 |
|
}, |
|
{ |
|
"epoch": 11.15, |
|
"eval_loss": 0.23556514084339142, |
|
"eval_runtime": 1.9635, |
|
"eval_samples_per_second": 1169.857, |
|
"eval_steps_per_second": 18.335, |
|
"step": 500000 |
|
} |
|
], |
|
"max_steps": 500000, |
|
"num_train_epochs": 12, |
|
"total_flos": 1.5974043941849432e+22, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|