{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 397,
  "global_step": 9536,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.17,
      "eval_gen_len": 83.5593,
      "eval_loss": 1.6528608798980713,
      "eval_rouge1": 50.4957,
      "eval_rouge2": 32.5323,
      "eval_rougeL": 40.7567,
      "eval_rougeLsum": 40.5733,
      "eval_runtime": 41.984,
      "eval_samples_per_second": 1.405,
      "eval_steps_per_second": 0.715,
      "step": 397
    },
    {
      "epoch": 0.21,
      "grad_norm": 8.36017894744873,
      "learning_rate": 1.9404362416107384e-05,
      "loss": 2.7418,
      "step": 500
    },
    {
      "epoch": 0.33,
      "eval_gen_len": 83.3729,
      "eval_loss": 1.3848459720611572,
      "eval_rouge1": 49.9993,
      "eval_rouge2": 31.5422,
      "eval_rougeL": 40.7043,
      "eval_rougeLsum": 40.6632,
      "eval_runtime": 41.848,
      "eval_samples_per_second": 1.41,
      "eval_steps_per_second": 0.717,
      "step": 794
    },
    {
      "epoch": 0.42,
      "grad_norm": 10.013521194458008,
      "learning_rate": 1.880512943432407e-05,
      "loss": 1.6117,
      "step": 1000
    },
    {
      "epoch": 0.5,
      "eval_gen_len": 84.8983,
      "eval_loss": 1.3274192810058594,
      "eval_rouge1": 50.0655,
      "eval_rouge2": 31.4638,
      "eval_rougeL": 40.2184,
      "eval_rougeLsum": 39.9987,
      "eval_runtime": 42.1731,
      "eval_samples_per_second": 1.399,
      "eval_steps_per_second": 0.711,
      "step": 1191
    },
    {
      "epoch": 0.63,
      "grad_norm": 9.272841453552246,
      "learning_rate": 1.820589645254075e-05,
      "loss": 1.4861,
      "step": 1500
    },
    {
      "epoch": 0.67,
      "eval_gen_len": 87.1864,
      "eval_loss": 1.3262691497802734,
      "eval_rouge1": 51.2154,
      "eval_rouge2": 33.6289,
      "eval_rougeL": 41.9642,
      "eval_rougeLsum": 41.7649,
      "eval_runtime": 43.0575,
      "eval_samples_per_second": 1.37,
      "eval_steps_per_second": 0.697,
      "step": 1588
    },
    {
      "epoch": 0.83,
      "eval_gen_len": 85.9661,
      "eval_loss": 1.2881355285644531,
      "eval_rouge1": 52.2072,
      "eval_rouge2": 34.2681,
      "eval_rougeL": 42.7582,
      "eval_rougeLsum": 42.5683,
      "eval_runtime": 42.4338,
      "eval_samples_per_second": 1.39,
      "eval_steps_per_second": 0.707,
      "step": 1985
    },
    {
      "epoch": 0.84,
      "grad_norm": 10.048806190490723,
      "learning_rate": 1.7609060402684567e-05,
      "loss": 1.495,
      "step": 2000
    },
    {
      "epoch": 1.0,
      "eval_gen_len": 80.1864,
      "eval_loss": 1.2640005350112915,
      "eval_rouge1": 52.1344,
      "eval_rouge2": 34.3518,
      "eval_rougeL": 42.9145,
      "eval_rougeLsum": 42.7837,
      "eval_runtime": 40.643,
      "eval_samples_per_second": 1.452,
      "eval_steps_per_second": 0.738,
      "step": 2382
    },
    {
      "epoch": 1.05,
      "grad_norm": 9.145220756530762,
      "learning_rate": 1.7009827420901247e-05,
      "loss": 1.4292,
      "step": 2500
    },
    {
      "epoch": 1.17,
      "eval_gen_len": 83.5593,
      "eval_loss": 1.2814366817474365,
      "eval_rouge1": 51.9388,
      "eval_rouge2": 33.6073,
      "eval_rougeL": 41.9771,
      "eval_rougeLsum": 41.8638,
      "eval_runtime": 41.9785,
      "eval_samples_per_second": 1.405,
      "eval_steps_per_second": 0.715,
      "step": 2779
    },
    {
      "epoch": 1.26,
      "grad_norm": 9.626166343688965,
      "learning_rate": 1.641059443911793e-05,
      "loss": 1.2572,
      "step": 3000
    },
    {
      "epoch": 1.33,
      "eval_gen_len": 81.7458,
      "eval_loss": 1.3041572570800781,
      "eval_rouge1": 52.685,
      "eval_rouge2": 34.8664,
      "eval_rougeL": 43.247,
      "eval_rougeLsum": 43.2174,
      "eval_runtime": 40.4041,
      "eval_samples_per_second": 1.46,
      "eval_steps_per_second": 0.742,
      "step": 3176
    },
    {
      "epoch": 1.47,
      "grad_norm": 8.85732364654541,
      "learning_rate": 1.5811361457334612e-05,
      "loss": 1.2858,
      "step": 3500
    },
    {
      "epoch": 1.5,
      "eval_gen_len": 83.4915,
      "eval_loss": 1.250982403755188,
      "eval_rouge1": 53.1395,
      "eval_rouge2": 35.0366,
      "eval_rougeL": 44.0336,
      "eval_rougeLsum": 43.8277,
      "eval_runtime": 41.066,
      "eval_samples_per_second": 1.437,
      "eval_steps_per_second": 0.731,
      "step": 3573
    },
    {
      "epoch": 1.67,
      "eval_gen_len": 85.7797,
      "eval_loss": 1.2450958490371704,
      "eval_rouge1": 53.2435,
      "eval_rouge2": 34.0265,
      "eval_rougeL": 43.1606,
      "eval_rougeLsum": 42.9125,
      "eval_runtime": 42.4863,
      "eval_samples_per_second": 1.389,
      "eval_steps_per_second": 0.706,
      "step": 3970
    },
    {
      "epoch": 1.68,
      "grad_norm": 9.060718536376953,
      "learning_rate": 1.5212128475551296e-05,
      "loss": 1.2632,
      "step": 4000
    },
    {
      "epoch": 1.83,
      "eval_gen_len": 84.0678,
      "eval_loss": 1.2505569458007812,
      "eval_rouge1": 52.9033,
      "eval_rouge2": 34.6637,
      "eval_rougeL": 43.0146,
      "eval_rougeLsum": 42.8985,
      "eval_runtime": 42.1089,
      "eval_samples_per_second": 1.401,
      "eval_steps_per_second": 0.712,
      "step": 4367
    },
    {
      "epoch": 1.89,
      "grad_norm": 7.555502414703369,
      "learning_rate": 1.4612895493767978e-05,
      "loss": 1.2367,
      "step": 4500
    },
    {
      "epoch": 2.0,
      "eval_gen_len": 82.322,
      "eval_loss": 1.2485252618789673,
      "eval_rouge1": 50.1387,
      "eval_rouge2": 31.1201,
      "eval_rougeL": 40.0786,
      "eval_rougeLsum": 40.1657,
      "eval_runtime": 41.189,
      "eval_samples_per_second": 1.432,
      "eval_steps_per_second": 0.728,
      "step": 4764
    },
    {
      "epoch": 2.1,
      "grad_norm": 7.1890788078308105,
      "learning_rate": 1.401366251198466e-05,
      "loss": 1.1512,
      "step": 5000
    },
    {
      "epoch": 2.16,
      "eval_gen_len": 82.9322,
      "eval_loss": 1.261144757270813,
      "eval_rouge1": 52.7072,
      "eval_rouge2": 34.6442,
      "eval_rougeL": 43.2377,
      "eval_rougeLsum": 43.1384,
      "eval_runtime": 41.2591,
      "eval_samples_per_second": 1.43,
      "eval_steps_per_second": 0.727,
      "step": 5161
    },
    {
      "epoch": 2.31,
      "grad_norm": 7.86561918258667,
      "learning_rate": 1.341562799616491e-05,
      "loss": 1.0728,
      "step": 5500
    },
    {
      "epoch": 2.33,
      "eval_gen_len": 86.4237,
      "eval_loss": 1.2699768543243408,
      "eval_rouge1": 52.3383,
      "eval_rouge2": 34.7756,
      "eval_rougeL": 42.9406,
      "eval_rougeLsum": 42.7658,
      "eval_runtime": 42.4715,
      "eval_samples_per_second": 1.389,
      "eval_steps_per_second": 0.706,
      "step": 5558
    },
    {
      "epoch": 2.5,
      "eval_gen_len": 84.3051,
      "eval_loss": 1.2631828784942627,
      "eval_rouge1": 52.8233,
      "eval_rouge2": 35.1768,
      "eval_rougeL": 43.8642,
      "eval_rougeLsum": 43.7259,
      "eval_runtime": 41.727,
      "eval_samples_per_second": 1.414,
      "eval_steps_per_second": 0.719,
      "step": 5955
    },
    {
      "epoch": 2.52,
      "grad_norm": 7.868692398071289,
      "learning_rate": 1.2816395014381592e-05,
      "loss": 1.0826,
      "step": 6000
    },
    {
      "epoch": 2.66,
      "eval_gen_len": 82.8644,
      "eval_loss": 1.2638760805130005,
      "eval_rouge1": 53.9367,
      "eval_rouge2": 36.2676,
      "eval_rougeL": 44.9414,
      "eval_rougeLsum": 44.7603,
      "eval_runtime": 42.7574,
      "eval_samples_per_second": 1.38,
      "eval_steps_per_second": 0.702,
      "step": 6352
    },
    {
      "epoch": 2.73,
      "grad_norm": 7.449892997741699,
      "learning_rate": 1.2217162032598275e-05,
      "loss": 1.0921,
      "step": 6500
    },
    {
      "epoch": 2.83,
      "eval_gen_len": 82.4237,
      "eval_loss": 1.2491506338119507,
      "eval_rouge1": 52.8146,
      "eval_rouge2": 34.6392,
      "eval_rougeL": 43.5323,
      "eval_rougeLsum": 43.4647,
      "eval_runtime": 42.5398,
      "eval_samples_per_second": 1.387,
      "eval_steps_per_second": 0.705,
      "step": 6749
    },
    {
      "epoch": 2.94,
      "grad_norm": 7.139917850494385,
      "learning_rate": 1.1617929050814957e-05,
      "loss": 1.1129,
      "step": 7000
    },
    {
      "epoch": 3.0,
      "eval_gen_len": 83.1356,
      "eval_loss": 1.2625495195388794,
      "eval_rouge1": 53.6493,
      "eval_rouge2": 35.0396,
      "eval_rougeL": 43.501,
      "eval_rougeLsum": 43.4039,
      "eval_runtime": 43.1051,
      "eval_samples_per_second": 1.369,
      "eval_steps_per_second": 0.696,
      "step": 7146
    },
    {
      "epoch": 3.15,
      "grad_norm": 5.8409600257873535,
      "learning_rate": 1.1018696069031641e-05,
      "loss": 0.9783,
      "step": 7500
    },
    {
      "epoch": 3.16,
      "eval_gen_len": 84.7797,
      "eval_loss": 1.293487787246704,
      "eval_rouge1": 53.245,
      "eval_rouge2": 35.655,
      "eval_rougeL": 44.4306,
      "eval_rougeLsum": 44.482,
      "eval_runtime": 41.7791,
      "eval_samples_per_second": 1.412,
      "eval_steps_per_second": 0.718,
      "step": 7543
    },
    {
      "epoch": 3.33,
      "eval_gen_len": 84.1186,
      "eval_loss": 1.266953706741333,
      "eval_rouge1": 52.146,
      "eval_rouge2": 33.0632,
      "eval_rougeL": 41.4382,
      "eval_rougeLsum": 41.5159,
      "eval_runtime": 41.1238,
      "eval_samples_per_second": 1.435,
      "eval_steps_per_second": 0.73,
      "step": 7940
    },
    {
      "epoch": 3.36,
      "grad_norm": 8.737879753112793,
      "learning_rate": 1.0419463087248323e-05,
      "loss": 0.9771,
      "step": 8000
    },
    {
      "epoch": 3.5,
      "eval_gen_len": 82.8475,
      "eval_loss": 1.275550127029419,
      "eval_rouge1": 51.7108,
      "eval_rouge2": 33.5352,
      "eval_rougeL": 42.4153,
      "eval_rougeLsum": 42.4572,
      "eval_runtime": 41.088,
      "eval_samples_per_second": 1.436,
      "eval_steps_per_second": 0.73,
      "step": 8337
    },
    {
      "epoch": 3.57,
      "grad_norm": 8.45171070098877,
      "learning_rate": 9.820230105465006e-06,
      "loss": 0.9841,
      "step": 8500
    },
    {
      "epoch": 3.66,
      "eval_gen_len": 84.322,
      "eval_loss": 1.260237455368042,
      "eval_rouge1": 53.2394,
      "eval_rouge2": 34.9695,
      "eval_rougeL": 43.2182,
      "eval_rougeLsum": 43.1333,
      "eval_runtime": 41.5567,
      "eval_samples_per_second": 1.42,
      "eval_steps_per_second": 0.722,
      "step": 8734
    },
    {
      "epoch": 3.78,
      "grad_norm": 47.454078674316406,
      "learning_rate": 9.220997123681688e-06,
      "loss": 0.9643,
      "step": 9000
    },
    {
      "epoch": 3.83,
      "eval_gen_len": 81.4915,
      "eval_loss": 1.27409827709198,
      "eval_rouge1": 53.5588,
      "eval_rouge2": 36.0425,
      "eval_rougeL": 44.2044,
      "eval_rougeLsum": 44.2287,
      "eval_runtime": 40.5566,
      "eval_samples_per_second": 1.455,
      "eval_steps_per_second": 0.74,
      "step": 9131
    },
    {
      "epoch": 3.98,
      "grad_norm": 7.34140157699585,
      "learning_rate": 8.62176414189837e-06,
      "loss": 0.9439,
      "step": 9500
    },
    {
      "epoch": 4.0,
      "eval_gen_len": 86.1864,
      "eval_loss": 1.2641756534576416,
      "eval_rouge1": 53.7305,
      "eval_rouge2": 35.3844,
      "eval_rougeL": 43.8211,
      "eval_rougeLsum": 43.7597,
      "eval_runtime": 42.1877,
      "eval_samples_per_second": 1.399,
      "eval_steps_per_second": 0.711,
      "step": 9528
    }
  ],
  "logging_steps": 500,
  "max_steps": 16688,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 7,
  "save_steps": 1192,
  "total_flos": 2.071244574793728e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}