|
{ |
|
"best_metric": 0.03447870910167694, |
|
"best_model_checkpoint": "LLMNIDS-t5small-1/checkpoint-5043", |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 5043, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 13.822306632995605, |
|
"learning_rate": 1.6336633663366337e-05, |
|
"loss": 6.4708, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.269347190856934, |
|
"learning_rate": 3.306930693069307e-05, |
|
"loss": 2.544, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 13.6782865524292, |
|
"learning_rate": 4.9900990099009906e-05, |
|
"loss": 0.8408, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 10.320158004760742, |
|
"learning_rate": 4.813794623182018e-05, |
|
"loss": 0.5769, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.96146297454834, |
|
"learning_rate": 4.626487439400617e-05, |
|
"loss": 0.4061, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 2.3415141105651855, |
|
"learning_rate": 4.439180255619216e-05, |
|
"loss": 0.3239, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 5.0349297523498535, |
|
"learning_rate": 4.251873071837814e-05, |
|
"loss": 0.2748, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.790640354156494, |
|
"learning_rate": 4.064565888056412e-05, |
|
"loss": 0.1759, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 17.055940628051758, |
|
"learning_rate": 3.877258704275011e-05, |
|
"loss": 0.155, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_gen_len": 4.5983, |
|
"eval_loss": 0.07687737792730331, |
|
"eval_rouge1": 94.273, |
|
"eval_rouge2": 41.5009, |
|
"eval_rougeL": 94.2803, |
|
"eval_rougeLsum": 94.273, |
|
"eval_runtime": 31.2003, |
|
"eval_samples_per_second": 219.101, |
|
"eval_steps_per_second": 13.718, |
|
"step": 1681 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 3.797430992126465, |
|
"learning_rate": 3.68995152049361e-05, |
|
"loss": 0.1099, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.853536605834961, |
|
"learning_rate": 3.502644336712208e-05, |
|
"loss": 0.0839, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.6505563259124756, |
|
"learning_rate": 3.315337152930807e-05, |
|
"loss": 0.0958, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 4.214806079864502, |
|
"learning_rate": 3.128029969149405e-05, |
|
"loss": 0.0868, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 3.6818881034851074, |
|
"learning_rate": 2.9407227853680037e-05, |
|
"loss": 0.0754, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.5994019508361816, |
|
"learning_rate": 2.753415601586602e-05, |
|
"loss": 0.0736, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.05288717895746231, |
|
"learning_rate": 2.5661084178052008e-05, |
|
"loss": 0.0563, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.6832149028778076, |
|
"learning_rate": 2.3788012340237993e-05, |
|
"loss": 0.0681, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.5376071929931641, |
|
"learning_rate": 2.1914940502423974e-05, |
|
"loss": 0.0457, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.6747487783432007, |
|
"learning_rate": 2.004186866460996e-05, |
|
"loss": 0.0494, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_gen_len": 4.5459, |
|
"eval_loss": 0.038406919687986374, |
|
"eval_rouge1": 97.2352, |
|
"eval_rouge2": 43.4026, |
|
"eval_rougeL": 97.2572, |
|
"eval_rougeLsum": 97.2499, |
|
"eval_runtime": 31.2975, |
|
"eval_samples_per_second": 218.42, |
|
"eval_steps_per_second": 13.675, |
|
"step": 3362 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.16915227472782135, |
|
"learning_rate": 1.8168796826795948e-05, |
|
"loss": 0.0515, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 5.823632717132568, |
|
"learning_rate": 1.629572498898193e-05, |
|
"loss": 0.0487, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 4.443608283996582, |
|
"learning_rate": 1.4422653151167917e-05, |
|
"loss": 0.0459, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 2.1082069873809814, |
|
"learning_rate": 1.25495813133539e-05, |
|
"loss": 0.0416, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 3.4776525497436523, |
|
"learning_rate": 1.0676509475539887e-05, |
|
"loss": 0.0412, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.5169752240180969, |
|
"learning_rate": 8.803437637725872e-06, |
|
"loss": 0.0393, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 7.877155780792236, |
|
"learning_rate": 6.930365799911856e-06, |
|
"loss": 0.0425, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.5286906957626343, |
|
"learning_rate": 5.0572939620978405e-06, |
|
"loss": 0.0358, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 1.1692440509796143, |
|
"learning_rate": 3.1842221242838256e-06, |
|
"loss": 0.0434, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 1.6163593530654907, |
|
"learning_rate": 1.3111502864698105e-06, |
|
"loss": 0.0312, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_gen_len": 4.5563, |
|
"eval_loss": 0.03447870910167694, |
|
"eval_rouge1": 97.3376, |
|
"eval_rouge2": 43.622, |
|
"eval_rougeL": 97.3669, |
|
"eval_rougeLsum": 97.3596, |
|
"eval_runtime": 31.306, |
|
"eval_samples_per_second": 218.361, |
|
"eval_steps_per_second": 13.672, |
|
"step": 5043 |
|
} |
|
], |
|
"logging_steps": 170, |
|
"max_steps": 5043, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 1364346865188864.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|