|
{ |
|
"best_metric": 0.020906077697873116, |
|
"best_model_checkpoint": "LLMNIDS-t5base-1/checkpoint-5043", |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 5043, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 6.331108093261719, |
|
"learning_rate": 1.6336633663366337e-05, |
|
"loss": 5.5187, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.446777820587158, |
|
"learning_rate": 3.3168316831683175e-05, |
|
"loss": 0.8703, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.2873497009277344, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3904, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 5.13348913192749, |
|
"learning_rate": 4.813794623182018e-05, |
|
"loss": 0.1922, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.8617521524429321, |
|
"learning_rate": 4.626487439400617e-05, |
|
"loss": 0.1091, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.5739182829856873, |
|
"learning_rate": 4.439180255619216e-05, |
|
"loss": 0.0807, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.6188409328460693, |
|
"learning_rate": 4.251873071837814e-05, |
|
"loss": 0.0719, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.20058074593544006, |
|
"learning_rate": 4.064565888056412e-05, |
|
"loss": 0.0449, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.7106176614761353, |
|
"learning_rate": 3.877258704275011e-05, |
|
"loss": 0.0512, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_gen_len": 4.5944, |
|
"eval_loss": 0.044217173010110855, |
|
"eval_rouge1": 96.4965, |
|
"eval_rouge2": 43.8853, |
|
"eval_rougeL": 96.5111, |
|
"eval_rougeLsum": 96.4892, |
|
"eval_runtime": 56.362, |
|
"eval_samples_per_second": 121.287, |
|
"eval_steps_per_second": 7.594, |
|
"step": 1681 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.03131448104977608, |
|
"learning_rate": 3.68995152049361e-05, |
|
"loss": 0.0372, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.7714371085166931, |
|
"learning_rate": 3.502644336712208e-05, |
|
"loss": 0.035, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.47366881370544434, |
|
"learning_rate": 3.316438959894227e-05, |
|
"loss": 0.0305, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 1.289427399635315, |
|
"learning_rate": 3.129131776112825e-05, |
|
"loss": 0.0295, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 1.7327721118927002, |
|
"learning_rate": 2.9440282062582637e-05, |
|
"loss": 0.0358, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.13644327223300934, |
|
"learning_rate": 2.756721022476862e-05, |
|
"loss": 0.0283, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.032617200165987015, |
|
"learning_rate": 2.5694138386954607e-05, |
|
"loss": 0.0213, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.21351170539855957, |
|
"learning_rate": 2.3821066549140592e-05, |
|
"loss": 0.0268, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.03627165034413338, |
|
"learning_rate": 2.1947994711326578e-05, |
|
"loss": 0.0196, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.08273334801197052, |
|
"learning_rate": 2.0074922873512563e-05, |
|
"loss": 0.0153, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_gen_len": 4.559, |
|
"eval_loss": 0.026822404935956, |
|
"eval_rouge1": 98.2592, |
|
"eval_rouge2": 44.3827, |
|
"eval_rougeL": 98.2592, |
|
"eval_rougeLsum": 98.2665, |
|
"eval_runtime": 55.7197, |
|
"eval_samples_per_second": 122.686, |
|
"eval_steps_per_second": 7.681, |
|
"step": 3362 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.009718131273984909, |
|
"learning_rate": 1.8201851035698548e-05, |
|
"loss": 0.0173, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.4162699282169342, |
|
"learning_rate": 1.632877919788453e-05, |
|
"loss": 0.0202, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.6483927965164185, |
|
"learning_rate": 1.4455707360070516e-05, |
|
"loss": 0.0166, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.029726264998316765, |
|
"learning_rate": 1.25826355222565e-05, |
|
"loss": 0.0138, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.008093880489468575, |
|
"learning_rate": 1.0709563684442487e-05, |
|
"loss": 0.0124, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.06707121431827545, |
|
"learning_rate": 8.836491846628472e-06, |
|
"loss": 0.0193, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.49983274936676025, |
|
"learning_rate": 6.963420008814456e-06, |
|
"loss": 0.0142, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.009119812399148941, |
|
"learning_rate": 5.090348171000441e-06, |
|
"loss": 0.0118, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.155787855386734, |
|
"learning_rate": 3.217276333186426e-06, |
|
"loss": 0.0235, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.08707331120967865, |
|
"learning_rate": 1.3442044953724108e-06, |
|
"loss": 0.0135, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_gen_len": 4.5541, |
|
"eval_loss": 0.020906077697873116, |
|
"eval_rouge1": 98.3836, |
|
"eval_rouge2": 44.4266, |
|
"eval_rougeL": 98.3836, |
|
"eval_rougeLsum": 98.3909, |
|
"eval_runtime": 55.8601, |
|
"eval_samples_per_second": 122.377, |
|
"eval_steps_per_second": 7.662, |
|
"step": 5043 |
|
} |
|
], |
|
"logging_steps": 170, |
|
"max_steps": 5043, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 6138752255262720.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|