|
{ |
|
"best_metric": 0.5073133111000061, |
|
"best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/Qwen/Qwen1.5_1.8B_amazon/checkpoint-350", |
|
"epoch": 1.0, |
|
"eval_steps": 50, |
|
"global_step": 380, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 129.14089965820312, |
|
"learning_rate": 4.8684210526315795e-06, |
|
"loss": 7.3844, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 123.16605377197266, |
|
"learning_rate": 4.736842105263158e-06, |
|
"loss": 4.8281, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 73.3310546875, |
|
"learning_rate": 4.605263157894737e-06, |
|
"loss": 3.2703, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 79.52980041503906, |
|
"learning_rate": 4.473684210526316e-06, |
|
"loss": 2.0736, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 65.27238464355469, |
|
"learning_rate": 4.342105263157895e-06, |
|
"loss": 1.5641, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_accuracy": 0.647562582345191, |
|
"eval_f1_macro": 0.5588571972473421, |
|
"eval_f1_micro": 0.647562582345191, |
|
"eval_loss": 1.2904983758926392, |
|
"eval_runtime": 4.0513, |
|
"eval_samples_per_second": 374.697, |
|
"eval_steps_per_second": 11.848, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 50.00752258300781, |
|
"learning_rate": 4.210526315789474e-06, |
|
"loss": 1.1144, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 39.85783386230469, |
|
"learning_rate": 4.078947368421053e-06, |
|
"loss": 1.1287, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 62.46803665161133, |
|
"learning_rate": 3.947368421052632e-06, |
|
"loss": 0.8914, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 42.484432220458984, |
|
"learning_rate": 3.815789473684211e-06, |
|
"loss": 0.8281, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 48.54948043823242, |
|
"learning_rate": 3.6842105263157896e-06, |
|
"loss": 0.744, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_accuracy": 0.769433465085639, |
|
"eval_f1_macro": 0.716629286760697, |
|
"eval_f1_micro": 0.769433465085639, |
|
"eval_loss": 0.7993608117103577, |
|
"eval_runtime": 4.0452, |
|
"eval_samples_per_second": 375.256, |
|
"eval_steps_per_second": 11.866, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 47.28898239135742, |
|
"learning_rate": 3.5526315789473687e-06, |
|
"loss": 0.9268, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 49.59343338012695, |
|
"learning_rate": 3.421052631578948e-06, |
|
"loss": 0.7649, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 36.2591438293457, |
|
"learning_rate": 3.289473684210527e-06, |
|
"loss": 0.716, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 47.06060791015625, |
|
"learning_rate": 3.157894736842105e-06, |
|
"loss": 0.5508, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 45.960975646972656, |
|
"learning_rate": 3.0263157894736843e-06, |
|
"loss": 0.7245, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_accuracy": 0.7997364953886693, |
|
"eval_f1_macro": 0.744994161609646, |
|
"eval_f1_micro": 0.7997364953886693, |
|
"eval_loss": 0.6845870614051819, |
|
"eval_runtime": 4.0142, |
|
"eval_samples_per_second": 378.16, |
|
"eval_steps_per_second": 11.958, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 49.28917694091797, |
|
"learning_rate": 2.8947368421052634e-06, |
|
"loss": 0.6837, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 34.09211349487305, |
|
"learning_rate": 2.7631578947368424e-06, |
|
"loss": 0.6589, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 38.694705963134766, |
|
"learning_rate": 2.631578947368421e-06, |
|
"loss": 0.6702, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 64.92501068115234, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.7353, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 39.18191909790039, |
|
"learning_rate": 2.368421052631579e-06, |
|
"loss": 0.6491, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_accuracy": 0.8155467720685112, |
|
"eval_f1_macro": 0.7679142075529768, |
|
"eval_f1_micro": 0.8155467720685112, |
|
"eval_loss": 0.640599250793457, |
|
"eval_runtime": 4.0408, |
|
"eval_samples_per_second": 375.669, |
|
"eval_steps_per_second": 11.879, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 39.82322311401367, |
|
"learning_rate": 2.236842105263158e-06, |
|
"loss": 0.545, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 60.72475051879883, |
|
"learning_rate": 2.105263157894737e-06, |
|
"loss": 0.5595, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 42.45864486694336, |
|
"learning_rate": 1.973684210526316e-06, |
|
"loss": 0.6386, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 48.077884674072266, |
|
"learning_rate": 1.8421052631578948e-06, |
|
"loss": 0.649, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 48.596435546875, |
|
"learning_rate": 1.710526315789474e-06, |
|
"loss": 0.6193, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_accuracy": 0.8399209486166008, |
|
"eval_f1_macro": 0.7970428846372188, |
|
"eval_f1_micro": 0.8399209486166008, |
|
"eval_loss": 0.5427243113517761, |
|
"eval_runtime": 4.3118, |
|
"eval_samples_per_second": 352.055, |
|
"eval_steps_per_second": 11.132, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 45.50446319580078, |
|
"learning_rate": 1.5789473684210526e-06, |
|
"loss": 0.6343, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 40.66709518432617, |
|
"learning_rate": 1.4473684210526317e-06, |
|
"loss": 0.5493, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 50.266971588134766, |
|
"learning_rate": 1.3157894736842106e-06, |
|
"loss": 0.5177, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 41.21518325805664, |
|
"learning_rate": 1.1842105263157894e-06, |
|
"loss": 0.4751, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 35.079444885253906, |
|
"learning_rate": 1.0526315789473685e-06, |
|
"loss": 0.4828, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_accuracy": 0.836627140974967, |
|
"eval_f1_macro": 0.8114023231734937, |
|
"eval_f1_micro": 0.836627140974967, |
|
"eval_loss": 0.5453078746795654, |
|
"eval_runtime": 4.0339, |
|
"eval_samples_per_second": 376.309, |
|
"eval_steps_per_second": 11.899, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 51.024139404296875, |
|
"learning_rate": 9.210526315789474e-07, |
|
"loss": 0.4726, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 41.68681335449219, |
|
"learning_rate": 7.894736842105263e-07, |
|
"loss": 0.5492, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 31.459598541259766, |
|
"learning_rate": 6.578947368421053e-07, |
|
"loss": 0.6464, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 37.8900260925293, |
|
"learning_rate": 5.263157894736843e-07, |
|
"loss": 0.4235, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 80.35220336914062, |
|
"learning_rate": 3.9473684210526315e-07, |
|
"loss": 0.6122, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_accuracy": 0.852437417654809, |
|
"eval_f1_macro": 0.8289709215944268, |
|
"eval_f1_micro": 0.852437417654809, |
|
"eval_loss": 0.5073133111000061, |
|
"eval_runtime": 4.0684, |
|
"eval_samples_per_second": 373.116, |
|
"eval_steps_per_second": 11.798, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 35.06690979003906, |
|
"learning_rate": 2.6315789473684213e-07, |
|
"loss": 0.4292, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 46.45317077636719, |
|
"learning_rate": 1.3157894736842107e-07, |
|
"loss": 0.5656, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 42.95165252685547, |
|
"learning_rate": 0.0, |
|
"loss": 0.5355, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 380, |
|
"total_flos": 1.13424986013696e+16, |
|
"train_loss": 1.0740083393297697, |
|
"train_runtime": 415.974, |
|
"train_samples_per_second": 29.194, |
|
"train_steps_per_second": 0.914 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 380, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"total_flos": 1.13424986013696e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|