{
  "best_metric": 0.5073133111000061,
  "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/Qwen/Qwen1.5_1.8B_amazon/checkpoint-350",
  "epoch": 1.0,
  "eval_steps": 50,
  "global_step": 380,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "grad_norm": 129.14089965820312,
      "learning_rate": 4.8684210526315795e-06,
      "loss": 7.3844,
      "step": 10
    },
    {
      "epoch": 0.05,
      "grad_norm": 123.16605377197266,
      "learning_rate": 4.736842105263158e-06,
      "loss": 4.8281,
      "step": 20
    },
    {
      "epoch": 0.08,
      "grad_norm": 73.3310546875,
      "learning_rate": 4.605263157894737e-06,
      "loss": 3.2703,
      "step": 30
    },
    {
      "epoch": 0.11,
      "grad_norm": 79.52980041503906,
      "learning_rate": 4.473684210526316e-06,
      "loss": 2.0736,
      "step": 40
    },
    {
      "epoch": 0.13,
      "grad_norm": 65.27238464355469,
      "learning_rate": 4.342105263157895e-06,
      "loss": 1.5641,
      "step": 50
    },
    {
      "epoch": 0.13,
      "eval_accuracy": 0.647562582345191,
      "eval_f1_macro": 0.5588571972473421,
      "eval_f1_micro": 0.647562582345191,
      "eval_loss": 1.2904983758926392,
      "eval_runtime": 4.0513,
      "eval_samples_per_second": 374.697,
      "eval_steps_per_second": 11.848,
      "step": 50
    },
    {
      "epoch": 0.16,
      "grad_norm": 50.00752258300781,
      "learning_rate": 4.210526315789474e-06,
      "loss": 1.1144,
      "step": 60
    },
    {
      "epoch": 0.18,
      "grad_norm": 39.85783386230469,
      "learning_rate": 4.078947368421053e-06,
      "loss": 1.1287,
      "step": 70
    },
    {
      "epoch": 0.21,
      "grad_norm": 62.46803665161133,
      "learning_rate": 3.947368421052632e-06,
      "loss": 0.8914,
      "step": 80
    },
    {
      "epoch": 0.24,
      "grad_norm": 42.484432220458984,
      "learning_rate": 3.815789473684211e-06,
      "loss": 0.8281,
      "step": 90
    },
    {
      "epoch": 0.26,
      "grad_norm": 48.54948043823242,
      "learning_rate": 3.6842105263157896e-06,
      "loss": 0.744,
      "step": 100
    },
    {
      "epoch": 0.26,
      "eval_accuracy": 0.769433465085639,
      "eval_f1_macro": 0.716629286760697,
      "eval_f1_micro": 0.769433465085639,
      "eval_loss": 0.7993608117103577,
      "eval_runtime": 4.0452,
      "eval_samples_per_second": 375.256,
      "eval_steps_per_second": 11.866,
      "step": 100
    },
    {
      "epoch": 0.29,
      "grad_norm": 47.28898239135742,
      "learning_rate": 3.5526315789473687e-06,
      "loss": 0.9268,
      "step": 110
    },
    {
      "epoch": 0.32,
      "grad_norm": 49.59343338012695,
      "learning_rate": 3.421052631578948e-06,
      "loss": 0.7649,
      "step": 120
    },
    {
      "epoch": 0.34,
      "grad_norm": 36.2591438293457,
      "learning_rate": 3.289473684210527e-06,
      "loss": 0.716,
      "step": 130
    },
    {
      "epoch": 0.37,
      "grad_norm": 47.06060791015625,
      "learning_rate": 3.157894736842105e-06,
      "loss": 0.5508,
      "step": 140
    },
    {
      "epoch": 0.39,
      "grad_norm": 45.960975646972656,
      "learning_rate": 3.0263157894736843e-06,
      "loss": 0.7245,
      "step": 150
    },
    {
      "epoch": 0.39,
      "eval_accuracy": 0.7997364953886693,
      "eval_f1_macro": 0.744994161609646,
      "eval_f1_micro": 0.7997364953886693,
      "eval_loss": 0.6845870614051819,
      "eval_runtime": 4.0142,
      "eval_samples_per_second": 378.16,
      "eval_steps_per_second": 11.958,
      "step": 150
    },
    {
      "epoch": 0.42,
      "grad_norm": 49.28917694091797,
      "learning_rate": 2.8947368421052634e-06,
      "loss": 0.6837,
      "step": 160
    },
    {
      "epoch": 0.45,
      "grad_norm": 34.09211349487305,
      "learning_rate": 2.7631578947368424e-06,
      "loss": 0.6589,
      "step": 170
    },
    {
      "epoch": 0.47,
      "grad_norm": 38.694705963134766,
      "learning_rate": 2.631578947368421e-06,
      "loss": 0.6702,
      "step": 180
    },
    {
      "epoch": 0.5,
      "grad_norm": 64.92501068115234,
      "learning_rate": 2.5e-06,
      "loss": 0.7353,
      "step": 190
    },
    {
      "epoch": 0.53,
      "grad_norm": 39.18191909790039,
      "learning_rate": 2.368421052631579e-06,
      "loss": 0.6491,
      "step": 200
    },
    {
      "epoch": 0.53,
      "eval_accuracy": 0.8155467720685112,
      "eval_f1_macro": 0.7679142075529768,
      "eval_f1_micro": 0.8155467720685112,
      "eval_loss": 0.640599250793457,
      "eval_runtime": 4.0408,
      "eval_samples_per_second": 375.669,
      "eval_steps_per_second": 11.879,
      "step": 200
    },
    {
      "epoch": 0.55,
      "grad_norm": 39.82322311401367,
      "learning_rate": 2.236842105263158e-06,
      "loss": 0.545,
      "step": 210
    },
    {
      "epoch": 0.58,
      "grad_norm": 60.72475051879883,
      "learning_rate": 2.105263157894737e-06,
      "loss": 0.5595,
      "step": 220
    },
    {
      "epoch": 0.61,
      "grad_norm": 42.45864486694336,
      "learning_rate": 1.973684210526316e-06,
      "loss": 0.6386,
      "step": 230
    },
    {
      "epoch": 0.63,
      "grad_norm": 48.077884674072266,
      "learning_rate": 1.8421052631578948e-06,
      "loss": 0.649,
      "step": 240
    },
    {
      "epoch": 0.66,
      "grad_norm": 48.596435546875,
      "learning_rate": 1.710526315789474e-06,
      "loss": 0.6193,
      "step": 250
    },
    {
      "epoch": 0.66,
      "eval_accuracy": 0.8399209486166008,
      "eval_f1_macro": 0.7970428846372188,
      "eval_f1_micro": 0.8399209486166008,
      "eval_loss": 0.5427243113517761,
      "eval_runtime": 4.3118,
      "eval_samples_per_second": 352.055,
      "eval_steps_per_second": 11.132,
      "step": 250
    },
    {
      "epoch": 0.68,
      "grad_norm": 45.50446319580078,
      "learning_rate": 1.5789473684210526e-06,
      "loss": 0.6343,
      "step": 260
    },
    {
      "epoch": 0.71,
      "grad_norm": 40.66709518432617,
      "learning_rate": 1.4473684210526317e-06,
      "loss": 0.5493,
      "step": 270
    },
    {
      "epoch": 0.74,
      "grad_norm": 50.266971588134766,
      "learning_rate": 1.3157894736842106e-06,
      "loss": 0.5177,
      "step": 280
    },
    {
      "epoch": 0.76,
      "grad_norm": 41.21518325805664,
      "learning_rate": 1.1842105263157894e-06,
      "loss": 0.4751,
      "step": 290
    },
    {
      "epoch": 0.79,
      "grad_norm": 35.079444885253906,
      "learning_rate": 1.0526315789473685e-06,
      "loss": 0.4828,
      "step": 300
    },
    {
      "epoch": 0.79,
      "eval_accuracy": 0.836627140974967,
      "eval_f1_macro": 0.8114023231734937,
      "eval_f1_micro": 0.836627140974967,
      "eval_loss": 0.5453078746795654,
      "eval_runtime": 4.0339,
      "eval_samples_per_second": 376.309,
      "eval_steps_per_second": 11.899,
      "step": 300
    },
    {
      "epoch": 0.82,
      "grad_norm": 51.024139404296875,
      "learning_rate": 9.210526315789474e-07,
      "loss": 0.4726,
      "step": 310
    },
    {
      "epoch": 0.84,
      "grad_norm": 41.68681335449219,
      "learning_rate": 7.894736842105263e-07,
      "loss": 0.5492,
      "step": 320
    },
    {
      "epoch": 0.87,
      "grad_norm": 31.459598541259766,
      "learning_rate": 6.578947368421053e-07,
      "loss": 0.6464,
      "step": 330
    },
    {
      "epoch": 0.89,
      "grad_norm": 37.8900260925293,
      "learning_rate": 5.263157894736843e-07,
      "loss": 0.4235,
      "step": 340
    },
    {
      "epoch": 0.92,
      "grad_norm": 80.35220336914062,
      "learning_rate": 3.9473684210526315e-07,
      "loss": 0.6122,
      "step": 350
    },
    {
      "epoch": 0.92,
      "eval_accuracy": 0.852437417654809,
      "eval_f1_macro": 0.8289709215944268,
      "eval_f1_micro": 0.852437417654809,
      "eval_loss": 0.5073133111000061,
      "eval_runtime": 4.0684,
      "eval_samples_per_second": 373.116,
      "eval_steps_per_second": 11.798,
      "step": 350
    },
    {
      "epoch": 0.95,
      "grad_norm": 35.06690979003906,
      "learning_rate": 2.6315789473684213e-07,
      "loss": 0.4292,
      "step": 360
    },
    {
      "epoch": 0.97,
      "grad_norm": 46.45317077636719,
      "learning_rate": 1.3157894736842107e-07,
      "loss": 0.5656,
      "step": 370
    },
    {
      "epoch": 1.0,
      "grad_norm": 42.95165252685547,
      "learning_rate": 0.0,
      "loss": 0.5355,
      "step": 380
    },
    {
      "epoch": 1.0,
      "step": 380,
      "total_flos": 1.13424986013696e+16,
      "train_loss": 1.0740083393297697,
      "train_runtime": 415.974,
      "train_samples_per_second": 29.194,
      "train_steps_per_second": 0.914
    }
  ],
  "logging_steps": 10,
  "max_steps": 380,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "total_flos": 1.13424986013696e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}