SLM_vs_LLM_experiments
/
distilbert
/distilbert_base_uncased_amazon
/checkpoint-550
/trainer_state.json
{ | |
"best_metric": 0.9129917025566101, | |
"best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/distilbert/distilbert_base_uncased_amazon/checkpoint-550", | |
"epoch": 2.8947368421052633, | |
"eval_steps": 50, | |
"global_step": 550, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"epoch": 0.05, | |
"grad_norm": 1.327728271484375, | |
"learning_rate": 1.9649122807017544e-05, | |
"loss": 3.109, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.11, | |
"grad_norm": 1.7082853317260742, | |
"learning_rate": 1.929824561403509e-05, | |
"loss": 3.048, | |
"step": 20 | |
}, | |
{ | |
"epoch": 0.16, | |
"grad_norm": 1.893571138381958, | |
"learning_rate": 1.894736842105263e-05, | |
"loss": 2.9327, | |
"step": 30 | |
}, | |
{ | |
"epoch": 0.21, | |
"grad_norm": 1.9520453214645386, | |
"learning_rate": 1.8596491228070176e-05, | |
"loss": 2.7781, | |
"step": 40 | |
}, | |
{ | |
"epoch": 0.26, | |
"grad_norm": 2.1847422122955322, | |
"learning_rate": 1.824561403508772e-05, | |
"loss": 2.6322, | |
"step": 50 | |
}, | |
{ | |
"epoch": 0.26, | |
"eval_accuracy": 0.4749670619235837, | |
"eval_f1_macro": 0.3209193957177527, | |
"eval_f1_micro": 0.4749670619235837, | |
"eval_loss": 2.519139528274536, | |
"eval_runtime": 0.7577, | |
"eval_samples_per_second": 2003.426, | |
"eval_steps_per_second": 31.675, | |
"step": 50 | |
}, | |
{ | |
"epoch": 0.32, | |
"grad_norm": 2.446019411087036, | |
"learning_rate": 1.7894736842105264e-05, | |
"loss": 2.4648, | |
"step": 60 | |
}, | |
{ | |
"epoch": 0.37, | |
"grad_norm": 2.346569538116455, | |
"learning_rate": 1.754385964912281e-05, | |
"loss": 2.2919, | |
"step": 70 | |
}, | |
{ | |
"epoch": 0.42, | |
"grad_norm": 2.682988405227661, | |
"learning_rate": 1.719298245614035e-05, | |
"loss": 2.1292, | |
"step": 80 | |
}, | |
{ | |
"epoch": 0.47, | |
"grad_norm": 2.50565242767334, | |
"learning_rate": 1.6842105263157896e-05, | |
"loss": 2.0467, | |
"step": 90 | |
}, | |
{ | |
"epoch": 0.53, | |
"grad_norm": 2.4023382663726807, | |
"learning_rate": 1.649122807017544e-05, | |
"loss": 1.9044, | |
"step": 100 | |
}, | |
{ | |
"epoch": 0.53, | |
"eval_accuracy": 0.6014492753623188, | |
"eval_f1_macro": 0.4625900098895629, | |
"eval_f1_micro": 0.6014492753623188, | |
"eval_loss": 1.8323251008987427, | |
"eval_runtime": 0.7593, | |
"eval_samples_per_second": 1999.241, | |
"eval_steps_per_second": 31.609, | |
"step": 100 | |
}, | |
{ | |
"epoch": 0.58, | |
"grad_norm": 2.8123605251312256, | |
"learning_rate": 1.6140350877192984e-05, | |
"loss": 1.7838, | |
"step": 110 | |
}, | |
{ | |
"epoch": 0.63, | |
"grad_norm": 2.5916481018066406, | |
"learning_rate": 1.578947368421053e-05, | |
"loss": 1.7117, | |
"step": 120 | |
}, | |
{ | |
"epoch": 0.68, | |
"grad_norm": 3.583634853363037, | |
"learning_rate": 1.543859649122807e-05, | |
"loss": 1.6639, | |
"step": 130 | |
}, | |
{ | |
"epoch": 0.74, | |
"grad_norm": 3.1771187782287598, | |
"learning_rate": 1.5087719298245615e-05, | |
"loss": 1.6064, | |
"step": 140 | |
}, | |
{ | |
"epoch": 0.79, | |
"grad_norm": 3.575974464416504, | |
"learning_rate": 1.4736842105263159e-05, | |
"loss": 1.5127, | |
"step": 150 | |
}, | |
{ | |
"epoch": 0.79, | |
"eval_accuracy": 0.6574440052700923, | |
"eval_f1_macro": 0.5153852800565432, | |
"eval_f1_micro": 0.6574440052700923, | |
"eval_loss": 1.4809564352035522, | |
"eval_runtime": 0.7651, | |
"eval_samples_per_second": 1984.0, | |
"eval_steps_per_second": 31.368, | |
"step": 150 | |
}, | |
{ | |
"epoch": 0.84, | |
"grad_norm": 3.4453086853027344, | |
"learning_rate": 1.4385964912280704e-05, | |
"loss": 1.5096, | |
"step": 160 | |
}, | |
{ | |
"epoch": 0.89, | |
"grad_norm": 3.194457530975342, | |
"learning_rate": 1.4035087719298246e-05, | |
"loss": 1.4366, | |
"step": 170 | |
}, | |
{ | |
"epoch": 0.95, | |
"grad_norm": 2.7298433780670166, | |
"learning_rate": 1.3684210526315791e-05, | |
"loss": 1.3785, | |
"step": 180 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 3.8832905292510986, | |
"learning_rate": 1.3333333333333333e-05, | |
"loss": 1.4003, | |
"step": 190 | |
}, | |
{ | |
"epoch": 1.05, | |
"grad_norm": 3.53489089012146, | |
"learning_rate": 1.2982456140350879e-05, | |
"loss": 1.2857, | |
"step": 200 | |
}, | |
{ | |
"epoch": 1.05, | |
"eval_accuracy": 0.6982872200263505, | |
"eval_f1_macro": 0.5795313200947776, | |
"eval_f1_micro": 0.6982872200263505, | |
"eval_loss": 1.2679345607757568, | |
"eval_runtime": 0.8173, | |
"eval_samples_per_second": 1857.339, | |
"eval_steps_per_second": 29.365, | |
"step": 200 | |
}, | |
{ | |
"epoch": 1.11, | |
"grad_norm": 3.6431682109832764, | |
"learning_rate": 1.263157894736842e-05, | |
"loss": 1.2646, | |
"step": 210 | |
}, | |
{ | |
"epoch": 1.16, | |
"grad_norm": 3.8024988174438477, | |
"learning_rate": 1.2280701754385966e-05, | |
"loss": 1.229, | |
"step": 220 | |
}, | |
{ | |
"epoch": 1.21, | |
"grad_norm": 4.475400924682617, | |
"learning_rate": 1.192982456140351e-05, | |
"loss": 1.2163, | |
"step": 230 | |
}, | |
{ | |
"epoch": 1.26, | |
"grad_norm": 3.2734055519104004, | |
"learning_rate": 1.1578947368421053e-05, | |
"loss": 1.12, | |
"step": 240 | |
}, | |
{ | |
"epoch": 1.32, | |
"grad_norm": 3.5318431854248047, | |
"learning_rate": 1.1228070175438597e-05, | |
"loss": 1.0669, | |
"step": 250 | |
}, | |
{ | |
"epoch": 1.32, | |
"eval_accuracy": 0.730566534914361, | |
"eval_f1_macro": 0.6376220869475787, | |
"eval_f1_micro": 0.730566534914361, | |
"eval_loss": 1.1414965391159058, | |
"eval_runtime": 0.7652, | |
"eval_samples_per_second": 1983.819, | |
"eval_steps_per_second": 31.365, | |
"step": 250 | |
}, | |
{ | |
"epoch": 1.37, | |
"grad_norm": 3.1338298320770264, | |
"learning_rate": 1.0877192982456142e-05, | |
"loss": 1.0747, | |
"step": 260 | |
}, | |
{ | |
"epoch": 1.42, | |
"grad_norm": 3.7175045013427734, | |
"learning_rate": 1.0526315789473684e-05, | |
"loss": 1.1091, | |
"step": 270 | |
}, | |
{ | |
"epoch": 1.47, | |
"grad_norm": 2.825385093688965, | |
"learning_rate": 1.017543859649123e-05, | |
"loss": 1.0954, | |
"step": 280 | |
}, | |
{ | |
"epoch": 1.53, | |
"grad_norm": 4.713174343109131, | |
"learning_rate": 9.824561403508772e-06, | |
"loss": 0.9891, | |
"step": 290 | |
}, | |
{ | |
"epoch": 1.58, | |
"grad_norm": 4.003322124481201, | |
"learning_rate": 9.473684210526315e-06, | |
"loss": 1.0931, | |
"step": 300 | |
}, | |
{ | |
"epoch": 1.58, | |
"eval_accuracy": 0.7312252964426877, | |
"eval_f1_macro": 0.6332619788197302, | |
"eval_f1_micro": 0.7312252964426877, | |
"eval_loss": 1.0668787956237793, | |
"eval_runtime": 0.7678, | |
"eval_samples_per_second": 1977.083, | |
"eval_steps_per_second": 31.258, | |
"step": 300 | |
}, | |
{ | |
"epoch": 1.63, | |
"grad_norm": 3.055854082107544, | |
"learning_rate": 9.12280701754386e-06, | |
"loss": 1.0605, | |
"step": 310 | |
}, | |
{ | |
"epoch": 1.68, | |
"grad_norm": 3.6614320278167725, | |
"learning_rate": 8.771929824561405e-06, | |
"loss": 0.9953, | |
"step": 320 | |
}, | |
{ | |
"epoch": 1.74, | |
"grad_norm": 4.1040449142456055, | |
"learning_rate": 8.421052631578948e-06, | |
"loss": 1.0317, | |
"step": 330 | |
}, | |
{ | |
"epoch": 1.79, | |
"grad_norm": 4.793609619140625, | |
"learning_rate": 8.070175438596492e-06, | |
"loss": 1.1011, | |
"step": 340 | |
}, | |
{ | |
"epoch": 1.84, | |
"grad_norm": 5.102194786071777, | |
"learning_rate": 7.719298245614036e-06, | |
"loss": 0.9879, | |
"step": 350 | |
}, | |
{ | |
"epoch": 1.84, | |
"eval_accuracy": 0.7437417654808959, | |
"eval_f1_macro": 0.6541772971492381, | |
"eval_f1_micro": 0.7437417654808959, | |
"eval_loss": 1.0101571083068848, | |
"eval_runtime": 0.8196, | |
"eval_samples_per_second": 1852.047, | |
"eval_steps_per_second": 29.281, | |
"step": 350 | |
}, | |
{ | |
"epoch": 1.89, | |
"grad_norm": 5.22916841506958, | |
"learning_rate": 7.368421052631579e-06, | |
"loss": 0.9148, | |
"step": 360 | |
}, | |
{ | |
"epoch": 1.95, | |
"grad_norm": 4.314509391784668, | |
"learning_rate": 7.017543859649123e-06, | |
"loss": 0.9774, | |
"step": 370 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 4.692554950714111, | |
"learning_rate": 6.666666666666667e-06, | |
"loss": 0.9843, | |
"step": 380 | |
}, | |
{ | |
"epoch": 2.05, | |
"grad_norm": 4.54512357711792, | |
"learning_rate": 6.31578947368421e-06, | |
"loss": 0.9259, | |
"step": 390 | |
}, | |
{ | |
"epoch": 2.11, | |
"grad_norm": 3.737957715988159, | |
"learning_rate": 5.964912280701755e-06, | |
"loss": 0.8936, | |
"step": 400 | |
}, | |
{ | |
"epoch": 2.11, | |
"eval_accuracy": 0.7444005270092227, | |
"eval_f1_macro": 0.6640066115044797, | |
"eval_f1_micro": 0.7444005270092227, | |
"eval_loss": 0.9649816751480103, | |
"eval_runtime": 0.8189, | |
"eval_samples_per_second": 1853.724, | |
"eval_steps_per_second": 29.308, | |
"step": 400 | |
}, | |
{ | |
"epoch": 2.16, | |
"grad_norm": 3.669529914855957, | |
"learning_rate": 5.6140350877192985e-06, | |
"loss": 0.8246, | |
"step": 410 | |
}, | |
{ | |
"epoch": 2.21, | |
"grad_norm": 3.788975954055786, | |
"learning_rate": 5.263157894736842e-06, | |
"loss": 0.8956, | |
"step": 420 | |
}, | |
{ | |
"epoch": 2.26, | |
"grad_norm": 4.400717258453369, | |
"learning_rate": 4.912280701754386e-06, | |
"loss": 0.8508, | |
"step": 430 | |
}, | |
{ | |
"epoch": 2.32, | |
"grad_norm": 4.932755470275879, | |
"learning_rate": 4.56140350877193e-06, | |
"loss": 0.9209, | |
"step": 440 | |
}, | |
{ | |
"epoch": 2.37, | |
"grad_norm": 3.4981260299682617, | |
"learning_rate": 4.210526315789474e-06, | |
"loss": 0.8345, | |
"step": 450 | |
}, | |
{ | |
"epoch": 2.37, | |
"eval_accuracy": 0.7582345191040843, | |
"eval_f1_macro": 0.6900497906953322, | |
"eval_f1_micro": 0.7582345191040843, | |
"eval_loss": 0.9388595819473267, | |
"eval_runtime": 0.8212, | |
"eval_samples_per_second": 1848.509, | |
"eval_steps_per_second": 29.225, | |
"step": 450 | |
}, | |
{ | |
"epoch": 2.42, | |
"grad_norm": 3.988497257232666, | |
"learning_rate": 3.859649122807018e-06, | |
"loss": 0.8174, | |
"step": 460 | |
}, | |
{ | |
"epoch": 2.47, | |
"grad_norm": 4.119844913482666, | |
"learning_rate": 3.5087719298245615e-06, | |
"loss": 0.9026, | |
"step": 470 | |
}, | |
{ | |
"epoch": 2.53, | |
"grad_norm": 3.8894877433776855, | |
"learning_rate": 3.157894736842105e-06, | |
"loss": 0.8755, | |
"step": 480 | |
}, | |
{ | |
"epoch": 2.58, | |
"grad_norm": 3.8152105808258057, | |
"learning_rate": 2.8070175438596493e-06, | |
"loss": 0.8427, | |
"step": 490 | |
}, | |
{ | |
"epoch": 2.63, | |
"grad_norm": 3.738555908203125, | |
"learning_rate": 2.456140350877193e-06, | |
"loss": 0.7851, | |
"step": 500 | |
}, | |
{ | |
"epoch": 2.63, | |
"eval_accuracy": 0.7628458498023716, | |
"eval_f1_macro": 0.6923797058622336, | |
"eval_f1_micro": 0.7628458498023716, | |
"eval_loss": 0.9207842350006104, | |
"eval_runtime": 0.8212, | |
"eval_samples_per_second": 1848.55, | |
"eval_steps_per_second": 29.226, | |
"step": 500 | |
}, | |
{ | |
"epoch": 2.68, | |
"grad_norm": 4.088294506072998, | |
"learning_rate": 2.105263157894737e-06, | |
"loss": 0.8308, | |
"step": 510 | |
}, | |
{ | |
"epoch": 2.74, | |
"grad_norm": 4.426513195037842, | |
"learning_rate": 1.7543859649122807e-06, | |
"loss": 0.8498, | |
"step": 520 | |
}, | |
{ | |
"epoch": 2.79, | |
"grad_norm": 3.5125749111175537, | |
"learning_rate": 1.4035087719298246e-06, | |
"loss": 0.8491, | |
"step": 530 | |
}, | |
{ | |
"epoch": 2.84, | |
"grad_norm": 4.475162982940674, | |
"learning_rate": 1.0526315789473685e-06, | |
"loss": 0.7996, | |
"step": 540 | |
}, | |
{ | |
"epoch": 2.89, | |
"grad_norm": 4.1890549659729, | |
"learning_rate": 7.017543859649123e-07, | |
"loss": 0.8439, | |
"step": 550 | |
}, | |
{ | |
"epoch": 2.89, | |
"eval_accuracy": 0.7575757575757576, | |
"eval_f1_macro": 0.6903636946713366, | |
"eval_f1_micro": 0.7575757575757576, | |
"eval_loss": 0.9129917025566101, | |
"eval_runtime": 0.8209, | |
"eval_samples_per_second": 1849.148, | |
"eval_steps_per_second": 29.236, | |
"step": 550 | |
} | |
], | |
"logging_steps": 10, | |
"max_steps": 570, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 3, | |
"save_steps": 50, | |
"total_flos": 1166149628723200.0, | |
"train_batch_size": 32, | |
"trial_name": null, | |
"trial_params": null | |
} | |