|
{ |
|
"best_metric": 0.9129917025566101, |
|
"best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/distilbert/distilbert_base_uncased_amazon/checkpoint-550", |
|
"epoch": 3.0, |
|
"eval_steps": 50, |
|
"global_step": 570, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.327728271484375, |
|
"learning_rate": 1.9649122807017544e-05, |
|
"loss": 3.109, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.7082853317260742, |
|
"learning_rate": 1.929824561403509e-05, |
|
"loss": 3.048, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.893571138381958, |
|
"learning_rate": 1.894736842105263e-05, |
|
"loss": 2.9327, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.9520453214645386, |
|
"learning_rate": 1.8596491228070176e-05, |
|
"loss": 2.7781, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.1847422122955322, |
|
"learning_rate": 1.824561403508772e-05, |
|
"loss": 2.6322, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_accuracy": 0.4749670619235837, |
|
"eval_f1_macro": 0.3209193957177527, |
|
"eval_f1_micro": 0.4749670619235837, |
|
"eval_loss": 2.519139528274536, |
|
"eval_runtime": 0.7577, |
|
"eval_samples_per_second": 2003.426, |
|
"eval_steps_per_second": 31.675, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.446019411087036, |
|
"learning_rate": 1.7894736842105264e-05, |
|
"loss": 2.4648, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.346569538116455, |
|
"learning_rate": 1.754385964912281e-05, |
|
"loss": 2.2919, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.682988405227661, |
|
"learning_rate": 1.719298245614035e-05, |
|
"loss": 2.1292, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.50565242767334, |
|
"learning_rate": 1.6842105263157896e-05, |
|
"loss": 2.0467, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 2.4023382663726807, |
|
"learning_rate": 1.649122807017544e-05, |
|
"loss": 1.9044, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_accuracy": 0.6014492753623188, |
|
"eval_f1_macro": 0.4625900098895629, |
|
"eval_f1_micro": 0.6014492753623188, |
|
"eval_loss": 1.8323251008987427, |
|
"eval_runtime": 0.7593, |
|
"eval_samples_per_second": 1999.241, |
|
"eval_steps_per_second": 31.609, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.8123605251312256, |
|
"learning_rate": 1.6140350877192984e-05, |
|
"loss": 1.7838, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 2.5916481018066406, |
|
"learning_rate": 1.578947368421053e-05, |
|
"loss": 1.7117, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.583634853363037, |
|
"learning_rate": 1.543859649122807e-05, |
|
"loss": 1.6639, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.1771187782287598, |
|
"learning_rate": 1.5087719298245615e-05, |
|
"loss": 1.6064, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 3.575974464416504, |
|
"learning_rate": 1.4736842105263159e-05, |
|
"loss": 1.5127, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_accuracy": 0.6574440052700923, |
|
"eval_f1_macro": 0.5153852800565432, |
|
"eval_f1_micro": 0.6574440052700923, |
|
"eval_loss": 1.4809564352035522, |
|
"eval_runtime": 0.7651, |
|
"eval_samples_per_second": 1984.0, |
|
"eval_steps_per_second": 31.368, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 3.4453086853027344, |
|
"learning_rate": 1.4385964912280704e-05, |
|
"loss": 1.5096, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 3.194457530975342, |
|
"learning_rate": 1.4035087719298246e-05, |
|
"loss": 1.4366, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.7298433780670166, |
|
"learning_rate": 1.3684210526315791e-05, |
|
"loss": 1.3785, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 3.8832905292510986, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 1.4003, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 3.53489089012146, |
|
"learning_rate": 1.2982456140350879e-05, |
|
"loss": 1.2857, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"eval_accuracy": 0.6982872200263505, |
|
"eval_f1_macro": 0.5795313200947776, |
|
"eval_f1_micro": 0.6982872200263505, |
|
"eval_loss": 1.2679345607757568, |
|
"eval_runtime": 0.8173, |
|
"eval_samples_per_second": 1857.339, |
|
"eval_steps_per_second": 29.365, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 3.6431682109832764, |
|
"learning_rate": 1.263157894736842e-05, |
|
"loss": 1.2646, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.8024988174438477, |
|
"learning_rate": 1.2280701754385966e-05, |
|
"loss": 1.229, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 4.475400924682617, |
|
"learning_rate": 1.192982456140351e-05, |
|
"loss": 1.2163, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 3.2734055519104004, |
|
"learning_rate": 1.1578947368421053e-05, |
|
"loss": 1.12, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 3.5318431854248047, |
|
"learning_rate": 1.1228070175438597e-05, |
|
"loss": 1.0669, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"eval_accuracy": 0.730566534914361, |
|
"eval_f1_macro": 0.6376220869475787, |
|
"eval_f1_micro": 0.730566534914361, |
|
"eval_loss": 1.1414965391159058, |
|
"eval_runtime": 0.7652, |
|
"eval_samples_per_second": 1983.819, |
|
"eval_steps_per_second": 31.365, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 3.1338298320770264, |
|
"learning_rate": 1.0877192982456142e-05, |
|
"loss": 1.0747, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 3.7175045013427734, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 1.1091, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.825385093688965, |
|
"learning_rate": 1.017543859649123e-05, |
|
"loss": 1.0954, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 4.713174343109131, |
|
"learning_rate": 9.824561403508772e-06, |
|
"loss": 0.9891, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 4.003322124481201, |
|
"learning_rate": 9.473684210526315e-06, |
|
"loss": 1.0931, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_accuracy": 0.7312252964426877, |
|
"eval_f1_macro": 0.6332619788197302, |
|
"eval_f1_micro": 0.7312252964426877, |
|
"eval_loss": 1.0668787956237793, |
|
"eval_runtime": 0.7678, |
|
"eval_samples_per_second": 1977.083, |
|
"eval_steps_per_second": 31.258, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 3.055854082107544, |
|
"learning_rate": 9.12280701754386e-06, |
|
"loss": 1.0605, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 3.6614320278167725, |
|
"learning_rate": 8.771929824561405e-06, |
|
"loss": 0.9953, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 4.1040449142456055, |
|
"learning_rate": 8.421052631578948e-06, |
|
"loss": 1.0317, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 4.793609619140625, |
|
"learning_rate": 8.070175438596492e-06, |
|
"loss": 1.1011, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 5.102194786071777, |
|
"learning_rate": 7.719298245614036e-06, |
|
"loss": 0.9879, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"eval_accuracy": 0.7437417654808959, |
|
"eval_f1_macro": 0.6541772971492381, |
|
"eval_f1_micro": 0.7437417654808959, |
|
"eval_loss": 1.0101571083068848, |
|
"eval_runtime": 0.8196, |
|
"eval_samples_per_second": 1852.047, |
|
"eval_steps_per_second": 29.281, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 5.22916841506958, |
|
"learning_rate": 7.368421052631579e-06, |
|
"loss": 0.9148, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 4.314509391784668, |
|
"learning_rate": 7.017543859649123e-06, |
|
"loss": 0.9774, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 4.692554950714111, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.9843, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 4.54512357711792, |
|
"learning_rate": 6.31578947368421e-06, |
|
"loss": 0.9259, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 3.737957715988159, |
|
"learning_rate": 5.964912280701755e-06, |
|
"loss": 0.8936, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"eval_accuracy": 0.7444005270092227, |
|
"eval_f1_macro": 0.6640066115044797, |
|
"eval_f1_micro": 0.7444005270092227, |
|
"eval_loss": 0.9649816751480103, |
|
"eval_runtime": 0.8189, |
|
"eval_samples_per_second": 1853.724, |
|
"eval_steps_per_second": 29.308, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 3.669529914855957, |
|
"learning_rate": 5.6140350877192985e-06, |
|
"loss": 0.8246, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 3.788975954055786, |
|
"learning_rate": 5.263157894736842e-06, |
|
"loss": 0.8956, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 4.400717258453369, |
|
"learning_rate": 4.912280701754386e-06, |
|
"loss": 0.8508, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 4.932755470275879, |
|
"learning_rate": 4.56140350877193e-06, |
|
"loss": 0.9209, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 3.4981260299682617, |
|
"learning_rate": 4.210526315789474e-06, |
|
"loss": 0.8345, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"eval_accuracy": 0.7582345191040843, |
|
"eval_f1_macro": 0.6900497906953322, |
|
"eval_f1_micro": 0.7582345191040843, |
|
"eval_loss": 0.9388595819473267, |
|
"eval_runtime": 0.8212, |
|
"eval_samples_per_second": 1848.509, |
|
"eval_steps_per_second": 29.225, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 3.988497257232666, |
|
"learning_rate": 3.859649122807018e-06, |
|
"loss": 0.8174, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 4.119844913482666, |
|
"learning_rate": 3.5087719298245615e-06, |
|
"loss": 0.9026, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 3.8894877433776855, |
|
"learning_rate": 3.157894736842105e-06, |
|
"loss": 0.8755, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 3.8152105808258057, |
|
"learning_rate": 2.8070175438596493e-06, |
|
"loss": 0.8427, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 3.738555908203125, |
|
"learning_rate": 2.456140350877193e-06, |
|
"loss": 0.7851, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"eval_accuracy": 0.7628458498023716, |
|
"eval_f1_macro": 0.6923797058622336, |
|
"eval_f1_micro": 0.7628458498023716, |
|
"eval_loss": 0.9207842350006104, |
|
"eval_runtime": 0.8212, |
|
"eval_samples_per_second": 1848.55, |
|
"eval_steps_per_second": 29.226, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 4.088294506072998, |
|
"learning_rate": 2.105263157894737e-06, |
|
"loss": 0.8308, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 4.426513195037842, |
|
"learning_rate": 1.7543859649122807e-06, |
|
"loss": 0.8498, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 3.5125749111175537, |
|
"learning_rate": 1.4035087719298246e-06, |
|
"loss": 0.8491, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 4.475162982940674, |
|
"learning_rate": 1.0526315789473685e-06, |
|
"loss": 0.7996, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 4.1890549659729, |
|
"learning_rate": 7.017543859649123e-07, |
|
"loss": 0.8439, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"eval_accuracy": 0.7575757575757576, |
|
"eval_f1_macro": 0.6903636946713366, |
|
"eval_f1_micro": 0.7575757575757576, |
|
"eval_loss": 0.9129917025566101, |
|
"eval_runtime": 0.8209, |
|
"eval_samples_per_second": 1849.148, |
|
"eval_steps_per_second": 29.236, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 3.682612419128418, |
|
"learning_rate": 3.5087719298245616e-07, |
|
"loss": 0.8467, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 3.6457090377807617, |
|
"learning_rate": 0.0, |
|
"loss": 0.9181, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 570, |
|
"total_flos": 1208555069767680.0, |
|
"train_loss": 1.337625675870661, |
|
"train_runtime": 76.5319, |
|
"train_samples_per_second": 476.037, |
|
"train_steps_per_second": 7.448 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 570, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"total_flos": 1208555069767680.0, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|