{ "best_metric": 0.9129917025566101, "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/distilbert/distilbert_base_uncased_amazon/checkpoint-550", "epoch": 3.0, "eval_steps": 50, "global_step": 570, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 1.327728271484375, "learning_rate": 1.9649122807017544e-05, "loss": 3.109, "step": 10 }, { "epoch": 0.11, "grad_norm": 1.7082853317260742, "learning_rate": 1.929824561403509e-05, "loss": 3.048, "step": 20 }, { "epoch": 0.16, "grad_norm": 1.893571138381958, "learning_rate": 1.894736842105263e-05, "loss": 2.9327, "step": 30 }, { "epoch": 0.21, "grad_norm": 1.9520453214645386, "learning_rate": 1.8596491228070176e-05, "loss": 2.7781, "step": 40 }, { "epoch": 0.26, "grad_norm": 2.1847422122955322, "learning_rate": 1.824561403508772e-05, "loss": 2.6322, "step": 50 }, { "epoch": 0.26, "eval_accuracy": 0.4749670619235837, "eval_f1_macro": 0.3209193957177527, "eval_f1_micro": 0.4749670619235837, "eval_loss": 2.519139528274536, "eval_runtime": 0.7577, "eval_samples_per_second": 2003.426, "eval_steps_per_second": 31.675, "step": 50 }, { "epoch": 0.32, "grad_norm": 2.446019411087036, "learning_rate": 1.7894736842105264e-05, "loss": 2.4648, "step": 60 }, { "epoch": 0.37, "grad_norm": 2.346569538116455, "learning_rate": 1.754385964912281e-05, "loss": 2.2919, "step": 70 }, { "epoch": 0.42, "grad_norm": 2.682988405227661, "learning_rate": 1.719298245614035e-05, "loss": 2.1292, "step": 80 }, { "epoch": 0.47, "grad_norm": 2.50565242767334, "learning_rate": 1.6842105263157896e-05, "loss": 2.0467, "step": 90 }, { "epoch": 0.53, "grad_norm": 2.4023382663726807, "learning_rate": 1.649122807017544e-05, "loss": 1.9044, "step": 100 }, { "epoch": 0.53, "eval_accuracy": 0.6014492753623188, "eval_f1_macro": 0.4625900098895629, "eval_f1_micro": 0.6014492753623188, "eval_loss": 1.8323251008987427, "eval_runtime": 0.7593, "eval_samples_per_second": 1999.241, "eval_steps_per_second": 31.609, "step": 100 }, { "epoch": 0.58, "grad_norm": 2.8123605251312256, "learning_rate": 1.6140350877192984e-05, "loss": 1.7838, "step": 110 }, { "epoch": 0.63, "grad_norm": 2.5916481018066406, "learning_rate": 1.578947368421053e-05, "loss": 1.7117, "step": 120 }, { "epoch": 0.68, "grad_norm": 3.583634853363037, "learning_rate": 1.543859649122807e-05, "loss": 1.6639, "step": 130 }, { "epoch": 0.74, "grad_norm": 3.1771187782287598, "learning_rate": 1.5087719298245615e-05, "loss": 1.6064, "step": 140 }, { "epoch": 0.79, "grad_norm": 3.575974464416504, "learning_rate": 1.4736842105263159e-05, "loss": 1.5127, "step": 150 }, { "epoch": 0.79, "eval_accuracy": 0.6574440052700923, "eval_f1_macro": 0.5153852800565432, "eval_f1_micro": 0.6574440052700923, "eval_loss": 1.4809564352035522, "eval_runtime": 0.7651, "eval_samples_per_second": 1984.0, "eval_steps_per_second": 31.368, "step": 150 }, { "epoch": 0.84, "grad_norm": 3.4453086853027344, "learning_rate": 1.4385964912280704e-05, "loss": 1.5096, "step": 160 }, { "epoch": 0.89, "grad_norm": 3.194457530975342, "learning_rate": 1.4035087719298246e-05, "loss": 1.4366, "step": 170 }, { "epoch": 0.95, "grad_norm": 2.7298433780670166, "learning_rate": 1.3684210526315791e-05, "loss": 1.3785, "step": 180 }, { "epoch": 1.0, "grad_norm": 3.8832905292510986, "learning_rate": 1.3333333333333333e-05, "loss": 1.4003, "step": 190 }, { "epoch": 1.05, "grad_norm": 3.53489089012146, "learning_rate": 1.2982456140350879e-05, "loss": 1.2857, "step": 200 }, { "epoch": 1.05, "eval_accuracy": 0.6982872200263505, "eval_f1_macro": 0.5795313200947776, "eval_f1_micro": 0.6982872200263505, "eval_loss": 1.2679345607757568, "eval_runtime": 0.8173, "eval_samples_per_second": 1857.339, "eval_steps_per_second": 29.365, "step": 200 }, { "epoch": 1.11, "grad_norm": 3.6431682109832764, "learning_rate": 1.263157894736842e-05, "loss": 1.2646, "step": 210 }, { "epoch": 1.16, "grad_norm": 3.8024988174438477, "learning_rate": 1.2280701754385966e-05, "loss": 1.229, "step": 220 }, { "epoch": 1.21, "grad_norm": 4.475400924682617, "learning_rate": 1.192982456140351e-05, "loss": 1.2163, "step": 230 }, { "epoch": 1.26, "grad_norm": 3.2734055519104004, "learning_rate": 1.1578947368421053e-05, "loss": 1.12, "step": 240 }, { "epoch": 1.32, "grad_norm": 3.5318431854248047, "learning_rate": 1.1228070175438597e-05, "loss": 1.0669, "step": 250 }, { "epoch": 1.32, "eval_accuracy": 0.730566534914361, "eval_f1_macro": 0.6376220869475787, "eval_f1_micro": 0.730566534914361, "eval_loss": 1.1414965391159058, "eval_runtime": 0.7652, "eval_samples_per_second": 1983.819, "eval_steps_per_second": 31.365, "step": 250 }, { "epoch": 1.37, "grad_norm": 3.1338298320770264, "learning_rate": 1.0877192982456142e-05, "loss": 1.0747, "step": 260 }, { "epoch": 1.42, "grad_norm": 3.7175045013427734, "learning_rate": 1.0526315789473684e-05, "loss": 1.1091, "step": 270 }, { "epoch": 1.47, "grad_norm": 2.825385093688965, "learning_rate": 1.017543859649123e-05, "loss": 1.0954, "step": 280 }, { "epoch": 1.53, "grad_norm": 4.713174343109131, "learning_rate": 9.824561403508772e-06, "loss": 0.9891, "step": 290 }, { "epoch": 1.58, "grad_norm": 4.003322124481201, "learning_rate": 9.473684210526315e-06, "loss": 1.0931, "step": 300 }, { "epoch": 1.58, "eval_accuracy": 0.7312252964426877, "eval_f1_macro": 0.6332619788197302, "eval_f1_micro": 0.7312252964426877, "eval_loss": 1.0668787956237793, "eval_runtime": 0.7678, "eval_samples_per_second": 1977.083, "eval_steps_per_second": 31.258, "step": 300 }, { "epoch": 1.63, "grad_norm": 3.055854082107544, "learning_rate": 9.12280701754386e-06, "loss": 1.0605, "step": 310 }, { "epoch": 1.68, "grad_norm": 3.6614320278167725, "learning_rate": 8.771929824561405e-06, "loss": 0.9953, "step": 320 }, { "epoch": 1.74, "grad_norm": 4.1040449142456055, "learning_rate": 8.421052631578948e-06, "loss": 1.0317, "step": 330 }, { "epoch": 1.79, "grad_norm": 4.793609619140625, "learning_rate": 8.070175438596492e-06, "loss": 1.1011, "step": 340 }, { "epoch": 1.84, "grad_norm": 5.102194786071777, "learning_rate": 7.719298245614036e-06, "loss": 0.9879, "step": 350 }, { "epoch": 1.84, "eval_accuracy": 0.7437417654808959, "eval_f1_macro": 0.6541772971492381, "eval_f1_micro": 0.7437417654808959, "eval_loss": 1.0101571083068848, "eval_runtime": 0.8196, "eval_samples_per_second": 1852.047, "eval_steps_per_second": 29.281, "step": 350 }, { "epoch": 1.89, "grad_norm": 5.22916841506958, "learning_rate": 7.368421052631579e-06, "loss": 0.9148, "step": 360 }, { "epoch": 1.95, "grad_norm": 4.314509391784668, "learning_rate": 7.017543859649123e-06, "loss": 0.9774, "step": 370 }, { "epoch": 2.0, "grad_norm": 4.692554950714111, "learning_rate": 6.666666666666667e-06, "loss": 0.9843, "step": 380 }, { "epoch": 2.05, "grad_norm": 4.54512357711792, "learning_rate": 6.31578947368421e-06, "loss": 0.9259, "step": 390 }, { "epoch": 2.11, "grad_norm": 3.737957715988159, "learning_rate": 5.964912280701755e-06, "loss": 0.8936, "step": 400 }, { "epoch": 2.11, "eval_accuracy": 0.7444005270092227, "eval_f1_macro": 0.6640066115044797, "eval_f1_micro": 0.7444005270092227, "eval_loss": 0.9649816751480103, "eval_runtime": 0.8189, "eval_samples_per_second": 1853.724, "eval_steps_per_second": 29.308, "step": 400 }, { "epoch": 2.16, "grad_norm": 3.669529914855957, "learning_rate": 5.6140350877192985e-06, "loss": 0.8246, "step": 410 }, { "epoch": 2.21, "grad_norm": 3.788975954055786, "learning_rate": 5.263157894736842e-06, "loss": 0.8956, "step": 420 }, { "epoch": 2.26, "grad_norm": 4.400717258453369, "learning_rate": 4.912280701754386e-06, "loss": 0.8508, "step": 430 }, { "epoch": 2.32, "grad_norm": 4.932755470275879, "learning_rate": 4.56140350877193e-06, "loss": 0.9209, "step": 440 }, { "epoch": 2.37, "grad_norm": 3.4981260299682617, "learning_rate": 4.210526315789474e-06, "loss": 0.8345, "step": 450 }, { "epoch": 2.37, "eval_accuracy": 0.7582345191040843, "eval_f1_macro": 0.6900497906953322, "eval_f1_micro": 0.7582345191040843, "eval_loss": 0.9388595819473267, "eval_runtime": 0.8212, "eval_samples_per_second": 1848.509, "eval_steps_per_second": 29.225, "step": 450 }, { "epoch": 2.42, "grad_norm": 3.988497257232666, "learning_rate": 3.859649122807018e-06, "loss": 0.8174, "step": 460 }, { "epoch": 2.47, "grad_norm": 4.119844913482666, "learning_rate": 3.5087719298245615e-06, "loss": 0.9026, "step": 470 }, { "epoch": 2.53, "grad_norm": 3.8894877433776855, "learning_rate": 3.157894736842105e-06, "loss": 0.8755, "step": 480 }, { "epoch": 2.58, "grad_norm": 3.8152105808258057, "learning_rate": 2.8070175438596493e-06, "loss": 0.8427, "step": 490 }, { "epoch": 2.63, "grad_norm": 3.738555908203125, "learning_rate": 2.456140350877193e-06, "loss": 0.7851, "step": 500 }, { "epoch": 2.63, "eval_accuracy": 0.7628458498023716, "eval_f1_macro": 0.6923797058622336, "eval_f1_micro": 0.7628458498023716, "eval_loss": 0.9207842350006104, "eval_runtime": 0.8212, "eval_samples_per_second": 1848.55, "eval_steps_per_second": 29.226, "step": 500 }, { "epoch": 2.68, "grad_norm": 4.088294506072998, "learning_rate": 2.105263157894737e-06, "loss": 0.8308, "step": 510 }, { "epoch": 2.74, "grad_norm": 4.426513195037842, "learning_rate": 1.7543859649122807e-06, "loss": 0.8498, "step": 520 }, { "epoch": 2.79, "grad_norm": 3.5125749111175537, "learning_rate": 1.4035087719298246e-06, "loss": 0.8491, "step": 530 }, { "epoch": 2.84, "grad_norm": 4.475162982940674, "learning_rate": 1.0526315789473685e-06, "loss": 0.7996, "step": 540 }, { "epoch": 2.89, "grad_norm": 4.1890549659729, "learning_rate": 7.017543859649123e-07, "loss": 0.8439, "step": 550 }, { "epoch": 2.89, "eval_accuracy": 0.7575757575757576, "eval_f1_macro": 0.6903636946713366, "eval_f1_micro": 0.7575757575757576, "eval_loss": 0.9129917025566101, "eval_runtime": 0.8209, "eval_samples_per_second": 1849.148, "eval_steps_per_second": 29.236, "step": 550 }, { "epoch": 2.95, "grad_norm": 3.682612419128418, "learning_rate": 3.5087719298245616e-07, "loss": 0.8467, "step": 560 }, { "epoch": 3.0, "grad_norm": 3.6457090377807617, "learning_rate": 0.0, "loss": 0.9181, "step": 570 }, { "epoch": 3.0, "step": 570, "total_flos": 1208555069767680.0, "train_loss": 1.337625675870661, "train_runtime": 76.5319, "train_samples_per_second": 476.037, "train_steps_per_second": 7.448 } ], "logging_steps": 10, "max_steps": 570, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "total_flos": 1208555069767680.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }