{ "best_metric": 0.8535663673078441, "best_model_checkpoint": "distilhubert-finetuned-mixed-data/checkpoint-1800", "epoch": 35.03649635036496, "eval_steps": 200, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.7299270072992701, "grad_norm": 2.289438486099243, "learning_rate": 5.5147058823529414e-05, "loss": 1.2878, "step": 50 }, { "epoch": 1.4598540145985401, "grad_norm": 2.5048491954803467, "learning_rate": 0.00011029411764705883, "loss": 0.8322, "step": 100 }, { "epoch": 2.18978102189781, "grad_norm": 11.18371295928955, "learning_rate": 0.00016544117647058823, "loss": 0.7897, "step": 150 }, { "epoch": 2.9197080291970803, "grad_norm": 9.702393531799316, "learning_rate": 0.00022058823529411765, "loss": 0.7149, "step": 200 }, { "epoch": 2.9197080291970803, "eval_accuracy": 0.7252747252747253, "eval_confusion_matrix": [ [ 34, 39, 0, 2 ], [ 7, 62, 6, 0 ], [ 0, 19, 43, 0 ], [ 0, 2, 0, 59 ] ], "eval_f1": 0.7260427659517454, "eval_loss": 0.9058456420898438, "eval_precision": 0.7828499608603893, "eval_recall": 0.7252747252747253, "eval_runtime": 3.7417, "eval_samples_per_second": 72.962, "eval_steps_per_second": 0.802, "step": 200 }, { "epoch": 3.6496350364963503, "grad_norm": 8.47255802154541, "learning_rate": 0.000275735294117647, "loss": 0.6917, "step": 250 }, { "epoch": 4.37956204379562, "grad_norm": 16.689321517944336, "learning_rate": 0.0002999031705390845, "loss": 0.7264, "step": 300 }, { "epoch": 5.109489051094891, "grad_norm": 1.7369310855865479, "learning_rate": 0.00029924913005299595, "loss": 0.6895, "step": 350 }, { "epoch": 5.839416058394161, "grad_norm": 2.210369348526001, "learning_rate": 0.0002979807906935489, "loss": 0.6939, "step": 400 }, { "epoch": 5.839416058394161, "eval_accuracy": 0.7509157509157509, "eval_confusion_matrix": [ [ 66, 2, 0, 7 ], [ 29, 38, 7, 1 ], [ 2, 20, 40, 0 ], [ 0, 0, 0, 61 ] ], "eval_f1": 0.7418721712792054, "eval_loss": 0.8107791543006897, "eval_precision": 0.7517378077426524, "eval_recall": 0.7509157509157509, "eval_runtime": 3.7702, "eval_samples_per_second": 72.409, "eval_steps_per_second": 0.796, "step": 400 }, { "epoch": 6.569343065693431, "grad_norm": 2.1358511447906494, "learning_rate": 0.000296103372855926, "loss": 0.5986, "step": 450 }, { "epoch": 7.299270072992701, "grad_norm": 13.704009056091309, "learning_rate": 0.0002936246038592886, "loss": 0.5932, "step": 500 }, { "epoch": 8.02919708029197, "grad_norm": 2.032876968383789, "learning_rate": 0.00029055468614167716, "loss": 0.5633, "step": 550 }, { "epoch": 8.75912408759124, "grad_norm": 28.525798797607422, "learning_rate": 0.00028690625526749705, "loss": 0.4941, "step": 600 }, { "epoch": 8.75912408759124, "eval_accuracy": 0.8241758241758241, "eval_confusion_matrix": [ [ 63, 8, 1, 3 ], [ 8, 50, 17, 0 ], [ 2, 9, 51, 0 ], [ 0, 0, 0, 61 ] ], "eval_f1": 0.8222676260809794, "eval_loss": 0.7625077366828918, "eval_precision": 0.8229409839103053, "eval_recall": 0.8241758241758241, "eval_runtime": 3.757, "eval_samples_per_second": 72.664, "eval_steps_per_second": 0.799, "step": 600 }, { "epoch": 9.489051094890511, "grad_norm": 0.18371808528900146, "learning_rate": 0.0002826943279204283, "loss": 0.4842, "step": 650 }, { "epoch": 10.218978102189782, "grad_norm": 11.426072120666504, "learning_rate": 0.0002779362400958168, "loss": 0.4352, "step": 700 }, { "epoch": 10.94890510948905, "grad_norm": 8.062601089477539, "learning_rate": 0.0002726515757469423, "loss": 0.4447, "step": 750 }, { "epoch": 11.678832116788321, "grad_norm": 0.3985881805419922, "learning_rate": 0.00026686208617885055, "loss": 0.442, "step": 800 }, { "epoch": 11.678832116788321, "eval_accuracy": 0.7985347985347986, "eval_confusion_matrix": [ [ 66, 6, 1, 2 ], [ 15, 32, 26, 2 ], [ 2, 1, 59, 0 ], [ 0, 0, 0, 61 ] ], "eval_f1": 0.781170020153555, "eval_loss": 0.9623217582702637, "eval_precision": 0.8093701586901577, "eval_recall": 0.7985347985347986, "eval_runtime": 3.774, "eval_samples_per_second": 72.337, "eval_steps_per_second": 0.795, "step": 800 }, { "epoch": 12.408759124087592, "grad_norm": 38.726985931396484, "learning_rate": 0.0002605916005215186, "loss": 0.4504, "step": 850 }, { "epoch": 13.138686131386862, "grad_norm": 0.026563748717308044, "learning_rate": 0.0002538659276508397, "loss": 0.3903, "step": 900 }, { "epoch": 13.86861313868613, "grad_norm": 0.06770322471857071, "learning_rate": 0.0002467127499611136, "loss": 0.4094, "step": 950 }, { "epoch": 14.598540145985401, "grad_norm": 1.2612749338150024, "learning_rate": 0.00023916150942626798, "loss": 0.4188, "step": 1000 }, { "epoch": 14.598540145985401, "eval_accuracy": 0.8315018315018315, "eval_confusion_matrix": [ [ 60, 9, 2, 4 ], [ 8, 56, 11, 0 ], [ 1, 11, 50, 0 ], [ 0, 0, 0, 61 ] ], "eval_f1": 0.8307422385946511, "eval_loss": 0.8534455299377441, "eval_precision": 0.8312566016541674, "eval_recall": 0.8315018315018315, "eval_runtime": 3.796, "eval_samples_per_second": 71.917, "eval_steps_per_second": 0.79, "step": 1000 }, { "epoch": 15.328467153284672, "grad_norm": 28.980899810791016, "learning_rate": 0.0002312432864187738, "loss": 0.3798, "step": 1050 }, { "epoch": 16.05839416058394, "grad_norm": 0.022609323263168335, "learning_rate": 0.0002229906717850284, "loss": 0.3672, "step": 1100 }, { "epoch": 16.78832116788321, "grad_norm": 0.02360348217189312, "learning_rate": 0.00021443763270373483, "loss": 0.3715, "step": 1150 }, { "epoch": 17.51824817518248, "grad_norm": 0.014020542614161968, "learning_rate": 0.0002056193728793941, "loss": 0.349, "step": 1200 }, { "epoch": 17.51824817518248, "eval_accuracy": 0.8351648351648352, "eval_confusion_matrix": [ [ 62, 10, 1, 2 ], [ 9, 57, 9, 0 ], [ 2, 12, 48, 0 ], [ 0, 0, 0, 61 ] ], "eval_f1": 0.8350675728555914, "eval_loss": 0.8131950497627258, "eval_precision": 0.8358475863688551, "eval_recall": 0.8351648351648352, "eval_runtime": 3.7788, "eval_samples_per_second": 72.246, "eval_steps_per_second": 0.794, "step": 1200 }, { "epoch": 18.248175182481752, "grad_norm": 0.006028232164680958, "learning_rate": 0.0001965721876463452, "loss": 0.3491, "step": 1250 }, { "epoch": 18.978102189781023, "grad_norm": 0.008285734802484512, "learning_rate": 0.00018733331457973358, "loss": 0.3489, "step": 1300 }, { "epoch": 19.708029197080293, "grad_norm": 0.008053851313889027, "learning_rate": 0.00017794078022828275, "loss": 0.3497, "step": 1350 }, { "epoch": 20.437956204379564, "grad_norm": 0.003234422067180276, "learning_rate": 0.00016843324359970712, "loss": 0.3488, "step": 1400 }, { "epoch": 20.437956204379564, "eval_accuracy": 0.8461538461538461, "eval_confusion_matrix": [ [ 61, 11, 1, 2 ], [ 8, 57, 10, 0 ], [ 0, 10, 52, 0 ], [ 0, 0, 0, 61 ] ], "eval_f1": 0.8462423027109934, "eval_loss": 0.7859560251235962, "eval_precision": 0.8474363933035696, "eval_recall": 0.8461538461538461, "eval_runtime": 3.7947, "eval_samples_per_second": 71.942, "eval_steps_per_second": 0.791, "step": 1400 }, { "epoch": 21.16788321167883, "grad_norm": 0.004595920909196138, "learning_rate": 0.00015884983704296757, "loss": 0.3488, "step": 1450 }, { "epoch": 21.8978102189781, "grad_norm": 0.002511706668883562, "learning_rate": 0.00014923000518228847, "loss": 0.3488, "step": 1500 }, { "epoch": 22.62773722627737, "grad_norm": 0.002340014325454831, "learning_rate": 0.00013961334256587125, "loss": 0.3488, "step": 1550 }, { "epoch": 23.357664233576642, "grad_norm": 0.0028287076856940985, "learning_rate": 0.00013003943069753198, "loss": 0.3488, "step": 1600 }, { "epoch": 23.357664233576642, "eval_accuracy": 0.8461538461538461, "eval_confusion_matrix": [ [ 61, 11, 1, 2 ], [ 8, 57, 10, 0 ], [ 0, 10, 52, 0 ], [ 0, 0, 0, 61 ] ], "eval_f1": 0.8462423027109934, "eval_loss": 0.7856015563011169, "eval_precision": 0.8474363933035696, "eval_recall": 0.8461538461538461, "eval_runtime": 3.7861, "eval_samples_per_second": 72.105, "eval_steps_per_second": 0.792, "step": 1600 }, { "epoch": 24.087591240875913, "grad_norm": 0.0027960864827036858, "learning_rate": 0.00012054767512202832, "loss": 0.3488, "step": 1650 }, { "epoch": 24.817518248175183, "grad_norm": 0.0033820979297161102, "learning_rate": 0.00011117714323462186, "loss": 0.3488, "step": 1700 }, { "epoch": 25.547445255474454, "grad_norm": 0.0034969367552548647, "learning_rate": 0.00010196640348243974, "loss": 0.3488, "step": 1750 }, { "epoch": 26.277372262773724, "grad_norm": 0.0014958898536860943, "learning_rate": 9.295336661947115e-05, "loss": 0.3488, "step": 1800 }, { "epoch": 26.277372262773724, "eval_accuracy": 0.8534798534798534, "eval_confusion_matrix": [ [ 61, 11, 1, 2 ], [ 7, 58, 10, 0 ], [ 0, 9, 53, 0 ], [ 0, 0, 0, 61 ] ], "eval_f1": 0.8535663673078441, "eval_loss": 0.7831193804740906, "eval_precision": 0.8551497604301419, "eval_recall": 0.8534798534798534, "eval_runtime": 3.7976, "eval_samples_per_second": 71.888, "eval_steps_per_second": 0.79, "step": 1800 }, { "epoch": 27.00729927007299, "grad_norm": 0.004900149069726467, "learning_rate": 8.417512966858319e-05, "loss": 0.3488, "step": 1850 }, { "epoch": 27.73722627737226, "grad_norm": 0.0018804975552484393, "learning_rate": 7.566782323279578e-05, "loss": 0.3488, "step": 1900 }, { "epoch": 28.467153284671532, "grad_norm": 0.0019178036600351334, "learning_rate": 6.746646278427247e-05, "loss": 0.3488, "step": 1950 }, { "epoch": 29.197080291970803, "grad_norm": 0.001025234698317945, "learning_rate": 5.960480454311155e-05, "loss": 0.3488, "step": 2000 }, { "epoch": 29.197080291970803, "eval_accuracy": 0.8498168498168498, "eval_confusion_matrix": [ [ 61, 11, 1, 2 ], [ 8, 57, 10, 0 ], [ 0, 9, 53, 0 ], [ 0, 0, 0, 61 ] ], "eval_f1": 0.8496942339108237, "eval_loss": 0.7866398692131042, "eval_precision": 0.8506632615716467, "eval_recall": 0.8498168498168498, "eval_runtime": 3.7892, "eval_samples_per_second": 72.047, "eval_steps_per_second": 0.792, "step": 2000 }, { "epoch": 29.927007299270073, "grad_norm": 0.0027674695011228323, "learning_rate": 5.2115206539129e-05, "loss": 0.3488, "step": 2050 }, { "epoch": 30.656934306569344, "grad_norm": 0.0016269112238660455, "learning_rate": 4.5028495428494483e-05, "loss": 0.3488, "step": 2100 }, { "epoch": 31.386861313868614, "grad_norm": 0.0019462064374238253, "learning_rate": 3.837383961339246e-05, "loss": 0.3488, "step": 2150 }, { "epoch": 32.11678832116788, "grad_norm": 0.0011992512736469507, "learning_rate": 3.21786291869402e-05, "loss": 0.3488, "step": 2200 }, { "epoch": 32.11678832116788, "eval_accuracy": 0.8534798534798534, "eval_confusion_matrix": [ [ 61, 11, 1, 2 ], [ 8, 57, 10, 0 ], [ 0, 8, 54, 0 ], [ 0, 0, 0, 61 ] ], "eval_f1": 0.8531308487327289, "eval_loss": 0.7856839895248413, "eval_precision": 0.8539396783782831, "eval_recall": 0.8534798534798534, "eval_runtime": 3.787, "eval_samples_per_second": 72.088, "eval_steps_per_second": 0.792, "step": 2200 }, { "epoch": 32.846715328467155, "grad_norm": 0.0029719627927988768, "learning_rate": 2.6468363197499458e-05, "loss": 0.3488, "step": 2250 }, { "epoch": 33.57664233576642, "grad_norm": 0.0012639207998290658, "learning_rate": 2.1266544696395582e-05, "loss": 0.3488, "step": 2300 }, { "epoch": 34.306569343065696, "grad_norm": 0.0011322245700284839, "learning_rate": 1.659458400101879e-05, "loss": 0.3488, "step": 2350 }, { "epoch": 35.03649635036496, "grad_norm": 0.002087602624669671, "learning_rate": 1.2471710571470578e-05, "loss": 0.3488, "step": 2400 }, { "epoch": 35.03649635036496, "eval_accuracy": 0.8498168498168498, "eval_confusion_matrix": [ [ 61, 11, 1, 2 ], [ 8, 57, 10, 0 ], [ 0, 9, 53, 0 ], [ 0, 0, 0, 61 ] ], "eval_f1": 0.8496942339108237, "eval_loss": 0.7856935858726501, "eval_precision": 0.8506632615716467, "eval_recall": 0.8498168498168498, "eval_runtime": 3.788, "eval_samples_per_second": 72.069, "eval_steps_per_second": 0.792, "step": 2400 } ], "logging_steps": 50, "max_steps": 2720, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 200, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.001 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.68527123264e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }