{ "best_metric": 0.5063937306404114, "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/Qwen/Qwen1.5_1.8B_ledgar/checkpoint-1800", "epoch": 1.9189765458422174, "eval_steps": 100, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 80.4836196899414, "learning_rate": 4.9555792466240235e-06, "loss": 7.9887, "step": 25 }, { "epoch": 0.05, "grad_norm": 60.927364349365234, "learning_rate": 4.911158493248046e-06, "loss": 3.1908, "step": 50 }, { "epoch": 0.08, "grad_norm": 49.68091583251953, "learning_rate": 4.866737739872069e-06, "loss": 1.7183, "step": 75 }, { "epoch": 0.11, "grad_norm": 55.176666259765625, "learning_rate": 4.822316986496091e-06, "loss": 1.3077, "step": 100 }, { "epoch": 0.11, "eval_accuracy": 0.7277, "eval_f1_macro": 0.5770831474844406, "eval_f1_micro": 0.7277, "eval_loss": 1.0944937467575073, "eval_runtime": 25.447, "eval_samples_per_second": 392.973, "eval_steps_per_second": 6.17, "step": 100 }, { "epoch": 0.13, "grad_norm": 46.64506530761719, "learning_rate": 4.777896233120114e-06, "loss": 1.1393, "step": 125 }, { "epoch": 0.16, "grad_norm": 41.11891174316406, "learning_rate": 4.733475479744136e-06, "loss": 1.0243, "step": 150 }, { "epoch": 0.19, "grad_norm": 34.20009994506836, "learning_rate": 4.6890547263681595e-06, "loss": 0.9005, "step": 175 }, { "epoch": 0.21, "grad_norm": 38.80377197265625, "learning_rate": 4.644633972992183e-06, "loss": 0.8627, "step": 200 }, { "epoch": 0.21, "eval_accuracy": 0.7907, "eval_f1_macro": 0.6657039157603262, "eval_f1_micro": 0.7907, "eval_loss": 0.8368468880653381, "eval_runtime": 25.9484, "eval_samples_per_second": 385.38, "eval_steps_per_second": 6.05, "step": 200 }, { "epoch": 0.24, "grad_norm": 34.973506927490234, "learning_rate": 4.600213219616206e-06, "loss": 0.7896, "step": 225 }, { "epoch": 0.27, "grad_norm": 29.98388671875, "learning_rate": 4.555792466240228e-06, "loss": 0.8307, "step": 250 }, { "epoch": 0.29, "grad_norm": 27.09973907470703, "learning_rate": 4.51137171286425e-06, "loss": 0.7846, "step": 275 }, { "epoch": 0.32, "grad_norm": 36.151161193847656, "learning_rate": 4.466950959488273e-06, "loss": 0.7179, "step": 300 }, { "epoch": 0.32, "eval_accuracy": 0.7971, "eval_f1_macro": 0.6861778340669753, "eval_f1_micro": 0.7971, "eval_loss": 0.7824062705039978, "eval_runtime": 25.9003, "eval_samples_per_second": 386.095, "eval_steps_per_second": 6.062, "step": 300 }, { "epoch": 0.35, "grad_norm": 33.09822463989258, "learning_rate": 4.422530206112296e-06, "loss": 0.7133, "step": 325 }, { "epoch": 0.37, "grad_norm": 35.52923583984375, "learning_rate": 4.378109452736319e-06, "loss": 0.7354, "step": 350 }, { "epoch": 0.4, "grad_norm": 39.79545211791992, "learning_rate": 4.333688699360342e-06, "loss": 0.6619, "step": 375 }, { "epoch": 0.43, "grad_norm": 30.13161849975586, "learning_rate": 4.289267945984365e-06, "loss": 0.6961, "step": 400 }, { "epoch": 0.43, "eval_accuracy": 0.8138, "eval_f1_macro": 0.6992465625213966, "eval_f1_micro": 0.8138, "eval_loss": 0.6951531171798706, "eval_runtime": 25.6082, "eval_samples_per_second": 390.5, "eval_steps_per_second": 6.131, "step": 400 }, { "epoch": 0.45, "grad_norm": 27.575519561767578, "learning_rate": 4.244847192608387e-06, "loss": 0.7162, "step": 425 }, { "epoch": 0.48, "grad_norm": 35.084754943847656, "learning_rate": 4.200426439232409e-06, "loss": 0.7722, "step": 450 }, { "epoch": 0.51, "grad_norm": 28.47511863708496, "learning_rate": 4.156005685856432e-06, "loss": 0.6866, "step": 475 }, { "epoch": 0.53, "grad_norm": 32.34709548950195, "learning_rate": 4.1115849324804554e-06, "loss": 0.745, "step": 500 }, { "epoch": 0.53, "eval_accuracy": 0.8121, "eval_f1_macro": 0.7033560293953169, "eval_f1_micro": 0.8121, "eval_loss": 0.6718780994415283, "eval_runtime": 25.9161, "eval_samples_per_second": 385.86, "eval_steps_per_second": 6.058, "step": 500 }, { "epoch": 0.56, "grad_norm": 25.5845890045166, "learning_rate": 4.067164179104478e-06, "loss": 0.6535, "step": 525 }, { "epoch": 0.59, "grad_norm": 22.466503143310547, "learning_rate": 4.022743425728501e-06, "loss": 0.5969, "step": 550 }, { "epoch": 0.61, "grad_norm": 27.53134536743164, "learning_rate": 3.978322672352524e-06, "loss": 0.5926, "step": 575 }, { "epoch": 0.64, "grad_norm": 31.356454849243164, "learning_rate": 3.933901918976546e-06, "loss": 0.6505, "step": 600 }, { "epoch": 0.64, "eval_accuracy": 0.834, "eval_f1_macro": 0.7469091035082649, "eval_f1_micro": 0.834, "eval_loss": 0.6219750046730042, "eval_runtime": 25.9316, "eval_samples_per_second": 385.63, "eval_steps_per_second": 6.054, "step": 600 }, { "epoch": 0.67, "grad_norm": 37.17654800415039, "learning_rate": 3.889481165600569e-06, "loss": 0.6171, "step": 625 }, { "epoch": 0.69, "grad_norm": 26.71038055419922, "learning_rate": 3.8450604122245914e-06, "loss": 0.6218, "step": 650 }, { "epoch": 0.72, "grad_norm": 27.787952423095703, "learning_rate": 3.8006396588486145e-06, "loss": 0.6124, "step": 675 }, { "epoch": 0.75, "grad_norm": 30.405912399291992, "learning_rate": 3.756218905472637e-06, "loss": 0.5914, "step": 700 }, { "epoch": 0.75, "eval_accuracy": 0.8362, "eval_f1_macro": 0.7410957777496914, "eval_f1_micro": 0.8362, "eval_loss": 0.6109625101089478, "eval_runtime": 25.6247, "eval_samples_per_second": 390.248, "eval_steps_per_second": 6.127, "step": 700 }, { "epoch": 0.77, "grad_norm": 30.52012062072754, "learning_rate": 3.71179815209666e-06, "loss": 0.5711, "step": 725 }, { "epoch": 0.8, "grad_norm": 30.88004493713379, "learning_rate": 3.667377398720683e-06, "loss": 0.6695, "step": 750 }, { "epoch": 0.83, "grad_norm": 22.504459381103516, "learning_rate": 3.622956645344705e-06, "loss": 0.5731, "step": 775 }, { "epoch": 0.85, "grad_norm": 21.515512466430664, "learning_rate": 3.578535891968728e-06, "loss": 0.5837, "step": 800 }, { "epoch": 0.85, "eval_accuracy": 0.8385, "eval_f1_macro": 0.7413235492734335, "eval_f1_micro": 0.8385, "eval_loss": 0.5766780972480774, "eval_runtime": 25.6608, "eval_samples_per_second": 389.7, "eval_steps_per_second": 6.118, "step": 800 }, { "epoch": 0.88, "grad_norm": 30.9660587310791, "learning_rate": 3.534115138592751e-06, "loss": 0.6085, "step": 825 }, { "epoch": 0.91, "grad_norm": 18.883647918701172, "learning_rate": 3.4896943852167736e-06, "loss": 0.5121, "step": 850 }, { "epoch": 0.93, "grad_norm": 24.548561096191406, "learning_rate": 3.4452736318407963e-06, "loss": 0.5621, "step": 875 }, { "epoch": 0.96, "grad_norm": 29.833791732788086, "learning_rate": 3.4008528784648194e-06, "loss": 0.5218, "step": 900 }, { "epoch": 0.96, "eval_accuracy": 0.849, "eval_f1_macro": 0.7702797685808792, "eval_f1_micro": 0.849, "eval_loss": 0.5365203022956848, "eval_runtime": 25.9091, "eval_samples_per_second": 385.964, "eval_steps_per_second": 6.06, "step": 900 }, { "epoch": 0.99, "grad_norm": 27.948928833007812, "learning_rate": 3.3564321250888416e-06, "loss": 0.5681, "step": 925 }, { "epoch": 1.01, "grad_norm": 19.800880432128906, "learning_rate": 3.3120113717128643e-06, "loss": 0.4014, "step": 950 }, { "epoch": 1.04, "grad_norm": 19.333465576171875, "learning_rate": 3.2675906183368874e-06, "loss": 0.2795, "step": 975 }, { "epoch": 1.07, "grad_norm": 22.315195083618164, "learning_rate": 3.22316986496091e-06, "loss": 0.2632, "step": 1000 }, { "epoch": 1.07, "eval_accuracy": 0.8562, "eval_f1_macro": 0.7683569808757446, "eval_f1_micro": 0.8562, "eval_loss": 0.5503664016723633, "eval_runtime": 25.5198, "eval_samples_per_second": 391.852, "eval_steps_per_second": 6.152, "step": 1000 }, { "epoch": 1.09, "grad_norm": 24.819501876831055, "learning_rate": 3.1787491115849327e-06, "loss": 0.2532, "step": 1025 }, { "epoch": 1.12, "grad_norm": 21.534936904907227, "learning_rate": 3.1343283582089558e-06, "loss": 0.2311, "step": 1050 }, { "epoch": 1.15, "grad_norm": 24.088809967041016, "learning_rate": 3.0899076048329785e-06, "loss": 0.3134, "step": 1075 }, { "epoch": 1.17, "grad_norm": 27.605493545532227, "learning_rate": 3.0454868514570007e-06, "loss": 0.2607, "step": 1100 }, { "epoch": 1.17, "eval_accuracy": 0.8525, "eval_f1_macro": 0.7656891626030512, "eval_f1_micro": 0.8525, "eval_loss": 0.5496523380279541, "eval_runtime": 25.7081, "eval_samples_per_second": 388.982, "eval_steps_per_second": 6.107, "step": 1100 }, { "epoch": 1.2, "grad_norm": 22.955158233642578, "learning_rate": 3.0010660980810234e-06, "loss": 0.2674, "step": 1125 }, { "epoch": 1.23, "grad_norm": 19.089893341064453, "learning_rate": 2.9566453447050464e-06, "loss": 0.2074, "step": 1150 }, { "epoch": 1.25, "grad_norm": 19.285688400268555, "learning_rate": 2.912224591329069e-06, "loss": 0.2488, "step": 1175 }, { "epoch": 1.28, "grad_norm": 23.45233726501465, "learning_rate": 2.867803837953092e-06, "loss": 0.274, "step": 1200 }, { "epoch": 1.28, "eval_accuracy": 0.8584, "eval_f1_macro": 0.7746299057445165, "eval_f1_micro": 0.8584, "eval_loss": 0.5439000129699707, "eval_runtime": 25.9014, "eval_samples_per_second": 386.079, "eval_steps_per_second": 6.061, "step": 1200 }, { "epoch": 1.31, "grad_norm": 31.231454849243164, "learning_rate": 2.823383084577115e-06, "loss": 0.2624, "step": 1225 }, { "epoch": 1.33, "grad_norm": 28.1010799407959, "learning_rate": 2.7789623312011375e-06, "loss": 0.2992, "step": 1250 }, { "epoch": 1.36, "grad_norm": 30.002384185791016, "learning_rate": 2.7345415778251598e-06, "loss": 0.2589, "step": 1275 }, { "epoch": 1.39, "grad_norm": 23.61323356628418, "learning_rate": 2.690120824449183e-06, "loss": 0.2216, "step": 1300 }, { "epoch": 1.39, "eval_accuracy": 0.8563, "eval_f1_macro": 0.7753520513346309, "eval_f1_micro": 0.8563, "eval_loss": 0.5687375068664551, "eval_runtime": 25.9424, "eval_samples_per_second": 385.47, "eval_steps_per_second": 6.052, "step": 1300 }, { "epoch": 1.41, "grad_norm": 27.56183433532715, "learning_rate": 2.6457000710732055e-06, "loss": 0.2845, "step": 1325 }, { "epoch": 1.44, "grad_norm": 18.88576316833496, "learning_rate": 2.601279317697228e-06, "loss": 0.2685, "step": 1350 }, { "epoch": 1.47, "grad_norm": 19.662220001220703, "learning_rate": 2.5568585643212513e-06, "loss": 0.2489, "step": 1375 }, { "epoch": 1.49, "grad_norm": 22.736656188964844, "learning_rate": 2.512437810945274e-06, "loss": 0.2044, "step": 1400 }, { "epoch": 1.49, "eval_accuracy": 0.861, "eval_f1_macro": 0.7820141563614671, "eval_f1_micro": 0.861, "eval_loss": 0.5385035276412964, "eval_runtime": 25.6666, "eval_samples_per_second": 389.612, "eval_steps_per_second": 6.117, "step": 1400 }, { "epoch": 1.52, "grad_norm": 24.569435119628906, "learning_rate": 2.4680170575692966e-06, "loss": 0.2388, "step": 1425 }, { "epoch": 1.55, "grad_norm": 17.50179100036621, "learning_rate": 2.4235963041933193e-06, "loss": 0.2556, "step": 1450 }, { "epoch": 1.57, "grad_norm": 15.387917518615723, "learning_rate": 2.379175550817342e-06, "loss": 0.2343, "step": 1475 }, { "epoch": 1.6, "grad_norm": 29.757495880126953, "learning_rate": 2.3347547974413646e-06, "loss": 0.2508, "step": 1500 }, { "epoch": 1.6, "eval_accuracy": 0.8577, "eval_f1_macro": 0.7710712973870113, "eval_f1_micro": 0.8577, "eval_loss": 0.5657808780670166, "eval_runtime": 25.9754, "eval_samples_per_second": 384.98, "eval_steps_per_second": 6.044, "step": 1500 }, { "epoch": 1.63, "grad_norm": 24.104217529296875, "learning_rate": 2.2903340440653877e-06, "loss": 0.2647, "step": 1525 }, { "epoch": 1.65, "grad_norm": 29.48048973083496, "learning_rate": 2.24591329068941e-06, "loss": 0.212, "step": 1550 }, { "epoch": 1.68, "grad_norm": 11.834880828857422, "learning_rate": 2.201492537313433e-06, "loss": 0.1939, "step": 1575 }, { "epoch": 1.71, "grad_norm": 24.24506378173828, "learning_rate": 2.1570717839374557e-06, "loss": 0.2513, "step": 1600 }, { "epoch": 1.71, "eval_accuracy": 0.8589, "eval_f1_macro": 0.7871987440671023, "eval_f1_micro": 0.8589, "eval_loss": 0.5366827845573425, "eval_runtime": 25.9643, "eval_samples_per_second": 385.144, "eval_steps_per_second": 6.047, "step": 1600 }, { "epoch": 1.73, "grad_norm": 23.33180046081543, "learning_rate": 2.112651030561479e-06, "loss": 0.2409, "step": 1625 }, { "epoch": 1.76, "grad_norm": 18.71114730834961, "learning_rate": 2.068230277185501e-06, "loss": 0.224, "step": 1650 }, { "epoch": 1.79, "grad_norm": 21.95819854736328, "learning_rate": 2.023809523809524e-06, "loss": 0.2223, "step": 1675 }, { "epoch": 1.81, "grad_norm": 27.065677642822266, "learning_rate": 1.979388770433547e-06, "loss": 0.2787, "step": 1700 }, { "epoch": 1.81, "eval_accuracy": 0.8653, "eval_f1_macro": 0.790261134849528, "eval_f1_micro": 0.8653, "eval_loss": 0.5133171677589417, "eval_runtime": 25.5701, "eval_samples_per_second": 391.081, "eval_steps_per_second": 6.14, "step": 1700 }, { "epoch": 1.84, "grad_norm": 35.288761138916016, "learning_rate": 1.9349680170575695e-06, "loss": 0.2709, "step": 1725 }, { "epoch": 1.87, "grad_norm": 21.077306747436523, "learning_rate": 1.8905472636815921e-06, "loss": 0.2002, "step": 1750 }, { "epoch": 1.89, "grad_norm": 25.394838333129883, "learning_rate": 1.846126510305615e-06, "loss": 0.2461, "step": 1775 }, { "epoch": 1.92, "grad_norm": 26.597759246826172, "learning_rate": 1.8017057569296375e-06, "loss": 0.2357, "step": 1800 }, { "epoch": 1.92, "eval_accuracy": 0.8669, "eval_f1_macro": 0.7902403947168268, "eval_f1_micro": 0.8669, "eval_loss": 0.5063937306404114, "eval_runtime": 25.6031, "eval_samples_per_second": 390.577, "eval_steps_per_second": 6.132, "step": 1800 } ], "logging_steps": 25, "max_steps": 2814, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.074692042564567e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }