{ "best_metric": 0.5063937306404114, "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/Qwen/Qwen1.5_1.8B_ledgar/checkpoint-1800", "epoch": 3.0, "eval_steps": 100, "global_step": 2814, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 80.4836196899414, "learning_rate": 4.9555792466240235e-06, "loss": 7.9887, "step": 25 }, { "epoch": 0.05, "grad_norm": 60.927364349365234, "learning_rate": 4.911158493248046e-06, "loss": 3.1908, "step": 50 }, { "epoch": 0.08, "grad_norm": 49.68091583251953, "learning_rate": 4.866737739872069e-06, "loss": 1.7183, "step": 75 }, { "epoch": 0.11, "grad_norm": 55.176666259765625, "learning_rate": 4.822316986496091e-06, "loss": 1.3077, "step": 100 }, { "epoch": 0.11, "eval_accuracy": 0.7277, "eval_f1_macro": 0.5770831474844406, "eval_f1_micro": 0.7277, "eval_loss": 1.0944937467575073, "eval_runtime": 25.447, "eval_samples_per_second": 392.973, "eval_steps_per_second": 6.17, "step": 100 }, { "epoch": 0.13, "grad_norm": 46.64506530761719, "learning_rate": 4.777896233120114e-06, "loss": 1.1393, "step": 125 }, { "epoch": 0.16, "grad_norm": 41.11891174316406, "learning_rate": 4.733475479744136e-06, "loss": 1.0243, "step": 150 }, { "epoch": 0.19, "grad_norm": 34.20009994506836, "learning_rate": 4.6890547263681595e-06, "loss": 0.9005, "step": 175 }, { "epoch": 0.21, "grad_norm": 38.80377197265625, "learning_rate": 4.644633972992183e-06, "loss": 0.8627, "step": 200 }, { "epoch": 0.21, "eval_accuracy": 0.7907, "eval_f1_macro": 0.6657039157603262, "eval_f1_micro": 0.7907, "eval_loss": 0.8368468880653381, "eval_runtime": 25.9484, "eval_samples_per_second": 385.38, "eval_steps_per_second": 6.05, "step": 200 }, { "epoch": 0.24, "grad_norm": 34.973506927490234, "learning_rate": 4.600213219616206e-06, "loss": 0.7896, "step": 225 }, { "epoch": 0.27, "grad_norm": 29.98388671875, "learning_rate": 4.555792466240228e-06, "loss": 0.8307, "step": 250 }, { "epoch": 0.29, "grad_norm": 27.09973907470703, "learning_rate": 4.51137171286425e-06, "loss": 0.7846, "step": 275 }, { "epoch": 0.32, "grad_norm": 36.151161193847656, "learning_rate": 4.466950959488273e-06, "loss": 0.7179, "step": 300 }, { "epoch": 0.32, "eval_accuracy": 0.7971, "eval_f1_macro": 0.6861778340669753, "eval_f1_micro": 0.7971, "eval_loss": 0.7824062705039978, "eval_runtime": 25.9003, "eval_samples_per_second": 386.095, "eval_steps_per_second": 6.062, "step": 300 }, { "epoch": 0.35, "grad_norm": 33.09822463989258, "learning_rate": 4.422530206112296e-06, "loss": 0.7133, "step": 325 }, { "epoch": 0.37, "grad_norm": 35.52923583984375, "learning_rate": 4.378109452736319e-06, "loss": 0.7354, "step": 350 }, { "epoch": 0.4, "grad_norm": 39.79545211791992, "learning_rate": 4.333688699360342e-06, "loss": 0.6619, "step": 375 }, { "epoch": 0.43, "grad_norm": 30.13161849975586, "learning_rate": 4.289267945984365e-06, "loss": 0.6961, "step": 400 }, { "epoch": 0.43, "eval_accuracy": 0.8138, "eval_f1_macro": 0.6992465625213966, "eval_f1_micro": 0.8138, "eval_loss": 0.6951531171798706, "eval_runtime": 25.6082, "eval_samples_per_second": 390.5, "eval_steps_per_second": 6.131, "step": 400 }, { "epoch": 0.45, "grad_norm": 27.575519561767578, "learning_rate": 4.244847192608387e-06, "loss": 0.7162, "step": 425 }, { "epoch": 0.48, "grad_norm": 35.084754943847656, "learning_rate": 4.200426439232409e-06, "loss": 0.7722, "step": 450 }, { "epoch": 0.51, "grad_norm": 28.47511863708496, "learning_rate": 4.156005685856432e-06, "loss": 0.6866, "step": 475 }, { "epoch": 0.53, "grad_norm": 32.34709548950195, "learning_rate": 4.1115849324804554e-06, "loss": 0.745, "step": 500 }, { "epoch": 0.53, "eval_accuracy": 0.8121, "eval_f1_macro": 0.7033560293953169, "eval_f1_micro": 0.8121, "eval_loss": 0.6718780994415283, "eval_runtime": 25.9161, "eval_samples_per_second": 385.86, "eval_steps_per_second": 6.058, "step": 500 }, { "epoch": 0.56, "grad_norm": 25.5845890045166, "learning_rate": 4.067164179104478e-06, "loss": 0.6535, "step": 525 }, { "epoch": 0.59, "grad_norm": 22.466503143310547, "learning_rate": 4.022743425728501e-06, "loss": 0.5969, "step": 550 }, { "epoch": 0.61, "grad_norm": 27.53134536743164, "learning_rate": 3.978322672352524e-06, "loss": 0.5926, "step": 575 }, { "epoch": 0.64, "grad_norm": 31.356454849243164, "learning_rate": 3.933901918976546e-06, "loss": 0.6505, "step": 600 }, { "epoch": 0.64, "eval_accuracy": 0.834, "eval_f1_macro": 0.7469091035082649, "eval_f1_micro": 0.834, "eval_loss": 0.6219750046730042, "eval_runtime": 25.9316, "eval_samples_per_second": 385.63, "eval_steps_per_second": 6.054, "step": 600 }, { "epoch": 0.67, "grad_norm": 37.17654800415039, "learning_rate": 3.889481165600569e-06, "loss": 0.6171, "step": 625 }, { "epoch": 0.69, "grad_norm": 26.71038055419922, "learning_rate": 3.8450604122245914e-06, "loss": 0.6218, "step": 650 }, { "epoch": 0.72, "grad_norm": 27.787952423095703, "learning_rate": 3.8006396588486145e-06, "loss": 0.6124, "step": 675 }, { "epoch": 0.75, "grad_norm": 30.405912399291992, "learning_rate": 3.756218905472637e-06, "loss": 0.5914, "step": 700 }, { "epoch": 0.75, "eval_accuracy": 0.8362, "eval_f1_macro": 0.7410957777496914, "eval_f1_micro": 0.8362, "eval_loss": 0.6109625101089478, "eval_runtime": 25.6247, "eval_samples_per_second": 390.248, "eval_steps_per_second": 6.127, "step": 700 }, { "epoch": 0.77, "grad_norm": 30.52012062072754, "learning_rate": 3.71179815209666e-06, "loss": 0.5711, "step": 725 }, { "epoch": 0.8, "grad_norm": 30.88004493713379, "learning_rate": 3.667377398720683e-06, "loss": 0.6695, "step": 750 }, { "epoch": 0.83, "grad_norm": 22.504459381103516, "learning_rate": 3.622956645344705e-06, "loss": 0.5731, "step": 775 }, { "epoch": 0.85, "grad_norm": 21.515512466430664, "learning_rate": 3.578535891968728e-06, "loss": 0.5837, "step": 800 }, { "epoch": 0.85, "eval_accuracy": 0.8385, "eval_f1_macro": 0.7413235492734335, "eval_f1_micro": 0.8385, "eval_loss": 0.5766780972480774, "eval_runtime": 25.6608, "eval_samples_per_second": 389.7, "eval_steps_per_second": 6.118, "step": 800 }, { "epoch": 0.88, "grad_norm": 30.9660587310791, "learning_rate": 3.534115138592751e-06, "loss": 0.6085, "step": 825 }, { "epoch": 0.91, "grad_norm": 18.883647918701172, "learning_rate": 3.4896943852167736e-06, "loss": 0.5121, "step": 850 }, { "epoch": 0.93, "grad_norm": 24.548561096191406, "learning_rate": 3.4452736318407963e-06, "loss": 0.5621, "step": 875 }, { "epoch": 0.96, "grad_norm": 29.833791732788086, "learning_rate": 3.4008528784648194e-06, "loss": 0.5218, "step": 900 }, { "epoch": 0.96, "eval_accuracy": 0.849, "eval_f1_macro": 0.7702797685808792, "eval_f1_micro": 0.849, "eval_loss": 0.5365203022956848, "eval_runtime": 25.9091, "eval_samples_per_second": 385.964, "eval_steps_per_second": 6.06, "step": 900 }, { "epoch": 0.99, "grad_norm": 27.948928833007812, "learning_rate": 3.3564321250888416e-06, "loss": 0.5681, "step": 925 }, { "epoch": 1.01, "grad_norm": 19.800880432128906, "learning_rate": 3.3120113717128643e-06, "loss": 0.4014, "step": 950 }, { "epoch": 1.04, "grad_norm": 19.333465576171875, "learning_rate": 3.2675906183368874e-06, "loss": 0.2795, "step": 975 }, { "epoch": 1.07, "grad_norm": 22.315195083618164, "learning_rate": 3.22316986496091e-06, "loss": 0.2632, "step": 1000 }, { "epoch": 1.07, "eval_accuracy": 0.8562, "eval_f1_macro": 0.7683569808757446, "eval_f1_micro": 0.8562, "eval_loss": 0.5503664016723633, "eval_runtime": 25.5198, "eval_samples_per_second": 391.852, "eval_steps_per_second": 6.152, "step": 1000 }, { "epoch": 1.09, "grad_norm": 24.819501876831055, "learning_rate": 3.1787491115849327e-06, "loss": 0.2532, "step": 1025 }, { "epoch": 1.12, "grad_norm": 21.534936904907227, "learning_rate": 3.1343283582089558e-06, "loss": 0.2311, "step": 1050 }, { "epoch": 1.15, "grad_norm": 24.088809967041016, "learning_rate": 3.0899076048329785e-06, "loss": 0.3134, "step": 1075 }, { "epoch": 1.17, "grad_norm": 27.605493545532227, "learning_rate": 3.0454868514570007e-06, "loss": 0.2607, "step": 1100 }, { "epoch": 1.17, "eval_accuracy": 0.8525, "eval_f1_macro": 0.7656891626030512, "eval_f1_micro": 0.8525, "eval_loss": 0.5496523380279541, "eval_runtime": 25.7081, "eval_samples_per_second": 388.982, "eval_steps_per_second": 6.107, "step": 1100 }, { "epoch": 1.2, "grad_norm": 22.955158233642578, "learning_rate": 3.0010660980810234e-06, "loss": 0.2674, "step": 1125 }, { "epoch": 1.23, "grad_norm": 19.089893341064453, "learning_rate": 2.9566453447050464e-06, "loss": 0.2074, "step": 1150 }, { "epoch": 1.25, "grad_norm": 19.285688400268555, "learning_rate": 2.912224591329069e-06, "loss": 0.2488, "step": 1175 }, { "epoch": 1.28, "grad_norm": 23.45233726501465, "learning_rate": 2.867803837953092e-06, "loss": 0.274, "step": 1200 }, { "epoch": 1.28, "eval_accuracy": 0.8584, "eval_f1_macro": 0.7746299057445165, "eval_f1_micro": 0.8584, "eval_loss": 0.5439000129699707, "eval_runtime": 25.9014, "eval_samples_per_second": 386.079, "eval_steps_per_second": 6.061, "step": 1200 }, { "epoch": 1.31, "grad_norm": 31.231454849243164, "learning_rate": 2.823383084577115e-06, "loss": 0.2624, "step": 1225 }, { "epoch": 1.33, "grad_norm": 28.1010799407959, "learning_rate": 2.7789623312011375e-06, "loss": 0.2992, "step": 1250 }, { "epoch": 1.36, "grad_norm": 30.002384185791016, "learning_rate": 2.7345415778251598e-06, "loss": 0.2589, "step": 1275 }, { "epoch": 1.39, "grad_norm": 23.61323356628418, "learning_rate": 2.690120824449183e-06, "loss": 0.2216, "step": 1300 }, { "epoch": 1.39, "eval_accuracy": 0.8563, "eval_f1_macro": 0.7753520513346309, "eval_f1_micro": 0.8563, "eval_loss": 0.5687375068664551, "eval_runtime": 25.9424, "eval_samples_per_second": 385.47, "eval_steps_per_second": 6.052, "step": 1300 }, { "epoch": 1.41, "grad_norm": 27.56183433532715, "learning_rate": 2.6457000710732055e-06, "loss": 0.2845, "step": 1325 }, { "epoch": 1.44, "grad_norm": 18.88576316833496, "learning_rate": 2.601279317697228e-06, "loss": 0.2685, "step": 1350 }, { "epoch": 1.47, "grad_norm": 19.662220001220703, "learning_rate": 2.5568585643212513e-06, "loss": 0.2489, "step": 1375 }, { "epoch": 1.49, "grad_norm": 22.736656188964844, "learning_rate": 2.512437810945274e-06, "loss": 0.2044, "step": 1400 }, { "epoch": 1.49, "eval_accuracy": 0.861, "eval_f1_macro": 0.7820141563614671, "eval_f1_micro": 0.861, "eval_loss": 0.5385035276412964, "eval_runtime": 25.6666, "eval_samples_per_second": 389.612, "eval_steps_per_second": 6.117, "step": 1400 }, { "epoch": 1.52, "grad_norm": 24.569435119628906, "learning_rate": 2.4680170575692966e-06, "loss": 0.2388, "step": 1425 }, { "epoch": 1.55, "grad_norm": 17.50179100036621, "learning_rate": 2.4235963041933193e-06, "loss": 0.2556, "step": 1450 }, { "epoch": 1.57, "grad_norm": 15.387917518615723, "learning_rate": 2.379175550817342e-06, "loss": 0.2343, "step": 1475 }, { "epoch": 1.6, "grad_norm": 29.757495880126953, "learning_rate": 2.3347547974413646e-06, "loss": 0.2508, "step": 1500 }, { "epoch": 1.6, "eval_accuracy": 0.8577, "eval_f1_macro": 0.7710712973870113, "eval_f1_micro": 0.8577, "eval_loss": 0.5657808780670166, "eval_runtime": 25.9754, "eval_samples_per_second": 384.98, "eval_steps_per_second": 6.044, "step": 1500 }, { "epoch": 1.63, "grad_norm": 24.104217529296875, "learning_rate": 2.2903340440653877e-06, "loss": 0.2647, "step": 1525 }, { "epoch": 1.65, "grad_norm": 29.48048973083496, "learning_rate": 2.24591329068941e-06, "loss": 0.212, "step": 1550 }, { "epoch": 1.68, "grad_norm": 11.834880828857422, "learning_rate": 2.201492537313433e-06, "loss": 0.1939, "step": 1575 }, { "epoch": 1.71, "grad_norm": 24.24506378173828, "learning_rate": 2.1570717839374557e-06, "loss": 0.2513, "step": 1600 }, { "epoch": 1.71, "eval_accuracy": 0.8589, "eval_f1_macro": 0.7871987440671023, "eval_f1_micro": 0.8589, "eval_loss": 0.5366827845573425, "eval_runtime": 25.9643, "eval_samples_per_second": 385.144, "eval_steps_per_second": 6.047, "step": 1600 }, { "epoch": 1.73, "grad_norm": 23.33180046081543, "learning_rate": 2.112651030561479e-06, "loss": 0.2409, "step": 1625 }, { "epoch": 1.76, "grad_norm": 18.71114730834961, "learning_rate": 2.068230277185501e-06, "loss": 0.224, "step": 1650 }, { "epoch": 1.79, "grad_norm": 21.95819854736328, "learning_rate": 2.023809523809524e-06, "loss": 0.2223, "step": 1675 }, { "epoch": 1.81, "grad_norm": 27.065677642822266, "learning_rate": 1.979388770433547e-06, "loss": 0.2787, "step": 1700 }, { "epoch": 1.81, "eval_accuracy": 0.8653, "eval_f1_macro": 0.790261134849528, "eval_f1_micro": 0.8653, "eval_loss": 0.5133171677589417, "eval_runtime": 25.5701, "eval_samples_per_second": 391.081, "eval_steps_per_second": 6.14, "step": 1700 }, { "epoch": 1.84, "grad_norm": 35.288761138916016, "learning_rate": 1.9349680170575695e-06, "loss": 0.2709, "step": 1725 }, { "epoch": 1.87, "grad_norm": 21.077306747436523, "learning_rate": 1.8905472636815921e-06, "loss": 0.2002, "step": 1750 }, { "epoch": 1.89, "grad_norm": 25.394838333129883, "learning_rate": 1.846126510305615e-06, "loss": 0.2461, "step": 1775 }, { "epoch": 1.92, "grad_norm": 26.597759246826172, "learning_rate": 1.8017057569296375e-06, "loss": 0.2357, "step": 1800 }, { "epoch": 1.92, "eval_accuracy": 0.8669, "eval_f1_macro": 0.7902403947168268, "eval_f1_micro": 0.8669, "eval_loss": 0.5063937306404114, "eval_runtime": 25.6031, "eval_samples_per_second": 390.577, "eval_steps_per_second": 6.132, "step": 1800 }, { "epoch": 1.95, "grad_norm": 18.62090301513672, "learning_rate": 1.7572850035536603e-06, "loss": 0.2612, "step": 1825 }, { "epoch": 1.97, "grad_norm": 25.897939682006836, "learning_rate": 1.7128642501776832e-06, "loss": 0.2243, "step": 1850 }, { "epoch": 2.0, "grad_norm": 20.556882858276367, "learning_rate": 1.668443496801706e-06, "loss": 0.2078, "step": 1875 }, { "epoch": 2.03, "grad_norm": 5.211686134338379, "learning_rate": 1.6240227434257286e-06, "loss": 0.049, "step": 1900 }, { "epoch": 2.03, "eval_accuracy": 0.8719, "eval_f1_macro": 0.797777536741942, "eval_f1_micro": 0.8719, "eval_loss": 0.5344421863555908, "eval_runtime": 25.9399, "eval_samples_per_second": 385.506, "eval_steps_per_second": 6.052, "step": 1900 }, { "epoch": 2.05, "grad_norm": 14.868837356567383, "learning_rate": 1.5796019900497514e-06, "loss": 0.0483, "step": 1925 }, { "epoch": 2.08, "grad_norm": 3.739365577697754, "learning_rate": 1.5351812366737743e-06, "loss": 0.0426, "step": 1950 }, { "epoch": 2.11, "grad_norm": 3.052903413772583, "learning_rate": 1.4907604832977968e-06, "loss": 0.0468, "step": 1975 }, { "epoch": 2.13, "grad_norm": 11.233345985412598, "learning_rate": 1.4463397299218196e-06, "loss": 0.0298, "step": 2000 }, { "epoch": 2.13, "eval_accuracy": 0.8737, "eval_f1_macro": 0.7992354841882687, "eval_f1_micro": 0.8737, "eval_loss": 0.5761749744415283, "eval_runtime": 25.6811, "eval_samples_per_second": 389.392, "eval_steps_per_second": 6.113, "step": 2000 }, { "epoch": 2.16, "grad_norm": 9.836750030517578, "learning_rate": 1.4019189765458423e-06, "loss": 0.0306, "step": 2025 }, { "epoch": 2.19, "grad_norm": 12.054607391357422, "learning_rate": 1.357498223169865e-06, "loss": 0.0408, "step": 2050 }, { "epoch": 2.21, "grad_norm": 2.877735137939453, "learning_rate": 1.3130774697938879e-06, "loss": 0.032, "step": 2075 }, { "epoch": 2.24, "grad_norm": 18.573556900024414, "learning_rate": 1.2686567164179105e-06, "loss": 0.0427, "step": 2100 }, { "epoch": 2.24, "eval_accuracy": 0.8708, "eval_f1_macro": 0.7976411680340069, "eval_f1_micro": 0.8708, "eval_loss": 0.5961406230926514, "eval_runtime": 25.6941, "eval_samples_per_second": 389.194, "eval_steps_per_second": 6.11, "step": 2100 }, { "epoch": 2.27, "grad_norm": 11.545409202575684, "learning_rate": 1.2242359630419332e-06, "loss": 0.0343, "step": 2125 }, { "epoch": 2.29, "grad_norm": 3.3840279579162598, "learning_rate": 1.179815209665956e-06, "loss": 0.0237, "step": 2150 }, { "epoch": 2.32, "grad_norm": 7.452319145202637, "learning_rate": 1.1353944562899787e-06, "loss": 0.042, "step": 2175 }, { "epoch": 2.35, "grad_norm": 7.546860694885254, "learning_rate": 1.0909737029140014e-06, "loss": 0.036, "step": 2200 }, { "epoch": 2.35, "eval_accuracy": 0.8728, "eval_f1_macro": 0.7987820731312831, "eval_f1_micro": 0.8728, "eval_loss": 0.6128308773040771, "eval_runtime": 25.9603, "eval_samples_per_second": 385.204, "eval_steps_per_second": 6.048, "step": 2200 }, { "epoch": 2.37, "grad_norm": 0.8773216605186462, "learning_rate": 1.0465529495380243e-06, "loss": 0.0264, "step": 2225 }, { "epoch": 2.4, "grad_norm": 1.4390593767166138, "learning_rate": 1.002132196162047e-06, "loss": 0.0326, "step": 2250 }, { "epoch": 2.43, "grad_norm": 3.424440622329712, "learning_rate": 9.577114427860696e-07, "loss": 0.0265, "step": 2275 }, { "epoch": 2.45, "grad_norm": 13.154258728027344, "learning_rate": 9.132906894100925e-07, "loss": 0.0551, "step": 2300 }, { "epoch": 2.45, "eval_accuracy": 0.8708, "eval_f1_macro": 0.7975921884184456, "eval_f1_micro": 0.8708, "eval_loss": 0.6165248155593872, "eval_runtime": 25.655, "eval_samples_per_second": 389.788, "eval_steps_per_second": 6.12, "step": 2300 }, { "epoch": 2.48, "grad_norm": 8.90126895904541, "learning_rate": 8.688699360341152e-07, "loss": 0.0359, "step": 2325 }, { "epoch": 2.51, "grad_norm": 10.456062316894531, "learning_rate": 8.24449182658138e-07, "loss": 0.0454, "step": 2350 }, { "epoch": 2.53, "grad_norm": 13.38987922668457, "learning_rate": 7.800284292821607e-07, "loss": 0.0329, "step": 2375 }, { "epoch": 2.56, "grad_norm": 6.421198844909668, "learning_rate": 7.356076759061834e-07, "loss": 0.0392, "step": 2400 }, { "epoch": 2.56, "eval_accuracy": 0.8749, "eval_f1_macro": 0.8038155628364919, "eval_f1_micro": 0.8749, "eval_loss": 0.6023103594779968, "eval_runtime": 25.6716, "eval_samples_per_second": 389.535, "eval_steps_per_second": 6.116, "step": 2400 }, { "epoch": 2.59, "grad_norm": 16.32231330871582, "learning_rate": 6.911869225302062e-07, "loss": 0.0319, "step": 2425 }, { "epoch": 2.61, "grad_norm": 5.884388446807861, "learning_rate": 6.467661691542289e-07, "loss": 0.041, "step": 2450 }, { "epoch": 2.64, "grad_norm": 17.850648880004883, "learning_rate": 6.023454157782517e-07, "loss": 0.036, "step": 2475 }, { "epoch": 2.67, "grad_norm": 16.628997802734375, "learning_rate": 5.579246624022743e-07, "loss": 0.0364, "step": 2500 }, { "epoch": 2.67, "eval_accuracy": 0.8729, "eval_f1_macro": 0.8001251167012569, "eval_f1_micro": 0.8729, "eval_loss": 0.6167578101158142, "eval_runtime": 25.9524, "eval_samples_per_second": 385.321, "eval_steps_per_second": 6.05, "step": 2500 }, { "epoch": 2.69, "grad_norm": 4.081849575042725, "learning_rate": 5.135039090262971e-07, "loss": 0.0418, "step": 2525 }, { "epoch": 2.72, "grad_norm": 8.027618408203125, "learning_rate": 4.690831556503199e-07, "loss": 0.0324, "step": 2550 }, { "epoch": 2.75, "grad_norm": 9.084144592285156, "learning_rate": 4.2466240227434256e-07, "loss": 0.0286, "step": 2575 }, { "epoch": 2.77, "grad_norm": 5.414234161376953, "learning_rate": 3.8024164889836533e-07, "loss": 0.0416, "step": 2600 }, { "epoch": 2.77, "eval_accuracy": 0.8753, "eval_f1_macro": 0.8048163846871306, "eval_f1_micro": 0.8753, "eval_loss": 0.6102917790412903, "eval_runtime": 25.9892, "eval_samples_per_second": 384.775, "eval_steps_per_second": 6.041, "step": 2600 }, { "epoch": 2.8, "grad_norm": 7.557031154632568, "learning_rate": 3.358208955223881e-07, "loss": 0.0271, "step": 2625 }, { "epoch": 2.83, "grad_norm": 5.741875648498535, "learning_rate": 2.914001421464108e-07, "loss": 0.0367, "step": 2650 }, { "epoch": 2.85, "grad_norm": 12.493782997131348, "learning_rate": 2.4697938877043354e-07, "loss": 0.0274, "step": 2675 }, { "epoch": 2.88, "grad_norm": 2.7892916202545166, "learning_rate": 2.0255863539445632e-07, "loss": 0.0353, "step": 2700 }, { "epoch": 2.88, "eval_accuracy": 0.8749, "eval_f1_macro": 0.8053988442835892, "eval_f1_micro": 0.8749, "eval_loss": 0.6117515563964844, "eval_runtime": 25.6582, "eval_samples_per_second": 389.74, "eval_steps_per_second": 6.119, "step": 2700 }, { "epoch": 2.91, "grad_norm": 5.350805759429932, "learning_rate": 1.5813788201847903e-07, "loss": 0.0261, "step": 2725 }, { "epoch": 2.93, "grad_norm": 11.953265190124512, "learning_rate": 1.1371712864250178e-07, "loss": 0.0377, "step": 2750 }, { "epoch": 2.96, "grad_norm": 13.371731758117676, "learning_rate": 6.929637526652453e-08, "loss": 0.0255, "step": 2775 }, { "epoch": 2.99, "grad_norm": 15.417765617370605, "learning_rate": 2.4875621890547265e-08, "loss": 0.0308, "step": 2800 }, { "epoch": 2.99, "eval_accuracy": 0.875, "eval_f1_macro": 0.805663420581269, "eval_f1_micro": 0.875, "eval_loss": 0.611430287361145, "eval_runtime": 25.9862, "eval_samples_per_second": 384.819, "eval_steps_per_second": 6.042, "step": 2800 }, { "epoch": 3.0, "step": 2814, "total_flos": 1.6801018930213683e+17, "train_loss": 0.42635450247932005, "train_runtime": 3623.2348, "train_samples_per_second": 49.679, "train_steps_per_second": 0.777 } ], "logging_steps": 25, "max_steps": 2814, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.6801018930213683e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }