{ "best_metric": 0.5465434193611145, "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/google_t5/t5_small_ledgar/checkpoint-2800", "epoch": 2.9850746268656714, "eval_steps": 100, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 1.7876489162445068, "learning_rate": 0.0004955579246624023, "loss": 4.3484, "step": 25 }, { "epoch": 0.05, "grad_norm": 1.8848645687103271, "learning_rate": 0.0004911158493248046, "loss": 3.6753, "step": 50 }, { "epoch": 0.08, "grad_norm": 1.908879041671753, "learning_rate": 0.00048667377398720687, "loss": 2.9728, "step": 75 }, { "epoch": 0.11, "grad_norm": 2.578331708908081, "learning_rate": 0.0004822316986496091, "loss": 2.3898, "step": 100 }, { "epoch": 0.11, "eval_accuracy": 0.6083, "eval_f1_macro": 0.3304955770365372, "eval_f1_micro": 0.6083, "eval_loss": 1.8530884981155396, "eval_runtime": 8.4672, "eval_samples_per_second": 1181.029, "eval_steps_per_second": 18.542, "step": 100 }, { "epoch": 0.13, "grad_norm": 2.0771608352661133, "learning_rate": 0.0004777896233120114, "loss": 1.8696, "step": 125 }, { "epoch": 0.16, "grad_norm": 2.535646438598633, "learning_rate": 0.00047334754797441367, "loss": 1.6221, "step": 150 }, { "epoch": 0.19, "grad_norm": 2.7007129192352295, "learning_rate": 0.00046890547263681595, "loss": 1.3656, "step": 175 }, { "epoch": 0.21, "grad_norm": 1.9887003898620605, "learning_rate": 0.00046446339729921824, "loss": 1.1887, "step": 200 }, { "epoch": 0.21, "eval_accuracy": 0.7307, "eval_f1_macro": 0.5339813428887715, "eval_f1_micro": 0.7307, "eval_loss": 1.0730310678482056, "eval_runtime": 8.4036, "eval_samples_per_second": 1189.963, "eval_steps_per_second": 18.682, "step": 200 }, { "epoch": 0.24, "grad_norm": 2.7746949195861816, "learning_rate": 0.0004600213219616205, "loss": 1.1038, "step": 225 }, { "epoch": 0.27, "grad_norm": 2.2641148567199707, "learning_rate": 0.00045557924662402275, "loss": 1.0577, "step": 250 }, { "epoch": 0.29, "grad_norm": 2.5648245811462402, "learning_rate": 0.000451137171286425, "loss": 1.0154, "step": 275 }, { "epoch": 0.32, "grad_norm": 2.4375596046447754, "learning_rate": 0.00044669509594882727, "loss": 0.946, "step": 300 }, { "epoch": 0.32, "eval_accuracy": 0.77, "eval_f1_macro": 0.6068459597214834, "eval_f1_micro": 0.77, "eval_loss": 0.8826236128807068, "eval_runtime": 8.4296, "eval_samples_per_second": 1186.302, "eval_steps_per_second": 18.625, "step": 300 }, { "epoch": 0.35, "grad_norm": 1.7988697290420532, "learning_rate": 0.00044225302061122956, "loss": 0.9106, "step": 325 }, { "epoch": 0.37, "grad_norm": 2.5426275730133057, "learning_rate": 0.00043781094527363184, "loss": 0.8913, "step": 350 }, { "epoch": 0.4, "grad_norm": 2.555204391479492, "learning_rate": 0.0004333688699360341, "loss": 0.8176, "step": 375 }, { "epoch": 0.43, "grad_norm": 2.390054941177368, "learning_rate": 0.0004289267945984364, "loss": 0.8383, "step": 400 }, { "epoch": 0.43, "eval_accuracy": 0.7851, "eval_f1_macro": 0.6350877431295691, "eval_f1_micro": 0.7851, "eval_loss": 0.8015705943107605, "eval_runtime": 8.4118, "eval_samples_per_second": 1188.809, "eval_steps_per_second": 18.664, "step": 400 }, { "epoch": 0.45, "grad_norm": 2.5415456295013428, "learning_rate": 0.00042448471926083864, "loss": 0.8728, "step": 425 }, { "epoch": 0.48, "grad_norm": 2.9339377880096436, "learning_rate": 0.00042004264392324093, "loss": 0.9204, "step": 450 }, { "epoch": 0.51, "grad_norm": 2.0690979957580566, "learning_rate": 0.0004156005685856432, "loss": 0.859, "step": 475 }, { "epoch": 0.53, "grad_norm": 2.4634995460510254, "learning_rate": 0.0004111584932480455, "loss": 0.8559, "step": 500 }, { "epoch": 0.53, "eval_accuracy": 0.8011, "eval_f1_macro": 0.674742730667306, "eval_f1_micro": 0.8011, "eval_loss": 0.7436844110488892, "eval_runtime": 8.4108, "eval_samples_per_second": 1188.952, "eval_steps_per_second": 18.667, "step": 500 }, { "epoch": 0.56, "grad_norm": 1.8953908681869507, "learning_rate": 0.0004067164179104478, "loss": 0.8096, "step": 525 }, { "epoch": 0.59, "grad_norm": 1.3915468454360962, "learning_rate": 0.00040227434257285007, "loss": 0.7702, "step": 550 }, { "epoch": 0.61, "grad_norm": 3.767996072769165, "learning_rate": 0.00039783226723525235, "loss": 0.7342, "step": 575 }, { "epoch": 0.64, "grad_norm": 2.76263165473938, "learning_rate": 0.0003933901918976546, "loss": 0.7944, "step": 600 }, { "epoch": 0.64, "eval_accuracy": 0.8091, "eval_f1_macro": 0.6932776242562073, "eval_f1_micro": 0.8091, "eval_loss": 0.706779956817627, "eval_runtime": 8.4071, "eval_samples_per_second": 1189.465, "eval_steps_per_second": 18.675, "step": 600 }, { "epoch": 0.67, "grad_norm": 2.232637882232666, "learning_rate": 0.00038894811656005687, "loss": 0.7901, "step": 625 }, { "epoch": 0.69, "grad_norm": 2.9547877311706543, "learning_rate": 0.00038450604122245916, "loss": 0.7668, "step": 650 }, { "epoch": 0.72, "grad_norm": 2.981079578399658, "learning_rate": 0.00038006396588486144, "loss": 0.7579, "step": 675 }, { "epoch": 0.75, "grad_norm": 1.8093345165252686, "learning_rate": 0.0003756218905472637, "loss": 0.7151, "step": 700 }, { "epoch": 0.75, "eval_accuracy": 0.8191, "eval_f1_macro": 0.6983413957630255, "eval_f1_micro": 0.8191, "eval_loss": 0.6853256821632385, "eval_runtime": 8.3986, "eval_samples_per_second": 1190.672, "eval_steps_per_second": 18.694, "step": 700 }, { "epoch": 0.77, "grad_norm": 2.0441739559173584, "learning_rate": 0.00037117981520966596, "loss": 0.6854, "step": 725 }, { "epoch": 0.8, "grad_norm": 2.1465673446655273, "learning_rate": 0.00036673773987206824, "loss": 0.7541, "step": 750 }, { "epoch": 0.83, "grad_norm": 1.6907949447631836, "learning_rate": 0.0003622956645344705, "loss": 0.7349, "step": 775 }, { "epoch": 0.85, "grad_norm": 2.384601354598999, "learning_rate": 0.00035785358919687276, "loss": 0.7077, "step": 800 }, { "epoch": 0.85, "eval_accuracy": 0.8187, "eval_f1_macro": 0.7119750119742265, "eval_f1_micro": 0.8187, "eval_loss": 0.6665953993797302, "eval_runtime": 8.3427, "eval_samples_per_second": 1198.652, "eval_steps_per_second": 18.819, "step": 800 }, { "epoch": 0.88, "grad_norm": 2.5287020206451416, "learning_rate": 0.00035341151385927504, "loss": 0.756, "step": 825 }, { "epoch": 0.91, "grad_norm": 2.267557382583618, "learning_rate": 0.00034896943852167733, "loss": 0.6752, "step": 850 }, { "epoch": 0.93, "grad_norm": 2.61407208442688, "learning_rate": 0.0003445273631840796, "loss": 0.6903, "step": 875 }, { "epoch": 0.96, "grad_norm": 2.658088445663452, "learning_rate": 0.0003400852878464819, "loss": 0.6645, "step": 900 }, { "epoch": 0.96, "eval_accuracy": 0.8196, "eval_f1_macro": 0.7210971149577416, "eval_f1_micro": 0.8196, "eval_loss": 0.6476125121116638, "eval_runtime": 8.3951, "eval_samples_per_second": 1191.177, "eval_steps_per_second": 18.701, "step": 900 }, { "epoch": 0.99, "grad_norm": 2.093398332595825, "learning_rate": 0.00033564321250888413, "loss": 0.6929, "step": 925 }, { "epoch": 1.01, "grad_norm": 1.9347296953201294, "learning_rate": 0.0003312011371712864, "loss": 0.6415, "step": 950 }, { "epoch": 1.04, "grad_norm": 2.2153849601745605, "learning_rate": 0.0003267590618336887, "loss": 0.6228, "step": 975 }, { "epoch": 1.07, "grad_norm": 1.8041683435440063, "learning_rate": 0.000322316986496091, "loss": 0.5918, "step": 1000 }, { "epoch": 1.07, "eval_accuracy": 0.8297, "eval_f1_macro": 0.7261577527278286, "eval_f1_micro": 0.8297, "eval_loss": 0.646891713142395, "eval_runtime": 8.3945, "eval_samples_per_second": 1191.262, "eval_steps_per_second": 18.703, "step": 1000 }, { "epoch": 1.09, "grad_norm": 1.6319005489349365, "learning_rate": 0.00031787491115849327, "loss": 0.5817, "step": 1025 }, { "epoch": 1.12, "grad_norm": 2.7899856567382812, "learning_rate": 0.00031343283582089556, "loss": 0.581, "step": 1050 }, { "epoch": 1.15, "grad_norm": 2.230139970779419, "learning_rate": 0.00030899076048329784, "loss": 0.642, "step": 1075 }, { "epoch": 1.17, "grad_norm": 2.1724562644958496, "learning_rate": 0.0003045486851457001, "loss": 0.5866, "step": 1100 }, { "epoch": 1.17, "eval_accuracy": 0.8288, "eval_f1_macro": 0.7285961959729647, "eval_f1_micro": 0.8288, "eval_loss": 0.6308562755584717, "eval_runtime": 8.3833, "eval_samples_per_second": 1192.854, "eval_steps_per_second": 18.728, "step": 1100 }, { "epoch": 1.2, "grad_norm": 1.5955884456634521, "learning_rate": 0.00030010660980810236, "loss": 0.605, "step": 1125 }, { "epoch": 1.23, "grad_norm": 1.6877374649047852, "learning_rate": 0.00029566453447050464, "loss": 0.5427, "step": 1150 }, { "epoch": 1.25, "grad_norm": 1.8254977464675903, "learning_rate": 0.0002912224591329069, "loss": 0.5278, "step": 1175 }, { "epoch": 1.28, "grad_norm": 2.1816537380218506, "learning_rate": 0.00028678038379530916, "loss": 0.6665, "step": 1200 }, { "epoch": 1.28, "eval_accuracy": 0.8363, "eval_f1_macro": 0.7473206082782838, "eval_f1_micro": 0.8363, "eval_loss": 0.6188435554504395, "eval_runtime": 8.3807, "eval_samples_per_second": 1193.214, "eval_steps_per_second": 18.733, "step": 1200 }, { "epoch": 1.31, "grad_norm": 2.975969076156616, "learning_rate": 0.00028233830845771145, "loss": 0.6379, "step": 1225 }, { "epoch": 1.33, "grad_norm": 1.7407147884368896, "learning_rate": 0.00027789623312011373, "loss": 0.6198, "step": 1250 }, { "epoch": 1.36, "grad_norm": 1.3424392938613892, "learning_rate": 0.00027345415778251596, "loss": 0.5808, "step": 1275 }, { "epoch": 1.39, "grad_norm": 3.161609172821045, "learning_rate": 0.00026901208244491825, "loss": 0.5684, "step": 1300 }, { "epoch": 1.39, "eval_accuracy": 0.837, "eval_f1_macro": 0.7455872865987888, "eval_f1_micro": 0.837, "eval_loss": 0.6117735505104065, "eval_runtime": 8.3705, "eval_samples_per_second": 1194.666, "eval_steps_per_second": 18.756, "step": 1300 }, { "epoch": 1.41, "grad_norm": 2.2285666465759277, "learning_rate": 0.00026457000710732053, "loss": 0.5964, "step": 1325 }, { "epoch": 1.44, "grad_norm": 1.9442161321640015, "learning_rate": 0.0002601279317697228, "loss": 0.5932, "step": 1350 }, { "epoch": 1.47, "grad_norm": 2.2172343730926514, "learning_rate": 0.0002556858564321251, "loss": 0.5752, "step": 1375 }, { "epoch": 1.49, "grad_norm": 1.6125428676605225, "learning_rate": 0.0002512437810945274, "loss": 0.4986, "step": 1400 }, { "epoch": 1.49, "eval_accuracy": 0.8374, "eval_f1_macro": 0.7519546979290682, "eval_f1_micro": 0.8374, "eval_loss": 0.6116703152656555, "eval_runtime": 8.3786, "eval_samples_per_second": 1193.512, "eval_steps_per_second": 18.738, "step": 1400 }, { "epoch": 1.52, "grad_norm": 2.1590497493743896, "learning_rate": 0.0002468017057569296, "loss": 0.5876, "step": 1425 }, { "epoch": 1.55, "grad_norm": 2.2718493938446045, "learning_rate": 0.00024235963041933193, "loss": 0.5973, "step": 1450 }, { "epoch": 1.57, "grad_norm": 1.4332152605056763, "learning_rate": 0.0002379175550817342, "loss": 0.5829, "step": 1475 }, { "epoch": 1.6, "grad_norm": 1.6546821594238281, "learning_rate": 0.00023347547974413648, "loss": 0.5786, "step": 1500 }, { "epoch": 1.6, "eval_accuracy": 0.8363, "eval_f1_macro": 0.7461921136413558, "eval_f1_micro": 0.8363, "eval_loss": 0.6103752851486206, "eval_runtime": 8.3693, "eval_samples_per_second": 1194.837, "eval_steps_per_second": 18.759, "step": 1500 }, { "epoch": 1.63, "grad_norm": 1.9171111583709717, "learning_rate": 0.00022903340440653876, "loss": 0.5505, "step": 1525 }, { "epoch": 1.65, "grad_norm": 2.0956099033355713, "learning_rate": 0.000224591329068941, "loss": 0.535, "step": 1550 }, { "epoch": 1.68, "grad_norm": 2.201850414276123, "learning_rate": 0.00022014925373134328, "loss": 0.5165, "step": 1575 }, { "epoch": 1.71, "grad_norm": 1.9713943004608154, "learning_rate": 0.00021570717839374556, "loss": 0.5956, "step": 1600 }, { "epoch": 1.71, "eval_accuracy": 0.8365, "eval_f1_macro": 0.7455143804937282, "eval_f1_micro": 0.8365, "eval_loss": 0.5964870452880859, "eval_runtime": 8.3692, "eval_samples_per_second": 1194.851, "eval_steps_per_second": 18.759, "step": 1600 }, { "epoch": 1.73, "grad_norm": 2.684727907180786, "learning_rate": 0.00021126510305614785, "loss": 0.5603, "step": 1625 }, { "epoch": 1.76, "grad_norm": 2.506880283355713, "learning_rate": 0.0002068230277185501, "loss": 0.5434, "step": 1650 }, { "epoch": 1.79, "grad_norm": 2.2245657444000244, "learning_rate": 0.0002023809523809524, "loss": 0.5225, "step": 1675 }, { "epoch": 1.81, "grad_norm": 2.1831839084625244, "learning_rate": 0.00019793887704335468, "loss": 0.5653, "step": 1700 }, { "epoch": 1.81, "eval_accuracy": 0.8425, "eval_f1_macro": 0.75879689124166, "eval_f1_micro": 0.8425, "eval_loss": 0.5817448496818542, "eval_runtime": 8.2668, "eval_samples_per_second": 1209.66, "eval_steps_per_second": 18.992, "step": 1700 }, { "epoch": 1.84, "grad_norm": 3.0693764686584473, "learning_rate": 0.00019349680170575694, "loss": 0.5874, "step": 1725 }, { "epoch": 1.87, "grad_norm": 2.015434741973877, "learning_rate": 0.00018905472636815922, "loss": 0.5276, "step": 1750 }, { "epoch": 1.89, "grad_norm": 1.8584706783294678, "learning_rate": 0.00018461265103056148, "loss": 0.5956, "step": 1775 }, { "epoch": 1.92, "grad_norm": 2.165771007537842, "learning_rate": 0.00018017057569296374, "loss": 0.5292, "step": 1800 }, { "epoch": 1.92, "eval_accuracy": 0.842, "eval_f1_macro": 0.7515987736281642, "eval_f1_micro": 0.842, "eval_loss": 0.5732225775718689, "eval_runtime": 8.3577, "eval_samples_per_second": 1196.496, "eval_steps_per_second": 18.785, "step": 1800 }, { "epoch": 1.95, "grad_norm": 1.7540485858917236, "learning_rate": 0.00017572850035536602, "loss": 0.6377, "step": 1825 }, { "epoch": 1.97, "grad_norm": 2.1525564193725586, "learning_rate": 0.0001712864250177683, "loss": 0.5995, "step": 1850 }, { "epoch": 2.0, "grad_norm": 1.8227964639663696, "learning_rate": 0.0001668443496801706, "loss": 0.4946, "step": 1875 }, { "epoch": 2.03, "grad_norm": 1.2243927717208862, "learning_rate": 0.00016240227434257285, "loss": 0.4674, "step": 1900 }, { "epoch": 2.03, "eval_accuracy": 0.8456, "eval_f1_macro": 0.7544341963792838, "eval_f1_micro": 0.8456, "eval_loss": 0.5669511556625366, "eval_runtime": 8.3607, "eval_samples_per_second": 1196.075, "eval_steps_per_second": 18.778, "step": 1900 }, { "epoch": 2.05, "grad_norm": 1.6851863861083984, "learning_rate": 0.00015796019900497514, "loss": 0.4851, "step": 1925 }, { "epoch": 2.08, "grad_norm": 1.5451288223266602, "learning_rate": 0.00015351812366737742, "loss": 0.4883, "step": 1950 }, { "epoch": 2.11, "grad_norm": 1.6870522499084473, "learning_rate": 0.00014907604832977968, "loss": 0.4975, "step": 1975 }, { "epoch": 2.13, "grad_norm": 2.1712546348571777, "learning_rate": 0.00014463397299218194, "loss": 0.452, "step": 2000 }, { "epoch": 2.13, "eval_accuracy": 0.847, "eval_f1_macro": 0.7615345380474722, "eval_f1_micro": 0.847, "eval_loss": 0.5686009526252747, "eval_runtime": 8.3632, "eval_samples_per_second": 1195.713, "eval_steps_per_second": 18.773, "step": 2000 }, { "epoch": 2.16, "grad_norm": 2.1659963130950928, "learning_rate": 0.00014019189765458422, "loss": 0.4517, "step": 2025 }, { "epoch": 2.19, "grad_norm": 1.7023169994354248, "learning_rate": 0.00013574982231698648, "loss": 0.4938, "step": 2050 }, { "epoch": 2.21, "grad_norm": 1.8300412893295288, "learning_rate": 0.00013130774697938877, "loss": 0.4468, "step": 2075 }, { "epoch": 2.24, "grad_norm": 2.360961675643921, "learning_rate": 0.00012686567164179105, "loss": 0.4827, "step": 2100 }, { "epoch": 2.24, "eval_accuracy": 0.8461, "eval_f1_macro": 0.7715708187088293, "eval_f1_micro": 0.8461, "eval_loss": 0.5636329054832458, "eval_runtime": 8.3161, "eval_samples_per_second": 1202.485, "eval_steps_per_second": 18.879, "step": 2100 }, { "epoch": 2.27, "grad_norm": 1.511915683746338, "learning_rate": 0.0001224235963041933, "loss": 0.4353, "step": 2125 }, { "epoch": 2.29, "grad_norm": 1.1389654874801636, "learning_rate": 0.0001179815209665956, "loss": 0.4304, "step": 2150 }, { "epoch": 2.32, "grad_norm": 1.280794620513916, "learning_rate": 0.00011353944562899787, "loss": 0.3848, "step": 2175 }, { "epoch": 2.35, "grad_norm": 1.9939156770706177, "learning_rate": 0.00010909737029140014, "loss": 0.4617, "step": 2200 }, { "epoch": 2.35, "eval_accuracy": 0.8491, "eval_f1_macro": 0.7612720077781023, "eval_f1_micro": 0.8491, "eval_loss": 0.5610827803611755, "eval_runtime": 8.3647, "eval_samples_per_second": 1195.499, "eval_steps_per_second": 18.769, "step": 2200 }, { "epoch": 2.37, "grad_norm": 1.6452387571334839, "learning_rate": 0.00010465529495380242, "loss": 0.41, "step": 2225 }, { "epoch": 2.4, "grad_norm": 1.8779040575027466, "learning_rate": 0.0001002132196162047, "loss": 0.4365, "step": 2250 }, { "epoch": 2.43, "grad_norm": 2.088947296142578, "learning_rate": 9.577114427860697e-05, "loss": 0.4681, "step": 2275 }, { "epoch": 2.45, "grad_norm": 2.114861011505127, "learning_rate": 9.132906894100924e-05, "loss": 0.4508, "step": 2300 }, { "epoch": 2.45, "eval_accuracy": 0.8499, "eval_f1_macro": 0.7610245022013704, "eval_f1_micro": 0.8499, "eval_loss": 0.5593724846839905, "eval_runtime": 8.3635, "eval_samples_per_second": 1195.672, "eval_steps_per_second": 18.772, "step": 2300 }, { "epoch": 2.48, "grad_norm": 2.641166925430298, "learning_rate": 8.688699360341151e-05, "loss": 0.4928, "step": 2325 }, { "epoch": 2.51, "grad_norm": 2.3455002307891846, "learning_rate": 8.24449182658138e-05, "loss": 0.477, "step": 2350 }, { "epoch": 2.53, "grad_norm": 2.1197574138641357, "learning_rate": 7.800284292821607e-05, "loss": 0.4401, "step": 2375 }, { "epoch": 2.56, "grad_norm": 2.29925537109375, "learning_rate": 7.356076759061834e-05, "loss": 0.432, "step": 2400 }, { "epoch": 2.56, "eval_accuracy": 0.85, "eval_f1_macro": 0.7653751726212001, "eval_f1_micro": 0.85, "eval_loss": 0.5532112121582031, "eval_runtime": 8.3602, "eval_samples_per_second": 1196.146, "eval_steps_per_second": 18.779, "step": 2400 }, { "epoch": 2.59, "grad_norm": 1.8475215435028076, "learning_rate": 6.911869225302061e-05, "loss": 0.53, "step": 2425 }, { "epoch": 2.61, "grad_norm": 2.600374221801758, "learning_rate": 6.467661691542288e-05, "loss": 0.4721, "step": 2450 }, { "epoch": 2.64, "grad_norm": 2.111037492752075, "learning_rate": 6.023454157782516e-05, "loss": 0.4328, "step": 2475 }, { "epoch": 2.67, "grad_norm": 2.5330514907836914, "learning_rate": 5.579246624022743e-05, "loss": 0.4298, "step": 2500 }, { "epoch": 2.67, "eval_accuracy": 0.8503, "eval_f1_macro": 0.7666150685407888, "eval_f1_micro": 0.8503, "eval_loss": 0.5520814061164856, "eval_runtime": 8.2724, "eval_samples_per_second": 1208.838, "eval_steps_per_second": 18.979, "step": 2500 }, { "epoch": 2.69, "grad_norm": 2.078260898590088, "learning_rate": 5.135039090262971e-05, "loss": 0.473, "step": 2525 }, { "epoch": 2.72, "grad_norm": 2.062211751937866, "learning_rate": 4.690831556503199e-05, "loss": 0.4516, "step": 2550 }, { "epoch": 2.75, "grad_norm": 2.3843843936920166, "learning_rate": 4.2466240227434255e-05, "loss": 0.4763, "step": 2575 }, { "epoch": 2.77, "grad_norm": 1.875730037689209, "learning_rate": 3.802416488983653e-05, "loss": 0.4627, "step": 2600 }, { "epoch": 2.77, "eval_accuracy": 0.85, "eval_f1_macro": 0.7661219116228528, "eval_f1_micro": 0.85, "eval_loss": 0.5510755181312561, "eval_runtime": 8.3637, "eval_samples_per_second": 1195.644, "eval_steps_per_second": 18.772, "step": 2600 }, { "epoch": 2.8, "grad_norm": 1.708553671836853, "learning_rate": 3.3582089552238805e-05, "loss": 0.4443, "step": 2625 }, { "epoch": 2.83, "grad_norm": 2.3728208541870117, "learning_rate": 2.9140014214641083e-05, "loss": 0.4784, "step": 2650 }, { "epoch": 2.85, "grad_norm": 2.382185459136963, "learning_rate": 2.4697938877043355e-05, "loss": 0.4554, "step": 2675 }, { "epoch": 2.88, "grad_norm": 1.8948814868927002, "learning_rate": 2.025586353944563e-05, "loss": 0.4353, "step": 2700 }, { "epoch": 2.88, "eval_accuracy": 0.8532, "eval_f1_macro": 0.7705762474067852, "eval_f1_micro": 0.8532, "eval_loss": 0.5466225743293762, "eval_runtime": 8.3561, "eval_samples_per_second": 1196.726, "eval_steps_per_second": 18.789, "step": 2700 }, { "epoch": 2.91, "grad_norm": 1.1043734550476074, "learning_rate": 1.5813788201847902e-05, "loss": 0.4379, "step": 2725 }, { "epoch": 2.93, "grad_norm": 2.514373302459717, "learning_rate": 1.1371712864250177e-05, "loss": 0.442, "step": 2750 }, { "epoch": 2.96, "grad_norm": 2.475358724594116, "learning_rate": 6.929637526652452e-06, "loss": 0.4166, "step": 2775 }, { "epoch": 2.99, "grad_norm": 1.829185128211975, "learning_rate": 2.4875621890547264e-06, "loss": 0.4371, "step": 2800 }, { "epoch": 2.99, "eval_accuracy": 0.8527, "eval_f1_macro": 0.7698066205317469, "eval_f1_micro": 0.8527, "eval_loss": 0.5465434193611145, "eval_runtime": 8.2631, "eval_samples_per_second": 1210.206, "eval_steps_per_second": 19.0, "step": 2800 } ], "logging_steps": 25, "max_steps": 2814, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 6106520949358592.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }