{ "best_metric": 0.8511214230471771, "best_model_checkpoint": "./XLMR-large-multi-109k-multi-outputs/checkpoint-20000", "epoch": 7.893792608539648, "eval_steps": 1000, "global_step": 22000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.35880875493362036, "grad_norm": 4.352888584136963, "learning_rate": 8.968609865470852e-07, "loss": 0.6986, "step": 1000 }, { "epoch": 0.35880875493362036, "eval_accuracy": 0.5540417801998183, "eval_f1": 0.5386783589101158, "eval_loss": 0.6878186464309692, "eval_precision": 0.5646749835850295, "eval_recall": 0.5149700598802395, "eval_runtime": 102.1517, "eval_samples_per_second": 97.003, "eval_steps_per_second": 3.035, "step": 1000 }, { "epoch": 0.7176175098672407, "grad_norm": 21.848262786865234, "learning_rate": 1.7937219730941704e-06, "loss": 0.6696, "step": 2000 }, { "epoch": 0.7176175098672407, "eval_accuracy": 0.6608134019578161, "eval_f1": 0.658677769879151, "eval_loss": 0.5846661925315857, "eval_precision": 0.6704568947694852, "eval_recall": 0.6473053892215569, "eval_runtime": 102.1359, "eval_samples_per_second": 97.018, "eval_steps_per_second": 3.035, "step": 2000 }, { "epoch": 1.0764262648008611, "grad_norm": 40.847599029541016, "learning_rate": 1.9232532642280473e-06, "loss": 0.5066, "step": 3000 }, { "epoch": 1.0764262648008611, "eval_accuracy": 0.7411444141689373, "eval_f1": 0.7009443861490031, "eval_loss": 0.4447058439254761, "eval_precision": 0.8427249789739276, "eval_recall": 0.6, "eval_runtime": 101.7923, "eval_samples_per_second": 97.345, "eval_steps_per_second": 3.045, "step": 3000 }, { "epoch": 1.4352350197344816, "grad_norm": 54.295196533203125, "learning_rate": 1.8235821788099273e-06, "loss": 0.4293, "step": 4000 }, { "epoch": 1.4352350197344816, "eval_accuracy": 0.7554748208699162, "eval_f1": 0.7800671689207588, "eval_loss": 0.4109445810317993, "eval_precision": 0.7153321125353754, "eval_recall": 0.857684630738523, "eval_runtime": 102.1072, "eval_samples_per_second": 97.045, "eval_steps_per_second": 3.036, "step": 4000 }, { "epoch": 1.794043774668102, "grad_norm": 15.192496299743652, "learning_rate": 1.723911093391807e-06, "loss": 0.402, "step": 5000 }, { "epoch": 1.794043774668102, "eval_accuracy": 0.77041073771319, "eval_f1": 0.7882736156351792, "eval_loss": 0.382622092962265, "eval_precision": 0.7384481255448997, "eval_recall": 0.845309381237525, "eval_runtime": 102.0586, "eval_samples_per_second": 97.091, "eval_steps_per_second": 3.037, "step": 5000 }, { "epoch": 2.1528525296017222, "grad_norm": 20.097732543945312, "learning_rate": 1.6242400079736866e-06, "loss": 0.379, "step": 6000 }, { "epoch": 2.1528525296017222, "eval_accuracy": 0.7927136946210516, "eval_f1": 0.8103416435826408, "eval_loss": 0.3549489676952362, "eval_precision": 0.7539518900343642, "eval_recall": 0.8758483033932136, "eval_runtime": 102.0802, "eval_samples_per_second": 97.071, "eval_steps_per_second": 3.037, "step": 6000 }, { "epoch": 2.5116612845353425, "grad_norm": 18.669349670410156, "learning_rate": 1.5245689225555665e-06, "loss": 0.3542, "step": 7000 }, { "epoch": 2.5116612845353425, "eval_accuracy": 0.8001816530426885, "eval_f1": 0.8057680988816951, "eval_loss": 0.33865031599998474, "eval_precision": 0.7922453703703703, "eval_recall": 0.8197604790419162, "eval_runtime": 101.988, "eval_samples_per_second": 97.158, "eval_steps_per_second": 3.04, "step": 7000 }, { "epoch": 2.8704700394689633, "grad_norm": 16.86187744140625, "learning_rate": 1.4248978371374463e-06, "loss": 0.3347, "step": 8000 }, { "epoch": 2.8704700394689633, "eval_accuracy": 0.8095670602482592, "eval_f1": 0.8015146733985484, "eval_loss": 0.3362087607383728, "eval_precision": 0.8472314876584389, "eval_recall": 0.7604790419161677, "eval_runtime": 102.0615, "eval_samples_per_second": 97.089, "eval_steps_per_second": 3.037, "step": 8000 }, { "epoch": 3.2292787944025836, "grad_norm": 43.31462860107422, "learning_rate": 1.325226751719326e-06, "loss": 0.3166, "step": 9000 }, { "epoch": 3.2292787944025836, "eval_accuracy": 0.812897366030881, "eval_f1": 0.8211460544086436, "eval_loss": 0.32650405168533325, "eval_precision": 0.79462285287528, "eval_recall": 0.849500998003992, "eval_runtime": 102.0249, "eval_samples_per_second": 97.123, "eval_steps_per_second": 3.038, "step": 9000 }, { "epoch": 3.588087549336204, "grad_norm": 30.563907623291016, "learning_rate": 1.2255556663012058e-06, "loss": 0.3051, "step": 10000 }, { "epoch": 3.588087549336204, "eval_accuracy": 0.8215763447371077, "eval_f1": 0.8306188925081434, "eval_loss": 0.3094022274017334, "eval_precision": 0.7986366985998526, "eval_recall": 0.8652694610778443, "eval_runtime": 101.9854, "eval_samples_per_second": 97.161, "eval_steps_per_second": 3.04, "step": 10000 }, { "epoch": 3.946896304269824, "grad_norm": 16.99272918701172, "learning_rate": 1.1258845808830857e-06, "loss": 0.3004, "step": 11000 }, { "epoch": 3.946896304269824, "eval_accuracy": 0.8221818548794025, "eval_f1": 0.82064332247557, "eval_loss": 0.30684638023376465, "eval_precision": 0.8373493975903614, "eval_recall": 0.8045908183632735, "eval_runtime": 101.8448, "eval_samples_per_second": 97.295, "eval_steps_per_second": 3.044, "step": 11000 }, { "epoch": 4.3057050592034445, "grad_norm": 13.4973783493042, "learning_rate": 1.0262134954649656e-06, "loss": 0.2808, "step": 12000 }, { "epoch": 4.3057050592034445, "eval_accuracy": 0.8267231809466142, "eval_f1": 0.8348879699971151, "eval_loss": 0.3041677176952362, "eval_precision": 0.8055297828910745, "eval_recall": 0.8664670658682635, "eval_runtime": 101.9104, "eval_samples_per_second": 97.232, "eval_steps_per_second": 3.042, "step": 12000 }, { "epoch": 4.664513814137065, "grad_norm": 11.244281768798828, "learning_rate": 9.265424100468454e-07, "loss": 0.2698, "step": 13000 }, { "epoch": 4.664513814137065, "eval_accuracy": 0.832778282369563, "eval_f1": 0.8460466412710211, "eval_loss": 0.31224456429481506, "eval_precision": 0.7914131757343994, "eval_recall": 0.9087824351297406, "eval_runtime": 102.1673, "eval_samples_per_second": 96.988, "eval_steps_per_second": 3.034, "step": 13000 }, { "epoch": 5.023322569070685, "grad_norm": 16.027326583862305, "learning_rate": 8.268713246287253e-07, "loss": 0.2744, "step": 14000 }, { "epoch": 5.023322569070685, "eval_accuracy": 0.8310626702997275, "eval_f1": 0.8291139240506329, "eval_loss": 0.3018481731414795, "eval_precision": 0.8485165064772252, "eval_recall": 0.8105788423153693, "eval_runtime": 102.3417, "eval_samples_per_second": 96.823, "eval_steps_per_second": 3.029, "step": 14000 }, { "epoch": 5.382131324004305, "grad_norm": 10.459993362426758, "learning_rate": 7.272002392106049e-07, "loss": 0.2526, "step": 15000 }, { "epoch": 5.382131324004305, "eval_accuracy": 0.8403471591482491, "eval_f1": 0.8377102995486254, "eval_loss": 0.29085829854011536, "eval_precision": 0.8617560151962853, "eval_recall": 0.8149700598802395, "eval_runtime": 102.0802, "eval_samples_per_second": 97.071, "eval_steps_per_second": 3.037, "step": 15000 }, { "epoch": 5.740940078937927, "grad_norm": 35.09458923339844, "learning_rate": 6.275291537924848e-07, "loss": 0.2526, "step": 16000 }, { "epoch": 5.740940078937927, "eval_accuracy": 0.8417600161469371, "eval_f1": 0.8460632240329865, "eval_loss": 0.2996227443218231, "eval_precision": 0.8324961360123647, "eval_recall": 0.8600798403193612, "eval_runtime": 102.7417, "eval_samples_per_second": 96.446, "eval_steps_per_second": 3.017, "step": 16000 }, { "epoch": 6.099748833871547, "grad_norm": 14.264002799987793, "learning_rate": 5.278580683743645e-07, "loss": 0.2497, "step": 17000 }, { "epoch": 6.099748833871547, "eval_accuracy": 0.8463013422141488, "eval_f1": 0.8384084880636605, "eval_loss": 0.3004078269004822, "eval_precision": 0.8949037372593431, "eval_recall": 0.7886227544910179, "eval_runtime": 102.339, "eval_samples_per_second": 96.825, "eval_steps_per_second": 3.029, "step": 17000 }, { "epoch": 6.458557588805167, "grad_norm": 22.82822036743164, "learning_rate": 4.281869829562444e-07, "loss": 0.2383, "step": 18000 }, { "epoch": 6.458557588805167, "eval_accuracy": 0.8451912402866081, "eval_f1": 0.8430851063829787, "eval_loss": 0.300709992647171, "eval_precision": 0.8646663869072597, "eval_recall": 0.8225548902195609, "eval_runtime": 102.0165, "eval_samples_per_second": 97.131, "eval_steps_per_second": 3.039, "step": 18000 }, { "epoch": 6.8173663437387875, "grad_norm": 35.29351043701172, "learning_rate": 3.285158975381242e-07, "loss": 0.2357, "step": 19000 }, { "epoch": 6.8173663437387875, "eval_accuracy": 0.8466040972852962, "eval_f1": 0.8489966222928671, "eval_loss": 0.2865988314151764, "eval_precision": 0.8451344936708861, "eval_recall": 0.8528942115768463, "eval_runtime": 102.3958, "eval_samples_per_second": 96.772, "eval_steps_per_second": 3.027, "step": 19000 }, { "epoch": 7.176175098672408, "grad_norm": 6.055088043212891, "learning_rate": 2.28844812120004e-07, "loss": 0.2308, "step": 20000 }, { "epoch": 7.176175098672408, "eval_accuracy": 0.8445857301443133, "eval_f1": 0.8511214230471771, "eval_loss": 0.29024893045425415, "eval_precision": 0.8252718410198725, "eval_recall": 0.8786427145708583, "eval_runtime": 102.3434, "eval_samples_per_second": 96.821, "eval_steps_per_second": 3.029, "step": 20000 }, { "epoch": 7.534983853606028, "grad_norm": 6.145542621612549, "learning_rate": 1.291737267018838e-07, "loss": 0.2263, "step": 21000 }, { "epoch": 7.534983853606028, "eval_accuracy": 0.8494298112826724, "eval_f1": 0.844258872651357, "eval_loss": 0.2922903001308441, "eval_precision": 0.8849015317286653, "eval_recall": 0.807185628742515, "eval_runtime": 102.1235, "eval_samples_per_second": 97.03, "eval_steps_per_second": 3.036, "step": 21000 }, { "epoch": 7.893792608539648, "grad_norm": 31.86915397644043, "learning_rate": 2.950264128376358e-08, "loss": 0.227, "step": 22000 }, { "epoch": 7.893792608539648, "eval_accuracy": 0.8489252194974266, "eval_f1": 0.8480665787069928, "eval_loss": 0.2893160879611969, "eval_precision": 0.8626884162709064, "eval_recall": 0.833932135728543, "eval_runtime": 102.3512, "eval_samples_per_second": 96.814, "eval_steps_per_second": 3.029, "step": 22000 } ], "logging_steps": 1000, "max_steps": 22296, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.6388360807888006e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }