{ "best_metric": 0.267339825630188, "best_model_checkpoint": "./checkpoint-xlm-v-base/checkpoint-62000", "epoch": 3.032583672746591, "eval_steps": 1000, "global_step": 68500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 46.0612678527832, "learning_rate": 1.9873510587163855e-05, "loss": 0.943, "step": 1000 }, { "epoch": 0.04, "eval_LOC_f1": 0.6007898672649095, "eval_ORG_f1": 0.5205864729691723, "eval_PER_f1": 0.6537866457692849, "eval_loss": 0.5755352973937988, "eval_overall_accuracy": 0.813493507822672, "eval_overall_f1": 0.5959796923883924, "eval_overall_precision": 0.571592844668358, "eval_overall_recall": 0.622540194436182, "eval_runtime": 909.7639, "eval_samples_per_second": 72.217, "eval_steps_per_second": 0.282, "step": 1000 }, { "epoch": 0.09, "grad_norm": 5.543514251708984, "learning_rate": 1.974702117432771e-05, "loss": 0.5825, "step": 2000 }, { "epoch": 0.09, "eval_LOC_f1": 0.6761863812260971, "eval_ORG_f1": 0.573783382357224, "eval_PER_f1": 0.7231398018028106, "eval_loss": 0.5157074332237244, "eval_overall_accuracy": 0.844698117312631, "eval_overall_f1": 0.664661446599853, "eval_overall_precision": 0.690302943433134, "eval_overall_recall": 0.640856646367237, "eval_runtime": 884.4485, "eval_samples_per_second": 74.284, "eval_steps_per_second": 0.291, "step": 2000 }, { "epoch": 0.13, "grad_norm": 30.680952072143555, "learning_rate": 1.9620531761491565e-05, "loss": 0.5153, "step": 3000 }, { "epoch": 0.13, "eval_LOC_f1": 0.6448332585613877, "eval_ORG_f1": 0.5780655943179445, "eval_PER_f1": 0.749264457627936, "eval_loss": 0.4841216504573822, "eval_overall_accuracy": 0.8415159100197845, "eval_overall_f1": 0.6599932560127353, "eval_overall_precision": 0.6612013701212998, "eval_overall_recall": 0.6587895486638383, "eval_runtime": 887.5751, "eval_samples_per_second": 74.022, "eval_steps_per_second": 0.29, "step": 3000 }, { "epoch": 0.18, "grad_norm": 14.460062026977539, "learning_rate": 1.949404234865542e-05, "loss": 0.4744, "step": 4000 }, { "epoch": 0.18, "eval_LOC_f1": 0.7135048963789569, "eval_ORG_f1": 0.6044746860257756, "eval_PER_f1": 0.7562344421814889, "eval_loss": 0.4284209907054901, "eval_overall_accuracy": 0.8650805108611886, "eval_overall_f1": 0.6945276906141412, "eval_overall_precision": 0.6951840204528166, "eval_overall_recall": 0.6938725989010129, "eval_runtime": 884.4505, "eval_samples_per_second": 74.283, "eval_steps_per_second": 0.291, "step": 4000 }, { "epoch": 0.22, "grad_norm": 2.3655357360839844, "learning_rate": 1.9367552935819272e-05, "loss": 0.4385, "step": 5000 }, { "epoch": 0.22, "eval_LOC_f1": 0.7295629535257298, "eval_ORG_f1": 0.6200475216886777, "eval_PER_f1": 0.7408250910983861, "eval_loss": 0.4239448308944702, "eval_overall_accuracy": 0.8635053351958905, "eval_overall_f1": 0.7025319684063077, "eval_overall_precision": 0.7043199823770524, "eval_overall_recall": 0.7007530096904989, "eval_runtime": 907.5971, "eval_samples_per_second": 72.389, "eval_steps_per_second": 0.283, "step": 5000 }, { "epoch": 0.27, "grad_norm": 9.867854118347168, "learning_rate": 1.924106352298313e-05, "loss": 0.4279, "step": 6000 }, { "epoch": 0.27, "eval_LOC_f1": 0.7370099725835874, "eval_ORG_f1": 0.6403813434199981, "eval_PER_f1": 0.7776639577500056, "eval_loss": 0.38233184814453125, "eval_overall_accuracy": 0.8816618781055326, "eval_overall_f1": 0.7238370468534203, "eval_overall_precision": 0.7385234418271267, "eval_overall_recall": 0.7097233746105797, "eval_runtime": 948.2573, "eval_samples_per_second": 69.285, "eval_steps_per_second": 0.271, "step": 6000 }, { "epoch": 0.31, "grad_norm": 25.877347946166992, "learning_rate": 1.9114574110146982e-05, "loss": 0.4099, "step": 7000 }, { "epoch": 0.31, "eval_LOC_f1": 0.7575684397708062, "eval_ORG_f1": 0.630874803840732, "eval_PER_f1": 0.7838224767358626, "eval_loss": 0.38040244579315186, "eval_overall_accuracy": 0.883136741379065, "eval_overall_f1": 0.7288917006049582, "eval_overall_precision": 0.7364569017865703, "eval_overall_recall": 0.7214803450381201, "eval_runtime": 907.1265, "eval_samples_per_second": 72.427, "eval_steps_per_second": 0.283, "step": 7000 }, { "epoch": 0.35, "grad_norm": 30.637121200561523, "learning_rate": 1.8988084697310836e-05, "loss": 0.3874, "step": 8000 }, { "epoch": 0.35, "eval_LOC_f1": 0.7474734456723695, "eval_ORG_f1": 0.6407748343462335, "eval_PER_f1": 0.7854664027017585, "eval_loss": 0.37021398544311523, "eval_overall_accuracy": 0.8872800498308584, "eval_overall_f1": 0.7295473133392094, "eval_overall_precision": 0.74022719948437, "eval_overall_recall": 0.7191712196878376, "eval_runtime": 884.9913, "eval_samples_per_second": 74.238, "eval_steps_per_second": 0.29, "step": 8000 }, { "epoch": 0.4, "grad_norm": 20.109619140625, "learning_rate": 1.8861595284474693e-05, "loss": 0.3841, "step": 9000 }, { "epoch": 0.4, "eval_LOC_f1": 0.7684967782745274, "eval_ORG_f1": 0.655326947582435, "eval_PER_f1": 0.789217873159736, "eval_loss": 0.3808096945285797, "eval_overall_accuracy": 0.8879015799879489, "eval_overall_f1": 0.743771496693436, "eval_overall_precision": 0.76602787456446, "eval_overall_recall": 0.7227718897255663, "eval_runtime": 885.0879, "eval_samples_per_second": 74.23, "eval_steps_per_second": 0.29, "step": 9000 }, { "epoch": 0.44, "grad_norm": 10.265982627868652, "learning_rate": 1.8735105871638546e-05, "loss": 0.3764, "step": 10000 }, { "epoch": 0.44, "eval_LOC_f1": 0.7831821749367751, "eval_ORG_f1": 0.6622161847467495, "eval_PER_f1": 0.7948864849077164, "eval_loss": 0.34247785806655884, "eval_overall_accuracy": 0.8911041208737209, "eval_overall_f1": 0.7509862429761675, "eval_overall_precision": 0.7436605881991772, "eval_overall_recall": 0.7584576608168825, "eval_runtime": 885.3889, "eval_samples_per_second": 74.205, "eval_steps_per_second": 0.29, "step": 10000 }, { "epoch": 0.49, "grad_norm": 4.401586055755615, "learning_rate": 1.86086164588024e-05, "loss": 0.3564, "step": 11000 }, { "epoch": 0.49, "eval_LOC_f1": 0.789610444706057, "eval_ORG_f1": 0.6701892389389907, "eval_PER_f1": 0.8054954166474735, "eval_loss": 0.35062676668167114, "eval_overall_accuracy": 0.8928101093201735, "eval_overall_f1": 0.7598466310260445, "eval_overall_precision": 0.7565214692509428, "eval_overall_recall": 0.7632011522144121, "eval_runtime": 885.0018, "eval_samples_per_second": 74.237, "eval_steps_per_second": 0.29, "step": 11000 }, { "epoch": 0.53, "grad_norm": 1.0011909008026123, "learning_rate": 1.8482127045966253e-05, "loss": 0.3484, "step": 12000 }, { "epoch": 0.53, "eval_LOC_f1": 0.7712024123633622, "eval_ORG_f1": 0.6809512535185331, "eval_PER_f1": 0.8100081183474653, "eval_loss": 0.37064051628112793, "eval_overall_accuracy": 0.8851477337194005, "eval_overall_f1": 0.758193057536852, "eval_overall_precision": 0.7375241450255696, "eval_overall_recall": 0.7800538534996947, "eval_runtime": 950.4008, "eval_samples_per_second": 69.129, "eval_steps_per_second": 0.27, "step": 12000 }, { "epoch": 0.58, "grad_norm": 13.891754150390625, "learning_rate": 1.8355637633130106e-05, "loss": 0.3563, "step": 13000 }, { "epoch": 0.58, "eval_LOC_f1": 0.7934942596408595, "eval_ORG_f1": 0.6686377545091862, "eval_PER_f1": 0.8152987398240509, "eval_loss": 0.3389296531677246, "eval_overall_accuracy": 0.8935766857734662, "eval_overall_f1": 0.7638085016673694, "eval_overall_precision": 0.7483700877298401, "eval_overall_recall": 0.7798973026284891, "eval_runtime": 970.0425, "eval_samples_per_second": 67.729, "eval_steps_per_second": 0.265, "step": 13000 }, { "epoch": 0.62, "grad_norm": 19.86951446533203, "learning_rate": 1.8229148220293963e-05, "loss": 0.3396, "step": 14000 }, { "epoch": 0.62, "eval_LOC_f1": 0.7943446440452429, "eval_ORG_f1": 0.6860717813631874, "eval_PER_f1": 0.808841180333809, "eval_loss": 0.33801111578941345, "eval_overall_accuracy": 0.8965501348456104, "eval_overall_f1": 0.767065352823492, "eval_overall_precision": 0.770362767931157, "eval_overall_recall": 0.7637960455249934, "eval_runtime": 892.9487, "eval_samples_per_second": 73.576, "eval_steps_per_second": 0.288, "step": 14000 }, { "epoch": 0.66, "grad_norm": 6.008892059326172, "learning_rate": 1.8102658807457817e-05, "loss": 0.3513, "step": 15000 }, { "epoch": 0.66, "eval_LOC_f1": 0.796750172086523, "eval_ORG_f1": 0.6869723599718148, "eval_PER_f1": 0.8222321051851345, "eval_loss": 0.3108769357204437, "eval_overall_accuracy": 0.900560597156957, "eval_overall_f1": 0.773478314631055, "eval_overall_precision": 0.7637860773210824, "eval_overall_recall": 0.7834196972306151, "eval_runtime": 885.3039, "eval_samples_per_second": 74.212, "eval_steps_per_second": 0.29, "step": 15000 }, { "epoch": 0.71, "grad_norm": 36.22490692138672, "learning_rate": 1.7976169394621674e-05, "loss": 0.3332, "step": 16000 }, { "epoch": 0.71, "eval_LOC_f1": 0.8066277986085525, "eval_ORG_f1": 0.6983725665012176, "eval_PER_f1": 0.8332057011578333, "eval_loss": 0.31944143772125244, "eval_overall_accuracy": 0.9018524580941595, "eval_overall_f1": 0.7838363709114303, "eval_overall_precision": 0.7856547450536315, "eval_overall_recall": 0.7820263944768853, "eval_runtime": 885.7636, "eval_samples_per_second": 74.173, "eval_steps_per_second": 0.29, "step": 16000 }, { "epoch": 0.75, "grad_norm": 12.729876518249512, "learning_rate": 1.7849679981785527e-05, "loss": 0.3324, "step": 17000 }, { "epoch": 0.75, "eval_LOC_f1": 0.8091387580118001, "eval_ORG_f1": 0.7010727593385756, "eval_PER_f1": 0.8321347376235326, "eval_loss": 0.3180248737335205, "eval_overall_accuracy": 0.9008195115735422, "eval_overall_f1": 0.7850356248119096, "eval_overall_precision": 0.7839540384210074, "eval_overall_recall": 0.7861201997589117, "eval_runtime": 884.7821, "eval_samples_per_second": 74.256, "eval_steps_per_second": 0.29, "step": 17000 }, { "epoch": 0.8, "grad_norm": 8.6387939453125, "learning_rate": 1.772319056894938e-05, "loss": 0.3385, "step": 18000 }, { "epoch": 0.8, "eval_LOC_f1": 0.7763382604086921, "eval_ORG_f1": 0.6755694544225161, "eval_PER_f1": 0.8256535378180551, "eval_loss": 0.33690232038497925, "eval_overall_accuracy": 0.8964328777930731, "eval_overall_f1": 0.7651045588831604, "eval_overall_precision": 0.755082284607938, "eval_overall_recall": 0.7753964650813282, "eval_runtime": 890.8661, "eval_samples_per_second": 73.748, "eval_steps_per_second": 0.288, "step": 18000 }, { "epoch": 0.84, "grad_norm": 9.216795921325684, "learning_rate": 1.7596701156113234e-05, "loss": 0.3267, "step": 19000 }, { "epoch": 0.84, "eval_LOC_f1": 0.8015086633534421, "eval_ORG_f1": 0.70105107410561, "eval_PER_f1": 0.8364990020855292, "eval_loss": 0.3270108103752136, "eval_overall_accuracy": 0.9019283701744147, "eval_overall_f1": 0.7856358496296815, "eval_overall_precision": 0.7816849022099619, "eval_overall_recall": 0.789626939273917, "eval_runtime": 950.9161, "eval_samples_per_second": 69.091, "eval_steps_per_second": 0.27, "step": 19000 }, { "epoch": 0.89, "grad_norm": 10.781560897827148, "learning_rate": 1.747021174327709e-05, "loss": 0.3233, "step": 20000 }, { "epoch": 0.89, "eval_LOC_f1": 0.8056958546374753, "eval_ORG_f1": 0.6917519315097098, "eval_PER_f1": 0.8345942771968653, "eval_loss": 0.30491939187049866, "eval_overall_accuracy": 0.9033815442821598, "eval_overall_f1": 0.783789045768533, "eval_overall_precision": 0.7798233379823338, "eval_overall_recall": 0.7877952940808116, "eval_runtime": 917.217, "eval_samples_per_second": 71.63, "eval_steps_per_second": 0.28, "step": 20000 }, { "epoch": 0.93, "grad_norm": 15.803696632385254, "learning_rate": 1.7343722330440944e-05, "loss": 0.3112, "step": 21000 }, { "epoch": 0.93, "eval_LOC_f1": 0.8185129588612579, "eval_ORG_f1": 0.7198663442992582, "eval_PER_f1": 0.8407628833170422, "eval_loss": 0.3230852782726288, "eval_overall_accuracy": 0.9054094812832666, "eval_overall_f1": 0.7978603713205666, "eval_overall_precision": 0.7936613663999752, "eval_overall_recall": 0.8021040437090032, "eval_runtime": 886.1519, "eval_samples_per_second": 74.141, "eval_steps_per_second": 0.29, "step": 21000 }, { "epoch": 0.97, "grad_norm": 2.6037917137145996, "learning_rate": 1.7217232917604798e-05, "loss": 0.3256, "step": 22000 }, { "epoch": 0.97, "eval_LOC_f1": 0.8135399708226737, "eval_ORG_f1": 0.7164829968607569, "eval_PER_f1": 0.8380035321776889, "eval_loss": 0.3068985044956207, "eval_overall_accuracy": 0.9048909746636654, "eval_overall_f1": 0.7934551505253751, "eval_overall_precision": 0.781072268142868, "eval_overall_recall": 0.806236986708831, "eval_runtime": 885.0947, "eval_samples_per_second": 74.229, "eval_steps_per_second": 0.29, "step": 22000 }, { "epoch": 1.02, "grad_norm": 20.057506561279297, "learning_rate": 1.709074350476865e-05, "loss": 0.2931, "step": 23000 }, { "epoch": 1.02, "eval_LOC_f1": 0.8090675101559834, "eval_ORG_f1": 0.7063356635658061, "eval_PER_f1": 0.8313781014023732, "eval_loss": 0.3428688645362854, "eval_overall_accuracy": 0.8993378704357015, "eval_overall_f1": 0.7878607280931577, "eval_overall_precision": 0.779137676333938, "eval_overall_recall": 0.7967813140880129, "eval_runtime": 913.4969, "eval_samples_per_second": 71.921, "eval_steps_per_second": 0.281, "step": 23000 }, { "epoch": 1.06, "grad_norm": 10.215392112731934, "learning_rate": 1.6964254091932504e-05, "loss": 0.2936, "step": 24000 }, { "epoch": 1.06, "eval_LOC_f1": 0.8279096724337782, "eval_ORG_f1": 0.7213703143205346, "eval_PER_f1": 0.8399587047500638, "eval_loss": 0.3276561200618744, "eval_overall_accuracy": 0.9048035402140856, "eval_overall_f1": 0.8012747941254013, "eval_overall_precision": 0.7938388261504187, "eval_overall_recall": 0.8088513862579645, "eval_runtime": 888.4065, "eval_samples_per_second": 73.953, "eval_steps_per_second": 0.289, "step": 24000 }, { "epoch": 1.11, "grad_norm": 13.276248931884766, "learning_rate": 1.683776467909636e-05, "loss": 0.2797, "step": 25000 }, { "epoch": 1.11, "eval_LOC_f1": 0.8137454037894649, "eval_ORG_f1": 0.7157125069541951, "eval_PER_f1": 0.8422392139071162, "eval_loss": 0.30677318572998047, "eval_overall_accuracy": 0.9050170429398038, "eval_overall_f1": 0.7953167244565965, "eval_overall_precision": 0.7764588335296837, "eval_overall_recall": 0.8151134211061885, "eval_runtime": 923.9779, "eval_samples_per_second": 71.106, "eval_steps_per_second": 0.278, "step": 25000 }, { "epoch": 1.15, "grad_norm": 0.08989755064249039, "learning_rate": 1.6711275266260215e-05, "loss": 0.2792, "step": 26000 }, { "epoch": 1.15, "eval_LOC_f1": 0.8237387838615486, "eval_ORG_f1": 0.717827626918536, "eval_PER_f1": 0.8449089917750373, "eval_loss": 0.3350381851196289, "eval_overall_accuracy": 0.9065115620198306, "eval_overall_f1": 0.7990867935095444, "eval_overall_precision": 0.7941203753807146, "eval_overall_recall": 0.8041157224039952, "eval_runtime": 950.4133, "eval_samples_per_second": 69.128, "eval_steps_per_second": 0.27, "step": 26000 }, { "epoch": 1.2, "grad_norm": 32.182918548583984, "learning_rate": 1.658478585342407e-05, "loss": 0.2698, "step": 27000 }, { "epoch": 1.2, "eval_LOC_f1": 0.8267147626869356, "eval_ORG_f1": 0.7322118816415036, "eval_PER_f1": 0.8463502705378134, "eval_loss": 0.3303050696849823, "eval_overall_accuracy": 0.9053356025623038, "eval_overall_f1": 0.8061476513209491, "eval_overall_precision": 0.803686040812516, "eval_overall_recall": 0.8086243874947164, "eval_runtime": 903.1604, "eval_samples_per_second": 72.745, "eval_steps_per_second": 0.285, "step": 27000 }, { "epoch": 1.24, "grad_norm": 27.704275131225586, "learning_rate": 1.6458296440587925e-05, "loss": 0.2846, "step": 28000 }, { "epoch": 1.24, "eval_LOC_f1": 0.8198407012516331, "eval_ORG_f1": 0.7170252756930114, "eval_PER_f1": 0.8497716275494053, "eval_loss": 0.3040228486061096, "eval_overall_accuracy": 0.9088763588770705, "eval_overall_f1": 0.7998951624404026, "eval_overall_precision": 0.7879266486958503, "eval_overall_recall": 0.8122328850760054, "eval_runtime": 883.6437, "eval_samples_per_second": 74.351, "eval_steps_per_second": 0.291, "step": 28000 }, { "epoch": 1.28, "grad_norm": 1.532094120979309, "learning_rate": 1.633180702775178e-05, "loss": 0.2765, "step": 29000 }, { "epoch": 1.28, "eval_LOC_f1": 0.8226323815533471, "eval_ORG_f1": 0.7322743544720759, "eval_PER_f1": 0.8458698818030955, "eval_loss": 0.3010263741016388, "eval_overall_accuracy": 0.9093941877102408, "eval_overall_f1": 0.8048208514659728, "eval_overall_precision": 0.799766577265244, "eval_overall_recall": 0.8099394148128435, "eval_runtime": 883.3843, "eval_samples_per_second": 74.373, "eval_steps_per_second": 0.291, "step": 29000 }, { "epoch": 1.33, "grad_norm": 2.7648439407348633, "learning_rate": 1.6205317614915632e-05, "loss": 0.2758, "step": 30000 }, { "epoch": 1.33, "eval_LOC_f1": 0.8158232882579698, "eval_ORG_f1": 0.7211553763726063, "eval_PER_f1": 0.8392668350824088, "eval_loss": 0.2979504466056824, "eval_overall_accuracy": 0.907270682822384, "eval_overall_f1": 0.7967142515352101, "eval_overall_precision": 0.7737616641463505, "eval_overall_recall": 0.8210701817555615, "eval_runtime": 884.2556, "eval_samples_per_second": 74.3, "eval_steps_per_second": 0.291, "step": 30000 }, { "epoch": 1.37, "grad_norm": 2.9498727321624756, "learning_rate": 1.607882820207949e-05, "loss": 0.2745, "step": 31000 }, { "epoch": 1.37, "eval_LOC_f1": 0.8269012485811577, "eval_ORG_f1": 0.7277582167305856, "eval_PER_f1": 0.856517895595802, "eval_loss": 0.2944641709327698, "eval_overall_accuracy": 0.9108385505943848, "eval_overall_f1": 0.807803496021649, "eval_overall_precision": 0.7947313807024321, "eval_overall_recall": 0.8213128356059302, "eval_runtime": 883.9066, "eval_samples_per_second": 74.329, "eval_steps_per_second": 0.291, "step": 31000 }, { "epoch": 1.42, "grad_norm": 11.60289192199707, "learning_rate": 1.5952338789243342e-05, "loss": 0.2645, "step": 32000 }, { "epoch": 1.42, "eval_LOC_f1": 0.8305319969159598, "eval_ORG_f1": 0.7228604829282057, "eval_PER_f1": 0.8315148384875288, "eval_loss": 0.32325080037117004, "eval_overall_accuracy": 0.9048618298471388, "eval_overall_f1": 0.7998450483255535, "eval_overall_precision": 0.7917570997998328, "eval_overall_recall": 0.8080999420761776, "eval_runtime": 933.3011, "eval_samples_per_second": 70.395, "eval_steps_per_second": 0.275, "step": 32000 }, { "epoch": 1.46, "grad_norm": 42.618431091308594, "learning_rate": 1.5825849376407196e-05, "loss": 0.2779, "step": 33000 }, { "epoch": 1.46, "eval_LOC_f1": 0.8264125401549256, "eval_ORG_f1": 0.7424042624042624, "eval_PER_f1": 0.8601716304896517, "eval_loss": 0.2943771183490753, "eval_overall_accuracy": 0.9127221190857203, "eval_overall_f1": 0.8132353632361465, "eval_overall_precision": 0.8138473840171838, "eval_overall_recall": 0.8126242622540194, "eval_runtime": 953.7502, "eval_samples_per_second": 68.886, "eval_steps_per_second": 0.269, "step": 33000 }, { "epoch": 1.51, "grad_norm": 15.319729804992676, "learning_rate": 1.569935996357105e-05, "loss": 0.2709, "step": 34000 }, { "epoch": 1.51, "eval_LOC_f1": 0.832774509183695, "eval_ORG_f1": 0.7316936984844457, "eval_PER_f1": 0.8539732494099136, "eval_loss": 0.2914768159389496, "eval_overall_accuracy": 0.9130203451152948, "eval_overall_f1": 0.8107029247351679, "eval_overall_precision": 0.7998217523118878, "eval_overall_recall": 0.8218842462858306, "eval_runtime": 894.9182, "eval_samples_per_second": 73.415, "eval_steps_per_second": 0.287, "step": 34000 }, { "epoch": 1.55, "grad_norm": 1.931920051574707, "learning_rate": 1.5572870550734906e-05, "loss": 0.2631, "step": 35000 }, { "epoch": 1.55, "eval_LOC_f1": 0.8323614548810673, "eval_ORG_f1": 0.7279775567457282, "eval_PER_f1": 0.8522675037838443, "eval_loss": 0.3124816417694092, "eval_overall_accuracy": 0.9096829247297835, "eval_overall_f1": 0.8079843932416348, "eval_overall_precision": 0.7857095311702623, "eval_overall_recall": 0.8315590901263366, "eval_runtime": 886.0992, "eval_samples_per_second": 74.145, "eval_steps_per_second": 0.29, "step": 35000 }, { "epoch": 1.59, "grad_norm": 2.4540863037109375, "learning_rate": 1.544638113789876e-05, "loss": 0.2684, "step": 36000 }, { "epoch": 1.59, "eval_LOC_f1": 0.8353469255313396, "eval_ORG_f1": 0.743517370545253, "eval_PER_f1": 0.8544175455688603, "eval_loss": 0.31003931164741516, "eval_overall_accuracy": 0.9140255023922472, "eval_overall_f1": 0.8147700607298496, "eval_overall_precision": 0.8114867383067271, "eval_overall_recall": 0.8180800601155346, "eval_runtime": 886.9365, "eval_samples_per_second": 74.075, "eval_steps_per_second": 0.29, "step": 36000 }, { "epoch": 1.64, "grad_norm": 1.3480443954467773, "learning_rate": 1.5319891725062616e-05, "loss": 0.2546, "step": 37000 }, { "epoch": 1.64, "eval_LOC_f1": 0.8268920250802105, "eval_ORG_f1": 0.7359205250232403, "eval_PER_f1": 0.8566762684569846, "eval_loss": 0.3172565698623657, "eval_overall_accuracy": 0.9102766656432092, "eval_overall_f1": 0.8115168704156479, "eval_overall_precision": 0.8111456076827428, "eval_overall_recall": 0.8118884731593531, "eval_runtime": 886.1348, "eval_samples_per_second": 74.142, "eval_steps_per_second": 0.29, "step": 37000 }, { "epoch": 1.68, "grad_norm": 2.66180419921875, "learning_rate": 1.5193402312226468e-05, "loss": 0.2642, "step": 38000 }, { "epoch": 1.68, "eval_LOC_f1": 0.8459391601383606, "eval_ORG_f1": 0.7362593503366764, "eval_PER_f1": 0.8645872824401172, "eval_loss": 0.2804827094078064, "eval_overall_accuracy": 0.913356527184997, "eval_overall_f1": 0.8191614534186092, "eval_overall_precision": 0.8128202954617264, "eval_overall_recall": 0.8256023294769635, "eval_runtime": 901.1127, "eval_samples_per_second": 72.91, "eval_steps_per_second": 0.285, "step": 38000 }, { "epoch": 1.73, "grad_norm": 47.826175689697266, "learning_rate": 1.5066912899390323e-05, "loss": 0.2776, "step": 39000 }, { "epoch": 1.73, "eval_LOC_f1": 0.8417204029165086, "eval_ORG_f1": 0.7461313828771049, "eval_PER_f1": 0.8621870343195805, "eval_loss": 0.2955803871154785, "eval_overall_accuracy": 0.9141780043391887, "eval_overall_f1": 0.8213932893138981, "eval_overall_precision": 0.8195021231836067, "eval_overall_recall": 0.8232932041266809, "eval_runtime": 949.1162, "eval_samples_per_second": 69.222, "eval_steps_per_second": 0.271, "step": 39000 }, { "epoch": 1.77, "grad_norm": 14.713150024414062, "learning_rate": 1.4940423486554176e-05, "loss": 0.2616, "step": 40000 }, { "epoch": 1.77, "eval_LOC_f1": 0.8312933303965682, "eval_ORG_f1": 0.738299968952903, "eval_PER_f1": 0.8513267743278481, "eval_loss": 0.29292425513267517, "eval_overall_accuracy": 0.9104176452208262, "eval_overall_f1": 0.8118819476942669, "eval_overall_precision": 0.796034420507883, "eval_overall_recall": 0.8283732798973026, "eval_runtime": 908.7427, "eval_samples_per_second": 72.298, "eval_steps_per_second": 0.283, "step": 40000 }, { "epoch": 1.82, "grad_norm": 16.184900283813477, "learning_rate": 1.481393407371803e-05, "loss": 0.2701, "step": 41000 }, { "epoch": 1.82, "eval_LOC_f1": 0.8392716598242965, "eval_ORG_f1": 0.74401776384535, "eval_PER_f1": 0.8639952804501724, "eval_loss": 0.2767677903175354, "eval_overall_accuracy": 0.9163774213073009, "eval_overall_f1": 0.8195078963845922, "eval_overall_precision": 0.8094401856885441, "eval_overall_recall": 0.8298292029995147, "eval_runtime": 885.7415, "eval_samples_per_second": 74.175, "eval_steps_per_second": 0.29, "step": 41000 }, { "epoch": 1.86, "grad_norm": 16.436620712280273, "learning_rate": 1.4687444660881885e-05, "loss": 0.2669, "step": 42000 }, { "epoch": 1.86, "eval_LOC_f1": 0.8361988121287902, "eval_ORG_f1": 0.7500968409804315, "eval_PER_f1": 0.8611851501962505, "eval_loss": 0.29421770572662354, "eval_overall_accuracy": 0.9147995344962793, "eval_overall_f1": 0.819935938895562, "eval_overall_precision": 0.8067546477976939, "eval_overall_recall": 0.833555113734208, "eval_runtime": 886.1039, "eval_samples_per_second": 74.145, "eval_steps_per_second": 0.29, "step": 42000 }, { "epoch": 1.9, "grad_norm": 4.060434341430664, "learning_rate": 1.456095524804574e-05, "loss": 0.2422, "step": 43000 }, { "epoch": 1.9, "eval_LOC_f1": 0.8396598172309967, "eval_ORG_f1": 0.752934357339516, "eval_PER_f1": 0.8587656968190062, "eval_loss": 0.29513150453567505, "eval_overall_accuracy": 0.9147182001245772, "eval_overall_f1": 0.8206534155814486, "eval_overall_precision": 0.8120531232517684, "eval_overall_recall": 0.8294378258215007, "eval_runtime": 885.9872, "eval_samples_per_second": 74.155, "eval_steps_per_second": 0.29, "step": 43000 }, { "epoch": 1.95, "grad_norm": 4.084081172943115, "learning_rate": 1.4434465835209595e-05, "loss": 0.2616, "step": 44000 }, { "epoch": 1.95, "eval_LOC_f1": 0.8452747626229368, "eval_ORG_f1": 0.7507735621040889, "eval_PER_f1": 0.8679754713527367, "eval_loss": 0.29186713695526123, "eval_overall_accuracy": 0.915251618045657, "eval_overall_f1": 0.8252959748971241, "eval_overall_precision": 0.820965230928905, "eval_overall_recall": 0.8296726521283091, "eval_runtime": 886.49, "eval_samples_per_second": 74.113, "eval_steps_per_second": 0.29, "step": 44000 }, { "epoch": 1.99, "grad_norm": 12.051443099975586, "learning_rate": 1.4307976422373449e-05, "loss": 0.2449, "step": 45000 }, { "epoch": 1.99, "eval_LOC_f1": 0.8420882739030321, "eval_ORG_f1": 0.7511743283897188, "eval_PER_f1": 0.8660442600276625, "eval_loss": 0.28106340765953064, "eval_overall_accuracy": 0.9165787238772637, "eval_overall_f1": 0.8232545031821703, "eval_overall_precision": 0.823325400056368, "eval_overall_recall": 0.8231836185168371, "eval_runtime": 931.6188, "eval_samples_per_second": 70.522, "eval_steps_per_second": 0.276, "step": 45000 }, { "epoch": 2.04, "grad_norm": 16.095355987548828, "learning_rate": 1.4181487009537302e-05, "loss": 0.2379, "step": 46000 }, { "epoch": 2.04, "eval_LOC_f1": 0.8334692878701362, "eval_ORG_f1": 0.7499450670182377, "eval_PER_f1": 0.8676157711285138, "eval_loss": 0.2910194396972656, "eval_overall_accuracy": 0.914796145564125, "eval_overall_f1": 0.821648434727601, "eval_overall_precision": 0.8099514821518198, "eval_overall_recall": 0.8336881819747327, "eval_runtime": 955.4465, "eval_samples_per_second": 68.764, "eval_steps_per_second": 0.269, "step": 46000 }, { "epoch": 2.08, "grad_norm": 6.1998419761657715, "learning_rate": 1.4054997596701157e-05, "loss": 0.2128, "step": 47000 }, { "epoch": 2.08, "eval_LOC_f1": 0.8394146138221968, "eval_ORG_f1": 0.7394133361546803, "eval_PER_f1": 0.864682724271338, "eval_loss": 0.30839666724205017, "eval_overall_accuracy": 0.9148042790012952, "eval_overall_f1": 0.8188789651986448, "eval_overall_precision": 0.8056906504249807, "eval_overall_recall": 0.8325062228971304, "eval_runtime": 901.3734, "eval_samples_per_second": 72.889, "eval_steps_per_second": 0.285, "step": 47000 }, { "epoch": 2.13, "grad_norm": 5.403193950653076, "learning_rate": 1.3928508183865012e-05, "loss": 0.2237, "step": 48000 }, { "epoch": 2.13, "eval_LOC_f1": 0.8372631513660468, "eval_ORG_f1": 0.7524156839779593, "eval_PER_f1": 0.8648470673721019, "eval_loss": 0.3043561279773712, "eval_overall_accuracy": 0.9152800850757528, "eval_overall_f1": 0.8220857007666829, "eval_overall_precision": 0.8082709895080826, "eval_overall_recall": 0.836380856959469, "eval_runtime": 884.5053, "eval_samples_per_second": 74.279, "eval_steps_per_second": 0.291, "step": 48000 }, { "epoch": 2.17, "grad_norm": 13.765303611755371, "learning_rate": 1.3802018771028867e-05, "loss": 0.2246, "step": 49000 }, { "epoch": 2.17, "eval_LOC_f1": 0.8349439826902872, "eval_ORG_f1": 0.7424747298710351, "eval_PER_f1": 0.859314059653789, "eval_loss": 0.28388652205467224, "eval_overall_accuracy": 0.9144118406578324, "eval_overall_f1": 0.817645207294658, "eval_overall_precision": 0.8118686576378439, "eval_overall_recall": 0.8235045478028086, "eval_runtime": 886.1908, "eval_samples_per_second": 74.138, "eval_steps_per_second": 0.29, "step": 49000 }, { "epoch": 2.21, "grad_norm": 26.609722137451172, "learning_rate": 1.3675529358192721e-05, "loss": 0.2231, "step": 50000 }, { "epoch": 2.21, "eval_LOC_f1": 0.8453938301706774, "eval_ORG_f1": 0.7530178399743618, "eval_PER_f1": 0.8633811603243918, "eval_loss": 0.30370599031448364, "eval_overall_accuracy": 0.9166329467917318, "eval_overall_f1": 0.8246776205110672, "eval_overall_precision": 0.8187134051793966, "eval_overall_recall": 0.8307293705089469, "eval_runtime": 885.5464, "eval_samples_per_second": 74.191, "eval_steps_per_second": 0.29, "step": 50000 }, { "epoch": 2.26, "grad_norm": 18.287857055664062, "learning_rate": 1.3549039945356574e-05, "loss": 0.2156, "step": 51000 }, { "epoch": 2.26, "eval_LOC_f1": 0.8369090369642839, "eval_ORG_f1": 0.7570827451034141, "eval_PER_f1": 0.8699436414871374, "eval_loss": 0.2922073304653168, "eval_overall_accuracy": 0.916367932297269, "eval_overall_f1": 0.8256348807545127, "eval_overall_precision": 0.8155410977732979, "eval_overall_recall": 0.8359816522378947, "eval_runtime": 885.263, "eval_samples_per_second": 74.215, "eval_steps_per_second": 0.29, "step": 51000 }, { "epoch": 2.3, "grad_norm": 35.76387405395508, "learning_rate": 1.3422550532520428e-05, "loss": 0.2279, "step": 52000 }, { "epoch": 2.3, "eval_LOC_f1": 0.8493380871850663, "eval_ORG_f1": 0.7652859960552268, "eval_PER_f1": 0.8658015544747966, "eval_loss": 0.30765289068222046, "eval_overall_accuracy": 0.9169528619870936, "eval_overall_f1": 0.8303520832274882, "eval_overall_precision": 0.8291563575626546, "eval_overall_recall": 0.8315512625827762, "eval_runtime": 939.0823, "eval_samples_per_second": 69.962, "eval_steps_per_second": 0.274, "step": 52000 }, { "epoch": 2.35, "grad_norm": 12.871335983276367, "learning_rate": 1.3296061119684283e-05, "loss": 0.2192, "step": 53000 }, { "epoch": 2.35, "eval_LOC_f1": 0.8450333357909482, "eval_ORG_f1": 0.7589152754918096, "eval_PER_f1": 0.8706159740642501, "eval_loss": 0.29916831851005554, "eval_overall_accuracy": 0.9182454007107268, "eval_overall_f1": 0.8283387559440156, "eval_overall_precision": 0.8151347746682732, "eval_overall_recall": 0.8419775506050691, "eval_runtime": 927.2359, "eval_samples_per_second": 70.856, "eval_steps_per_second": 0.277, "step": 53000 }, { "epoch": 2.39, "grad_norm": 12.074441909790039, "learning_rate": 1.3169571706848138e-05, "loss": 0.2199, "step": 54000 }, { "epoch": 2.39, "eval_LOC_f1": 0.845725804758205, "eval_ORG_f1": 0.7583743578767123, "eval_PER_f1": 0.8723780235920504, "eval_loss": 0.29886308312416077, "eval_overall_accuracy": 0.9192946141056846, "eval_overall_f1": 0.8293654188671028, "eval_overall_precision": 0.8241158649684679, "eval_overall_recall": 0.8346822800068883, "eval_runtime": 890.1649, "eval_samples_per_second": 73.807, "eval_steps_per_second": 0.289, "step": 54000 }, { "epoch": 2.43, "grad_norm": 23.149980545043945, "learning_rate": 1.3043082294011993e-05, "loss": 0.2255, "step": 55000 }, { "epoch": 2.43, "eval_LOC_f1": 0.8466151994355207, "eval_ORG_f1": 0.7544473410506125, "eval_PER_f1": 0.870403734801872, "eval_loss": 0.2841680943965912, "eval_overall_accuracy": 0.9179688638469395, "eval_overall_f1": 0.8283511691203761, "eval_overall_precision": 0.8183908572825472, "eval_overall_recall": 0.8385569140692268, "eval_runtime": 885.5261, "eval_samples_per_second": 74.193, "eval_steps_per_second": 0.29, "step": 55000 }, { "epoch": 2.48, "grad_norm": 9.740825653076172, "learning_rate": 1.2916592881175847e-05, "loss": 0.2166, "step": 56000 }, { "epoch": 2.48, "eval_LOC_f1": 0.8564384031559538, "eval_ORG_f1": 0.7616027673681177, "eval_PER_f1": 0.8733214429549507, "eval_loss": 0.2920551300048828, "eval_overall_accuracy": 0.9202286038073975, "eval_overall_f1": 0.8339421536254372, "eval_overall_precision": 0.8307068573159461, "eval_overall_recall": 0.8372027490332984, "eval_runtime": 885.9705, "eval_samples_per_second": 74.156, "eval_steps_per_second": 0.29, "step": 56000 }, { "epoch": 2.52, "grad_norm": 7.81465482711792, "learning_rate": 1.27901034683397e-05, "loss": 0.2195, "step": 57000 }, { "epoch": 2.52, "eval_LOC_f1": 0.8524520572659642, "eval_ORG_f1": 0.7654914529914529, "eval_PER_f1": 0.8711133515111243, "eval_loss": 0.2894265651702881, "eval_overall_accuracy": 0.9196877302355783, "eval_overall_f1": 0.8334080883643471, "eval_overall_precision": 0.8305412821928031, "eval_overall_recall": 0.836294753980306, "eval_runtime": 886.104, "eval_samples_per_second": 74.145, "eval_steps_per_second": 0.29, "step": 57000 }, { "epoch": 2.57, "grad_norm": 4.749297618865967, "learning_rate": 1.2663614055503555e-05, "loss": 0.2198, "step": 58000 }, { "epoch": 2.57, "eval_LOC_f1": 0.8479476339833629, "eval_ORG_f1": 0.7569071497897121, "eval_PER_f1": 0.8715211159515157, "eval_loss": 0.2978798449039459, "eval_overall_accuracy": 0.917242276793067, "eval_overall_f1": 0.8290111404616975, "eval_overall_precision": 0.8113011029852536, "eval_overall_recall": 0.847511623902187, "eval_runtime": 912.438, "eval_samples_per_second": 72.005, "eval_steps_per_second": 0.282, "step": 58000 }, { "epoch": 2.61, "grad_norm": 8.958308219909668, "learning_rate": 1.2537124642667409e-05, "loss": 0.2186, "step": 59000 }, { "epoch": 2.61, "eval_LOC_f1": 0.8410615339749197, "eval_ORG_f1": 0.7623071419893903, "eval_PER_f1": 0.8698487455846391, "eval_loss": 0.2916683554649353, "eval_overall_accuracy": 0.9165563569250457, "eval_overall_f1": 0.828324104278644, "eval_overall_precision": 0.8190848632805027, "eval_overall_recall": 0.8377741597131988, "eval_runtime": 949.686, "eval_samples_per_second": 69.181, "eval_steps_per_second": 0.271, "step": 59000 }, { "epoch": 2.66, "grad_norm": 2.5494885444641113, "learning_rate": 1.2410635229831265e-05, "loss": 0.2105, "step": 60000 }, { "epoch": 2.66, "eval_LOC_f1": 0.8486577670408396, "eval_ORG_f1": 0.7658020018726344, "eval_PER_f1": 0.8764428548203543, "eval_loss": 0.28897759318351746, "eval_overall_accuracy": 0.9185185486423599, "eval_overall_f1": 0.8335781872027352, "eval_overall_precision": 0.8256759558603319, "eval_overall_recall": 0.8416331386884168, "eval_runtime": 906.7935, "eval_samples_per_second": 72.453, "eval_steps_per_second": 0.283, "step": 60000 }, { "epoch": 2.7, "grad_norm": 2.0383992195129395, "learning_rate": 1.2284145816995119e-05, "loss": 0.2117, "step": 61000 }, { "epoch": 2.7, "eval_LOC_f1": 0.8508689748097309, "eval_ORG_f1": 0.7658278739306382, "eval_PER_f1": 0.8731916232956014, "eval_loss": 0.28091031312942505, "eval_overall_accuracy": 0.9196301183889559, "eval_overall_f1": 0.833681650059079, "eval_overall_precision": 0.8200686015431561, "eval_overall_recall": 0.8477542777525557, "eval_runtime": 885.0192, "eval_samples_per_second": 74.236, "eval_steps_per_second": 0.29, "step": 61000 }, { "epoch": 2.74, "grad_norm": 1.6501883268356323, "learning_rate": 1.2157656404158972e-05, "loss": 0.1994, "step": 62000 }, { "epoch": 2.74, "eval_LOC_f1": 0.8548620423851409, "eval_ORG_f1": 0.7666146057733736, "eval_PER_f1": 0.8740409497434966, "eval_loss": 0.267339825630188, "eval_overall_accuracy": 0.9214282857900041, "eval_overall_f1": 0.835940143844595, "eval_overall_precision": 0.8308500027062763, "eval_overall_recall": 0.8410930381827575, "eval_runtime": 886.6936, "eval_samples_per_second": 74.095, "eval_steps_per_second": 0.29, "step": 62000 }, { "epoch": 2.79, "grad_norm": 3.6416823863983154, "learning_rate": 1.2031166991322827e-05, "loss": 0.2075, "step": 63000 }, { "epoch": 2.79, "eval_LOC_f1": 0.8514531524204939, "eval_ORG_f1": 0.7741699019900634, "eval_PER_f1": 0.8762775257778839, "eval_loss": 0.2862880229949951, "eval_overall_accuracy": 0.9219467924096053, "eval_overall_f1": 0.8376946930582835, "eval_overall_precision": 0.8390993269298734, "eval_overall_recall": 0.836294753980306, "eval_runtime": 883.1341, "eval_samples_per_second": 74.394, "eval_steps_per_second": 0.291, "step": 63000 }, { "epoch": 2.83, "grad_norm": 3.767646312713623, "learning_rate": 1.1904677578486681e-05, "loss": 0.2144, "step": 64000 }, { "epoch": 2.83, "eval_LOC_f1": 0.8542264412564663, "eval_ORG_f1": 0.762882333169584, "eval_PER_f1": 0.8737852991218755, "eval_loss": 0.2919914424419403, "eval_overall_accuracy": 0.9192736027263282, "eval_overall_f1": 0.8335330637616842, "eval_overall_precision": 0.8234296712697055, "eval_overall_recall": 0.8438874712337774, "eval_runtime": 882.9236, "eval_samples_per_second": 74.412, "eval_steps_per_second": 0.291, "step": 64000 }, { "epoch": 2.88, "grad_norm": 1.2373511791229248, "learning_rate": 1.1778188165650538e-05, "loss": 0.2107, "step": 65000 }, { "epoch": 2.88, "eval_LOC_f1": 0.8536771728748805, "eval_ORG_f1": 0.7691218130311614, "eval_PER_f1": 0.8777075297286194, "eval_loss": 0.2956686019897461, "eval_overall_accuracy": 0.92124189452152, "eval_overall_f1": 0.8368564609614728, "eval_overall_precision": 0.8285232067510548, "eval_overall_recall": 0.84535904942311, "eval_runtime": 930.5121, "eval_samples_per_second": 70.606, "eval_steps_per_second": 0.276, "step": 65000 }, { "epoch": 2.92, "grad_norm": 13.250840187072754, "learning_rate": 1.1651698752814391e-05, "loss": 0.2133, "step": 66000 }, { "epoch": 2.92, "eval_LOC_f1": 0.8533000763334159, "eval_ORG_f1": 0.7736413979491799, "eval_PER_f1": 0.8740390436699679, "eval_loss": 0.2793155908584595, "eval_overall_accuracy": 0.9226618570941534, "eval_overall_f1": 0.8369070216139791, "eval_overall_precision": 0.8351781983723613, "eval_overall_recall": 0.8386430170483898, "eval_runtime": 953.5624, "eval_samples_per_second": 68.9, "eval_steps_per_second": 0.27, "step": 66000 }, { "epoch": 2.97, "grad_norm": 25.819507598876953, "learning_rate": 1.1525209339978245e-05, "loss": 0.2112, "step": 67000 }, { "epoch": 2.97, "eval_LOC_f1": 0.8548619072433559, "eval_ORG_f1": 0.7661784507158363, "eval_PER_f1": 0.8776364551402296, "eval_loss": 0.2820794880390167, "eval_overall_accuracy": 0.9220823496957755, "eval_overall_f1": 0.8374375390381013, "eval_overall_precision": 0.8353114340451381, "eval_overall_recall": 0.8395744947320631, "eval_runtime": 899.3376, "eval_samples_per_second": 73.054, "eval_steps_per_second": 0.286, "step": 67000 }, { "epoch": 3.01, "grad_norm": 13.493629455566406, "learning_rate": 1.13987199271421e-05, "loss": 0.1983, "step": 68000 }, { "epoch": 3.01, "eval_LOC_f1": 0.8558231253148143, "eval_ORG_f1": 0.7679850431851696, "eval_PER_f1": 0.8760919620026149, "eval_loss": 0.2852949798107147, "eval_overall_accuracy": 0.9224734324663767, "eval_overall_f1": 0.8365820844153812, "eval_overall_precision": 0.8359935591789517, "eval_overall_recall": 0.8371714388590572, "eval_runtime": 881.1865, "eval_samples_per_second": 74.559, "eval_steps_per_second": 0.292, "step": 68000 } ], "logging_steps": 1000, "max_steps": 158116, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "total_flos": 2.1809439865622904e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }