diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3830 @@ +{ + "best_metric": 0.9595749595749595, + "best_model_checkpoint": "ds-v6-large/checkpoint-2805", + "epoch": 33.287101248266296, + "eval_steps": 15, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1664355062413315, + "grad_norm": 2.6665048599243164, + "learning_rate": 9.950000000000001e-06, + "loss": 1.9852, + "step": 15 + }, + { + "epoch": 0.1664355062413315, + "eval_accuracy": 0.8101443789541989, + "eval_f1": 0.0, + "eval_loss": 1.1499630212783813, + "eval_precision": 0.0, + "eval_recall": 0.0, + "eval_runtime": 35.775, + "eval_samples_per_second": 5.563, + "eval_steps_per_second": 1.398, + "step": 15 + }, + { + "epoch": 0.332871012482663, + "grad_norm": 2.08683443069458, + "learning_rate": 9.9e-06, + "loss": 1.0244, + "step": 30 + }, + { + "epoch": 0.332871012482663, + "eval_accuracy": 0.8122532850267669, + "eval_f1": 0.012939749292357462, + "eval_loss": 0.834208607673645, + "eval_precision": 0.05, + "eval_recall": 0.0074314909428704135, + "eval_runtime": 35.1999, + "eval_samples_per_second": 5.653, + "eval_steps_per_second": 1.42, + "step": 30 + }, + { + "epoch": 0.49930651872399445, + "grad_norm": 2.1074297428131104, + "learning_rate": 9.85e-06, + "loss": 0.7826, + "step": 45 + }, + { + "epoch": 0.49930651872399445, + "eval_accuracy": 0.8479424647163791, + "eval_f1": 0.09320905459387482, + "eval_loss": 0.6794766187667847, + "eval_precision": 0.07893041237113402, + "eval_recall": 0.1137947050627032, + "eval_runtime": 35.557, + "eval_samples_per_second": 5.597, + "eval_steps_per_second": 1.406, + "step": 45 + }, + { + "epoch": 0.665742024965326, + "grad_norm": 1.8543498516082764, + "learning_rate": 9.800000000000001e-06, + "loss": 0.6767, + "step": 60 + }, + { + "epoch": 0.665742024965326, + "eval_accuracy": 0.8578381009030444, + "eval_f1": 0.13828125, + "eval_loss": 0.5963338613510132, + "eval_precision": 0.11931243680485339, + "eval_recall": 0.1644217371110079, + "eval_runtime": 35.7897, + "eval_samples_per_second": 5.56, + "eval_steps_per_second": 1.397, + "step": 60 + }, + { + "epoch": 0.8321775312066574, + "grad_norm": 1.9123793840408325, + "learning_rate": 9.75e-06, + "loss": 0.6031, + "step": 75 + }, + { + "epoch": 0.8321775312066574, + "eval_accuracy": 0.8690855999567404, + "eval_f1": 0.1916831683168317, + "eval_loss": 0.5405648946762085, + "eval_precision": 0.1670693821194339, + "eval_recall": 0.22480260102183, + "eval_runtime": 35.6034, + "eval_samples_per_second": 5.589, + "eval_steps_per_second": 1.404, + "step": 75 + }, + { + "epoch": 0.9986130374479889, + "grad_norm": 1.9384328126907349, + "learning_rate": 9.7e-06, + "loss": 0.5756, + "step": 90 + }, + { + "epoch": 0.9986130374479889, + "eval_accuracy": 0.8777915968204185, + "eval_f1": 0.26393539491825885, + "eval_loss": 0.49346938729286194, + "eval_precision": 0.22913816689466485, + "eval_recall": 0.3111936832326986, + "eval_runtime": 35.6547, + "eval_samples_per_second": 5.581, + "eval_steps_per_second": 1.402, + "step": 90 + }, + { + "epoch": 1.1650485436893203, + "grad_norm": 1.751382827758789, + "learning_rate": 9.65e-06, + "loss": 0.5215, + "step": 105 + }, + { + "epoch": 1.1650485436893203, + "eval_accuracy": 0.8904991077705078, + "eval_f1": 0.3575184016824396, + "eval_loss": 0.43015486001968384, + "eval_precision": 0.32667179093005383, + "eval_recall": 0.3947979563399907, + "eval_runtime": 35.5042, + "eval_samples_per_second": 5.605, + "eval_steps_per_second": 1.408, + "step": 105 + }, + { + "epoch": 1.331484049930652, + "grad_norm": 1.7430224418640137, + "learning_rate": 9.600000000000001e-06, + "loss": 0.4782, + "step": 120 + }, + { + "epoch": 1.331484049930652, + "eval_accuracy": 0.9020169793976099, + "eval_f1": 0.4266553119012136, + "eval_loss": 0.37819600105285645, + "eval_precision": 0.3938679245283019, + "eval_recall": 0.46539712029725966, + "eval_runtime": 35.9551, + "eval_samples_per_second": 5.535, + "eval_steps_per_second": 1.391, + "step": 120 + }, + { + "epoch": 1.4979195561719834, + "grad_norm": 2.754100799560547, + "learning_rate": 9.55e-06, + "loss": 0.4208, + "step": 135 + }, + { + "epoch": 1.4979195561719834, + "eval_accuracy": 0.9080733250419077, + "eval_f1": 0.44783505154639175, + "eval_loss": 0.34046444296836853, + "eval_precision": 0.40266963292547275, + "eval_recall": 0.5044124477473293, + "eval_runtime": 35.1015, + "eval_samples_per_second": 5.669, + "eval_steps_per_second": 1.424, + "step": 135 + }, + { + "epoch": 1.664355062413315, + "grad_norm": 1.3271350860595703, + "learning_rate": 9.5e-06, + "loss": 0.3532, + "step": 150 + }, + { + "epoch": 1.664355062413315, + "eval_accuracy": 0.9251608716811767, + "eval_f1": 0.5355845266082496, + "eval_loss": 0.2930045425891876, + "eval_precision": 0.49604117181314333, + "eval_recall": 0.5819786344635393, + "eval_runtime": 34.7775, + "eval_samples_per_second": 5.722, + "eval_steps_per_second": 1.438, + "step": 150 + }, + { + "epoch": 1.8307905686546464, + "grad_norm": 1.9117140769958496, + "learning_rate": 9.450000000000001e-06, + "loss": 0.3458, + "step": 165 + }, + { + "epoch": 1.8307905686546464, + "eval_accuracy": 0.9301357270318499, + "eval_f1": 0.5559597688850845, + "eval_loss": 0.2658008933067322, + "eval_precision": 0.5154761904761904, + "eval_recall": 0.6033441709242917, + "eval_runtime": 34.9787, + "eval_samples_per_second": 5.689, + "eval_steps_per_second": 1.429, + "step": 165 + }, + { + "epoch": 1.9972260748959778, + "grad_norm": 1.9700042009353638, + "learning_rate": 9.4e-06, + "loss": 0.302, + "step": 180 + }, + { + "epoch": 1.9972260748959778, + "eval_accuracy": 0.9474395717298437, + "eval_f1": 0.6529640848117698, + "eval_loss": 0.2320590764284134, + "eval_precision": 0.6111786148238153, + "eval_recall": 0.7008824895494659, + "eval_runtime": 35.1938, + "eval_samples_per_second": 5.654, + "eval_steps_per_second": 1.421, + "step": 180 + }, + { + "epoch": 2.163661581137309, + "grad_norm": 1.2119841575622559, + "learning_rate": 9.350000000000002e-06, + "loss": 0.2655, + "step": 195 + }, + { + "epoch": 2.163661581137309, + "eval_accuracy": 0.9519818309630671, + "eval_f1": 0.6844638949671772, + "eval_loss": 0.20933493971824646, + "eval_precision": 0.6470831609433182, + "eval_recall": 0.7264282396655829, + "eval_runtime": 35.2683, + "eval_samples_per_second": 5.642, + "eval_steps_per_second": 1.418, + "step": 195 + }, + { + "epoch": 2.3300970873786406, + "grad_norm": 2.050490617752075, + "learning_rate": 9.3e-06, + "loss": 0.2598, + "step": 210 + }, + { + "epoch": 2.3300970873786406, + "eval_accuracy": 0.9570107608284215, + "eval_f1": 0.7274759669125868, + "eval_loss": 0.1951305866241455, + "eval_precision": 0.7012931034482759, + "eval_recall": 0.7556897352531352, + "eval_runtime": 35.2296, + "eval_samples_per_second": 5.649, + "eval_steps_per_second": 1.419, + "step": 210 + }, + { + "epoch": 2.496532593619972, + "grad_norm": 2.1060705184936523, + "learning_rate": 9.250000000000001e-06, + "loss": 0.2364, + "step": 225 + }, + { + "epoch": 2.496532593619972, + "eval_accuracy": 0.9590115178716271, + "eval_f1": 0.7402309058614565, + "eval_loss": 0.17936836183071136, + "eval_precision": 0.7090599744789451, + "eval_recall": 0.7742684626103112, + "eval_runtime": 35.4911, + "eval_samples_per_second": 5.607, + "eval_steps_per_second": 1.409, + "step": 225 + }, + { + "epoch": 2.662968099861304, + "grad_norm": 1.8435375690460205, + "learning_rate": 9.200000000000002e-06, + "loss": 0.2218, + "step": 240 + }, + { + "epoch": 2.662968099861304, + "eval_accuracy": 0.9621478397231384, + "eval_f1": 0.7557522123893805, + "eval_loss": 0.1675911545753479, + "eval_precision": 0.721588508660752, + "eval_recall": 0.7933116581514166, + "eval_runtime": 35.3833, + "eval_samples_per_second": 5.624, + "eval_steps_per_second": 1.413, + "step": 240 + }, + { + "epoch": 2.8294036061026353, + "grad_norm": 2.065732479095459, + "learning_rate": 9.15e-06, + "loss": 0.206, + "step": 255 + }, + { + "epoch": 2.8294036061026353, + "eval_accuracy": 0.9650137890012437, + "eval_f1": 0.7758275938680294, + "eval_loss": 0.15723256766796112, + "eval_precision": 0.7436115843270868, + "eval_recall": 0.8109614491407339, + "eval_runtime": 35.4693, + "eval_samples_per_second": 5.61, + "eval_steps_per_second": 1.41, + "step": 255 + }, + { + "epoch": 2.9958391123439667, + "grad_norm": 2.1758480072021484, + "learning_rate": 9.100000000000001e-06, + "loss": 0.2053, + "step": 270 + }, + { + "epoch": 2.9958391123439667, + "eval_accuracy": 0.9640404477369816, + "eval_f1": 0.7730088495575222, + "eval_loss": 0.15795043110847473, + "eval_precision": 0.7380650612589776, + "eval_recall": 0.8114259173246633, + "eval_runtime": 35.4002, + "eval_samples_per_second": 5.621, + "eval_steps_per_second": 1.412, + "step": 270 + }, + { + "epoch": 3.162274618585298, + "grad_norm": 1.6404600143432617, + "learning_rate": 9.050000000000001e-06, + "loss": 0.1876, + "step": 285 + }, + { + "epoch": 3.162274618585298, + "eval_accuracy": 0.9687449305142486, + "eval_f1": 0.801343784994401, + "eval_loss": 0.1406078040599823, + "eval_precision": 0.7737889273356401, + "eval_recall": 0.8309335810496981, + "eval_runtime": 35.6674, + "eval_samples_per_second": 5.579, + "eval_steps_per_second": 1.402, + "step": 285 + }, + { + "epoch": 3.3287101248266295, + "grad_norm": 1.8430469036102295, + "learning_rate": 9e-06, + "loss": 0.1602, + "step": 300 + }, + { + "epoch": 3.3287101248266295, + "eval_accuracy": 0.9670686205591305, + "eval_f1": 0.7985659870042572, + "eval_loss": 0.14204147458076477, + "eval_precision": 0.7714285714285715, + "eval_recall": 0.8276823037621923, + "eval_runtime": 35.5741, + "eval_samples_per_second": 5.594, + "eval_steps_per_second": 1.406, + "step": 300 + }, + { + "epoch": 3.4951456310679614, + "grad_norm": 2.2237956523895264, + "learning_rate": 8.95e-06, + "loss": 0.1706, + "step": 315 + }, + { + "epoch": 3.4951456310679614, + "eval_accuracy": 0.969069377602336, + "eval_f1": 0.8149988705669754, + "eval_loss": 0.13229934871196747, + "eval_precision": 0.793315743183817, + "eval_recall": 0.8379006038086391, + "eval_runtime": 36.0972, + "eval_samples_per_second": 5.513, + "eval_steps_per_second": 1.385, + "step": 315 + }, + { + "epoch": 3.661581137309293, + "grad_norm": 2.04622220993042, + "learning_rate": 8.900000000000001e-06, + "loss": 0.1585, + "step": 330 + }, + { + "epoch": 3.661581137309293, + "eval_accuracy": 0.9700427188665982, + "eval_f1": 0.8298399819697994, + "eval_loss": 0.13131560385227203, + "eval_precision": 0.8060420315236427, + "eval_recall": 0.8550859266140269, + "eval_runtime": 35.6467, + "eval_samples_per_second": 5.583, + "eval_steps_per_second": 1.403, + "step": 330 + }, + { + "epoch": 3.828016643550624, + "grad_norm": 2.0790255069732666, + "learning_rate": 8.85e-06, + "loss": 0.1574, + "step": 345 + }, + { + "epoch": 3.828016643550624, + "eval_accuracy": 0.9717190288217163, + "eval_f1": 0.8376491781130375, + "eval_loss": 0.12674090266227722, + "eval_precision": 0.8129370629370629, + "eval_recall": 0.8639108221086855, + "eval_runtime": 35.5835, + "eval_samples_per_second": 5.592, + "eval_steps_per_second": 1.405, + "step": 345 + }, + { + "epoch": 3.9944521497919556, + "grad_norm": 2.3372180461883545, + "learning_rate": 8.8e-06, + "loss": 0.15, + "step": 360 + }, + { + "epoch": 3.9944521497919556, + "eval_accuracy": 0.97539609582004, + "eval_f1": 0.8535811423390752, + "eval_loss": 0.11569273471832275, + "eval_precision": 0.8335546702080566, + "eval_recall": 0.8745935903390618, + "eval_runtime": 35.7391, + "eval_samples_per_second": 5.568, + "eval_steps_per_second": 1.399, + "step": 360 + }, + { + "epoch": 4.160887656033287, + "grad_norm": 2.763075828552246, + "learning_rate": 8.750000000000001e-06, + "loss": 0.1192, + "step": 375 + }, + { + "epoch": 4.160887656033287, + "eval_accuracy": 0.9740983074676904, + "eval_f1": 0.8524664696521937, + "eval_loss": 0.11200679838657379, + "eval_precision": 0.8348174532502226, + "eval_recall": 0.8708778448676265, + "eval_runtime": 36.0008, + "eval_samples_per_second": 5.528, + "eval_steps_per_second": 1.389, + "step": 375 + }, + { + "epoch": 4.327323162274618, + "grad_norm": 1.7937551736831665, + "learning_rate": 8.700000000000001e-06, + "loss": 0.1313, + "step": 390 + }, + { + "epoch": 4.327323162274618, + "eval_accuracy": 0.9745309035851403, + "eval_f1": 0.8588929219600727, + "eval_loss": 0.1129654049873352, + "eval_precision": 0.8394678492239468, + "eval_recall": 0.8792382721783558, + "eval_runtime": 36.0013, + "eval_samples_per_second": 5.528, + "eval_steps_per_second": 1.389, + "step": 390 + }, + { + "epoch": 4.49375866851595, + "grad_norm": 1.4142848253250122, + "learning_rate": 8.65e-06, + "loss": 0.1179, + "step": 405 + }, + { + "epoch": 4.49375866851595, + "eval_accuracy": 0.9755042448494025, + "eval_f1": 0.8613303269447576, + "eval_loss": 0.109279565513134, + "eval_precision": 0.8369851007887817, + "eval_recall": 0.8871342313051556, + "eval_runtime": 36.0609, + "eval_samples_per_second": 5.518, + "eval_steps_per_second": 1.387, + "step": 405 + }, + { + "epoch": 4.660194174757281, + "grad_norm": 1.6794809103012085, + "learning_rate": 8.6e-06, + "loss": 0.1327, + "step": 420 + }, + { + "epoch": 4.660194174757281, + "eval_accuracy": 0.9745849780998216, + "eval_f1": 0.862053369516056, + "eval_loss": 0.11022669076919556, + "eval_precision": 0.8400176289114147, + "eval_recall": 0.885276358569438, + "eval_runtime": 36.0549, + "eval_samples_per_second": 5.519, + "eval_steps_per_second": 1.387, + "step": 420 + }, + { + "epoch": 4.826629680998613, + "grad_norm": 1.8358403444290161, + "learning_rate": 8.550000000000001e-06, + "loss": 0.1323, + "step": 435 + }, + { + "epoch": 4.826629680998613, + "eval_accuracy": 0.978207970583464, + "eval_f1": 0.8795454545454546, + "eval_loss": 0.09974753856658936, + "eval_precision": 0.8611481975967957, + "eval_recall": 0.8987459359033906, + "eval_runtime": 36.3053, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 1.377, + "step": 435 + }, + { + "epoch": 4.993065187239944, + "grad_norm": 2.1321513652801514, + "learning_rate": 8.5e-06, + "loss": 0.1254, + "step": 450 + }, + { + "epoch": 4.993065187239944, + "eval_accuracy": 0.9774509273779268, + "eval_f1": 0.8727683615819208, + "eval_loss": 0.094924695789814, + "eval_precision": 0.8499119718309859, + "eval_recall": 0.896888063167673, + "eval_runtime": 36.229, + "eval_samples_per_second": 5.493, + "eval_steps_per_second": 1.38, + "step": 450 + }, + { + "epoch": 5.159500693481276, + "grad_norm": 1.3562971353530884, + "learning_rate": 8.45e-06, + "loss": 0.0999, + "step": 465 + }, + { + "epoch": 5.159500693481276, + "eval_accuracy": 0.9797220569945385, + "eval_f1": 0.8822055137844612, + "eval_loss": 0.08469922095537186, + "eval_precision": 0.8658318425760286, + "eval_recall": 0.89921040408732, + "eval_runtime": 36.2946, + "eval_samples_per_second": 5.483, + "eval_steps_per_second": 1.378, + "step": 465 + }, + { + "epoch": 5.325936199722608, + "grad_norm": 1.683296799659729, + "learning_rate": 8.400000000000001e-06, + "loss": 0.1017, + "step": 480 + }, + { + "epoch": 5.325936199722608, + "eval_accuracy": 0.981019845346888, + "eval_f1": 0.8923777019340159, + "eval_loss": 0.08026640117168427, + "eval_precision": 0.8746654772524531, + "eval_recall": 0.910822108685555, + "eval_runtime": 36.4129, + "eval_samples_per_second": 5.465, + "eval_steps_per_second": 1.373, + "step": 480 + }, + { + "epoch": 5.492371705963939, + "grad_norm": 2.303062677383423, + "learning_rate": 8.35e-06, + "loss": 0.091, + "step": 495 + }, + { + "epoch": 5.492371705963939, + "eval_accuracy": 0.9805872492294382, + "eval_f1": 0.8918362680082322, + "eval_loss": 0.07959215342998505, + "eval_precision": 0.8783783783783784, + "eval_recall": 0.9057129586623316, + "eval_runtime": 36.4104, + "eval_samples_per_second": 5.465, + "eval_steps_per_second": 1.373, + "step": 495 + }, + { + "epoch": 5.658807212205271, + "grad_norm": 3.383983850479126, + "learning_rate": 8.3e-06, + "loss": 0.0979, + "step": 510 + }, + { + "epoch": 5.658807212205271, + "eval_accuracy": 0.9773427783485643, + "eval_f1": 0.8775045537340619, + "eval_loss": 0.09432032704353333, + "eval_precision": 0.8606520768200089, + "eval_recall": 0.8950301904319554, + "eval_runtime": 36.0505, + "eval_samples_per_second": 5.52, + "eval_steps_per_second": 1.387, + "step": 510 + }, + { + "epoch": 5.825242718446602, + "grad_norm": 2.1892480850219727, + "learning_rate": 8.25e-06, + "loss": 0.1024, + "step": 525 + }, + { + "epoch": 5.825242718446602, + "eval_accuracy": 0.980533174714757, + "eval_f1": 0.8882312770316413, + "eval_loss": 0.08036847412586212, + "eval_precision": 0.8709821428571428, + "eval_recall": 0.906177426846261, + "eval_runtime": 36.1406, + "eval_samples_per_second": 5.506, + "eval_steps_per_second": 1.383, + "step": 525 + }, + { + "epoch": 5.991678224687933, + "grad_norm": 1.8490287065505981, + "learning_rate": 8.2e-06, + "loss": 0.0952, + "step": 540 + }, + { + "epoch": 5.991678224687933, + "eval_accuracy": 0.9816146650083816, + "eval_f1": 0.900843400957374, + "eval_loss": 0.07866356521844864, + "eval_precision": 0.8845120859444942, + "eval_recall": 0.917789131444496, + "eval_runtime": 36.4974, + "eval_samples_per_second": 5.452, + "eval_steps_per_second": 1.37, + "step": 540 + }, + { + "epoch": 6.158113730929265, + "grad_norm": 3.0108256340026855, + "learning_rate": 8.15e-06, + "loss": 0.0742, + "step": 555 + }, + { + "epoch": 6.158113730929265, + "eval_accuracy": 0.9823176336992375, + "eval_f1": 0.9032553874369554, + "eval_loss": 0.07755902409553528, + "eval_precision": 0.8918062471706655, + "eval_recall": 0.9150023223409196, + "eval_runtime": 36.3184, + "eval_samples_per_second": 5.479, + "eval_steps_per_second": 1.377, + "step": 555 + }, + { + "epoch": 6.324549237170596, + "grad_norm": 2.533155679702759, + "learning_rate": 8.1e-06, + "loss": 0.0764, + "step": 570 + }, + { + "epoch": 6.324549237170596, + "eval_accuracy": 0.9837235710809495, + "eval_f1": 0.9106813996316758, + "eval_loss": 0.07210895419120789, + "eval_precision": 0.9027841168416249, + "eval_recall": 0.9187180678123549, + "eval_runtime": 36.5311, + "eval_samples_per_second": 5.447, + "eval_steps_per_second": 1.369, + "step": 570 + }, + { + "epoch": 6.490984743411928, + "grad_norm": 1.943320631980896, + "learning_rate": 8.050000000000001e-06, + "loss": 0.0813, + "step": 585 + }, + { + "epoch": 6.490984743411928, + "eval_accuracy": 0.9844265397718055, + "eval_f1": 0.914614499424626, + "eval_loss": 0.06643209606409073, + "eval_precision": 0.906478102189781, + "eval_recall": 0.9228982814677195, + "eval_runtime": 36.3904, + "eval_samples_per_second": 5.468, + "eval_steps_per_second": 1.374, + "step": 585 + }, + { + "epoch": 6.657420249653259, + "grad_norm": 1.322831392288208, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0791, + "step": 600 + }, + { + "epoch": 6.657420249653259, + "eval_accuracy": 0.9848050613745741, + "eval_f1": 0.9137614678899082, + "eval_loss": 0.06415116786956787, + "eval_precision": 0.902582691436339, + "eval_recall": 0.9252206223873665, + "eval_runtime": 36.27, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 1.379, + "step": 600 + }, + { + "epoch": 6.8238557558945905, + "grad_norm": 1.5891202688217163, + "learning_rate": 7.950000000000002e-06, + "loss": 0.0792, + "step": 615 + }, + { + "epoch": 6.8238557558945905, + "eval_accuracy": 0.9841020926837182, + "eval_f1": 0.9103795153177869, + "eval_loss": 0.06728328764438629, + "eval_precision": 0.8964430436740207, + "eval_recall": 0.924756154203437, + "eval_runtime": 35.8847, + "eval_samples_per_second": 5.546, + "eval_steps_per_second": 1.393, + "step": 615 + }, + { + "epoch": 6.990291262135923, + "grad_norm": 3.470646858215332, + "learning_rate": 7.9e-06, + "loss": 0.078, + "step": 630 + }, + { + "epoch": 6.990291262135923, + "eval_accuracy": 0.9832909749634997, + "eval_f1": 0.9078857142857143, + "eval_loss": 0.06933122873306274, + "eval_precision": 0.8937893789378938, + "eval_recall": 0.92243381328379, + "eval_runtime": 36.1839, + "eval_samples_per_second": 5.5, + "eval_steps_per_second": 1.382, + "step": 630 + }, + { + "epoch": 7.156726768377254, + "grad_norm": 2.4168286323547363, + "learning_rate": 7.850000000000001e-06, + "loss": 0.0678, + "step": 645 + }, + { + "epoch": 7.156726768377254, + "eval_accuracy": 0.985237657492024, + "eval_f1": 0.92025664527956, + "eval_loss": 0.06722652167081833, + "eval_precision": 0.9081863410221619, + "eval_recall": 0.9326521133302369, + "eval_runtime": 36.716, + "eval_samples_per_second": 5.42, + "eval_steps_per_second": 1.362, + "step": 645 + }, + { + "epoch": 7.323162274618586, + "grad_norm": 1.048614501953125, + "learning_rate": 7.800000000000002e-06, + "loss": 0.0685, + "step": 660 + }, + { + "epoch": 7.323162274618586, + "eval_accuracy": 0.9839939436543557, + "eval_f1": 0.9072635906806761, + "eval_loss": 0.06548429280519485, + "eval_precision": 0.8925842696629214, + "eval_recall": 0.92243381328379, + "eval_runtime": 36.897, + "eval_samples_per_second": 5.393, + "eval_steps_per_second": 1.355, + "step": 660 + }, + { + "epoch": 7.489597780859917, + "grad_norm": 2.5844979286193848, + "learning_rate": 7.75e-06, + "loss": 0.0555, + "step": 675 + }, + { + "epoch": 7.489597780859917, + "eval_accuracy": 0.9856161790947926, + "eval_f1": 0.9213016385875836, + "eval_loss": 0.06148982420563698, + "eval_precision": 0.9155963302752294, + "eval_recall": 0.927078495123084, + "eval_runtime": 36.1847, + "eval_samples_per_second": 5.5, + "eval_steps_per_second": 1.382, + "step": 675 + }, + { + "epoch": 7.656033287101248, + "grad_norm": 1.9488413333892822, + "learning_rate": 7.7e-06, + "loss": 0.07, + "step": 690 + }, + { + "epoch": 7.656033287101248, + "eval_accuracy": 0.9867517439030985, + "eval_f1": 0.927176659774868, + "eval_loss": 0.058708589524030685, + "eval_precision": 0.9172727272727272, + "eval_recall": 0.9372967951695309, + "eval_runtime": 36.4405, + "eval_samples_per_second": 5.461, + "eval_steps_per_second": 1.372, + "step": 690 + }, + { + "epoch": 7.82246879334258, + "grad_norm": 1.7437242269515991, + "learning_rate": 7.650000000000001e-06, + "loss": 0.065, + "step": 705 + }, + { + "epoch": 7.82246879334258, + "eval_accuracy": 0.9874547125939545, + "eval_f1": 0.9303928325292902, + "eval_loss": 0.0557989701628685, + "eval_precision": 0.9204545454545454, + "eval_recall": 0.9405480724570366, + "eval_runtime": 36.0661, + "eval_samples_per_second": 5.518, + "eval_steps_per_second": 1.386, + "step": 705 + }, + { + "epoch": 7.988904299583911, + "grad_norm": 1.0527422428131104, + "learning_rate": 7.600000000000001e-06, + "loss": 0.0599, + "step": 720 + }, + { + "epoch": 7.988904299583911, + "eval_accuracy": 0.9878332341967231, + "eval_f1": 0.9342226310947562, + "eval_loss": 0.05789622664451599, + "eval_precision": 0.9252847380410023, + "eval_recall": 0.9433348815606131, + "eval_runtime": 36.3712, + "eval_samples_per_second": 5.471, + "eval_steps_per_second": 1.375, + "step": 720 + }, + { + "epoch": 8.155339805825243, + "grad_norm": 1.6904972791671753, + "learning_rate": 7.5500000000000006e-06, + "loss": 0.0571, + "step": 735 + }, + { + "epoch": 8.155339805825243, + "eval_accuracy": 0.9865895203590548, + "eval_f1": 0.9238905495516211, + "eval_loss": 0.059290919452905655, + "eval_precision": 0.9148451730418944, + "eval_recall": 0.9331165815141663, + "eval_runtime": 36.0084, + "eval_samples_per_second": 5.526, + "eval_steps_per_second": 1.389, + "step": 735 + }, + { + "epoch": 8.321775312066574, + "grad_norm": 1.9831328392028809, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0563, + "step": 750 + }, + { + "epoch": 8.321775312066574, + "eval_accuracy": 0.9863191477856487, + "eval_f1": 0.9236079153244362, + "eval_loss": 0.06046581640839577, + "eval_precision": 0.9151846785225718, + "eval_recall": 0.9321876451463075, + "eval_runtime": 36.0693, + "eval_samples_per_second": 5.517, + "eval_steps_per_second": 1.386, + "step": 750 + }, + { + "epoch": 8.488210818307905, + "grad_norm": 2.0379467010498047, + "learning_rate": 7.450000000000001e-06, + "loss": 0.0602, + "step": 765 + }, + { + "epoch": 8.488210818307905, + "eval_accuracy": 0.9863191477856487, + "eval_f1": 0.927992590877518, + "eval_loss": 0.058113399893045425, + "eval_precision": 0.925207756232687, + "eval_recall": 0.9307942405945193, + "eval_runtime": 35.9178, + "eval_samples_per_second": 5.54, + "eval_steps_per_second": 1.392, + "step": 765 + }, + { + "epoch": 8.654646324549237, + "grad_norm": 3.095200538635254, + "learning_rate": 7.4e-06, + "loss": 0.0582, + "step": 780 + }, + { + "epoch": 8.654646324549237, + "eval_accuracy": 0.9872384145352295, + "eval_f1": 0.9288837744533948, + "eval_loss": 0.05814095214009285, + "eval_precision": 0.9206204379562044, + "eval_recall": 0.9372967951695309, + "eval_runtime": 36.2273, + "eval_samples_per_second": 5.493, + "eval_steps_per_second": 1.38, + "step": 780 + }, + { + "epoch": 8.821081830790568, + "grad_norm": 1.0786473751068115, + "learning_rate": 7.350000000000001e-06, + "loss": 0.0514, + "step": 795 + }, + { + "epoch": 8.821081830790568, + "eval_accuracy": 0.9872924890499107, + "eval_f1": 0.9313047487321346, + "eval_loss": 0.055727362632751465, + "eval_precision": 0.9244851258581236, + "eval_recall": 0.9382257315373896, + "eval_runtime": 36.0241, + "eval_samples_per_second": 5.524, + "eval_steps_per_second": 1.388, + "step": 795 + }, + { + "epoch": 8.9875173370319, + "grad_norm": 1.6077920198440552, + "learning_rate": 7.3e-06, + "loss": 0.0467, + "step": 810 + }, + { + "epoch": 8.9875173370319, + "eval_accuracy": 0.9883199048288541, + "eval_f1": 0.9393661001378043, + "eval_loss": 0.05200658738613129, + "eval_precision": 0.9291231258518855, + "eval_recall": 0.9498374361356247, + "eval_runtime": 35.9411, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 1.391, + "step": 810 + }, + { + "epoch": 9.153952843273231, + "grad_norm": 1.601219892501831, + "learning_rate": 7.25e-06, + "loss": 0.0435, + "step": 825 + }, + { + "epoch": 9.153952843273231, + "eval_accuracy": 0.9879954577407668, + "eval_f1": 0.9336699563920129, + "eval_loss": 0.05260741710662842, + "eval_precision": 0.9228675136116152, + "eval_recall": 0.9447282861124013, + "eval_runtime": 35.7996, + "eval_samples_per_second": 5.559, + "eval_steps_per_second": 1.397, + "step": 825 + }, + { + "epoch": 9.320388349514563, + "grad_norm": 0.7272451519966125, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.0531, + "step": 840 + }, + { + "epoch": 9.320388349514563, + "eval_accuracy": 0.9883739793435354, + "eval_f1": 0.9344978165938865, + "eval_loss": 0.05022520199418068, + "eval_precision": 0.9249317561419472, + "eval_recall": 0.9442638179284719, + "eval_runtime": 36.0285, + "eval_samples_per_second": 5.523, + "eval_steps_per_second": 1.388, + "step": 840 + }, + { + "epoch": 9.486823855755894, + "grad_norm": 0.9556881189346313, + "learning_rate": 7.15e-06, + "loss": 0.0502, + "step": 855 + }, + { + "epoch": 9.486823855755894, + "eval_accuracy": 0.9874006380792733, + "eval_f1": 0.9309240622140896, + "eval_loss": 0.05446859449148178, + "eval_precision": 0.9170797656602073, + "eval_recall": 0.9451927542963307, + "eval_runtime": 36.0609, + "eval_samples_per_second": 5.518, + "eval_steps_per_second": 1.387, + "step": 855 + }, + { + "epoch": 9.653259361997225, + "grad_norm": 1.0404924154281616, + "learning_rate": 7.100000000000001e-06, + "loss": 0.0377, + "step": 870 + }, + { + "epoch": 9.653259361997225, + "eval_accuracy": 0.9850754339479804, + "eval_f1": 0.9220571428571429, + "eval_loss": 0.06175297126173973, + "eval_precision": 0.9077407740774077, + "eval_recall": 0.9368323269856015, + "eval_runtime": 36.326, + "eval_samples_per_second": 5.478, + "eval_steps_per_second": 1.376, + "step": 870 + }, + { + "epoch": 9.819694868238557, + "grad_norm": 1.1249316930770874, + "learning_rate": 7.05e-06, + "loss": 0.0416, + "step": 885 + }, + { + "epoch": 9.819694868238557, + "eval_accuracy": 0.9881036067701292, + "eval_f1": 0.9328719723183392, + "eval_loss": 0.05493583530187607, + "eval_precision": 0.9266727772685609, + "eval_recall": 0.9391546679052485, + "eval_runtime": 36.1852, + "eval_samples_per_second": 5.499, + "eval_steps_per_second": 1.382, + "step": 885 + }, + { + "epoch": 9.986130374479888, + "grad_norm": 1.0846829414367676, + "learning_rate": 7e-06, + "loss": 0.044, + "step": 900 + }, + { + "epoch": 9.986130374479888, + "eval_accuracy": 0.9884280538582166, + "eval_f1": 0.9420457169244978, + "eval_loss": 0.05289188027381897, + "eval_precision": 0.9366391184573003, + "eval_recall": 0.9475150952159777, + "eval_runtime": 36.0505, + "eval_samples_per_second": 5.52, + "eval_steps_per_second": 1.387, + "step": 900 + }, + { + "epoch": 10.152565880721221, + "grad_norm": 0.8957504630088806, + "learning_rate": 6.95e-06, + "loss": 0.0383, + "step": 915 + }, + { + "epoch": 10.152565880721221, + "eval_accuracy": 0.9889147244903477, + "eval_f1": 0.9403088269186448, + "eval_loss": 0.048978183418512344, + "eval_precision": 0.9332113449222323, + "eval_recall": 0.9475150952159777, + "eval_runtime": 36.1551, + "eval_samples_per_second": 5.504, + "eval_steps_per_second": 1.383, + "step": 915 + }, + { + "epoch": 10.319001386962553, + "grad_norm": 1.6940028667449951, + "learning_rate": 6.9e-06, + "loss": 0.0454, + "step": 930 + }, + { + "epoch": 10.319001386962553, + "eval_accuracy": 0.988536202887579, + "eval_f1": 0.9366100137804317, + "eval_loss": 0.05073446407914162, + "eval_precision": 0.9263970922308041, + "eval_recall": 0.9470506270320483, + "eval_runtime": 36.1642, + "eval_samples_per_second": 5.503, + "eval_steps_per_second": 1.383, + "step": 930 + }, + { + "epoch": 10.485436893203884, + "grad_norm": 0.9225968718528748, + "learning_rate": 6.850000000000001e-06, + "loss": 0.0416, + "step": 945 + }, + { + "epoch": 10.485436893203884, + "eval_accuracy": 0.9891310225490726, + "eval_f1": 0.9430481899930827, + "eval_loss": 0.046711865812540054, + "eval_precision": 0.9363553113553114, + "eval_recall": 0.9498374361356247, + "eval_runtime": 36.5741, + "eval_samples_per_second": 5.441, + "eval_steps_per_second": 1.367, + "step": 945 + }, + { + "epoch": 10.651872399445216, + "grad_norm": 2.7210068702697754, + "learning_rate": 6.800000000000001e-06, + "loss": 0.0403, + "step": 960 + }, + { + "epoch": 10.651872399445216, + "eval_accuracy": 0.9886443519169416, + "eval_f1": 0.9384650841207652, + "eval_loss": 0.04987097531557083, + "eval_precision": 0.9313815187557182, + "eval_recall": 0.9456572224802601, + "eval_runtime": 36.5567, + "eval_samples_per_second": 5.444, + "eval_steps_per_second": 1.368, + "step": 960 + }, + { + "epoch": 10.818307905686547, + "grad_norm": 1.160333275794983, + "learning_rate": 6.750000000000001e-06, + "loss": 0.0354, + "step": 975 + }, + { + "epoch": 10.818307905686547, + "eval_accuracy": 0.9882658303141729, + "eval_f1": 0.9354171454837968, + "eval_loss": 0.05233873799443245, + "eval_precision": 0.9258416742493175, + "eval_recall": 0.9451927542963307, + "eval_runtime": 36.44, + "eval_samples_per_second": 5.461, + "eval_steps_per_second": 1.372, + "step": 975 + }, + { + "epoch": 10.984743411927878, + "grad_norm": 0.8807191848754883, + "learning_rate": 6.700000000000001e-06, + "loss": 0.0338, + "step": 990 + }, + { + "epoch": 10.984743411927878, + "eval_accuracy": 0.9879954577407668, + "eval_f1": 0.9318025258323767, + "eval_loss": 0.052071038633584976, + "eval_precision": 0.9214350590372389, + "eval_recall": 0.9424059451927543, + "eval_runtime": 36.6322, + "eval_samples_per_second": 5.432, + "eval_steps_per_second": 1.365, + "step": 990 + }, + { + "epoch": 11.15117891816921, + "grad_norm": 1.1557176113128662, + "learning_rate": 6.650000000000001e-06, + "loss": 0.0347, + "step": 1005 + }, + { + "epoch": 11.15117891816921, + "eval_accuracy": 0.988049532255448, + "eval_f1": 0.9353507565337001, + "eval_loss": 0.053912434726953506, + "eval_precision": 0.9234947940244455, + "eval_recall": 0.9475150952159777, + "eval_runtime": 36.5986, + "eval_samples_per_second": 5.437, + "eval_steps_per_second": 1.366, + "step": 1005 + }, + { + "epoch": 11.317614424410541, + "grad_norm": 1.668484091758728, + "learning_rate": 6.600000000000001e-06, + "loss": 0.0364, + "step": 1020 + }, + { + "epoch": 11.317614424410541, + "eval_accuracy": 0.9870761909911858, + "eval_f1": 0.9334552938486165, + "eval_loss": 0.055973075330257416, + "eval_precision": 0.9193693693693694, + "eval_recall": 0.9479795633999071, + "eval_runtime": 36.6625, + "eval_samples_per_second": 5.428, + "eval_steps_per_second": 1.364, + "step": 1020 + }, + { + "epoch": 11.484049930651873, + "grad_norm": 2.5720293521881104, + "learning_rate": 6.550000000000001e-06, + "loss": 0.0363, + "step": 1035 + }, + { + "epoch": 11.484049930651873, + "eval_accuracy": 0.9889147244903477, + "eval_f1": 0.9381751321535279, + "eval_loss": 0.050925422459840775, + "eval_precision": 0.9285714285714286, + "eval_recall": 0.9479795633999071, + "eval_runtime": 36.4069, + "eval_samples_per_second": 5.466, + "eval_steps_per_second": 1.373, + "step": 1035 + }, + { + "epoch": 11.650485436893204, + "grad_norm": 2.5676207542419434, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.0308, + "step": 1050 + }, + { + "epoch": 11.650485436893204, + "eval_accuracy": 0.9893473206077975, + "eval_f1": 0.94362292051756, + "eval_loss": 0.04982053115963936, + "eval_precision": 0.9388505747126437, + "eval_recall": 0.9484440315838365, + "eval_runtime": 36.3679, + "eval_samples_per_second": 5.472, + "eval_steps_per_second": 1.375, + "step": 1050 + }, + { + "epoch": 11.816920943134535, + "grad_norm": 0.9586185812950134, + "learning_rate": 6.450000000000001e-06, + "loss": 0.032, + "step": 1065 + }, + { + "epoch": 11.816920943134535, + "eval_accuracy": 0.9891310225490726, + "eval_f1": 0.9403330249768733, + "eval_loss": 0.04908496141433716, + "eval_precision": 0.9364348226623675, + "eval_recall": 0.9442638179284719, + "eval_runtime": 35.9979, + "eval_samples_per_second": 5.528, + "eval_steps_per_second": 1.389, + "step": 1065 + }, + { + "epoch": 11.983356449375867, + "grad_norm": 1.067063331604004, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.0331, + "step": 1080 + }, + { + "epoch": 11.983356449375867, + "eval_accuracy": 0.9891850970637539, + "eval_f1": 0.940768162887552, + "eval_loss": 0.0454898327589035, + "eval_precision": 0.9372982941447672, + "eval_recall": 0.9442638179284719, + "eval_runtime": 36.1674, + "eval_samples_per_second": 5.502, + "eval_steps_per_second": 1.382, + "step": 1080 + }, + { + "epoch": 12.149791955617198, + "grad_norm": 1.4905815124511719, + "learning_rate": 6.35e-06, + "loss": 0.0301, + "step": 1095 + }, + { + "epoch": 12.149791955617198, + "eval_accuracy": 0.9891850970637539, + "eval_f1": 0.9423431734317342, + "eval_loss": 0.04859260097146034, + "eval_precision": 0.9358680714612918, + "eval_recall": 0.9489084997677659, + "eval_runtime": 36.286, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 1.378, + "step": 1095 + }, + { + "epoch": 12.31622746185853, + "grad_norm": 1.3888496160507202, + "learning_rate": 6.300000000000001e-06, + "loss": 0.0308, + "step": 1110 + }, + { + "epoch": 12.31622746185853, + "eval_accuracy": 0.9891310225490726, + "eval_f1": 0.9413388543823326, + "eval_loss": 0.051349248737096786, + "eval_precision": 0.9325432999088423, + "eval_recall": 0.9503019043195541, + "eval_runtime": 36.2143, + "eval_samples_per_second": 5.495, + "eval_steps_per_second": 1.381, + "step": 1110 + }, + { + "epoch": 12.482662968099861, + "grad_norm": 0.5457278490066528, + "learning_rate": 6.25e-06, + "loss": 0.0253, + "step": 1125 + }, + { + "epoch": 12.482662968099861, + "eval_accuracy": 0.9891850970637539, + "eval_f1": 0.939825447864033, + "eval_loss": 0.05103699862957001, + "eval_precision": 0.9295774647887324, + "eval_recall": 0.9503019043195541, + "eval_runtime": 36.4491, + "eval_samples_per_second": 5.46, + "eval_steps_per_second": 1.372, + "step": 1125 + }, + { + "epoch": 12.649098474341192, + "grad_norm": 1.106314778327942, + "learning_rate": 6.200000000000001e-06, + "loss": 0.0301, + "step": 1140 + }, + { + "epoch": 12.649098474341192, + "eval_accuracy": 0.9886443519169416, + "eval_f1": 0.9397424103035878, + "eval_loss": 0.053277622908353806, + "eval_precision": 0.9307517084282461, + "eval_recall": 0.9489084997677659, + "eval_runtime": 36.4299, + "eval_samples_per_second": 5.463, + "eval_steps_per_second": 1.372, + "step": 1140 + }, + { + "epoch": 12.815533980582524, + "grad_norm": 0.9172839522361755, + "learning_rate": 6.15e-06, + "loss": 0.0328, + "step": 1155 + }, + { + "epoch": 12.815533980582524, + "eval_accuracy": 0.9884821283728978, + "eval_f1": 0.9364348226623675, + "eval_loss": 0.0548846460878849, + "eval_precision": 0.9287345820009136, + "eval_recall": 0.9442638179284719, + "eval_runtime": 36.3929, + "eval_samples_per_second": 5.468, + "eval_steps_per_second": 1.374, + "step": 1155 + }, + { + "epoch": 12.981969486823855, + "grad_norm": 1.9091347455978394, + "learning_rate": 6.1e-06, + "loss": 0.0298, + "step": 1170 + }, + { + "epoch": 12.981969486823855, + "eval_accuracy": 0.98945546963716, + "eval_f1": 0.9450092421441775, + "eval_loss": 0.05042650178074837, + "eval_precision": 0.9402298850574713, + "eval_recall": 0.9498374361356247, + "eval_runtime": 35.8371, + "eval_samples_per_second": 5.553, + "eval_steps_per_second": 1.395, + "step": 1170 + }, + { + "epoch": 13.148404993065187, + "grad_norm": 1.2674860954284668, + "learning_rate": 6.0500000000000005e-06, + "loss": 0.0256, + "step": 1185 + }, + { + "epoch": 13.148404993065187, + "eval_accuracy": 0.988752500946304, + "eval_f1": 0.9386716037954178, + "eval_loss": 0.051467474550008774, + "eval_precision": 0.9354243542435424, + "eval_recall": 0.9419414770088249, + "eval_runtime": 36.0333, + "eval_samples_per_second": 5.523, + "eval_steps_per_second": 1.388, + "step": 1185 + }, + { + "epoch": 13.314840499306518, + "grad_norm": 1.406807780265808, + "learning_rate": 6e-06, + "loss": 0.0313, + "step": 1200 + }, + { + "epoch": 13.314840499306518, + "eval_accuracy": 0.9905369599307846, + "eval_f1": 0.9480968858131489, + "eval_loss": 0.048274096101522446, + "eval_precision": 0.9417965169569202, + "eval_recall": 0.9544821179749187, + "eval_runtime": 35.8422, + "eval_samples_per_second": 5.552, + "eval_steps_per_second": 1.395, + "step": 1200 + }, + { + "epoch": 13.48127600554785, + "grad_norm": 0.5426374673843384, + "learning_rate": 5.950000000000001e-06, + "loss": 0.022, + "step": 1215 + }, + { + "epoch": 13.48127600554785, + "eval_accuracy": 0.9898880657546099, + "eval_f1": 0.9445339470655927, + "eval_loss": 0.0463298000395298, + "eval_precision": 0.9361313868613139, + "eval_recall": 0.9530887134231305, + "eval_runtime": 36.2558, + "eval_samples_per_second": 5.489, + "eval_steps_per_second": 1.379, + "step": 1215 + }, + { + "epoch": 13.647711511789181, + "grad_norm": 2.050182342529297, + "learning_rate": 5.9e-06, + "loss": 0.0245, + "step": 1230 + }, + { + "epoch": 13.647711511789181, + "eval_accuracy": 0.9893473206077975, + "eval_f1": 0.9430219146482123, + "eval_loss": 0.04942420497536659, + "eval_precision": 0.9367552703941339, + "eval_recall": 0.9493729679516953, + "eval_runtime": 36.4711, + "eval_samples_per_second": 5.456, + "eval_steps_per_second": 1.371, + "step": 1230 + }, + { + "epoch": 13.814147018030512, + "grad_norm": 1.7617555856704712, + "learning_rate": 5.85e-06, + "loss": 0.0251, + "step": 1245 + }, + { + "epoch": 13.814147018030512, + "eval_accuracy": 0.9897799167252473, + "eval_f1": 0.9467128027681662, + "eval_loss": 0.049306854605674744, + "eval_precision": 0.9404216315307058, + "eval_recall": 0.9530887134231305, + "eval_runtime": 36.1814, + "eval_samples_per_second": 5.5, + "eval_steps_per_second": 1.382, + "step": 1245 + }, + { + "epoch": 13.980582524271846, + "grad_norm": 1.183014154434204, + "learning_rate": 5.8e-06, + "loss": 0.0259, + "step": 1260 + }, + { + "epoch": 13.980582524271846, + "eval_accuracy": 0.98945546963716, + "eval_f1": 0.9453539312889093, + "eval_loss": 0.05114530399441719, + "eval_precision": 0.9386446886446886, + "eval_recall": 0.9521597770552717, + "eval_runtime": 36.0831, + "eval_samples_per_second": 5.515, + "eval_steps_per_second": 1.386, + "step": 1260 + }, + { + "epoch": 14.147018030513177, + "grad_norm": 0.6956959962844849, + "learning_rate": 5.75e-06, + "loss": 0.03, + "step": 1275 + }, + { + "epoch": 14.147018030513177, + "eval_accuracy": 0.9888606499756665, + "eval_f1": 0.9399815327793166, + "eval_loss": 0.053482603281736374, + "eval_precision": 0.9343735658558971, + "eval_recall": 0.9456572224802601, + "eval_runtime": 35.8745, + "eval_samples_per_second": 5.547, + "eval_steps_per_second": 1.394, + "step": 1275 + }, + { + "epoch": 14.313453536754508, + "grad_norm": 1.2064058780670166, + "learning_rate": 5.7e-06, + "loss": 0.0192, + "step": 1290 + }, + { + "epoch": 14.313453536754508, + "eval_accuracy": 0.9898880657546099, + "eval_f1": 0.9460772969220087, + "eval_loss": 0.049094799906015396, + "eval_precision": 0.9428044280442804, + "eval_recall": 0.9493729679516953, + "eval_runtime": 35.7923, + "eval_samples_per_second": 5.56, + "eval_steps_per_second": 1.397, + "step": 1290 + }, + { + "epoch": 14.47988904299584, + "grad_norm": 1.727489948272705, + "learning_rate": 5.65e-06, + "loss": 0.0267, + "step": 1305 + }, + { + "epoch": 14.47988904299584, + "eval_accuracy": 0.9901043638133348, + "eval_f1": 0.9500693481276006, + "eval_loss": 0.04895344376564026, + "eval_precision": 0.9456971928209849, + "eval_recall": 0.9544821179749187, + "eval_runtime": 36.349, + "eval_samples_per_second": 5.475, + "eval_steps_per_second": 1.376, + "step": 1305 + }, + { + "epoch": 14.646324549237171, + "grad_norm": 0.6142871379852295, + "learning_rate": 5.600000000000001e-06, + "loss": 0.0241, + "step": 1320 + }, + { + "epoch": 14.646324549237171, + "eval_accuracy": 0.9899421402692911, + "eval_f1": 0.948729792147806, + "eval_loss": 0.050602879375219345, + "eval_precision": 0.9435002296738632, + "eval_recall": 0.9540176497909894, + "eval_runtime": 36.205, + "eval_samples_per_second": 5.496, + "eval_steps_per_second": 1.381, + "step": 1320 + }, + { + "epoch": 14.812760055478503, + "grad_norm": 1.6362483501434326, + "learning_rate": 5.550000000000001e-06, + "loss": 0.0211, + "step": 1335 + }, + { + "epoch": 14.812760055478503, + "eval_accuracy": 0.9903206618720597, + "eval_f1": 0.9491682070240296, + "eval_loss": 0.050954435020685196, + "eval_precision": 0.944367816091954, + "eval_recall": 0.9540176497909894, + "eval_runtime": 36.469, + "eval_samples_per_second": 5.457, + "eval_steps_per_second": 1.371, + "step": 1335 + }, + { + "epoch": 14.979195561719834, + "grad_norm": 0.9267581105232239, + "learning_rate": 5.500000000000001e-06, + "loss": 0.0171, + "step": 1350 + }, + { + "epoch": 14.979195561719834, + "eval_accuracy": 0.9897799167252473, + "eval_f1": 0.9474412171507607, + "eval_loss": 0.04994847625494003, + "eval_precision": 0.9405034324942791, + "eval_recall": 0.9544821179749187, + "eval_runtime": 36.7159, + "eval_samples_per_second": 5.42, + "eval_steps_per_second": 1.362, + "step": 1350 + }, + { + "epoch": 15.145631067961165, + "grad_norm": 0.6142176389694214, + "learning_rate": 5.450000000000001e-06, + "loss": 0.0226, + "step": 1365 + }, + { + "epoch": 15.145631067961165, + "eval_accuracy": 0.9894013951224788, + "eval_f1": 0.9452369995398067, + "eval_loss": 0.05113999918103218, + "eval_precision": 0.9366165070679434, + "eval_recall": 0.9540176497909894, + "eval_runtime": 36.247, + "eval_samples_per_second": 5.49, + "eval_steps_per_second": 1.379, + "step": 1365 + }, + { + "epoch": 15.312066574202497, + "grad_norm": 0.46341672539711, + "learning_rate": 5.400000000000001e-06, + "loss": 0.024, + "step": 1380 + }, + { + "epoch": 15.312066574202497, + "eval_accuracy": 0.9899421402692911, + "eval_f1": 0.9501385041551247, + "eval_loss": 0.04835886508226395, + "eval_precision": 0.9444699403396053, + "eval_recall": 0.9558755225267069, + "eval_runtime": 35.8678, + "eval_samples_per_second": 5.548, + "eval_steps_per_second": 1.394, + "step": 1380 + }, + { + "epoch": 15.478502080443828, + "grad_norm": 1.446049690246582, + "learning_rate": 5.3500000000000004e-06, + "loss": 0.018, + "step": 1395 + }, + { + "epoch": 15.478502080443828, + "eval_accuracy": 0.9903206618720597, + "eval_f1": 0.9492703266157054, + "eval_loss": 0.04823274910449982, + "eval_precision": 0.9468576709796673, + "eval_recall": 0.9516953088713423, + "eval_runtime": 35.9765, + "eval_samples_per_second": 5.531, + "eval_steps_per_second": 1.39, + "step": 1395 + }, + { + "epoch": 15.64493758668516, + "grad_norm": 0.7485630512237549, + "learning_rate": 5.300000000000001e-06, + "loss": 0.0191, + "step": 1410 + }, + { + "epoch": 15.64493758668516, + "eval_accuracy": 0.9899421402692911, + "eval_f1": 0.947709393799167, + "eval_loss": 0.04913439229130745, + "eval_precision": 0.9442139234670355, + "eval_recall": 0.9512308406874129, + "eval_runtime": 36.5589, + "eval_samples_per_second": 5.443, + "eval_steps_per_second": 1.368, + "step": 1410 + }, + { + "epoch": 15.811373092926491, + "grad_norm": 0.8376514911651611, + "learning_rate": 5.2500000000000006e-06, + "loss": 0.0203, + "step": 1425 + }, + { + "epoch": 15.811373092926491, + "eval_accuracy": 0.9912399286216407, + "eval_f1": 0.9531974050046339, + "eval_loss": 0.04510456323623657, + "eval_precision": 0.9509939898289412, + "eval_recall": 0.9554110543427775, + "eval_runtime": 36.8157, + "eval_samples_per_second": 5.405, + "eval_steps_per_second": 1.358, + "step": 1425 + }, + { + "epoch": 15.977808599167822, + "grad_norm": 1.1797449588775635, + "learning_rate": 5.2e-06, + "loss": 0.0198, + "step": 1440 + }, + { + "epoch": 15.977808599167822, + "eval_accuracy": 0.9911317795922782, + "eval_f1": 0.952292728114868, + "eval_loss": 0.04465332254767418, + "eval_precision": 0.9496535796766744, + "eval_recall": 0.9549465861588481, + "eval_runtime": 36.3506, + "eval_samples_per_second": 5.474, + "eval_steps_per_second": 1.375, + "step": 1440 + }, + { + "epoch": 16.144244105409154, + "grad_norm": 2.32300066947937, + "learning_rate": 5.150000000000001e-06, + "loss": 0.0167, + "step": 1455 + }, + { + "epoch": 16.144244105409154, + "eval_accuracy": 0.9909154815335532, + "eval_f1": 0.9513663733209818, + "eval_loss": 0.044419851154088974, + "eval_precision": 0.948729792147806, + "eval_recall": 0.9540176497909894, + "eval_runtime": 36.4511, + "eval_samples_per_second": 5.459, + "eval_steps_per_second": 1.372, + "step": 1455 + }, + { + "epoch": 16.310679611650485, + "grad_norm": 1.4079307317733765, + "learning_rate": 5.1e-06, + "loss": 0.0178, + "step": 1470 + }, + { + "epoch": 16.310679611650485, + "eval_accuracy": 0.9891850970637539, + "eval_f1": 0.9448673587081892, + "eval_loss": 0.05134458467364311, + "eval_precision": 0.9385884509624198, + "eval_recall": 0.9512308406874129, + "eval_runtime": 35.9882, + "eval_samples_per_second": 5.53, + "eval_steps_per_second": 1.389, + "step": 1470 + }, + { + "epoch": 16.477115117891817, + "grad_norm": 1.1276496648788452, + "learning_rate": 5.050000000000001e-06, + "loss": 0.024, + "step": 1485 + }, + { + "epoch": 16.477115117891817, + "eval_accuracy": 0.9899421402692911, + "eval_f1": 0.9482678983833718, + "eval_loss": 0.0502447672188282, + "eval_precision": 0.9430408819476344, + "eval_recall": 0.9535531816070599, + "eval_runtime": 36.2001, + "eval_samples_per_second": 5.497, + "eval_steps_per_second": 1.381, + "step": 1485 + }, + { + "epoch": 16.643550624133148, + "grad_norm": 1.1420115232467651, + "learning_rate": 5e-06, + "loss": 0.0206, + "step": 1500 + }, + { + "epoch": 16.643550624133148, + "eval_accuracy": 0.9907532579895095, + "eval_f1": 0.9513888888888888, + "eval_loss": 0.045851416885852814, + "eval_precision": 0.9483156437471159, + "eval_recall": 0.9544821179749187, + "eval_runtime": 36.0375, + "eval_samples_per_second": 5.522, + "eval_steps_per_second": 1.387, + "step": 1500 + }, + { + "epoch": 16.80998613037448, + "grad_norm": 0.6803048849105835, + "learning_rate": 4.95e-06, + "loss": 0.0188, + "step": 1515 + }, + { + "epoch": 16.80998613037448, + "eval_accuracy": 0.9906451089601471, + "eval_f1": 0.9507058551261283, + "eval_loss": 0.04693201929330826, + "eval_precision": 0.9474169741697417, + "eval_recall": 0.9540176497909894, + "eval_runtime": 36.4292, + "eval_samples_per_second": 5.463, + "eval_steps_per_second": 1.373, + "step": 1515 + }, + { + "epoch": 16.97642163661581, + "grad_norm": 0.6494084000587463, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.016, + "step": 1530 + }, + { + "epoch": 16.97642163661581, + "eval_accuracy": 0.9905910344454658, + "eval_f1": 0.9524469067405354, + "eval_loss": 0.04632224142551422, + "eval_precision": 0.9467645709040844, + "eval_recall": 0.9581978634463539, + "eval_runtime": 36.8269, + "eval_samples_per_second": 5.404, + "eval_steps_per_second": 1.358, + "step": 1530 + }, + { + "epoch": 17.142857142857142, + "grad_norm": 0.9313808083534241, + "learning_rate": 4.85e-06, + "loss": 0.0161, + "step": 1545 + }, + { + "epoch": 17.142857142857142, + "eval_accuracy": 0.991077705077597, + "eval_f1": 0.9555966697502312, + "eval_loss": 0.045460253953933716, + "eval_precision": 0.9516351911561493, + "eval_recall": 0.9595912679981421, + "eval_runtime": 36.5267, + "eval_samples_per_second": 5.448, + "eval_steps_per_second": 1.369, + "step": 1545 + }, + { + "epoch": 17.309292649098474, + "grad_norm": 0.6977990865707397, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0135, + "step": 1560 + }, + { + "epoch": 17.309292649098474, + "eval_accuracy": 0.9909154815335532, + "eval_f1": 0.9548297428769978, + "eval_loss": 0.04745380952954292, + "eval_precision": 0.9524029574861368, + "eval_recall": 0.9572689270784951, + "eval_runtime": 36.396, + "eval_samples_per_second": 5.468, + "eval_steps_per_second": 1.374, + "step": 1560 + }, + { + "epoch": 17.475728155339805, + "grad_norm": 0.7467624545097351, + "learning_rate": 4.75e-06, + "loss": 0.0148, + "step": 1575 + }, + { + "epoch": 17.475728155339805, + "eval_accuracy": 0.9904828854161034, + "eval_f1": 0.9491916859122401, + "eval_loss": 0.047850631177425385, + "eval_precision": 0.9439595774000918, + "eval_recall": 0.9544821179749187, + "eval_runtime": 36.2126, + "eval_samples_per_second": 5.495, + "eval_steps_per_second": 1.381, + "step": 1575 + }, + { + "epoch": 17.642163661581137, + "grad_norm": 0.7804221510887146, + "learning_rate": 4.7e-06, + "loss": 0.0173, + "step": 1590 + }, + { + "epoch": 17.642163661581137, + "eval_accuracy": 0.9915103011950468, + "eval_f1": 0.9571858366118954, + "eval_loss": 0.04551170393824577, + "eval_precision": 0.9538745387453874, + "eval_recall": 0.9605202043660009, + "eval_runtime": 36.006, + "eval_samples_per_second": 5.527, + "eval_steps_per_second": 1.389, + "step": 1590 + }, + { + "epoch": 17.808599167822468, + "grad_norm": 1.0907295942306519, + "learning_rate": 4.65e-06, + "loss": 0.0173, + "step": 1605 + }, + { + "epoch": 17.808599167822468, + "eval_accuracy": 0.9913480776510031, + "eval_f1": 0.9514338575393155, + "eval_loss": 0.04557771980762482, + "eval_precision": 0.9474896361123906, + "eval_recall": 0.9554110543427775, + "eval_runtime": 36.2064, + "eval_samples_per_second": 5.496, + "eval_steps_per_second": 1.381, + "step": 1605 + }, + { + "epoch": 17.9750346740638, + "grad_norm": 1.295432209968567, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0185, + "step": 1620 + }, + { + "epoch": 17.9750346740638, + "eval_accuracy": 0.9907532579895095, + "eval_f1": 0.9537465309898243, + "eval_loss": 0.04614636301994324, + "eval_precision": 0.9497927222478121, + "eval_recall": 0.9577333952624245, + "eval_runtime": 36.269, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 1.379, + "step": 1620 + }, + { + "epoch": 18.14147018030513, + "grad_norm": 1.0728676319122314, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0153, + "step": 1635 + }, + { + "epoch": 18.14147018030513, + "eval_accuracy": 0.991077705077597, + "eval_f1": 0.9547553093259464, + "eval_loss": 0.04719853028655052, + "eval_precision": 0.9490592014685636, + "eval_recall": 0.9605202043660009, + "eval_runtime": 36.7621, + "eval_samples_per_second": 5.413, + "eval_steps_per_second": 1.36, + "step": 1635 + }, + { + "epoch": 18.307905686546462, + "grad_norm": 0.848417341709137, + "learning_rate": 4.5e-06, + "loss": 0.0148, + "step": 1650 + }, + { + "epoch": 18.307905686546462, + "eval_accuracy": 0.9913480776510031, + "eval_f1": 0.9546716003700277, + "eval_loss": 0.04460978880524635, + "eval_precision": 0.9507139567019807, + "eval_recall": 0.9586623316302834, + "eval_runtime": 36.6036, + "eval_samples_per_second": 5.437, + "eval_steps_per_second": 1.366, + "step": 1650 + }, + { + "epoch": 18.474341192787794, + "grad_norm": 0.8914014101028442, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0136, + "step": 1665 + }, + { + "epoch": 18.474341192787794, + "eval_accuracy": 0.9914021521656843, + "eval_f1": 0.9542936288088641, + "eval_loss": 0.044093821197748184, + "eval_precision": 0.9486002753556677, + "eval_recall": 0.9600557361820715, + "eval_runtime": 36.4626, + "eval_samples_per_second": 5.458, + "eval_steps_per_second": 1.371, + "step": 1665 + }, + { + "epoch": 18.640776699029125, + "grad_norm": 1.768336534500122, + "learning_rate": 4.4e-06, + "loss": 0.0185, + "step": 1680 + }, + { + "epoch": 18.640776699029125, + "eval_accuracy": 0.9914562266803656, + "eval_f1": 0.9550509731232623, + "eval_loss": 0.047818973660469055, + "eval_precision": 0.9528432732316228, + "eval_recall": 0.9572689270784951, + "eval_runtime": 35.9606, + "eval_samples_per_second": 5.534, + "eval_steps_per_second": 1.39, + "step": 1680 + }, + { + "epoch": 18.807212205270456, + "grad_norm": 0.8891735672950745, + "learning_rate": 4.353333333333334e-06, + "loss": 0.0147, + "step": 1695 + }, + { + "epoch": 18.807212205270456, + "eval_accuracy": 0.9911858541069594, + "eval_f1": 0.9582660825455385, + "eval_loss": 0.04927229881286621, + "eval_precision": 0.9514652014652014, + "eval_recall": 0.965164886205295, + "eval_runtime": 36.0431, + "eval_samples_per_second": 5.521, + "eval_steps_per_second": 1.387, + "step": 1695 + }, + { + "epoch": 18.973647711511788, + "grad_norm": 0.860618531703949, + "learning_rate": 4.303333333333334e-06, + "loss": 0.0156, + "step": 1710 + }, + { + "epoch": 18.973647711511788, + "eval_accuracy": 0.9902665873573785, + "eval_f1": 0.9491916859122401, + "eval_loss": 0.05092372000217438, + "eval_precision": 0.9439595774000918, + "eval_recall": 0.9544821179749187, + "eval_runtime": 36.549, + "eval_samples_per_second": 5.445, + "eval_steps_per_second": 1.368, + "step": 1710 + }, + { + "epoch": 19.14008321775312, + "grad_norm": 0.4298454821109772, + "learning_rate": 4.253333333333334e-06, + "loss": 0.0113, + "step": 1725 + }, + { + "epoch": 19.14008321775312, + "eval_accuracy": 0.9911317795922782, + "eval_f1": 0.9566024599675098, + "eval_loss": 0.046022918075323105, + "eval_precision": 0.9559369202226345, + "eval_recall": 0.9572689270784951, + "eval_runtime": 36.802, + "eval_samples_per_second": 5.407, + "eval_steps_per_second": 1.359, + "step": 1725 + }, + { + "epoch": 19.30651872399445, + "grad_norm": 0.7119155526161194, + "learning_rate": 4.2033333333333335e-06, + "loss": 0.014, + "step": 1740 + }, + { + "epoch": 19.30651872399445, + "eval_accuracy": 0.9904828854161034, + "eval_f1": 0.948220064724919, + "eval_loss": 0.04928451031446457, + "eval_precision": 0.9438564196962724, + "eval_recall": 0.9526242452392011, + "eval_runtime": 36.4604, + "eval_samples_per_second": 5.458, + "eval_steps_per_second": 1.371, + "step": 1740 + }, + { + "epoch": 19.472954230235782, + "grad_norm": 0.6270649433135986, + "learning_rate": 4.153333333333334e-06, + "loss": 0.0147, + "step": 1755 + }, + { + "epoch": 19.472954230235782, + "eval_accuracy": 0.9906451089601471, + "eval_f1": 0.9521608504737693, + "eval_loss": 0.04984944686293602, + "eval_precision": 0.9475620975160993, + "eval_recall": 0.9568044588945657, + "eval_runtime": 36.1824, + "eval_samples_per_second": 5.5, + "eval_steps_per_second": 1.382, + "step": 1755 + }, + { + "epoch": 19.639389736477114, + "grad_norm": 0.9536636471748352, + "learning_rate": 4.1033333333333336e-06, + "loss": 0.0126, + "step": 1770 + }, + { + "epoch": 19.639389736477114, + "eval_accuracy": 0.9905910344454658, + "eval_f1": 0.9502199583236861, + "eval_loss": 0.04928808659315109, + "eval_precision": 0.9473684210526315, + "eval_recall": 0.9530887134231305, + "eval_runtime": 36.346, + "eval_samples_per_second": 5.475, + "eval_steps_per_second": 1.376, + "step": 1770 + }, + { + "epoch": 19.805825242718445, + "grad_norm": 2.24277925491333, + "learning_rate": 4.053333333333333e-06, + "loss": 0.0167, + "step": 1785 + }, + { + "epoch": 19.805825242718445, + "eval_accuracy": 0.9903747363867409, + "eval_f1": 0.9519852262234534, + "eval_loss": 0.04912427067756653, + "eval_precision": 0.9463056447911886, + "eval_recall": 0.9577333952624245, + "eval_runtime": 36.3829, + "eval_samples_per_second": 5.47, + "eval_steps_per_second": 1.374, + "step": 1785 + }, + { + "epoch": 19.972260748959776, + "grad_norm": 1.1929985284805298, + "learning_rate": 4.003333333333334e-06, + "loss": 0.0126, + "step": 1800 + }, + { + "epoch": 19.972260748959776, + "eval_accuracy": 0.9907532579895095, + "eval_f1": 0.9515867500579105, + "eval_loss": 0.04741891101002693, + "eval_precision": 0.9491682070240296, + "eval_recall": 0.9540176497909894, + "eval_runtime": 36.3224, + "eval_samples_per_second": 5.479, + "eval_steps_per_second": 1.377, + "step": 1800 + }, + { + "epoch": 20.13869625520111, + "grad_norm": 0.5980396866798401, + "learning_rate": 3.953333333333333e-06, + "loss": 0.0107, + "step": 1815 + }, + { + "epoch": 20.13869625520111, + "eval_accuracy": 0.9914021521656843, + "eval_f1": 0.9550717924965262, + "eval_loss": 0.04617602005600929, + "eval_precision": 0.9524249422632795, + "eval_recall": 0.9577333952624245, + "eval_runtime": 36.2289, + "eval_samples_per_second": 5.493, + "eval_steps_per_second": 1.38, + "step": 1815 + }, + { + "epoch": 20.305131761442443, + "grad_norm": 0.5774451494216919, + "learning_rate": 3.903333333333334e-06, + "loss": 0.0115, + "step": 1830 + }, + { + "epoch": 20.305131761442443, + "eval_accuracy": 0.9911317795922782, + "eval_f1": 0.9558993304086816, + "eval_loss": 0.048068635165691376, + "eval_precision": 0.9504132231404959, + "eval_recall": 0.9614491407338597, + "eval_runtime": 36.6091, + "eval_samples_per_second": 5.436, + "eval_steps_per_second": 1.366, + "step": 1830 + }, + { + "epoch": 20.471567267683774, + "grad_norm": 0.8061049580574036, + "learning_rate": 3.853333333333334e-06, + "loss": 0.0128, + "step": 1845 + }, + { + "epoch": 20.471567267683774, + "eval_accuracy": 0.9906991834748283, + "eval_f1": 0.951918631530282, + "eval_loss": 0.04859815165400505, + "eval_precision": 0.9475379659456972, + "eval_recall": 0.9563399907106364, + "eval_runtime": 36.2061, + "eval_samples_per_second": 5.496, + "eval_steps_per_second": 1.381, + "step": 1845 + }, + { + "epoch": 20.638002773925106, + "grad_norm": 0.5735962986946106, + "learning_rate": 3.803333333333334e-06, + "loss": 0.0113, + "step": 1860 + }, + { + "epoch": 20.638002773925106, + "eval_accuracy": 0.9910236305629156, + "eval_f1": 0.9533702677746998, + "eval_loss": 0.04910165071487427, + "eval_precision": 0.947682423129876, + "eval_recall": 0.9591267998142127, + "eval_runtime": 36.2871, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 1.378, + "step": 1860 + }, + { + "epoch": 20.804438280166437, + "grad_norm": 0.5703373551368713, + "learning_rate": 3.753333333333334e-06, + "loss": 0.0119, + "step": 1875 + }, + { + "epoch": 20.804438280166437, + "eval_accuracy": 0.9901043638133348, + "eval_f1": 0.9498607242339832, + "eval_loss": 0.05141424015164375, + "eval_precision": 0.9494199535962877, + "eval_recall": 0.9503019043195541, + "eval_runtime": 36.2237, + "eval_samples_per_second": 5.494, + "eval_steps_per_second": 1.38, + "step": 1875 + }, + { + "epoch": 20.97087378640777, + "grad_norm": 0.8812251091003418, + "learning_rate": 3.7033333333333336e-06, + "loss": 0.0122, + "step": 1890 + }, + { + "epoch": 20.97087378640777, + "eval_accuracy": 0.9911317795922782, + "eval_f1": 0.9535903948279844, + "eval_loss": 0.04799521341919899, + "eval_precision": 0.94811753902663, + "eval_recall": 0.9591267998142127, + "eval_runtime": 36.3818, + "eval_samples_per_second": 5.47, + "eval_steps_per_second": 1.374, + "step": 1890 + }, + { + "epoch": 21.1373092926491, + "grad_norm": 0.729183554649353, + "learning_rate": 3.6533333333333336e-06, + "loss": 0.0123, + "step": 1905 + }, + { + "epoch": 21.1373092926491, + "eval_accuracy": 0.9909154815335532, + "eval_f1": 0.9522050334795659, + "eval_loss": 0.04769909009337425, + "eval_precision": 0.9467401285583104, + "eval_recall": 0.9577333952624245, + "eval_runtime": 36.5345, + "eval_samples_per_second": 5.447, + "eval_steps_per_second": 1.369, + "step": 1905 + }, + { + "epoch": 21.30374479889043, + "grad_norm": 0.3428969085216522, + "learning_rate": 3.6033333333333337e-06, + "loss": 0.0116, + "step": 1920 + }, + { + "epoch": 21.30374479889043, + "eval_accuracy": 0.9910236305629156, + "eval_f1": 0.9533271719038817, + "eval_loss": 0.04861655458807945, + "eval_precision": 0.9485057471264368, + "eval_recall": 0.9581978634463539, + "eval_runtime": 36.8199, + "eval_samples_per_second": 5.405, + "eval_steps_per_second": 1.358, + "step": 1920 + }, + { + "epoch": 21.470180305131763, + "grad_norm": 0.4823513925075531, + "learning_rate": 3.5533333333333338e-06, + "loss": 0.0108, + "step": 1935 + }, + { + "epoch": 21.470180305131763, + "eval_accuracy": 0.9904828854161034, + "eval_f1": 0.9511295527893039, + "eval_loss": 0.048778366297483444, + "eval_precision": 0.9441647597254005, + "eval_recall": 0.9581978634463539, + "eval_runtime": 36.5655, + "eval_samples_per_second": 5.442, + "eval_steps_per_second": 1.367, + "step": 1935 + }, + { + "epoch": 21.636615811373094, + "grad_norm": 0.3686061203479767, + "learning_rate": 3.5033333333333334e-06, + "loss": 0.0115, + "step": 1950 + }, + { + "epoch": 21.636615811373094, + "eval_accuracy": 0.9912940031363219, + "eval_f1": 0.9542302357836339, + "eval_loss": 0.04717012122273445, + "eval_precision": 0.9498389323515877, + "eval_recall": 0.9586623316302834, + "eval_runtime": 36.5437, + "eval_samples_per_second": 5.446, + "eval_steps_per_second": 1.368, + "step": 1950 + }, + { + "epoch": 21.803051317614425, + "grad_norm": 1.0370802879333496, + "learning_rate": 3.4533333333333334e-06, + "loss": 0.0083, + "step": 1965 + }, + { + "epoch": 21.803051317614425, + "eval_accuracy": 0.991077705077597, + "eval_f1": 0.954272517321016, + "eval_loss": 0.04759324714541435, + "eval_precision": 0.9490124023886082, + "eval_recall": 0.9595912679981421, + "eval_runtime": 36.2291, + "eval_samples_per_second": 5.493, + "eval_steps_per_second": 1.38, + "step": 1965 + }, + { + "epoch": 21.969486823855757, + "grad_norm": 1.2627676725387573, + "learning_rate": 3.4033333333333335e-06, + "loss": 0.0094, + "step": 1980 + }, + { + "epoch": 21.969486823855757, + "eval_accuracy": 0.990861407018872, + "eval_f1": 0.9543147208121827, + "eval_loss": 0.047525253146886826, + "eval_precision": 0.948188904172398, + "eval_recall": 0.9605202043660009, + "eval_runtime": 36.2268, + "eval_samples_per_second": 5.493, + "eval_steps_per_second": 1.38, + "step": 1980 + }, + { + "epoch": 22.135922330097088, + "grad_norm": 0.2426026463508606, + "learning_rate": 3.3533333333333336e-06, + "loss": 0.0118, + "step": 1995 + }, + { + "epoch": 22.135922330097088, + "eval_accuracy": 0.9904288109014222, + "eval_f1": 0.9501154734411085, + "eval_loss": 0.049215689301490784, + "eval_precision": 0.9448782728525493, + "eval_recall": 0.9554110543427775, + "eval_runtime": 36.1992, + "eval_samples_per_second": 5.497, + "eval_steps_per_second": 1.381, + "step": 1995 + }, + { + "epoch": 22.30235783633842, + "grad_norm": 0.6006263494491577, + "learning_rate": 3.303333333333333e-06, + "loss": 0.01, + "step": 2010 + }, + { + "epoch": 22.30235783633842, + "eval_accuracy": 0.990861407018872, + "eval_f1": 0.9523148148148148, + "eval_loss": 0.048562802374362946, + "eval_precision": 0.949238578680203, + "eval_recall": 0.9554110543427775, + "eval_runtime": 36.2887, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 1.378, + "step": 2010 + }, + { + "epoch": 22.46879334257975, + "grad_norm": 0.7383334040641785, + "learning_rate": 3.2533333333333332e-06, + "loss": 0.0114, + "step": 2025 + }, + { + "epoch": 22.46879334257975, + "eval_accuracy": 0.9910236305629156, + "eval_f1": 0.9539671524404348, + "eval_loss": 0.04967198148369789, + "eval_precision": 0.9502304147465438, + "eval_recall": 0.9577333952624245, + "eval_runtime": 36.3824, + "eval_samples_per_second": 5.47, + "eval_steps_per_second": 1.374, + "step": 2025 + }, + { + "epoch": 22.635228848821082, + "grad_norm": 0.5105836987495422, + "learning_rate": 3.2033333333333337e-06, + "loss": 0.0091, + "step": 2040 + }, + { + "epoch": 22.635228848821082, + "eval_accuracy": 0.9909695560482344, + "eval_f1": 0.954209065679926, + "eval_loss": 0.049895454198122025, + "eval_precision": 0.9502533394748963, + "eval_recall": 0.9581978634463539, + "eval_runtime": 36.2966, + "eval_samples_per_second": 5.483, + "eval_steps_per_second": 1.378, + "step": 2040 + }, + { + "epoch": 22.801664355062414, + "grad_norm": 0.8460143804550171, + "learning_rate": 3.1533333333333338e-06, + "loss": 0.0077, + "step": 2055 + }, + { + "epoch": 22.801664355062414, + "eval_accuracy": 0.991077705077597, + "eval_f1": 0.9563409563409564, + "eval_loss": 0.05023453012108803, + "eval_precision": 0.9512867647058824, + "eval_recall": 0.9614491407338597, + "eval_runtime": 36.5792, + "eval_samples_per_second": 5.44, + "eval_steps_per_second": 1.367, + "step": 2055 + }, + { + "epoch": 22.968099861303745, + "grad_norm": 0.46876421570777893, + "learning_rate": 3.103333333333334e-06, + "loss": 0.01, + "step": 2070 + }, + { + "epoch": 22.968099861303745, + "eval_accuracy": 0.9912940031363219, + "eval_f1": 0.9586127167630057, + "eval_loss": 0.05132585018873215, + "eval_precision": 0.9544198895027625, + "eval_recall": 0.9628425452856479, + "eval_runtime": 36.5943, + "eval_samples_per_second": 5.438, + "eval_steps_per_second": 1.366, + "step": 2070 + }, + { + "epoch": 23.134535367545077, + "grad_norm": 0.26761332154273987, + "learning_rate": 3.053333333333334e-06, + "loss": 0.0087, + "step": 2085 + }, + { + "epoch": 23.134535367545077, + "eval_accuracy": 0.9911858541069594, + "eval_f1": 0.9554375432925422, + "eval_loss": 0.04853161796927452, + "eval_precision": 0.9499540863177227, + "eval_recall": 0.9609846725499304, + "eval_runtime": 36.2471, + "eval_samples_per_second": 5.49, + "eval_steps_per_second": 1.379, + "step": 2085 + }, + { + "epoch": 23.300970873786408, + "grad_norm": 0.32841914892196655, + "learning_rate": 3.0033333333333335e-06, + "loss": 0.0073, + "step": 2100 + }, + { + "epoch": 23.300970873786408, + "eval_accuracy": 0.9916725247390905, + "eval_f1": 0.959278111985192, + "eval_loss": 0.04846283420920372, + "eval_precision": 0.9557399723374828, + "eval_recall": 0.9628425452856479, + "eval_runtime": 36.0113, + "eval_samples_per_second": 5.526, + "eval_steps_per_second": 1.388, + "step": 2100 + }, + { + "epoch": 23.46740638002774, + "grad_norm": 0.3114074766635895, + "learning_rate": 2.9533333333333336e-06, + "loss": 0.0083, + "step": 2115 + }, + { + "epoch": 23.46740638002774, + "eval_accuracy": 0.9913480776510031, + "eval_f1": 0.957205644228545, + "eval_loss": 0.04847896471619606, + "eval_precision": 0.9534562211981567, + "eval_recall": 0.9609846725499304, + "eval_runtime": 36.2766, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.378, + "step": 2115 + }, + { + "epoch": 23.63384188626907, + "grad_norm": 0.815006673336029, + "learning_rate": 2.9033333333333336e-06, + "loss": 0.0117, + "step": 2130 + }, + { + "epoch": 23.63384188626907, + "eval_accuracy": 0.991564375709728, + "eval_f1": 0.9590372598935432, + "eval_loss": 0.04786692187190056, + "eval_precision": 0.955719557195572, + "eval_recall": 0.9623780771017185, + "eval_runtime": 36.5057, + "eval_samples_per_second": 5.451, + "eval_steps_per_second": 1.37, + "step": 2130 + }, + { + "epoch": 23.800277392510402, + "grad_norm": 0.34551236033439636, + "learning_rate": 2.8533333333333337e-06, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 23.800277392510402, + "eval_accuracy": 0.991077705077597, + "eval_f1": 0.9542302357836339, + "eval_loss": 0.05084284767508507, + "eval_precision": 0.9498389323515877, + "eval_recall": 0.9586623316302834, + "eval_runtime": 36.5022, + "eval_samples_per_second": 5.452, + "eval_steps_per_second": 1.37, + "step": 2145 + }, + { + "epoch": 23.966712898751734, + "grad_norm": 0.988761305809021, + "learning_rate": 2.8033333333333333e-06, + "loss": 0.009, + "step": 2160 + }, + { + "epoch": 23.966712898751734, + "eval_accuracy": 0.9909695560482344, + "eval_f1": 0.9559603412497119, + "eval_loss": 0.051338665187358856, + "eval_precision": 0.9491758241758241, + "eval_recall": 0.9628425452856479, + "eval_runtime": 36.4961, + "eval_samples_per_second": 5.453, + "eval_steps_per_second": 1.37, + "step": 2160 + }, + { + "epoch": 24.133148404993065, + "grad_norm": 0.20439928770065308, + "learning_rate": 2.7533333333333334e-06, + "loss": 0.0077, + "step": 2175 + }, + { + "epoch": 24.133148404993065, + "eval_accuracy": 0.9915103011950468, + "eval_f1": 0.9590562109646079, + "eval_loss": 0.050405893474817276, + "eval_precision": 0.9552995391705069, + "eval_recall": 0.9628425452856479, + "eval_runtime": 36.5792, + "eval_samples_per_second": 5.44, + "eval_steps_per_second": 1.367, + "step": 2175 + }, + { + "epoch": 24.299583911234397, + "grad_norm": 0.6065575480461121, + "learning_rate": 2.7033333333333334e-06, + "loss": 0.0087, + "step": 2190 + }, + { + "epoch": 24.299583911234397, + "eval_accuracy": 0.9912940031363219, + "eval_f1": 0.9565418400369857, + "eval_loss": 0.04999900609254837, + "eval_precision": 0.9521398987574782, + "eval_recall": 0.9609846725499304, + "eval_runtime": 36.2889, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 1.378, + "step": 2190 + }, + { + "epoch": 24.466019417475728, + "grad_norm": 0.4505390226840973, + "learning_rate": 2.6533333333333335e-06, + "loss": 0.0068, + "step": 2205 + }, + { + "epoch": 24.466019417475728, + "eval_accuracy": 0.9912940031363219, + "eval_f1": 0.9574271170754282, + "eval_loss": 0.05055619403719902, + "eval_precision": 0.9538958045182112, + "eval_recall": 0.9609846725499304, + "eval_runtime": 35.8763, + "eval_samples_per_second": 5.547, + "eval_steps_per_second": 1.394, + "step": 2205 + }, + { + "epoch": 24.63245492371706, + "grad_norm": 0.2784092128276825, + "learning_rate": 2.603333333333334e-06, + "loss": 0.0094, + "step": 2220 + }, + { + "epoch": 24.63245492371706, + "eval_accuracy": 0.9913480776510031, + "eval_f1": 0.9549132947976879, + "eval_loss": 0.050024211406707764, + "eval_precision": 0.9507366482504604, + "eval_recall": 0.9591267998142127, + "eval_runtime": 36.3659, + "eval_samples_per_second": 5.472, + "eval_steps_per_second": 1.375, + "step": 2220 + }, + { + "epoch": 24.79889042995839, + "grad_norm": 0.24667127430438995, + "learning_rate": 2.5533333333333336e-06, + "loss": 0.0088, + "step": 2235 + }, + { + "epoch": 24.79889042995839, + "eval_accuracy": 0.9914021521656843, + "eval_f1": 0.9551548774849746, + "eval_loss": 0.048643559217453, + "eval_precision": 0.9507593189139438, + "eval_recall": 0.9595912679981421, + "eval_runtime": 36.4912, + "eval_samples_per_second": 5.453, + "eval_steps_per_second": 1.37, + "step": 2235 + }, + { + "epoch": 24.965325936199722, + "grad_norm": 0.10884588211774826, + "learning_rate": 2.5033333333333336e-06, + "loss": 0.0089, + "step": 2250 + }, + { + "epoch": 24.965325936199722, + "eval_accuracy": 0.991077705077597, + "eval_f1": 0.9558789558789559, + "eval_loss": 0.05070747807621956, + "eval_precision": 0.9508272058823529, + "eval_recall": 0.9609846725499304, + "eval_runtime": 36.2816, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 1.378, + "step": 2250 + }, + { + "epoch": 25.131761442441054, + "grad_norm": 0.6150490641593933, + "learning_rate": 2.4533333333333333e-06, + "loss": 0.0063, + "step": 2265 + }, + { + "epoch": 25.131761442441054, + "eval_accuracy": 0.9916725247390905, + "eval_f1": 0.9585360203845263, + "eval_loss": 0.04789712280035019, + "eval_precision": 0.9560998151571165, + "eval_recall": 0.9609846725499304, + "eval_runtime": 36.2469, + "eval_samples_per_second": 5.49, + "eval_steps_per_second": 1.379, + "step": 2265 + }, + { + "epoch": 25.298196948682385, + "grad_norm": 0.7432591319084167, + "learning_rate": 2.4033333333333338e-06, + "loss": 0.0058, + "step": 2280 + }, + { + "epoch": 25.298196948682385, + "eval_accuracy": 0.991077705077597, + "eval_f1": 0.9572452045296973, + "eval_loss": 0.050580546259880066, + "eval_precision": 0.952621895124195, + "eval_recall": 0.9619136089177891, + "eval_runtime": 36.4588, + "eval_samples_per_second": 5.458, + "eval_steps_per_second": 1.371, + "step": 2280 + }, + { + "epoch": 25.464632454923716, + "grad_norm": 0.742586612701416, + "learning_rate": 2.3533333333333334e-06, + "loss": 0.0102, + "step": 2295 + }, + { + "epoch": 25.464632454923716, + "eval_accuracy": 0.9912399286216407, + "eval_f1": 0.9574861367837338, + "eval_loss": 0.04992222413420677, + "eval_precision": 0.9526436781609195, + "eval_recall": 0.9623780771017185, + "eval_runtime": 36.5208, + "eval_samples_per_second": 5.449, + "eval_steps_per_second": 1.369, + "step": 2295 + }, + { + "epoch": 25.631067961165048, + "grad_norm": 0.9237321019172668, + "learning_rate": 2.3033333333333334e-06, + "loss": 0.0079, + "step": 2310 + }, + { + "epoch": 25.631067961165048, + "eval_accuracy": 0.9905369599307846, + "eval_f1": 0.9541368979027426, + "eval_loss": 0.05427027493715286, + "eval_precision": 0.9469350411710887, + "eval_recall": 0.9614491407338597, + "eval_runtime": 36.0226, + "eval_samples_per_second": 5.524, + "eval_steps_per_second": 1.388, + "step": 2310 + }, + { + "epoch": 25.79750346740638, + "grad_norm": 0.2974264621734619, + "learning_rate": 2.2533333333333335e-06, + "loss": 0.009, + "step": 2325 + }, + { + "epoch": 25.79750346740638, + "eval_accuracy": 0.9914562266803656, + "eval_f1": 0.9572452045296973, + "eval_loss": 0.049834854900836945, + "eval_precision": 0.952621895124195, + "eval_recall": 0.9619136089177891, + "eval_runtime": 36.6625, + "eval_samples_per_second": 5.428, + "eval_steps_per_second": 1.364, + "step": 2325 + }, + { + "epoch": 25.96393897364771, + "grad_norm": 0.6791291236877441, + "learning_rate": 2.2033333333333336e-06, + "loss": 0.0068, + "step": 2340 + }, + { + "epoch": 25.96393897364771, + "eval_accuracy": 0.991077705077597, + "eval_f1": 0.9563611175248211, + "eval_loss": 0.05109778791666031, + "eval_precision": 0.950872359963269, + "eval_recall": 0.9619136089177891, + "eval_runtime": 35.8655, + "eval_samples_per_second": 5.549, + "eval_steps_per_second": 1.394, + "step": 2340 + }, + { + "epoch": 26.130374479889042, + "grad_norm": 0.5723872184753418, + "learning_rate": 2.153333333333333e-06, + "loss": 0.007, + "step": 2355 + }, + { + "epoch": 26.130374479889042, + "eval_accuracy": 0.9914021521656843, + "eval_f1": 0.9579676674364895, + "eval_loss": 0.049178168177604675, + "eval_precision": 0.9526871841984382, + "eval_recall": 0.9633070134695774, + "eval_runtime": 35.9503, + "eval_samples_per_second": 5.535, + "eval_steps_per_second": 1.391, + "step": 2355 + }, + { + "epoch": 26.296809986130373, + "grad_norm": 0.3830583393573761, + "learning_rate": 2.1033333333333337e-06, + "loss": 0.0086, + "step": 2370 + }, + { + "epoch": 26.296809986130373, + "eval_accuracy": 0.9912940031363219, + "eval_f1": 0.9554375432925422, + "eval_loss": 0.05156167596578598, + "eval_precision": 0.9499540863177227, + "eval_recall": 0.9609846725499304, + "eval_runtime": 35.8417, + "eval_samples_per_second": 5.552, + "eval_steps_per_second": 1.395, + "step": 2370 + }, + { + "epoch": 26.463245492371705, + "grad_norm": 0.14329634606838226, + "learning_rate": 2.0533333333333337e-06, + "loss": 0.0078, + "step": 2385 + }, + { + "epoch": 26.463245492371705, + "eval_accuracy": 0.9914021521656843, + "eval_f1": 0.9556581986143187, + "eval_loss": 0.05027909576892853, + "eval_precision": 0.9503904455672945, + "eval_recall": 0.9609846725499304, + "eval_runtime": 35.8472, + "eval_samples_per_second": 5.551, + "eval_steps_per_second": 1.395, + "step": 2385 + }, + { + "epoch": 26.629680998613036, + "grad_norm": 0.17582757771015167, + "learning_rate": 2.0033333333333334e-06, + "loss": 0.0067, + "step": 2400 + }, + { + "epoch": 26.629680998613036, + "eval_accuracy": 0.9915103011950468, + "eval_f1": 0.9577269577269578, + "eval_loss": 0.05140436813235283, + "eval_precision": 0.9526654411764706, + "eval_recall": 0.9628425452856479, + "eval_runtime": 35.8691, + "eval_samples_per_second": 5.548, + "eval_steps_per_second": 1.394, + "step": 2400 + }, + { + "epoch": 26.796116504854368, + "grad_norm": 0.6374102830886841, + "learning_rate": 1.9533333333333334e-06, + "loss": 0.0059, + "step": 2415 + }, + { + "epoch": 26.796116504854368, + "eval_accuracy": 0.9918888227978154, + "eval_f1": 0.9588344125809436, + "eval_loss": 0.05035752058029175, + "eval_precision": 0.9548595117457392, + "eval_recall": 0.9628425452856479, + "eval_runtime": 35.7794, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 1.397, + "step": 2415 + }, + { + "epoch": 26.9625520110957, + "grad_norm": 0.5752395987510681, + "learning_rate": 1.9033333333333335e-06, + "loss": 0.0089, + "step": 2430 + }, + { + "epoch": 26.9625520110957, + "eval_accuracy": 0.9916184502244092, + "eval_f1": 0.9560795191863154, + "eval_loss": 0.051971472799777985, + "eval_precision": 0.9516797054763001, + "eval_recall": 0.9605202043660009, + "eval_runtime": 36.1279, + "eval_samples_per_second": 5.508, + "eval_steps_per_second": 1.384, + "step": 2430 + }, + { + "epoch": 27.12898751733703, + "grad_norm": 0.40148672461509705, + "learning_rate": 1.8533333333333333e-06, + "loss": 0.0059, + "step": 2445 + }, + { + "epoch": 27.12898751733703, + "eval_accuracy": 0.9916725247390905, + "eval_f1": 0.9572649572649573, + "eval_loss": 0.05115849897265434, + "eval_precision": 0.9522058823529411, + "eval_recall": 0.9623780771017185, + "eval_runtime": 36.2324, + "eval_samples_per_second": 5.492, + "eval_steps_per_second": 1.38, + "step": 2445 + }, + { + "epoch": 27.295423023578362, + "grad_norm": 0.19672174751758575, + "learning_rate": 1.8033333333333336e-06, + "loss": 0.0073, + "step": 2460 + }, + { + "epoch": 27.295423023578362, + "eval_accuracy": 0.9916184502244092, + "eval_f1": 0.9569842738205366, + "eval_loss": 0.05259764939546585, + "eval_precision": 0.9530170428374021, + "eval_recall": 0.9609846725499304, + "eval_runtime": 36.3768, + "eval_samples_per_second": 5.471, + "eval_steps_per_second": 1.375, + "step": 2460 + }, + { + "epoch": 27.461858529819693, + "grad_norm": 1.178671956062317, + "learning_rate": 1.7533333333333336e-06, + "loss": 0.0065, + "step": 2475 + }, + { + "epoch": 27.461858529819693, + "eval_accuracy": 0.991564375709728, + "eval_f1": 0.9577269577269578, + "eval_loss": 0.052951879799366, + "eval_precision": 0.9526654411764706, + "eval_recall": 0.9628425452856479, + "eval_runtime": 36.7573, + "eval_samples_per_second": 5.414, + "eval_steps_per_second": 1.36, + "step": 2475 + }, + { + "epoch": 27.628294036061025, + "grad_norm": 0.8156425356864929, + "learning_rate": 1.7033333333333335e-06, + "loss": 0.0064, + "step": 2490 + }, + { + "epoch": 27.628294036061025, + "eval_accuracy": 0.9916725247390905, + "eval_f1": 0.957205644228545, + "eval_loss": 0.05146779865026474, + "eval_precision": 0.9534562211981567, + "eval_recall": 0.9609846725499304, + "eval_runtime": 35.9308, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 1.392, + "step": 2490 + }, + { + "epoch": 27.794729542302356, + "grad_norm": 0.4098323881626129, + "learning_rate": 1.6533333333333335e-06, + "loss": 0.0072, + "step": 2505 + }, + { + "epoch": 27.794729542302356, + "eval_accuracy": 0.9906991834748283, + "eval_f1": 0.9545559400230681, + "eval_loss": 0.054223690181970596, + "eval_precision": 0.9482126489459212, + "eval_recall": 0.9609846725499304, + "eval_runtime": 35.9196, + "eval_samples_per_second": 5.54, + "eval_steps_per_second": 1.392, + "step": 2505 + }, + { + "epoch": 27.96116504854369, + "grad_norm": 0.5159748792648315, + "learning_rate": 1.6033333333333334e-06, + "loss": 0.0066, + "step": 2520 + }, + { + "epoch": 27.96116504854369, + "eval_accuracy": 0.990861407018872, + "eval_f1": 0.9549965381952458, + "eval_loss": 0.05374361574649811, + "eval_precision": 0.9490825688073394, + "eval_recall": 0.9609846725499304, + "eval_runtime": 35.7031, + "eval_samples_per_second": 5.574, + "eval_steps_per_second": 1.4, + "step": 2520 + }, + { + "epoch": 28.127600554785023, + "grad_norm": 0.499012291431427, + "learning_rate": 1.5533333333333334e-06, + "loss": 0.006, + "step": 2535 + }, + { + "epoch": 28.127600554785023, + "eval_accuracy": 0.9915103011950468, + "eval_f1": 0.9579482439926063, + "eval_loss": 0.05182594433426857, + "eval_precision": 0.953103448275862, + "eval_recall": 0.9628425452856479, + "eval_runtime": 35.8174, + "eval_samples_per_second": 5.556, + "eval_steps_per_second": 1.396, + "step": 2535 + }, + { + "epoch": 28.294036061026354, + "grad_norm": 0.5842483639717102, + "learning_rate": 1.5033333333333337e-06, + "loss": 0.0074, + "step": 2550 + }, + { + "epoch": 28.294036061026354, + "eval_accuracy": 0.9914021521656843, + "eval_f1": 0.9565418400369857, + "eval_loss": 0.05230095610022545, + "eval_precision": 0.9521398987574782, + "eval_recall": 0.9609846725499304, + "eval_runtime": 35.928, + "eval_samples_per_second": 5.539, + "eval_steps_per_second": 1.392, + "step": 2550 + }, + { + "epoch": 28.460471567267685, + "grad_norm": 0.4897175431251526, + "learning_rate": 1.4533333333333335e-06, + "loss": 0.0068, + "step": 2565 + }, + { + "epoch": 28.460471567267685, + "eval_accuracy": 0.9912940031363219, + "eval_f1": 0.955458112162474, + "eval_loss": 0.05341142788529396, + "eval_precision": 0.9495412844036697, + "eval_recall": 0.9614491407338597, + "eval_runtime": 36.0278, + "eval_samples_per_second": 5.524, + "eval_steps_per_second": 1.388, + "step": 2565 + }, + { + "epoch": 28.626907073509017, + "grad_norm": 0.4191240668296814, + "learning_rate": 1.4033333333333336e-06, + "loss": 0.0055, + "step": 2580 + }, + { + "epoch": 28.626907073509017, + "eval_accuracy": 0.9916725247390905, + "eval_f1": 0.9583526145303101, + "eval_loss": 0.05210199952125549, + "eval_precision": 0.954817888427847, + "eval_recall": 0.9619136089177891, + "eval_runtime": 36.2636, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 1.379, + "step": 2580 + }, + { + "epoch": 28.793342579750348, + "grad_norm": 0.6655350923538208, + "learning_rate": 1.3533333333333334e-06, + "loss": 0.0056, + "step": 2595 + }, + { + "epoch": 28.793342579750348, + "eval_accuracy": 0.9912940031363219, + "eval_f1": 0.9567829905246129, + "eval_loss": 0.05259960889816284, + "eval_precision": 0.952161913523459, + "eval_recall": 0.9614491407338597, + "eval_runtime": 36.0456, + "eval_samples_per_second": 5.521, + "eval_steps_per_second": 1.387, + "step": 2595 + }, + { + "epoch": 28.95977808599168, + "grad_norm": 0.9510291814804077, + "learning_rate": 1.3033333333333335e-06, + "loss": 0.0066, + "step": 2610 + }, + { + "epoch": 28.95977808599168, + "eval_accuracy": 0.9913480776510031, + "eval_f1": 0.9570240295748613, + "eval_loss": 0.05272991955280304, + "eval_precision": 0.952183908045977, + "eval_recall": 0.9619136089177891, + "eval_runtime": 36.3753, + "eval_samples_per_second": 5.471, + "eval_steps_per_second": 1.375, + "step": 2610 + }, + { + "epoch": 29.12621359223301, + "grad_norm": 0.33463072776794434, + "learning_rate": 1.2533333333333333e-06, + "loss": 0.0053, + "step": 2625 + }, + { + "epoch": 29.12621359223301, + "eval_accuracy": 0.9912940031363219, + "eval_f1": 0.9579482439926063, + "eval_loss": 0.0533275306224823, + "eval_precision": 0.953103448275862, + "eval_recall": 0.9628425452856479, + "eval_runtime": 35.8945, + "eval_samples_per_second": 5.544, + "eval_steps_per_second": 1.393, + "step": 2625 + }, + { + "epoch": 29.292649098474342, + "grad_norm": 0.2936910092830658, + "learning_rate": 1.2033333333333334e-06, + "loss": 0.0063, + "step": 2640 + }, + { + "epoch": 29.292649098474342, + "eval_accuracy": 0.9912940031363219, + "eval_f1": 0.9569842738205366, + "eval_loss": 0.05200694501399994, + "eval_precision": 0.9530170428374021, + "eval_recall": 0.9609846725499304, + "eval_runtime": 35.7745, + "eval_samples_per_second": 5.563, + "eval_steps_per_second": 1.398, + "step": 2640 + }, + { + "epoch": 29.459084604715674, + "grad_norm": 0.45608168840408325, + "learning_rate": 1.1533333333333334e-06, + "loss": 0.0059, + "step": 2655 + }, + { + "epoch": 29.459084604715674, + "eval_accuracy": 0.9910236305629156, + "eval_f1": 0.9554169554169554, + "eval_loss": 0.0532723143696785, + "eval_precision": 0.9503676470588235, + "eval_recall": 0.9605202043660009, + "eval_runtime": 35.9196, + "eval_samples_per_second": 5.54, + "eval_steps_per_second": 1.392, + "step": 2655 + }, + { + "epoch": 29.625520110957005, + "grad_norm": 0.46974512934684753, + "learning_rate": 1.1033333333333335e-06, + "loss": 0.0059, + "step": 2670 + }, + { + "epoch": 29.625520110957005, + "eval_accuracy": 0.9911858541069594, + "eval_f1": 0.9572452045296973, + "eval_loss": 0.05324824899435043, + "eval_precision": 0.952621895124195, + "eval_recall": 0.9619136089177891, + "eval_runtime": 36.0296, + "eval_samples_per_second": 5.523, + "eval_steps_per_second": 1.388, + "step": 2670 + }, + { + "epoch": 29.791955617198337, + "grad_norm": 0.6280196309089661, + "learning_rate": 1.0533333333333333e-06, + "loss": 0.0062, + "step": 2685 + }, + { + "epoch": 29.791955617198337, + "eval_accuracy": 0.9916725247390905, + "eval_f1": 0.9579288025889968, + "eval_loss": 0.05163406580686569, + "eval_precision": 0.9535204786010124, + "eval_recall": 0.9623780771017185, + "eval_runtime": 35.8797, + "eval_samples_per_second": 5.546, + "eval_steps_per_second": 1.394, + "step": 2685 + }, + { + "epoch": 29.958391123439668, + "grad_norm": 0.3609830439090729, + "learning_rate": 1.0033333333333334e-06, + "loss": 0.0064, + "step": 2700 + }, + { + "epoch": 29.958391123439668, + "eval_accuracy": 0.9914562266803656, + "eval_f1": 0.9572649572649573, + "eval_loss": 0.05152719095349312, + "eval_precision": 0.9522058823529411, + "eval_recall": 0.9623780771017185, + "eval_runtime": 36.0059, + "eval_samples_per_second": 5.527, + "eval_steps_per_second": 1.389, + "step": 2700 + }, + { + "epoch": 30.124826629681, + "grad_norm": 0.37590721249580383, + "learning_rate": 9.533333333333335e-07, + "loss": 0.0055, + "step": 2715 + }, + { + "epoch": 30.124826629681, + "eval_accuracy": 0.9917265992537717, + "eval_f1": 0.9590751445086704, + "eval_loss": 0.05128318816423416, + "eval_precision": 0.9548802946593001, + "eval_recall": 0.9633070134695774, + "eval_runtime": 36.0097, + "eval_samples_per_second": 5.526, + "eval_steps_per_second": 1.389, + "step": 2715 + }, + { + "epoch": 30.29126213592233, + "grad_norm": 0.4574069678783417, + "learning_rate": 9.033333333333334e-07, + "loss": 0.0064, + "step": 2730 + }, + { + "epoch": 30.29126213592233, + "eval_accuracy": 0.991564375709728, + "eval_f1": 0.9583911234396673, + "eval_loss": 0.052385713905096054, + "eval_precision": 0.9539806718821905, + "eval_recall": 0.9628425452856479, + "eval_runtime": 35.8265, + "eval_samples_per_second": 5.555, + "eval_steps_per_second": 1.396, + "step": 2730 + }, + { + "epoch": 30.457697642163662, + "grad_norm": 1.509279489517212, + "learning_rate": 8.533333333333334e-07, + "loss": 0.0055, + "step": 2745 + }, + { + "epoch": 30.457697642163662, + "eval_accuracy": 0.9915103011950468, + "eval_f1": 0.9581889581889582, + "eval_loss": 0.05304015427827835, + "eval_precision": 0.953125, + "eval_recall": 0.9633070134695774, + "eval_runtime": 35.8068, + "eval_samples_per_second": 5.558, + "eval_steps_per_second": 1.396, + "step": 2745 + }, + { + "epoch": 30.624133148404994, + "grad_norm": 0.08701591938734055, + "learning_rate": 8.033333333333335e-07, + "loss": 0.0065, + "step": 2760 + }, + { + "epoch": 30.624133148404994, + "eval_accuracy": 0.9916725247390905, + "eval_f1": 0.9588914549653579, + "eval_loss": 0.05279012396931648, + "eval_precision": 0.9536058796508957, + "eval_recall": 0.9642359498374361, + "eval_runtime": 36.0763, + "eval_samples_per_second": 5.516, + "eval_steps_per_second": 1.386, + "step": 2760 + }, + { + "epoch": 30.790568654646325, + "grad_norm": 0.39128488302230835, + "learning_rate": 7.533333333333335e-07, + "loss": 0.0068, + "step": 2775 + }, + { + "epoch": 30.790568654646325, + "eval_accuracy": 0.991564375709728, + "eval_f1": 0.9575253924284395, + "eval_loss": 0.05296061187982559, + "eval_precision": 0.9518127581459385, + "eval_recall": 0.9633070134695774, + "eval_runtime": 35.9916, + "eval_samples_per_second": 5.529, + "eval_steps_per_second": 1.389, + "step": 2775 + }, + { + "epoch": 30.957004160887656, + "grad_norm": 0.20628976821899414, + "learning_rate": 7.033333333333334e-07, + "loss": 0.0047, + "step": 2790 + }, + { + "epoch": 30.957004160887656, + "eval_accuracy": 0.991564375709728, + "eval_f1": 0.958910433979686, + "eval_loss": 0.05448687821626663, + "eval_precision": 0.953189536484626, + "eval_recall": 0.9647004180213655, + "eval_runtime": 35.9295, + "eval_samples_per_second": 5.539, + "eval_steps_per_second": 1.392, + "step": 2790 + }, + { + "epoch": 31.123439667128988, + "grad_norm": 0.3910321295261383, + "learning_rate": 6.533333333333334e-07, + "loss": 0.0051, + "step": 2805 + }, + { + "epoch": 31.123439667128988, + "eval_accuracy": 0.9916725247390905, + "eval_f1": 0.9595749595749595, + "eval_loss": 0.05336242541670799, + "eval_precision": 0.9545036764705882, + "eval_recall": 0.9647004180213655, + "eval_runtime": 36.0288, + "eval_samples_per_second": 5.523, + "eval_steps_per_second": 1.388, + "step": 2805 + }, + { + "epoch": 31.28987517337032, + "grad_norm": 0.2049601525068283, + "learning_rate": 6.033333333333334e-07, + "loss": 0.0044, + "step": 2820 + }, + { + "epoch": 31.28987517337032, + "eval_accuracy": 0.9914021521656843, + "eval_f1": 0.9581889581889582, + "eval_loss": 0.053161416202783585, + "eval_precision": 0.953125, + "eval_recall": 0.9633070134695774, + "eval_runtime": 35.9772, + "eval_samples_per_second": 5.531, + "eval_steps_per_second": 1.39, + "step": 2820 + }, + { + "epoch": 31.45631067961165, + "grad_norm": 0.4429149329662323, + "learning_rate": 5.533333333333334e-07, + "loss": 0.0068, + "step": 2835 + }, + { + "epoch": 31.45631067961165, + "eval_accuracy": 0.9913480776510031, + "eval_f1": 0.9579676674364895, + "eval_loss": 0.05317556858062744, + "eval_precision": 0.9526871841984382, + "eval_recall": 0.9633070134695774, + "eval_runtime": 35.6808, + "eval_samples_per_second": 5.577, + "eval_steps_per_second": 1.401, + "step": 2835 + }, + { + "epoch": 31.622746185852982, + "grad_norm": 0.4102032482624054, + "learning_rate": 5.033333333333334e-07, + "loss": 0.0045, + "step": 2850 + }, + { + "epoch": 31.622746185852982, + "eval_accuracy": 0.9915103011950468, + "eval_f1": 0.9590940605500345, + "eval_loss": 0.053103264421224594, + "eval_precision": 0.9544618215271389, + "eval_recall": 0.9637714816535068, + "eval_runtime": 35.706, + "eval_samples_per_second": 5.573, + "eval_steps_per_second": 1.4, + "step": 2850 + }, + { + "epoch": 31.789181692094314, + "grad_norm": 0.8468719720840454, + "learning_rate": 4.533333333333334e-07, + "loss": 0.0047, + "step": 2865 + }, + { + "epoch": 31.789181692094314, + "eval_accuracy": 0.991564375709728, + "eval_f1": 0.9586318465449504, + "eval_loss": 0.05298003926873207, + "eval_precision": 0.954001839926403, + "eval_recall": 0.9633070134695774, + "eval_runtime": 35.8749, + "eval_samples_per_second": 5.547, + "eval_steps_per_second": 1.394, + "step": 2865 + }, + { + "epoch": 31.955617198335645, + "grad_norm": 0.2063705176115036, + "learning_rate": 4.0333333333333337e-07, + "loss": 0.0075, + "step": 2880 + }, + { + "epoch": 31.955617198335645, + "eval_accuracy": 0.9916184502244092, + "eval_f1": 0.9593157651410079, + "eval_loss": 0.05329431965947151, + "eval_precision": 0.9549010584445468, + "eval_recall": 0.9637714816535068, + "eval_runtime": 36.0809, + "eval_samples_per_second": 5.515, + "eval_steps_per_second": 1.386, + "step": 2880 + }, + { + "epoch": 32.12205270457698, + "grad_norm": 0.3478763997554779, + "learning_rate": 3.533333333333334e-07, + "loss": 0.0055, + "step": 2895 + }, + { + "epoch": 32.12205270457698, + "eval_accuracy": 0.9917265992537717, + "eval_f1": 0.9595375722543353, + "eval_loss": 0.05245138704776764, + "eval_precision": 0.9553406998158379, + "eval_recall": 0.9637714816535068, + "eval_runtime": 35.9673, + "eval_samples_per_second": 5.533, + "eval_steps_per_second": 1.39, + "step": 2895 + }, + { + "epoch": 32.28848821081831, + "grad_norm": 0.721191942691803, + "learning_rate": 3.033333333333334e-07, + "loss": 0.006, + "step": 2910 + }, + { + "epoch": 32.28848821081831, + "eval_accuracy": 0.9917265992537717, + "eval_f1": 0.9595375722543353, + "eval_loss": 0.05226488783955574, + "eval_precision": 0.9553406998158379, + "eval_recall": 0.9637714816535068, + "eval_runtime": 35.9385, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 1.391, + "step": 2910 + }, + { + "epoch": 32.45492371705964, + "grad_norm": 0.3022706210613251, + "learning_rate": 2.533333333333333e-07, + "loss": 0.0062, + "step": 2925 + }, + { + "epoch": 32.45492371705964, + "eval_accuracy": 0.9916725247390905, + "eval_f1": 0.9588534442903375, + "eval_loss": 0.05245348811149597, + "eval_precision": 0.9544408651633686, + "eval_recall": 0.9633070134695774, + "eval_runtime": 35.8766, + "eval_samples_per_second": 5.547, + "eval_steps_per_second": 1.394, + "step": 2925 + }, + { + "epoch": 32.62135922330097, + "grad_norm": 0.4700392186641693, + "learning_rate": 2.0333333333333333e-07, + "loss": 0.0059, + "step": 2940 + }, + { + "epoch": 32.62135922330097, + "eval_accuracy": 0.9917265992537717, + "eval_f1": 0.9593157651410079, + "eval_loss": 0.05246575176715851, + "eval_precision": 0.9549010584445468, + "eval_recall": 0.9637714816535068, + "eval_runtime": 35.8779, + "eval_samples_per_second": 5.547, + "eval_steps_per_second": 1.394, + "step": 2940 + }, + { + "epoch": 32.787794729542306, + "grad_norm": 0.7413909435272217, + "learning_rate": 1.5333333333333333e-07, + "loss": 0.0058, + "step": 2955 + }, + { + "epoch": 32.787794729542306, + "eval_accuracy": 0.9917265992537717, + "eval_f1": 0.959556274555119, + "eval_loss": 0.053051915019750595, + "eval_precision": 0.9549218031278749, + "eval_recall": 0.9642359498374361, + "eval_runtime": 35.8838, + "eval_samples_per_second": 5.546, + "eval_steps_per_second": 1.393, + "step": 2955 + }, + { + "epoch": 32.95423023578363, + "grad_norm": 0.7399964332580566, + "learning_rate": 1.0333333333333335e-07, + "loss": 0.005, + "step": 2970 + }, + { + "epoch": 32.95423023578363, + "eval_accuracy": 0.991564375709728, + "eval_f1": 0.9584103512014789, + "eval_loss": 0.05329006537795067, + "eval_precision": 0.9535632183908046, + "eval_recall": 0.9633070134695774, + "eval_runtime": 35.9193, + "eval_samples_per_second": 5.54, + "eval_steps_per_second": 1.392, + "step": 2970 + }, + { + "epoch": 33.12066574202497, + "grad_norm": 0.3113914728164673, + "learning_rate": 5.3333333333333334e-08, + "loss": 0.007, + "step": 2985 + }, + { + "epoch": 33.12066574202497, + "eval_accuracy": 0.991564375709728, + "eval_f1": 0.9584103512014789, + "eval_loss": 0.05327802523970604, + "eval_precision": 0.9535632183908046, + "eval_recall": 0.9633070134695774, + "eval_runtime": 35.914, + "eval_samples_per_second": 5.541, + "eval_steps_per_second": 1.392, + "step": 2985 + }, + { + "epoch": 33.287101248266296, + "grad_norm": 0.33092889189720154, + "learning_rate": 3.3333333333333334e-09, + "loss": 0.0047, + "step": 3000 + }, + { + "epoch": 33.287101248266296, + "eval_accuracy": 0.991564375709728, + "eval_f1": 0.9584103512014789, + "eval_loss": 0.05324762314558029, + "eval_precision": 0.9535632183908046, + "eval_recall": 0.9633070134695774, + "eval_runtime": 35.8608, + "eval_samples_per_second": 5.549, + "eval_steps_per_second": 1.394, + "step": 3000 + }, + { + "epoch": 33.287101248266296, + "step": 3000, + "total_flos": 8.9780255686656e+16, + "train_loss": 0.0764370101193587, + "train_runtime": 55223.9534, + "train_samples_per_second": 1.738, + "train_steps_per_second": 0.054 + } + ], + "logging_steps": 15, + "max_steps": 3000, + "num_input_tokens_seen": 0, + "num_train_epochs": 34, + "save_steps": 15, + "total_flos": 8.9780255686656e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}