diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7999 @@ +{ + "best_metric": 0.9373365167161658, + "best_model_checkpoint": "vivit-surf-analytics-runpod/checkpoint-11115", + "epoch": 15.001349527665317, + "eval_steps": 500, + "global_step": 11116, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006747638326585695, + "grad_norm": 32.80915451049805, + "learning_rate": 3.373819163292848e-07, + "loss": 1.5069, + "step": 10 + }, + { + "epoch": 0.001349527665317139, + "grad_norm": 22.916248321533203, + "learning_rate": 6.747638326585696e-07, + "loss": 1.354, + "step": 20 + }, + { + "epoch": 0.0020242914979757085, + "grad_norm": 34.89827346801758, + "learning_rate": 1.0121457489878542e-06, + "loss": 1.5187, + "step": 30 + }, + { + "epoch": 0.002699055330634278, + "grad_norm": 22.8042049407959, + "learning_rate": 1.3495276653171391e-06, + "loss": 1.478, + "step": 40 + }, + { + "epoch": 0.0033738191632928477, + "grad_norm": 27.662748336791992, + "learning_rate": 1.6869095816464238e-06, + "loss": 1.2862, + "step": 50 + }, + { + "epoch": 0.004048582995951417, + "grad_norm": 24.901159286499023, + "learning_rate": 2.0242914979757085e-06, + "loss": 1.2586, + "step": 60 + }, + { + "epoch": 0.004723346828609987, + "grad_norm": 25.327184677124023, + "learning_rate": 2.3616734143049934e-06, + "loss": 1.2728, + "step": 70 + }, + { + "epoch": 0.005398110661268556, + "grad_norm": 18.19566535949707, + "learning_rate": 2.6990553306342783e-06, + "loss": 1.0159, + "step": 80 + }, + { + "epoch": 0.006072874493927126, + "grad_norm": 20.370386123657227, + "learning_rate": 3.0364372469635627e-06, + "loss": 1.2504, + "step": 90 + }, + { + "epoch": 0.006747638326585695, + "grad_norm": 12.196557998657227, + "learning_rate": 3.3738191632928476e-06, + "loss": 2.0246, + "step": 100 + }, + { + "epoch": 0.007422402159244264, + "grad_norm": 12.822103500366211, + "learning_rate": 3.711201079622133e-06, + "loss": 0.8519, + "step": 110 + }, + { + "epoch": 0.008097165991902834, + "grad_norm": 6.872288227081299, + "learning_rate": 4.048582995951417e-06, + "loss": 0.7749, + "step": 120 + }, + { + "epoch": 0.008771929824561403, + "grad_norm": 40.45072937011719, + "learning_rate": 4.3859649122807014e-06, + "loss": 1.3494, + "step": 130 + }, + { + "epoch": 0.009446693657219974, + "grad_norm": 3.996568441390991, + "learning_rate": 4.723346828609987e-06, + "loss": 0.9678, + "step": 140 + }, + { + "epoch": 0.010121457489878543, + "grad_norm": 2.117781400680542, + "learning_rate": 5.060728744939271e-06, + "loss": 1.8095, + "step": 150 + }, + { + "epoch": 0.010796221322537112, + "grad_norm": 1.1970853805541992, + "learning_rate": 5.3981106612685565e-06, + "loss": 1.3044, + "step": 160 + }, + { + "epoch": 0.011470985155195682, + "grad_norm": 56.31877136230469, + "learning_rate": 5.735492577597841e-06, + "loss": 3.0015, + "step": 170 + }, + { + "epoch": 0.012145748987854251, + "grad_norm": 1.2758257389068604, + "learning_rate": 6.0728744939271254e-06, + "loss": 1.7654, + "step": 180 + }, + { + "epoch": 0.01282051282051282, + "grad_norm": 49.485626220703125, + "learning_rate": 6.41025641025641e-06, + "loss": 1.9578, + "step": 190 + }, + { + "epoch": 0.01349527665317139, + "grad_norm": 1.0414538383483887, + "learning_rate": 6.747638326585695e-06, + "loss": 2.0202, + "step": 200 + }, + { + "epoch": 0.01417004048582996, + "grad_norm": 46.221031188964844, + "learning_rate": 7.0850202429149805e-06, + "loss": 2.0222, + "step": 210 + }, + { + "epoch": 0.014844804318488529, + "grad_norm": 5.171656131744385, + "learning_rate": 7.422402159244266e-06, + "loss": 2.3988, + "step": 220 + }, + { + "epoch": 0.0155195681511471, + "grad_norm": 40.51677703857422, + "learning_rate": 7.75978407557355e-06, + "loss": 1.1011, + "step": 230 + }, + { + "epoch": 0.016194331983805668, + "grad_norm": 0.3821451961994171, + "learning_rate": 8.097165991902834e-06, + "loss": 1.7582, + "step": 240 + }, + { + "epoch": 0.016869095816464237, + "grad_norm": 56.244895935058594, + "learning_rate": 8.43454790823212e-06, + "loss": 1.628, + "step": 250 + }, + { + "epoch": 0.017543859649122806, + "grad_norm": 2.9704697132110596, + "learning_rate": 8.771929824561403e-06, + "loss": 2.4795, + "step": 260 + }, + { + "epoch": 0.018218623481781375, + "grad_norm": 2.420311689376831, + "learning_rate": 9.109311740890689e-06, + "loss": 1.2225, + "step": 270 + }, + { + "epoch": 0.018893387314439947, + "grad_norm": 3.02461314201355, + "learning_rate": 9.446693657219973e-06, + "loss": 1.813, + "step": 280 + }, + { + "epoch": 0.019568151147098516, + "grad_norm": 1.8302630186080933, + "learning_rate": 9.784075573549258e-06, + "loss": 1.6011, + "step": 290 + }, + { + "epoch": 0.020242914979757085, + "grad_norm": 62.22663497924805, + "learning_rate": 1.0121457489878542e-05, + "loss": 2.1712, + "step": 300 + }, + { + "epoch": 0.020917678812415654, + "grad_norm": 4.137598037719727, + "learning_rate": 1.0458839406207829e-05, + "loss": 1.691, + "step": 310 + }, + { + "epoch": 0.021592442645074223, + "grad_norm": 1.1848357915878296, + "learning_rate": 1.0796221322537113e-05, + "loss": 0.8611, + "step": 320 + }, + { + "epoch": 0.022267206477732792, + "grad_norm": 48.48101043701172, + "learning_rate": 1.1133603238866398e-05, + "loss": 2.4268, + "step": 330 + }, + { + "epoch": 0.022941970310391364, + "grad_norm": 1.995662808418274, + "learning_rate": 1.1470985155195682e-05, + "loss": 1.6822, + "step": 340 + }, + { + "epoch": 0.023616734143049933, + "grad_norm": 4.30789041519165, + "learning_rate": 1.1808367071524966e-05, + "loss": 1.5158, + "step": 350 + }, + { + "epoch": 0.024291497975708502, + "grad_norm": 2.7685494422912598, + "learning_rate": 1.2145748987854251e-05, + "loss": 0.9964, + "step": 360 + }, + { + "epoch": 0.02496626180836707, + "grad_norm": 47.719268798828125, + "learning_rate": 1.2483130904183535e-05, + "loss": 2.2256, + "step": 370 + }, + { + "epoch": 0.02564102564102564, + "grad_norm": 48.7852783203125, + "learning_rate": 1.282051282051282e-05, + "loss": 1.7263, + "step": 380 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 57.43927001953125, + "learning_rate": 1.3157894736842106e-05, + "loss": 1.9673, + "step": 390 + }, + { + "epoch": 0.02699055330634278, + "grad_norm": 44.42695617675781, + "learning_rate": 1.349527665317139e-05, + "loss": 1.6821, + "step": 400 + }, + { + "epoch": 0.02766531713900135, + "grad_norm": 26.7230167388916, + "learning_rate": 1.3832658569500675e-05, + "loss": 1.4562, + "step": 410 + }, + { + "epoch": 0.02834008097165992, + "grad_norm": 39.75962448120117, + "learning_rate": 1.4170040485829961e-05, + "loss": 0.7446, + "step": 420 + }, + { + "epoch": 0.029014844804318488, + "grad_norm": 45.954254150390625, + "learning_rate": 1.4507422402159246e-05, + "loss": 1.1802, + "step": 430 + }, + { + "epoch": 0.029689608636977057, + "grad_norm": 25.454557418823242, + "learning_rate": 1.4844804318488532e-05, + "loss": 1.1458, + "step": 440 + }, + { + "epoch": 0.030364372469635626, + "grad_norm": 39.98874282836914, + "learning_rate": 1.5182186234817813e-05, + "loss": 0.512, + "step": 450 + }, + { + "epoch": 0.0310391363022942, + "grad_norm": 22.448896408081055, + "learning_rate": 1.55195681511471e-05, + "loss": 1.5049, + "step": 460 + }, + { + "epoch": 0.03171390013495277, + "grad_norm": 26.93549346923828, + "learning_rate": 1.5856950067476383e-05, + "loss": 0.7132, + "step": 470 + }, + { + "epoch": 0.032388663967611336, + "grad_norm": 21.29535675048828, + "learning_rate": 1.6194331983805668e-05, + "loss": 1.3895, + "step": 480 + }, + { + "epoch": 0.033063427800269905, + "grad_norm": 0.4730716645717621, + "learning_rate": 1.6531713900134956e-05, + "loss": 0.1323, + "step": 490 + }, + { + "epoch": 0.033738191632928474, + "grad_norm": 6.3616862297058105, + "learning_rate": 1.686909581646424e-05, + "loss": 0.9272, + "step": 500 + }, + { + "epoch": 0.03441295546558704, + "grad_norm": 0.5018609762191772, + "learning_rate": 1.720647773279352e-05, + "loss": 1.3049, + "step": 510 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 62.65403747558594, + "learning_rate": 1.7543859649122806e-05, + "loss": 2.8324, + "step": 520 + }, + { + "epoch": 0.03576248313090418, + "grad_norm": 12.991227149963379, + "learning_rate": 1.7881241565452094e-05, + "loss": 0.5474, + "step": 530 + }, + { + "epoch": 0.03643724696356275, + "grad_norm": 0.5492640733718872, + "learning_rate": 1.8218623481781378e-05, + "loss": 1.2093, + "step": 540 + }, + { + "epoch": 0.037112010796221326, + "grad_norm": 0.07551870495080948, + "learning_rate": 1.8556005398110663e-05, + "loss": 1.2592, + "step": 550 + }, + { + "epoch": 0.037786774628879895, + "grad_norm": 1.5378609895706177, + "learning_rate": 1.8893387314439947e-05, + "loss": 0.3359, + "step": 560 + }, + { + "epoch": 0.038461538461538464, + "grad_norm": 0.23666121065616608, + "learning_rate": 1.923076923076923e-05, + "loss": 1.3151, + "step": 570 + }, + { + "epoch": 0.03913630229419703, + "grad_norm": 7.869609832763672, + "learning_rate": 1.9568151147098516e-05, + "loss": 0.6125, + "step": 580 + }, + { + "epoch": 0.0398110661268556, + "grad_norm": 13.923602104187012, + "learning_rate": 1.99055330634278e-05, + "loss": 1.1772, + "step": 590 + }, + { + "epoch": 0.04048582995951417, + "grad_norm": 29.88282585144043, + "learning_rate": 2.0242914979757085e-05, + "loss": 0.1969, + "step": 600 + }, + { + "epoch": 0.04116059379217274, + "grad_norm": 24.888872146606445, + "learning_rate": 2.058029689608637e-05, + "loss": 0.7039, + "step": 610 + }, + { + "epoch": 0.04183535762483131, + "grad_norm": 0.40080440044403076, + "learning_rate": 2.0917678812415657e-05, + "loss": 1.008, + "step": 620 + }, + { + "epoch": 0.04251012145748988, + "grad_norm": 4.866868495941162, + "learning_rate": 2.125506072874494e-05, + "loss": 1.0502, + "step": 630 + }, + { + "epoch": 0.043184885290148446, + "grad_norm": 1.2502915859222412, + "learning_rate": 2.1592442645074226e-05, + "loss": 0.7422, + "step": 640 + }, + { + "epoch": 0.043859649122807015, + "grad_norm": 0.6650214791297913, + "learning_rate": 2.1929824561403507e-05, + "loss": 0.6072, + "step": 650 + }, + { + "epoch": 0.044534412955465584, + "grad_norm": 16.356952667236328, + "learning_rate": 2.2267206477732795e-05, + "loss": 0.8198, + "step": 660 + }, + { + "epoch": 0.04520917678812416, + "grad_norm": 169.0858154296875, + "learning_rate": 2.260458839406208e-05, + "loss": 1.2371, + "step": 670 + }, + { + "epoch": 0.04588394062078273, + "grad_norm": 98.78671264648438, + "learning_rate": 2.2941970310391364e-05, + "loss": 1.1303, + "step": 680 + }, + { + "epoch": 0.0465587044534413, + "grad_norm": 99.31029510498047, + "learning_rate": 2.327935222672065e-05, + "loss": 0.7183, + "step": 690 + }, + { + "epoch": 0.04723346828609987, + "grad_norm": 30.58230209350586, + "learning_rate": 2.3616734143049933e-05, + "loss": 1.1701, + "step": 700 + }, + { + "epoch": 0.047908232118758436, + "grad_norm": 0.05228818207979202, + "learning_rate": 2.395411605937922e-05, + "loss": 0.1544, + "step": 710 + }, + { + "epoch": 0.048582995951417005, + "grad_norm": 1.974684715270996, + "learning_rate": 2.4291497975708502e-05, + "loss": 1.4774, + "step": 720 + }, + { + "epoch": 0.049257759784075573, + "grad_norm": 0.12068396061658859, + "learning_rate": 2.4628879892037786e-05, + "loss": 0.6994, + "step": 730 + }, + { + "epoch": 0.04993252361673414, + "grad_norm": 76.24126434326172, + "learning_rate": 2.496626180836707e-05, + "loss": 1.3533, + "step": 740 + }, + { + "epoch": 0.05, + "eval_accuracy": 0.8392857142857143, + "eval_f1": 0.8382276099228692, + "eval_loss": 0.7030884623527527, + "eval_runtime": 74.2993, + "eval_samples_per_second": 1.507, + "eval_steps_per_second": 1.507, + "step": 741 + }, + { + "epoch": 1.0006072874493928, + "grad_norm": 0.14856617152690887, + "learning_rate": 2.530364372469636e-05, + "loss": 0.4158, + "step": 750 + }, + { + "epoch": 1.0012820512820513, + "grad_norm": 0.07458806782960892, + "learning_rate": 2.564102564102564e-05, + "loss": 0.0042, + "step": 760 + }, + { + "epoch": 1.0019568151147098, + "grad_norm": 0.08816417306661606, + "learning_rate": 2.5978407557354928e-05, + "loss": 0.0175, + "step": 770 + }, + { + "epoch": 1.0026315789473683, + "grad_norm": 0.07340700924396515, + "learning_rate": 2.6315789473684212e-05, + "loss": 0.0039, + "step": 780 + }, + { + "epoch": 1.003306342780027, + "grad_norm": 0.08517087250947952, + "learning_rate": 2.66531713900135e-05, + "loss": 0.0075, + "step": 790 + }, + { + "epoch": 1.0039811066126856, + "grad_norm": 0.07905049622058868, + "learning_rate": 2.699055330634278e-05, + "loss": 0.0021, + "step": 800 + }, + { + "epoch": 1.004655870445344, + "grad_norm": 0.13749797642230988, + "learning_rate": 2.732793522267207e-05, + "loss": 0.7603, + "step": 810 + }, + { + "epoch": 1.0053306342780026, + "grad_norm": 0.04107066988945007, + "learning_rate": 2.766531713900135e-05, + "loss": 0.0033, + "step": 820 + }, + { + "epoch": 1.0060053981106614, + "grad_norm": 0.05302370712161064, + "learning_rate": 2.8002699055330634e-05, + "loss": 0.0487, + "step": 830 + }, + { + "epoch": 1.0066801619433199, + "grad_norm": 0.050035424530506134, + "learning_rate": 2.8340080971659922e-05, + "loss": 0.0166, + "step": 840 + }, + { + "epoch": 1.0073549257759784, + "grad_norm": 0.17594772577285767, + "learning_rate": 2.8677462887989203e-05, + "loss": 0.9013, + "step": 850 + }, + { + "epoch": 1.008029689608637, + "grad_norm": 5.9323811531066895, + "learning_rate": 2.901484480431849e-05, + "loss": 0.8083, + "step": 860 + }, + { + "epoch": 1.0087044534412954, + "grad_norm": 0.2871362566947937, + "learning_rate": 2.9352226720647776e-05, + "loss": 0.6024, + "step": 870 + }, + { + "epoch": 1.0093792172739542, + "grad_norm": 0.3136674463748932, + "learning_rate": 2.9689608636977063e-05, + "loss": 0.5028, + "step": 880 + }, + { + "epoch": 1.0100539811066127, + "grad_norm": 0.05438687652349472, + "learning_rate": 3.0026990553306344e-05, + "loss": 0.1368, + "step": 890 + }, + { + "epoch": 1.0107287449392712, + "grad_norm": 0.05301366746425629, + "learning_rate": 3.0364372469635626e-05, + "loss": 0.1373, + "step": 900 + }, + { + "epoch": 1.0114035087719297, + "grad_norm": 0.015999358147382736, + "learning_rate": 3.0701754385964913e-05, + "loss": 0.0301, + "step": 910 + }, + { + "epoch": 1.0120782726045885, + "grad_norm": 0.014771537855267525, + "learning_rate": 3.10391363022942e-05, + "loss": 0.0195, + "step": 920 + }, + { + "epoch": 1.012753036437247, + "grad_norm": 31.934608459472656, + "learning_rate": 3.137651821862348e-05, + "loss": 1.6569, + "step": 930 + }, + { + "epoch": 1.0134278002699055, + "grad_norm": 0.031412914395332336, + "learning_rate": 3.171390013495277e-05, + "loss": 0.0009, + "step": 940 + }, + { + "epoch": 1.014102564102564, + "grad_norm": 0.028489330783486366, + "learning_rate": 3.205128205128206e-05, + "loss": 0.1698, + "step": 950 + }, + { + "epoch": 1.0147773279352226, + "grad_norm": 105.16389465332031, + "learning_rate": 3.2388663967611336e-05, + "loss": 0.1704, + "step": 960 + }, + { + "epoch": 1.0154520917678813, + "grad_norm": 0.024373585358262062, + "learning_rate": 3.272604588394062e-05, + "loss": 0.6264, + "step": 970 + }, + { + "epoch": 1.0161268556005398, + "grad_norm": 0.024133900180459023, + "learning_rate": 3.306342780026991e-05, + "loss": 0.8147, + "step": 980 + }, + { + "epoch": 1.0168016194331984, + "grad_norm": 92.91847229003906, + "learning_rate": 3.340080971659919e-05, + "loss": 1.1899, + "step": 990 + }, + { + "epoch": 1.0174763832658569, + "grad_norm": 0.3631739616394043, + "learning_rate": 3.373819163292848e-05, + "loss": 1.2713, + "step": 1000 + }, + { + "epoch": 1.0181511470985156, + "grad_norm": 7.820636749267578, + "learning_rate": 3.407557354925776e-05, + "loss": 0.5495, + "step": 1010 + }, + { + "epoch": 1.0188259109311741, + "grad_norm": 55.31717300415039, + "learning_rate": 3.441295546558704e-05, + "loss": 0.4036, + "step": 1020 + }, + { + "epoch": 1.0195006747638327, + "grad_norm": 0.013262225314974785, + "learning_rate": 3.4750337381916334e-05, + "loss": 0.0089, + "step": 1030 + }, + { + "epoch": 1.0201754385964912, + "grad_norm": 0.8780525922775269, + "learning_rate": 3.508771929824561e-05, + "loss": 0.3444, + "step": 1040 + }, + { + "epoch": 1.0208502024291497, + "grad_norm": 2.9044814109802246, + "learning_rate": 3.54251012145749e-05, + "loss": 0.0081, + "step": 1050 + }, + { + "epoch": 1.0215249662618084, + "grad_norm": 0.020421041175723076, + "learning_rate": 3.576248313090419e-05, + "loss": 1.0599, + "step": 1060 + }, + { + "epoch": 1.022199730094467, + "grad_norm": 0.012594003230333328, + "learning_rate": 3.609986504723347e-05, + "loss": 0.0616, + "step": 1070 + }, + { + "epoch": 1.0228744939271255, + "grad_norm": 0.018383637070655823, + "learning_rate": 3.6437246963562756e-05, + "loss": 0.9516, + "step": 1080 + }, + { + "epoch": 1.023549257759784, + "grad_norm": 0.04205102473497391, + "learning_rate": 3.6774628879892034e-05, + "loss": 0.4867, + "step": 1090 + }, + { + "epoch": 1.0242240215924427, + "grad_norm": 0.022214779630303383, + "learning_rate": 3.7112010796221325e-05, + "loss": 0.0101, + "step": 1100 + }, + { + "epoch": 1.0248987854251013, + "grad_norm": 0.026110410690307617, + "learning_rate": 3.744939271255061e-05, + "loss": 0.3327, + "step": 1110 + }, + { + "epoch": 1.0255735492577598, + "grad_norm": 0.16947214305400848, + "learning_rate": 3.7786774628879894e-05, + "loss": 0.6535, + "step": 1120 + }, + { + "epoch": 1.0262483130904183, + "grad_norm": 0.019961325451731682, + "learning_rate": 3.812415654520918e-05, + "loss": 0.0014, + "step": 1130 + }, + { + "epoch": 1.0269230769230768, + "grad_norm": 213.16741943359375, + "learning_rate": 3.846153846153846e-05, + "loss": 0.348, + "step": 1140 + }, + { + "epoch": 1.0275978407557356, + "grad_norm": 0.26998648047447205, + "learning_rate": 3.879892037786775e-05, + "loss": 1.8138, + "step": 1150 + }, + { + "epoch": 1.028272604588394, + "grad_norm": 16.201974868774414, + "learning_rate": 3.913630229419703e-05, + "loss": 1.1767, + "step": 1160 + }, + { + "epoch": 1.0289473684210526, + "grad_norm": 0.46378159523010254, + "learning_rate": 3.9473684210526316e-05, + "loss": 0.3981, + "step": 1170 + }, + { + "epoch": 1.0296221322537111, + "grad_norm": 0.16117815673351288, + "learning_rate": 3.98110661268556e-05, + "loss": 0.6503, + "step": 1180 + }, + { + "epoch": 1.0302968960863699, + "grad_norm": 0.09139110147953033, + "learning_rate": 4.014844804318489e-05, + "loss": 0.0009, + "step": 1190 + }, + { + "epoch": 1.0309716599190284, + "grad_norm": 59.96378707885742, + "learning_rate": 4.048582995951417e-05, + "loss": 2.7966, + "step": 1200 + }, + { + "epoch": 1.031646423751687, + "grad_norm": 0.1793028563261032, + "learning_rate": 4.082321187584346e-05, + "loss": 0.7813, + "step": 1210 + }, + { + "epoch": 1.0323211875843454, + "grad_norm": 0.04233807325363159, + "learning_rate": 4.116059379217274e-05, + "loss": 0.0117, + "step": 1220 + }, + { + "epoch": 1.032995951417004, + "grad_norm": 0.10781926661729813, + "learning_rate": 4.149797570850202e-05, + "loss": 0.0029, + "step": 1230 + }, + { + "epoch": 1.0336707152496627, + "grad_norm": 0.04887605831027031, + "learning_rate": 4.1835357624831314e-05, + "loss": 0.0023, + "step": 1240 + }, + { + "epoch": 1.0343454790823212, + "grad_norm": 0.0070233517326414585, + "learning_rate": 4.217273954116059e-05, + "loss": 0.5274, + "step": 1250 + }, + { + "epoch": 1.0350202429149797, + "grad_norm": 0.009842370636761189, + "learning_rate": 4.251012145748988e-05, + "loss": 1.1471, + "step": 1260 + }, + { + "epoch": 1.0356950067476383, + "grad_norm": 117.02069091796875, + "learning_rate": 4.284750337381917e-05, + "loss": 0.0518, + "step": 1270 + }, + { + "epoch": 1.036369770580297, + "grad_norm": 0.011584924533963203, + "learning_rate": 4.318488529014845e-05, + "loss": 0.0088, + "step": 1280 + }, + { + "epoch": 1.0370445344129555, + "grad_norm": 0.04845478758215904, + "learning_rate": 4.3522267206477737e-05, + "loss": 0.937, + "step": 1290 + }, + { + "epoch": 1.037719298245614, + "grad_norm": 12.870345115661621, + "learning_rate": 4.3859649122807014e-05, + "loss": 0.1796, + "step": 1300 + }, + { + "epoch": 1.0383940620782726, + "grad_norm": 0.18226304650306702, + "learning_rate": 4.4197031039136306e-05, + "loss": 0.7725, + "step": 1310 + }, + { + "epoch": 1.039068825910931, + "grad_norm": 0.038409680128097534, + "learning_rate": 4.453441295546559e-05, + "loss": 0.5331, + "step": 1320 + }, + { + "epoch": 1.0397435897435898, + "grad_norm": 1.686890721321106, + "learning_rate": 4.4871794871794874e-05, + "loss": 0.6265, + "step": 1330 + }, + { + "epoch": 1.0404183535762483, + "grad_norm": 0.009872148744761944, + "learning_rate": 4.520917678812416e-05, + "loss": 1.2371, + "step": 1340 + }, + { + "epoch": 1.0410931174089069, + "grad_norm": 0.016034213826060295, + "learning_rate": 4.5546558704453443e-05, + "loss": 0.8008, + "step": 1350 + }, + { + "epoch": 1.0417678812415654, + "grad_norm": 161.0729217529297, + "learning_rate": 4.588394062078273e-05, + "loss": 1.6563, + "step": 1360 + }, + { + "epoch": 1.0424426450742241, + "grad_norm": 0.039535123854875565, + "learning_rate": 4.622132253711201e-05, + "loss": 0.698, + "step": 1370 + }, + { + "epoch": 1.0431174089068826, + "grad_norm": 0.02719847857952118, + "learning_rate": 4.65587044534413e-05, + "loss": 0.1234, + "step": 1380 + }, + { + "epoch": 1.0437921727395412, + "grad_norm": 79.83929443359375, + "learning_rate": 4.689608636977058e-05, + "loss": 1.275, + "step": 1390 + }, + { + "epoch": 1.0444669365721997, + "grad_norm": 0.2730661928653717, + "learning_rate": 4.7233468286099866e-05, + "loss": 0.4828, + "step": 1400 + }, + { + "epoch": 1.0451417004048582, + "grad_norm": 0.025355026125907898, + "learning_rate": 4.757085020242915e-05, + "loss": 0.1393, + "step": 1410 + }, + { + "epoch": 1.045816464237517, + "grad_norm": 8.70992374420166, + "learning_rate": 4.790823211875844e-05, + "loss": 0.0709, + "step": 1420 + }, + { + "epoch": 1.0464912280701755, + "grad_norm": 37.11697006225586, + "learning_rate": 4.824561403508772e-05, + "loss": 2.5881, + "step": 1430 + }, + { + "epoch": 1.047165991902834, + "grad_norm": 52.71913528442383, + "learning_rate": 4.8582995951417004e-05, + "loss": 2.0305, + "step": 1440 + }, + { + "epoch": 1.0478407557354925, + "grad_norm": 1.0200884342193604, + "learning_rate": 4.8920377867746295e-05, + "loss": 0.0119, + "step": 1450 + }, + { + "epoch": 1.0485155195681513, + "grad_norm": 0.16433711349964142, + "learning_rate": 4.925775978407557e-05, + "loss": 1.4951, + "step": 1460 + }, + { + "epoch": 1.0491902834008098, + "grad_norm": 0.04836498573422432, + "learning_rate": 4.9595141700404864e-05, + "loss": 0.5514, + "step": 1470 + }, + { + "epoch": 1.0498650472334683, + "grad_norm": 0.021334873512387276, + "learning_rate": 4.993252361673414e-05, + "loss": 0.0028, + "step": 1480 + }, + { + "epoch": 1.05, + "eval_accuracy": 0.8482142857142857, + "eval_f1": 0.8460469703429654, + "eval_loss": 0.7499637603759766, + "eval_runtime": 75.3886, + "eval_samples_per_second": 1.486, + "eval_steps_per_second": 1.486, + "step": 1482 + }, + { + "epoch": 2.000539811066127, + "grad_norm": 0.006392825860530138, + "learning_rate": 4.9970010496326286e-05, + "loss": 0.0052, + "step": 1490 + }, + { + "epoch": 2.0012145748987855, + "grad_norm": 0.1666487753391266, + "learning_rate": 4.993252361673414e-05, + "loss": 0.0088, + "step": 1500 + }, + { + "epoch": 2.001889338731444, + "grad_norm": 0.27092501521110535, + "learning_rate": 4.9895036737142004e-05, + "loss": 0.0005, + "step": 1510 + }, + { + "epoch": 2.0025641025641026, + "grad_norm": 0.01244429126381874, + "learning_rate": 4.985754985754986e-05, + "loss": 1.2941, + "step": 1520 + }, + { + "epoch": 2.003238866396761, + "grad_norm": 0.07986637949943542, + "learning_rate": 4.9820062977957716e-05, + "loss": 1.3347, + "step": 1530 + }, + { + "epoch": 2.0039136302294196, + "grad_norm": 0.010807895101606846, + "learning_rate": 4.978257609836557e-05, + "loss": 0.4777, + "step": 1540 + }, + { + "epoch": 2.004588394062078, + "grad_norm": 0.010884225368499756, + "learning_rate": 4.9745089218773434e-05, + "loss": 1.7184, + "step": 1550 + }, + { + "epoch": 2.0052631578947366, + "grad_norm": 0.17375628650188446, + "learning_rate": 4.970760233918128e-05, + "loss": 0.0067, + "step": 1560 + }, + { + "epoch": 2.0059379217273956, + "grad_norm": 0.006022674031555653, + "learning_rate": 4.9670115459589145e-05, + "loss": 0.0014, + "step": 1570 + }, + { + "epoch": 2.006612685560054, + "grad_norm": 0.07748937606811523, + "learning_rate": 4.9632628579997e-05, + "loss": 0.2612, + "step": 1580 + }, + { + "epoch": 2.0072874493927126, + "grad_norm": 0.2620987296104431, + "learning_rate": 4.9595141700404864e-05, + "loss": 0.6517, + "step": 1590 + }, + { + "epoch": 2.007962213225371, + "grad_norm": 148.11007690429688, + "learning_rate": 4.955765482081271e-05, + "loss": 0.5783, + "step": 1600 + }, + { + "epoch": 2.0086369770580297, + "grad_norm": 0.0034163114614784718, + "learning_rate": 4.9520167941220575e-05, + "loss": 0.0304, + "step": 1610 + }, + { + "epoch": 2.009311740890688, + "grad_norm": 0.02201319858431816, + "learning_rate": 4.948268106162843e-05, + "loss": 0.3777, + "step": 1620 + }, + { + "epoch": 2.0099865047233467, + "grad_norm": 0.01761261560022831, + "learning_rate": 4.9445194182036294e-05, + "loss": 0.6914, + "step": 1630 + }, + { + "epoch": 2.0106612685560052, + "grad_norm": 0.02757342904806137, + "learning_rate": 4.940770730244414e-05, + "loss": 0.001, + "step": 1640 + }, + { + "epoch": 2.0113360323886638, + "grad_norm": 0.016815010458230972, + "learning_rate": 4.9370220422852005e-05, + "loss": 0.0006, + "step": 1650 + }, + { + "epoch": 2.0120107962213227, + "grad_norm": 0.7724957466125488, + "learning_rate": 4.933273354325986e-05, + "loss": 0.4395, + "step": 1660 + }, + { + "epoch": 2.0126855600539812, + "grad_norm": 0.003277893178164959, + "learning_rate": 4.9295246663667724e-05, + "loss": 0.0023, + "step": 1670 + }, + { + "epoch": 2.0133603238866398, + "grad_norm": 0.010450620204210281, + "learning_rate": 4.925775978407557e-05, + "loss": 0.0003, + "step": 1680 + }, + { + "epoch": 2.0140350877192983, + "grad_norm": 0.008632444776594639, + "learning_rate": 4.9220272904483435e-05, + "loss": 0.0588, + "step": 1690 + }, + { + "epoch": 2.014709851551957, + "grad_norm": 0.2135269045829773, + "learning_rate": 4.918278602489129e-05, + "loss": 0.8012, + "step": 1700 + }, + { + "epoch": 2.0153846153846153, + "grad_norm": 0.006235187407582998, + "learning_rate": 4.9145299145299147e-05, + "loss": 0.0007, + "step": 1710 + }, + { + "epoch": 2.016059379217274, + "grad_norm": 0.013167057186365128, + "learning_rate": 4.9107812265707e-05, + "loss": 0.0004, + "step": 1720 + }, + { + "epoch": 2.0167341430499324, + "grad_norm": 0.008585361763834953, + "learning_rate": 4.9070325386114865e-05, + "loss": 0.0006, + "step": 1730 + }, + { + "epoch": 2.017408906882591, + "grad_norm": 55.19523620605469, + "learning_rate": 4.903283850652272e-05, + "loss": 0.8423, + "step": 1740 + }, + { + "epoch": 2.01808367071525, + "grad_norm": 0.005840742029249668, + "learning_rate": 4.8995351626930576e-05, + "loss": 0.432, + "step": 1750 + }, + { + "epoch": 2.0187584345479084, + "grad_norm": 0.007270222995430231, + "learning_rate": 4.895786474733843e-05, + "loss": 0.5094, + "step": 1760 + }, + { + "epoch": 2.019433198380567, + "grad_norm": 0.013795904815196991, + "learning_rate": 4.8920377867746295e-05, + "loss": 0.6975, + "step": 1770 + }, + { + "epoch": 2.0201079622132254, + "grad_norm": 0.44005972146987915, + "learning_rate": 4.888289098815415e-05, + "loss": 0.0006, + "step": 1780 + }, + { + "epoch": 2.020782726045884, + "grad_norm": 0.020803041756153107, + "learning_rate": 4.8845404108562006e-05, + "loss": 0.0003, + "step": 1790 + }, + { + "epoch": 2.0214574898785425, + "grad_norm": 0.004395525902509689, + "learning_rate": 4.880791722896986e-05, + "loss": 0.0007, + "step": 1800 + }, + { + "epoch": 2.022132253711201, + "grad_norm": 0.07428783923387527, + "learning_rate": 4.8770430349377725e-05, + "loss": 0.0006, + "step": 1810 + }, + { + "epoch": 2.0228070175438595, + "grad_norm": 0.007445579394698143, + "learning_rate": 4.8732943469785574e-05, + "loss": 0.0002, + "step": 1820 + }, + { + "epoch": 2.023481781376518, + "grad_norm": 0.02664661407470703, + "learning_rate": 4.8695456590193436e-05, + "loss": 0.0002, + "step": 1830 + }, + { + "epoch": 2.024156545209177, + "grad_norm": 0.08112671971321106, + "learning_rate": 4.865796971060129e-05, + "loss": 0.0003, + "step": 1840 + }, + { + "epoch": 2.0248313090418355, + "grad_norm": 0.002486151410266757, + "learning_rate": 4.862048283100915e-05, + "loss": 0.0018, + "step": 1850 + }, + { + "epoch": 2.025506072874494, + "grad_norm": 0.00320970406755805, + "learning_rate": 4.8582995951417004e-05, + "loss": 0.2271, + "step": 1860 + }, + { + "epoch": 2.0261808367071525, + "grad_norm": 0.1994234174489975, + "learning_rate": 4.8545509071824866e-05, + "loss": 0.5385, + "step": 1870 + }, + { + "epoch": 2.026855600539811, + "grad_norm": 0.0024550287052989006, + "learning_rate": 4.850802219223272e-05, + "loss": 0.0759, + "step": 1880 + }, + { + "epoch": 2.0275303643724696, + "grad_norm": 0.004535624757409096, + "learning_rate": 4.847053531264058e-05, + "loss": 0.0006, + "step": 1890 + }, + { + "epoch": 2.028205128205128, + "grad_norm": 0.07630165666341782, + "learning_rate": 4.8433048433048433e-05, + "loss": 0.0002, + "step": 1900 + }, + { + "epoch": 2.0288798920377866, + "grad_norm": 0.005508648231625557, + "learning_rate": 4.839556155345629e-05, + "loss": 0.0048, + "step": 1910 + }, + { + "epoch": 2.029554655870445, + "grad_norm": 0.00268650334328413, + "learning_rate": 4.835807467386415e-05, + "loss": 0.021, + "step": 1920 + }, + { + "epoch": 2.030229419703104, + "grad_norm": 0.6032857894897461, + "learning_rate": 4.832058779427201e-05, + "loss": 0.9845, + "step": 1930 + }, + { + "epoch": 2.0309041835357626, + "grad_norm": 0.0025021624751389027, + "learning_rate": 4.828310091467986e-05, + "loss": 0.0005, + "step": 1940 + }, + { + "epoch": 2.031578947368421, + "grad_norm": 0.0031213329639285803, + "learning_rate": 4.824561403508772e-05, + "loss": 0.1197, + "step": 1950 + }, + { + "epoch": 2.0322537112010797, + "grad_norm": 0.011701357550919056, + "learning_rate": 4.820812715549558e-05, + "loss": 0.0004, + "step": 1960 + }, + { + "epoch": 2.032928475033738, + "grad_norm": 0.002749168314039707, + "learning_rate": 4.817064027590343e-05, + "loss": 0.0001, + "step": 1970 + }, + { + "epoch": 2.0336032388663967, + "grad_norm": 0.003767299233004451, + "learning_rate": 4.813315339631129e-05, + "loss": 0.0002, + "step": 1980 + }, + { + "epoch": 2.0342780026990552, + "grad_norm": 0.005788211710751057, + "learning_rate": 4.809566651671915e-05, + "loss": 0.0012, + "step": 1990 + }, + { + "epoch": 2.0349527665317138, + "grad_norm": 329.865966796875, + "learning_rate": 4.805817963712701e-05, + "loss": 1.4817, + "step": 2000 + }, + { + "epoch": 2.0356275303643723, + "grad_norm": 0.011975220404565334, + "learning_rate": 4.802069275753486e-05, + "loss": 1.3289, + "step": 2010 + }, + { + "epoch": 2.0363022941970312, + "grad_norm": 0.0021649515256285667, + "learning_rate": 4.798320587794272e-05, + "loss": 0.0055, + "step": 2020 + }, + { + "epoch": 2.0369770580296898, + "grad_norm": 0.0019632915500551462, + "learning_rate": 4.794571899835058e-05, + "loss": 0.0002, + "step": 2030 + }, + { + "epoch": 2.0376518218623483, + "grad_norm": 0.005742478650063276, + "learning_rate": 4.790823211875844e-05, + "loss": 0.0011, + "step": 2040 + }, + { + "epoch": 2.038326585695007, + "grad_norm": 0.009554996155202389, + "learning_rate": 4.787074523916629e-05, + "loss": 0.8594, + "step": 2050 + }, + { + "epoch": 2.0390013495276653, + "grad_norm": 0.0015004322631284595, + "learning_rate": 4.783325835957415e-05, + "loss": 0.0138, + "step": 2060 + }, + { + "epoch": 2.039676113360324, + "grad_norm": 0.005102177150547504, + "learning_rate": 4.779577147998201e-05, + "loss": 0.7251, + "step": 2070 + }, + { + "epoch": 2.0403508771929824, + "grad_norm": 0.0036967694759368896, + "learning_rate": 4.7758284600389865e-05, + "loss": 0.004, + "step": 2080 + }, + { + "epoch": 2.041025641025641, + "grad_norm": 0.0025739429984241724, + "learning_rate": 4.772079772079772e-05, + "loss": 0.9565, + "step": 2090 + }, + { + "epoch": 2.0417004048582994, + "grad_norm": 0.006292873062193394, + "learning_rate": 4.768331084120558e-05, + "loss": 0.0182, + "step": 2100 + }, + { + "epoch": 2.0423751686909584, + "grad_norm": 0.007768746931105852, + "learning_rate": 4.764582396161344e-05, + "loss": 0.4385, + "step": 2110 + }, + { + "epoch": 2.043049932523617, + "grad_norm": 0.005842685699462891, + "learning_rate": 4.7608337082021294e-05, + "loss": 0.6865, + "step": 2120 + }, + { + "epoch": 2.0437246963562754, + "grad_norm": 0.003818152705207467, + "learning_rate": 4.757085020242915e-05, + "loss": 0.1049, + "step": 2130 + }, + { + "epoch": 2.044399460188934, + "grad_norm": 0.0034294927027076483, + "learning_rate": 4.753336332283701e-05, + "loss": 0.0004, + "step": 2140 + }, + { + "epoch": 2.0450742240215924, + "grad_norm": 0.005487513262778521, + "learning_rate": 4.749587644324487e-05, + "loss": 0.2808, + "step": 2150 + }, + { + "epoch": 2.045748987854251, + "grad_norm": 0.004234930034726858, + "learning_rate": 4.7458389563652724e-05, + "loss": 0.0096, + "step": 2160 + }, + { + "epoch": 2.0464237516869095, + "grad_norm": 0.004304991569370031, + "learning_rate": 4.742090268406058e-05, + "loss": 1.0463, + "step": 2170 + }, + { + "epoch": 2.047098515519568, + "grad_norm": 0.06642390042543411, + "learning_rate": 4.738341580446844e-05, + "loss": 0.0003, + "step": 2180 + }, + { + "epoch": 2.0477732793522265, + "grad_norm": 21.008607864379883, + "learning_rate": 4.73459289248763e-05, + "loss": 0.0073, + "step": 2190 + }, + { + "epoch": 2.0484480431848855, + "grad_norm": 0.003075533313676715, + "learning_rate": 4.7308442045284154e-05, + "loss": 0.6295, + "step": 2200 + }, + { + "epoch": 2.049122807017544, + "grad_norm": 0.002309370320290327, + "learning_rate": 4.727095516569201e-05, + "loss": 0.1005, + "step": 2210 + }, + { + "epoch": 2.0497975708502025, + "grad_norm": 0.0027971486561000347, + "learning_rate": 4.7233468286099866e-05, + "loss": 0.0021, + "step": 2220 + }, + { + "epoch": 2.05, + "eval_accuracy": 0.8839285714285714, + "eval_f1": 0.882116388637625, + "eval_loss": 0.5603616833686829, + "eval_runtime": 74.4272, + "eval_samples_per_second": 1.505, + "eval_steps_per_second": 1.505, + "step": 2223 + }, + { + "epoch": 3.000472334682861, + "grad_norm": 0.01166750118136406, + "learning_rate": 4.719598140650772e-05, + "loss": 0.0163, + "step": 2230 + }, + { + "epoch": 3.0011470985155198, + "grad_norm": 96.65850067138672, + "learning_rate": 4.7158494526915584e-05, + "loss": 1.6819, + "step": 2240 + }, + { + "epoch": 3.0018218623481783, + "grad_norm": 0.05787191540002823, + "learning_rate": 4.712100764732344e-05, + "loss": 0.6805, + "step": 2250 + }, + { + "epoch": 3.002496626180837, + "grad_norm": 0.0013539530336856842, + "learning_rate": 4.7083520767731296e-05, + "loss": 0.0011, + "step": 2260 + }, + { + "epoch": 3.0031713900134953, + "grad_norm": 0.010776277631521225, + "learning_rate": 4.704603388813915e-05, + "loss": 0.0148, + "step": 2270 + }, + { + "epoch": 3.003846153846154, + "grad_norm": 0.024213161319494247, + "learning_rate": 4.700854700854701e-05, + "loss": 0.0008, + "step": 2280 + }, + { + "epoch": 3.0045209176788124, + "grad_norm": 0.0691986232995987, + "learning_rate": 4.697106012895487e-05, + "loss": 0.006, + "step": 2290 + }, + { + "epoch": 3.005195681511471, + "grad_norm": 0.009089670144021511, + "learning_rate": 4.6933573249362725e-05, + "loss": 0.6432, + "step": 2300 + }, + { + "epoch": 3.0058704453441294, + "grad_norm": 0.005548300687223673, + "learning_rate": 4.689608636977058e-05, + "loss": 0.0018, + "step": 2310 + }, + { + "epoch": 3.006545209176788, + "grad_norm": 0.006319984793663025, + "learning_rate": 4.685859949017844e-05, + "loss": 0.0001, + "step": 2320 + }, + { + "epoch": 3.007219973009447, + "grad_norm": 0.007898062467575073, + "learning_rate": 4.68211126105863e-05, + "loss": 0.0008, + "step": 2330 + }, + { + "epoch": 3.0078947368421054, + "grad_norm": 0.003347884165123105, + "learning_rate": 4.678362573099415e-05, + "loss": 0.0002, + "step": 2340 + }, + { + "epoch": 3.008569500674764, + "grad_norm": 0.009431365877389908, + "learning_rate": 4.674613885140201e-05, + "loss": 0.0001, + "step": 2350 + }, + { + "epoch": 3.0092442645074224, + "grad_norm": 0.006901255808770657, + "learning_rate": 4.670865197180987e-05, + "loss": 0.0001, + "step": 2360 + }, + { + "epoch": 3.009919028340081, + "grad_norm": 0.00315679213963449, + "learning_rate": 4.667116509221773e-05, + "loss": 0.0002, + "step": 2370 + }, + { + "epoch": 3.0105937921727395, + "grad_norm": 0.21266283094882965, + "learning_rate": 4.663367821262558e-05, + "loss": 0.0006, + "step": 2380 + }, + { + "epoch": 3.011268556005398, + "grad_norm": 0.004384478088468313, + "learning_rate": 4.659619133303344e-05, + "loss": 0.0006, + "step": 2390 + }, + { + "epoch": 3.0119433198380565, + "grad_norm": 0.013708599843084812, + "learning_rate": 4.65587044534413e-05, + "loss": 0.2589, + "step": 2400 + }, + { + "epoch": 3.012618083670715, + "grad_norm": 308.8554992675781, + "learning_rate": 4.652121757384916e-05, + "loss": 1.1215, + "step": 2410 + }, + { + "epoch": 3.013292847503374, + "grad_norm": 0.0031652101315557957, + "learning_rate": 4.648373069425701e-05, + "loss": 0.5235, + "step": 2420 + }, + { + "epoch": 3.0139676113360325, + "grad_norm": 0.00223003257997334, + "learning_rate": 4.644624381466487e-05, + "loss": 0.0001, + "step": 2430 + }, + { + "epoch": 3.014642375168691, + "grad_norm": 0.0067682513035833836, + "learning_rate": 4.640875693507273e-05, + "loss": 0.0001, + "step": 2440 + }, + { + "epoch": 3.0153171390013496, + "grad_norm": 0.0025887356605380774, + "learning_rate": 4.637127005548059e-05, + "loss": 0.0003, + "step": 2450 + }, + { + "epoch": 3.015991902834008, + "grad_norm": 0.0077194697223603725, + "learning_rate": 4.633378317588844e-05, + "loss": 0.6217, + "step": 2460 + }, + { + "epoch": 3.0166666666666666, + "grad_norm": 0.03473236411809921, + "learning_rate": 4.62962962962963e-05, + "loss": 0.8179, + "step": 2470 + }, + { + "epoch": 3.017341430499325, + "grad_norm": 0.014423678629100323, + "learning_rate": 4.6258809416704157e-05, + "loss": 0.2263, + "step": 2480 + }, + { + "epoch": 3.0180161943319836, + "grad_norm": 0.006188780535012484, + "learning_rate": 4.622132253711201e-05, + "loss": 0.6701, + "step": 2490 + }, + { + "epoch": 3.018690958164642, + "grad_norm": 0.35851019620895386, + "learning_rate": 4.618383565751987e-05, + "loss": 0.0008, + "step": 2500 + }, + { + "epoch": 3.019365721997301, + "grad_norm": 0.0032032101880759, + "learning_rate": 4.614634877792773e-05, + "loss": 0.0137, + "step": 2510 + }, + { + "epoch": 3.0200404858299597, + "grad_norm": 0.006460473407059908, + "learning_rate": 4.6108861898335586e-05, + "loss": 0.0016, + "step": 2520 + }, + { + "epoch": 3.020715249662618, + "grad_norm": 0.0026447370182722807, + "learning_rate": 4.607137501874344e-05, + "loss": 0.0014, + "step": 2530 + }, + { + "epoch": 3.0213900134952767, + "grad_norm": 0.0030527382623404264, + "learning_rate": 4.60338881391513e-05, + "loss": 0.0001, + "step": 2540 + }, + { + "epoch": 3.022064777327935, + "grad_norm": 0.007262419909238815, + "learning_rate": 4.599640125955916e-05, + "loss": 0.0001, + "step": 2550 + }, + { + "epoch": 3.0227395411605937, + "grad_norm": 0.0038091035094112158, + "learning_rate": 4.5958914379967016e-05, + "loss": 0.2222, + "step": 2560 + }, + { + "epoch": 3.0234143049932523, + "grad_norm": 0.0035387033130973577, + "learning_rate": 4.592142750037487e-05, + "loss": 0.8073, + "step": 2570 + }, + { + "epoch": 3.0240890688259108, + "grad_norm": 0.0033677646424621344, + "learning_rate": 4.588394062078273e-05, + "loss": 0.0006, + "step": 2580 + }, + { + "epoch": 3.0247638326585693, + "grad_norm": 0.006484442390501499, + "learning_rate": 4.5846453741190584e-05, + "loss": 0.0004, + "step": 2590 + }, + { + "epoch": 3.0254385964912283, + "grad_norm": 0.010489704087376595, + "learning_rate": 4.580896686159844e-05, + "loss": 0.0009, + "step": 2600 + }, + { + "epoch": 3.026113360323887, + "grad_norm": 0.0032699282746762037, + "learning_rate": 4.57714799820063e-05, + "loss": 0.0039, + "step": 2610 + }, + { + "epoch": 3.0267881241565453, + "grad_norm": 97.59777069091797, + "learning_rate": 4.573399310241416e-05, + "loss": 1.8702, + "step": 2620 + }, + { + "epoch": 3.027462887989204, + "grad_norm": 0.05291153863072395, + "learning_rate": 4.5696506222822014e-05, + "loss": 0.9784, + "step": 2630 + }, + { + "epoch": 3.0281376518218623, + "grad_norm": 0.0015122004551813006, + "learning_rate": 4.565901934322987e-05, + "loss": 0.0008, + "step": 2640 + }, + { + "epoch": 3.028812415654521, + "grad_norm": 0.10103687644004822, + "learning_rate": 4.5621532463637725e-05, + "loss": 0.0013, + "step": 2650 + }, + { + "epoch": 3.0294871794871794, + "grad_norm": 0.002090906724333763, + "learning_rate": 4.558404558404559e-05, + "loss": 0.0005, + "step": 2660 + }, + { + "epoch": 3.030161943319838, + "grad_norm": 0.0011990441707894206, + "learning_rate": 4.5546558704453443e-05, + "loss": 0.772, + "step": 2670 + }, + { + "epoch": 3.0308367071524964, + "grad_norm": 0.0113350385800004, + "learning_rate": 4.55090718248613e-05, + "loss": 0.0039, + "step": 2680 + }, + { + "epoch": 3.0315114709851554, + "grad_norm": 379.9673156738281, + "learning_rate": 4.5471584945269155e-05, + "loss": 1.1636, + "step": 2690 + }, + { + "epoch": 3.032186234817814, + "grad_norm": 0.03890157490968704, + "learning_rate": 4.543409806567702e-05, + "loss": 1.1822, + "step": 2700 + }, + { + "epoch": 3.0328609986504724, + "grad_norm": 0.0033322779927402735, + "learning_rate": 4.5396611186084866e-05, + "loss": 0.0014, + "step": 2710 + }, + { + "epoch": 3.033535762483131, + "grad_norm": 0.6530995965003967, + "learning_rate": 4.535912430649273e-05, + "loss": 0.6605, + "step": 2720 + }, + { + "epoch": 3.0342105263157895, + "grad_norm": 0.03727166727185249, + "learning_rate": 4.5321637426900585e-05, + "loss": 0.9511, + "step": 2730 + }, + { + "epoch": 3.034885290148448, + "grad_norm": 0.0015920967562124133, + "learning_rate": 4.528415054730845e-05, + "loss": 0.0008, + "step": 2740 + }, + { + "epoch": 3.0355600539811065, + "grad_norm": 0.08293965458869934, + "learning_rate": 4.5246663667716296e-05, + "loss": 0.0007, + "step": 2750 + }, + { + "epoch": 3.036234817813765, + "grad_norm": 0.04548066109418869, + "learning_rate": 4.520917678812416e-05, + "loss": 0.0015, + "step": 2760 + }, + { + "epoch": 3.0369095816464236, + "grad_norm": 0.011057593859732151, + "learning_rate": 4.5171689908532015e-05, + "loss": 0.6973, + "step": 2770 + }, + { + "epoch": 3.0375843454790825, + "grad_norm": 113.07095336914062, + "learning_rate": 4.513420302893988e-05, + "loss": 0.3667, + "step": 2780 + }, + { + "epoch": 3.038259109311741, + "grad_norm": 0.0016718521947041154, + "learning_rate": 4.5096716149347726e-05, + "loss": 0.6746, + "step": 2790 + }, + { + "epoch": 3.0389338731443996, + "grad_norm": 0.006011700723320246, + "learning_rate": 4.505922926975559e-05, + "loss": 1.4009, + "step": 2800 + }, + { + "epoch": 3.039608636977058, + "grad_norm": 0.0032539258245378733, + "learning_rate": 4.5021742390163445e-05, + "loss": 0.005, + "step": 2810 + }, + { + "epoch": 3.0402834008097166, + "grad_norm": 0.024762948974967003, + "learning_rate": 4.498425551057131e-05, + "loss": 0.0007, + "step": 2820 + }, + { + "epoch": 3.040958164642375, + "grad_norm": 0.008271398954093456, + "learning_rate": 4.4946768630979156e-05, + "loss": 0.0004, + "step": 2830 + }, + { + "epoch": 3.0416329284750336, + "grad_norm": 0.0073724472895264626, + "learning_rate": 4.490928175138702e-05, + "loss": 0.7153, + "step": 2840 + }, + { + "epoch": 3.042307692307692, + "grad_norm": 0.01329676155000925, + "learning_rate": 4.4871794871794874e-05, + "loss": 0.1339, + "step": 2850 + }, + { + "epoch": 3.0429824561403507, + "grad_norm": 0.00492237601429224, + "learning_rate": 4.483430799220273e-05, + "loss": 0.7432, + "step": 2860 + }, + { + "epoch": 3.0436572199730096, + "grad_norm": 0.006463408935815096, + "learning_rate": 4.4796821112610586e-05, + "loss": 0.0007, + "step": 2870 + }, + { + "epoch": 3.044331983805668, + "grad_norm": 0.0007826614892110229, + "learning_rate": 4.475933423301845e-05, + "loss": 0.5263, + "step": 2880 + }, + { + "epoch": 3.0450067476383267, + "grad_norm": 0.0012907817726954818, + "learning_rate": 4.4721847353426304e-05, + "loss": 0.0017, + "step": 2890 + }, + { + "epoch": 3.045681511470985, + "grad_norm": 0.0011142657604068518, + "learning_rate": 4.468436047383416e-05, + "loss": 0.0004, + "step": 2900 + }, + { + "epoch": 3.0463562753036437, + "grad_norm": 0.0039123659953475, + "learning_rate": 4.4646873594242016e-05, + "loss": 0.025, + "step": 2910 + }, + { + "epoch": 3.0470310391363022, + "grad_norm": 0.006876886822283268, + "learning_rate": 4.460938671464988e-05, + "loss": 0.5972, + "step": 2920 + }, + { + "epoch": 3.0477058029689608, + "grad_norm": 0.0013078979682177305, + "learning_rate": 4.4571899835057734e-05, + "loss": 0.0216, + "step": 2930 + }, + { + "epoch": 3.0483805668016193, + "grad_norm": 0.01804491877555847, + "learning_rate": 4.453441295546559e-05, + "loss": 0.0025, + "step": 2940 + }, + { + "epoch": 3.049055330634278, + "grad_norm": 0.0017017913050949574, + "learning_rate": 4.4496926075873446e-05, + "loss": 0.1553, + "step": 2950 + }, + { + "epoch": 3.0497300944669368, + "grad_norm": 0.004222176969051361, + "learning_rate": 4.445943919628131e-05, + "loss": 0.0002, + "step": 2960 + }, + { + "epoch": 3.05, + "eval_accuracy": 0.9017857142857143, + "eval_f1": 0.900079642364192, + "eval_loss": 0.3880017399787903, + "eval_runtime": 72.9967, + "eval_samples_per_second": 1.534, + "eval_steps_per_second": 1.534, + "step": 2964 + }, + { + "epoch": 4.0004048582995955, + "grad_norm": 0.0011517743114382029, + "learning_rate": 4.442195231668916e-05, + "loss": 0.4772, + "step": 2970 + }, + { + "epoch": 4.001079622132254, + "grad_norm": 0.0008661440806463361, + "learning_rate": 4.438446543709702e-05, + "loss": 0.0001, + "step": 2980 + }, + { + "epoch": 4.0017543859649125, + "grad_norm": 0.005399093497544527, + "learning_rate": 4.4346978557504876e-05, + "loss": 0.0033, + "step": 2990 + }, + { + "epoch": 4.002429149797571, + "grad_norm": 0.0038267234340310097, + "learning_rate": 4.430949167791273e-05, + "loss": 0.0005, + "step": 3000 + }, + { + "epoch": 4.0031039136302295, + "grad_norm": 0.0029461942613124847, + "learning_rate": 4.427200479832059e-05, + "loss": 0.0002, + "step": 3010 + }, + { + "epoch": 4.003778677462888, + "grad_norm": 0.0006391266360878944, + "learning_rate": 4.423451791872845e-05, + "loss": 0.0001, + "step": 3020 + }, + { + "epoch": 4.004453441295547, + "grad_norm": 0.004189279396086931, + "learning_rate": 4.4197031039136306e-05, + "loss": 0.0001, + "step": 3030 + }, + { + "epoch": 4.005128205128205, + "grad_norm": 0.0011289932299405336, + "learning_rate": 4.415954415954416e-05, + "loss": 0.0001, + "step": 3040 + }, + { + "epoch": 4.005802968960864, + "grad_norm": 0.0023520805407315493, + "learning_rate": 4.412205727995202e-05, + "loss": 0.0001, + "step": 3050 + }, + { + "epoch": 4.006477732793522, + "grad_norm": 0.0018153834389522672, + "learning_rate": 4.408457040035987e-05, + "loss": 0.8745, + "step": 3060 + }, + { + "epoch": 4.007152496626181, + "grad_norm": 0.001743017346598208, + "learning_rate": 4.4047083520767735e-05, + "loss": 0.0003, + "step": 3070 + }, + { + "epoch": 4.007827260458839, + "grad_norm": 0.002831714926287532, + "learning_rate": 4.400959664117559e-05, + "loss": 0.0066, + "step": 3080 + }, + { + "epoch": 4.008502024291498, + "grad_norm": 0.005015307106077671, + "learning_rate": 4.397210976158345e-05, + "loss": 0.1127, + "step": 3090 + }, + { + "epoch": 4.009176788124156, + "grad_norm": 0.0019009409006685019, + "learning_rate": 4.39346228819913e-05, + "loss": 0.001, + "step": 3100 + }, + { + "epoch": 4.009851551956815, + "grad_norm": 0.0011994624510407448, + "learning_rate": 4.3897136002399165e-05, + "loss": 0.8256, + "step": 3110 + }, + { + "epoch": 4.010526315789473, + "grad_norm": 0.002758684800937772, + "learning_rate": 4.3859649122807014e-05, + "loss": 0.0002, + "step": 3120 + }, + { + "epoch": 4.011201079622133, + "grad_norm": 0.014079189859330654, + "learning_rate": 4.382216224321488e-05, + "loss": 0.0001, + "step": 3130 + }, + { + "epoch": 4.011875843454791, + "grad_norm": 0.001694743288680911, + "learning_rate": 4.378467536362273e-05, + "loss": 0.0001, + "step": 3140 + }, + { + "epoch": 4.01255060728745, + "grad_norm": 0.005108845420181751, + "learning_rate": 4.3747188484030595e-05, + "loss": 0.0001, + "step": 3150 + }, + { + "epoch": 4.013225371120108, + "grad_norm": 0.0009567590313963592, + "learning_rate": 4.3709701604438444e-05, + "loss": 0.0003, + "step": 3160 + }, + { + "epoch": 4.013900134952767, + "grad_norm": 0.005206429865211248, + "learning_rate": 4.367221472484631e-05, + "loss": 0.0139, + "step": 3170 + }, + { + "epoch": 4.014574898785425, + "grad_norm": 0.0010895140003412962, + "learning_rate": 4.363472784525416e-05, + "loss": 0.0001, + "step": 3180 + }, + { + "epoch": 4.015249662618084, + "grad_norm": 0.0026008691638708115, + "learning_rate": 4.3597240965662025e-05, + "loss": 0.0002, + "step": 3190 + }, + { + "epoch": 4.015924426450742, + "grad_norm": 0.00945541262626648, + "learning_rate": 4.3559754086069874e-05, + "loss": 0.0001, + "step": 3200 + }, + { + "epoch": 4.016599190283401, + "grad_norm": 0.002652823692187667, + "learning_rate": 4.3522267206477737e-05, + "loss": 0.0003, + "step": 3210 + }, + { + "epoch": 4.017273954116059, + "grad_norm": 0.011731209233403206, + "learning_rate": 4.348478032688559e-05, + "loss": 0.0001, + "step": 3220 + }, + { + "epoch": 4.017948717948718, + "grad_norm": 0.002854161197319627, + "learning_rate": 4.344729344729345e-05, + "loss": 0.0001, + "step": 3230 + }, + { + "epoch": 4.018623481781376, + "grad_norm": 0.0006263653049245477, + "learning_rate": 4.3409806567701304e-05, + "loss": 0.0002, + "step": 3240 + }, + { + "epoch": 4.019298245614035, + "grad_norm": 0.008615193888545036, + "learning_rate": 4.3372319688109166e-05, + "loss": 0.7675, + "step": 3250 + }, + { + "epoch": 4.0199730094466934, + "grad_norm": 0.0012555683497339487, + "learning_rate": 4.333483280851702e-05, + "loss": 0.0001, + "step": 3260 + }, + { + "epoch": 4.020647773279352, + "grad_norm": 0.0026209617499262094, + "learning_rate": 4.329734592892488e-05, + "loss": 0.0001, + "step": 3270 + }, + { + "epoch": 4.0213225371120105, + "grad_norm": 0.0008131062495522201, + "learning_rate": 4.3259859049332734e-05, + "loss": 0.495, + "step": 3280 + }, + { + "epoch": 4.021997300944669, + "grad_norm": 0.004160483367741108, + "learning_rate": 4.3222372169740596e-05, + "loss": 0.0001, + "step": 3290 + }, + { + "epoch": 4.0226720647773275, + "grad_norm": 0.00135552987921983, + "learning_rate": 4.318488529014845e-05, + "loss": 0.0001, + "step": 3300 + }, + { + "epoch": 4.023346828609987, + "grad_norm": 0.0020715997088700533, + "learning_rate": 4.314739841055631e-05, + "loss": 0.0001, + "step": 3310 + }, + { + "epoch": 4.0240215924426455, + "grad_norm": 0.0006134248687885702, + "learning_rate": 4.3109911530964164e-05, + "loss": 0.0003, + "step": 3320 + }, + { + "epoch": 4.024696356275304, + "grad_norm": 0.005337740760296583, + "learning_rate": 4.3072424651372026e-05, + "loss": 0.0002, + "step": 3330 + }, + { + "epoch": 4.0253711201079625, + "grad_norm": 0.002447796519845724, + "learning_rate": 4.303493777177988e-05, + "loss": 0.0013, + "step": 3340 + }, + { + "epoch": 4.026045883940621, + "grad_norm": 0.0020753496792167425, + "learning_rate": 4.299745089218774e-05, + "loss": 0.0001, + "step": 3350 + }, + { + "epoch": 4.0267206477732795, + "grad_norm": 0.001169373164884746, + "learning_rate": 4.2959964012595594e-05, + "loss": 0.4363, + "step": 3360 + }, + { + "epoch": 4.027395411605938, + "grad_norm": 0.0031577907502651215, + "learning_rate": 4.292247713300345e-05, + "loss": 0.4359, + "step": 3370 + }, + { + "epoch": 4.028070175438597, + "grad_norm": 0.0011828079586848617, + "learning_rate": 4.2884990253411305e-05, + "loss": 0.0001, + "step": 3380 + }, + { + "epoch": 4.028744939271255, + "grad_norm": 0.0016030353726819158, + "learning_rate": 4.284750337381917e-05, + "loss": 0.0001, + "step": 3390 + }, + { + "epoch": 4.029419703103914, + "grad_norm": 0.014403590932488441, + "learning_rate": 4.2810016494227023e-05, + "loss": 0.7807, + "step": 3400 + }, + { + "epoch": 4.030094466936572, + "grad_norm": 0.005019639153033495, + "learning_rate": 4.277252961463488e-05, + "loss": 0.4727, + "step": 3410 + }, + { + "epoch": 4.030769230769231, + "grad_norm": 0.002246898366138339, + "learning_rate": 4.2735042735042735e-05, + "loss": 0.0499, + "step": 3420 + }, + { + "epoch": 4.031443994601889, + "grad_norm": 0.013324781320989132, + "learning_rate": 4.269755585545059e-05, + "loss": 0.5992, + "step": 3430 + }, + { + "epoch": 4.032118758434548, + "grad_norm": 0.0579649917781353, + "learning_rate": 4.266006897585845e-05, + "loss": 0.0039, + "step": 3440 + }, + { + "epoch": 4.032793522267206, + "grad_norm": 1.7032642364501953, + "learning_rate": 4.262258209626631e-05, + "loss": 0.6145, + "step": 3450 + }, + { + "epoch": 4.033468286099865, + "grad_norm": 0.013759407214820385, + "learning_rate": 4.2585095216674165e-05, + "loss": 0.0002, + "step": 3460 + }, + { + "epoch": 4.034143049932523, + "grad_norm": 0.00753359729424119, + "learning_rate": 4.254760833708202e-05, + "loss": 0.0071, + "step": 3470 + }, + { + "epoch": 4.034817813765182, + "grad_norm": 0.0020441561937332153, + "learning_rate": 4.251012145748988e-05, + "loss": 0.001, + "step": 3480 + }, + { + "epoch": 4.035492577597841, + "grad_norm": 0.001379093388095498, + "learning_rate": 4.247263457789773e-05, + "loss": 0.0013, + "step": 3490 + }, + { + "epoch": 4.0361673414305, + "grad_norm": 0.002510966034606099, + "learning_rate": 4.2435147698305595e-05, + "loss": 0.0003, + "step": 3500 + }, + { + "epoch": 4.036842105263158, + "grad_norm": 0.0011007965076714754, + "learning_rate": 4.239766081871345e-05, + "loss": 1.0836, + "step": 3510 + }, + { + "epoch": 4.037516869095817, + "grad_norm": 0.022373057901859283, + "learning_rate": 4.236017393912131e-05, + "loss": 0.0838, + "step": 3520 + }, + { + "epoch": 4.038191632928475, + "grad_norm": 0.0008921432308852673, + "learning_rate": 4.232268705952916e-05, + "loss": 0.0001, + "step": 3530 + }, + { + "epoch": 4.038866396761134, + "grad_norm": 0.0007166191353462636, + "learning_rate": 4.2285200179937025e-05, + "loss": 0.0076, + "step": 3540 + }, + { + "epoch": 4.039541160593792, + "grad_norm": 8.101381301879883, + "learning_rate": 4.224771330034488e-05, + "loss": 0.0233, + "step": 3550 + }, + { + "epoch": 4.040215924426451, + "grad_norm": 0.0007625047001056373, + "learning_rate": 4.221022642075274e-05, + "loss": 0.0003, + "step": 3560 + }, + { + "epoch": 4.040890688259109, + "grad_norm": 0.00398569880053401, + "learning_rate": 4.217273954116059e-05, + "loss": 0.0001, + "step": 3570 + }, + { + "epoch": 4.041565452091768, + "grad_norm": 0.0010361782042309642, + "learning_rate": 4.2135252661568455e-05, + "loss": 0.0001, + "step": 3580 + }, + { + "epoch": 4.042240215924426, + "grad_norm": 0.001946108415722847, + "learning_rate": 4.209776578197631e-05, + "loss": 0.0004, + "step": 3590 + }, + { + "epoch": 4.042914979757085, + "grad_norm": 0.003806932596489787, + "learning_rate": 4.2060278902384166e-05, + "loss": 0.0, + "step": 3600 + }, + { + "epoch": 4.043589743589743, + "grad_norm": 0.0009996455628424883, + "learning_rate": 4.202279202279202e-05, + "loss": 0.0002, + "step": 3610 + }, + { + "epoch": 4.044264507422402, + "grad_norm": 0.0016769858775660396, + "learning_rate": 4.1985305143199884e-05, + "loss": 0.0001, + "step": 3620 + }, + { + "epoch": 4.0449392712550605, + "grad_norm": 0.00047590630128979683, + "learning_rate": 4.194781826360774e-05, + "loss": 0.0001, + "step": 3630 + }, + { + "epoch": 4.045614035087719, + "grad_norm": 0.0010459835175424814, + "learning_rate": 4.1910331384015596e-05, + "loss": 0.3976, + "step": 3640 + }, + { + "epoch": 4.0462887989203775, + "grad_norm": 0.003536689095199108, + "learning_rate": 4.187284450442345e-05, + "loss": 0.5592, + "step": 3650 + }, + { + "epoch": 4.046963562753036, + "grad_norm": 0.004078584257513285, + "learning_rate": 4.1835357624831314e-05, + "loss": 0.2639, + "step": 3660 + }, + { + "epoch": 4.0476383265856954, + "grad_norm": 0.01091256644576788, + "learning_rate": 4.179787074523917e-05, + "loss": 0.0001, + "step": 3670 + }, + { + "epoch": 4.048313090418354, + "grad_norm": 0.0032140237744897604, + "learning_rate": 4.1760383865647026e-05, + "loss": 0.2047, + "step": 3680 + }, + { + "epoch": 4.0489878542510125, + "grad_norm": 0.003986234311014414, + "learning_rate": 4.172289698605488e-05, + "loss": 0.0019, + "step": 3690 + }, + { + "epoch": 4.049662618083671, + "grad_norm": 0.0013649433385580778, + "learning_rate": 4.1685410106462744e-05, + "loss": 0.0001, + "step": 3700 + }, + { + "epoch": 4.05, + "eval_accuracy": 0.9285714285714286, + "eval_f1": 0.9284473859473861, + "eval_loss": 0.43087735772132874, + "eval_runtime": 74.3247, + "eval_samples_per_second": 1.507, + "eval_steps_per_second": 1.507, + "step": 3705 + }, + { + "epoch": 5.00033738191633, + "grad_norm": 0.0009709022124297917, + "learning_rate": 4.16479232268706e-05, + "loss": 0.0001, + "step": 3710 + }, + { + "epoch": 5.001012145748988, + "grad_norm": 0.00450406176969409, + "learning_rate": 4.1610436347278456e-05, + "loss": 0.0001, + "step": 3720 + }, + { + "epoch": 5.001686909581647, + "grad_norm": 490.396240234375, + "learning_rate": 4.157294946768631e-05, + "loss": 0.3041, + "step": 3730 + }, + { + "epoch": 5.002361673414305, + "grad_norm": 0.00026446336414664984, + "learning_rate": 4.153546258809417e-05, + "loss": 0.0001, + "step": 3740 + }, + { + "epoch": 5.003036437246964, + "grad_norm": 0.0011977544054389, + "learning_rate": 4.149797570850202e-05, + "loss": 0.0001, + "step": 3750 + }, + { + "epoch": 5.003711201079622, + "grad_norm": 0.0008563337032683194, + "learning_rate": 4.1460488828909886e-05, + "loss": 0.6888, + "step": 3760 + }, + { + "epoch": 5.004385964912281, + "grad_norm": 0.0008433638722635806, + "learning_rate": 4.142300194931774e-05, + "loss": 0.0003, + "step": 3770 + }, + { + "epoch": 5.005060728744939, + "grad_norm": 0.0007336140261031687, + "learning_rate": 4.13855150697256e-05, + "loss": 0.5238, + "step": 3780 + }, + { + "epoch": 5.005735492577598, + "grad_norm": 0.0012576148146763444, + "learning_rate": 4.134802819013345e-05, + "loss": 0.0023, + "step": 3790 + }, + { + "epoch": 5.006410256410256, + "grad_norm": 0.0009189122938551009, + "learning_rate": 4.131054131054131e-05, + "loss": 0.0131, + "step": 3800 + }, + { + "epoch": 5.007085020242915, + "grad_norm": 0.008739179000258446, + "learning_rate": 4.127305443094917e-05, + "loss": 0.0003, + "step": 3810 + }, + { + "epoch": 5.007759784075573, + "grad_norm": 0.0012460118159651756, + "learning_rate": 4.123556755135703e-05, + "loss": 0.0001, + "step": 3820 + }, + { + "epoch": 5.008434547908232, + "grad_norm": 0.002039340790361166, + "learning_rate": 4.119808067176488e-05, + "loss": 0.0003, + "step": 3830 + }, + { + "epoch": 5.0091093117408905, + "grad_norm": 0.0009501971653662622, + "learning_rate": 4.116059379217274e-05, + "loss": 0.0052, + "step": 3840 + }, + { + "epoch": 5.009784075573549, + "grad_norm": 0.07869889587163925, + "learning_rate": 4.11231069125806e-05, + "loss": 0.0002, + "step": 3850 + }, + { + "epoch": 5.0104588394062075, + "grad_norm": 0.0006638221675530076, + "learning_rate": 4.108562003298845e-05, + "loss": 0.0005, + "step": 3860 + }, + { + "epoch": 5.011133603238866, + "grad_norm": 0.0008539034170098603, + "learning_rate": 4.104813315339631e-05, + "loss": 0.0001, + "step": 3870 + }, + { + "epoch": 5.0118083670715246, + "grad_norm": 0.0006605815142393112, + "learning_rate": 4.101064627380417e-05, + "loss": 0.0004, + "step": 3880 + }, + { + "epoch": 5.012483130904184, + "grad_norm": 0.0008256967412307858, + "learning_rate": 4.097315939421203e-05, + "loss": 0.0001, + "step": 3890 + }, + { + "epoch": 5.0131578947368425, + "grad_norm": 0.008075601421296597, + "learning_rate": 4.093567251461988e-05, + "loss": 0.0018, + "step": 3900 + }, + { + "epoch": 5.013832658569501, + "grad_norm": 0.0012110425159335136, + "learning_rate": 4.089818563502774e-05, + "loss": 0.0011, + "step": 3910 + }, + { + "epoch": 5.0145074224021595, + "grad_norm": 0.0048310281708836555, + "learning_rate": 4.08606987554356e-05, + "loss": 0.0001, + "step": 3920 + }, + { + "epoch": 5.015182186234818, + "grad_norm": 0.0012771515175700188, + "learning_rate": 4.082321187584346e-05, + "loss": 0.0003, + "step": 3930 + }, + { + "epoch": 5.015856950067477, + "grad_norm": 0.0013642838457599282, + "learning_rate": 4.078572499625131e-05, + "loss": 0.0001, + "step": 3940 + }, + { + "epoch": 5.016531713900135, + "grad_norm": 311.0769348144531, + "learning_rate": 4.074823811665917e-05, + "loss": 0.7081, + "step": 3950 + }, + { + "epoch": 5.017206477732794, + "grad_norm": 0.002835233462974429, + "learning_rate": 4.071075123706703e-05, + "loss": 0.0003, + "step": 3960 + }, + { + "epoch": 5.017881241565452, + "grad_norm": 0.0006811009370721877, + "learning_rate": 4.067326435747489e-05, + "loss": 0.4166, + "step": 3970 + }, + { + "epoch": 5.018556005398111, + "grad_norm": 0.0010262362193316221, + "learning_rate": 4.063577747788274e-05, + "loss": 0.0001, + "step": 3980 + }, + { + "epoch": 5.019230769230769, + "grad_norm": 0.11619503796100616, + "learning_rate": 4.05982905982906e-05, + "loss": 0.0002, + "step": 3990 + }, + { + "epoch": 5.019905533063428, + "grad_norm": 0.011183816939592361, + "learning_rate": 4.056080371869846e-05, + "loss": 0.0006, + "step": 4000 + }, + { + "epoch": 5.020580296896086, + "grad_norm": 0.0007078946800902486, + "learning_rate": 4.0523316839106314e-05, + "loss": 0.0004, + "step": 4010 + }, + { + "epoch": 5.021255060728745, + "grad_norm": 0.008296789601445198, + "learning_rate": 4.048582995951417e-05, + "loss": 0.0134, + "step": 4020 + }, + { + "epoch": 5.021929824561403, + "grad_norm": 0.013501118868589401, + "learning_rate": 4.044834307992203e-05, + "loss": 0.0003, + "step": 4030 + }, + { + "epoch": 5.022604588394062, + "grad_norm": 0.15977753698825836, + "learning_rate": 4.041085620032989e-05, + "loss": 0.0001, + "step": 4040 + }, + { + "epoch": 5.02327935222672, + "grad_norm": 0.004472650587558746, + "learning_rate": 4.0373369320737744e-05, + "loss": 0.0032, + "step": 4050 + }, + { + "epoch": 5.023954116059379, + "grad_norm": 0.0012224495876580477, + "learning_rate": 4.03358824411456e-05, + "loss": 0.0, + "step": 4060 + }, + { + "epoch": 5.024628879892038, + "grad_norm": 0.0016181441023945808, + "learning_rate": 4.029839556155346e-05, + "loss": 0.7806, + "step": 4070 + }, + { + "epoch": 5.025303643724697, + "grad_norm": 0.004258355125784874, + "learning_rate": 4.026090868196132e-05, + "loss": 0.0, + "step": 4080 + }, + { + "epoch": 5.025978407557355, + "grad_norm": 0.0011408330174162984, + "learning_rate": 4.0223421802369174e-05, + "loss": 0.0001, + "step": 4090 + }, + { + "epoch": 5.026653171390014, + "grad_norm": 0.010054398328065872, + "learning_rate": 4.018593492277703e-05, + "loss": 0.0001, + "step": 4100 + }, + { + "epoch": 5.027327935222672, + "grad_norm": 0.0009806094458326697, + "learning_rate": 4.014844804318489e-05, + "loss": 0.0001, + "step": 4110 + }, + { + "epoch": 5.028002699055331, + "grad_norm": 0.0007722462760284543, + "learning_rate": 4.011096116359274e-05, + "loss": 0.0003, + "step": 4120 + }, + { + "epoch": 5.028677462887989, + "grad_norm": 0.01538068987429142, + "learning_rate": 4.0073474284000604e-05, + "loss": 0.6961, + "step": 4130 + }, + { + "epoch": 5.029352226720648, + "grad_norm": 0.00021896508405916393, + "learning_rate": 4.003598740440846e-05, + "loss": 0.0001, + "step": 4140 + }, + { + "epoch": 5.030026990553306, + "grad_norm": 0.0006867019692435861, + "learning_rate": 3.9998500524816315e-05, + "loss": 0.0, + "step": 4150 + }, + { + "epoch": 5.030701754385965, + "grad_norm": 0.0021174189168959856, + "learning_rate": 3.996101364522417e-05, + "loss": 0.0, + "step": 4160 + }, + { + "epoch": 5.031376518218623, + "grad_norm": 0.0005668731173500419, + "learning_rate": 3.992352676563203e-05, + "loss": 0.0, + "step": 4170 + }, + { + "epoch": 5.032051282051282, + "grad_norm": 0.0007015119190327823, + "learning_rate": 3.988603988603989e-05, + "loss": 0.4088, + "step": 4180 + }, + { + "epoch": 5.0327260458839405, + "grad_norm": 0.007248507812619209, + "learning_rate": 3.9848553006447745e-05, + "loss": 0.0212, + "step": 4190 + }, + { + "epoch": 5.033400809716599, + "grad_norm": 0.0023328044917434454, + "learning_rate": 3.98110661268556e-05, + "loss": 0.0001, + "step": 4200 + }, + { + "epoch": 5.0340755735492575, + "grad_norm": 0.0011781149078160524, + "learning_rate": 3.9773579247263456e-05, + "loss": 0.0001, + "step": 4210 + }, + { + "epoch": 5.034750337381916, + "grad_norm": 0.000842131907120347, + "learning_rate": 3.973609236767132e-05, + "loss": 0.0001, + "step": 4220 + }, + { + "epoch": 5.0354251012145745, + "grad_norm": 0.0013578764628618956, + "learning_rate": 3.9698605488079175e-05, + "loss": 0.0001, + "step": 4230 + }, + { + "epoch": 5.036099865047233, + "grad_norm": 0.0005201473250053823, + "learning_rate": 3.966111860848703e-05, + "loss": 0.0001, + "step": 4240 + }, + { + "epoch": 5.0367746288798925, + "grad_norm": 0.0011828228598460555, + "learning_rate": 3.9623631728894886e-05, + "loss": 0.0065, + "step": 4250 + }, + { + "epoch": 5.037449392712551, + "grad_norm": 0.000755178218241781, + "learning_rate": 3.958614484930275e-05, + "loss": 0.207, + "step": 4260 + }, + { + "epoch": 5.0381241565452095, + "grad_norm": 0.0009751113248057663, + "learning_rate": 3.95486579697106e-05, + "loss": 0.0001, + "step": 4270 + }, + { + "epoch": 5.038798920377868, + "grad_norm": 0.00031620432855561376, + "learning_rate": 3.951117109011846e-05, + "loss": 0.337, + "step": 4280 + }, + { + "epoch": 5.0394736842105265, + "grad_norm": 0.0007090018480084836, + "learning_rate": 3.9473684210526316e-05, + "loss": 0.0006, + "step": 4290 + }, + { + "epoch": 5.040148448043185, + "grad_norm": 0.0010267384350299835, + "learning_rate": 3.943619733093418e-05, + "loss": 0.0, + "step": 4300 + }, + { + "epoch": 5.040823211875844, + "grad_norm": 0.014587147161364555, + "learning_rate": 3.939871045134203e-05, + "loss": 0.0001, + "step": 4310 + }, + { + "epoch": 5.041497975708502, + "grad_norm": 0.000788258679676801, + "learning_rate": 3.936122357174989e-05, + "loss": 0.0, + "step": 4320 + }, + { + "epoch": 5.042172739541161, + "grad_norm": 0.0006495325942523777, + "learning_rate": 3.9323736692157746e-05, + "loss": 0.0, + "step": 4330 + }, + { + "epoch": 5.042847503373819, + "grad_norm": 0.0006167737883515656, + "learning_rate": 3.928624981256561e-05, + "loss": 0.1018, + "step": 4340 + }, + { + "epoch": 5.043522267206478, + "grad_norm": 0.0014920184621587396, + "learning_rate": 3.924876293297346e-05, + "loss": 0.0, + "step": 4350 + }, + { + "epoch": 5.044197031039136, + "grad_norm": 0.0015535310376435518, + "learning_rate": 3.921127605338132e-05, + "loss": 0.0007, + "step": 4360 + }, + { + "epoch": 5.044871794871795, + "grad_norm": 0.0006431335350498557, + "learning_rate": 3.9173789173789176e-05, + "loss": 0.0001, + "step": 4370 + }, + { + "epoch": 5.045546558704453, + "grad_norm": 0.005366568453609943, + "learning_rate": 3.913630229419703e-05, + "loss": 0.0, + "step": 4380 + }, + { + "epoch": 5.046221322537112, + "grad_norm": 0.0013297253753989935, + "learning_rate": 3.909881541460489e-05, + "loss": 0.0, + "step": 4390 + }, + { + "epoch": 5.04689608636977, + "grad_norm": 0.0004990586312487721, + "learning_rate": 3.906132853501275e-05, + "loss": 0.0, + "step": 4400 + }, + { + "epoch": 5.047570850202429, + "grad_norm": 0.0013985860859975219, + "learning_rate": 3.9023841655420606e-05, + "loss": 0.0, + "step": 4410 + }, + { + "epoch": 5.048245614035087, + "grad_norm": 0.0006711781024932861, + "learning_rate": 3.898635477582846e-05, + "loss": 0.0, + "step": 4420 + }, + { + "epoch": 5.048920377867747, + "grad_norm": 0.0006565306102856994, + "learning_rate": 3.894886789623632e-05, + "loss": 0.0, + "step": 4430 + }, + { + "epoch": 5.049595141700405, + "grad_norm": 0.0009195157326757908, + "learning_rate": 3.891138101664418e-05, + "loss": 0.0001, + "step": 4440 + }, + { + "epoch": 5.05, + "eval_accuracy": 0.9107142857142857, + "eval_f1": 0.9105137981578073, + "eval_loss": 0.7364658117294312, + "eval_runtime": 73.1769, + "eval_samples_per_second": 1.531, + "eval_steps_per_second": 1.531, + "step": 4446 + }, + { + "epoch": 6.000269905533063, + "grad_norm": 0.0008725410443730652, + "learning_rate": 3.8873894137052036e-05, + "loss": 0.0, + "step": 4450 + }, + { + "epoch": 6.000944669365722, + "grad_norm": 0.0006686112028546631, + "learning_rate": 3.883640725745989e-05, + "loss": 0.0, + "step": 4460 + }, + { + "epoch": 6.001619433198381, + "grad_norm": 0.000973099609836936, + "learning_rate": 3.879892037786775e-05, + "loss": 0.0, + "step": 4470 + }, + { + "epoch": 6.0022941970310395, + "grad_norm": 0.0036273570731282234, + "learning_rate": 3.876143349827561e-05, + "loss": 0.0, + "step": 4480 + }, + { + "epoch": 6.002968960863698, + "grad_norm": 0.0030524057801812887, + "learning_rate": 3.8723946618683466e-05, + "loss": 0.9891, + "step": 4490 + }, + { + "epoch": 6.0036437246963565, + "grad_norm": 0.0005925680161453784, + "learning_rate": 3.868645973909132e-05, + "loss": 0.0001, + "step": 4500 + }, + { + "epoch": 6.004318488529015, + "grad_norm": 0.0012102797627449036, + "learning_rate": 3.864897285949918e-05, + "loss": 0.0004, + "step": 4510 + }, + { + "epoch": 6.004993252361674, + "grad_norm": 0.001870299456641078, + "learning_rate": 3.861148597990703e-05, + "loss": 0.0001, + "step": 4520 + }, + { + "epoch": 6.005668016194332, + "grad_norm": 0.0008334846352227032, + "learning_rate": 3.857399910031489e-05, + "loss": 0.0, + "step": 4530 + }, + { + "epoch": 6.006342780026991, + "grad_norm": 0.0909259095788002, + "learning_rate": 3.853651222072275e-05, + "loss": 0.0033, + "step": 4540 + }, + { + "epoch": 6.007017543859649, + "grad_norm": 0.08534003794193268, + "learning_rate": 3.849902534113061e-05, + "loss": 0.0001, + "step": 4550 + }, + { + "epoch": 6.007692307692308, + "grad_norm": 0.009015407413244247, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0001, + "step": 4560 + }, + { + "epoch": 6.008367071524966, + "grad_norm": 0.0005771831492893398, + "learning_rate": 3.842405158194632e-05, + "loss": 0.0, + "step": 4570 + }, + { + "epoch": 6.009041835357625, + "grad_norm": 0.00015217051259241998, + "learning_rate": 3.8386564702354174e-05, + "loss": 0.0, + "step": 4580 + }, + { + "epoch": 6.009716599190283, + "grad_norm": 0.001618007430806756, + "learning_rate": 3.834907782276204e-05, + "loss": 0.0001, + "step": 4590 + }, + { + "epoch": 6.010391363022942, + "grad_norm": 0.0008747613755986094, + "learning_rate": 3.831159094316989e-05, + "loss": 0.0, + "step": 4600 + }, + { + "epoch": 6.0110661268556, + "grad_norm": 0.0011886496795341372, + "learning_rate": 3.827410406357775e-05, + "loss": 0.0001, + "step": 4610 + }, + { + "epoch": 6.011740890688259, + "grad_norm": 0.0006136572919785976, + "learning_rate": 3.8236617183985604e-05, + "loss": 0.0, + "step": 4620 + }, + { + "epoch": 6.012415654520917, + "grad_norm": 0.0002797636261675507, + "learning_rate": 3.819913030439347e-05, + "loss": 0.0, + "step": 4630 + }, + { + "epoch": 6.013090418353576, + "grad_norm": 0.0005924575380049646, + "learning_rate": 3.8161643424801316e-05, + "loss": 0.0, + "step": 4640 + }, + { + "epoch": 6.013765182186235, + "grad_norm": 381.5912170410156, + "learning_rate": 3.812415654520918e-05, + "loss": 0.6612, + "step": 4650 + }, + { + "epoch": 6.014439946018894, + "grad_norm": 0.0007501631625927985, + "learning_rate": 3.8086669665617034e-05, + "loss": 0.057, + "step": 4660 + }, + { + "epoch": 6.015114709851552, + "grad_norm": 0.00048053194768726826, + "learning_rate": 3.80491827860249e-05, + "loss": 0.7472, + "step": 4670 + }, + { + "epoch": 6.015789473684211, + "grad_norm": 0.0008806756814010441, + "learning_rate": 3.8011695906432746e-05, + "loss": 0.0, + "step": 4680 + }, + { + "epoch": 6.016464237516869, + "grad_norm": 0.0007039654301479459, + "learning_rate": 3.797420902684061e-05, + "loss": 0.0002, + "step": 4690 + }, + { + "epoch": 6.017139001349528, + "grad_norm": 0.0005677440203726292, + "learning_rate": 3.7936722147248464e-05, + "loss": 0.0, + "step": 4700 + }, + { + "epoch": 6.017813765182186, + "grad_norm": 0.0006246105185709894, + "learning_rate": 3.7899235267656327e-05, + "loss": 0.0002, + "step": 4710 + }, + { + "epoch": 6.018488529014845, + "grad_norm": 0.0003905866760760546, + "learning_rate": 3.7861748388064176e-05, + "loss": 0.0, + "step": 4720 + }, + { + "epoch": 6.019163292847503, + "grad_norm": 0.0004027994582429528, + "learning_rate": 3.782426150847204e-05, + "loss": 0.0002, + "step": 4730 + }, + { + "epoch": 6.019838056680162, + "grad_norm": 0.0017455661436542869, + "learning_rate": 3.7786774628879894e-05, + "loss": 0.0001, + "step": 4740 + }, + { + "epoch": 6.02051282051282, + "grad_norm": 0.0022832180839031935, + "learning_rate": 3.774928774928775e-05, + "loss": 0.0001, + "step": 4750 + }, + { + "epoch": 6.021187584345479, + "grad_norm": 295.60693359375, + "learning_rate": 3.7711800869695605e-05, + "loss": 0.7359, + "step": 4760 + }, + { + "epoch": 6.0218623481781375, + "grad_norm": 0.0004823520721402019, + "learning_rate": 3.767431399010347e-05, + "loss": 0.0, + "step": 4770 + }, + { + "epoch": 6.022537112010796, + "grad_norm": 0.003145309165120125, + "learning_rate": 3.7636827110511324e-05, + "loss": 0.0, + "step": 4780 + }, + { + "epoch": 6.0232118758434545, + "grad_norm": 0.00026828868431039155, + "learning_rate": 3.759934023091918e-05, + "loss": 0.0, + "step": 4790 + }, + { + "epoch": 6.023886639676113, + "grad_norm": 0.000310034112771973, + "learning_rate": 3.7561853351327035e-05, + "loss": 0.0, + "step": 4800 + }, + { + "epoch": 6.024561403508772, + "grad_norm": 0.00041966387652792037, + "learning_rate": 3.75243664717349e-05, + "loss": 0.0, + "step": 4810 + }, + { + "epoch": 6.02523616734143, + "grad_norm": 0.0011529176263138652, + "learning_rate": 3.7486879592142754e-05, + "loss": 0.5445, + "step": 4820 + }, + { + "epoch": 6.0259109311740895, + "grad_norm": 0.02147838845849037, + "learning_rate": 3.744939271255061e-05, + "loss": 1.0205, + "step": 4830 + }, + { + "epoch": 6.026585695006748, + "grad_norm": 0.000508416909724474, + "learning_rate": 3.7411905832958465e-05, + "loss": 0.001, + "step": 4840 + }, + { + "epoch": 6.0272604588394065, + "grad_norm": 0.008615111000835896, + "learning_rate": 3.737441895336633e-05, + "loss": 0.0001, + "step": 4850 + }, + { + "epoch": 6.027935222672065, + "grad_norm": 0.444153755903244, + "learning_rate": 3.7336932073774184e-05, + "loss": 0.9325, + "step": 4860 + }, + { + "epoch": 6.028609986504724, + "grad_norm": 0.0013290736824274063, + "learning_rate": 3.729944519418204e-05, + "loss": 0.0001, + "step": 4870 + }, + { + "epoch": 6.029284750337382, + "grad_norm": 0.000803654664196074, + "learning_rate": 3.7261958314589895e-05, + "loss": 0.0044, + "step": 4880 + }, + { + "epoch": 6.029959514170041, + "grad_norm": 0.0021947200875729322, + "learning_rate": 3.722447143499775e-05, + "loss": 0.9785, + "step": 4890 + }, + { + "epoch": 6.030634278002699, + "grad_norm": 0.0023971525952219963, + "learning_rate": 3.718698455540561e-05, + "loss": 0.0001, + "step": 4900 + }, + { + "epoch": 6.031309041835358, + "grad_norm": 0.00609954446554184, + "learning_rate": 3.714949767581347e-05, + "loss": 0.0002, + "step": 4910 + }, + { + "epoch": 6.031983805668016, + "grad_norm": 0.0020932150073349476, + "learning_rate": 3.7112010796221325e-05, + "loss": 0.0002, + "step": 4920 + }, + { + "epoch": 6.032658569500675, + "grad_norm": 0.0034460346214473248, + "learning_rate": 3.707452391662918e-05, + "loss": 0.0004, + "step": 4930 + }, + { + "epoch": 6.033333333333333, + "grad_norm": 0.0021088484209030867, + "learning_rate": 3.7037037037037037e-05, + "loss": 0.0001, + "step": 4940 + }, + { + "epoch": 6.034008097165992, + "grad_norm": 0.002742623910307884, + "learning_rate": 3.699955015744489e-05, + "loss": 0.0006, + "step": 4950 + }, + { + "epoch": 6.03468286099865, + "grad_norm": 0.002541649155318737, + "learning_rate": 3.6962063277852755e-05, + "loss": 0.0001, + "step": 4960 + }, + { + "epoch": 6.035357624831309, + "grad_norm": 0.000678271462675184, + "learning_rate": 3.692457639826061e-05, + "loss": 0.0, + "step": 4970 + }, + { + "epoch": 6.036032388663967, + "grad_norm": 0.0022359860595315695, + "learning_rate": 3.6887089518668466e-05, + "loss": 0.0002, + "step": 4980 + }, + { + "epoch": 6.036707152496626, + "grad_norm": 0.003631311934441328, + "learning_rate": 3.684960263907632e-05, + "loss": 0.0139, + "step": 4990 + }, + { + "epoch": 6.037381916329284, + "grad_norm": 408.66119384765625, + "learning_rate": 3.6812115759484185e-05, + "loss": 0.3617, + "step": 5000 + }, + { + "epoch": 6.038056680161944, + "grad_norm": 0.001363090705126524, + "learning_rate": 3.6774628879892034e-05, + "loss": 0.7014, + "step": 5010 + }, + { + "epoch": 6.038731443994602, + "grad_norm": 0.0028585607651621103, + "learning_rate": 3.6737142000299896e-05, + "loss": 0.0209, + "step": 5020 + }, + { + "epoch": 6.039406207827261, + "grad_norm": 0.0029073706828057766, + "learning_rate": 3.669965512070775e-05, + "loss": 0.0007, + "step": 5030 + }, + { + "epoch": 6.040080971659919, + "grad_norm": 0.021762054413557053, + "learning_rate": 3.6662168241115615e-05, + "loss": 3.0967, + "step": 5040 + }, + { + "epoch": 6.040755735492578, + "grad_norm": 1.7035624980926514, + "learning_rate": 3.6624681361523464e-05, + "loss": 1.3983, + "step": 5050 + }, + { + "epoch": 6.041430499325236, + "grad_norm": 0.07881853729486465, + "learning_rate": 3.6587194481931326e-05, + "loss": 0.8778, + "step": 5060 + }, + { + "epoch": 6.042105263157895, + "grad_norm": 49.91697311401367, + "learning_rate": 3.654970760233918e-05, + "loss": 0.0293, + "step": 5070 + }, + { + "epoch": 6.042780026990553, + "grad_norm": 0.01630672998726368, + "learning_rate": 3.6512220722747045e-05, + "loss": 1.1156, + "step": 5080 + }, + { + "epoch": 6.043454790823212, + "grad_norm": 0.007935232482850552, + "learning_rate": 3.6474733843154894e-05, + "loss": 0.0031, + "step": 5090 + }, + { + "epoch": 6.04412955465587, + "grad_norm": 299.0083923339844, + "learning_rate": 3.6437246963562756e-05, + "loss": 0.6953, + "step": 5100 + }, + { + "epoch": 6.044804318488529, + "grad_norm": 0.014369282871484756, + "learning_rate": 3.639976008397061e-05, + "loss": 0.0002, + "step": 5110 + }, + { + "epoch": 6.0454790823211875, + "grad_norm": 0.0033456783276051283, + "learning_rate": 3.6362273204378474e-05, + "loss": 0.0009, + "step": 5120 + }, + { + "epoch": 6.046153846153846, + "grad_norm": 0.0012127397349104285, + "learning_rate": 3.6324786324786323e-05, + "loss": 0.0001, + "step": 5130 + }, + { + "epoch": 6.0468286099865045, + "grad_norm": 0.003025912446901202, + "learning_rate": 3.6287299445194186e-05, + "loss": 0.0001, + "step": 5140 + }, + { + "epoch": 6.047503373819163, + "grad_norm": 0.006771762855350971, + "learning_rate": 3.624981256560204e-05, + "loss": 0.0003, + "step": 5150 + }, + { + "epoch": 6.0481781376518216, + "grad_norm": 0.006291988305747509, + "learning_rate": 3.62123256860099e-05, + "loss": 0.6232, + "step": 5160 + }, + { + "epoch": 6.04885290148448, + "grad_norm": 0.010942903347313404, + "learning_rate": 3.617483880641775e-05, + "loss": 0.9909, + "step": 5170 + }, + { + "epoch": 6.049527665317139, + "grad_norm": 0.0050459960475564, + "learning_rate": 3.6137351926825616e-05, + "loss": 0.8987, + "step": 5180 + }, + { + "epoch": 6.05, + "eval_accuracy": 0.8392857142857143, + "eval_f1": 0.8294011707968183, + "eval_loss": 0.930968701839447, + "eval_runtime": 74.4165, + "eval_samples_per_second": 1.505, + "eval_steps_per_second": 1.505, + "step": 5187 + }, + { + "epoch": 7.000202429149797, + "grad_norm": 0.012029584497213364, + "learning_rate": 3.609986504723347e-05, + "loss": 0.0003, + "step": 5190 + }, + { + "epoch": 7.000877192982456, + "grad_norm": 0.002462017349898815, + "learning_rate": 3.606237816764133e-05, + "loss": 0.0005, + "step": 5200 + }, + { + "epoch": 7.001551956815114, + "grad_norm": 0.0375690832734108, + "learning_rate": 3.602489128804918e-05, + "loss": 0.1058, + "step": 5210 + }, + { + "epoch": 7.002226720647773, + "grad_norm": 0.026218879967927933, + "learning_rate": 3.5987404408457046e-05, + "loss": 0.0083, + "step": 5220 + }, + { + "epoch": 7.002901484480432, + "grad_norm": 0.0031192379537969828, + "learning_rate": 3.59499175288649e-05, + "loss": 0.0342, + "step": 5230 + }, + { + "epoch": 7.003576248313091, + "grad_norm": 0.002261426765471697, + "learning_rate": 3.591243064927276e-05, + "loss": 0.8758, + "step": 5240 + }, + { + "epoch": 7.004251012145749, + "grad_norm": 0.7252321839332581, + "learning_rate": 3.587494376968061e-05, + "loss": 0.0008, + "step": 5250 + }, + { + "epoch": 7.004925775978408, + "grad_norm": 0.002154165878891945, + "learning_rate": 3.583745689008847e-05, + "loss": 0.0002, + "step": 5260 + }, + { + "epoch": 7.005600539811066, + "grad_norm": 0.0012370734475553036, + "learning_rate": 3.5799970010496325e-05, + "loss": 1.0515, + "step": 5270 + }, + { + "epoch": 7.006275303643725, + "grad_norm": 0.0021348996087908745, + "learning_rate": 3.576248313090419e-05, + "loss": 0.0001, + "step": 5280 + }, + { + "epoch": 7.006950067476383, + "grad_norm": 0.006049524061381817, + "learning_rate": 3.572499625131204e-05, + "loss": 0.0003, + "step": 5290 + }, + { + "epoch": 7.007624831309042, + "grad_norm": 0.01275632157921791, + "learning_rate": 3.56875093717199e-05, + "loss": 0.0021, + "step": 5300 + }, + { + "epoch": 7.0082995951417, + "grad_norm": 0.0016850440297275782, + "learning_rate": 3.5650022492127754e-05, + "loss": 0.0001, + "step": 5310 + }, + { + "epoch": 7.008974358974359, + "grad_norm": 0.0009741022950038314, + "learning_rate": 3.561253561253561e-05, + "loss": 0.0005, + "step": 5320 + }, + { + "epoch": 7.0096491228070175, + "grad_norm": 0.000799846719019115, + "learning_rate": 3.557504873294347e-05, + "loss": 0.0002, + "step": 5330 + }, + { + "epoch": 7.010323886639676, + "grad_norm": 0.0008095399825833738, + "learning_rate": 3.553756185335133e-05, + "loss": 0.0024, + "step": 5340 + }, + { + "epoch": 7.0109986504723345, + "grad_norm": 0.0016390876844525337, + "learning_rate": 3.5500074973759184e-05, + "loss": 0.0, + "step": 5350 + }, + { + "epoch": 7.011673414304993, + "grad_norm": 0.0013130076695233583, + "learning_rate": 3.546258809416704e-05, + "loss": 0.8843, + "step": 5360 + }, + { + "epoch": 7.0123481781376515, + "grad_norm": 0.015013671480119228, + "learning_rate": 3.54251012145749e-05, + "loss": 0.7296, + "step": 5370 + }, + { + "epoch": 7.01302294197031, + "grad_norm": 0.003729419782757759, + "learning_rate": 3.538761433498276e-05, + "loss": 0.004, + "step": 5380 + }, + { + "epoch": 7.013697705802969, + "grad_norm": 0.007766401395201683, + "learning_rate": 3.5350127455390614e-05, + "loss": 0.0001, + "step": 5390 + }, + { + "epoch": 7.014372469635627, + "grad_norm": 0.03760051354765892, + "learning_rate": 3.531264057579847e-05, + "loss": 0.0002, + "step": 5400 + }, + { + "epoch": 7.0150472334682865, + "grad_norm": 0.003396588610485196, + "learning_rate": 3.527515369620633e-05, + "loss": 0.0001, + "step": 5410 + }, + { + "epoch": 7.015721997300945, + "grad_norm": 0.005965414922684431, + "learning_rate": 3.523766681661418e-05, + "loss": 0.0001, + "step": 5420 + }, + { + "epoch": 7.0163967611336036, + "grad_norm": 0.002591415075585246, + "learning_rate": 3.5200179937022044e-05, + "loss": 0.0001, + "step": 5430 + }, + { + "epoch": 7.017071524966262, + "grad_norm": 0.0007187007577158511, + "learning_rate": 3.51626930574299e-05, + "loss": 0.6273, + "step": 5440 + }, + { + "epoch": 7.017746288798921, + "grad_norm": 0.0018147805240005255, + "learning_rate": 3.512520617783776e-05, + "loss": 0.0001, + "step": 5450 + }, + { + "epoch": 7.018421052631579, + "grad_norm": 0.0007241186103783548, + "learning_rate": 3.508771929824561e-05, + "loss": 0.0002, + "step": 5460 + }, + { + "epoch": 7.019095816464238, + "grad_norm": 0.002352670766413212, + "learning_rate": 3.5050232418653474e-05, + "loss": 0.0001, + "step": 5470 + }, + { + "epoch": 7.019770580296896, + "grad_norm": 0.0018704934045672417, + "learning_rate": 3.501274553906133e-05, + "loss": 0.0004, + "step": 5480 + }, + { + "epoch": 7.020445344129555, + "grad_norm": 0.002092360518872738, + "learning_rate": 3.497525865946919e-05, + "loss": 0.0001, + "step": 5490 + }, + { + "epoch": 7.021120107962213, + "grad_norm": 0.001126096467487514, + "learning_rate": 3.493777177987704e-05, + "loss": 0.6823, + "step": 5500 + }, + { + "epoch": 7.021794871794872, + "grad_norm": 0.0008661505416966975, + "learning_rate": 3.4900284900284904e-05, + "loss": 0.0001, + "step": 5510 + }, + { + "epoch": 7.02246963562753, + "grad_norm": 0.02058524824678898, + "learning_rate": 3.486279802069276e-05, + "loss": 0.0001, + "step": 5520 + }, + { + "epoch": 7.023144399460189, + "grad_norm": 0.002387122018262744, + "learning_rate": 3.4825311141100615e-05, + "loss": 0.0002, + "step": 5530 + }, + { + "epoch": 7.023819163292847, + "grad_norm": 0.0011330017587170005, + "learning_rate": 3.478782426150847e-05, + "loss": 0.0001, + "step": 5540 + }, + { + "epoch": 7.024493927125506, + "grad_norm": 0.0005625615012831986, + "learning_rate": 3.4750337381916334e-05, + "loss": 0.0002, + "step": 5550 + }, + { + "epoch": 7.025168690958164, + "grad_norm": 0.0008695796132087708, + "learning_rate": 3.471285050232419e-05, + "loss": 0.0, + "step": 5560 + }, + { + "epoch": 7.025843454790823, + "grad_norm": 0.0016092468285933137, + "learning_rate": 3.4675363622732045e-05, + "loss": 0.0001, + "step": 5570 + }, + { + "epoch": 7.026518218623481, + "grad_norm": 0.0011349094565957785, + "learning_rate": 3.46378767431399e-05, + "loss": 0.0003, + "step": 5580 + }, + { + "epoch": 7.027192982456141, + "grad_norm": 0.0005459162639454007, + "learning_rate": 3.4600389863547764e-05, + "loss": 0.0001, + "step": 5590 + }, + { + "epoch": 7.027867746288799, + "grad_norm": 0.0009417292312718928, + "learning_rate": 3.456290298395562e-05, + "loss": 0.0, + "step": 5600 + }, + { + "epoch": 7.028542510121458, + "grad_norm": 0.0005761535139754415, + "learning_rate": 3.4525416104363475e-05, + "loss": 0.0, + "step": 5610 + }, + { + "epoch": 7.029217273954116, + "grad_norm": 0.0007409591344185174, + "learning_rate": 3.448792922477133e-05, + "loss": 0.0001, + "step": 5620 + }, + { + "epoch": 7.029892037786775, + "grad_norm": 0.004374117590487003, + "learning_rate": 3.4450442345179194e-05, + "loss": 0.0, + "step": 5630 + }, + { + "epoch": 7.030566801619433, + "grad_norm": 0.017210789024829865, + "learning_rate": 3.441295546558704e-05, + "loss": 0.0001, + "step": 5640 + }, + { + "epoch": 7.031241565452092, + "grad_norm": 0.0008836057968437672, + "learning_rate": 3.4375468585994905e-05, + "loss": 0.0012, + "step": 5650 + }, + { + "epoch": 7.03191632928475, + "grad_norm": 0.0015315774362534285, + "learning_rate": 3.433798170640276e-05, + "loss": 0.0, + "step": 5660 + }, + { + "epoch": 7.032591093117409, + "grad_norm": 0.0006376684177666903, + "learning_rate": 3.4300494826810617e-05, + "loss": 0.0, + "step": 5670 + }, + { + "epoch": 7.0332658569500675, + "grad_norm": 0.0005232661496847868, + "learning_rate": 3.426300794721847e-05, + "loss": 0.0, + "step": 5680 + }, + { + "epoch": 7.033940620782726, + "grad_norm": 0.0008468987653031945, + "learning_rate": 3.4225521067626335e-05, + "loss": 0.0003, + "step": 5690 + }, + { + "epoch": 7.0346153846153845, + "grad_norm": 0.000993360416032374, + "learning_rate": 3.418803418803419e-05, + "loss": 0.0, + "step": 5700 + }, + { + "epoch": 7.035290148448043, + "grad_norm": 0.0020066085271537304, + "learning_rate": 3.4150547308442046e-05, + "loss": 0.0001, + "step": 5710 + }, + { + "epoch": 7.0359649122807015, + "grad_norm": 0.00036297430051490664, + "learning_rate": 3.41130604288499e-05, + "loss": 0.0, + "step": 5720 + }, + { + "epoch": 7.03663967611336, + "grad_norm": 0.0009432988590560853, + "learning_rate": 3.407557354925776e-05, + "loss": 0.0001, + "step": 5730 + }, + { + "epoch": 7.037314439946019, + "grad_norm": 0.0018047185149043798, + "learning_rate": 3.403808666966562e-05, + "loss": 0.8627, + "step": 5740 + }, + { + "epoch": 7.037989203778677, + "grad_norm": 0.0037690841127187014, + "learning_rate": 3.4000599790073476e-05, + "loss": 0.0781, + "step": 5750 + }, + { + "epoch": 7.038663967611336, + "grad_norm": 0.023057781159877777, + "learning_rate": 3.396311291048133e-05, + "loss": 0.0001, + "step": 5760 + }, + { + "epoch": 7.039338731443995, + "grad_norm": 0.004012484569102526, + "learning_rate": 3.392562603088919e-05, + "loss": 0.0001, + "step": 5770 + }, + { + "epoch": 7.0400134952766535, + "grad_norm": 0.0012608218239620328, + "learning_rate": 3.388813915129705e-05, + "loss": 0.0001, + "step": 5780 + }, + { + "epoch": 7.040688259109312, + "grad_norm": 0.002351221162825823, + "learning_rate": 3.38506522717049e-05, + "loss": 0.0001, + "step": 5790 + }, + { + "epoch": 7.041363022941971, + "grad_norm": 0.000716827402357012, + "learning_rate": 3.381316539211276e-05, + "loss": 0.0005, + "step": 5800 + }, + { + "epoch": 7.042037786774629, + "grad_norm": 0.0029892646707594395, + "learning_rate": 3.377567851252062e-05, + "loss": 0.0, + "step": 5810 + }, + { + "epoch": 7.042712550607288, + "grad_norm": 372.2917175292969, + "learning_rate": 3.373819163292848e-05, + "loss": 0.5735, + "step": 5820 + }, + { + "epoch": 7.043387314439946, + "grad_norm": 0.0010425182990729809, + "learning_rate": 3.370070475333633e-05, + "loss": 0.0, + "step": 5830 + }, + { + "epoch": 7.044062078272605, + "grad_norm": 43.60670852661133, + "learning_rate": 3.366321787374419e-05, + "loss": 1.7134, + "step": 5840 + }, + { + "epoch": 7.044736842105263, + "grad_norm": 44.16180419921875, + "learning_rate": 3.362573099415205e-05, + "loss": 0.4081, + "step": 5850 + }, + { + "epoch": 7.045411605937922, + "grad_norm": 0.002386684063822031, + "learning_rate": 3.358824411455991e-05, + "loss": 0.0014, + "step": 5860 + }, + { + "epoch": 7.04608636977058, + "grad_norm": 0.000626052962616086, + "learning_rate": 3.355075723496776e-05, + "loss": 0.6338, + "step": 5870 + }, + { + "epoch": 7.046761133603239, + "grad_norm": 0.0048158965073525906, + "learning_rate": 3.351327035537562e-05, + "loss": 0.0002, + "step": 5880 + }, + { + "epoch": 7.047435897435897, + "grad_norm": 102.3766860961914, + "learning_rate": 3.347578347578348e-05, + "loss": 0.7, + "step": 5890 + }, + { + "epoch": 7.048110661268556, + "grad_norm": 0.0005689793615601957, + "learning_rate": 3.343829659619133e-05, + "loss": 0.7073, + "step": 5900 + }, + { + "epoch": 7.048785425101214, + "grad_norm": 0.013288695365190506, + "learning_rate": 3.340080971659919e-05, + "loss": 0.0572, + "step": 5910 + }, + { + "epoch": 7.049460188933873, + "grad_norm": 0.0011189569486305118, + "learning_rate": 3.336332283700705e-05, + "loss": 0.4888, + "step": 5920 + }, + { + "epoch": 7.05, + "eval_accuracy": 0.875, + "eval_f1": 0.8702947845804988, + "eval_loss": 0.856253445148468, + "eval_runtime": 74.1698, + "eval_samples_per_second": 1.51, + "eval_steps_per_second": 1.51, + "step": 5928 + }, + { + "epoch": 8.000134952766532, + "grad_norm": 0.0010889604454860091, + "learning_rate": 3.332583595741491e-05, + "loss": 0.0492, + "step": 5930 + }, + { + "epoch": 8.000809716599191, + "grad_norm": 0.0005811200244352221, + "learning_rate": 3.328834907782276e-05, + "loss": 0.0003, + "step": 5940 + }, + { + "epoch": 8.001484480431849, + "grad_norm": 0.0028562431689351797, + "learning_rate": 3.325086219823062e-05, + "loss": 0.0003, + "step": 5950 + }, + { + "epoch": 8.002159244264508, + "grad_norm": 0.0011086298618465662, + "learning_rate": 3.321337531863848e-05, + "loss": 0.0001, + "step": 5960 + }, + { + "epoch": 8.002834008097166, + "grad_norm": 0.0018863864243030548, + "learning_rate": 3.317588843904634e-05, + "loss": 0.0001, + "step": 5970 + }, + { + "epoch": 8.003508771929825, + "grad_norm": 0.0009740761015564203, + "learning_rate": 3.313840155945419e-05, + "loss": 0.0002, + "step": 5980 + }, + { + "epoch": 8.004183535762483, + "grad_norm": 0.0005378098576329648, + "learning_rate": 3.310091467986205e-05, + "loss": 0.0001, + "step": 5990 + }, + { + "epoch": 8.004858299595142, + "grad_norm": 0.001058222958818078, + "learning_rate": 3.306342780026991e-05, + "loss": 0.0001, + "step": 6000 + }, + { + "epoch": 8.0055330634278, + "grad_norm": 0.0010611525503918529, + "learning_rate": 3.302594092067777e-05, + "loss": 0.0001, + "step": 6010 + }, + { + "epoch": 8.006207827260459, + "grad_norm": 0.002727857790887356, + "learning_rate": 3.298845404108562e-05, + "loss": 0.0001, + "step": 6020 + }, + { + "epoch": 8.006882591093117, + "grad_norm": 0.0007821051403880119, + "learning_rate": 3.295096716149348e-05, + "loss": 0.0017, + "step": 6030 + }, + { + "epoch": 8.007557354925776, + "grad_norm": 0.001169922179542482, + "learning_rate": 3.2913480281901335e-05, + "loss": 0.0001, + "step": 6040 + }, + { + "epoch": 8.008232118758434, + "grad_norm": 0.0011363876983523369, + "learning_rate": 3.287599340230919e-05, + "loss": 0.0001, + "step": 6050 + }, + { + "epoch": 8.008906882591093, + "grad_norm": 0.0005207853973843157, + "learning_rate": 3.283850652271705e-05, + "loss": 0.6813, + "step": 6060 + }, + { + "epoch": 8.00958164642375, + "grad_norm": 0.0005264964420348406, + "learning_rate": 3.280101964312491e-05, + "loss": 0.0001, + "step": 6070 + }, + { + "epoch": 8.01025641025641, + "grad_norm": 0.0005870209424756467, + "learning_rate": 3.2763532763532764e-05, + "loss": 0.0001, + "step": 6080 + }, + { + "epoch": 8.01093117408907, + "grad_norm": 0.0016355343395844102, + "learning_rate": 3.272604588394062e-05, + "loss": 0.0004, + "step": 6090 + }, + { + "epoch": 8.011605937921727, + "grad_norm": 0.004568756558001041, + "learning_rate": 3.2688559004348476e-05, + "loss": 0.0004, + "step": 6100 + }, + { + "epoch": 8.012280701754387, + "grad_norm": 0.0005888245650567114, + "learning_rate": 3.265107212475634e-05, + "loss": 0.0001, + "step": 6110 + }, + { + "epoch": 8.012955465587044, + "grad_norm": 0.0023943374399095774, + "learning_rate": 3.2613585245164194e-05, + "loss": 0.0, + "step": 6120 + }, + { + "epoch": 8.013630229419704, + "grad_norm": 0.0004357252037152648, + "learning_rate": 3.257609836557205e-05, + "loss": 0.0002, + "step": 6130 + }, + { + "epoch": 8.014304993252361, + "grad_norm": 0.0006332327611744404, + "learning_rate": 3.2538611485979906e-05, + "loss": 0.0001, + "step": 6140 + }, + { + "epoch": 8.01497975708502, + "grad_norm": 0.0006531701656058431, + "learning_rate": 3.250112460638777e-05, + "loss": 0.0001, + "step": 6150 + }, + { + "epoch": 8.015654520917678, + "grad_norm": 0.0005107235629111528, + "learning_rate": 3.246363772679562e-05, + "loss": 0.0, + "step": 6160 + }, + { + "epoch": 8.016329284750338, + "grad_norm": 0.012723034247756004, + "learning_rate": 3.242615084720348e-05, + "loss": 0.0001, + "step": 6170 + }, + { + "epoch": 8.017004048582995, + "grad_norm": 0.0427851527929306, + "learning_rate": 3.2388663967611336e-05, + "loss": 0.0002, + "step": 6180 + }, + { + "epoch": 8.017678812415655, + "grad_norm": 0.001141960732638836, + "learning_rate": 3.23511770880192e-05, + "loss": 0.0, + "step": 6190 + }, + { + "epoch": 8.018353576248312, + "grad_norm": 0.0015029623173177242, + "learning_rate": 3.231369020842705e-05, + "loss": 0.0001, + "step": 6200 + }, + { + "epoch": 8.019028340080972, + "grad_norm": 0.0005648156511597335, + "learning_rate": 3.227620332883491e-05, + "loss": 0.0001, + "step": 6210 + }, + { + "epoch": 8.01970310391363, + "grad_norm": 0.0006971880211494863, + "learning_rate": 3.2238716449242766e-05, + "loss": 0.0, + "step": 6220 + }, + { + "epoch": 8.020377867746289, + "grad_norm": 0.0005205124034546316, + "learning_rate": 3.220122956965063e-05, + "loss": 0.0001, + "step": 6230 + }, + { + "epoch": 8.021052631578947, + "grad_norm": 0.0007245125016197562, + "learning_rate": 3.216374269005848e-05, + "loss": 0.0001, + "step": 6240 + }, + { + "epoch": 8.021727395411606, + "grad_norm": 0.0005247213994152844, + "learning_rate": 3.212625581046634e-05, + "loss": 0.0001, + "step": 6250 + }, + { + "epoch": 8.022402159244265, + "grad_norm": 0.0005060233525000513, + "learning_rate": 3.2088768930874195e-05, + "loss": 0.0014, + "step": 6260 + }, + { + "epoch": 8.023076923076923, + "grad_norm": 0.01399776991456747, + "learning_rate": 3.205128205128206e-05, + "loss": 0.0001, + "step": 6270 + }, + { + "epoch": 8.023751686909582, + "grad_norm": 0.0013257871614769101, + "learning_rate": 3.201379517168991e-05, + "loss": 0.0, + "step": 6280 + }, + { + "epoch": 8.02442645074224, + "grad_norm": 0.00038729843799956143, + "learning_rate": 3.197630829209777e-05, + "loss": 0.0, + "step": 6290 + }, + { + "epoch": 8.0251012145749, + "grad_norm": 0.0013562028761953115, + "learning_rate": 3.1938821412505625e-05, + "loss": 0.0002, + "step": 6300 + }, + { + "epoch": 8.025775978407557, + "grad_norm": 0.0023358569014817476, + "learning_rate": 3.190133453291348e-05, + "loss": 0.0, + "step": 6310 + }, + { + "epoch": 8.026450742240216, + "grad_norm": 0.0007051244028843939, + "learning_rate": 3.186384765332134e-05, + "loss": 0.0, + "step": 6320 + }, + { + "epoch": 8.027125506072874, + "grad_norm": 0.00045763421803712845, + "learning_rate": 3.18263607737292e-05, + "loss": 0.0, + "step": 6330 + }, + { + "epoch": 8.027800269905534, + "grad_norm": 0.0003405519819352776, + "learning_rate": 3.1788873894137055e-05, + "loss": 0.0, + "step": 6340 + }, + { + "epoch": 8.028475033738191, + "grad_norm": 0.0009031207882799208, + "learning_rate": 3.175138701454491e-05, + "loss": 0.3007, + "step": 6350 + }, + { + "epoch": 8.02914979757085, + "grad_norm": 0.00048344547394663095, + "learning_rate": 3.171390013495277e-05, + "loss": 0.0, + "step": 6360 + }, + { + "epoch": 8.029824561403508, + "grad_norm": 0.005110772326588631, + "learning_rate": 3.167641325536063e-05, + "loss": 0.0, + "step": 6370 + }, + { + "epoch": 8.030499325236168, + "grad_norm": 0.0005193505785427988, + "learning_rate": 3.1638926375768485e-05, + "loss": 1.2393, + "step": 6380 + }, + { + "epoch": 8.031174089068825, + "grad_norm": 0.001544152619317174, + "learning_rate": 3.160143949617634e-05, + "loss": 0.0, + "step": 6390 + }, + { + "epoch": 8.031848852901485, + "grad_norm": 0.004015884827822447, + "learning_rate": 3.15639526165842e-05, + "loss": 0.0001, + "step": 6400 + }, + { + "epoch": 8.032523616734142, + "grad_norm": 0.005030054599046707, + "learning_rate": 3.152646573699205e-05, + "loss": 0.0002, + "step": 6410 + }, + { + "epoch": 8.033198380566802, + "grad_norm": 0.08386117219924927, + "learning_rate": 3.148897885739991e-05, + "loss": 0.0002, + "step": 6420 + }, + { + "epoch": 8.03387314439946, + "grad_norm": 0.004819917026907206, + "learning_rate": 3.145149197780777e-05, + "loss": 0.1815, + "step": 6430 + }, + { + "epoch": 8.034547908232119, + "grad_norm": 0.0022033504210412502, + "learning_rate": 3.1414005098215627e-05, + "loss": 0.0001, + "step": 6440 + }, + { + "epoch": 8.035222672064778, + "grad_norm": 0.0040964060463011265, + "learning_rate": 3.137651821862348e-05, + "loss": 0.0001, + "step": 6450 + }, + { + "epoch": 8.035897435897436, + "grad_norm": 0.004042464308440685, + "learning_rate": 3.133903133903134e-05, + "loss": 0.0001, + "step": 6460 + }, + { + "epoch": 8.036572199730095, + "grad_norm": 0.0027346210554242134, + "learning_rate": 3.1301544459439194e-05, + "loss": 0.0001, + "step": 6470 + }, + { + "epoch": 8.037246963562753, + "grad_norm": 0.0005888897576369345, + "learning_rate": 3.1264057579847056e-05, + "loss": 0.0001, + "step": 6480 + }, + { + "epoch": 8.037921727395412, + "grad_norm": 0.004620389547199011, + "learning_rate": 3.122657070025491e-05, + "loss": 0.0001, + "step": 6490 + }, + { + "epoch": 8.03859649122807, + "grad_norm": 0.0017953782808035612, + "learning_rate": 3.118908382066277e-05, + "loss": 0.0001, + "step": 6500 + }, + { + "epoch": 8.03927125506073, + "grad_norm": 0.0019287167815491557, + "learning_rate": 3.1151596941070624e-05, + "loss": 0.0, + "step": 6510 + }, + { + "epoch": 8.039946018893387, + "grad_norm": 0.017189156264066696, + "learning_rate": 3.1114110061478486e-05, + "loss": 0.0001, + "step": 6520 + }, + { + "epoch": 8.040620782726046, + "grad_norm": 0.0002868880983442068, + "learning_rate": 3.107662318188634e-05, + "loss": 0.0001, + "step": 6530 + }, + { + "epoch": 8.041295546558704, + "grad_norm": 0.003237192053347826, + "learning_rate": 3.10391363022942e-05, + "loss": 0.0, + "step": 6540 + }, + { + "epoch": 8.041970310391363, + "grad_norm": 0.010104048997163773, + "learning_rate": 3.1001649422702054e-05, + "loss": 0.0001, + "step": 6550 + }, + { + "epoch": 8.042645074224021, + "grad_norm": 0.0012962371110916138, + "learning_rate": 3.0964162543109916e-05, + "loss": 0.0001, + "step": 6560 + }, + { + "epoch": 8.04331983805668, + "grad_norm": 0.0021973999682813883, + "learning_rate": 3.0926675663517765e-05, + "loss": 0.0001, + "step": 6570 + }, + { + "epoch": 8.043994601889338, + "grad_norm": 0.004213243722915649, + "learning_rate": 3.088918878392563e-05, + "loss": 0.0001, + "step": 6580 + }, + { + "epoch": 8.044669365721997, + "grad_norm": 0.0007371046231128275, + "learning_rate": 3.0851701904333484e-05, + "loss": 0.0, + "step": 6590 + }, + { + "epoch": 8.045344129554655, + "grad_norm": 0.0029181931167840958, + "learning_rate": 3.0814215024741346e-05, + "loss": 0.0001, + "step": 6600 + }, + { + "epoch": 8.046018893387314, + "grad_norm": 0.001932345563545823, + "learning_rate": 3.0776728145149195e-05, + "loss": 0.0001, + "step": 6610 + }, + { + "epoch": 8.046693657219974, + "grad_norm": 0.0020557758398354053, + "learning_rate": 3.073924126555706e-05, + "loss": 1.2559, + "step": 6620 + }, + { + "epoch": 8.047368421052632, + "grad_norm": 0.005378492642194033, + "learning_rate": 3.0701754385964913e-05, + "loss": 0.084, + "step": 6630 + }, + { + "epoch": 8.048043184885291, + "grad_norm": 0.0023935220669955015, + "learning_rate": 3.0664267506372776e-05, + "loss": 0.2057, + "step": 6640 + }, + { + "epoch": 8.048717948717949, + "grad_norm": 0.0029694943223148584, + "learning_rate": 3.0626780626780625e-05, + "loss": 0.0001, + "step": 6650 + }, + { + "epoch": 8.049392712550608, + "grad_norm": 0.0006969795795157552, + "learning_rate": 3.058929374718849e-05, + "loss": 0.0001, + "step": 6660 + }, + { + "epoch": 8.05, + "eval_accuracy": 0.8928571428571429, + "eval_f1": 0.8894495468057416, + "eval_loss": 0.6908820867538452, + "eval_runtime": 70.8817, + "eval_samples_per_second": 1.58, + "eval_steps_per_second": 1.58, + "step": 6669 + }, + { + "epoch": 9.000067476383267, + "grad_norm": 0.007031037472188473, + "learning_rate": 3.055180686759634e-05, + "loss": 0.0001, + "step": 6670 + }, + { + "epoch": 9.000742240215924, + "grad_norm": 0.0013843988999724388, + "learning_rate": 3.05143199880042e-05, + "loss": 0.0001, + "step": 6680 + }, + { + "epoch": 9.001417004048584, + "grad_norm": 0.07400429248809814, + "learning_rate": 3.0476833108412055e-05, + "loss": 0.0002, + "step": 6690 + }, + { + "epoch": 9.002091767881241, + "grad_norm": 0.001845911960117519, + "learning_rate": 3.0439346228819914e-05, + "loss": 0.0, + "step": 6700 + }, + { + "epoch": 9.0027665317139, + "grad_norm": 0.00020295576541684568, + "learning_rate": 3.0401859349227773e-05, + "loss": 0.0014, + "step": 6710 + }, + { + "epoch": 9.003441295546558, + "grad_norm": 0.001036637695506215, + "learning_rate": 3.0364372469635626e-05, + "loss": 0.0003, + "step": 6720 + }, + { + "epoch": 9.004116059379218, + "grad_norm": 0.001262377598322928, + "learning_rate": 3.0326885590043485e-05, + "loss": 0.0001, + "step": 6730 + }, + { + "epoch": 9.004790823211875, + "grad_norm": 0.0012167665408924222, + "learning_rate": 3.0289398710451344e-05, + "loss": 0.6511, + "step": 6740 + }, + { + "epoch": 9.005465587044535, + "grad_norm": 0.0019521707436069846, + "learning_rate": 3.0251911830859203e-05, + "loss": 0.0, + "step": 6750 + }, + { + "epoch": 9.006140350877192, + "grad_norm": 0.0013618938392028213, + "learning_rate": 3.0214424951267055e-05, + "loss": 0.0001, + "step": 6760 + }, + { + "epoch": 9.006815114709852, + "grad_norm": 0.0009306151187047362, + "learning_rate": 3.0176938071674915e-05, + "loss": 0.0001, + "step": 6770 + }, + { + "epoch": 9.00748987854251, + "grad_norm": 0.0007624576683156192, + "learning_rate": 3.0139451192082774e-05, + "loss": 0.0001, + "step": 6780 + }, + { + "epoch": 9.008164642375169, + "grad_norm": 0.0007957870257087052, + "learning_rate": 3.0101964312490626e-05, + "loss": 0.4765, + "step": 6790 + }, + { + "epoch": 9.008839406207827, + "grad_norm": 0.47597193717956543, + "learning_rate": 3.0064477432898485e-05, + "loss": 0.0019, + "step": 6800 + }, + { + "epoch": 9.009514170040486, + "grad_norm": 0.0003396008105482906, + "learning_rate": 3.0026990553306344e-05, + "loss": 0.0001, + "step": 6810 + }, + { + "epoch": 9.010188933873144, + "grad_norm": 0.0011485237628221512, + "learning_rate": 2.9989503673714204e-05, + "loss": 0.0, + "step": 6820 + }, + { + "epoch": 9.010863697705803, + "grad_norm": 0.0008013169863261282, + "learning_rate": 2.9952016794122056e-05, + "loss": 0.0001, + "step": 6830 + }, + { + "epoch": 9.011538461538462, + "grad_norm": 0.00038786802906543016, + "learning_rate": 2.9914529914529915e-05, + "loss": 0.0, + "step": 6840 + }, + { + "epoch": 9.01221322537112, + "grad_norm": 0.003582603298127651, + "learning_rate": 2.9877043034937774e-05, + "loss": 0.9191, + "step": 6850 + }, + { + "epoch": 9.01288798920378, + "grad_norm": 0.0014808046398684382, + "learning_rate": 2.9839556155345634e-05, + "loss": 0.0001, + "step": 6860 + }, + { + "epoch": 9.013562753036437, + "grad_norm": 0.01157829724252224, + "learning_rate": 2.9802069275753486e-05, + "loss": 0.0001, + "step": 6870 + }, + { + "epoch": 9.014237516869096, + "grad_norm": 0.007076776586472988, + "learning_rate": 2.9764582396161345e-05, + "loss": 0.0012, + "step": 6880 + }, + { + "epoch": 9.014912280701754, + "grad_norm": 0.003984262701123953, + "learning_rate": 2.9727095516569204e-05, + "loss": 0.0001, + "step": 6890 + }, + { + "epoch": 9.015587044534414, + "grad_norm": 0.00039073076914064586, + "learning_rate": 2.9689608636977063e-05, + "loss": 0.0001, + "step": 6900 + }, + { + "epoch": 9.016261808367071, + "grad_norm": 0.005625125020742416, + "learning_rate": 2.9652121757384916e-05, + "loss": 0.0001, + "step": 6910 + }, + { + "epoch": 9.01693657219973, + "grad_norm": 0.0015515730483457446, + "learning_rate": 2.9614634877792775e-05, + "loss": 0.0, + "step": 6920 + }, + { + "epoch": 9.017611336032388, + "grad_norm": 0.0017237714491784573, + "learning_rate": 2.9577147998200634e-05, + "loss": 0.0, + "step": 6930 + }, + { + "epoch": 9.018286099865048, + "grad_norm": 0.008184783160686493, + "learning_rate": 2.9539661118608486e-05, + "loss": 0.0001, + "step": 6940 + }, + { + "epoch": 9.018960863697705, + "grad_norm": 0.002028749557211995, + "learning_rate": 2.9502174239016346e-05, + "loss": 0.0001, + "step": 6950 + }, + { + "epoch": 9.019635627530365, + "grad_norm": 0.0036216990556567907, + "learning_rate": 2.9464687359424205e-05, + "loss": 0.0, + "step": 6960 + }, + { + "epoch": 9.020310391363022, + "grad_norm": 0.0013016269076615572, + "learning_rate": 2.942720047983206e-05, + "loss": 0.0001, + "step": 6970 + }, + { + "epoch": 9.020985155195682, + "grad_norm": 0.00772570027038455, + "learning_rate": 2.9389713600239916e-05, + "loss": 0.0001, + "step": 6980 + }, + { + "epoch": 9.02165991902834, + "grad_norm": 0.0003020280273631215, + "learning_rate": 2.9352226720647776e-05, + "loss": 0.0, + "step": 6990 + }, + { + "epoch": 9.022334682860999, + "grad_norm": 0.0012822924181818962, + "learning_rate": 2.931473984105563e-05, + "loss": 0.0, + "step": 7000 + }, + { + "epoch": 9.023009446693656, + "grad_norm": 0.0010099551873281598, + "learning_rate": 2.927725296146349e-05, + "loss": 0.0001, + "step": 7010 + }, + { + "epoch": 9.023684210526316, + "grad_norm": 0.0024363386910408735, + "learning_rate": 2.9239766081871346e-05, + "loss": 0.0001, + "step": 7020 + }, + { + "epoch": 9.024358974358975, + "grad_norm": 0.0023049945011734962, + "learning_rate": 2.9202279202279202e-05, + "loss": 0.0001, + "step": 7030 + }, + { + "epoch": 9.025033738191633, + "grad_norm": 0.0029273051768541336, + "learning_rate": 2.916479232268706e-05, + "loss": 0.0, + "step": 7040 + }, + { + "epoch": 9.025708502024292, + "grad_norm": 0.003555365838110447, + "learning_rate": 2.9127305443094917e-05, + "loss": 0.0001, + "step": 7050 + }, + { + "epoch": 9.02638326585695, + "grad_norm": 0.0033711865544319153, + "learning_rate": 2.9089818563502773e-05, + "loss": 0.0, + "step": 7060 + }, + { + "epoch": 9.02705802968961, + "grad_norm": 0.00046359331463463604, + "learning_rate": 2.9052331683910632e-05, + "loss": 0.0001, + "step": 7070 + }, + { + "epoch": 9.027732793522267, + "grad_norm": 0.0003137718595098704, + "learning_rate": 2.901484480431849e-05, + "loss": 0.0, + "step": 7080 + }, + { + "epoch": 9.028407557354926, + "grad_norm": 0.0016707087634131312, + "learning_rate": 2.8977357924726343e-05, + "loss": 0.0, + "step": 7090 + }, + { + "epoch": 9.029082321187584, + "grad_norm": 0.0012837687972933054, + "learning_rate": 2.8939871045134203e-05, + "loss": 0.0, + "step": 7100 + }, + { + "epoch": 9.029757085020243, + "grad_norm": 0.00030405522556975484, + "learning_rate": 2.8902384165542062e-05, + "loss": 0.0, + "step": 7110 + }, + { + "epoch": 9.030431848852901, + "grad_norm": 0.000334856566041708, + "learning_rate": 2.886489728594992e-05, + "loss": 0.0, + "step": 7120 + }, + { + "epoch": 9.03110661268556, + "grad_norm": 0.00024141219910234213, + "learning_rate": 2.8827410406357773e-05, + "loss": 0.0, + "step": 7130 + }, + { + "epoch": 9.031781376518218, + "grad_norm": 0.0014251351822167635, + "learning_rate": 2.8789923526765633e-05, + "loss": 0.0001, + "step": 7140 + }, + { + "epoch": 9.032456140350877, + "grad_norm": 0.0001798996381694451, + "learning_rate": 2.875243664717349e-05, + "loss": 0.0, + "step": 7150 + }, + { + "epoch": 9.033130904183535, + "grad_norm": 0.00026806764071807265, + "learning_rate": 2.871494976758135e-05, + "loss": 0.0, + "step": 7160 + }, + { + "epoch": 9.033805668016194, + "grad_norm": 0.001039984286762774, + "learning_rate": 2.8677462887989203e-05, + "loss": 0.0, + "step": 7170 + }, + { + "epoch": 9.034480431848852, + "grad_norm": 0.00029442558297887444, + "learning_rate": 2.8639976008397062e-05, + "loss": 0.0, + "step": 7180 + }, + { + "epoch": 9.035155195681511, + "grad_norm": 0.0010803727200254798, + "learning_rate": 2.860248912880492e-05, + "loss": 0.0, + "step": 7190 + }, + { + "epoch": 9.035829959514171, + "grad_norm": 0.0009579784818924963, + "learning_rate": 2.8565002249212774e-05, + "loss": 0.0, + "step": 7200 + }, + { + "epoch": 9.036504723346829, + "grad_norm": 0.00148207473102957, + "learning_rate": 2.8527515369620633e-05, + "loss": 0.5707, + "step": 7210 + }, + { + "epoch": 9.037179487179488, + "grad_norm": 0.0010521633084863424, + "learning_rate": 2.8490028490028492e-05, + "loss": 0.0627, + "step": 7220 + }, + { + "epoch": 9.037854251012146, + "grad_norm": 0.0016639038221910596, + "learning_rate": 2.845254161043635e-05, + "loss": 0.0205, + "step": 7230 + }, + { + "epoch": 9.038529014844805, + "grad_norm": 0.0019760627765208483, + "learning_rate": 2.8415054730844204e-05, + "loss": 0.0001, + "step": 7240 + }, + { + "epoch": 9.039203778677463, + "grad_norm": 0.0023020838852971792, + "learning_rate": 2.8377567851252063e-05, + "loss": 0.0001, + "step": 7250 + }, + { + "epoch": 9.039878542510122, + "grad_norm": 0.9819605946540833, + "learning_rate": 2.8340080971659922e-05, + "loss": 0.0009, + "step": 7260 + }, + { + "epoch": 9.04055330634278, + "grad_norm": 0.002409159205853939, + "learning_rate": 2.830259409206778e-05, + "loss": 0.6277, + "step": 7270 + }, + { + "epoch": 9.041228070175439, + "grad_norm": 298.6535339355469, + "learning_rate": 2.8265107212475634e-05, + "loss": 0.947, + "step": 7280 + }, + { + "epoch": 9.041902834008097, + "grad_norm": 0.034443099051713943, + "learning_rate": 2.8227620332883493e-05, + "loss": 0.0001, + "step": 7290 + }, + { + "epoch": 9.042577597840756, + "grad_norm": 0.040302518755197525, + "learning_rate": 2.8190133453291352e-05, + "loss": 0.003, + "step": 7300 + }, + { + "epoch": 9.043252361673414, + "grad_norm": 0.0009369853651151061, + "learning_rate": 2.8152646573699204e-05, + "loss": 0.0, + "step": 7310 + }, + { + "epoch": 9.043927125506073, + "grad_norm": 0.0013028283137828112, + "learning_rate": 2.8115159694107064e-05, + "loss": 0.0, + "step": 7320 + }, + { + "epoch": 9.04460188933873, + "grad_norm": 0.001541333505883813, + "learning_rate": 2.8077672814514923e-05, + "loss": 0.0, + "step": 7330 + }, + { + "epoch": 9.04527665317139, + "grad_norm": 0.000400466175051406, + "learning_rate": 2.8040185934922782e-05, + "loss": 0.0006, + "step": 7340 + }, + { + "epoch": 9.045951417004048, + "grad_norm": 0.001137162558734417, + "learning_rate": 2.8002699055330634e-05, + "loss": 0.0002, + "step": 7350 + }, + { + "epoch": 9.046626180836707, + "grad_norm": 0.0009733253973536193, + "learning_rate": 2.7965212175738493e-05, + "loss": 0.0, + "step": 7360 + }, + { + "epoch": 9.047300944669365, + "grad_norm": 0.0002777110203169286, + "learning_rate": 2.792772529614635e-05, + "loss": 0.0, + "step": 7370 + }, + { + "epoch": 9.047975708502024, + "grad_norm": 0.0009547212393954396, + "learning_rate": 2.789023841655421e-05, + "loss": 0.0, + "step": 7380 + }, + { + "epoch": 9.048650472334684, + "grad_norm": 0.0003457583661656827, + "learning_rate": 2.7852751536962064e-05, + "loss": 0.0008, + "step": 7390 + }, + { + "epoch": 9.049325236167341, + "grad_norm": 0.0019107568077743053, + "learning_rate": 2.781526465736992e-05, + "loss": 0.0009, + "step": 7400 + }, + { + "epoch": 9.05, + "grad_norm": 0.0008839426445774734, + "learning_rate": 2.777777777777778e-05, + "loss": 0.0018, + "step": 7410 + }, + { + "epoch": 9.05, + "eval_accuracy": 0.8928571428571429, + "eval_f1": 0.8916871416871418, + "eval_loss": 0.9169295430183411, + "eval_runtime": 70.5688, + "eval_samples_per_second": 1.587, + "eval_steps_per_second": 1.587, + "step": 7410 + }, + { + "epoch": 10.00067476383266, + "grad_norm": 0.0002696131123229861, + "learning_rate": 2.7740290898185638e-05, + "loss": 0.002, + "step": 7420 + }, + { + "epoch": 10.001349527665317, + "grad_norm": 0.00017847323033493012, + "learning_rate": 2.770280401859349e-05, + "loss": 0.0, + "step": 7430 + }, + { + "epoch": 10.002024291497976, + "grad_norm": 0.0010017943568527699, + "learning_rate": 2.766531713900135e-05, + "loss": 0.0, + "step": 7440 + }, + { + "epoch": 10.002699055330634, + "grad_norm": 0.0006036867271177471, + "learning_rate": 2.762783025940921e-05, + "loss": 0.0, + "step": 7450 + }, + { + "epoch": 10.003373819163293, + "grad_norm": 0.00019583333050832152, + "learning_rate": 2.759034337981706e-05, + "loss": 0.1066, + "step": 7460 + }, + { + "epoch": 10.004048582995951, + "grad_norm": 0.08632688224315643, + "learning_rate": 2.755285650022492e-05, + "loss": 0.0003, + "step": 7470 + }, + { + "epoch": 10.00472334682861, + "grad_norm": 0.00013941490033175796, + "learning_rate": 2.751536962063278e-05, + "loss": 0.0, + "step": 7480 + }, + { + "epoch": 10.005398110661268, + "grad_norm": 0.0003023550088983029, + "learning_rate": 2.747788274104064e-05, + "loss": 0.0001, + "step": 7490 + }, + { + "epoch": 10.006072874493928, + "grad_norm": 0.0005739156622439623, + "learning_rate": 2.744039586144849e-05, + "loss": 0.3069, + "step": 7500 + }, + { + "epoch": 10.006747638326585, + "grad_norm": 0.0005304102669470012, + "learning_rate": 2.740290898185635e-05, + "loss": 0.0, + "step": 7510 + }, + { + "epoch": 10.007422402159245, + "grad_norm": 0.0009174313163384795, + "learning_rate": 2.736542210226421e-05, + "loss": 0.0, + "step": 7520 + }, + { + "epoch": 10.008097165991902, + "grad_norm": 0.0004933126620016992, + "learning_rate": 2.732793522267207e-05, + "loss": 0.0, + "step": 7530 + }, + { + "epoch": 10.008771929824562, + "grad_norm": 0.002700564218685031, + "learning_rate": 2.729044834307992e-05, + "loss": 0.0, + "step": 7540 + }, + { + "epoch": 10.00944669365722, + "grad_norm": 0.0008284652722068131, + "learning_rate": 2.725296146348778e-05, + "loss": 0.7602, + "step": 7550 + }, + { + "epoch": 10.010121457489879, + "grad_norm": 0.0005742429639212787, + "learning_rate": 2.721547458389564e-05, + "loss": 0.0013, + "step": 7560 + }, + { + "epoch": 10.010796221322536, + "grad_norm": 0.0001865791855379939, + "learning_rate": 2.7177987704303492e-05, + "loss": 1.0409, + "step": 7570 + }, + { + "epoch": 10.011470985155196, + "grad_norm": 0.0005401599337346852, + "learning_rate": 2.714050082471135e-05, + "loss": 0.0, + "step": 7580 + }, + { + "epoch": 10.012145748987853, + "grad_norm": 10.862272262573242, + "learning_rate": 2.710301394511921e-05, + "loss": 0.6573, + "step": 7590 + }, + { + "epoch": 10.012820512820513, + "grad_norm": 37.7309455871582, + "learning_rate": 2.706552706552707e-05, + "loss": 0.7899, + "step": 7600 + }, + { + "epoch": 10.013495276653172, + "grad_norm": 0.0009414503001607955, + "learning_rate": 2.7028040185934922e-05, + "loss": 0.0011, + "step": 7610 + }, + { + "epoch": 10.01417004048583, + "grad_norm": 0.0004630287585314363, + "learning_rate": 2.699055330634278e-05, + "loss": 0.0001, + "step": 7620 + }, + { + "epoch": 10.01484480431849, + "grad_norm": 0.0013565809931606054, + "learning_rate": 2.695306642675064e-05, + "loss": 0.0001, + "step": 7630 + }, + { + "epoch": 10.015519568151147, + "grad_norm": 0.0022902884520590305, + "learning_rate": 2.69155795471585e-05, + "loss": 0.0, + "step": 7640 + }, + { + "epoch": 10.016194331983806, + "grad_norm": 0.0009432418155483902, + "learning_rate": 2.687809266756635e-05, + "loss": 0.0, + "step": 7650 + }, + { + "epoch": 10.016869095816464, + "grad_norm": 0.0009669333812780678, + "learning_rate": 2.684060578797421e-05, + "loss": 0.0001, + "step": 7660 + }, + { + "epoch": 10.017543859649123, + "grad_norm": 0.0011604432947933674, + "learning_rate": 2.680311890838207e-05, + "loss": 0.0011, + "step": 7670 + }, + { + "epoch": 10.018218623481781, + "grad_norm": 0.0037133977748453617, + "learning_rate": 2.6765632028789922e-05, + "loss": 0.0089, + "step": 7680 + }, + { + "epoch": 10.01889338731444, + "grad_norm": 0.0019840672612190247, + "learning_rate": 2.672814514919778e-05, + "loss": 0.0001, + "step": 7690 + }, + { + "epoch": 10.019568151147098, + "grad_norm": 0.0010515927569940686, + "learning_rate": 2.669065826960564e-05, + "loss": 0.0, + "step": 7700 + }, + { + "epoch": 10.020242914979757, + "grad_norm": 0.00031027224031277, + "learning_rate": 2.66531713900135e-05, + "loss": 0.0001, + "step": 7710 + }, + { + "epoch": 10.020917678812415, + "grad_norm": 0.0026109693571925163, + "learning_rate": 2.6615684510421352e-05, + "loss": 0.0001, + "step": 7720 + }, + { + "epoch": 10.021592442645074, + "grad_norm": 0.001366731128655374, + "learning_rate": 2.657819763082921e-05, + "loss": 0.0, + "step": 7730 + }, + { + "epoch": 10.022267206477732, + "grad_norm": 0.0010099642677232623, + "learning_rate": 2.654071075123707e-05, + "loss": 0.0, + "step": 7740 + }, + { + "epoch": 10.022941970310391, + "grad_norm": 0.0007431610720232129, + "learning_rate": 2.6503223871644926e-05, + "loss": 0.3974, + "step": 7750 + }, + { + "epoch": 10.023616734143049, + "grad_norm": 0.0005235990975052118, + "learning_rate": 2.6465736992052782e-05, + "loss": 0.0001, + "step": 7760 + }, + { + "epoch": 10.024291497975709, + "grad_norm": 0.002703143283724785, + "learning_rate": 2.642825011246064e-05, + "loss": 0.9305, + "step": 7770 + }, + { + "epoch": 10.024966261808368, + "grad_norm": 0.0013169089797884226, + "learning_rate": 2.6390763232868497e-05, + "loss": 0.0061, + "step": 7780 + }, + { + "epoch": 10.025641025641026, + "grad_norm": 0.0006970075191929936, + "learning_rate": 2.6353276353276356e-05, + "loss": 0.0, + "step": 7790 + }, + { + "epoch": 10.026315789473685, + "grad_norm": 0.0022921450436115265, + "learning_rate": 2.6315789473684212e-05, + "loss": 2.1218, + "step": 7800 + }, + { + "epoch": 10.026990553306343, + "grad_norm": 0.015075190924108028, + "learning_rate": 2.6278302594092068e-05, + "loss": 0.0001, + "step": 7810 + }, + { + "epoch": 10.027665317139002, + "grad_norm": 0.0003634750028140843, + "learning_rate": 2.6240815714499927e-05, + "loss": 0.0003, + "step": 7820 + }, + { + "epoch": 10.02834008097166, + "grad_norm": 0.005189963150769472, + "learning_rate": 2.6203328834907783e-05, + "loss": 0.0002, + "step": 7830 + }, + { + "epoch": 10.029014844804319, + "grad_norm": 0.0013347219210118055, + "learning_rate": 2.616584195531564e-05, + "loss": 0.0004, + "step": 7840 + }, + { + "epoch": 10.029689608636977, + "grad_norm": 0.011999278329312801, + "learning_rate": 2.6128355075723498e-05, + "loss": 0.0001, + "step": 7850 + }, + { + "epoch": 10.030364372469636, + "grad_norm": 0.0007896720780991018, + "learning_rate": 2.6090868196131357e-05, + "loss": 0.0001, + "step": 7860 + }, + { + "epoch": 10.031039136302294, + "grad_norm": 0.004586980678141117, + "learning_rate": 2.605338131653921e-05, + "loss": 0.0001, + "step": 7870 + }, + { + "epoch": 10.031713900134953, + "grad_norm": 0.001417971565388143, + "learning_rate": 2.601589443694707e-05, + "loss": 0.0, + "step": 7880 + }, + { + "epoch": 10.03238866396761, + "grad_norm": 0.0019554668106138706, + "learning_rate": 2.5978407557354928e-05, + "loss": 0.0001, + "step": 7890 + }, + { + "epoch": 10.03306342780027, + "grad_norm": 0.028743397444486618, + "learning_rate": 2.5940920677762787e-05, + "loss": 0.0001, + "step": 7900 + }, + { + "epoch": 10.033738191632928, + "grad_norm": 0.0008731328416615725, + "learning_rate": 2.590343379817064e-05, + "loss": 0.0001, + "step": 7910 + }, + { + "epoch": 10.034412955465587, + "grad_norm": 0.0012366612209007144, + "learning_rate": 2.5865946918578498e-05, + "loss": 0.0001, + "step": 7920 + }, + { + "epoch": 10.035087719298245, + "grad_norm": 0.0026165838353335857, + "learning_rate": 2.5828460038986357e-05, + "loss": 0.0001, + "step": 7930 + }, + { + "epoch": 10.035762483130904, + "grad_norm": 0.014659812673926353, + "learning_rate": 2.579097315939421e-05, + "loss": 0.0002, + "step": 7940 + }, + { + "epoch": 10.036437246963562, + "grad_norm": 0.00143991329241544, + "learning_rate": 2.575348627980207e-05, + "loss": 0.0, + "step": 7950 + }, + { + "epoch": 10.037112010796221, + "grad_norm": 0.00752654206007719, + "learning_rate": 2.5715999400209928e-05, + "loss": 0.0001, + "step": 7960 + }, + { + "epoch": 10.03778677462888, + "grad_norm": 0.0011906948639079928, + "learning_rate": 2.5678512520617787e-05, + "loss": 0.0001, + "step": 7970 + }, + { + "epoch": 10.038461538461538, + "grad_norm": 0.004429694265127182, + "learning_rate": 2.564102564102564e-05, + "loss": 0.0001, + "step": 7980 + }, + { + "epoch": 10.039136302294198, + "grad_norm": 0.00023650593357160687, + "learning_rate": 2.56035387614335e-05, + "loss": 0.0001, + "step": 7990 + }, + { + "epoch": 10.039811066126855, + "grad_norm": 0.0007866804371587932, + "learning_rate": 2.5566051881841358e-05, + "loss": 0.0, + "step": 8000 + }, + { + "epoch": 10.040485829959515, + "grad_norm": 0.0013989802682772279, + "learning_rate": 2.5528565002249217e-05, + "loss": 0.0001, + "step": 8010 + }, + { + "epoch": 10.041160593792172, + "grad_norm": 0.0008867586147971451, + "learning_rate": 2.549107812265707e-05, + "loss": 0.2682, + "step": 8020 + }, + { + "epoch": 10.041835357624832, + "grad_norm": 0.001083207200281322, + "learning_rate": 2.545359124306493e-05, + "loss": 0.0, + "step": 8030 + }, + { + "epoch": 10.04251012145749, + "grad_norm": 0.0010164374252781272, + "learning_rate": 2.5416104363472788e-05, + "loss": 0.0014, + "step": 8040 + }, + { + "epoch": 10.043184885290149, + "grad_norm": 0.0032585004810243845, + "learning_rate": 2.5378617483880647e-05, + "loss": 0.0001, + "step": 8050 + }, + { + "epoch": 10.043859649122806, + "grad_norm": 0.0007220272673293948, + "learning_rate": 2.53411306042885e-05, + "loss": 0.0001, + "step": 8060 + }, + { + "epoch": 10.044534412955466, + "grad_norm": 0.0010795597918331623, + "learning_rate": 2.530364372469636e-05, + "loss": 0.0001, + "step": 8070 + }, + { + "epoch": 10.045209176788124, + "grad_norm": 0.0033428198657929897, + "learning_rate": 2.5266156845104218e-05, + "loss": 0.0, + "step": 8080 + }, + { + "epoch": 10.045883940620783, + "grad_norm": 0.0007780479500070214, + "learning_rate": 2.522866996551207e-05, + "loss": 0.0003, + "step": 8090 + }, + { + "epoch": 10.04655870445344, + "grad_norm": 0.002177152084186673, + "learning_rate": 2.519118308591993e-05, + "loss": 1.034, + "step": 8100 + }, + { + "epoch": 10.0472334682861, + "grad_norm": 0.012076308950781822, + "learning_rate": 2.515369620632779e-05, + "loss": 0.0001, + "step": 8110 + }, + { + "epoch": 10.047908232118758, + "grad_norm": 0.00882900319993496, + "learning_rate": 2.5116209326735644e-05, + "loss": 0.0001, + "step": 8120 + }, + { + "epoch": 10.048582995951417, + "grad_norm": 0.0017163383308798075, + "learning_rate": 2.50787224471435e-05, + "loss": 0.0002, + "step": 8130 + }, + { + "epoch": 10.049257759784076, + "grad_norm": 0.07908181846141815, + "learning_rate": 2.504123556755136e-05, + "loss": 0.0002, + "step": 8140 + }, + { + "epoch": 10.049932523616734, + "grad_norm": 0.0007900993805378675, + "learning_rate": 2.5003748687959215e-05, + "loss": 0.0, + "step": 8150 + }, + { + "epoch": 10.05, + "eval_accuracy": 0.8928571428571429, + "eval_f1": 0.8927764491849939, + "eval_loss": 0.6104062795639038, + "eval_runtime": 74.468, + "eval_samples_per_second": 1.504, + "eval_steps_per_second": 1.504, + "step": 8151 + }, + { + "epoch": 11.000607287449393, + "grad_norm": 0.0007240193081088364, + "learning_rate": 2.496626180836707e-05, + "loss": 0.0, + "step": 8160 + }, + { + "epoch": 11.001282051282052, + "grad_norm": 0.0002468556631356478, + "learning_rate": 2.492877492877493e-05, + "loss": 0.11, + "step": 8170 + }, + { + "epoch": 11.00195681511471, + "grad_norm": 0.0006738382508046925, + "learning_rate": 2.4891288049182786e-05, + "loss": 0.0009, + "step": 8180 + }, + { + "epoch": 11.00263157894737, + "grad_norm": 0.0002363823732594028, + "learning_rate": 2.485380116959064e-05, + "loss": 0.0001, + "step": 8190 + }, + { + "epoch": 11.003306342780027, + "grad_norm": 0.01611531712114811, + "learning_rate": 2.48163142899985e-05, + "loss": 0.0001, + "step": 8200 + }, + { + "epoch": 11.003981106612686, + "grad_norm": 0.00017891006427817047, + "learning_rate": 2.4778827410406356e-05, + "loss": 0.0001, + "step": 8210 + }, + { + "epoch": 11.004655870445344, + "grad_norm": 0.0012173757422715425, + "learning_rate": 2.4741340530814216e-05, + "loss": 0.0, + "step": 8220 + }, + { + "epoch": 11.005330634278003, + "grad_norm": 0.00027030581259168684, + "learning_rate": 2.470385365122207e-05, + "loss": 0.0001, + "step": 8230 + }, + { + "epoch": 11.006005398110661, + "grad_norm": 0.0007059440249577165, + "learning_rate": 2.466636677162993e-05, + "loss": 0.0038, + "step": 8240 + }, + { + "epoch": 11.00668016194332, + "grad_norm": 0.0038354801945388317, + "learning_rate": 2.4628879892037786e-05, + "loss": 0.0, + "step": 8250 + }, + { + "epoch": 11.007354925775978, + "grad_norm": 0.002050234004855156, + "learning_rate": 2.4591393012445645e-05, + "loss": 0.0001, + "step": 8260 + }, + { + "epoch": 11.008029689608637, + "grad_norm": 0.0007953056483529508, + "learning_rate": 2.45539061328535e-05, + "loss": 0.0001, + "step": 8270 + }, + { + "epoch": 11.008704453441295, + "grad_norm": 0.0005133861559443176, + "learning_rate": 2.451641925326136e-05, + "loss": 0.0001, + "step": 8280 + }, + { + "epoch": 11.009379217273954, + "grad_norm": 0.00046163739170879126, + "learning_rate": 2.4478932373669216e-05, + "loss": 0.0, + "step": 8290 + }, + { + "epoch": 11.010053981106612, + "grad_norm": 0.0001449552073609084, + "learning_rate": 2.4441445494077075e-05, + "loss": 0.3172, + "step": 8300 + }, + { + "epoch": 11.010728744939271, + "grad_norm": 164.93666076660156, + "learning_rate": 2.440395861448493e-05, + "loss": 0.6368, + "step": 8310 + }, + { + "epoch": 11.011403508771929, + "grad_norm": 0.000476795103168115, + "learning_rate": 2.4366471734892787e-05, + "loss": 0.0, + "step": 8320 + }, + { + "epoch": 11.012078272604588, + "grad_norm": 0.0037983739748597145, + "learning_rate": 2.4328984855300646e-05, + "loss": 0.0002, + "step": 8330 + }, + { + "epoch": 11.012753036437246, + "grad_norm": 0.000796021893620491, + "learning_rate": 2.4291497975708502e-05, + "loss": 0.0, + "step": 8340 + }, + { + "epoch": 11.013427800269906, + "grad_norm": 0.0005037175142206252, + "learning_rate": 2.425401109611636e-05, + "loss": 0.0002, + "step": 8350 + }, + { + "epoch": 11.014102564102565, + "grad_norm": 0.0043189083226025105, + "learning_rate": 2.4216524216524217e-05, + "loss": 0.0001, + "step": 8360 + }, + { + "epoch": 11.014777327935223, + "grad_norm": 0.0015088323270902038, + "learning_rate": 2.4179037336932076e-05, + "loss": 0.0, + "step": 8370 + }, + { + "epoch": 11.015452091767882, + "grad_norm": 0.009932787157595158, + "learning_rate": 2.414155045733993e-05, + "loss": 0.0, + "step": 8380 + }, + { + "epoch": 11.01612685560054, + "grad_norm": 0.0006705676787532866, + "learning_rate": 2.410406357774779e-05, + "loss": 0.0, + "step": 8390 + }, + { + "epoch": 11.016801619433199, + "grad_norm": 0.0004983929102309048, + "learning_rate": 2.4066576698155647e-05, + "loss": 0.0, + "step": 8400 + }, + { + "epoch": 11.017476383265857, + "grad_norm": 0.0002321622014278546, + "learning_rate": 2.4029089818563506e-05, + "loss": 0.0, + "step": 8410 + }, + { + "epoch": 11.018151147098516, + "grad_norm": 0.00045225844951346517, + "learning_rate": 2.399160293897136e-05, + "loss": 0.0001, + "step": 8420 + }, + { + "epoch": 11.018825910931174, + "grad_norm": 0.0006059862207621336, + "learning_rate": 2.395411605937922e-05, + "loss": 0.0, + "step": 8430 + }, + { + "epoch": 11.019500674763833, + "grad_norm": 0.00025944746448658407, + "learning_rate": 2.3916629179787076e-05, + "loss": 0.0, + "step": 8440 + }, + { + "epoch": 11.02017543859649, + "grad_norm": 0.005270655732601881, + "learning_rate": 2.3879142300194932e-05, + "loss": 0.0, + "step": 8450 + }, + { + "epoch": 11.02085020242915, + "grad_norm": 0.0001714004756649956, + "learning_rate": 2.384165542060279e-05, + "loss": 0.0, + "step": 8460 + }, + { + "epoch": 11.021524966261808, + "grad_norm": 0.0004896153695881367, + "learning_rate": 2.3804168541010647e-05, + "loss": 0.0001, + "step": 8470 + }, + { + "epoch": 11.022199730094467, + "grad_norm": 0.0004871699493378401, + "learning_rate": 2.3766681661418506e-05, + "loss": 0.0, + "step": 8480 + }, + { + "epoch": 11.022874493927125, + "grad_norm": 0.00332398503087461, + "learning_rate": 2.3729194781826362e-05, + "loss": 0.0, + "step": 8490 + }, + { + "epoch": 11.023549257759784, + "grad_norm": 0.0004967203130945563, + "learning_rate": 2.369170790223422e-05, + "loss": 0.0, + "step": 8500 + }, + { + "epoch": 11.024224021592442, + "grad_norm": 0.0006828585756011307, + "learning_rate": 2.3654221022642077e-05, + "loss": 0.0, + "step": 8510 + }, + { + "epoch": 11.024898785425101, + "grad_norm": 0.00026437186170369387, + "learning_rate": 2.3616734143049933e-05, + "loss": 0.0923, + "step": 8520 + }, + { + "epoch": 11.025573549257759, + "grad_norm": 0.001157809398137033, + "learning_rate": 2.3579247263457792e-05, + "loss": 0.3747, + "step": 8530 + }, + { + "epoch": 11.026248313090418, + "grad_norm": 0.0006130054825916886, + "learning_rate": 2.3541760383865648e-05, + "loss": 0.1333, + "step": 8540 + }, + { + "epoch": 11.026923076923078, + "grad_norm": 0.004360508639365435, + "learning_rate": 2.3504273504273504e-05, + "loss": 0.0001, + "step": 8550 + }, + { + "epoch": 11.027597840755735, + "grad_norm": 0.0038445070385932922, + "learning_rate": 2.3466786624681363e-05, + "loss": 0.0005, + "step": 8560 + }, + { + "epoch": 11.028272604588395, + "grad_norm": 0.0003999462933279574, + "learning_rate": 2.342929974508922e-05, + "loss": 0.0, + "step": 8570 + }, + { + "epoch": 11.028947368421052, + "grad_norm": 0.0013614681083709002, + "learning_rate": 2.3391812865497074e-05, + "loss": 0.0, + "step": 8580 + }, + { + "epoch": 11.029622132253712, + "grad_norm": 8.867596625350416e-05, + "learning_rate": 2.3354325985904933e-05, + "loss": 0.0, + "step": 8590 + }, + { + "epoch": 11.03029689608637, + "grad_norm": 0.0005633147084154189, + "learning_rate": 2.331683910631279e-05, + "loss": 0.0, + "step": 8600 + }, + { + "epoch": 11.030971659919029, + "grad_norm": 0.0005220117163844407, + "learning_rate": 2.327935222672065e-05, + "loss": 0.0001, + "step": 8610 + }, + { + "epoch": 11.031646423751686, + "grad_norm": 0.0004213712236378342, + "learning_rate": 2.3241865347128504e-05, + "loss": 0.0, + "step": 8620 + }, + { + "epoch": 11.032321187584346, + "grad_norm": 0.00038689616485498846, + "learning_rate": 2.3204378467536363e-05, + "loss": 0.0, + "step": 8630 + }, + { + "epoch": 11.032995951417004, + "grad_norm": 0.00039902018033899367, + "learning_rate": 2.316689158794422e-05, + "loss": 0.0, + "step": 8640 + }, + { + "epoch": 11.033670715249663, + "grad_norm": 0.0026982324197888374, + "learning_rate": 2.3129404708352078e-05, + "loss": 0.0, + "step": 8650 + }, + { + "epoch": 11.03434547908232, + "grad_norm": 0.0001991643221117556, + "learning_rate": 2.3091917828759934e-05, + "loss": 0.0, + "step": 8660 + }, + { + "epoch": 11.03502024291498, + "grad_norm": 0.0019273010548204184, + "learning_rate": 2.3054430949167793e-05, + "loss": 0.0, + "step": 8670 + }, + { + "epoch": 11.035695006747638, + "grad_norm": 0.000698404386639595, + "learning_rate": 2.301694406957565e-05, + "loss": 0.0001, + "step": 8680 + }, + { + "epoch": 11.036369770580297, + "grad_norm": 0.00025344561436213553, + "learning_rate": 2.2979457189983508e-05, + "loss": 0.0, + "step": 8690 + }, + { + "epoch": 11.037044534412955, + "grad_norm": 0.0027119882870465517, + "learning_rate": 2.2941970310391364e-05, + "loss": 0.4804, + "step": 8700 + }, + { + "epoch": 11.037719298245614, + "grad_norm": 0.00020401214715093374, + "learning_rate": 2.290448343079922e-05, + "loss": 0.0, + "step": 8710 + }, + { + "epoch": 11.038394062078273, + "grad_norm": 0.0004772163520101458, + "learning_rate": 2.286699655120708e-05, + "loss": 0.0, + "step": 8720 + }, + { + "epoch": 11.039068825910931, + "grad_norm": 0.0004061859508510679, + "learning_rate": 2.2829509671614935e-05, + "loss": 0.0001, + "step": 8730 + }, + { + "epoch": 11.03974358974359, + "grad_norm": 0.0010080209467560053, + "learning_rate": 2.2792022792022794e-05, + "loss": 0.0, + "step": 8740 + }, + { + "epoch": 11.040418353576248, + "grad_norm": 0.00021367882436607033, + "learning_rate": 2.275453591243065e-05, + "loss": 0.0, + "step": 8750 + }, + { + "epoch": 11.041093117408908, + "grad_norm": 0.002230451675131917, + "learning_rate": 2.271704903283851e-05, + "loss": 0.0, + "step": 8760 + }, + { + "epoch": 11.041767881241565, + "grad_norm": 0.0003300936659798026, + "learning_rate": 2.2679562153246365e-05, + "loss": 0.0, + "step": 8770 + }, + { + "epoch": 11.042442645074225, + "grad_norm": 0.0023498530499637127, + "learning_rate": 2.2642075273654224e-05, + "loss": 0.0001, + "step": 8780 + }, + { + "epoch": 11.043117408906882, + "grad_norm": 0.0011958705727010965, + "learning_rate": 2.260458839406208e-05, + "loss": 0.0001, + "step": 8790 + }, + { + "epoch": 11.043792172739542, + "grad_norm": 0.0022039199247956276, + "learning_rate": 2.256710151446994e-05, + "loss": 0.0, + "step": 8800 + }, + { + "epoch": 11.0444669365722, + "grad_norm": 0.0003688375581987202, + "learning_rate": 2.2529614634877794e-05, + "loss": 0.0059, + "step": 8810 + }, + { + "epoch": 11.045141700404859, + "grad_norm": 0.0007805086788721383, + "learning_rate": 2.2492127755285654e-05, + "loss": 0.0, + "step": 8820 + }, + { + "epoch": 11.045816464237516, + "grad_norm": 0.0009934029076248407, + "learning_rate": 2.245464087569351e-05, + "loss": 0.0, + "step": 8830 + }, + { + "epoch": 11.046491228070176, + "grad_norm": 0.001246001455001533, + "learning_rate": 2.2417153996101365e-05, + "loss": 0.0, + "step": 8840 + }, + { + "epoch": 11.047165991902833, + "grad_norm": 9.812816279008985e-05, + "learning_rate": 2.2379667116509224e-05, + "loss": 0.0, + "step": 8850 + }, + { + "epoch": 11.047840755735493, + "grad_norm": 0.0004926809924654663, + "learning_rate": 2.234218023691708e-05, + "loss": 0.0, + "step": 8860 + }, + { + "epoch": 11.04851551956815, + "grad_norm": 0.0003611448628362268, + "learning_rate": 2.230469335732494e-05, + "loss": 0.0, + "step": 8870 + }, + { + "epoch": 11.04919028340081, + "grad_norm": 0.000520729401614517, + "learning_rate": 2.2267206477732795e-05, + "loss": 0.0, + "step": 8880 + }, + { + "epoch": 11.049865047233467, + "grad_norm": 0.00023293115373235196, + "learning_rate": 2.2229719598140654e-05, + "loss": 0.0, + "step": 8890 + }, + { + "epoch": 11.05, + "eval_accuracy": 0.9196428571428571, + "eval_f1": 0.9207212368977075, + "eval_loss": 0.6125034689903259, + "eval_runtime": 71.2839, + "eval_samples_per_second": 1.571, + "eval_steps_per_second": 1.571, + "step": 8892 + }, + { + "epoch": 12.000539811066126, + "grad_norm": 0.0007233197102323174, + "learning_rate": 2.219223271854851e-05, + "loss": 0.4448, + "step": 8900 + }, + { + "epoch": 12.001214574898786, + "grad_norm": 0.0002516189415473491, + "learning_rate": 2.2154745838956366e-05, + "loss": 0.0354, + "step": 8910 + }, + { + "epoch": 12.001889338731443, + "grad_norm": 0.0003420355205889791, + "learning_rate": 2.2117258959364225e-05, + "loss": 0.0, + "step": 8920 + }, + { + "epoch": 12.002564102564103, + "grad_norm": 0.0004494291788432747, + "learning_rate": 2.207977207977208e-05, + "loss": 0.0, + "step": 8930 + }, + { + "epoch": 12.003238866396762, + "grad_norm": 0.00031234361813403666, + "learning_rate": 2.2042285200179936e-05, + "loss": 0.0, + "step": 8940 + }, + { + "epoch": 12.00391363022942, + "grad_norm": 0.00012721461826004088, + "learning_rate": 2.2004798320587796e-05, + "loss": 0.0838, + "step": 8950 + }, + { + "epoch": 12.004588394062079, + "grad_norm": 0.000489223632030189, + "learning_rate": 2.196731144099565e-05, + "loss": 0.0, + "step": 8960 + }, + { + "epoch": 12.005263157894737, + "grad_norm": 0.0033521486911922693, + "learning_rate": 2.1929824561403507e-05, + "loss": 0.1973, + "step": 8970 + }, + { + "epoch": 12.005937921727396, + "grad_norm": 0.009397713467478752, + "learning_rate": 2.1892337681811366e-05, + "loss": 0.0, + "step": 8980 + }, + { + "epoch": 12.006612685560054, + "grad_norm": 0.006849181838333607, + "learning_rate": 2.1854850802219222e-05, + "loss": 0.0, + "step": 8990 + }, + { + "epoch": 12.007287449392713, + "grad_norm": 0.0006626849644817412, + "learning_rate": 2.181736392262708e-05, + "loss": 0.0, + "step": 9000 + }, + { + "epoch": 12.00796221322537, + "grad_norm": 0.000323317275615409, + "learning_rate": 2.1779877043034937e-05, + "loss": 0.4233, + "step": 9010 + }, + { + "epoch": 12.00863697705803, + "grad_norm": 0.00013916198804508895, + "learning_rate": 2.1742390163442796e-05, + "loss": 0.0006, + "step": 9020 + }, + { + "epoch": 12.009311740890688, + "grad_norm": 0.0004281499423086643, + "learning_rate": 2.1704903283850652e-05, + "loss": 0.0, + "step": 9030 + }, + { + "epoch": 12.009986504723347, + "grad_norm": 0.0038120527751743793, + "learning_rate": 2.166741640425851e-05, + "loss": 0.2496, + "step": 9040 + }, + { + "epoch": 12.010661268556005, + "grad_norm": 0.007827537134289742, + "learning_rate": 2.1629929524666367e-05, + "loss": 0.0, + "step": 9050 + }, + { + "epoch": 12.011336032388664, + "grad_norm": 0.0004882031353190541, + "learning_rate": 2.1592442645074226e-05, + "loss": 0.0236, + "step": 9060 + }, + { + "epoch": 12.012010796221322, + "grad_norm": 0.0006974562420509756, + "learning_rate": 2.1554955765482082e-05, + "loss": 0.0, + "step": 9070 + }, + { + "epoch": 12.012685560053981, + "grad_norm": 0.0007927274564281106, + "learning_rate": 2.151746888588994e-05, + "loss": 0.0, + "step": 9080 + }, + { + "epoch": 12.013360323886639, + "grad_norm": 0.0005972622311674058, + "learning_rate": 2.1479982006297797e-05, + "loss": 0.0, + "step": 9090 + }, + { + "epoch": 12.014035087719298, + "grad_norm": 0.0020678879227489233, + "learning_rate": 2.1442495126705653e-05, + "loss": 0.0, + "step": 9100 + }, + { + "epoch": 12.014709851551958, + "grad_norm": 0.0017037901561707258, + "learning_rate": 2.1405008247113512e-05, + "loss": 0.0, + "step": 9110 + }, + { + "epoch": 12.015384615384615, + "grad_norm": 0.002625885419547558, + "learning_rate": 2.1367521367521368e-05, + "loss": 0.0, + "step": 9120 + }, + { + "epoch": 12.016059379217275, + "grad_norm": 0.00016007163503672928, + "learning_rate": 2.1330034487929227e-05, + "loss": 0.0, + "step": 9130 + }, + { + "epoch": 12.016734143049932, + "grad_norm": 8.975803211797029e-05, + "learning_rate": 2.1292547608337082e-05, + "loss": 0.0, + "step": 9140 + }, + { + "epoch": 12.017408906882592, + "grad_norm": 0.00010270516213495284, + "learning_rate": 2.125506072874494e-05, + "loss": 0.0, + "step": 9150 + }, + { + "epoch": 12.01808367071525, + "grad_norm": 0.0003781057021114975, + "learning_rate": 2.1217573849152797e-05, + "loss": 0.0, + "step": 9160 + }, + { + "epoch": 12.018758434547909, + "grad_norm": 0.00045806102571077645, + "learning_rate": 2.1180086969560657e-05, + "loss": 0.0, + "step": 9170 + }, + { + "epoch": 12.019433198380566, + "grad_norm": 0.00040667993016541004, + "learning_rate": 2.1142600089968512e-05, + "loss": 0.0, + "step": 9180 + }, + { + "epoch": 12.020107962213226, + "grad_norm": 7.579607336083427e-05, + "learning_rate": 2.110511321037637e-05, + "loss": 0.0, + "step": 9190 + }, + { + "epoch": 12.020782726045883, + "grad_norm": 0.0002768370322883129, + "learning_rate": 2.1067626330784227e-05, + "loss": 0.0, + "step": 9200 + }, + { + "epoch": 12.021457489878543, + "grad_norm": 0.0010953324381262064, + "learning_rate": 2.1030139451192083e-05, + "loss": 0.0, + "step": 9210 + }, + { + "epoch": 12.0221322537112, + "grad_norm": 0.00658809207379818, + "learning_rate": 2.0992652571599942e-05, + "loss": 0.0919, + "step": 9220 + }, + { + "epoch": 12.02280701754386, + "grad_norm": 0.0006163925281725824, + "learning_rate": 2.0955165692007798e-05, + "loss": 0.0, + "step": 9230 + }, + { + "epoch": 12.023481781376518, + "grad_norm": 0.000813082791864872, + "learning_rate": 2.0917678812415657e-05, + "loss": 0.0001, + "step": 9240 + }, + { + "epoch": 12.024156545209177, + "grad_norm": 0.00046772375935688615, + "learning_rate": 2.0880191932823513e-05, + "loss": 0.0, + "step": 9250 + }, + { + "epoch": 12.024831309041835, + "grad_norm": 0.0005937941023148596, + "learning_rate": 2.0842705053231372e-05, + "loss": 0.0002, + "step": 9260 + }, + { + "epoch": 12.025506072874494, + "grad_norm": 0.000659748911857605, + "learning_rate": 2.0805218173639228e-05, + "loss": 0.0, + "step": 9270 + }, + { + "epoch": 12.026180836707152, + "grad_norm": 0.0006786544108763337, + "learning_rate": 2.0767731294047084e-05, + "loss": 0.0, + "step": 9280 + }, + { + "epoch": 12.026855600539811, + "grad_norm": 0.000225842886720784, + "learning_rate": 2.0730244414454943e-05, + "loss": 0.0, + "step": 9290 + }, + { + "epoch": 12.02753036437247, + "grad_norm": 0.0006020697182975709, + "learning_rate": 2.06927575348628e-05, + "loss": 0.0, + "step": 9300 + }, + { + "epoch": 12.028205128205128, + "grad_norm": 0.0005702193011529744, + "learning_rate": 2.0655270655270654e-05, + "loss": 0.0, + "step": 9310 + }, + { + "epoch": 12.028879892037788, + "grad_norm": 0.000844390713609755, + "learning_rate": 2.0617783775678514e-05, + "loss": 0.0, + "step": 9320 + }, + { + "epoch": 12.029554655870445, + "grad_norm": 9.666190453572199e-05, + "learning_rate": 2.058029689608637e-05, + "loss": 0.0, + "step": 9330 + }, + { + "epoch": 12.030229419703105, + "grad_norm": 0.0001864578080130741, + "learning_rate": 2.0542810016494225e-05, + "loss": 0.0, + "step": 9340 + }, + { + "epoch": 12.030904183535762, + "grad_norm": 0.00014394025492947549, + "learning_rate": 2.0505323136902084e-05, + "loss": 0.0, + "step": 9350 + }, + { + "epoch": 12.031578947368422, + "grad_norm": 0.00027057836996391416, + "learning_rate": 2.046783625730994e-05, + "loss": 0.0, + "step": 9360 + }, + { + "epoch": 12.03225371120108, + "grad_norm": 0.0004066646215505898, + "learning_rate": 2.04303493777178e-05, + "loss": 0.0, + "step": 9370 + }, + { + "epoch": 12.032928475033739, + "grad_norm": 0.00043117342283949256, + "learning_rate": 2.0392862498125655e-05, + "loss": 0.0, + "step": 9380 + }, + { + "epoch": 12.033603238866396, + "grad_norm": 0.00019329691713210195, + "learning_rate": 2.0355375618533514e-05, + "loss": 0.0001, + "step": 9390 + }, + { + "epoch": 12.034278002699056, + "grad_norm": 0.00036019805702380836, + "learning_rate": 2.031788873894137e-05, + "loss": 0.0, + "step": 9400 + }, + { + "epoch": 12.034952766531713, + "grad_norm": 0.0006936113350093365, + "learning_rate": 2.028040185934923e-05, + "loss": 0.0, + "step": 9410 + }, + { + "epoch": 12.035627530364373, + "grad_norm": 0.00041965124546550214, + "learning_rate": 2.0242914979757085e-05, + "loss": 0.0, + "step": 9420 + }, + { + "epoch": 12.03630229419703, + "grad_norm": 0.00011109585466329008, + "learning_rate": 2.0205428100164944e-05, + "loss": 0.0, + "step": 9430 + }, + { + "epoch": 12.03697705802969, + "grad_norm": 0.000144297766382806, + "learning_rate": 2.01679412205728e-05, + "loss": 0.0281, + "step": 9440 + }, + { + "epoch": 12.037651821862347, + "grad_norm": 0.0002551145735196769, + "learning_rate": 2.013045434098066e-05, + "loss": 0.0, + "step": 9450 + }, + { + "epoch": 12.038326585695007, + "grad_norm": 0.006847582757472992, + "learning_rate": 2.0092967461388515e-05, + "loss": 0.0, + "step": 9460 + }, + { + "epoch": 12.039001349527666, + "grad_norm": 0.00011437670036684722, + "learning_rate": 2.005548058179637e-05, + "loss": 0.0, + "step": 9470 + }, + { + "epoch": 12.039676113360324, + "grad_norm": 0.00040303889545612037, + "learning_rate": 2.001799370220423e-05, + "loss": 0.0, + "step": 9480 + }, + { + "epoch": 12.040350877192983, + "grad_norm": 0.00046083523193374276, + "learning_rate": 1.9980506822612085e-05, + "loss": 0.0, + "step": 9490 + }, + { + "epoch": 12.04102564102564, + "grad_norm": 0.0006515540299005806, + "learning_rate": 1.9943019943019945e-05, + "loss": 0.0, + "step": 9500 + }, + { + "epoch": 12.0417004048583, + "grad_norm": 0.00014752485731150955, + "learning_rate": 1.99055330634278e-05, + "loss": 0.0, + "step": 9510 + }, + { + "epoch": 12.042375168690958, + "grad_norm": 0.0005620931042358279, + "learning_rate": 1.986804618383566e-05, + "loss": 0.0, + "step": 9520 + }, + { + "epoch": 12.043049932523617, + "grad_norm": 0.00011923335841856897, + "learning_rate": 1.9830559304243515e-05, + "loss": 0.0, + "step": 9530 + }, + { + "epoch": 12.043724696356275, + "grad_norm": 0.0002657576696947217, + "learning_rate": 1.9793072424651374e-05, + "loss": 0.0, + "step": 9540 + }, + { + "epoch": 12.044399460188934, + "grad_norm": 0.0001235770614584908, + "learning_rate": 1.975558554505923e-05, + "loss": 0.0, + "step": 9550 + }, + { + "epoch": 12.045074224021592, + "grad_norm": 0.0001751129748299718, + "learning_rate": 1.971809866546709e-05, + "loss": 0.4854, + "step": 9560 + }, + { + "epoch": 12.045748987854251, + "grad_norm": 0.000554791884496808, + "learning_rate": 1.9680611785874945e-05, + "loss": 0.0, + "step": 9570 + }, + { + "epoch": 12.046423751686909, + "grad_norm": 0.0003107208467554301, + "learning_rate": 1.9643124906282804e-05, + "loss": 0.0, + "step": 9580 + }, + { + "epoch": 12.047098515519568, + "grad_norm": 0.0002857028157450259, + "learning_rate": 1.960563802669066e-05, + "loss": 0.0, + "step": 9590 + }, + { + "epoch": 12.047773279352226, + "grad_norm": 0.0001487692934460938, + "learning_rate": 1.9568151147098516e-05, + "loss": 0.0, + "step": 9600 + }, + { + "epoch": 12.048448043184885, + "grad_norm": 0.0004835377912968397, + "learning_rate": 1.9530664267506375e-05, + "loss": 0.0, + "step": 9610 + }, + { + "epoch": 12.049122807017543, + "grad_norm": 0.004288305062800646, + "learning_rate": 1.949317738791423e-05, + "loss": 0.0, + "step": 9620 + }, + { + "epoch": 12.049797570850203, + "grad_norm": 0.0002630397502798587, + "learning_rate": 1.945569050832209e-05, + "loss": 0.0, + "step": 9630 + }, + { + "epoch": 12.05, + "eval_accuracy": 0.9285714285714286, + "eval_f1": 0.9281167328042328, + "eval_loss": 0.5643919110298157, + "eval_runtime": 75.5753, + "eval_samples_per_second": 1.482, + "eval_steps_per_second": 1.482, + "step": 9633 + }, + { + "epoch": 13.000472334682861, + "grad_norm": 0.00026892725145444274, + "learning_rate": 1.9418203628729946e-05, + "loss": 0.0, + "step": 9640 + }, + { + "epoch": 13.001147098515519, + "grad_norm": 0.00012843680451624095, + "learning_rate": 1.9380716749137805e-05, + "loss": 0.0, + "step": 9650 + }, + { + "epoch": 13.001821862348178, + "grad_norm": 0.00029701701714657247, + "learning_rate": 1.934322986954566e-05, + "loss": 0.0, + "step": 9660 + }, + { + "epoch": 13.002496626180836, + "grad_norm": 0.00036974012618884444, + "learning_rate": 1.9305742989953516e-05, + "loss": 0.0, + "step": 9670 + }, + { + "epoch": 13.003171390013495, + "grad_norm": 0.0001296445552725345, + "learning_rate": 1.9268256110361376e-05, + "loss": 0.0078, + "step": 9680 + }, + { + "epoch": 13.003846153846155, + "grad_norm": 0.0002359377540415153, + "learning_rate": 1.923076923076923e-05, + "loss": 0.0, + "step": 9690 + }, + { + "epoch": 13.004520917678812, + "grad_norm": 0.0003535948053468019, + "learning_rate": 1.9193282351177087e-05, + "loss": 0.0, + "step": 9700 + }, + { + "epoch": 13.005195681511472, + "grad_norm": 0.00025236004148609936, + "learning_rate": 1.9155795471584946e-05, + "loss": 0.0, + "step": 9710 + }, + { + "epoch": 13.00587044534413, + "grad_norm": 0.0002863478730432689, + "learning_rate": 1.9118308591992802e-05, + "loss": 0.0, + "step": 9720 + }, + { + "epoch": 13.006545209176789, + "grad_norm": 0.00016143821994774044, + "learning_rate": 1.9080821712400658e-05, + "loss": 0.3645, + "step": 9730 + }, + { + "epoch": 13.007219973009446, + "grad_norm": 0.0004113702161703259, + "learning_rate": 1.9043334832808517e-05, + "loss": 0.0, + "step": 9740 + }, + { + "epoch": 13.007894736842106, + "grad_norm": 0.0008134804083965719, + "learning_rate": 1.9005847953216373e-05, + "loss": 0.0, + "step": 9750 + }, + { + "epoch": 13.008569500674763, + "grad_norm": 0.00027760997181758285, + "learning_rate": 1.8968361073624232e-05, + "loss": 0.0, + "step": 9760 + }, + { + "epoch": 13.009244264507423, + "grad_norm": 0.0016426608199253678, + "learning_rate": 1.8930874194032088e-05, + "loss": 0.0, + "step": 9770 + }, + { + "epoch": 13.00991902834008, + "grad_norm": 0.0008006367716006935, + "learning_rate": 1.8893387314439947e-05, + "loss": 0.0, + "step": 9780 + }, + { + "epoch": 13.01059379217274, + "grad_norm": 0.00025531640858389437, + "learning_rate": 1.8855900434847803e-05, + "loss": 0.0, + "step": 9790 + }, + { + "epoch": 13.011268556005398, + "grad_norm": 0.0003084157651755959, + "learning_rate": 1.8818413555255662e-05, + "loss": 0.0, + "step": 9800 + }, + { + "epoch": 13.011943319838057, + "grad_norm": 0.0007207695161923766, + "learning_rate": 1.8780926675663518e-05, + "loss": 0.0001, + "step": 9810 + }, + { + "epoch": 13.012618083670715, + "grad_norm": 0.00012202781363157555, + "learning_rate": 1.8743439796071377e-05, + "loss": 0.0, + "step": 9820 + }, + { + "epoch": 13.013292847503374, + "grad_norm": 0.0012473361566662788, + "learning_rate": 1.8705952916479233e-05, + "loss": 0.0, + "step": 9830 + }, + { + "epoch": 13.013967611336032, + "grad_norm": 0.0007895145681686699, + "learning_rate": 1.8668466036887092e-05, + "loss": 0.0, + "step": 9840 + }, + { + "epoch": 13.014642375168691, + "grad_norm": 0.0002717502065934241, + "learning_rate": 1.8630979157294948e-05, + "loss": 0.0, + "step": 9850 + }, + { + "epoch": 13.015317139001349, + "grad_norm": 0.0002320138446521014, + "learning_rate": 1.8593492277702803e-05, + "loss": 0.0, + "step": 9860 + }, + { + "epoch": 13.015991902834008, + "grad_norm": 0.0002716576855164021, + "learning_rate": 1.8556005398110663e-05, + "loss": 0.0, + "step": 9870 + }, + { + "epoch": 13.016666666666667, + "grad_norm": 7.131123129511252e-05, + "learning_rate": 1.8518518518518518e-05, + "loss": 0.0, + "step": 9880 + }, + { + "epoch": 13.017341430499325, + "grad_norm": 0.00045431696344166994, + "learning_rate": 1.8481031638926377e-05, + "loss": 0.0, + "step": 9890 + }, + { + "epoch": 13.018016194331985, + "grad_norm": 0.00013243043213151395, + "learning_rate": 1.8443544759334233e-05, + "loss": 0.0, + "step": 9900 + }, + { + "epoch": 13.018690958164642, + "grad_norm": 0.00031196267809718847, + "learning_rate": 1.8406057879742092e-05, + "loss": 0.0, + "step": 9910 + }, + { + "epoch": 13.019365721997302, + "grad_norm": 0.000940505473408848, + "learning_rate": 1.8368571000149948e-05, + "loss": 0.0, + "step": 9920 + }, + { + "epoch": 13.02004048582996, + "grad_norm": 0.0002774264430627227, + "learning_rate": 1.8331084120557807e-05, + "loss": 0.0, + "step": 9930 + }, + { + "epoch": 13.020715249662619, + "grad_norm": 0.0002633021795190871, + "learning_rate": 1.8293597240965663e-05, + "loss": 0.0, + "step": 9940 + }, + { + "epoch": 13.021390013495276, + "grad_norm": 7.044156518531963e-05, + "learning_rate": 1.8256110361373522e-05, + "loss": 0.0, + "step": 9950 + }, + { + "epoch": 13.022064777327936, + "grad_norm": 0.00017661662423051894, + "learning_rate": 1.8218623481781378e-05, + "loss": 0.0, + "step": 9960 + }, + { + "epoch": 13.022739541160593, + "grad_norm": 0.00028747491887770593, + "learning_rate": 1.8181136602189237e-05, + "loss": 0.0, + "step": 9970 + }, + { + "epoch": 13.023414304993253, + "grad_norm": 0.00039829890010878444, + "learning_rate": 1.8143649722597093e-05, + "loss": 0.0, + "step": 9980 + }, + { + "epoch": 13.02408906882591, + "grad_norm": 0.00022789667127653956, + "learning_rate": 1.810616284300495e-05, + "loss": 0.0, + "step": 9990 + }, + { + "epoch": 13.02476383265857, + "grad_norm": 0.00028411843231879175, + "learning_rate": 1.8068675963412808e-05, + "loss": 0.0, + "step": 10000 + }, + { + "epoch": 13.025438596491227, + "grad_norm": 0.0002080064732581377, + "learning_rate": 1.8031189083820664e-05, + "loss": 0.0, + "step": 10010 + }, + { + "epoch": 13.026113360323887, + "grad_norm": 0.00023453705944120884, + "learning_rate": 1.7993702204228523e-05, + "loss": 0.0096, + "step": 10020 + }, + { + "epoch": 13.026788124156544, + "grad_norm": 0.00010610045865178108, + "learning_rate": 1.795621532463638e-05, + "loss": 0.0, + "step": 10030 + }, + { + "epoch": 13.027462887989204, + "grad_norm": 0.0001514716714154929, + "learning_rate": 1.7918728445044234e-05, + "loss": 0.0, + "step": 10040 + }, + { + "epoch": 13.028137651821863, + "grad_norm": 0.00033169661764986813, + "learning_rate": 1.7881241565452094e-05, + "loss": 0.0, + "step": 10050 + }, + { + "epoch": 13.02881241565452, + "grad_norm": 0.00013784744078293443, + "learning_rate": 1.784375468585995e-05, + "loss": 0.0, + "step": 10060 + }, + { + "epoch": 13.02948717948718, + "grad_norm": 8.872824400896206e-05, + "learning_rate": 1.7806267806267805e-05, + "loss": 0.0, + "step": 10070 + }, + { + "epoch": 13.030161943319838, + "grad_norm": 0.00037344591692090034, + "learning_rate": 1.7768780926675664e-05, + "loss": 0.0, + "step": 10080 + }, + { + "epoch": 13.030836707152497, + "grad_norm": 0.0003687291464302689, + "learning_rate": 1.773129404708352e-05, + "loss": 0.0, + "step": 10090 + }, + { + "epoch": 13.031511470985155, + "grad_norm": 0.00017588827176950872, + "learning_rate": 1.769380716749138e-05, + "loss": 0.0, + "step": 10100 + }, + { + "epoch": 13.032186234817814, + "grad_norm": 0.00026350162806920707, + "learning_rate": 1.7656320287899235e-05, + "loss": 0.0, + "step": 10110 + }, + { + "epoch": 13.032860998650472, + "grad_norm": 9.849424532148987e-05, + "learning_rate": 1.761883340830709e-05, + "loss": 0.0, + "step": 10120 + }, + { + "epoch": 13.033535762483131, + "grad_norm": 0.00028973835287615657, + "learning_rate": 1.758134652871495e-05, + "loss": 0.0, + "step": 10130 + }, + { + "epoch": 13.034210526315789, + "grad_norm": 0.00022602990793529898, + "learning_rate": 1.7543859649122806e-05, + "loss": 0.0, + "step": 10140 + }, + { + "epoch": 13.034885290148448, + "grad_norm": 0.000543447386007756, + "learning_rate": 1.7506372769530665e-05, + "loss": 0.0, + "step": 10150 + }, + { + "epoch": 13.035560053981106, + "grad_norm": 0.0006508603109978139, + "learning_rate": 1.746888588993852e-05, + "loss": 0.0, + "step": 10160 + }, + { + "epoch": 13.036234817813765, + "grad_norm": 6.645211396971717e-05, + "learning_rate": 1.743139901034638e-05, + "loss": 0.4286, + "step": 10170 + }, + { + "epoch": 13.036909581646423, + "grad_norm": 0.00017078538076020777, + "learning_rate": 1.7393912130754236e-05, + "loss": 0.0, + "step": 10180 + }, + { + "epoch": 13.037584345479083, + "grad_norm": 0.0010123905958607793, + "learning_rate": 1.7356425251162095e-05, + "loss": 0.0, + "step": 10190 + }, + { + "epoch": 13.03825910931174, + "grad_norm": 0.00027252710424363613, + "learning_rate": 1.731893837156995e-05, + "loss": 0.0, + "step": 10200 + }, + { + "epoch": 13.0389338731444, + "grad_norm": 0.00013458417379297316, + "learning_rate": 1.728145149197781e-05, + "loss": 0.0, + "step": 10210 + }, + { + "epoch": 13.039608636977057, + "grad_norm": 0.00022678014647681266, + "learning_rate": 1.7243964612385665e-05, + "loss": 0.0, + "step": 10220 + }, + { + "epoch": 13.040283400809717, + "grad_norm": 0.00022790237562730908, + "learning_rate": 1.720647773279352e-05, + "loss": 0.0, + "step": 10230 + }, + { + "epoch": 13.040958164642376, + "grad_norm": 0.0002460694231558591, + "learning_rate": 1.716899085320138e-05, + "loss": 0.0, + "step": 10240 + }, + { + "epoch": 13.041632928475034, + "grad_norm": 0.00018956181884277612, + "learning_rate": 1.7131503973609236e-05, + "loss": 0.0, + "step": 10250 + }, + { + "epoch": 13.042307692307693, + "grad_norm": 0.00017144810408353806, + "learning_rate": 1.7094017094017095e-05, + "loss": 0.0, + "step": 10260 + }, + { + "epoch": 13.04298245614035, + "grad_norm": 0.0002925437001977116, + "learning_rate": 1.705653021442495e-05, + "loss": 0.0, + "step": 10270 + }, + { + "epoch": 13.04365721997301, + "grad_norm": 0.0002330515708308667, + "learning_rate": 1.701904333483281e-05, + "loss": 0.013, + "step": 10280 + }, + { + "epoch": 13.044331983805668, + "grad_norm": 0.00011631449160631746, + "learning_rate": 1.6981556455240666e-05, + "loss": 0.0, + "step": 10290 + }, + { + "epoch": 13.045006747638327, + "grad_norm": 0.0003174786688759923, + "learning_rate": 1.6944069575648525e-05, + "loss": 0.0, + "step": 10300 + }, + { + "epoch": 13.045681511470985, + "grad_norm": 0.0001684718154137954, + "learning_rate": 1.690658269605638e-05, + "loss": 0.0, + "step": 10310 + }, + { + "epoch": 13.046356275303644, + "grad_norm": 0.001750526949763298, + "learning_rate": 1.686909581646424e-05, + "loss": 0.0, + "step": 10320 + }, + { + "epoch": 13.047031039136302, + "grad_norm": 0.00024045804457273334, + "learning_rate": 1.6831608936872096e-05, + "loss": 0.0, + "step": 10330 + }, + { + "epoch": 13.047705802968961, + "grad_norm": 0.0006596571765840054, + "learning_rate": 1.6794122057279955e-05, + "loss": 0.0, + "step": 10340 + }, + { + "epoch": 13.048380566801619, + "grad_norm": 0.001252808142453432, + "learning_rate": 1.675663517768781e-05, + "loss": 0.3996, + "step": 10350 + }, + { + "epoch": 13.049055330634278, + "grad_norm": 0.0002453498891554773, + "learning_rate": 1.6719148298095667e-05, + "loss": 0.0, + "step": 10360 + }, + { + "epoch": 13.049730094466936, + "grad_norm": 0.0005040777614340186, + "learning_rate": 1.6681661418503526e-05, + "loss": 0.0, + "step": 10370 + }, + { + "epoch": 13.05, + "eval_accuracy": 0.9285714285714286, + "eval_f1": 0.9285714285714286, + "eval_loss": 0.5062018632888794, + "eval_runtime": 72.8565, + "eval_samples_per_second": 1.537, + "eval_steps_per_second": 1.537, + "step": 10374 + }, + { + "epoch": 14.000404858299595, + "grad_norm": 6.942117033759132e-05, + "learning_rate": 1.664417453891138e-05, + "loss": 0.0, + "step": 10380 + }, + { + "epoch": 14.001079622132254, + "grad_norm": 0.0004584739508572966, + "learning_rate": 1.660668765931924e-05, + "loss": 0.0, + "step": 10390 + }, + { + "epoch": 14.001754385964912, + "grad_norm": 0.0002316083264304325, + "learning_rate": 1.6569200779727097e-05, + "loss": 0.2714, + "step": 10400 + }, + { + "epoch": 14.002429149797571, + "grad_norm": 0.00024051779473666102, + "learning_rate": 1.6531713900134956e-05, + "loss": 0.0, + "step": 10410 + }, + { + "epoch": 14.003103913630229, + "grad_norm": 0.0008334843441843987, + "learning_rate": 1.649422702054281e-05, + "loss": 0.0, + "step": 10420 + }, + { + "epoch": 14.003778677462888, + "grad_norm": 0.00020968765602447093, + "learning_rate": 1.6456740140950667e-05, + "loss": 0.0178, + "step": 10430 + }, + { + "epoch": 14.004453441295546, + "grad_norm": 0.00022330092906486243, + "learning_rate": 1.6419253261358526e-05, + "loss": 0.0, + "step": 10440 + }, + { + "epoch": 14.005128205128205, + "grad_norm": 0.00021671153081115335, + "learning_rate": 1.6381766381766382e-05, + "loss": 0.009, + "step": 10450 + }, + { + "epoch": 14.005802968960865, + "grad_norm": 0.00033940834691748023, + "learning_rate": 1.6344279502174238e-05, + "loss": 0.0, + "step": 10460 + }, + { + "epoch": 14.006477732793522, + "grad_norm": 0.00048104580491781235, + "learning_rate": 1.6306792622582097e-05, + "loss": 0.0, + "step": 10470 + }, + { + "epoch": 14.007152496626182, + "grad_norm": 0.00029779202304780483, + "learning_rate": 1.6269305742989953e-05, + "loss": 0.0, + "step": 10480 + }, + { + "epoch": 14.00782726045884, + "grad_norm": 0.0004120915837120265, + "learning_rate": 1.623181886339781e-05, + "loss": 0.0, + "step": 10490 + }, + { + "epoch": 14.008502024291499, + "grad_norm": 0.0003056660061702132, + "learning_rate": 1.6194331983805668e-05, + "loss": 0.0, + "step": 10500 + }, + { + "epoch": 14.009176788124156, + "grad_norm": 0.000378406752133742, + "learning_rate": 1.6156845104213524e-05, + "loss": 0.0039, + "step": 10510 + }, + { + "epoch": 14.009851551956816, + "grad_norm": 0.0005049049505032599, + "learning_rate": 1.6119358224621383e-05, + "loss": 0.0, + "step": 10520 + }, + { + "epoch": 14.010526315789473, + "grad_norm": 0.00025037440354935825, + "learning_rate": 1.608187134502924e-05, + "loss": 0.0, + "step": 10530 + }, + { + "epoch": 14.011201079622133, + "grad_norm": 0.00037562023499049246, + "learning_rate": 1.6044384465437098e-05, + "loss": 0.0, + "step": 10540 + }, + { + "epoch": 14.01187584345479, + "grad_norm": 0.0003121852350886911, + "learning_rate": 1.6006897585844954e-05, + "loss": 0.0, + "step": 10550 + }, + { + "epoch": 14.01255060728745, + "grad_norm": 0.0003679589426610619, + "learning_rate": 1.5969410706252813e-05, + "loss": 0.0, + "step": 10560 + }, + { + "epoch": 14.013225371120107, + "grad_norm": 0.00028154728352092206, + "learning_rate": 1.593192382666067e-05, + "loss": 0.0, + "step": 10570 + }, + { + "epoch": 14.013900134952767, + "grad_norm": 0.00020654525724239647, + "learning_rate": 1.5894436947068528e-05, + "loss": 0.0, + "step": 10580 + }, + { + "epoch": 14.014574898785424, + "grad_norm": 0.00034096045419573784, + "learning_rate": 1.5856950067476383e-05, + "loss": 0.0, + "step": 10590 + }, + { + "epoch": 14.015249662618084, + "grad_norm": 0.00026030451408587396, + "learning_rate": 1.5819463187884243e-05, + "loss": 0.0, + "step": 10600 + }, + { + "epoch": 14.015924426450741, + "grad_norm": 8.031875040614977e-05, + "learning_rate": 1.57819763082921e-05, + "loss": 0.0, + "step": 10610 + }, + { + "epoch": 14.0165991902834, + "grad_norm": 0.000621096114628017, + "learning_rate": 1.5744489428699954e-05, + "loss": 0.0, + "step": 10620 + }, + { + "epoch": 14.01727395411606, + "grad_norm": 0.000524580420460552, + "learning_rate": 1.5707002549107813e-05, + "loss": 0.0, + "step": 10630 + }, + { + "epoch": 14.017948717948718, + "grad_norm": 0.00011200064182048663, + "learning_rate": 1.566951566951567e-05, + "loss": 0.0, + "step": 10640 + }, + { + "epoch": 14.018623481781377, + "grad_norm": 0.00032178129185922444, + "learning_rate": 1.5632028789923528e-05, + "loss": 0.0, + "step": 10650 + }, + { + "epoch": 14.019298245614035, + "grad_norm": 0.00024140749883372337, + "learning_rate": 1.5594541910331384e-05, + "loss": 0.0, + "step": 10660 + }, + { + "epoch": 14.019973009446694, + "grad_norm": 0.00022133818129077554, + "learning_rate": 1.5557055030739243e-05, + "loss": 0.0, + "step": 10670 + }, + { + "epoch": 14.020647773279352, + "grad_norm": 0.0002797930792439729, + "learning_rate": 1.55195681511471e-05, + "loss": 0.0, + "step": 10680 + }, + { + "epoch": 14.021322537112011, + "grad_norm": 0.0002334755117772147, + "learning_rate": 1.5482081271554958e-05, + "loss": 0.0, + "step": 10690 + }, + { + "epoch": 14.021997300944669, + "grad_norm": 0.0002469551400281489, + "learning_rate": 1.5444594391962814e-05, + "loss": 0.0, + "step": 10700 + }, + { + "epoch": 14.022672064777328, + "grad_norm": 8.5323081293609e-05, + "learning_rate": 1.5407107512370673e-05, + "loss": 0.0, + "step": 10710 + }, + { + "epoch": 14.023346828609986, + "grad_norm": 0.00019482328207232058, + "learning_rate": 1.536962063277853e-05, + "loss": 0.0, + "step": 10720 + }, + { + "epoch": 14.024021592442645, + "grad_norm": 0.00021449346968438476, + "learning_rate": 1.5332133753186388e-05, + "loss": 0.4463, + "step": 10730 + }, + { + "epoch": 14.024696356275303, + "grad_norm": 0.00064310641027987, + "learning_rate": 1.5294646873594244e-05, + "loss": 0.0, + "step": 10740 + }, + { + "epoch": 14.025371120107962, + "grad_norm": 0.00020890735322609544, + "learning_rate": 1.52571599940021e-05, + "loss": 0.0, + "step": 10750 + }, + { + "epoch": 14.02604588394062, + "grad_norm": 0.0005201689782552421, + "learning_rate": 1.5219673114409957e-05, + "loss": 0.0, + "step": 10760 + }, + { + "epoch": 14.02672064777328, + "grad_norm": 0.0005751597345806658, + "learning_rate": 1.5182186234817813e-05, + "loss": 0.0, + "step": 10770 + }, + { + "epoch": 14.027395411605937, + "grad_norm": 0.0009388537146151066, + "learning_rate": 1.5144699355225672e-05, + "loss": 0.0, + "step": 10780 + }, + { + "epoch": 14.028070175438597, + "grad_norm": 0.0005402613314799964, + "learning_rate": 1.5107212475633528e-05, + "loss": 0.0, + "step": 10790 + }, + { + "epoch": 14.028744939271254, + "grad_norm": 0.00010339209256926551, + "learning_rate": 1.5069725596041387e-05, + "loss": 0.0, + "step": 10800 + }, + { + "epoch": 14.029419703103914, + "grad_norm": 0.0005152708035893738, + "learning_rate": 1.5032238716449243e-05, + "loss": 0.0, + "step": 10810 + }, + { + "epoch": 14.030094466936573, + "grad_norm": 0.0007186134462244809, + "learning_rate": 1.4994751836857102e-05, + "loss": 0.0, + "step": 10820 + }, + { + "epoch": 14.03076923076923, + "grad_norm": 0.0005925975274294615, + "learning_rate": 1.4957264957264958e-05, + "loss": 0.0, + "step": 10830 + }, + { + "epoch": 14.03144399460189, + "grad_norm": 0.00019110101857222617, + "learning_rate": 1.4919778077672817e-05, + "loss": 0.0, + "step": 10840 + }, + { + "epoch": 14.032118758434548, + "grad_norm": 0.00018360813555773348, + "learning_rate": 1.4882291198080673e-05, + "loss": 0.0, + "step": 10850 + }, + { + "epoch": 14.032793522267207, + "grad_norm": 0.00020973542996216565, + "learning_rate": 1.4844804318488532e-05, + "loss": 0.0, + "step": 10860 + }, + { + "epoch": 14.033468286099865, + "grad_norm": 0.0007199271931312978, + "learning_rate": 1.4807317438896387e-05, + "loss": 0.0, + "step": 10870 + }, + { + "epoch": 14.034143049932524, + "grad_norm": 9.265208791475743e-05, + "learning_rate": 1.4769830559304243e-05, + "loss": 0.0, + "step": 10880 + }, + { + "epoch": 14.034817813765182, + "grad_norm": 8.818476635497063e-05, + "learning_rate": 1.4732343679712102e-05, + "loss": 0.0, + "step": 10890 + }, + { + "epoch": 14.035492577597841, + "grad_norm": 0.00018744076078291982, + "learning_rate": 1.4694856800119958e-05, + "loss": 0.0, + "step": 10900 + }, + { + "epoch": 14.036167341430499, + "grad_norm": 0.0003558373427949846, + "learning_rate": 1.4657369920527816e-05, + "loss": 0.0, + "step": 10910 + }, + { + "epoch": 14.036842105263158, + "grad_norm": 0.00015756840002723038, + "learning_rate": 1.4619883040935673e-05, + "loss": 0.0, + "step": 10920 + }, + { + "epoch": 14.037516869095816, + "grad_norm": 0.00011693660053424537, + "learning_rate": 1.458239616134353e-05, + "loss": 0.0, + "step": 10930 + }, + { + "epoch": 14.038191632928475, + "grad_norm": 0.00013403450429905206, + "learning_rate": 1.4544909281751386e-05, + "loss": 0.0, + "step": 10940 + }, + { + "epoch": 14.038866396761133, + "grad_norm": 0.00014881876995787024, + "learning_rate": 1.4507422402159246e-05, + "loss": 0.0, + "step": 10950 + }, + { + "epoch": 14.039541160593792, + "grad_norm": 0.00014527350140269846, + "learning_rate": 1.4469935522567101e-05, + "loss": 0.0, + "step": 10960 + }, + { + "epoch": 14.04021592442645, + "grad_norm": 0.00016278887051157653, + "learning_rate": 1.443244864297496e-05, + "loss": 0.0, + "step": 10970 + }, + { + "epoch": 14.04089068825911, + "grad_norm": 8.402692037634552e-05, + "learning_rate": 1.4394961763382816e-05, + "loss": 0.0, + "step": 10980 + }, + { + "epoch": 14.041565452091769, + "grad_norm": 0.00017224009206984192, + "learning_rate": 1.4357474883790675e-05, + "loss": 0.0, + "step": 10990 + }, + { + "epoch": 14.042240215924426, + "grad_norm": 0.0005430065211839974, + "learning_rate": 1.4319988004198531e-05, + "loss": 0.0, + "step": 11000 + }, + { + "epoch": 14.042914979757086, + "grad_norm": 0.0009919034782797098, + "learning_rate": 1.4282501124606387e-05, + "loss": 0.0, + "step": 11010 + }, + { + "epoch": 14.043589743589743, + "grad_norm": 0.0003526155778672546, + "learning_rate": 1.4245014245014246e-05, + "loss": 0.0, + "step": 11020 + }, + { + "epoch": 14.044264507422403, + "grad_norm": 9.54778806772083e-05, + "learning_rate": 1.4207527365422102e-05, + "loss": 0.0, + "step": 11030 + }, + { + "epoch": 14.04493927125506, + "grad_norm": 0.0001671431091381237, + "learning_rate": 1.4170040485829961e-05, + "loss": 0.0, + "step": 11040 + }, + { + "epoch": 14.04561403508772, + "grad_norm": 0.00022146198898553848, + "learning_rate": 1.4132553606237817e-05, + "loss": 0.3607, + "step": 11050 + }, + { + "epoch": 14.046288798920378, + "grad_norm": 0.0001517270429758355, + "learning_rate": 1.4095066726645676e-05, + "loss": 0.0, + "step": 11060 + }, + { + "epoch": 14.046963562753037, + "grad_norm": 0.0006123693310655653, + "learning_rate": 1.4057579847053532e-05, + "loss": 0.0, + "step": 11070 + }, + { + "epoch": 14.047638326585695, + "grad_norm": 0.001610752660781145, + "learning_rate": 1.4020092967461391e-05, + "loss": 0.0, + "step": 11080 + }, + { + "epoch": 14.048313090418354, + "grad_norm": 0.0001440331107005477, + "learning_rate": 1.3982606087869247e-05, + "loss": 0.0, + "step": 11090 + }, + { + "epoch": 14.048987854251012, + "grad_norm": 0.0007454275619238615, + "learning_rate": 1.3945119208277104e-05, + "loss": 0.0, + "step": 11100 + }, + { + "epoch": 14.049662618083671, + "grad_norm": 0.0003447613853495568, + "learning_rate": 1.390763232868496e-05, + "loss": 0.0, + "step": 11110 + }, + { + "epoch": 14.05, + "eval_accuracy": 0.9375, + "eval_f1": 0.9373365167161658, + "eval_loss": 0.5185861587524414, + "eval_runtime": 73.7028, + "eval_samples_per_second": 1.52, + "eval_steps_per_second": 1.52, + "step": 11115 + }, + { + "epoch": 15.001349527665317, + "eval_accuracy": 0.9023255813953488, + "eval_f1": 0.9016146713373171, + "eval_loss": 0.7568970918655396, + "eval_runtime": 137.2112, + "eval_samples_per_second": 1.567, + "eval_steps_per_second": 1.567, + "step": 11116 + }, + { + "epoch": 15.001349527665317, + "step": 11116, + "total_flos": 2.8480212872085897e+19, + "train_loss": 5.147429101647338e-09, + "train_runtime": 143.5742, + "train_samples_per_second": 5.161, + "train_steps_per_second": 5.161 + }, + { + "epoch": 15.001349527665317, + "eval_accuracy": 0.9375, + "eval_f1": 0.9373365167161658, + "eval_loss": 0.5185860991477966, + "eval_runtime": 72.3734, + "eval_samples_per_second": 1.548, + "eval_steps_per_second": 1.548, + "step": 11116 + }, + { + "epoch": 15.001349527665317, + "eval_accuracy": 0.9023255813953488, + "eval_f1": 0.9016146713373171, + "eval_loss": 0.756963849067688, + "eval_runtime": 137.6677, + "eval_samples_per_second": 1.562, + "eval_steps_per_second": 1.562, + "step": 11116 + } + ], + "logging_steps": 10, + "max_steps": 741, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.8480212872085897e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}