diff --git "a/checkpoint-14620/trainer_state.json" "b/checkpoint-14620/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-14620/trainer_state.json" @@ -0,0 +1,4289 @@ +{ + "best_metric": 0.22053596377372742, + "best_model_checkpoint": "autotrain-beit-base-patch16-224/checkpoint-14620", + "epoch": 10.0, + "eval_steps": 500, + "global_step": 14620, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01709986320109439, + "grad_norm": 46.709171295166016, + "learning_rate": 7.523939808481532e-07, + "loss": 2.7484, + "step": 25 + }, + { + "epoch": 0.03419972640218878, + "grad_norm": 38.754066467285156, + "learning_rate": 1.6073871409028727e-06, + "loss": 2.4985, + "step": 50 + }, + { + "epoch": 0.05129958960328317, + "grad_norm": 43.13520431518555, + "learning_rate": 2.4623803009575924e-06, + "loss": 2.1311, + "step": 75 + }, + { + "epoch": 0.06839945280437756, + "grad_norm": 51.77237319946289, + "learning_rate": 3.3173734610123124e-06, + "loss": 1.916, + "step": 100 + }, + { + "epoch": 0.08549931600547196, + "grad_norm": 52.562774658203125, + "learning_rate": 4.138166894664843e-06, + "loss": 1.5007, + "step": 125 + }, + { + "epoch": 0.10259917920656635, + "grad_norm": 48.85651779174805, + "learning_rate": 4.993160054719562e-06, + "loss": 1.5403, + "step": 150 + }, + { + "epoch": 0.11969904240766074, + "grad_norm": 34.78243637084961, + "learning_rate": 5.848153214774282e-06, + "loss": 1.3732, + "step": 175 + }, + { + "epoch": 0.13679890560875513, + "grad_norm": 36.370094299316406, + "learning_rate": 6.7031463748290014e-06, + "loss": 1.32, + "step": 200 + }, + { + "epoch": 0.1538987688098495, + "grad_norm": 36.674888610839844, + "learning_rate": 7.558139534883721e-06, + "loss": 1.2517, + "step": 225 + }, + { + "epoch": 0.17099863201094392, + "grad_norm": 39.88427734375, + "learning_rate": 8.41313269493844e-06, + "loss": 1.1245, + "step": 250 + }, + { + "epoch": 0.1880984952120383, + "grad_norm": 37.10840606689453, + "learning_rate": 9.26812585499316e-06, + "loss": 1.0716, + "step": 275 + }, + { + "epoch": 0.2051983584131327, + "grad_norm": 40.91892623901367, + "learning_rate": 1.0123119015047879e-05, + "loss": 1.2636, + "step": 300 + }, + { + "epoch": 0.22229822161422708, + "grad_norm": 25.208866119384766, + "learning_rate": 1.09781121751026e-05, + "loss": 1.0415, + "step": 325 + }, + { + "epoch": 0.2393980848153215, + "grad_norm": 32.90418243408203, + "learning_rate": 1.183310533515732e-05, + "loss": 0.9204, + "step": 350 + }, + { + "epoch": 0.25649794801641584, + "grad_norm": 22.914512634277344, + "learning_rate": 1.2688098495212038e-05, + "loss": 0.8906, + "step": 375 + }, + { + "epoch": 0.27359781121751026, + "grad_norm": 26.361547470092773, + "learning_rate": 1.354309165526676e-05, + "loss": 0.7451, + "step": 400 + }, + { + "epoch": 0.29069767441860467, + "grad_norm": 6.595183849334717, + "learning_rate": 1.4398084815321477e-05, + "loss": 0.8366, + "step": 425 + }, + { + "epoch": 0.307797537619699, + "grad_norm": 21.60556983947754, + "learning_rate": 1.5253077975376198e-05, + "loss": 0.8602, + "step": 450 + }, + { + "epoch": 0.32489740082079344, + "grad_norm": 19.94957160949707, + "learning_rate": 1.6108071135430915e-05, + "loss": 0.8112, + "step": 475 + }, + { + "epoch": 0.34199726402188785, + "grad_norm": 18.41588020324707, + "learning_rate": 1.6963064295485636e-05, + "loss": 0.7563, + "step": 500 + }, + { + "epoch": 0.3590971272229822, + "grad_norm": 26.010887145996094, + "learning_rate": 1.7818057455540357e-05, + "loss": 0.745, + "step": 525 + }, + { + "epoch": 0.3761969904240766, + "grad_norm": 29.746938705444336, + "learning_rate": 1.8673050615595075e-05, + "loss": 0.9174, + "step": 550 + }, + { + "epoch": 0.393296853625171, + "grad_norm": 9.71439266204834, + "learning_rate": 1.9528043775649796e-05, + "loss": 0.7998, + "step": 575 + }, + { + "epoch": 0.4103967168262654, + "grad_norm": 25.851831436157227, + "learning_rate": 2.0383036935704516e-05, + "loss": 0.8316, + "step": 600 + }, + { + "epoch": 0.4274965800273598, + "grad_norm": 6.615222454071045, + "learning_rate": 2.1238030095759234e-05, + "loss": 0.7572, + "step": 625 + }, + { + "epoch": 0.44459644322845415, + "grad_norm": 21.029979705810547, + "learning_rate": 2.2093023255813955e-05, + "loss": 0.8479, + "step": 650 + }, + { + "epoch": 0.46169630642954856, + "grad_norm": 34.08558654785156, + "learning_rate": 2.2948016415868672e-05, + "loss": 0.9019, + "step": 675 + }, + { + "epoch": 0.478796169630643, + "grad_norm": 21.173648834228516, + "learning_rate": 2.3803009575923393e-05, + "loss": 0.6358, + "step": 700 + }, + { + "epoch": 0.49589603283173733, + "grad_norm": 28.03355598449707, + "learning_rate": 2.4658002735978114e-05, + "loss": 0.6318, + "step": 725 + }, + { + "epoch": 0.5129958960328317, + "grad_norm": 32.51506423950195, + "learning_rate": 2.5512995896032832e-05, + "loss": 0.7156, + "step": 750 + }, + { + "epoch": 0.5300957592339262, + "grad_norm": 15.213716506958008, + "learning_rate": 2.6367989056087556e-05, + "loss": 0.7459, + "step": 775 + }, + { + "epoch": 0.5471956224350205, + "grad_norm": 25.61234474182129, + "learning_rate": 2.7222982216142274e-05, + "loss": 0.6615, + "step": 800 + }, + { + "epoch": 0.5642954856361149, + "grad_norm": 6.342990875244141, + "learning_rate": 2.807797537619699e-05, + "loss": 0.6494, + "step": 825 + }, + { + "epoch": 0.5813953488372093, + "grad_norm": 8.253657341003418, + "learning_rate": 2.893296853625171e-05, + "loss": 0.7401, + "step": 850 + }, + { + "epoch": 0.5984952120383037, + "grad_norm": 20.726346969604492, + "learning_rate": 2.9787961696306433e-05, + "loss": 0.8272, + "step": 875 + }, + { + "epoch": 0.615595075239398, + "grad_norm": 14.493860244750977, + "learning_rate": 3.064295485636115e-05, + "loss": 0.7291, + "step": 900 + }, + { + "epoch": 0.6326949384404925, + "grad_norm": 26.049036026000977, + "learning_rate": 3.149794801641587e-05, + "loss": 0.7125, + "step": 925 + }, + { + "epoch": 0.6497948016415869, + "grad_norm": 24.505985260009766, + "learning_rate": 3.235294117647059e-05, + "loss": 0.7522, + "step": 950 + }, + { + "epoch": 0.6668946648426812, + "grad_norm": 16.45219612121582, + "learning_rate": 3.3207934336525306e-05, + "loss": 0.9847, + "step": 975 + }, + { + "epoch": 0.6839945280437757, + "grad_norm": 16.54450798034668, + "learning_rate": 3.406292749658003e-05, + "loss": 0.6838, + "step": 1000 + }, + { + "epoch": 0.70109439124487, + "grad_norm": 20.877397537231445, + "learning_rate": 3.488372093023256e-05, + "loss": 0.8538, + "step": 1025 + }, + { + "epoch": 0.7181942544459644, + "grad_norm": 16.170602798461914, + "learning_rate": 3.573871409028728e-05, + "loss": 0.5981, + "step": 1050 + }, + { + "epoch": 0.7352941176470589, + "grad_norm": 4.53754186630249, + "learning_rate": 3.6593707250342e-05, + "loss": 0.7112, + "step": 1075 + }, + { + "epoch": 0.7523939808481532, + "grad_norm": 104.9443359375, + "learning_rate": 3.741450068399453e-05, + "loss": 0.8659, + "step": 1100 + }, + { + "epoch": 0.7694938440492476, + "grad_norm": 9.153970718383789, + "learning_rate": 3.826949384404925e-05, + "loss": 0.7048, + "step": 1125 + }, + { + "epoch": 0.786593707250342, + "grad_norm": 8.457886695861816, + "learning_rate": 3.912448700410397e-05, + "loss": 0.6735, + "step": 1150 + }, + { + "epoch": 0.8036935704514364, + "grad_norm": 22.64398765563965, + "learning_rate": 3.997948016415869e-05, + "loss": 0.6603, + "step": 1175 + }, + { + "epoch": 0.8207934336525308, + "grad_norm": 8.883172988891602, + "learning_rate": 4.083447332421341e-05, + "loss": 0.6347, + "step": 1200 + }, + { + "epoch": 0.8378932968536251, + "grad_norm": 15.982242584228516, + "learning_rate": 4.168946648426813e-05, + "loss": 0.8553, + "step": 1225 + }, + { + "epoch": 0.8549931600547196, + "grad_norm": 17.371896743774414, + "learning_rate": 4.2544459644322845e-05, + "loss": 0.8338, + "step": 1250 + }, + { + "epoch": 0.872093023255814, + "grad_norm": 9.03439998626709, + "learning_rate": 4.3399452804377566e-05, + "loss": 0.6968, + "step": 1275 + }, + { + "epoch": 0.8891928864569083, + "grad_norm": 12.585426330566406, + "learning_rate": 4.425444596443229e-05, + "loss": 0.7398, + "step": 1300 + }, + { + "epoch": 0.9062927496580028, + "grad_norm": 24.8383731842041, + "learning_rate": 4.510943912448701e-05, + "loss": 0.74, + "step": 1325 + }, + { + "epoch": 0.9233926128590971, + "grad_norm": 23.374618530273438, + "learning_rate": 4.596443228454172e-05, + "loss": 0.8325, + "step": 1350 + }, + { + "epoch": 0.9404924760601915, + "grad_norm": 18.9708251953125, + "learning_rate": 4.681942544459644e-05, + "loss": 0.705, + "step": 1375 + }, + { + "epoch": 0.957592339261286, + "grad_norm": 17.654476165771484, + "learning_rate": 4.7674418604651164e-05, + "loss": 0.6591, + "step": 1400 + }, + { + "epoch": 0.9746922024623803, + "grad_norm": 15.2923583984375, + "learning_rate": 4.8529411764705885e-05, + "loss": 0.6672, + "step": 1425 + }, + { + "epoch": 0.9917920656634747, + "grad_norm": 20.557374954223633, + "learning_rate": 4.93844049247606e-05, + "loss": 0.7082, + "step": 1450 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.7678388528508023, + "eval_f1_macro": 0.36714416708471204, + "eval_f1_micro": 0.7678388528508023, + "eval_f1_weighted": 0.7507762998391535, + "eval_loss": 0.7348673939704895, + "eval_precision_macro": 0.4581539504891226, + "eval_precision_micro": 0.7678388528508023, + "eval_precision_weighted": 0.7986681621261933, + "eval_recall_macro": 0.3826291513511869, + "eval_recall_micro": 0.7678388528508023, + "eval_recall_weighted": 0.7678388528508023, + "eval_runtime": 19.2382, + "eval_samples_per_second": 152.249, + "eval_steps_per_second": 9.564, + "step": 1462 + }, + { + "epoch": 1.008891928864569, + "grad_norm": 8.590130805969238, + "learning_rate": 4.99734002127983e-05, + "loss": 0.6613, + "step": 1475 + }, + { + "epoch": 1.0259917920656634, + "grad_norm": 7.963166236877441, + "learning_rate": 4.987840097279222e-05, + "loss": 0.6899, + "step": 1500 + }, + { + "epoch": 1.043091655266758, + "grad_norm": 21.036991119384766, + "learning_rate": 4.978340173278614e-05, + "loss": 0.7883, + "step": 1525 + }, + { + "epoch": 1.0601915184678523, + "grad_norm": 18.586833953857422, + "learning_rate": 4.968840249278006e-05, + "loss": 0.5927, + "step": 1550 + }, + { + "epoch": 1.0772913816689467, + "grad_norm": 10.346550941467285, + "learning_rate": 4.959340325277398e-05, + "loss": 0.7403, + "step": 1575 + }, + { + "epoch": 1.094391244870041, + "grad_norm": 15.1462984085083, + "learning_rate": 4.94984040127679e-05, + "loss": 0.67, + "step": 1600 + }, + { + "epoch": 1.1114911080711354, + "grad_norm": 29.532644271850586, + "learning_rate": 4.940340477276182e-05, + "loss": 0.724, + "step": 1625 + }, + { + "epoch": 1.1285909712722297, + "grad_norm": 20.413328170776367, + "learning_rate": 4.930840553275574e-05, + "loss": 0.7664, + "step": 1650 + }, + { + "epoch": 1.1456908344733243, + "grad_norm": 6.083666801452637, + "learning_rate": 4.921340629274966e-05, + "loss": 0.6986, + "step": 1675 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 10.168739318847656, + "learning_rate": 4.911840705274358e-05, + "loss": 0.6873, + "step": 1700 + }, + { + "epoch": 1.179890560875513, + "grad_norm": 16.213897705078125, + "learning_rate": 4.90234078127375e-05, + "loss": 0.6432, + "step": 1725 + }, + { + "epoch": 1.1969904240766074, + "grad_norm": 14.341080665588379, + "learning_rate": 4.892840857273142e-05, + "loss": 0.6205, + "step": 1750 + }, + { + "epoch": 1.2140902872777017, + "grad_norm": 8.960565567016602, + "learning_rate": 4.883340933272534e-05, + "loss": 0.7297, + "step": 1775 + }, + { + "epoch": 1.231190150478796, + "grad_norm": 3.6972837448120117, + "learning_rate": 4.8738410092719264e-05, + "loss": 0.5647, + "step": 1800 + }, + { + "epoch": 1.2482900136798905, + "grad_norm": 17.91363525390625, + "learning_rate": 4.864341085271318e-05, + "loss": 0.5821, + "step": 1825 + }, + { + "epoch": 1.265389876880985, + "grad_norm": 16.03091049194336, + "learning_rate": 4.8548411612707104e-05, + "loss": 0.8215, + "step": 1850 + }, + { + "epoch": 1.2824897400820794, + "grad_norm": 20.258285522460938, + "learning_rate": 4.845341237270102e-05, + "loss": 0.6713, + "step": 1875 + }, + { + "epoch": 1.2995896032831737, + "grad_norm": 21.53881072998047, + "learning_rate": 4.835841313269494e-05, + "loss": 0.6425, + "step": 1900 + }, + { + "epoch": 1.316689466484268, + "grad_norm": 17.40162467956543, + "learning_rate": 4.826341389268886e-05, + "loss": 0.7661, + "step": 1925 + }, + { + "epoch": 1.3337893296853625, + "grad_norm": 11.091559410095215, + "learning_rate": 4.816841465268278e-05, + "loss": 0.6502, + "step": 1950 + }, + { + "epoch": 1.350889192886457, + "grad_norm": 16.19685173034668, + "learning_rate": 4.80734154126767e-05, + "loss": 0.6064, + "step": 1975 + }, + { + "epoch": 1.3679890560875512, + "grad_norm": 18.266754150390625, + "learning_rate": 4.797841617267062e-05, + "loss": 0.7654, + "step": 2000 + }, + { + "epoch": 1.3850889192886457, + "grad_norm": 2.7492332458496094, + "learning_rate": 4.788341693266454e-05, + "loss": 0.6592, + "step": 2025 + }, + { + "epoch": 1.40218878248974, + "grad_norm": 6.613536357879639, + "learning_rate": 4.778841769265846e-05, + "loss": 0.8603, + "step": 2050 + }, + { + "epoch": 1.4192886456908345, + "grad_norm": 12.348869323730469, + "learning_rate": 4.769341845265238e-05, + "loss": 0.7457, + "step": 2075 + }, + { + "epoch": 1.4363885088919288, + "grad_norm": 10.183746337890625, + "learning_rate": 4.75984192126463e-05, + "loss": 0.5621, + "step": 2100 + }, + { + "epoch": 1.4534883720930232, + "grad_norm": 6.9152045249938965, + "learning_rate": 4.7503419972640224e-05, + "loss": 0.6217, + "step": 2125 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 9.911653518676758, + "learning_rate": 4.740842073263414e-05, + "loss": 0.8387, + "step": 2150 + }, + { + "epoch": 1.487688098495212, + "grad_norm": 19.982093811035156, + "learning_rate": 4.7313421492628064e-05, + "loss": 0.5981, + "step": 2175 + }, + { + "epoch": 1.5047879616963065, + "grad_norm": 11.546788215637207, + "learning_rate": 4.721842225262198e-05, + "loss": 0.6257, + "step": 2200 + }, + { + "epoch": 1.5218878248974008, + "grad_norm": 11.54325008392334, + "learning_rate": 4.7123423012615905e-05, + "loss": 0.6088, + "step": 2225 + }, + { + "epoch": 1.5389876880984952, + "grad_norm": 9.77811050415039, + "learning_rate": 4.703222374221006e-05, + "loss": 0.4845, + "step": 2250 + }, + { + "epoch": 1.5560875512995898, + "grad_norm": 9.156554222106934, + "learning_rate": 4.6937224502203985e-05, + "loss": 0.701, + "step": 2275 + }, + { + "epoch": 1.573187414500684, + "grad_norm": 9.874088287353516, + "learning_rate": 4.68422252621979e-05, + "loss": 0.7453, + "step": 2300 + }, + { + "epoch": 1.5902872777017785, + "grad_norm": 9.201342582702637, + "learning_rate": 4.6747226022191826e-05, + "loss": 0.5633, + "step": 2325 + }, + { + "epoch": 1.6073871409028728, + "grad_norm": 29.79956817626953, + "learning_rate": 4.665222678218574e-05, + "loss": 0.6438, + "step": 2350 + }, + { + "epoch": 1.6244870041039672, + "grad_norm": 8.855256080627441, + "learning_rate": 4.6557227542179666e-05, + "loss": 0.5698, + "step": 2375 + }, + { + "epoch": 1.6415868673050615, + "grad_norm": 8.34626579284668, + "learning_rate": 4.646222830217358e-05, + "loss": 0.7143, + "step": 2400 + }, + { + "epoch": 1.658686730506156, + "grad_norm": 18.65626335144043, + "learning_rate": 4.636722906216751e-05, + "loss": 0.6445, + "step": 2425 + }, + { + "epoch": 1.6757865937072505, + "grad_norm": 8.134085655212402, + "learning_rate": 4.6272229822161424e-05, + "loss": 0.4898, + "step": 2450 + }, + { + "epoch": 1.6928864569083446, + "grad_norm": 15.127388000488281, + "learning_rate": 4.617723058215535e-05, + "loss": 0.6497, + "step": 2475 + }, + { + "epoch": 1.7099863201094392, + "grad_norm": 17.599140167236328, + "learning_rate": 4.6082231342149264e-05, + "loss": 0.6605, + "step": 2500 + }, + { + "epoch": 1.7270861833105335, + "grad_norm": 22.064022064208984, + "learning_rate": 4.598723210214319e-05, + "loss": 0.5562, + "step": 2525 + }, + { + "epoch": 1.744186046511628, + "grad_norm": 8.446691513061523, + "learning_rate": 4.5892232862137105e-05, + "loss": 0.591, + "step": 2550 + }, + { + "epoch": 1.7612859097127223, + "grad_norm": 15.194252967834473, + "learning_rate": 4.579723362213102e-05, + "loss": 0.635, + "step": 2575 + }, + { + "epoch": 1.7783857729138166, + "grad_norm": 14.25900936126709, + "learning_rate": 4.5702234382124946e-05, + "loss": 0.6253, + "step": 2600 + }, + { + "epoch": 1.7954856361149112, + "grad_norm": 9.975160598754883, + "learning_rate": 4.560723514211886e-05, + "loss": 0.4944, + "step": 2625 + }, + { + "epoch": 1.8125854993160053, + "grad_norm": 12.048364639282227, + "learning_rate": 4.5512235902112786e-05, + "loss": 0.5643, + "step": 2650 + }, + { + "epoch": 1.8296853625171, + "grad_norm": 16.70762825012207, + "learning_rate": 4.54172366621067e-05, + "loss": 0.6445, + "step": 2675 + }, + { + "epoch": 1.8467852257181943, + "grad_norm": 15.203225135803223, + "learning_rate": 4.532223742210063e-05, + "loss": 0.4731, + "step": 2700 + }, + { + "epoch": 1.8638850889192886, + "grad_norm": 6.673511028289795, + "learning_rate": 4.5227238182094544e-05, + "loss": 0.6115, + "step": 2725 + }, + { + "epoch": 1.8809849521203832, + "grad_norm": 3.219144105911255, + "learning_rate": 4.513223894208847e-05, + "loss": 0.6447, + "step": 2750 + }, + { + "epoch": 1.8980848153214773, + "grad_norm": 10.122079849243164, + "learning_rate": 4.5037239702082384e-05, + "loss": 0.5048, + "step": 2775 + }, + { + "epoch": 1.915184678522572, + "grad_norm": 8.148963928222656, + "learning_rate": 4.494224046207631e-05, + "loss": 0.522, + "step": 2800 + }, + { + "epoch": 1.9322845417236663, + "grad_norm": 17.671016693115234, + "learning_rate": 4.4847241222070225e-05, + "loss": 0.4838, + "step": 2825 + }, + { + "epoch": 1.9493844049247606, + "grad_norm": 11.81804370880127, + "learning_rate": 4.475224198206415e-05, + "loss": 0.6314, + "step": 2850 + }, + { + "epoch": 1.966484268125855, + "grad_norm": 9.446462631225586, + "learning_rate": 4.4657242742058065e-05, + "loss": 0.6951, + "step": 2875 + }, + { + "epoch": 1.9835841313269493, + "grad_norm": 0.3831145167350769, + "learning_rate": 4.456224350205199e-05, + "loss": 0.5709, + "step": 2900 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.890406282007511, + "eval_f1_macro": 0.6277004140591871, + "eval_f1_micro": 0.890406282007511, + "eval_f1_weighted": 0.8835937168276483, + "eval_loss": 0.3201202154159546, + "eval_precision_macro": 0.7489533576066802, + "eval_precision_micro": 0.890406282007511, + "eval_precision_weighted": 0.8996932956273017, + "eval_recall_macro": 0.6160822936733158, + "eval_recall_micro": 0.890406282007511, + "eval_recall_weighted": 0.890406282007511, + "eval_runtime": 19.0268, + "eval_samples_per_second": 153.941, + "eval_steps_per_second": 9.671, + "step": 2924 + }, + { + "epoch": 2.000683994528044, + "grad_norm": 8.932915687561035, + "learning_rate": 4.4467244262045906e-05, + "loss": 0.6213, + "step": 2925 + }, + { + "epoch": 2.017783857729138, + "grad_norm": 53.977447509765625, + "learning_rate": 4.437224502203983e-05, + "loss": 0.4965, + "step": 2950 + }, + { + "epoch": 2.0348837209302326, + "grad_norm": 5.680665969848633, + "learning_rate": 4.4277245782033746e-05, + "loss": 0.6174, + "step": 2975 + }, + { + "epoch": 2.0519835841313268, + "grad_norm": 14.235191345214844, + "learning_rate": 4.418224654202767e-05, + "loss": 0.5522, + "step": 3000 + }, + { + "epoch": 2.0690834473324213, + "grad_norm": 3.3336021900177, + "learning_rate": 4.408724730202159e-05, + "loss": 0.6311, + "step": 3025 + }, + { + "epoch": 2.086183310533516, + "grad_norm": 17.21300506591797, + "learning_rate": 4.3992248062015504e-05, + "loss": 0.6788, + "step": 3050 + }, + { + "epoch": 2.10328317373461, + "grad_norm": 8.625337600708008, + "learning_rate": 4.389724882200942e-05, + "loss": 0.4847, + "step": 3075 + }, + { + "epoch": 2.1203830369357046, + "grad_norm": 13.06096076965332, + "learning_rate": 4.3802249582003344e-05, + "loss": 0.5771, + "step": 3100 + }, + { + "epoch": 2.1374829001367988, + "grad_norm": 14.993678092956543, + "learning_rate": 4.370725034199726e-05, + "loss": 0.495, + "step": 3125 + }, + { + "epoch": 2.1545827633378933, + "grad_norm": 18.640869140625, + "learning_rate": 4.3612251101991185e-05, + "loss": 0.5442, + "step": 3150 + }, + { + "epoch": 2.1716826265389875, + "grad_norm": 13.761073112487793, + "learning_rate": 4.351725186198511e-05, + "loss": 0.563, + "step": 3175 + }, + { + "epoch": 2.188782489740082, + "grad_norm": 11.877754211425781, + "learning_rate": 4.3422252621979025e-05, + "loss": 0.8003, + "step": 3200 + }, + { + "epoch": 2.2058823529411766, + "grad_norm": 8.639129638671875, + "learning_rate": 4.332725338197295e-05, + "loss": 0.6284, + "step": 3225 + }, + { + "epoch": 2.2229822161422708, + "grad_norm": 6.337215900421143, + "learning_rate": 4.3232254141966866e-05, + "loss": 0.5617, + "step": 3250 + }, + { + "epoch": 2.2400820793433653, + "grad_norm": 1.9488357305526733, + "learning_rate": 4.313725490196079e-05, + "loss": 0.3964, + "step": 3275 + }, + { + "epoch": 2.2571819425444595, + "grad_norm": 4.854595184326172, + "learning_rate": 4.3042255661954706e-05, + "loss": 0.6221, + "step": 3300 + }, + { + "epoch": 2.274281805745554, + "grad_norm": 10.604134559631348, + "learning_rate": 4.294725642194863e-05, + "loss": 0.4416, + "step": 3325 + }, + { + "epoch": 2.2913816689466486, + "grad_norm": 9.8331937789917, + "learning_rate": 4.285225718194255e-05, + "loss": 0.6644, + "step": 3350 + }, + { + "epoch": 2.3084815321477428, + "grad_norm": 7.878199100494385, + "learning_rate": 4.275725794193647e-05, + "loss": 0.4562, + "step": 3375 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 5.875706195831299, + "learning_rate": 4.266225870193039e-05, + "loss": 0.6038, + "step": 3400 + }, + { + "epoch": 2.3426812585499315, + "grad_norm": 1.360823631286621, + "learning_rate": 4.256725946192431e-05, + "loss": 0.3918, + "step": 3425 + }, + { + "epoch": 2.359781121751026, + "grad_norm": 5.344891548156738, + "learning_rate": 4.247226022191823e-05, + "loss": 0.5924, + "step": 3450 + }, + { + "epoch": 2.37688098495212, + "grad_norm": 19.596725463867188, + "learning_rate": 4.2377260981912145e-05, + "loss": 0.6252, + "step": 3475 + }, + { + "epoch": 2.3939808481532148, + "grad_norm": 9.855287551879883, + "learning_rate": 4.228226174190606e-05, + "loss": 0.6215, + "step": 3500 + }, + { + "epoch": 2.4110807113543093, + "grad_norm": 10.44688606262207, + "learning_rate": 4.2187262501899986e-05, + "loss": 0.4234, + "step": 3525 + }, + { + "epoch": 2.4281805745554035, + "grad_norm": 8.25647258758545, + "learning_rate": 4.20922632618939e-05, + "loss": 0.4522, + "step": 3550 + }, + { + "epoch": 2.445280437756498, + "grad_norm": 18.42440414428711, + "learning_rate": 4.1997264021887826e-05, + "loss": 0.5475, + "step": 3575 + }, + { + "epoch": 2.462380300957592, + "grad_norm": 3.88397216796875, + "learning_rate": 4.190226478188174e-05, + "loss": 0.4464, + "step": 3600 + }, + { + "epoch": 2.4794801641586868, + "grad_norm": 15.069050788879395, + "learning_rate": 4.180726554187567e-05, + "loss": 0.6738, + "step": 3625 + }, + { + "epoch": 2.496580027359781, + "grad_norm": 5.434013366699219, + "learning_rate": 4.1712266301869584e-05, + "loss": 0.5139, + "step": 3650 + }, + { + "epoch": 2.5136798905608755, + "grad_norm": 6.18742036819458, + "learning_rate": 4.161726706186351e-05, + "loss": 0.6905, + "step": 3675 + }, + { + "epoch": 2.53077975376197, + "grad_norm": 4.691986560821533, + "learning_rate": 4.1522267821857424e-05, + "loss": 0.5514, + "step": 3700 + }, + { + "epoch": 2.547879616963064, + "grad_norm": 11.21522331237793, + "learning_rate": 4.142726858185135e-05, + "loss": 0.5283, + "step": 3725 + }, + { + "epoch": 2.5649794801641588, + "grad_norm": 18.263111114501953, + "learning_rate": 4.1332269341845265e-05, + "loss": 0.5471, + "step": 3750 + }, + { + "epoch": 2.582079343365253, + "grad_norm": 2.245192766189575, + "learning_rate": 4.123727010183919e-05, + "loss": 0.4889, + "step": 3775 + }, + { + "epoch": 2.5991792065663475, + "grad_norm": 8.650074005126953, + "learning_rate": 4.114227086183311e-05, + "loss": 0.5821, + "step": 3800 + }, + { + "epoch": 2.616279069767442, + "grad_norm": 8.487887382507324, + "learning_rate": 4.104727162182703e-05, + "loss": 0.4633, + "step": 3825 + }, + { + "epoch": 2.633378932968536, + "grad_norm": 3.491182327270508, + "learning_rate": 4.095227238182095e-05, + "loss": 0.4839, + "step": 3850 + }, + { + "epoch": 2.650478796169631, + "grad_norm": 15.229668617248535, + "learning_rate": 4.085727314181487e-05, + "loss": 0.4741, + "step": 3875 + }, + { + "epoch": 2.667578659370725, + "grad_norm": 5.991665363311768, + "learning_rate": 4.0762273901808786e-05, + "loss": 0.6269, + "step": 3900 + }, + { + "epoch": 2.6846785225718195, + "grad_norm": 3.6225790977478027, + "learning_rate": 4.066727466180271e-05, + "loss": 0.5778, + "step": 3925 + }, + { + "epoch": 2.701778385772914, + "grad_norm": 7.361936092376709, + "learning_rate": 4.057227542179663e-05, + "loss": 0.5857, + "step": 3950 + }, + { + "epoch": 2.718878248974008, + "grad_norm": 18.498151779174805, + "learning_rate": 4.0477276181790544e-05, + "loss": 0.599, + "step": 3975 + }, + { + "epoch": 2.7359781121751023, + "grad_norm": 11.898250579833984, + "learning_rate": 4.038227694178447e-05, + "loss": 0.5114, + "step": 4000 + }, + { + "epoch": 2.753077975376197, + "grad_norm": 5.535077095031738, + "learning_rate": 4.0287277701778384e-05, + "loss": 0.5272, + "step": 4025 + }, + { + "epoch": 2.7701778385772915, + "grad_norm": 2.3556160926818848, + "learning_rate": 4.019227846177231e-05, + "loss": 0.5648, + "step": 4050 + }, + { + "epoch": 2.7872777017783856, + "grad_norm": 11.369132041931152, + "learning_rate": 4.0097279221766225e-05, + "loss": 0.5935, + "step": 4075 + }, + { + "epoch": 2.80437756497948, + "grad_norm": 5.496129989624023, + "learning_rate": 4.000227998176015e-05, + "loss": 0.699, + "step": 4100 + }, + { + "epoch": 2.8214774281805743, + "grad_norm": 12.352839469909668, + "learning_rate": 3.9907280741754065e-05, + "loss": 0.5325, + "step": 4125 + }, + { + "epoch": 2.838577291381669, + "grad_norm": 2.7082407474517822, + "learning_rate": 3.981228150174799e-05, + "loss": 0.5331, + "step": 4150 + }, + { + "epoch": 2.8556771545827635, + "grad_norm": 12.403038024902344, + "learning_rate": 3.9717282261741906e-05, + "loss": 0.6043, + "step": 4175 + }, + { + "epoch": 2.8727770177838576, + "grad_norm": 12.153759002685547, + "learning_rate": 3.962228302173583e-05, + "loss": 0.4958, + "step": 4200 + }, + { + "epoch": 2.889876880984952, + "grad_norm": 6.992998123168945, + "learning_rate": 3.9527283781729746e-05, + "loss": 0.3868, + "step": 4225 + }, + { + "epoch": 2.9069767441860463, + "grad_norm": 3.785193681716919, + "learning_rate": 3.943228454172367e-05, + "loss": 0.5372, + "step": 4250 + }, + { + "epoch": 2.924076607387141, + "grad_norm": 22.4363956451416, + "learning_rate": 3.933728530171759e-05, + "loss": 0.5244, + "step": 4275 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 3.622431516647339, + "learning_rate": 3.924228606171151e-05, + "loss": 0.4722, + "step": 4300 + }, + { + "epoch": 2.9582763337893296, + "grad_norm": 1.2941017150878906, + "learning_rate": 3.914728682170543e-05, + "loss": 0.4208, + "step": 4325 + }, + { + "epoch": 2.975376196990424, + "grad_norm": 10.482751846313477, + "learning_rate": 3.905228758169935e-05, + "loss": 0.6347, + "step": 4350 + }, + { + "epoch": 2.9924760601915183, + "grad_norm": 4.376351356506348, + "learning_rate": 3.895728834169327e-05, + "loss": 0.6077, + "step": 4375 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.896551724137931, + "eval_f1_macro": 0.6998434390712878, + "eval_f1_micro": 0.896551724137931, + "eval_f1_weighted": 0.8937364843753902, + "eval_loss": 0.3129188120365143, + "eval_precision_macro": 0.8102800926777759, + "eval_precision_micro": 0.896551724137931, + "eval_precision_weighted": 0.9029461578223588, + "eval_recall_macro": 0.6655916075939068, + "eval_recall_micro": 0.896551724137931, + "eval_recall_weighted": 0.896551724137931, + "eval_runtime": 18.8573, + "eval_samples_per_second": 155.324, + "eval_steps_per_second": 9.757, + "step": 4386 + }, + { + "epoch": 3.009575923392613, + "grad_norm": 8.401654243469238, + "learning_rate": 3.8862289101687185e-05, + "loss": 0.5739, + "step": 4400 + }, + { + "epoch": 3.026675786593707, + "grad_norm": 10.48408031463623, + "learning_rate": 3.876728986168111e-05, + "loss": 0.6117, + "step": 4425 + }, + { + "epoch": 3.0437756497948016, + "grad_norm": 19.265623092651367, + "learning_rate": 3.8672290621675026e-05, + "loss": 0.4945, + "step": 4450 + }, + { + "epoch": 3.060875512995896, + "grad_norm": 25.774412155151367, + "learning_rate": 3.857729138166895e-05, + "loss": 0.5458, + "step": 4475 + }, + { + "epoch": 3.0779753761969904, + "grad_norm": 4.172712326049805, + "learning_rate": 3.8482292141662866e-05, + "loss": 0.4408, + "step": 4500 + }, + { + "epoch": 3.095075239398085, + "grad_norm": 5.7756876945495605, + "learning_rate": 3.838729290165679e-05, + "loss": 0.3037, + "step": 4525 + }, + { + "epoch": 3.112175102599179, + "grad_norm": 12.178646087646484, + "learning_rate": 3.829229366165071e-05, + "loss": 0.6773, + "step": 4550 + }, + { + "epoch": 3.1292749658002736, + "grad_norm": 4.9638800621032715, + "learning_rate": 3.819729442164463e-05, + "loss": 0.4036, + "step": 4575 + }, + { + "epoch": 3.146374829001368, + "grad_norm": 6.199288845062256, + "learning_rate": 3.810229518163855e-05, + "loss": 0.5313, + "step": 4600 + }, + { + "epoch": 3.1634746922024624, + "grad_norm": 19.781579971313477, + "learning_rate": 3.800729594163247e-05, + "loss": 0.5946, + "step": 4625 + }, + { + "epoch": 3.180574555403557, + "grad_norm": 0.15058183670043945, + "learning_rate": 3.791229670162639e-05, + "loss": 0.516, + "step": 4650 + }, + { + "epoch": 3.197674418604651, + "grad_norm": 13.215787887573242, + "learning_rate": 3.781729746162031e-05, + "loss": 0.4023, + "step": 4675 + }, + { + "epoch": 3.2147742818057456, + "grad_norm": 5.896836757659912, + "learning_rate": 3.772229822161423e-05, + "loss": 0.4748, + "step": 4700 + }, + { + "epoch": 3.23187414500684, + "grad_norm": 0.36866021156311035, + "learning_rate": 3.762729898160815e-05, + "loss": 0.5704, + "step": 4725 + }, + { + "epoch": 3.2489740082079344, + "grad_norm": 10.511465072631836, + "learning_rate": 3.753229974160207e-05, + "loss": 0.5316, + "step": 4750 + }, + { + "epoch": 3.266073871409029, + "grad_norm": 3.424712896347046, + "learning_rate": 3.743730050159599e-05, + "loss": 0.4717, + "step": 4775 + }, + { + "epoch": 3.283173734610123, + "grad_norm": 14.572440147399902, + "learning_rate": 3.734230126158991e-05, + "loss": 0.6337, + "step": 4800 + }, + { + "epoch": 3.3002735978112177, + "grad_norm": 10.70576286315918, + "learning_rate": 3.724730202158383e-05, + "loss": 0.6731, + "step": 4825 + }, + { + "epoch": 3.317373461012312, + "grad_norm": 11.98401165008545, + "learning_rate": 3.715230278157775e-05, + "loss": 0.4016, + "step": 4850 + }, + { + "epoch": 3.3344733242134064, + "grad_norm": 11.411341667175293, + "learning_rate": 3.705730354157167e-05, + "loss": 0.4779, + "step": 4875 + }, + { + "epoch": 3.3515731874145005, + "grad_norm": 15.914603233337402, + "learning_rate": 3.6962304301565584e-05, + "loss": 0.5832, + "step": 4900 + }, + { + "epoch": 3.368673050615595, + "grad_norm": 3.610494613647461, + "learning_rate": 3.686730506155951e-05, + "loss": 0.5463, + "step": 4925 + }, + { + "epoch": 3.3857729138166897, + "grad_norm": 14.400090217590332, + "learning_rate": 3.6772305821553424e-05, + "loss": 0.5733, + "step": 4950 + }, + { + "epoch": 3.402872777017784, + "grad_norm": 6.468245506286621, + "learning_rate": 3.667730658154735e-05, + "loss": 0.5193, + "step": 4975 + }, + { + "epoch": 3.4199726402188784, + "grad_norm": 8.739253044128418, + "learning_rate": 3.658230734154127e-05, + "loss": 0.4821, + "step": 5000 + }, + { + "epoch": 3.4370725034199725, + "grad_norm": 0.5965850949287415, + "learning_rate": 3.648730810153519e-05, + "loss": 0.3247, + "step": 5025 + }, + { + "epoch": 3.454172366621067, + "grad_norm": 2.4634127616882324, + "learning_rate": 3.639230886152911e-05, + "loss": 0.5018, + "step": 5050 + }, + { + "epoch": 3.471272229822161, + "grad_norm": 12.17545223236084, + "learning_rate": 3.629730962152303e-05, + "loss": 0.4185, + "step": 5075 + }, + { + "epoch": 3.488372093023256, + "grad_norm": 10.63932991027832, + "learning_rate": 3.620231038151695e-05, + "loss": 0.7251, + "step": 5100 + }, + { + "epoch": 3.5054719562243504, + "grad_norm": 3.384568214416504, + "learning_rate": 3.610731114151087e-05, + "loss": 0.4883, + "step": 5125 + }, + { + "epoch": 3.5225718194254445, + "grad_norm": 7.895840167999268, + "learning_rate": 3.601231190150479e-05, + "loss": 0.5038, + "step": 5150 + }, + { + "epoch": 3.539671682626539, + "grad_norm": 7.191064834594727, + "learning_rate": 3.591731266149871e-05, + "loss": 0.467, + "step": 5175 + }, + { + "epoch": 3.556771545827633, + "grad_norm": 8.865562438964844, + "learning_rate": 3.5822313421492634e-05, + "loss": 0.504, + "step": 5200 + }, + { + "epoch": 3.573871409028728, + "grad_norm": 5.6215434074401855, + "learning_rate": 3.572731418148655e-05, + "loss": 0.6207, + "step": 5225 + }, + { + "epoch": 3.5909712722298224, + "grad_norm": 3.1758780479431152, + "learning_rate": 3.5632314941480474e-05, + "loss": 0.5304, + "step": 5250 + }, + { + "epoch": 3.6080711354309165, + "grad_norm": 6.1815056800842285, + "learning_rate": 3.553731570147439e-05, + "loss": 0.4599, + "step": 5275 + }, + { + "epoch": 3.625170998632011, + "grad_norm": 29.166934967041016, + "learning_rate": 3.544231646146831e-05, + "loss": 0.5521, + "step": 5300 + }, + { + "epoch": 3.6422708618331052, + "grad_norm": 10.150755882263184, + "learning_rate": 3.5347317221462225e-05, + "loss": 0.4214, + "step": 5325 + }, + { + "epoch": 3.6593707250342, + "grad_norm": 12.637552261352539, + "learning_rate": 3.525231798145615e-05, + "loss": 0.3804, + "step": 5350 + }, + { + "epoch": 3.6764705882352944, + "grad_norm": 5.059484481811523, + "learning_rate": 3.5157318741450066e-05, + "loss": 0.5716, + "step": 5375 + }, + { + "epoch": 3.6935704514363885, + "grad_norm": 0.10250476002693176, + "learning_rate": 3.506231950144399e-05, + "loss": 0.4074, + "step": 5400 + }, + { + "epoch": 3.7106703146374826, + "grad_norm": 8.807113647460938, + "learning_rate": 3.4967320261437906e-05, + "loss": 0.4943, + "step": 5425 + }, + { + "epoch": 3.7277701778385772, + "grad_norm": 11.27835750579834, + "learning_rate": 3.487232102143183e-05, + "loss": 0.52, + "step": 5450 + }, + { + "epoch": 3.744870041039672, + "grad_norm": 9.193815231323242, + "learning_rate": 3.477732178142575e-05, + "loss": 0.4272, + "step": 5475 + }, + { + "epoch": 3.761969904240766, + "grad_norm": 5.949501991271973, + "learning_rate": 3.468232254141967e-05, + "loss": 0.5627, + "step": 5500 + }, + { + "epoch": 3.7790697674418605, + "grad_norm": 8.378783226013184, + "learning_rate": 3.458732330141359e-05, + "loss": 0.5856, + "step": 5525 + }, + { + "epoch": 3.7961696306429547, + "grad_norm": 10.514230728149414, + "learning_rate": 3.449232406140751e-05, + "loss": 0.4319, + "step": 5550 + }, + { + "epoch": 3.8132694938440492, + "grad_norm": 9.19676399230957, + "learning_rate": 3.4397324821401435e-05, + "loss": 0.5279, + "step": 5575 + }, + { + "epoch": 3.830369357045144, + "grad_norm": 33.51396560668945, + "learning_rate": 3.430232558139535e-05, + "loss": 0.4681, + "step": 5600 + }, + { + "epoch": 3.847469220246238, + "grad_norm": 9.01288890838623, + "learning_rate": 3.4207326341389275e-05, + "loss": 0.476, + "step": 5625 + }, + { + "epoch": 3.8645690834473325, + "grad_norm": 8.594268798828125, + "learning_rate": 3.411232710138319e-05, + "loss": 0.3972, + "step": 5650 + }, + { + "epoch": 3.8816689466484267, + "grad_norm": 16.336450576782227, + "learning_rate": 3.4017327861377116e-05, + "loss": 0.556, + "step": 5675 + }, + { + "epoch": 3.8987688098495212, + "grad_norm": 9.880993843078613, + "learning_rate": 3.392232862137103e-05, + "loss": 0.4506, + "step": 5700 + }, + { + "epoch": 3.915868673050616, + "grad_norm": 17.31952476501465, + "learning_rate": 3.382732938136495e-05, + "loss": 0.5402, + "step": 5725 + }, + { + "epoch": 3.93296853625171, + "grad_norm": 27.180463790893555, + "learning_rate": 3.373233014135887e-05, + "loss": 0.5221, + "step": 5750 + }, + { + "epoch": 3.9500683994528045, + "grad_norm": 6.002215385437012, + "learning_rate": 3.363733090135279e-05, + "loss": 0.4936, + "step": 5775 + }, + { + "epoch": 3.9671682626538987, + "grad_norm": 18.105520248413086, + "learning_rate": 3.354233166134671e-05, + "loss": 0.4508, + "step": 5800 + }, + { + "epoch": 3.9842681258549932, + "grad_norm": 10.82498550415039, + "learning_rate": 3.344733242134063e-05, + "loss": 0.45, + "step": 5825 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.8955274837828611, + "eval_f1_macro": 0.7057064371974652, + "eval_f1_micro": 0.8955274837828611, + "eval_f1_weighted": 0.8886748900635787, + "eval_loss": 0.31441542506217957, + "eval_precision_macro": 0.8491702927441283, + "eval_precision_micro": 0.8955274837828611, + "eval_precision_weighted": 0.904039527542345, + "eval_recall_macro": 0.6801244258050726, + "eval_recall_micro": 0.8955274837828611, + "eval_recall_weighted": 0.8955274837828611, + "eval_runtime": 19.2708, + "eval_samples_per_second": 151.992, + "eval_steps_per_second": 9.548, + "step": 5848 + }, + { + "epoch": 4.001367989056088, + "grad_norm": 11.459450721740723, + "learning_rate": 3.335233318133455e-05, + "loss": 0.3849, + "step": 5850 + }, + { + "epoch": 4.0184678522571815, + "grad_norm": 5.290565013885498, + "learning_rate": 3.325733394132847e-05, + "loss": 0.4288, + "step": 5875 + }, + { + "epoch": 4.035567715458276, + "grad_norm": 5.566415309906006, + "learning_rate": 3.316233470132239e-05, + "loss": 0.5999, + "step": 5900 + }, + { + "epoch": 4.052667578659371, + "grad_norm": 14.68671703338623, + "learning_rate": 3.306733546131631e-05, + "loss": 0.3921, + "step": 5925 + }, + { + "epoch": 4.069767441860465, + "grad_norm": 4.023522853851318, + "learning_rate": 3.297233622131023e-05, + "loss": 0.5771, + "step": 5950 + }, + { + "epoch": 4.08686730506156, + "grad_norm": 21.95399284362793, + "learning_rate": 3.287733698130415e-05, + "loss": 0.4062, + "step": 5975 + }, + { + "epoch": 4.1039671682626535, + "grad_norm": 0.2723749279975891, + "learning_rate": 3.278233774129807e-05, + "loss": 0.474, + "step": 6000 + }, + { + "epoch": 4.121067031463748, + "grad_norm": 2.193208694458008, + "learning_rate": 3.268733850129199e-05, + "loss": 0.3756, + "step": 6025 + }, + { + "epoch": 4.138166894664843, + "grad_norm": 7.093472480773926, + "learning_rate": 3.259233926128591e-05, + "loss": 0.4341, + "step": 6050 + }, + { + "epoch": 4.155266757865937, + "grad_norm": 15.10814380645752, + "learning_rate": 3.249734002127983e-05, + "loss": 0.6389, + "step": 6075 + }, + { + "epoch": 4.172366621067032, + "grad_norm": 1.5080924034118652, + "learning_rate": 3.240234078127375e-05, + "loss": 0.3716, + "step": 6100 + }, + { + "epoch": 4.1894664842681255, + "grad_norm": 6.386539936065674, + "learning_rate": 3.2307341541267674e-05, + "loss": 0.4362, + "step": 6125 + }, + { + "epoch": 4.20656634746922, + "grad_norm": 5.12455415725708, + "learning_rate": 3.221234230126159e-05, + "loss": 0.3797, + "step": 6150 + }, + { + "epoch": 4.223666210670315, + "grad_norm": 17.729442596435547, + "learning_rate": 3.2117343061255514e-05, + "loss": 0.5386, + "step": 6175 + }, + { + "epoch": 4.240766073871409, + "grad_norm": 11.959110260009766, + "learning_rate": 3.202234382124943e-05, + "loss": 0.5592, + "step": 6200 + }, + { + "epoch": 4.257865937072504, + "grad_norm": 8.719466209411621, + "learning_rate": 3.192734458124335e-05, + "loss": 0.5439, + "step": 6225 + }, + { + "epoch": 4.2749658002735975, + "grad_norm": 16.87335205078125, + "learning_rate": 3.183234534123727e-05, + "loss": 0.4024, + "step": 6250 + }, + { + "epoch": 4.292065663474692, + "grad_norm": 18.301565170288086, + "learning_rate": 3.173734610123119e-05, + "loss": 0.5395, + "step": 6275 + }, + { + "epoch": 4.309165526675787, + "grad_norm": 3.5666756629943848, + "learning_rate": 3.164234686122511e-05, + "loss": 0.4384, + "step": 6300 + }, + { + "epoch": 4.326265389876881, + "grad_norm": 6.758172035217285, + "learning_rate": 3.154734762121903e-05, + "loss": 0.4922, + "step": 6325 + }, + { + "epoch": 4.343365253077975, + "grad_norm": 10.049732208251953, + "learning_rate": 3.145234838121295e-05, + "loss": 0.5458, + "step": 6350 + }, + { + "epoch": 4.3604651162790695, + "grad_norm": 8.759356498718262, + "learning_rate": 3.135734914120687e-05, + "loss": 0.3634, + "step": 6375 + }, + { + "epoch": 4.377564979480164, + "grad_norm": 26.165199279785156, + "learning_rate": 3.1262349901200794e-05, + "loss": 0.5239, + "step": 6400 + }, + { + "epoch": 4.394664842681259, + "grad_norm": 9.9360990524292, + "learning_rate": 3.116735066119471e-05, + "loss": 0.3593, + "step": 6425 + }, + { + "epoch": 4.411764705882353, + "grad_norm": 6.546799182891846, + "learning_rate": 3.1072351421188634e-05, + "loss": 0.5414, + "step": 6450 + }, + { + "epoch": 4.428864569083447, + "grad_norm": 10.599846839904785, + "learning_rate": 3.097735218118255e-05, + "loss": 0.493, + "step": 6475 + }, + { + "epoch": 4.4459644322845415, + "grad_norm": 13.960310935974121, + "learning_rate": 3.0882352941176475e-05, + "loss": 0.3437, + "step": 6500 + }, + { + "epoch": 4.463064295485636, + "grad_norm": 5.401963710784912, + "learning_rate": 3.078735370117039e-05, + "loss": 0.4259, + "step": 6525 + }, + { + "epoch": 4.480164158686731, + "grad_norm": 4.808218955993652, + "learning_rate": 3.0692354461164315e-05, + "loss": 0.4627, + "step": 6550 + }, + { + "epoch": 4.497264021887825, + "grad_norm": 22.903667449951172, + "learning_rate": 3.059735522115823e-05, + "loss": 0.5063, + "step": 6575 + }, + { + "epoch": 4.514363885088919, + "grad_norm": 4.878890037536621, + "learning_rate": 3.0502355981152152e-05, + "loss": 0.3412, + "step": 6600 + }, + { + "epoch": 4.5314637482900135, + "grad_norm": 6.41884708404541, + "learning_rate": 3.040735674114607e-05, + "loss": 0.4038, + "step": 6625 + }, + { + "epoch": 4.548563611491108, + "grad_norm": 7.6325154304504395, + "learning_rate": 3.0312357501139993e-05, + "loss": 0.5503, + "step": 6650 + }, + { + "epoch": 4.565663474692203, + "grad_norm": 10.409296035766602, + "learning_rate": 3.021735826113391e-05, + "loss": 0.4306, + "step": 6675 + }, + { + "epoch": 4.582763337893297, + "grad_norm": 9.539959907531738, + "learning_rate": 3.0122359021127833e-05, + "loss": 0.5506, + "step": 6700 + }, + { + "epoch": 4.599863201094391, + "grad_norm": 15.213808059692383, + "learning_rate": 3.002735978112175e-05, + "loss": 0.4715, + "step": 6725 + }, + { + "epoch": 4.6169630642954855, + "grad_norm": 15.897672653198242, + "learning_rate": 2.9932360541115674e-05, + "loss": 0.4674, + "step": 6750 + }, + { + "epoch": 4.63406292749658, + "grad_norm": 2.023172378540039, + "learning_rate": 2.983736130110959e-05, + "loss": 0.5035, + "step": 6775 + }, + { + "epoch": 4.651162790697675, + "grad_norm": 9.661181449890137, + "learning_rate": 2.974236206110351e-05, + "loss": 0.4261, + "step": 6800 + }, + { + "epoch": 4.668262653898768, + "grad_norm": 6.808616638183594, + "learning_rate": 2.9647362821097435e-05, + "loss": 0.3963, + "step": 6825 + }, + { + "epoch": 4.685362517099863, + "grad_norm": 2.418628215789795, + "learning_rate": 2.9552363581091352e-05, + "loss": 0.5116, + "step": 6850 + }, + { + "epoch": 4.7024623803009575, + "grad_norm": 5.1149749755859375, + "learning_rate": 2.9457364341085275e-05, + "loss": 0.3861, + "step": 6875 + }, + { + "epoch": 4.719562243502052, + "grad_norm": 10.152314186096191, + "learning_rate": 2.9362365101079192e-05, + "loss": 0.4005, + "step": 6900 + }, + { + "epoch": 4.736662106703147, + "grad_norm": 11.572530746459961, + "learning_rate": 2.9267365861073116e-05, + "loss": 0.5105, + "step": 6925 + }, + { + "epoch": 4.75376196990424, + "grad_norm": 11.438729286193848, + "learning_rate": 2.9172366621067033e-05, + "loss": 0.3964, + "step": 6950 + }, + { + "epoch": 4.770861833105335, + "grad_norm": 2.2795422077178955, + "learning_rate": 2.9077367381060953e-05, + "loss": 0.4141, + "step": 6975 + }, + { + "epoch": 4.7879616963064295, + "grad_norm": 17.774606704711914, + "learning_rate": 2.8982368141054873e-05, + "loss": 0.4031, + "step": 7000 + }, + { + "epoch": 4.805061559507524, + "grad_norm": 5.122858047485352, + "learning_rate": 2.8887368901048794e-05, + "loss": 0.5089, + "step": 7025 + }, + { + "epoch": 4.822161422708619, + "grad_norm": 9.22169303894043, + "learning_rate": 2.879236966104271e-05, + "loss": 0.4628, + "step": 7050 + }, + { + "epoch": 4.839261285909712, + "grad_norm": 7.689781665802002, + "learning_rate": 2.8697370421036634e-05, + "loss": 0.435, + "step": 7075 + }, + { + "epoch": 4.856361149110807, + "grad_norm": 14.785922050476074, + "learning_rate": 2.860237118103055e-05, + "loss": 0.5333, + "step": 7100 + }, + { + "epoch": 4.8734610123119015, + "grad_norm": 9.352224349975586, + "learning_rate": 2.8507371941024475e-05, + "loss": 0.5289, + "step": 7125 + }, + { + "epoch": 4.890560875512996, + "grad_norm": 13.73246955871582, + "learning_rate": 2.841237270101839e-05, + "loss": 0.4828, + "step": 7150 + }, + { + "epoch": 4.907660738714091, + "grad_norm": 25.362621307373047, + "learning_rate": 2.8317373461012315e-05, + "loss": 0.4774, + "step": 7175 + }, + { + "epoch": 4.924760601915184, + "grad_norm": 7.927663803100586, + "learning_rate": 2.8222374221006232e-05, + "loss": 0.4548, + "step": 7200 + }, + { + "epoch": 4.941860465116279, + "grad_norm": 7.368469715118408, + "learning_rate": 2.8127374981000152e-05, + "loss": 0.503, + "step": 7225 + }, + { + "epoch": 4.9589603283173735, + "grad_norm": 4.176021575927734, + "learning_rate": 2.8032375740994073e-05, + "loss": 0.4104, + "step": 7250 + }, + { + "epoch": 4.976060191518468, + "grad_norm": 9.954981803894043, + "learning_rate": 2.7937376500987993e-05, + "loss": 0.4093, + "step": 7275 + }, + { + "epoch": 4.993160054719562, + "grad_norm": 6.885503768920898, + "learning_rate": 2.784237726098191e-05, + "loss": 0.5022, + "step": 7300 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.8955274837828611, + "eval_f1_macro": 0.7161875284577881, + "eval_f1_micro": 0.8955274837828611, + "eval_f1_weighted": 0.8926041027507408, + "eval_loss": 0.28958025574684143, + "eval_precision_macro": 0.862110704875171, + "eval_precision_micro": 0.8955274837828611, + "eval_precision_weighted": 0.9050944481184527, + "eval_recall_macro": 0.6655249799036212, + "eval_recall_micro": 0.8955274837828611, + "eval_recall_weighted": 0.8955274837828611, + "eval_runtime": 18.9608, + "eval_samples_per_second": 154.477, + "eval_steps_per_second": 9.704, + "step": 7310 + }, + { + "epoch": 5.010259917920656, + "grad_norm": 0.8062827587127686, + "learning_rate": 2.7747378020975834e-05, + "loss": 0.458, + "step": 7325 + }, + { + "epoch": 5.027359781121751, + "grad_norm": 7.012026786804199, + "learning_rate": 2.765237878096975e-05, + "loss": 0.4691, + "step": 7350 + }, + { + "epoch": 5.0444596443228455, + "grad_norm": 3.819838762283325, + "learning_rate": 2.7557379540963674e-05, + "loss": 0.5331, + "step": 7375 + }, + { + "epoch": 5.06155950752394, + "grad_norm": 11.148397445678711, + "learning_rate": 2.7462380300957598e-05, + "loss": 0.4309, + "step": 7400 + }, + { + "epoch": 5.078659370725034, + "grad_norm": 4.97418737411499, + "learning_rate": 2.7367381060951515e-05, + "loss": 0.4308, + "step": 7425 + }, + { + "epoch": 5.095759233926128, + "grad_norm": 9.843364715576172, + "learning_rate": 2.7272381820945435e-05, + "loss": 0.5226, + "step": 7450 + }, + { + "epoch": 5.112859097127223, + "grad_norm": 11.50365924835205, + "learning_rate": 2.7177382580939352e-05, + "loss": 0.4092, + "step": 7475 + }, + { + "epoch": 5.1299589603283176, + "grad_norm": 6.617554187774658, + "learning_rate": 2.7082383340933276e-05, + "loss": 0.3954, + "step": 7500 + }, + { + "epoch": 5.147058823529412, + "grad_norm": 0.518602728843689, + "learning_rate": 2.6987384100927192e-05, + "loss": 0.422, + "step": 7525 + }, + { + "epoch": 5.164158686730506, + "grad_norm": 16.087276458740234, + "learning_rate": 2.6892384860921116e-05, + "loss": 0.3651, + "step": 7550 + }, + { + "epoch": 5.1812585499316, + "grad_norm": 0.1962614506483078, + "learning_rate": 2.6797385620915033e-05, + "loss": 0.5446, + "step": 7575 + }, + { + "epoch": 5.198358413132695, + "grad_norm": 8.01890754699707, + "learning_rate": 2.6702386380908957e-05, + "loss": 0.3318, + "step": 7600 + }, + { + "epoch": 5.2154582763337896, + "grad_norm": 36.442684173583984, + "learning_rate": 2.6607387140902874e-05, + "loss": 0.4495, + "step": 7625 + }, + { + "epoch": 5.232558139534884, + "grad_norm": 8.66895866394043, + "learning_rate": 2.6512387900896797e-05, + "loss": 0.4476, + "step": 7650 + }, + { + "epoch": 5.249658002735978, + "grad_norm": 14.132843971252441, + "learning_rate": 2.6417388660890714e-05, + "loss": 0.3548, + "step": 7675 + }, + { + "epoch": 5.266757865937072, + "grad_norm": 11.379664421081543, + "learning_rate": 2.6322389420884634e-05, + "loss": 0.4658, + "step": 7700 + }, + { + "epoch": 5.283857729138167, + "grad_norm": 12.820823669433594, + "learning_rate": 2.622739018087855e-05, + "loss": 0.2941, + "step": 7725 + }, + { + "epoch": 5.300957592339262, + "grad_norm": 26.1966609954834, + "learning_rate": 2.6132390940872475e-05, + "loss": 0.4083, + "step": 7750 + }, + { + "epoch": 5.318057455540355, + "grad_norm": 12.518375396728516, + "learning_rate": 2.6041191670466635e-05, + "loss": 0.3166, + "step": 7775 + }, + { + "epoch": 5.33515731874145, + "grad_norm": 4.027897834777832, + "learning_rate": 2.594619243046056e-05, + "loss": 0.4087, + "step": 7800 + }, + { + "epoch": 5.352257181942544, + "grad_norm": 13.574274063110352, + "learning_rate": 2.5851193190454476e-05, + "loss": 0.5478, + "step": 7825 + }, + { + "epoch": 5.369357045143639, + "grad_norm": 12.73529052734375, + "learning_rate": 2.57561939504484e-05, + "loss": 0.4394, + "step": 7850 + }, + { + "epoch": 5.386456908344734, + "grad_norm": 8.502470016479492, + "learning_rate": 2.5661194710442316e-05, + "loss": 0.397, + "step": 7875 + }, + { + "epoch": 5.403556771545827, + "grad_norm": 7.308871746063232, + "learning_rate": 2.556619547043624e-05, + "loss": 0.4541, + "step": 7900 + }, + { + "epoch": 5.420656634746922, + "grad_norm": 14.608325004577637, + "learning_rate": 2.5471196230430157e-05, + "loss": 0.4646, + "step": 7925 + }, + { + "epoch": 5.437756497948016, + "grad_norm": 6.4289655685424805, + "learning_rate": 2.5376196990424077e-05, + "loss": 0.399, + "step": 7950 + }, + { + "epoch": 5.454856361149111, + "grad_norm": 3.8061683177948, + "learning_rate": 2.5281197750417994e-05, + "loss": 0.4327, + "step": 7975 + }, + { + "epoch": 5.471956224350206, + "grad_norm": 6.391703128814697, + "learning_rate": 2.5186198510411917e-05, + "loss": 0.4641, + "step": 8000 + }, + { + "epoch": 5.489056087551299, + "grad_norm": 2.9124350547790527, + "learning_rate": 2.509119927040584e-05, + "loss": 0.3654, + "step": 8025 + }, + { + "epoch": 5.506155950752394, + "grad_norm": 3.834289789199829, + "learning_rate": 2.4996200030399758e-05, + "loss": 0.5162, + "step": 8050 + }, + { + "epoch": 5.523255813953488, + "grad_norm": 16.672739028930664, + "learning_rate": 2.4901200790393678e-05, + "loss": 0.5626, + "step": 8075 + }, + { + "epoch": 5.540355677154583, + "grad_norm": 26.094615936279297, + "learning_rate": 2.48062015503876e-05, + "loss": 0.3838, + "step": 8100 + }, + { + "epoch": 5.557455540355678, + "grad_norm": 1.9188295602798462, + "learning_rate": 2.471120231038152e-05, + "loss": 0.3746, + "step": 8125 + }, + { + "epoch": 5.574555403556771, + "grad_norm": 3.0162570476531982, + "learning_rate": 2.461620307037544e-05, + "loss": 0.4517, + "step": 8150 + }, + { + "epoch": 5.591655266757866, + "grad_norm": 14.349656105041504, + "learning_rate": 2.4521203830369356e-05, + "loss": 0.4876, + "step": 8175 + }, + { + "epoch": 5.60875512995896, + "grad_norm": 9.013519287109375, + "learning_rate": 2.4426204590363276e-05, + "loss": 0.4394, + "step": 8200 + }, + { + "epoch": 5.625854993160055, + "grad_norm": 3.7371058464050293, + "learning_rate": 2.4331205350357197e-05, + "loss": 0.4345, + "step": 8225 + }, + { + "epoch": 5.642954856361149, + "grad_norm": 13.115042686462402, + "learning_rate": 2.4236206110351117e-05, + "loss": 0.4146, + "step": 8250 + }, + { + "epoch": 5.660054719562243, + "grad_norm": 12.576549530029297, + "learning_rate": 2.4141206870345037e-05, + "loss": 0.4096, + "step": 8275 + }, + { + "epoch": 5.677154582763338, + "grad_norm": 7.4951605796813965, + "learning_rate": 2.4046207630338957e-05, + "loss": 0.3997, + "step": 8300 + }, + { + "epoch": 5.694254445964432, + "grad_norm": 8.070563316345215, + "learning_rate": 2.3951208390332878e-05, + "loss": 0.3297, + "step": 8325 + }, + { + "epoch": 5.711354309165527, + "grad_norm": 14.807238578796387, + "learning_rate": 2.38562091503268e-05, + "loss": 0.3864, + "step": 8350 + }, + { + "epoch": 5.728454172366621, + "grad_norm": 6.503055572509766, + "learning_rate": 2.3761209910320718e-05, + "loss": 0.5571, + "step": 8375 + }, + { + "epoch": 5.745554035567715, + "grad_norm": 3.811549186706543, + "learning_rate": 2.366621067031464e-05, + "loss": 0.3065, + "step": 8400 + }, + { + "epoch": 5.76265389876881, + "grad_norm": 4.377668857574463, + "learning_rate": 2.357121143030856e-05, + "loss": 0.3606, + "step": 8425 + }, + { + "epoch": 5.779753761969904, + "grad_norm": 6.7863874435424805, + "learning_rate": 2.347621219030248e-05, + "loss": 0.3654, + "step": 8450 + }, + { + "epoch": 5.796853625170999, + "grad_norm": 8.570117950439453, + "learning_rate": 2.33812129502964e-05, + "loss": 0.3821, + "step": 8475 + }, + { + "epoch": 5.813953488372093, + "grad_norm": 3.4964771270751953, + "learning_rate": 2.328621371029032e-05, + "loss": 0.3593, + "step": 8500 + }, + { + "epoch": 5.831053351573187, + "grad_norm": 5.006895065307617, + "learning_rate": 2.319121447028424e-05, + "loss": 0.3856, + "step": 8525 + }, + { + "epoch": 5.848153214774282, + "grad_norm": 7.012197971343994, + "learning_rate": 2.309621523027816e-05, + "loss": 0.5216, + "step": 8550 + }, + { + "epoch": 5.865253077975376, + "grad_norm": 11.0383882522583, + "learning_rate": 2.300121599027208e-05, + "loss": 0.5002, + "step": 8575 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 6.153685092926025, + "learning_rate": 2.2906216750266e-05, + "loss": 0.4749, + "step": 8600 + }, + { + "epoch": 5.899452804377565, + "grad_norm": 21.01350975036621, + "learning_rate": 2.2811217510259918e-05, + "loss": 0.3583, + "step": 8625 + }, + { + "epoch": 5.916552667578659, + "grad_norm": 6.175297737121582, + "learning_rate": 2.2716218270253838e-05, + "loss": 0.4317, + "step": 8650 + }, + { + "epoch": 5.933652530779754, + "grad_norm": 2.6204943656921387, + "learning_rate": 2.2621219030247758e-05, + "loss": 0.4452, + "step": 8675 + }, + { + "epoch": 5.950752393980848, + "grad_norm": 2.762593984603882, + "learning_rate": 2.252621979024168e-05, + "loss": 0.3466, + "step": 8700 + }, + { + "epoch": 5.967852257181942, + "grad_norm": 11.155779838562012, + "learning_rate": 2.24312205502356e-05, + "loss": 0.4283, + "step": 8725 + }, + { + "epoch": 5.984952120383037, + "grad_norm": 61.69544219970703, + "learning_rate": 2.233622131022952e-05, + "loss": 0.3336, + "step": 8750 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.9095254353021509, + "eval_f1_macro": 0.748356749541202, + "eval_f1_micro": 0.9095254353021509, + "eval_f1_weighted": 0.9037758276025493, + "eval_loss": 0.297870934009552, + "eval_precision_macro": 0.83851223488873, + "eval_precision_micro": 0.9095254353021509, + "eval_precision_weighted": 0.9132744969964606, + "eval_recall_macro": 0.7323996923201544, + "eval_recall_micro": 0.9095254353021509, + "eval_recall_weighted": 0.9095254353021509, + "eval_runtime": 18.9912, + "eval_samples_per_second": 154.229, + "eval_steps_per_second": 9.689, + "step": 8772 + }, + { + "epoch": 6.002051983584131, + "grad_norm": 9.449117660522461, + "learning_rate": 2.224122207022344e-05, + "loss": 0.3617, + "step": 8775 + }, + { + "epoch": 6.019151846785226, + "grad_norm": 9.420016288757324, + "learning_rate": 2.214622283021736e-05, + "loss": 0.5047, + "step": 8800 + }, + { + "epoch": 6.03625170998632, + "grad_norm": 8.470691680908203, + "learning_rate": 2.2055023559811523e-05, + "loss": 0.4069, + "step": 8825 + }, + { + "epoch": 6.053351573187414, + "grad_norm": 16.81625747680664, + "learning_rate": 2.1960024319805443e-05, + "loss": 0.5216, + "step": 8850 + }, + { + "epoch": 6.070451436388509, + "grad_norm": 14.323150634765625, + "learning_rate": 2.186502507979936e-05, + "loss": 0.3137, + "step": 8875 + }, + { + "epoch": 6.087551299589603, + "grad_norm": 5.009669780731201, + "learning_rate": 2.177002583979328e-05, + "loss": 0.4713, + "step": 8900 + }, + { + "epoch": 6.104651162790698, + "grad_norm": 14.51624870300293, + "learning_rate": 2.16750265997872e-05, + "loss": 0.373, + "step": 8925 + }, + { + "epoch": 6.121751025991792, + "grad_norm": 4.42010498046875, + "learning_rate": 2.158002735978112e-05, + "loss": 0.3218, + "step": 8950 + }, + { + "epoch": 6.138850889192886, + "grad_norm": 18.838573455810547, + "learning_rate": 2.1485028119775045e-05, + "loss": 0.3754, + "step": 8975 + }, + { + "epoch": 6.155950752393981, + "grad_norm": 2.5859174728393555, + "learning_rate": 2.1390028879768965e-05, + "loss": 0.4086, + "step": 9000 + }, + { + "epoch": 6.173050615595075, + "grad_norm": 4.829029560089111, + "learning_rate": 2.1295029639762885e-05, + "loss": 0.3722, + "step": 9025 + }, + { + "epoch": 6.19015047879617, + "grad_norm": 11.934502601623535, + "learning_rate": 2.1200030399756805e-05, + "loss": 0.3506, + "step": 9050 + }, + { + "epoch": 6.207250341997264, + "grad_norm": 3.9261722564697266, + "learning_rate": 2.1105031159750722e-05, + "loss": 0.2612, + "step": 9075 + }, + { + "epoch": 6.224350205198358, + "grad_norm": 0.23096883296966553, + "learning_rate": 2.1010031919744643e-05, + "loss": 0.5446, + "step": 9100 + }, + { + "epoch": 6.241450068399453, + "grad_norm": 13.32019329071045, + "learning_rate": 2.0915032679738563e-05, + "loss": 0.4225, + "step": 9125 + }, + { + "epoch": 6.258549931600547, + "grad_norm": 12.433130264282227, + "learning_rate": 2.0820033439732483e-05, + "loss": 0.3323, + "step": 9150 + }, + { + "epoch": 6.275649794801642, + "grad_norm": 22.49323844909668, + "learning_rate": 2.0725034199726403e-05, + "loss": 0.4702, + "step": 9175 + }, + { + "epoch": 6.292749658002736, + "grad_norm": 7.992762088775635, + "learning_rate": 2.0630034959720324e-05, + "loss": 0.4188, + "step": 9200 + }, + { + "epoch": 6.30984952120383, + "grad_norm": 2.31046986579895, + "learning_rate": 2.0535035719714244e-05, + "loss": 0.2581, + "step": 9225 + }, + { + "epoch": 6.326949384404925, + "grad_norm": 13.177254676818848, + "learning_rate": 2.0440036479708164e-05, + "loss": 0.5264, + "step": 9250 + }, + { + "epoch": 6.344049247606019, + "grad_norm": 16.654388427734375, + "learning_rate": 2.0345037239702085e-05, + "loss": 0.5404, + "step": 9275 + }, + { + "epoch": 6.361149110807114, + "grad_norm": 7.191986083984375, + "learning_rate": 2.0250037999696005e-05, + "loss": 0.3926, + "step": 9300 + }, + { + "epoch": 6.378248974008208, + "grad_norm": 2.7967660427093506, + "learning_rate": 2.0155038759689922e-05, + "loss": 0.3759, + "step": 9325 + }, + { + "epoch": 6.395348837209302, + "grad_norm": 11.951244354248047, + "learning_rate": 2.0060039519683842e-05, + "loss": 0.3715, + "step": 9350 + }, + { + "epoch": 6.412448700410397, + "grad_norm": 20.298959732055664, + "learning_rate": 1.9965040279677762e-05, + "loss": 0.3348, + "step": 9375 + }, + { + "epoch": 6.429548563611491, + "grad_norm": 4.485177516937256, + "learning_rate": 1.9870041039671683e-05, + "loss": 0.3164, + "step": 9400 + }, + { + "epoch": 6.446648426812586, + "grad_norm": 8.650040626525879, + "learning_rate": 1.9775041799665603e-05, + "loss": 0.4892, + "step": 9425 + }, + { + "epoch": 6.46374829001368, + "grad_norm": 8.256196975708008, + "learning_rate": 1.9680042559659523e-05, + "loss": 0.4008, + "step": 9450 + }, + { + "epoch": 6.480848153214774, + "grad_norm": 13.1589994430542, + "learning_rate": 1.9585043319653443e-05, + "loss": 0.3471, + "step": 9475 + }, + { + "epoch": 6.497948016415869, + "grad_norm": 5.785964488983154, + "learning_rate": 1.9490044079647364e-05, + "loss": 0.4492, + "step": 9500 + }, + { + "epoch": 6.515047879616963, + "grad_norm": 5.720312118530273, + "learning_rate": 1.9395044839641284e-05, + "loss": 0.3955, + "step": 9525 + }, + { + "epoch": 6.532147742818058, + "grad_norm": 4.752621650695801, + "learning_rate": 1.9300045599635204e-05, + "loss": 0.5226, + "step": 9550 + }, + { + "epoch": 6.549247606019152, + "grad_norm": 6.577572822570801, + "learning_rate": 1.9205046359629124e-05, + "loss": 0.4383, + "step": 9575 + }, + { + "epoch": 6.566347469220246, + "grad_norm": 2.3268673419952393, + "learning_rate": 1.9110047119623045e-05, + "loss": 0.4475, + "step": 9600 + }, + { + "epoch": 6.583447332421341, + "grad_norm": 7.915472030639648, + "learning_rate": 1.9015047879616965e-05, + "loss": 0.4374, + "step": 9625 + }, + { + "epoch": 6.600547195622435, + "grad_norm": 14.391087532043457, + "learning_rate": 1.8920048639610885e-05, + "loss": 0.3706, + "step": 9650 + }, + { + "epoch": 6.617647058823529, + "grad_norm": 5.97300386428833, + "learning_rate": 1.8825049399604806e-05, + "loss": 0.425, + "step": 9675 + }, + { + "epoch": 6.634746922024624, + "grad_norm": 9.130365371704102, + "learning_rate": 1.8730050159598726e-05, + "loss": 0.3341, + "step": 9700 + }, + { + "epoch": 6.651846785225718, + "grad_norm": 5.5994038581848145, + "learning_rate": 1.8635050919592646e-05, + "loss": 0.4933, + "step": 9725 + }, + { + "epoch": 6.668946648426813, + "grad_norm": 9.19884967803955, + "learning_rate": 1.8540051679586566e-05, + "loss": 0.4012, + "step": 9750 + }, + { + "epoch": 6.686046511627907, + "grad_norm": 3.408245325088501, + "learning_rate": 1.8445052439580483e-05, + "loss": 0.3444, + "step": 9775 + }, + { + "epoch": 6.703146374829001, + "grad_norm": 11.616069793701172, + "learning_rate": 1.8350053199574404e-05, + "loss": 0.3627, + "step": 9800 + }, + { + "epoch": 6.720246238030096, + "grad_norm": 12.855060577392578, + "learning_rate": 1.8255053959568324e-05, + "loss": 0.4833, + "step": 9825 + }, + { + "epoch": 6.73734610123119, + "grad_norm": 4.252665042877197, + "learning_rate": 1.8160054719562244e-05, + "loss": 0.3607, + "step": 9850 + }, + { + "epoch": 6.754445964432285, + "grad_norm": 8.759148597717285, + "learning_rate": 1.8065055479556164e-05, + "loss": 0.403, + "step": 9875 + }, + { + "epoch": 6.771545827633379, + "grad_norm": 11.92839527130127, + "learning_rate": 1.7970056239550085e-05, + "loss": 0.3562, + "step": 9900 + }, + { + "epoch": 6.788645690834473, + "grad_norm": 1.0502179861068726, + "learning_rate": 1.7875056999544005e-05, + "loss": 0.4002, + "step": 9925 + }, + { + "epoch": 6.805745554035568, + "grad_norm": 8.642801284790039, + "learning_rate": 1.7780057759537925e-05, + "loss": 0.4298, + "step": 9950 + }, + { + "epoch": 6.822845417236662, + "grad_norm": 3.608553886413574, + "learning_rate": 1.7685058519531845e-05, + "loss": 0.3687, + "step": 9975 + }, + { + "epoch": 6.839945280437757, + "grad_norm": 17.244091033935547, + "learning_rate": 1.7590059279525762e-05, + "loss": 0.4086, + "step": 10000 + }, + { + "epoch": 6.857045143638851, + "grad_norm": 9.269475936889648, + "learning_rate": 1.7495060039519683e-05, + "loss": 0.4166, + "step": 10025 + }, + { + "epoch": 6.874145006839945, + "grad_norm": 6.287049293518066, + "learning_rate": 1.7400060799513603e-05, + "loss": 0.5342, + "step": 10050 + }, + { + "epoch": 6.89124487004104, + "grad_norm": 2.380673408508301, + "learning_rate": 1.7305061559507523e-05, + "loss": 0.3687, + "step": 10075 + }, + { + "epoch": 6.908344733242134, + "grad_norm": 23.413028717041016, + "learning_rate": 1.7210062319501443e-05, + "loss": 0.3996, + "step": 10100 + }, + { + "epoch": 6.925444596443229, + "grad_norm": 16.1468563079834, + "learning_rate": 1.7115063079495364e-05, + "loss": 0.3844, + "step": 10125 + }, + { + "epoch": 6.942544459644322, + "grad_norm": 1.6500098705291748, + "learning_rate": 1.7020063839489284e-05, + "loss": 0.523, + "step": 10150 + }, + { + "epoch": 6.959644322845417, + "grad_norm": 9.402831077575684, + "learning_rate": 1.6925064599483208e-05, + "loss": 0.3376, + "step": 10175 + }, + { + "epoch": 6.976744186046512, + "grad_norm": 2.928579807281494, + "learning_rate": 1.6830065359477125e-05, + "loss": 0.3303, + "step": 10200 + }, + { + "epoch": 6.993844049247606, + "grad_norm": 2.99859881401062, + "learning_rate": 1.6735066119471045e-05, + "loss": 0.4049, + "step": 10225 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.9156708774325708, + "eval_f1_macro": 0.8149754054596434, + "eval_f1_micro": 0.9156708774325708, + "eval_f1_weighted": 0.9140872488341879, + "eval_loss": 0.250088632106781, + "eval_precision_macro": 0.9251793446372559, + "eval_precision_micro": 0.9156708774325708, + "eval_precision_weighted": 0.9218398298083498, + "eval_recall_macro": 0.7805045376076867, + "eval_recall_micro": 0.9156708774325708, + "eval_recall_weighted": 0.9156708774325708, + "eval_runtime": 19.166, + "eval_samples_per_second": 152.822, + "eval_steps_per_second": 9.6, + "step": 10234 + }, + { + "epoch": 7.010943912448701, + "grad_norm": 5.946883678436279, + "learning_rate": 1.6640066879464965e-05, + "loss": 0.4273, + "step": 10250 + }, + { + "epoch": 7.028043775649794, + "grad_norm": 12.816991806030273, + "learning_rate": 1.6545067639458885e-05, + "loss": 0.3663, + "step": 10275 + }, + { + "epoch": 7.045143638850889, + "grad_norm": 10.432554244995117, + "learning_rate": 1.6450068399452806e-05, + "loss": 0.3136, + "step": 10300 + }, + { + "epoch": 7.062243502051984, + "grad_norm": 13.881523132324219, + "learning_rate": 1.6355069159446726e-05, + "loss": 0.3795, + "step": 10325 + }, + { + "epoch": 7.079343365253078, + "grad_norm": 8.671323776245117, + "learning_rate": 1.6260069919440646e-05, + "loss": 0.4158, + "step": 10350 + }, + { + "epoch": 7.096443228454173, + "grad_norm": 7.5603132247924805, + "learning_rate": 1.6165070679434567e-05, + "loss": 0.3809, + "step": 10375 + }, + { + "epoch": 7.113543091655266, + "grad_norm": 13.723405838012695, + "learning_rate": 1.6070071439428487e-05, + "loss": 0.391, + "step": 10400 + }, + { + "epoch": 7.130642954856361, + "grad_norm": 9.176318168640137, + "learning_rate": 1.5975072199422407e-05, + "loss": 0.3475, + "step": 10425 + }, + { + "epoch": 7.147742818057456, + "grad_norm": 5.787652015686035, + "learning_rate": 1.5880072959416324e-05, + "loss": 0.3745, + "step": 10450 + }, + { + "epoch": 7.16484268125855, + "grad_norm": 3.6111419200897217, + "learning_rate": 1.5785073719410244e-05, + "loss": 0.3897, + "step": 10475 + }, + { + "epoch": 7.181942544459645, + "grad_norm": 9.432286262512207, + "learning_rate": 1.5690074479404165e-05, + "loss": 0.5103, + "step": 10500 + }, + { + "epoch": 7.199042407660738, + "grad_norm": 6.067584037780762, + "learning_rate": 1.5595075239398085e-05, + "loss": 0.4322, + "step": 10525 + }, + { + "epoch": 7.216142270861833, + "grad_norm": 0.6759016513824463, + "learning_rate": 1.5500075999392005e-05, + "loss": 0.4045, + "step": 10550 + }, + { + "epoch": 7.233242134062928, + "grad_norm": 6.492595672607422, + "learning_rate": 1.5405076759385925e-05, + "loss": 0.3742, + "step": 10575 + }, + { + "epoch": 7.250341997264022, + "grad_norm": 10.5081148147583, + "learning_rate": 1.5310077519379846e-05, + "loss": 0.3432, + "step": 10600 + }, + { + "epoch": 7.267441860465116, + "grad_norm": 6.45819616317749, + "learning_rate": 1.5215078279373766e-05, + "loss": 0.3557, + "step": 10625 + }, + { + "epoch": 7.2845417236662104, + "grad_norm": 1.3473492860794067, + "learning_rate": 1.5120079039367684e-05, + "loss": 0.3995, + "step": 10650 + }, + { + "epoch": 7.301641586867305, + "grad_norm": 15.663151741027832, + "learning_rate": 1.5025079799361605e-05, + "loss": 0.4619, + "step": 10675 + }, + { + "epoch": 7.3187414500684, + "grad_norm": 2.441596746444702, + "learning_rate": 1.4930080559355525e-05, + "loss": 0.3351, + "step": 10700 + }, + { + "epoch": 7.335841313269494, + "grad_norm": 18.481773376464844, + "learning_rate": 1.4835081319349445e-05, + "loss": 0.4416, + "step": 10725 + }, + { + "epoch": 7.352941176470588, + "grad_norm": 3.074429750442505, + "learning_rate": 1.4740082079343364e-05, + "loss": 0.314, + "step": 10750 + }, + { + "epoch": 7.3700410396716824, + "grad_norm": 8.20934772491455, + "learning_rate": 1.4645082839337284e-05, + "loss": 0.287, + "step": 10775 + }, + { + "epoch": 7.387140902872777, + "grad_norm": 9.531194686889648, + "learning_rate": 1.4550083599331208e-05, + "loss": 0.4132, + "step": 10800 + }, + { + "epoch": 7.404240766073872, + "grad_norm": 9.128312110900879, + "learning_rate": 1.4455084359325128e-05, + "loss": 0.5293, + "step": 10825 + }, + { + "epoch": 7.421340629274966, + "grad_norm": 12.818424224853516, + "learning_rate": 1.4360085119319047e-05, + "loss": 0.3633, + "step": 10850 + }, + { + "epoch": 7.43844049247606, + "grad_norm": 2.5819342136383057, + "learning_rate": 1.4265085879312967e-05, + "loss": 0.2941, + "step": 10875 + }, + { + "epoch": 7.4555403556771545, + "grad_norm": 0.3548867702484131, + "learning_rate": 1.4170086639306887e-05, + "loss": 0.3477, + "step": 10900 + }, + { + "epoch": 7.472640218878249, + "grad_norm": 9.35716438293457, + "learning_rate": 1.4075087399300808e-05, + "loss": 0.3415, + "step": 10925 + }, + { + "epoch": 7.489740082079344, + "grad_norm": 0.888134241104126, + "learning_rate": 1.3980088159294726e-05, + "loss": 0.4376, + "step": 10950 + }, + { + "epoch": 7.506839945280438, + "grad_norm": 3.009415626525879, + "learning_rate": 1.3885088919288646e-05, + "loss": 0.3566, + "step": 10975 + }, + { + "epoch": 7.523939808481532, + "grad_norm": 0.4245036542415619, + "learning_rate": 1.3790089679282567e-05, + "loss": 0.3407, + "step": 11000 + }, + { + "epoch": 7.5410396716826265, + "grad_norm": 9.772459983825684, + "learning_rate": 1.3695090439276487e-05, + "loss": 0.5112, + "step": 11025 + }, + { + "epoch": 7.558139534883721, + "grad_norm": 8.6549654006958, + "learning_rate": 1.3600091199270407e-05, + "loss": 0.3654, + "step": 11050 + }, + { + "epoch": 7.575239398084816, + "grad_norm": 12.258879661560059, + "learning_rate": 1.3505091959264326e-05, + "loss": 0.394, + "step": 11075 + }, + { + "epoch": 7.592339261285909, + "grad_norm": 8.852180480957031, + "learning_rate": 1.3410092719258246e-05, + "loss": 0.3667, + "step": 11100 + }, + { + "epoch": 7.609439124487004, + "grad_norm": 19.00887680053711, + "learning_rate": 1.3315093479252166e-05, + "loss": 0.3465, + "step": 11125 + }, + { + "epoch": 7.6265389876880985, + "grad_norm": 24.143585205078125, + "learning_rate": 1.3220094239246087e-05, + "loss": 0.3878, + "step": 11150 + }, + { + "epoch": 7.643638850889193, + "grad_norm": 4.1856889724731445, + "learning_rate": 1.3125094999240007e-05, + "loss": 0.3615, + "step": 11175 + }, + { + "epoch": 7.660738714090288, + "grad_norm": 11.348432540893555, + "learning_rate": 1.3030095759233925e-05, + "loss": 0.3192, + "step": 11200 + }, + { + "epoch": 7.677838577291381, + "grad_norm": 4.999576091766357, + "learning_rate": 1.2935096519227846e-05, + "loss": 0.3134, + "step": 11225 + }, + { + "epoch": 7.694938440492476, + "grad_norm": 11.35132122039795, + "learning_rate": 1.2840097279221766e-05, + "loss": 0.4056, + "step": 11250 + }, + { + "epoch": 7.7120383036935705, + "grad_norm": 15.860554695129395, + "learning_rate": 1.2745098039215686e-05, + "loss": 0.2678, + "step": 11275 + }, + { + "epoch": 7.729138166894665, + "grad_norm": 3.4646947383880615, + "learning_rate": 1.2650098799209607e-05, + "loss": 0.396, + "step": 11300 + }, + { + "epoch": 7.74623803009576, + "grad_norm": 3.1925065517425537, + "learning_rate": 1.2555099559203525e-05, + "loss": 0.324, + "step": 11325 + }, + { + "epoch": 7.763337893296853, + "grad_norm": 3.6302490234375, + "learning_rate": 1.2460100319197447e-05, + "loss": 0.3766, + "step": 11350 + }, + { + "epoch": 7.780437756497948, + "grad_norm": 20.079179763793945, + "learning_rate": 1.2365101079191367e-05, + "loss": 0.3841, + "step": 11375 + }, + { + "epoch": 7.7975376196990425, + "grad_norm": 11.020298957824707, + "learning_rate": 1.2270101839185288e-05, + "loss": 0.4496, + "step": 11400 + }, + { + "epoch": 7.814637482900137, + "grad_norm": 4.884584426879883, + "learning_rate": 1.2175102599179206e-05, + "loss": 0.3219, + "step": 11425 + }, + { + "epoch": 7.831737346101232, + "grad_norm": 18.95062828063965, + "learning_rate": 1.2080103359173127e-05, + "loss": 0.2958, + "step": 11450 + }, + { + "epoch": 7.848837209302325, + "grad_norm": 7.927674770355225, + "learning_rate": 1.1985104119167047e-05, + "loss": 0.5062, + "step": 11475 + }, + { + "epoch": 7.86593707250342, + "grad_norm": 18.551855087280273, + "learning_rate": 1.189390484876121e-05, + "loss": 0.4039, + "step": 11500 + }, + { + "epoch": 7.8830369357045145, + "grad_norm": 5.578052520751953, + "learning_rate": 1.179890560875513e-05, + "loss": 0.332, + "step": 11525 + }, + { + "epoch": 7.900136798905609, + "grad_norm": 0.06869751960039139, + "learning_rate": 1.1703906368749049e-05, + "loss": 0.4136, + "step": 11550 + }, + { + "epoch": 7.917236662106703, + "grad_norm": 7.070012092590332, + "learning_rate": 1.1608907128742971e-05, + "loss": 0.3402, + "step": 11575 + }, + { + "epoch": 7.934336525307797, + "grad_norm": 2.309910774230957, + "learning_rate": 1.1513907888736891e-05, + "loss": 0.3272, + "step": 11600 + }, + { + "epoch": 7.951436388508892, + "grad_norm": 20.965015411376953, + "learning_rate": 1.1418908648730812e-05, + "loss": 0.2962, + "step": 11625 + }, + { + "epoch": 7.9685362517099865, + "grad_norm": 5.13886022567749, + "learning_rate": 1.132390940872473e-05, + "loss": 0.455, + "step": 11650 + }, + { + "epoch": 7.985636114911081, + "grad_norm": 3.935183525085449, + "learning_rate": 1.122891016871865e-05, + "loss": 0.3484, + "step": 11675 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.9211334926596108, + "eval_f1_macro": 0.8568820222911021, + "eval_f1_micro": 0.9211334926596108, + "eval_f1_weighted": 0.9210426818413497, + "eval_loss": 0.2283647209405899, + "eval_precision_macro": 0.9255394303052991, + "eval_precision_micro": 0.9211334926596108, + "eval_precision_weighted": 0.9272957997209303, + "eval_recall_macro": 0.8317609357993986, + "eval_recall_micro": 0.9211334926596108, + "eval_recall_weighted": 0.9211334926596108, + "eval_runtime": 19.3065, + "eval_samples_per_second": 151.711, + "eval_steps_per_second": 9.53, + "step": 11696 + }, + { + "epoch": 8.002735978112176, + "grad_norm": 14.681279182434082, + "learning_rate": 1.113391092871257e-05, + "loss": 0.2686, + "step": 11700 + }, + { + "epoch": 8.01983584131327, + "grad_norm": 26.67691993713379, + "learning_rate": 1.1042711658306734e-05, + "loss": 0.3767, + "step": 11725 + }, + { + "epoch": 8.036935704514363, + "grad_norm": 4.675159931182861, + "learning_rate": 1.0947712418300655e-05, + "loss": 0.304, + "step": 11750 + }, + { + "epoch": 8.054035567715458, + "grad_norm": 8.4456787109375, + "learning_rate": 1.0852713178294575e-05, + "loss": 0.2553, + "step": 11775 + }, + { + "epoch": 8.071135430916552, + "grad_norm": 15.122594833374023, + "learning_rate": 1.0757713938288493e-05, + "loss": 0.3248, + "step": 11800 + }, + { + "epoch": 8.088235294117647, + "grad_norm": 10.912254333496094, + "learning_rate": 1.0662714698282414e-05, + "loss": 0.3009, + "step": 11825 + }, + { + "epoch": 8.105335157318741, + "grad_norm": 13.658234596252441, + "learning_rate": 1.0567715458276334e-05, + "loss": 0.4445, + "step": 11850 + }, + { + "epoch": 8.122435020519836, + "grad_norm": 0.18706431984901428, + "learning_rate": 1.0472716218270254e-05, + "loss": 0.267, + "step": 11875 + }, + { + "epoch": 8.13953488372093, + "grad_norm": 24.79719352722168, + "learning_rate": 1.0377716978264174e-05, + "loss": 0.3466, + "step": 11900 + }, + { + "epoch": 8.156634746922025, + "grad_norm": 18.876535415649414, + "learning_rate": 1.0282717738258095e-05, + "loss": 0.4939, + "step": 11925 + }, + { + "epoch": 8.17373461012312, + "grad_norm": 7.15775728225708, + "learning_rate": 1.0187718498252015e-05, + "loss": 0.3728, + "step": 11950 + }, + { + "epoch": 8.190834473324214, + "grad_norm": 4.604434967041016, + "learning_rate": 1.0092719258245935e-05, + "loss": 0.3492, + "step": 11975 + }, + { + "epoch": 8.207934336525307, + "grad_norm": 6.463050365447998, + "learning_rate": 9.997720018239856e-06, + "loss": 0.3298, + "step": 12000 + }, + { + "epoch": 8.225034199726402, + "grad_norm": 16.29618263244629, + "learning_rate": 9.902720778233774e-06, + "loss": 0.3415, + "step": 12025 + }, + { + "epoch": 8.242134062927496, + "grad_norm": 5.63080358505249, + "learning_rate": 9.807721538227694e-06, + "loss": 0.2608, + "step": 12050 + }, + { + "epoch": 8.25923392612859, + "grad_norm": 0.7199766039848328, + "learning_rate": 9.712722298221615e-06, + "loss": 0.3974, + "step": 12075 + }, + { + "epoch": 8.276333789329685, + "grad_norm": 15.456204414367676, + "learning_rate": 9.617723058215535e-06, + "loss": 0.3329, + "step": 12100 + }, + { + "epoch": 8.29343365253078, + "grad_norm": 18.643985748291016, + "learning_rate": 9.522723818209454e-06, + "loss": 0.4801, + "step": 12125 + }, + { + "epoch": 8.310533515731874, + "grad_norm": 4.800582408905029, + "learning_rate": 9.427724578203374e-06, + "loss": 0.557, + "step": 12150 + }, + { + "epoch": 8.327633378932969, + "grad_norm": 22.22751808166504, + "learning_rate": 9.332725338197294e-06, + "loss": 0.3648, + "step": 12175 + }, + { + "epoch": 8.344733242134064, + "grad_norm": 5.446302890777588, + "learning_rate": 9.237726098191216e-06, + "loss": 0.2558, + "step": 12200 + }, + { + "epoch": 8.361833105335158, + "grad_norm": 0.26866602897644043, + "learning_rate": 9.142726858185136e-06, + "loss": 0.3962, + "step": 12225 + }, + { + "epoch": 8.378932968536251, + "grad_norm": 3.1288976669311523, + "learning_rate": 9.047727618179055e-06, + "loss": 0.439, + "step": 12250 + }, + { + "epoch": 8.396032831737346, + "grad_norm": 2.740288496017456, + "learning_rate": 8.952728378172975e-06, + "loss": 0.3076, + "step": 12275 + }, + { + "epoch": 8.41313269493844, + "grad_norm": 4.094404697418213, + "learning_rate": 8.857729138166896e-06, + "loss": 0.3551, + "step": 12300 + }, + { + "epoch": 8.430232558139535, + "grad_norm": 9.859013557434082, + "learning_rate": 8.762729898160816e-06, + "loss": 0.3046, + "step": 12325 + }, + { + "epoch": 8.44733242134063, + "grad_norm": 7.303380966186523, + "learning_rate": 8.667730658154734e-06, + "loss": 0.2405, + "step": 12350 + }, + { + "epoch": 8.464432284541724, + "grad_norm": 11.945883750915527, + "learning_rate": 8.572731418148655e-06, + "loss": 0.367, + "step": 12375 + }, + { + "epoch": 8.481532147742818, + "grad_norm": 8.770705223083496, + "learning_rate": 8.477732178142575e-06, + "loss": 0.3977, + "step": 12400 + }, + { + "epoch": 8.498632010943913, + "grad_norm": 5.229104042053223, + "learning_rate": 8.382732938136495e-06, + "loss": 0.3075, + "step": 12425 + }, + { + "epoch": 8.515731874145008, + "grad_norm": 44.49745178222656, + "learning_rate": 8.287733698130415e-06, + "loss": 0.373, + "step": 12450 + }, + { + "epoch": 8.5328317373461, + "grad_norm": 11.067756652832031, + "learning_rate": 8.192734458124334e-06, + "loss": 0.5501, + "step": 12475 + }, + { + "epoch": 8.549931600547195, + "grad_norm": 3.7558584213256836, + "learning_rate": 8.097735218118254e-06, + "loss": 0.3831, + "step": 12500 + }, + { + "epoch": 8.56703146374829, + "grad_norm": 6.008462429046631, + "learning_rate": 8.002735978112176e-06, + "loss": 0.2394, + "step": 12525 + }, + { + "epoch": 8.584131326949384, + "grad_norm": 10.782341003417969, + "learning_rate": 7.907736738106097e-06, + "loss": 0.2815, + "step": 12550 + }, + { + "epoch": 8.601231190150479, + "grad_norm": 3.08451247215271, + "learning_rate": 7.812737498100015e-06, + "loss": 0.4385, + "step": 12575 + }, + { + "epoch": 8.618331053351573, + "grad_norm": 2.4561235904693604, + "learning_rate": 7.717738258093935e-06, + "loss": 0.3698, + "step": 12600 + }, + { + "epoch": 8.635430916552668, + "grad_norm": 6.739116668701172, + "learning_rate": 7.622739018087856e-06, + "loss": 0.3201, + "step": 12625 + }, + { + "epoch": 8.652530779753763, + "grad_norm": 11.243478775024414, + "learning_rate": 7.527739778081776e-06, + "loss": 0.4415, + "step": 12650 + }, + { + "epoch": 8.669630642954857, + "grad_norm": 3.1412322521209717, + "learning_rate": 7.432740538075695e-06, + "loss": 0.2533, + "step": 12675 + }, + { + "epoch": 8.68673050615595, + "grad_norm": 14.60197639465332, + "learning_rate": 7.337741298069616e-06, + "loss": 0.4057, + "step": 12700 + }, + { + "epoch": 8.703830369357044, + "grad_norm": 9.934842109680176, + "learning_rate": 7.242742058063535e-06, + "loss": 0.3252, + "step": 12725 + }, + { + "epoch": 8.720930232558139, + "grad_norm": 1.3907521963119507, + "learning_rate": 7.147742818057455e-06, + "loss": 0.4068, + "step": 12750 + }, + { + "epoch": 8.738030095759234, + "grad_norm": 5.904654502868652, + "learning_rate": 7.052743578051376e-06, + "loss": 0.3572, + "step": 12775 + }, + { + "epoch": 8.755129958960328, + "grad_norm": 12.644196510314941, + "learning_rate": 6.957744338045295e-06, + "loss": 0.3342, + "step": 12800 + }, + { + "epoch": 8.772229822161423, + "grad_norm": 13.406341552734375, + "learning_rate": 6.862745098039216e-06, + "loss": 0.3859, + "step": 12825 + }, + { + "epoch": 8.789329685362517, + "grad_norm": 7.523469924926758, + "learning_rate": 6.7677458580331365e-06, + "loss": 0.2771, + "step": 12850 + }, + { + "epoch": 8.806429548563612, + "grad_norm": 2.058061122894287, + "learning_rate": 6.672746618027057e-06, + "loss": 0.3956, + "step": 12875 + }, + { + "epoch": 8.823529411764707, + "grad_norm": 13.852447509765625, + "learning_rate": 6.577747378020976e-06, + "loss": 0.288, + "step": 12900 + }, + { + "epoch": 8.840629274965801, + "grad_norm": 3.28694748878479, + "learning_rate": 6.4827481380148965e-06, + "loss": 0.3175, + "step": 12925 + }, + { + "epoch": 8.857729138166894, + "grad_norm": 4.923558235168457, + "learning_rate": 6.387748898008816e-06, + "loss": 0.4003, + "step": 12950 + }, + { + "epoch": 8.874829001367988, + "grad_norm": 13.867571830749512, + "learning_rate": 6.292749658002736e-06, + "loss": 0.3514, + "step": 12975 + }, + { + "epoch": 8.891928864569083, + "grad_norm": 3.354799747467041, + "learning_rate": 6.1977504179966565e-06, + "loss": 0.3073, + "step": 13000 + }, + { + "epoch": 8.909028727770178, + "grad_norm": 20.982271194458008, + "learning_rate": 6.102751177990576e-06, + "loss": 0.4001, + "step": 13025 + }, + { + "epoch": 8.926128590971272, + "grad_norm": 1.5266101360321045, + "learning_rate": 6.007751937984497e-06, + "loss": 0.2836, + "step": 13050 + }, + { + "epoch": 8.943228454172367, + "grad_norm": 4.203621864318848, + "learning_rate": 5.9127526979784164e-06, + "loss": 0.3879, + "step": 13075 + }, + { + "epoch": 8.960328317373461, + "grad_norm": 13.059199333190918, + "learning_rate": 5.817753457972337e-06, + "loss": 0.2895, + "step": 13100 + }, + { + "epoch": 8.977428180574556, + "grad_norm": 11.570258140563965, + "learning_rate": 5.722754217966256e-06, + "loss": 0.3616, + "step": 13125 + }, + { + "epoch": 8.99452804377565, + "grad_norm": 31.507492065429688, + "learning_rate": 5.627754977960176e-06, + "loss": 0.3524, + "step": 13150 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.9231819733697507, + "eval_f1_macro": 0.8652261940397432, + "eval_f1_micro": 0.9231819733697507, + "eval_f1_weighted": 0.9229570596156854, + "eval_loss": 0.22409066557884216, + "eval_precision_macro": 0.939434014505588, + "eval_precision_micro": 0.9231819733697507, + "eval_precision_weighted": 0.928643976460822, + "eval_recall_macro": 0.824494199524642, + "eval_recall_micro": 0.9231819733697507, + "eval_recall_weighted": 0.9231819733697507, + "eval_runtime": 19.264, + "eval_samples_per_second": 152.046, + "eval_steps_per_second": 9.552, + "step": 13158 + }, + { + "epoch": 9.011627906976743, + "grad_norm": 3.1529383659362793, + "learning_rate": 5.532755737954097e-06, + "loss": 0.2935, + "step": 13175 + }, + { + "epoch": 9.028727770177838, + "grad_norm": 1.6082165241241455, + "learning_rate": 5.437756497948017e-06, + "loss": 0.2694, + "step": 13200 + }, + { + "epoch": 9.045827633378932, + "grad_norm": 6.932997703552246, + "learning_rate": 5.342757257941937e-06, + "loss": 0.4234, + "step": 13225 + }, + { + "epoch": 9.062927496580027, + "grad_norm": 2.4087891578674316, + "learning_rate": 5.247758017935857e-06, + "loss": 0.297, + "step": 13250 + }, + { + "epoch": 9.080027359781122, + "grad_norm": 8.607876777648926, + "learning_rate": 5.152758777929777e-06, + "loss": 0.3279, + "step": 13275 + }, + { + "epoch": 9.097127222982216, + "grad_norm": 4.843038082122803, + "learning_rate": 5.057759537923696e-06, + "loss": 0.2534, + "step": 13300 + }, + { + "epoch": 9.11422708618331, + "grad_norm": 9.388402938842773, + "learning_rate": 4.962760297917617e-06, + "loss": 0.2849, + "step": 13325 + }, + { + "epoch": 9.131326949384405, + "grad_norm": 2.4661998748779297, + "learning_rate": 4.867761057911537e-06, + "loss": 0.3157, + "step": 13350 + }, + { + "epoch": 9.1484268125855, + "grad_norm": 13.333016395568848, + "learning_rate": 4.772761817905457e-06, + "loss": 0.2772, + "step": 13375 + }, + { + "epoch": 9.165526675786595, + "grad_norm": 6.937953948974609, + "learning_rate": 4.6777625778993775e-06, + "loss": 0.3582, + "step": 13400 + }, + { + "epoch": 9.182626538987687, + "grad_norm": 5.6831159591674805, + "learning_rate": 4.582763337893297e-06, + "loss": 0.3183, + "step": 13425 + }, + { + "epoch": 9.199726402188782, + "grad_norm": 7.25540018081665, + "learning_rate": 4.487764097887217e-06, + "loss": 0.4322, + "step": 13450 + }, + { + "epoch": 9.216826265389876, + "grad_norm": 4.3177103996276855, + "learning_rate": 4.392764857881137e-06, + "loss": 0.3983, + "step": 13475 + }, + { + "epoch": 9.233926128590971, + "grad_norm": 15.372535705566406, + "learning_rate": 4.297765617875058e-06, + "loss": 0.3684, + "step": 13500 + }, + { + "epoch": 9.251025991792066, + "grad_norm": 8.219186782836914, + "learning_rate": 4.202766377868977e-06, + "loss": 0.2893, + "step": 13525 + }, + { + "epoch": 9.26812585499316, + "grad_norm": 14.162530899047852, + "learning_rate": 4.1077671378628974e-06, + "loss": 0.3841, + "step": 13550 + }, + { + "epoch": 9.285225718194255, + "grad_norm": 2.816765308380127, + "learning_rate": 4.012767897856817e-06, + "loss": 0.4276, + "step": 13575 + }, + { + "epoch": 9.30232558139535, + "grad_norm": 1.3700157403945923, + "learning_rate": 3.917768657850737e-06, + "loss": 0.4496, + "step": 13600 + }, + { + "epoch": 9.319425444596444, + "grad_norm": 8.893135070800781, + "learning_rate": 3.822769417844657e-06, + "loss": 0.2779, + "step": 13625 + }, + { + "epoch": 9.336525307797537, + "grad_norm": 6.580329895019531, + "learning_rate": 3.7277701778385777e-06, + "loss": 0.341, + "step": 13650 + }, + { + "epoch": 9.353625170998631, + "grad_norm": 6.170793533325195, + "learning_rate": 3.6327709378324975e-06, + "loss": 0.3211, + "step": 13675 + }, + { + "epoch": 9.370725034199726, + "grad_norm": 6.2319254875183105, + "learning_rate": 3.537771697826418e-06, + "loss": 0.3529, + "step": 13700 + }, + { + "epoch": 9.38782489740082, + "grad_norm": 3.14901065826416, + "learning_rate": 3.4427724578203377e-06, + "loss": 0.2847, + "step": 13725 + }, + { + "epoch": 9.404924760601915, + "grad_norm": 12.451719284057617, + "learning_rate": 3.3477732178142575e-06, + "loss": 0.3679, + "step": 13750 + }, + { + "epoch": 9.42202462380301, + "grad_norm": 2.5386195182800293, + "learning_rate": 3.2527739778081774e-06, + "loss": 0.2476, + "step": 13775 + }, + { + "epoch": 9.439124487004104, + "grad_norm": 11.419671058654785, + "learning_rate": 3.157774737802098e-06, + "loss": 0.3914, + "step": 13800 + }, + { + "epoch": 9.456224350205199, + "grad_norm": 23.787368774414062, + "learning_rate": 3.0627754977960175e-06, + "loss": 0.3023, + "step": 13825 + }, + { + "epoch": 9.473324213406293, + "grad_norm": 13.726613998413086, + "learning_rate": 2.9677762577899378e-06, + "loss": 0.3243, + "step": 13850 + }, + { + "epoch": 9.490424076607388, + "grad_norm": 3.7777926921844482, + "learning_rate": 2.8727770177838576e-06, + "loss": 0.3515, + "step": 13875 + }, + { + "epoch": 9.50752393980848, + "grad_norm": 3.651082992553711, + "learning_rate": 2.777777777777778e-06, + "loss": 0.3076, + "step": 13900 + }, + { + "epoch": 9.524623803009575, + "grad_norm": 2.7207062244415283, + "learning_rate": 2.682778537771698e-06, + "loss": 0.3613, + "step": 13925 + }, + { + "epoch": 9.54172366621067, + "grad_norm": 6.451671600341797, + "learning_rate": 2.587779297765618e-06, + "loss": 0.2136, + "step": 13950 + }, + { + "epoch": 9.558823529411764, + "grad_norm": 10.220746040344238, + "learning_rate": 2.492780057759538e-06, + "loss": 0.3348, + "step": 13975 + }, + { + "epoch": 9.575923392612859, + "grad_norm": 14.093595504760742, + "learning_rate": 2.397780817753458e-06, + "loss": 0.2827, + "step": 14000 + }, + { + "epoch": 9.593023255813954, + "grad_norm": 2.391063928604126, + "learning_rate": 2.302781577747378e-06, + "loss": 0.3201, + "step": 14025 + }, + { + "epoch": 9.610123119015048, + "grad_norm": 15.106823921203613, + "learning_rate": 2.207782337741298e-06, + "loss": 0.3192, + "step": 14050 + }, + { + "epoch": 9.627222982216143, + "grad_norm": 4.812911510467529, + "learning_rate": 2.112783097735218e-06, + "loss": 0.3065, + "step": 14075 + }, + { + "epoch": 9.644322845417237, + "grad_norm": 4.565815448760986, + "learning_rate": 2.0177838577291384e-06, + "loss": 0.3443, + "step": 14100 + }, + { + "epoch": 9.661422708618332, + "grad_norm": 0.13094140589237213, + "learning_rate": 1.9227846177230583e-06, + "loss": 0.2713, + "step": 14125 + }, + { + "epoch": 9.678522571819425, + "grad_norm": 22.36683464050293, + "learning_rate": 1.8277853777169783e-06, + "loss": 0.314, + "step": 14150 + }, + { + "epoch": 9.69562243502052, + "grad_norm": 8.649237632751465, + "learning_rate": 1.7327861377108984e-06, + "loss": 0.3816, + "step": 14175 + }, + { + "epoch": 9.712722298221614, + "grad_norm": 2.255821466445923, + "learning_rate": 1.6377868977048183e-06, + "loss": 0.3827, + "step": 14200 + }, + { + "epoch": 9.729822161422709, + "grad_norm": 5.888030052185059, + "learning_rate": 1.5427876576987383e-06, + "loss": 0.3207, + "step": 14225 + }, + { + "epoch": 9.746922024623803, + "grad_norm": 1.6394869089126587, + "learning_rate": 1.4477884176926586e-06, + "loss": 0.2782, + "step": 14250 + }, + { + "epoch": 9.764021887824898, + "grad_norm": 0.9336591362953186, + "learning_rate": 1.3527891776865787e-06, + "loss": 0.3653, + "step": 14275 + }, + { + "epoch": 9.781121751025992, + "grad_norm": 8.919906616210938, + "learning_rate": 1.2577899376804985e-06, + "loss": 0.2396, + "step": 14300 + }, + { + "epoch": 9.798221614227087, + "grad_norm": 6.571496963500977, + "learning_rate": 1.1627906976744186e-06, + "loss": 0.303, + "step": 14325 + }, + { + "epoch": 9.815321477428181, + "grad_norm": 13.167415618896484, + "learning_rate": 1.0677914576683389e-06, + "loss": 0.2993, + "step": 14350 + }, + { + "epoch": 9.832421340629274, + "grad_norm": 1.0842267274856567, + "learning_rate": 9.727922176622587e-07, + "loss": 0.3435, + "step": 14375 + }, + { + "epoch": 9.849521203830369, + "grad_norm": 4.068078517913818, + "learning_rate": 8.777929776561788e-07, + "loss": 0.2995, + "step": 14400 + }, + { + "epoch": 9.866621067031463, + "grad_norm": 11.969517707824707, + "learning_rate": 7.827937376500988e-07, + "loss": 0.3289, + "step": 14425 + }, + { + "epoch": 9.883720930232558, + "grad_norm": 9.880623817443848, + "learning_rate": 6.877944976440189e-07, + "loss": 0.292, + "step": 14450 + }, + { + "epoch": 9.900820793433653, + "grad_norm": 11.973766326904297, + "learning_rate": 5.92795257637939e-07, + "loss": 0.3398, + "step": 14475 + }, + { + "epoch": 9.917920656634747, + "grad_norm": 2.8612163066864014, + "learning_rate": 4.977960176318589e-07, + "loss": 0.414, + "step": 14500 + }, + { + "epoch": 9.935020519835842, + "grad_norm": 31.290515899658203, + "learning_rate": 4.0279677762577904e-07, + "loss": 0.3436, + "step": 14525 + }, + { + "epoch": 9.952120383036936, + "grad_norm": 6.3889241218566895, + "learning_rate": 3.0779753761969905e-07, + "loss": 0.3069, + "step": 14550 + }, + { + "epoch": 9.96922024623803, + "grad_norm": 9.988734245300293, + "learning_rate": 2.127982976136191e-07, + "loss": 0.4289, + "step": 14575 + }, + { + "epoch": 9.986320109439124, + "grad_norm": 13.143084526062012, + "learning_rate": 1.1779905760753915e-07, + "loss": 0.2766, + "step": 14600 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.9269375213383407, + "eval_f1_macro": 0.881587062204185, + "eval_f1_micro": 0.9269375213383407, + "eval_f1_weighted": 0.9267500134300362, + "eval_loss": 0.22053596377372742, + "eval_precision_macro": 0.9520135455160805, + "eval_precision_micro": 0.9269375213383407, + "eval_precision_weighted": 0.932072731880276, + "eval_recall_macro": 0.8425714533291321, + "eval_recall_micro": 0.9269375213383407, + "eval_recall_weighted": 0.9269375213383407, + "eval_runtime": 19.2006, + "eval_samples_per_second": 152.547, + "eval_steps_per_second": 9.583, + "step": 14620 + } + ], + "logging_steps": 25, + "max_steps": 14620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 9.058483691559752e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}