diff --git "a/checkpoint-20391/trainer_state.json" "b/checkpoint-20391/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-20391/trainer_state.json" @@ -0,0 +1,5786 @@ +{ + "best_metric": 0.01777876727283001, + "best_model_checkpoint": "autotrain-vp92t-1q2id/checkpoint-20391", + "epoch": 2.9998896612600685, + "eval_steps": 500, + "global_step": 20391, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003677957997719666, + "grad_norm": 4.279318809509277, + "learning_rate": 3.6764705882352943e-07, + "loss": 0.685, + "step": 25 + }, + { + "epoch": 0.007355915995439332, + "grad_norm": 5.647529602050781, + "learning_rate": 7.352941176470589e-07, + "loss": 0.6757, + "step": 50 + }, + { + "epoch": 0.011033873993158997, + "grad_norm": 5.691235065460205, + "learning_rate": 1.1029411764705884e-06, + "loss": 0.6611, + "step": 75 + }, + { + "epoch": 0.014711831990878664, + "grad_norm": 3.6039223670959473, + "learning_rate": 1.4705882352941177e-06, + "loss": 0.6402, + "step": 100 + }, + { + "epoch": 0.018389789988598332, + "grad_norm": 5.447757244110107, + "learning_rate": 1.8382352941176471e-06, + "loss": 0.5745, + "step": 125 + }, + { + "epoch": 0.022067747986317995, + "grad_norm": 11.862848281860352, + "learning_rate": 2.2058823529411767e-06, + "loss": 0.4869, + "step": 150 + }, + { + "epoch": 0.02574570598403766, + "grad_norm": 8.085945129394531, + "learning_rate": 2.573529411764706e-06, + "loss": 0.3687, + "step": 175 + }, + { + "epoch": 0.029423663981757327, + "grad_norm": 7.246406555175781, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.3029, + "step": 200 + }, + { + "epoch": 0.033101621979477, + "grad_norm": 17.822601318359375, + "learning_rate": 3.308823529411765e-06, + "loss": 0.2098, + "step": 225 + }, + { + "epoch": 0.036779579977196664, + "grad_norm": 21.39044952392578, + "learning_rate": 3.6764705882352942e-06, + "loss": 0.1483, + "step": 250 + }, + { + "epoch": 0.04045753797491632, + "grad_norm": 1.4089692831039429, + "learning_rate": 4.044117647058823e-06, + "loss": 0.0803, + "step": 275 + }, + { + "epoch": 0.04413549597263599, + "grad_norm": 0.7371423840522766, + "learning_rate": 4.411764705882353e-06, + "loss": 0.0396, + "step": 300 + }, + { + "epoch": 0.047813453970355656, + "grad_norm": 0.22398647665977478, + "learning_rate": 4.779411764705882e-06, + "loss": 0.0381, + "step": 325 + }, + { + "epoch": 0.05149141196807532, + "grad_norm": 0.25332173705101013, + "learning_rate": 5.147058823529412e-06, + "loss": 0.0306, + "step": 350 + }, + { + "epoch": 0.05516936996579499, + "grad_norm": 0.24578100442886353, + "learning_rate": 5.5147058823529415e-06, + "loss": 0.0342, + "step": 375 + }, + { + "epoch": 0.058847327963514655, + "grad_norm": 0.09213005006313324, + "learning_rate": 5.882352941176471e-06, + "loss": 0.0094, + "step": 400 + }, + { + "epoch": 0.06252528596123433, + "grad_norm": 11.745342254638672, + "learning_rate": 6.25e-06, + "loss": 0.0627, + "step": 425 + }, + { + "epoch": 0.066203243958954, + "grad_norm": 12.147088050842285, + "learning_rate": 6.61764705882353e-06, + "loss": 0.0755, + "step": 450 + }, + { + "epoch": 0.06988120195667366, + "grad_norm": 0.14864382147789001, + "learning_rate": 6.985294117647059e-06, + "loss": 0.0532, + "step": 475 + }, + { + "epoch": 0.07355915995439333, + "grad_norm": 0.06755024939775467, + "learning_rate": 7.3529411764705884e-06, + "loss": 0.049, + "step": 500 + }, + { + "epoch": 0.077237117952113, + "grad_norm": 
0.0582403726875782, + "learning_rate": 7.720588235294117e-06, + "loss": 0.0213, + "step": 525 + }, + { + "epoch": 0.08091507594983265, + "grad_norm": 0.04456046596169472, + "learning_rate": 8.088235294117646e-06, + "loss": 0.0446, + "step": 550 + }, + { + "epoch": 0.08459303394755231, + "grad_norm": 0.058339089155197144, + "learning_rate": 8.455882352941177e-06, + "loss": 0.0442, + "step": 575 + }, + { + "epoch": 0.08827099194527198, + "grad_norm": 0.03951073810458183, + "learning_rate": 8.823529411764707e-06, + "loss": 0.0131, + "step": 600 + }, + { + "epoch": 0.09194894994299165, + "grad_norm": 0.08159155398607254, + "learning_rate": 9.191176470588236e-06, + "loss": 0.0429, + "step": 625 + }, + { + "epoch": 0.09562690794071131, + "grad_norm": 0.0362938717007637, + "learning_rate": 9.558823529411764e-06, + "loss": 0.0336, + "step": 650 + }, + { + "epoch": 0.09930486593843098, + "grad_norm": 0.10369551926851273, + "learning_rate": 9.926470588235293e-06, + "loss": 0.0277, + "step": 675 + }, + { + "epoch": 0.10298282393615064, + "grad_norm": 0.032908402383327484, + "learning_rate": 1.0294117647058824e-05, + "loss": 0.0213, + "step": 700 + }, + { + "epoch": 0.10666078193387031, + "grad_norm": 0.017092719674110413, + "learning_rate": 1.0661764705882354e-05, + "loss": 0.0784, + "step": 725 + }, + { + "epoch": 0.11033873993158998, + "grad_norm": 0.015081583522260189, + "learning_rate": 1.1029411764705883e-05, + "loss": 0.0265, + "step": 750 + }, + { + "epoch": 0.11401669792930964, + "grad_norm": 0.09147176891565323, + "learning_rate": 1.139705882352941e-05, + "loss": 0.0381, + "step": 775 + }, + { + "epoch": 0.11769465592702931, + "grad_norm": 0.08481771498918533, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.128, + "step": 800 + }, + { + "epoch": 0.12137261392474898, + "grad_norm": 0.014765892177820206, + "learning_rate": 1.2132352941176471e-05, + "loss": 0.0141, + "step": 825 + }, + { + "epoch": 0.12505057192246866, + "grad_norm": 0.018918083980679512, + "learning_rate": 1.25e-05, + "loss": 0.0538, + "step": 850 + }, + { + "epoch": 0.12872852992018832, + "grad_norm": 0.015013976022601128, + "learning_rate": 1.2867647058823528e-05, + "loss": 0.0019, + "step": 875 + }, + { + "epoch": 0.132406487917908, + "grad_norm": 11.109874725341797, + "learning_rate": 1.323529411764706e-05, + "loss": 0.0394, + "step": 900 + }, + { + "epoch": 0.13608444591562766, + "grad_norm": 0.015857884660363197, + "learning_rate": 1.3602941176470589e-05, + "loss": 0.0005, + "step": 925 + }, + { + "epoch": 0.13976240391334732, + "grad_norm": 0.015862109139561653, + "learning_rate": 1.3970588235294118e-05, + "loss": 0.0214, + "step": 950 + }, + { + "epoch": 0.143440361911067, + "grad_norm": 0.020424969494342804, + "learning_rate": 1.4338235294117647e-05, + "loss": 0.0367, + "step": 975 + }, + { + "epoch": 0.14711831990878665, + "grad_norm": 0.027131319046020508, + "learning_rate": 1.4705882352941177e-05, + "loss": 0.08, + "step": 1000 + }, + { + "epoch": 0.15079627790650632, + "grad_norm": 0.03147580847144127, + "learning_rate": 1.5073529411764706e-05, + "loss": 0.0329, + "step": 1025 + }, + { + "epoch": 0.154474235904226, + "grad_norm": 24.15705680847168, + "learning_rate": 1.5441176470588234e-05, + "loss": 0.0064, + "step": 1050 + }, + { + "epoch": 0.15815219390194565, + "grad_norm": 0.011522412300109863, + "learning_rate": 1.5808823529411767e-05, + "loss": 0.0762, + "step": 1075 + }, + { + "epoch": 0.1618301518996653, + "grad_norm": 0.04401927441358566, + "learning_rate": 1.6176470588235293e-05, + "loss": 
0.0656, + "step": 1100 + }, + { + "epoch": 0.16550810989738496, + "grad_norm": 0.034190475940704346, + "learning_rate": 1.6544117647058825e-05, + "loss": 0.0308, + "step": 1125 + }, + { + "epoch": 0.16918606789510462, + "grad_norm": 0.021350180730223656, + "learning_rate": 1.6911764705882355e-05, + "loss": 0.0539, + "step": 1150 + }, + { + "epoch": 0.1728640258928243, + "grad_norm": 0.0446242094039917, + "learning_rate": 1.727941176470588e-05, + "loss": 0.0681, + "step": 1175 + }, + { + "epoch": 0.17654198389054396, + "grad_norm": 1.6311242580413818, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.0293, + "step": 1200 + }, + { + "epoch": 0.18021994188826362, + "grad_norm": 0.00914335809648037, + "learning_rate": 1.801470588235294e-05, + "loss": 0.0386, + "step": 1225 + }, + { + "epoch": 0.1838978998859833, + "grad_norm": 0.009417989291250706, + "learning_rate": 1.8382352941176472e-05, + "loss": 0.0004, + "step": 1250 + }, + { + "epoch": 0.18757585788370296, + "grad_norm": 0.9801831245422363, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.057, + "step": 1275 + }, + { + "epoch": 0.19125381588142262, + "grad_norm": 0.005170044023543596, + "learning_rate": 1.9117647058823528e-05, + "loss": 0.0002, + "step": 1300 + }, + { + "epoch": 0.1949317738791423, + "grad_norm": 0.03409629687666893, + "learning_rate": 1.948529411764706e-05, + "loss": 0.0394, + "step": 1325 + }, + { + "epoch": 0.19860973187686196, + "grad_norm": 0.008645136840641499, + "learning_rate": 1.9852941176470586e-05, + "loss": 0.0053, + "step": 1350 + }, + { + "epoch": 0.20228768987458162, + "grad_norm": 0.004454991314560175, + "learning_rate": 2.022058823529412e-05, + "loss": 0.0002, + "step": 1375 + }, + { + "epoch": 0.2059656478723013, + "grad_norm": 0.00453265942633152, + "learning_rate": 2.058823529411765e-05, + "loss": 0.0216, + "step": 1400 + }, + { + "epoch": 0.20964360587002095, + "grad_norm": 0.005607594270259142, + "learning_rate": 2.0955882352941175e-05, + "loss": 0.0206, + "step": 1425 + }, + { + "epoch": 0.21332156386774062, + "grad_norm": 0.002988673048093915, + "learning_rate": 2.1323529411764707e-05, + "loss": 0.0002, + "step": 1450 + }, + { + "epoch": 0.2169995218654603, + "grad_norm": 0.15066058933734894, + "learning_rate": 2.1691176470588237e-05, + "loss": 0.0002, + "step": 1475 + }, + { + "epoch": 0.22067747986317995, + "grad_norm": 0.2750360071659088, + "learning_rate": 2.2058823529411766e-05, + "loss": 0.0002, + "step": 1500 + }, + { + "epoch": 0.22435543786089962, + "grad_norm": 0.00299286050722003, + "learning_rate": 2.2426470588235296e-05, + "loss": 0.0004, + "step": 1525 + }, + { + "epoch": 0.2280333958586193, + "grad_norm": 0.004124614410102367, + "learning_rate": 2.279411764705882e-05, + "loss": 0.0001, + "step": 1550 + }, + { + "epoch": 0.23171135385633895, + "grad_norm": 0.5924888849258423, + "learning_rate": 2.3161764705882354e-05, + "loss": 0.1041, + "step": 1575 + }, + { + "epoch": 0.23538931185405862, + "grad_norm": 0.011225424706935883, + "learning_rate": 2.3529411764705884e-05, + "loss": 0.0528, + "step": 1600 + }, + { + "epoch": 0.23906726985177829, + "grad_norm": 0.008940880186855793, + "learning_rate": 2.389705882352941e-05, + "loss": 0.0972, + "step": 1625 + }, + { + "epoch": 0.24274522784949795, + "grad_norm": 0.008405734784901142, + "learning_rate": 2.4264705882352942e-05, + "loss": 0.0495, + "step": 1650 + }, + { + "epoch": 0.24642318584721762, + "grad_norm": 0.008656616322696209, + "learning_rate": 2.4632352941176472e-05, + "loss": 0.0216, + "step": 1675 + }, + { + 
"epoch": 0.2501011438449373, + "grad_norm": 0.02361353114247322, + "learning_rate": 2.5e-05, + "loss": 0.0385, + "step": 1700 + }, + { + "epoch": 0.25377910184265695, + "grad_norm": 0.05051364749670029, + "learning_rate": 2.536764705882353e-05, + "loss": 0.0814, + "step": 1725 + }, + { + "epoch": 0.25745705984037665, + "grad_norm": 0.00914950855076313, + "learning_rate": 2.5735294117647057e-05, + "loss": 0.0006, + "step": 1750 + }, + { + "epoch": 0.2611350178380963, + "grad_norm": 0.008635000325739384, + "learning_rate": 2.610294117647059e-05, + "loss": 0.0003, + "step": 1775 + }, + { + "epoch": 0.264812975835816, + "grad_norm": 0.005725966300815344, + "learning_rate": 2.647058823529412e-05, + "loss": 0.0002, + "step": 1800 + }, + { + "epoch": 0.2684909338335356, + "grad_norm": 0.014133188873529434, + "learning_rate": 2.6838235294117648e-05, + "loss": 0.0562, + "step": 1825 + }, + { + "epoch": 0.2721688918312553, + "grad_norm": 0.0024135392159223557, + "learning_rate": 2.7205882352941177e-05, + "loss": 0.003, + "step": 1850 + }, + { + "epoch": 0.27584684982897495, + "grad_norm": 0.03300013393163681, + "learning_rate": 2.7573529411764707e-05, + "loss": 0.0947, + "step": 1875 + }, + { + "epoch": 0.27952480782669464, + "grad_norm": 0.046893417835235596, + "learning_rate": 2.7941176470588236e-05, + "loss": 0.0613, + "step": 1900 + }, + { + "epoch": 0.2832027658244143, + "grad_norm": 0.009986027143895626, + "learning_rate": 2.8308823529411766e-05, + "loss": 0.0616, + "step": 1925 + }, + { + "epoch": 0.286880723822134, + "grad_norm": 0.05905308201909065, + "learning_rate": 2.8676470588235295e-05, + "loss": 0.1182, + "step": 1950 + }, + { + "epoch": 0.2905586818198536, + "grad_norm": 0.00858025811612606, + "learning_rate": 2.9044117647058824e-05, + "loss": 0.0007, + "step": 1975 + }, + { + "epoch": 0.2942366398175733, + "grad_norm": 95.36400604248047, + "learning_rate": 2.9411764705882354e-05, + "loss": 0.0869, + "step": 2000 + }, + { + "epoch": 0.29791459781529295, + "grad_norm": 0.005693793762475252, + "learning_rate": 2.9779411764705883e-05, + "loss": 0.0348, + "step": 2025 + }, + { + "epoch": 0.30159255581301264, + "grad_norm": 0.009921176359057426, + "learning_rate": 2.998365211705084e-05, + "loss": 0.0514, + "step": 2050 + }, + { + "epoch": 0.3052705138107323, + "grad_norm": 0.01893715187907219, + "learning_rate": 2.9942782409677946e-05, + "loss": 0.0575, + "step": 2075 + }, + { + "epoch": 0.308948471808452, + "grad_norm": 0.012356853112578392, + "learning_rate": 2.990191270230505e-05, + "loss": 0.0085, + "step": 2100 + }, + { + "epoch": 0.3126264298061716, + "grad_norm": 0.027572082355618477, + "learning_rate": 2.9861042994932156e-05, + "loss": 0.0071, + "step": 2125 + }, + { + "epoch": 0.3163043878038913, + "grad_norm": 0.0033696063328534365, + "learning_rate": 2.9820173287559262e-05, + "loss": 0.0153, + "step": 2150 + }, + { + "epoch": 0.31998234580161095, + "grad_norm": 0.0032816240563988686, + "learning_rate": 2.9779303580186367e-05, + "loss": 0.0425, + "step": 2175 + }, + { + "epoch": 0.3236603037993306, + "grad_norm": 0.013268062844872475, + "learning_rate": 2.9738433872813472e-05, + "loss": 0.0529, + "step": 2200 + }, + { + "epoch": 0.3273382617970503, + "grad_norm": 13.58910083770752, + "learning_rate": 2.9697564165440574e-05, + "loss": 0.1147, + "step": 2225 + }, + { + "epoch": 0.3310162197947699, + "grad_norm": 80.59501647949219, + "learning_rate": 2.965669445806768e-05, + "loss": 0.0043, + "step": 2250 + }, + { + "epoch": 0.3346941777924896, + "grad_norm": 81.63990020751953, + 
"learning_rate": 2.9615824750694785e-05, + "loss": 0.0355, + "step": 2275 + }, + { + "epoch": 0.33837213579020925, + "grad_norm": 0.014708608388900757, + "learning_rate": 2.957495504332189e-05, + "loss": 0.0668, + "step": 2300 + }, + { + "epoch": 0.34205009378792894, + "grad_norm": 0.015443817712366581, + "learning_rate": 2.9534085335948996e-05, + "loss": 0.0006, + "step": 2325 + }, + { + "epoch": 0.3457280517856486, + "grad_norm": 0.005841911304742098, + "learning_rate": 2.94932156285761e-05, + "loss": 0.0487, + "step": 2350 + }, + { + "epoch": 0.3494060097833683, + "grad_norm": 0.06746743619441986, + "learning_rate": 2.9452345921203207e-05, + "loss": 0.0229, + "step": 2375 + }, + { + "epoch": 0.3530839677810879, + "grad_norm": 0.002315863035619259, + "learning_rate": 2.941147621383031e-05, + "loss": 0.0003, + "step": 2400 + }, + { + "epoch": 0.3567619257788076, + "grad_norm": 0.0017154604429379106, + "learning_rate": 2.9370606506457414e-05, + "loss": 0.0308, + "step": 2425 + }, + { + "epoch": 0.36043988377652725, + "grad_norm": 0.009691163897514343, + "learning_rate": 2.932973679908452e-05, + "loss": 0.0256, + "step": 2450 + }, + { + "epoch": 0.36411784177424694, + "grad_norm": 0.011957678943872452, + "learning_rate": 2.9288867091711625e-05, + "loss": 0.0713, + "step": 2475 + }, + { + "epoch": 0.3677957997719666, + "grad_norm": 0.02373524010181427, + "learning_rate": 2.924799738433873e-05, + "loss": 0.0504, + "step": 2500 + }, + { + "epoch": 0.3714737577696863, + "grad_norm": 0.0031693673226982355, + "learning_rate": 2.9207127676965836e-05, + "loss": 0.0251, + "step": 2525 + }, + { + "epoch": 0.3751517157674059, + "grad_norm": 0.025995498523116112, + "learning_rate": 2.916625796959294e-05, + "loss": 0.0459, + "step": 2550 + }, + { + "epoch": 0.3788296737651256, + "grad_norm": 0.006027763709425926, + "learning_rate": 2.9125388262220043e-05, + "loss": 0.0013, + "step": 2575 + }, + { + "epoch": 0.38250763176284525, + "grad_norm": 0.014026220887899399, + "learning_rate": 2.9084518554847148e-05, + "loss": 0.0831, + "step": 2600 + }, + { + "epoch": 0.38618558976056494, + "grad_norm": 0.025293108075857162, + "learning_rate": 2.9043648847474254e-05, + "loss": 0.0275, + "step": 2625 + }, + { + "epoch": 0.3898635477582846, + "grad_norm": 0.0041050901636481285, + "learning_rate": 2.900277914010136e-05, + "loss": 0.0007, + "step": 2650 + }, + { + "epoch": 0.3935415057560043, + "grad_norm": 0.027650628238916397, + "learning_rate": 2.8961909432728464e-05, + "loss": 0.0002, + "step": 2675 + }, + { + "epoch": 0.3972194637537239, + "grad_norm": 0.06839253753423691, + "learning_rate": 2.8921039725355566e-05, + "loss": 0.0353, + "step": 2700 + }, + { + "epoch": 0.4008974217514436, + "grad_norm": 0.005996390245854855, + "learning_rate": 2.8880170017982672e-05, + "loss": 0.0134, + "step": 2725 + }, + { + "epoch": 0.40457537974916324, + "grad_norm": 0.0020017025526612997, + "learning_rate": 2.8839300310609774e-05, + "loss": 0.0002, + "step": 2750 + }, + { + "epoch": 0.40825333774688294, + "grad_norm": 0.04533281922340393, + "learning_rate": 2.879843060323688e-05, + "loss": 0.0367, + "step": 2775 + }, + { + "epoch": 0.4119312957446026, + "grad_norm": 0.005575124174356461, + "learning_rate": 2.8757560895863984e-05, + "loss": 0.064, + "step": 2800 + }, + { + "epoch": 0.41560925374232227, + "grad_norm": 0.0019691460765898228, + "learning_rate": 2.871669118849109e-05, + "loss": 0.0005, + "step": 2825 + }, + { + "epoch": 0.4192872117400419, + "grad_norm": 0.004097863100469112, + "learning_rate": 
2.8675821481118195e-05, + "loss": 0.0557, + "step": 2850 + }, + { + "epoch": 0.4229651697377616, + "grad_norm": 0.0018194678705185652, + "learning_rate": 2.86349517737453e-05, + "loss": 0.0009, + "step": 2875 + }, + { + "epoch": 0.42664312773548124, + "grad_norm": 0.006679282058030367, + "learning_rate": 2.8594082066372406e-05, + "loss": 0.0882, + "step": 2900 + }, + { + "epoch": 0.43032108573320094, + "grad_norm": 0.0030163535848259926, + "learning_rate": 2.8553212358999508e-05, + "loss": 0.0005, + "step": 2925 + }, + { + "epoch": 0.4339990437309206, + "grad_norm": 0.2035808265209198, + "learning_rate": 2.8512342651626613e-05, + "loss": 0.0007, + "step": 2950 + }, + { + "epoch": 0.43767700172864027, + "grad_norm": 0.022791976109147072, + "learning_rate": 2.847147294425372e-05, + "loss": 0.0941, + "step": 2975 + }, + { + "epoch": 0.4413549597263599, + "grad_norm": 0.007299873046576977, + "learning_rate": 2.8430603236880824e-05, + "loss": 0.0511, + "step": 3000 + }, + { + "epoch": 0.4450329177240796, + "grad_norm": 0.003951882012188435, + "learning_rate": 2.838973352950793e-05, + "loss": 0.0096, + "step": 3025 + }, + { + "epoch": 0.44871087572179924, + "grad_norm": 0.009184204041957855, + "learning_rate": 2.8348863822135035e-05, + "loss": 0.0372, + "step": 3050 + }, + { + "epoch": 0.45238883371951893, + "grad_norm": 0.032343972474336624, + "learning_rate": 2.830799411476214e-05, + "loss": 0.0197, + "step": 3075 + }, + { + "epoch": 0.4560667917172386, + "grad_norm": 0.002111822599545121, + "learning_rate": 2.8267124407389246e-05, + "loss": 0.0004, + "step": 3100 + }, + { + "epoch": 0.45974474971495827, + "grad_norm": 0.0024695848114788532, + "learning_rate": 2.8226254700016348e-05, + "loss": 0.0757, + "step": 3125 + }, + { + "epoch": 0.4634227077126779, + "grad_norm": 0.001880451338365674, + "learning_rate": 2.8185384992643453e-05, + "loss": 0.0181, + "step": 3150 + }, + { + "epoch": 0.4671006657103976, + "grad_norm": 0.02519827149808407, + "learning_rate": 2.814451528527056e-05, + "loss": 0.0006, + "step": 3175 + }, + { + "epoch": 0.47077862370811724, + "grad_norm": 0.0016058700857684016, + "learning_rate": 2.8103645577897664e-05, + "loss": 0.0088, + "step": 3200 + }, + { + "epoch": 0.47445658170583693, + "grad_norm": 0.02888200432062149, + "learning_rate": 2.806277587052477e-05, + "loss": 0.0496, + "step": 3225 + }, + { + "epoch": 0.47813453970355657, + "grad_norm": 0.0747719258069992, + "learning_rate": 2.8021906163151874e-05, + "loss": 0.1338, + "step": 3250 + }, + { + "epoch": 0.48181249770127627, + "grad_norm": 4.114503860473633, + "learning_rate": 2.798103645577898e-05, + "loss": 0.063, + "step": 3275 + }, + { + "epoch": 0.4854904556989959, + "grad_norm": 0.10994021594524384, + "learning_rate": 2.7940166748406082e-05, + "loss": 0.0525, + "step": 3300 + }, + { + "epoch": 0.4891684136967156, + "grad_norm": 0.016218269243836403, + "learning_rate": 2.7899297041033187e-05, + "loss": 0.0547, + "step": 3325 + }, + { + "epoch": 0.49284637169443524, + "grad_norm": 0.08570988476276398, + "learning_rate": 2.7858427333660293e-05, + "loss": 0.0551, + "step": 3350 + }, + { + "epoch": 0.49652432969215493, + "grad_norm": 0.013475511223077774, + "learning_rate": 2.7817557626287398e-05, + "loss": 0.0445, + "step": 3375 + }, + { + "epoch": 0.5002022876898746, + "grad_norm": 0.006590835750102997, + "learning_rate": 2.7776687918914503e-05, + "loss": 0.0005, + "step": 3400 + }, + { + "epoch": 0.5038802456875943, + "grad_norm": 0.004501729272305965, + "learning_rate": 2.773581821154161e-05, + "loss": 
0.0219, + "step": 3425 + }, + { + "epoch": 0.5075582036853139, + "grad_norm": 0.06999096274375916, + "learning_rate": 2.769494850416871e-05, + "loss": 0.0569, + "step": 3450 + }, + { + "epoch": 0.5112361616830335, + "grad_norm": 0.003883685451000929, + "learning_rate": 2.7654078796795813e-05, + "loss": 0.001, + "step": 3475 + }, + { + "epoch": 0.5149141196807533, + "grad_norm": 0.0029312793631106615, + "learning_rate": 2.7613209089422918e-05, + "loss": 0.0229, + "step": 3500 + }, + { + "epoch": 0.5185920776784729, + "grad_norm": 0.006315870210528374, + "learning_rate": 2.7572339382050023e-05, + "loss": 0.0731, + "step": 3525 + }, + { + "epoch": 0.5222700356761926, + "grad_norm": 0.0030722382944077253, + "learning_rate": 2.753146967467713e-05, + "loss": 0.0392, + "step": 3550 + }, + { + "epoch": 0.5259479936739122, + "grad_norm": 0.005796592216938734, + "learning_rate": 2.7490599967304234e-05, + "loss": 0.0139, + "step": 3575 + }, + { + "epoch": 0.529625951671632, + "grad_norm": 0.6967291831970215, + "learning_rate": 2.744973025993134e-05, + "loss": 0.0325, + "step": 3600 + }, + { + "epoch": 0.5333039096693516, + "grad_norm": 0.017705194652080536, + "learning_rate": 2.7408860552558445e-05, + "loss": 0.0433, + "step": 3625 + }, + { + "epoch": 0.5369818676670712, + "grad_norm": 0.020230021327733994, + "learning_rate": 2.7367990845185547e-05, + "loss": 0.0007, + "step": 3650 + }, + { + "epoch": 0.5406598256647909, + "grad_norm": 0.0023030710872262716, + "learning_rate": 2.7327121137812652e-05, + "loss": 0.0002, + "step": 3675 + }, + { + "epoch": 0.5443377836625106, + "grad_norm": 0.0020703673362731934, + "learning_rate": 2.7286251430439758e-05, + "loss": 0.0002, + "step": 3700 + }, + { + "epoch": 0.5480157416602303, + "grad_norm": 0.002691243775188923, + "learning_rate": 2.7245381723066863e-05, + "loss": 0.0005, + "step": 3725 + }, + { + "epoch": 0.5516936996579499, + "grad_norm": 0.002691768342629075, + "learning_rate": 2.720451201569397e-05, + "loss": 0.0957, + "step": 3750 + }, + { + "epoch": 0.5553716576556695, + "grad_norm": 0.05962933972477913, + "learning_rate": 2.7163642308321074e-05, + "loss": 0.1317, + "step": 3775 + }, + { + "epoch": 0.5590496156533893, + "grad_norm": 0.06475093215703964, + "learning_rate": 2.712277260094818e-05, + "loss": 0.0257, + "step": 3800 + }, + { + "epoch": 0.5627275736511089, + "grad_norm": 0.0121241370216012, + "learning_rate": 2.708190289357528e-05, + "loss": 0.0609, + "step": 3825 + }, + { + "epoch": 0.5664055316488286, + "grad_norm": 0.007753327488899231, + "learning_rate": 2.7041033186202387e-05, + "loss": 0.0008, + "step": 3850 + }, + { + "epoch": 0.5700834896465482, + "grad_norm": 0.005270315799862146, + "learning_rate": 2.7000163478829492e-05, + "loss": 0.0002, + "step": 3875 + }, + { + "epoch": 0.573761447644268, + "grad_norm": 0.004358434583991766, + "learning_rate": 2.6959293771456597e-05, + "loss": 0.0174, + "step": 3900 + }, + { + "epoch": 0.5774394056419876, + "grad_norm": 0.003769191913306713, + "learning_rate": 2.6918424064083703e-05, + "loss": 0.0513, + "step": 3925 + }, + { + "epoch": 0.5811173636397072, + "grad_norm": 0.0043784258887171745, + "learning_rate": 2.6877554356710808e-05, + "loss": 0.0185, + "step": 3950 + }, + { + "epoch": 0.5847953216374269, + "grad_norm": 0.004602793138474226, + "learning_rate": 2.6836684649337913e-05, + "loss": 0.0236, + "step": 3975 + }, + { + "epoch": 0.5884732796351466, + "grad_norm": 0.002638947917148471, + "learning_rate": 2.679581494196502e-05, + "loss": 0.0155, + "step": 4000 + }, + { + "epoch": 
0.5921512376328663, + "grad_norm": 0.002830574056133628, + "learning_rate": 2.675494523459212e-05, + "loss": 0.0161, + "step": 4025 + }, + { + "epoch": 0.5958291956305859, + "grad_norm": 0.015412558801472187, + "learning_rate": 2.6714075527219226e-05, + "loss": 0.131, + "step": 4050 + }, + { + "epoch": 0.5995071536283055, + "grad_norm": 0.016349300742149353, + "learning_rate": 2.667320581984633e-05, + "loss": 0.0132, + "step": 4075 + }, + { + "epoch": 0.6031851116260253, + "grad_norm": 0.013236075639724731, + "learning_rate": 2.6632336112473437e-05, + "loss": 0.0005, + "step": 4100 + }, + { + "epoch": 0.6068630696237449, + "grad_norm": 0.007088659331202507, + "learning_rate": 2.6591466405100542e-05, + "loss": 0.0004, + "step": 4125 + }, + { + "epoch": 0.6105410276214646, + "grad_norm": 0.02121301181614399, + "learning_rate": 2.6550596697727648e-05, + "loss": 0.0397, + "step": 4150 + }, + { + "epoch": 0.6142189856191842, + "grad_norm": 0.030070917680859566, + "learning_rate": 2.650972699035475e-05, + "loss": 0.0754, + "step": 4175 + }, + { + "epoch": 0.617896943616904, + "grad_norm": 10.173595428466797, + "learning_rate": 2.646885728298185e-05, + "loss": 0.0336, + "step": 4200 + }, + { + "epoch": 0.6215749016146236, + "grad_norm": 0.014447388239204884, + "learning_rate": 2.6427987575608957e-05, + "loss": 0.003, + "step": 4225 + }, + { + "epoch": 0.6252528596123432, + "grad_norm": 0.012096612714231014, + "learning_rate": 2.6387117868236062e-05, + "loss": 0.0481, + "step": 4250 + }, + { + "epoch": 0.6289308176100629, + "grad_norm": 0.02047719806432724, + "learning_rate": 2.6346248160863168e-05, + "loss": 0.051, + "step": 4275 + }, + { + "epoch": 0.6326087756077826, + "grad_norm": 0.01152089238166809, + "learning_rate": 2.6305378453490273e-05, + "loss": 0.011, + "step": 4300 + }, + { + "epoch": 0.6362867336055023, + "grad_norm": 0.01178329810500145, + "learning_rate": 2.626450874611738e-05, + "loss": 0.0187, + "step": 4325 + }, + { + "epoch": 0.6399646916032219, + "grad_norm": 0.012962247245013714, + "learning_rate": 2.6223639038744484e-05, + "loss": 0.041, + "step": 4350 + }, + { + "epoch": 0.6436426496009415, + "grad_norm": 0.012993029318749905, + "learning_rate": 2.6182769331371586e-05, + "loss": 0.03, + "step": 4375 + }, + { + "epoch": 0.6473206075986612, + "grad_norm": 0.01311455201357603, + "learning_rate": 2.614189962399869e-05, + "loss": 0.0421, + "step": 4400 + }, + { + "epoch": 0.6509985655963809, + "grad_norm": 0.022407829761505127, + "learning_rate": 2.6101029916625797e-05, + "loss": 0.0312, + "step": 4425 + }, + { + "epoch": 0.6546765235941006, + "grad_norm": 0.007614122703671455, + "learning_rate": 2.6060160209252902e-05, + "loss": 0.0014, + "step": 4450 + }, + { + "epoch": 0.6583544815918202, + "grad_norm": 0.006891134660691023, + "learning_rate": 2.6019290501880007e-05, + "loss": 0.0966, + "step": 4475 + }, + { + "epoch": 0.6620324395895398, + "grad_norm": 0.026897389441728592, + "learning_rate": 2.5978420794507113e-05, + "loss": 0.0387, + "step": 4500 + }, + { + "epoch": 0.6657103975872596, + "grad_norm": 0.013364088721573353, + "learning_rate": 2.5937551087134218e-05, + "loss": 0.0007, + "step": 4525 + }, + { + "epoch": 0.6693883555849792, + "grad_norm": 0.006984102539718151, + "learning_rate": 2.589668137976132e-05, + "loss": 0.0008, + "step": 4550 + }, + { + "epoch": 0.6730663135826989, + "grad_norm": 0.005882107652723789, + "learning_rate": 2.5855811672388425e-05, + "loss": 0.0003, + "step": 4575 + }, + { + "epoch": 0.6767442715804185, + "grad_norm": 0.008882598020136356, 
+ "learning_rate": 2.581494196501553e-05, + "loss": 0.0389, + "step": 4600 + }, + { + "epoch": 0.6804222295781382, + "grad_norm": 0.01086785364896059, + "learning_rate": 2.5774072257642636e-05, + "loss": 0.0305, + "step": 4625 + }, + { + "epoch": 0.6841001875758579, + "grad_norm": 0.005837304517626762, + "learning_rate": 2.573320255026974e-05, + "loss": 0.0277, + "step": 4650 + }, + { + "epoch": 0.6877781455735775, + "grad_norm": 0.006613869220018387, + "learning_rate": 2.5692332842896847e-05, + "loss": 0.0003, + "step": 4675 + }, + { + "epoch": 0.6914561035712972, + "grad_norm": 0.012274155393242836, + "learning_rate": 2.5651463135523952e-05, + "loss": 0.0383, + "step": 4700 + }, + { + "epoch": 0.6951340615690169, + "grad_norm": 0.0031378071289509535, + "learning_rate": 2.5610593428151054e-05, + "loss": 0.0065, + "step": 4725 + }, + { + "epoch": 0.6988120195667366, + "grad_norm": 0.12304351478815079, + "learning_rate": 2.556972372077816e-05, + "loss": 0.0103, + "step": 4750 + }, + { + "epoch": 0.7024899775644562, + "grad_norm": 0.005349988583475351, + "learning_rate": 2.5528854013405265e-05, + "loss": 0.0292, + "step": 4775 + }, + { + "epoch": 0.7061679355621758, + "grad_norm": 0.0023686892818659544, + "learning_rate": 2.548798430603237e-05, + "loss": 0.0169, + "step": 4800 + }, + { + "epoch": 0.7098458935598956, + "grad_norm": 0.0018137163715437055, + "learning_rate": 2.5447114598659476e-05, + "loss": 0.0444, + "step": 4825 + }, + { + "epoch": 0.7135238515576152, + "grad_norm": 0.0029049592558294535, + "learning_rate": 2.540624489128658e-05, + "loss": 0.0591, + "step": 4850 + }, + { + "epoch": 0.7172018095553349, + "grad_norm": 0.0024209930561482906, + "learning_rate": 2.5365375183913687e-05, + "loss": 0.0311, + "step": 4875 + }, + { + "epoch": 0.7208797675530545, + "grad_norm": 62.02742385864258, + "learning_rate": 2.532450547654079e-05, + "loss": 0.0324, + "step": 4900 + }, + { + "epoch": 0.7245577255507742, + "grad_norm": 0.002258418360725045, + "learning_rate": 2.528363576916789e-05, + "loss": 0.0003, + "step": 4925 + }, + { + "epoch": 0.7282356835484939, + "grad_norm": 0.001804179628379643, + "learning_rate": 2.5242766061794996e-05, + "loss": 0.0499, + "step": 4950 + }, + { + "epoch": 0.7319136415462135, + "grad_norm": 0.33615773916244507, + "learning_rate": 2.52018963544221e-05, + "loss": 0.0005, + "step": 4975 + }, + { + "epoch": 0.7355915995439332, + "grad_norm": 0.0010956133482977748, + "learning_rate": 2.5161026647049207e-05, + "loss": 0.0008, + "step": 5000 + }, + { + "epoch": 0.7392695575416529, + "grad_norm": 0.0012902173912152648, + "learning_rate": 2.5120156939676312e-05, + "loss": 0.0273, + "step": 5025 + }, + { + "epoch": 0.7429475155393725, + "grad_norm": 0.013881388120353222, + "learning_rate": 2.5079287232303417e-05, + "loss": 0.0488, + "step": 5050 + }, + { + "epoch": 0.7466254735370922, + "grad_norm": 0.011136908084154129, + "learning_rate": 2.5038417524930523e-05, + "loss": 0.0003, + "step": 5075 + }, + { + "epoch": 0.7503034315348118, + "grad_norm": 0.020626788958907127, + "learning_rate": 2.4997547817557625e-05, + "loss": 0.0621, + "step": 5100 + }, + { + "epoch": 0.7539813895325316, + "grad_norm": 0.039804015308618546, + "learning_rate": 2.495667811018473e-05, + "loss": 0.0941, + "step": 5125 + }, + { + "epoch": 0.7576593475302512, + "grad_norm": 0.019914086908102036, + "learning_rate": 2.4915808402811835e-05, + "loss": 0.0021, + "step": 5150 + }, + { + "epoch": 0.7613373055279709, + "grad_norm": 0.027103891596198082, + "learning_rate": 2.487493869543894e-05, + 
"loss": 0.0375, + "step": 5175 + }, + { + "epoch": 0.7650152635256905, + "grad_norm": 0.008572924882173538, + "learning_rate": 2.4834068988066046e-05, + "loss": 0.0007, + "step": 5200 + }, + { + "epoch": 0.7686932215234102, + "grad_norm": 0.011288322508335114, + "learning_rate": 2.479319928069315e-05, + "loss": 0.0339, + "step": 5225 + }, + { + "epoch": 0.7723711795211299, + "grad_norm": 11.5412015914917, + "learning_rate": 2.4752329573320257e-05, + "loss": 0.0414, + "step": 5250 + }, + { + "epoch": 0.7760491375188495, + "grad_norm": 0.016787946224212646, + "learning_rate": 2.471145986594736e-05, + "loss": 0.0817, + "step": 5275 + }, + { + "epoch": 0.7797270955165692, + "grad_norm": 0.3828181326389313, + "learning_rate": 2.4670590158574464e-05, + "loss": 0.0013, + "step": 5300 + }, + { + "epoch": 0.7834050535142889, + "grad_norm": 0.00423394562676549, + "learning_rate": 2.462972045120157e-05, + "loss": 0.0009, + "step": 5325 + }, + { + "epoch": 0.7870830115120085, + "grad_norm": 0.0038542733527719975, + "learning_rate": 2.4588850743828675e-05, + "loss": 0.0006, + "step": 5350 + }, + { + "epoch": 0.7907609695097282, + "grad_norm": 0.002444320358335972, + "learning_rate": 2.454798103645578e-05, + "loss": 0.0208, + "step": 5375 + }, + { + "epoch": 0.7944389275074478, + "grad_norm": 0.06983044743537903, + "learning_rate": 2.4507111329082886e-05, + "loss": 0.0611, + "step": 5400 + }, + { + "epoch": 0.7981168855051676, + "grad_norm": 0.0033168047666549683, + "learning_rate": 2.446624162170999e-05, + "loss": 0.0016, + "step": 5425 + }, + { + "epoch": 0.8017948435028872, + "grad_norm": 0.0031268312595784664, + "learning_rate": 2.4425371914337093e-05, + "loss": 0.0119, + "step": 5450 + }, + { + "epoch": 0.8054728015006068, + "grad_norm": 0.0019544719252735376, + "learning_rate": 2.43845022069642e-05, + "loss": 0.0337, + "step": 5475 + }, + { + "epoch": 0.8091507594983265, + "grad_norm": 0.017085539177060127, + "learning_rate": 2.4343632499591304e-05, + "loss": 0.0776, + "step": 5500 + }, + { + "epoch": 0.8128287174960462, + "grad_norm": 0.916741669178009, + "learning_rate": 2.430276279221841e-05, + "loss": 0.0009, + "step": 5525 + }, + { + "epoch": 0.8165066754937659, + "grad_norm": 0.0018967619398608804, + "learning_rate": 2.4261893084845515e-05, + "loss": 0.0005, + "step": 5550 + }, + { + "epoch": 0.8201846334914855, + "grad_norm": 0.002946459921076894, + "learning_rate": 2.422102337747262e-05, + "loss": 0.052, + "step": 5575 + }, + { + "epoch": 0.8238625914892052, + "grad_norm": 0.00457314308732748, + "learning_rate": 2.4180153670099725e-05, + "loss": 0.0386, + "step": 5600 + }, + { + "epoch": 0.8275405494869249, + "grad_norm": 0.00886601209640503, + "learning_rate": 2.4139283962726827e-05, + "loss": 0.0003, + "step": 5625 + }, + { + "epoch": 0.8312185074846445, + "grad_norm": 0.004110053181648254, + "learning_rate": 2.409841425535393e-05, + "loss": 0.0002, + "step": 5650 + }, + { + "epoch": 0.8348964654823642, + "grad_norm": 0.002550973556935787, + "learning_rate": 2.4057544547981035e-05, + "loss": 0.0893, + "step": 5675 + }, + { + "epoch": 0.8385744234800838, + "grad_norm": 0.0047971270978450775, + "learning_rate": 2.401667484060814e-05, + "loss": 0.0649, + "step": 5700 + }, + { + "epoch": 0.8422523814778036, + "grad_norm": 22.79808235168457, + "learning_rate": 2.3975805133235246e-05, + "loss": 0.0509, + "step": 5725 + }, + { + "epoch": 0.8459303394755232, + "grad_norm": 0.02145661599934101, + "learning_rate": 2.393493542586235e-05, + "loss": 0.0456, + "step": 5750 + }, + { + "epoch": 
0.8496082974732428, + "grad_norm": 0.8593617081642151, + "learning_rate": 2.3894065718489456e-05, + "loss": 0.0727, + "step": 5775 + }, + { + "epoch": 0.8532862554709625, + "grad_norm": 0.004426372237503529, + "learning_rate": 2.385319601111656e-05, + "loss": 0.0004, + "step": 5800 + }, + { + "epoch": 0.8569642134686822, + "grad_norm": 0.0030661604832857847, + "learning_rate": 2.3812326303743664e-05, + "loss": 0.0001, + "step": 5825 + }, + { + "epoch": 0.8606421714664019, + "grad_norm": 0.0062530264258384705, + "learning_rate": 2.377145659637077e-05, + "loss": 0.044, + "step": 5850 + }, + { + "epoch": 0.8643201294641215, + "grad_norm": 0.023851774632930756, + "learning_rate": 2.3730586888997874e-05, + "loss": 0.0412, + "step": 5875 + }, + { + "epoch": 0.8679980874618412, + "grad_norm": 0.013338697142899036, + "learning_rate": 2.368971718162498e-05, + "loss": 0.0006, + "step": 5900 + }, + { + "epoch": 0.8716760454595609, + "grad_norm": 0.01904129609465599, + "learning_rate": 2.3648847474252085e-05, + "loss": 0.0726, + "step": 5925 + }, + { + "epoch": 0.8753540034572805, + "grad_norm": 0.010262302123010159, + "learning_rate": 2.360797776687919e-05, + "loss": 0.0087, + "step": 5950 + }, + { + "epoch": 0.8790319614550002, + "grad_norm": 0.006104280706495047, + "learning_rate": 2.3567108059506296e-05, + "loss": 0.0004, + "step": 5975 + }, + { + "epoch": 0.8827099194527198, + "grad_norm": 0.019870450720191002, + "learning_rate": 2.3526238352133398e-05, + "loss": 0.0795, + "step": 6000 + }, + { + "epoch": 0.8863878774504396, + "grad_norm": 0.021579677239060402, + "learning_rate": 2.3485368644760503e-05, + "loss": 0.0009, + "step": 6025 + }, + { + "epoch": 0.8900658354481592, + "grad_norm": 0.007828918285667896, + "learning_rate": 2.344449893738761e-05, + "loss": 0.0017, + "step": 6050 + }, + { + "epoch": 0.8937437934458788, + "grad_norm": 0.006341638043522835, + "learning_rate": 2.3403629230014714e-05, + "loss": 0.0198, + "step": 6075 + }, + { + "epoch": 0.8974217514435985, + "grad_norm": 0.004665954038500786, + "learning_rate": 2.336275952264182e-05, + "loss": 0.0002, + "step": 6100 + }, + { + "epoch": 0.9010997094413182, + "grad_norm": 0.0059740557335317135, + "learning_rate": 2.3321889815268925e-05, + "loss": 0.0398, + "step": 6125 + }, + { + "epoch": 0.9047776674390379, + "grad_norm": 0.09372496604919434, + "learning_rate": 2.328102010789603e-05, + "loss": 0.0596, + "step": 6150 + }, + { + "epoch": 0.9084556254367575, + "grad_norm": 0.06878636032342911, + "learning_rate": 2.3240150400523132e-05, + "loss": 0.0858, + "step": 6175 + }, + { + "epoch": 0.9121335834344771, + "grad_norm": 5.581681728363037, + "learning_rate": 2.3199280693150238e-05, + "loss": 0.0728, + "step": 6200 + }, + { + "epoch": 0.9158115414321969, + "grad_norm": 0.017690079286694527, + "learning_rate": 2.3158410985777343e-05, + "loss": 0.0109, + "step": 6225 + }, + { + "epoch": 0.9194894994299165, + "grad_norm": 0.009789933450520039, + "learning_rate": 2.3117541278404448e-05, + "loss": 0.003, + "step": 6250 + }, + { + "epoch": 0.9231674574276362, + "grad_norm": 0.007185524329543114, + "learning_rate": 2.3076671571031554e-05, + "loss": 0.0003, + "step": 6275 + }, + { + "epoch": 0.9268454154253558, + "grad_norm": 0.29879918694496155, + "learning_rate": 2.303580186365866e-05, + "loss": 0.0004, + "step": 6300 + }, + { + "epoch": 0.9305233734230756, + "grad_norm": 0.005276743322610855, + "learning_rate": 2.2994932156285764e-05, + "loss": 0.0209, + "step": 6325 + }, + { + "epoch": 0.9342013314207952, + "grad_norm": 
4.756071090698242, + "learning_rate": 2.2954062448912866e-05, + "loss": 0.0696, + "step": 6350 + }, + { + "epoch": 0.9378792894185148, + "grad_norm": 0.005177750252187252, + "learning_rate": 2.291319274153997e-05, + "loss": 0.0294, + "step": 6375 + }, + { + "epoch": 0.9415572474162345, + "grad_norm": 0.005691983737051487, + "learning_rate": 2.2872323034167074e-05, + "loss": 0.0102, + "step": 6400 + }, + { + "epoch": 0.9452352054139542, + "grad_norm": 0.012254934757947922, + "learning_rate": 2.283145332679418e-05, + "loss": 0.0204, + "step": 6425 + }, + { + "epoch": 0.9489131634116739, + "grad_norm": 0.007204866968095303, + "learning_rate": 2.2790583619421284e-05, + "loss": 0.001, + "step": 6450 + }, + { + "epoch": 0.9525911214093935, + "grad_norm": 0.0022422156762331724, + "learning_rate": 2.274971391204839e-05, + "loss": 0.0074, + "step": 6475 + }, + { + "epoch": 0.9562690794071131, + "grad_norm": 0.0029815786983817816, + "learning_rate": 2.2708844204675495e-05, + "loss": 0.0001, + "step": 6500 + }, + { + "epoch": 0.9599470374048328, + "grad_norm": 0.0027428902685642242, + "learning_rate": 2.26679744973026e-05, + "loss": 0.0534, + "step": 6525 + }, + { + "epoch": 0.9636249954025525, + "grad_norm": 0.0038738884031772614, + "learning_rate": 2.2627104789929703e-05, + "loss": 0.0167, + "step": 6550 + }, + { + "epoch": 0.9673029534002722, + "grad_norm": 0.002053373260423541, + "learning_rate": 2.2586235082556808e-05, + "loss": 0.0119, + "step": 6575 + }, + { + "epoch": 0.9709809113979918, + "grad_norm": 0.015416144393384457, + "learning_rate": 2.2545365375183913e-05, + "loss": 0.0436, + "step": 6600 + }, + { + "epoch": 0.9746588693957114, + "grad_norm": 0.028199590742588043, + "learning_rate": 2.250449566781102e-05, + "loss": 0.06, + "step": 6625 + }, + { + "epoch": 0.9783368273934312, + "grad_norm": 0.00808124803006649, + "learning_rate": 2.2463625960438124e-05, + "loss": 0.0082, + "step": 6650 + }, + { + "epoch": 0.9820147853911508, + "grad_norm": 0.896677553653717, + "learning_rate": 2.242275625306523e-05, + "loss": 0.0004, + "step": 6675 + }, + { + "epoch": 0.9856927433888705, + "grad_norm": 0.014748472720384598, + "learning_rate": 2.2381886545692335e-05, + "loss": 0.0554, + "step": 6700 + }, + { + "epoch": 0.9893707013865901, + "grad_norm": 0.08279622346162796, + "learning_rate": 2.2341016838319437e-05, + "loss": 0.0727, + "step": 6725 + }, + { + "epoch": 0.9930486593843099, + "grad_norm": 0.0343361496925354, + "learning_rate": 2.2300147130946542e-05, + "loss": 0.0653, + "step": 6750 + }, + { + "epoch": 0.9967266173820295, + "grad_norm": 0.01778659224510193, + "learning_rate": 2.2259277423573648e-05, + "loss": 0.0468, + "step": 6775 + }, + { + "epoch": 0.9999632204200228, + "eval_accuracy": 0.9960279514527399, + "eval_auc": 0.9999026317054973, + "eval_f1": 0.9960253201825409, + "eval_loss": 0.020395906642079353, + "eval_precision": 0.9967589864466706, + "eval_recall": 0.9952927331568108, + "eval_runtime": 2488.2544, + "eval_samples_per_second": 5.464, + "eval_steps_per_second": 1.366, + "step": 6797 + }, + { + "epoch": 1.0004045753797493, + "grad_norm": 0.038309529423713684, + "learning_rate": 2.2218407716200753e-05, + "loss": 0.0867, + "step": 6800 + }, + { + "epoch": 1.0040825333774688, + "grad_norm": 0.03099379874765873, + "learning_rate": 2.217753800882786e-05, + "loss": 0.0251, + "step": 6825 + }, + { + "epoch": 1.0077604913751885, + "grad_norm": 0.014889312908053398, + "learning_rate": 2.2136668301454964e-05, + "loss": 0.0007, + "step": 6850 + }, + { + "epoch": 1.011438449372908, + 
"grad_norm": 0.011484134942293167, + "learning_rate": 2.209579859408207e-05, + "loss": 0.0042, + "step": 6875 + }, + { + "epoch": 1.0151164073706278, + "grad_norm": 0.008166844956576824, + "learning_rate": 2.205492888670917e-05, + "loss": 0.0003, + "step": 6900 + }, + { + "epoch": 1.0187943653683476, + "grad_norm": 0.006568376440554857, + "learning_rate": 2.2014059179336276e-05, + "loss": 0.0003, + "step": 6925 + }, + { + "epoch": 1.022472323366067, + "grad_norm": 0.0057509117759764194, + "learning_rate": 2.1973189471963382e-05, + "loss": 0.0084, + "step": 6950 + }, + { + "epoch": 1.0261502813637868, + "grad_norm": 0.004868589341640472, + "learning_rate": 2.1932319764590487e-05, + "loss": 0.0043, + "step": 6975 + }, + { + "epoch": 1.0298282393615066, + "grad_norm": 0.004712184425443411, + "learning_rate": 2.1891450057217593e-05, + "loss": 0.0029, + "step": 7000 + }, + { + "epoch": 1.033506197359226, + "grad_norm": 0.0035947624128311872, + "learning_rate": 2.1850580349844698e-05, + "loss": 0.0051, + "step": 7025 + }, + { + "epoch": 1.0371841553569459, + "grad_norm": 0.0033714687451720238, + "learning_rate": 2.1809710642471803e-05, + "loss": 0.0377, + "step": 7050 + }, + { + "epoch": 1.0408621133546654, + "grad_norm": 12.332621574401855, + "learning_rate": 2.1768840935098905e-05, + "loss": 0.0061, + "step": 7075 + }, + { + "epoch": 1.0445400713523851, + "grad_norm": 0.002749204868450761, + "learning_rate": 2.172797122772601e-05, + "loss": 0.0003, + "step": 7100 + }, + { + "epoch": 1.0482180293501049, + "grad_norm": 0.0026924049016088247, + "learning_rate": 2.1687101520353113e-05, + "loss": 0.0001, + "step": 7125 + }, + { + "epoch": 1.0518959873478244, + "grad_norm": 0.006290792487561703, + "learning_rate": 2.1646231812980218e-05, + "loss": 0.0443, + "step": 7150 + }, + { + "epoch": 1.0555739453455442, + "grad_norm": 0.0048763868398964405, + "learning_rate": 2.1605362105607323e-05, + "loss": 0.0002, + "step": 7175 + }, + { + "epoch": 1.059251903343264, + "grad_norm": 0.003825924126431346, + "learning_rate": 2.156449239823443e-05, + "loss": 0.0002, + "step": 7200 + }, + { + "epoch": 1.0629298613409834, + "grad_norm": 0.0068919663317501545, + "learning_rate": 2.1523622690861534e-05, + "loss": 0.0001, + "step": 7225 + }, + { + "epoch": 1.0666078193387032, + "grad_norm": 0.0029492308385670185, + "learning_rate": 2.1482752983488636e-05, + "loss": 0.0001, + "step": 7250 + }, + { + "epoch": 1.0702857773364227, + "grad_norm": 0.0031761634163558483, + "learning_rate": 2.144188327611574e-05, + "loss": 0.0001, + "step": 7275 + }, + { + "epoch": 1.0739637353341425, + "grad_norm": 0.004821736365556717, + "learning_rate": 2.1401013568742847e-05, + "loss": 0.0373, + "step": 7300 + }, + { + "epoch": 1.0776416933318622, + "grad_norm": 0.003594837849959731, + "learning_rate": 2.1360143861369952e-05, + "loss": 0.0004, + "step": 7325 + }, + { + "epoch": 1.0813196513295817, + "grad_norm": 0.004811630584299564, + "learning_rate": 2.1319274153997058e-05, + "loss": 0.0001, + "step": 7350 + }, + { + "epoch": 1.0849976093273015, + "grad_norm": 0.006440363824367523, + "learning_rate": 2.1278404446624163e-05, + "loss": 0.0453, + "step": 7375 + }, + { + "epoch": 1.0886755673250212, + "grad_norm": 0.007900132797658443, + "learning_rate": 2.123753473925127e-05, + "loss": 0.0003, + "step": 7400 + }, + { + "epoch": 1.0923535253227408, + "grad_norm": 0.00898217223584652, + "learning_rate": 2.1196665031878374e-05, + "loss": 0.0811, + "step": 7425 + }, + { + "epoch": 1.0960314833204605, + "grad_norm": 0.031215157359838486, + 
"learning_rate": 2.1155795324505476e-05, + "loss": 0.035, + "step": 7450 + }, + { + "epoch": 1.09970944131818, + "grad_norm": 0.022409003227949142, + "learning_rate": 2.111492561713258e-05, + "loss": 0.0014, + "step": 7475 + }, + { + "epoch": 1.1033873993158998, + "grad_norm": 0.0137456264346838, + "learning_rate": 2.1074055909759686e-05, + "loss": 0.0006, + "step": 7500 + }, + { + "epoch": 1.1070653573136195, + "grad_norm": 0.006075088866055012, + "learning_rate": 2.1033186202386792e-05, + "loss": 0.0005, + "step": 7525 + }, + { + "epoch": 1.110743315311339, + "grad_norm": 0.007382239680737257, + "learning_rate": 2.0992316495013897e-05, + "loss": 0.0003, + "step": 7550 + }, + { + "epoch": 1.1144212733090588, + "grad_norm": 0.016082163900136948, + "learning_rate": 2.0951446787641003e-05, + "loss": 0.0469, + "step": 7575 + }, + { + "epoch": 1.1180992313067786, + "grad_norm": 0.02028113603591919, + "learning_rate": 2.0910577080268108e-05, + "loss": 0.0398, + "step": 7600 + }, + { + "epoch": 1.121777189304498, + "grad_norm": 0.014643259346485138, + "learning_rate": 2.086970737289521e-05, + "loss": 0.0007, + "step": 7625 + }, + { + "epoch": 1.1254551473022179, + "grad_norm": 0.010461482219398022, + "learning_rate": 2.0828837665522315e-05, + "loss": 0.0004, + "step": 7650 + }, + { + "epoch": 1.1291331052999374, + "grad_norm": 0.009396770037710667, + "learning_rate": 2.078796795814942e-05, + "loss": 0.0004, + "step": 7675 + }, + { + "epoch": 1.1328110632976571, + "grad_norm": 0.007909806445240974, + "learning_rate": 2.0747098250776526e-05, + "loss": 0.016, + "step": 7700 + }, + { + "epoch": 1.1364890212953769, + "grad_norm": 0.006153750233352184, + "learning_rate": 2.070622854340363e-05, + "loss": 0.0055, + "step": 7725 + }, + { + "epoch": 1.1401669792930964, + "grad_norm": 0.006996823474764824, + "learning_rate": 2.0665358836030737e-05, + "loss": 0.0002, + "step": 7750 + }, + { + "epoch": 1.1438449372908162, + "grad_norm": 0.006032935809344053, + "learning_rate": 2.0624489128657842e-05, + "loss": 0.0331, + "step": 7775 + }, + { + "epoch": 1.1475228952885357, + "grad_norm": 0.003607578342780471, + "learning_rate": 2.0583619421284944e-05, + "loss": 0.0002, + "step": 7800 + }, + { + "epoch": 1.1512008532862554, + "grad_norm": 0.004726866725832224, + "learning_rate": 2.054274971391205e-05, + "loss": 0.0002, + "step": 7825 + }, + { + "epoch": 1.1548788112839752, + "grad_norm": 0.004033273551613092, + "learning_rate": 2.050188000653915e-05, + "loss": 0.0001, + "step": 7850 + }, + { + "epoch": 1.1585567692816947, + "grad_norm": 0.0035559283569455147, + "learning_rate": 2.0461010299166257e-05, + "loss": 0.0001, + "step": 7875 + }, + { + "epoch": 1.1622347272794145, + "grad_norm": 0.002765959594398737, + "learning_rate": 2.0420140591793362e-05, + "loss": 0.0001, + "step": 7900 + }, + { + "epoch": 1.1659126852771342, + "grad_norm": 0.003123935777693987, + "learning_rate": 2.0379270884420468e-05, + "loss": 0.0001, + "step": 7925 + }, + { + "epoch": 1.1695906432748537, + "grad_norm": 0.0030226910021156073, + "learning_rate": 2.0338401177047573e-05, + "loss": 0.0443, + "step": 7950 + }, + { + "epoch": 1.1732686012725735, + "grad_norm": 0.002675386844202876, + "learning_rate": 2.0297531469674675e-05, + "loss": 0.0001, + "step": 7975 + }, + { + "epoch": 1.1769465592702932, + "grad_norm": 0.002876314101740718, + "learning_rate": 2.025666176230178e-05, + "loss": 0.0001, + "step": 8000 + }, + { + "epoch": 1.1806245172680128, + "grad_norm": 0.003930400125682354, + "learning_rate": 2.0215792054928886e-05, + 
"loss": 0.0463, + "step": 8025 + }, + { + "epoch": 1.1843024752657325, + "grad_norm": 0.004908836912363768, + "learning_rate": 2.017492234755599e-05, + "loss": 0.0002, + "step": 8050 + }, + { + "epoch": 1.187980433263452, + "grad_norm": 0.005489639472216368, + "learning_rate": 2.0134052640183097e-05, + "loss": 0.0014, + "step": 8075 + }, + { + "epoch": 1.1916583912611718, + "grad_norm": 0.0054463837295770645, + "learning_rate": 2.0093182932810202e-05, + "loss": 0.0418, + "step": 8100 + }, + { + "epoch": 1.1953363492588915, + "grad_norm": 0.004771388601511717, + "learning_rate": 2.0052313225437307e-05, + "loss": 0.0002, + "step": 8125 + }, + { + "epoch": 1.199014307256611, + "grad_norm": 0.004579597618430853, + "learning_rate": 2.001144351806441e-05, + "loss": 0.0002, + "step": 8150 + }, + { + "epoch": 1.2026922652543308, + "grad_norm": 0.005399708636105061, + "learning_rate": 1.9970573810691515e-05, + "loss": 0.0002, + "step": 8175 + }, + { + "epoch": 1.2063702232520503, + "grad_norm": 0.0028218550141900778, + "learning_rate": 1.992970410331862e-05, + "loss": 0.0001, + "step": 8200 + }, + { + "epoch": 1.21004818124977, + "grad_norm": 0.0270390622317791, + "learning_rate": 1.9888834395945725e-05, + "loss": 0.1464, + "step": 8225 + }, + { + "epoch": 1.2137261392474898, + "grad_norm": 0.007817487232387066, + "learning_rate": 1.984796468857283e-05, + "loss": 0.0005, + "step": 8250 + }, + { + "epoch": 1.2174040972452094, + "grad_norm": 0.009673170745372772, + "learning_rate": 1.9807094981199936e-05, + "loss": 0.0003, + "step": 8275 + }, + { + "epoch": 1.2210820552429291, + "grad_norm": 0.006883264984935522, + "learning_rate": 1.976622527382704e-05, + "loss": 0.0364, + "step": 8300 + }, + { + "epoch": 1.2247600132406489, + "grad_norm": 0.038729436695575714, + "learning_rate": 1.9725355566454147e-05, + "loss": 0.0002, + "step": 8325 + }, + { + "epoch": 1.2284379712383684, + "grad_norm": 0.004570882301777601, + "learning_rate": 1.968448585908125e-05, + "loss": 0.0002, + "step": 8350 + }, + { + "epoch": 1.2321159292360881, + "grad_norm": 0.010231226682662964, + "learning_rate": 1.9643616151708354e-05, + "loss": 0.0463, + "step": 8375 + }, + { + "epoch": 1.235793887233808, + "grad_norm": 0.008044122718274593, + "learning_rate": 1.960274644433546e-05, + "loss": 0.0003, + "step": 8400 + }, + { + "epoch": 1.2394718452315274, + "grad_norm": 0.005202152766287327, + "learning_rate": 1.9561876736962565e-05, + "loss": 0.0391, + "step": 8425 + }, + { + "epoch": 1.2431498032292472, + "grad_norm": 0.0054007298313081264, + "learning_rate": 1.952100702958967e-05, + "loss": 0.0182, + "step": 8450 + }, + { + "epoch": 1.2468277612269667, + "grad_norm": 0.005195588804781437, + "learning_rate": 1.9480137322216776e-05, + "loss": 0.0392, + "step": 8475 + }, + { + "epoch": 1.2505057192246865, + "grad_norm": 0.00451032817363739, + "learning_rate": 1.943926761484388e-05, + "loss": 0.0002, + "step": 8500 + }, + { + "epoch": 1.2541836772224062, + "grad_norm": 0.00390147278085351, + "learning_rate": 1.9398397907470983e-05, + "loss": 0.0002, + "step": 8525 + }, + { + "epoch": 1.2578616352201257, + "grad_norm": 0.0030624952632933855, + "learning_rate": 1.935752820009809e-05, + "loss": 0.0001, + "step": 8550 + }, + { + "epoch": 1.2615395932178455, + "grad_norm": 0.0030448674224317074, + "learning_rate": 1.931665849272519e-05, + "loss": 0.0001, + "step": 8575 + }, + { + "epoch": 1.265217551215565, + "grad_norm": 0.003369387937709689, + "learning_rate": 1.9275788785352296e-05, + "loss": 0.0001, + "step": 8600 + }, + { + "epoch": 
1.2688955092132848, + "grad_norm": 0.0026294661220163107, + "learning_rate": 1.92349190779794e-05, + "loss": 0.0001, + "step": 8625 + }, + { + "epoch": 1.2725734672110045, + "grad_norm": 0.002674271585419774, + "learning_rate": 1.9194049370606507e-05, + "loss": 0.0001, + "step": 8650 + }, + { + "epoch": 1.276251425208724, + "grad_norm": 0.016562707722187042, + "learning_rate": 1.9153179663233612e-05, + "loss": 0.0001, + "step": 8675 + }, + { + "epoch": 1.2799293832064438, + "grad_norm": 0.002845450770109892, + "learning_rate": 1.9112309955860714e-05, + "loss": 0.0376, + "step": 8700 + }, + { + "epoch": 1.2836073412041635, + "grad_norm": 0.002954358235001564, + "learning_rate": 1.907144024848782e-05, + "loss": 0.0001, + "step": 8725 + }, + { + "epoch": 1.287285299201883, + "grad_norm": 0.002028050599619746, + "learning_rate": 1.9030570541114925e-05, + "loss": 0.0047, + "step": 8750 + }, + { + "epoch": 1.2909632571996028, + "grad_norm": 0.002608607057482004, + "learning_rate": 1.898970083374203e-05, + "loss": 0.0001, + "step": 8775 + }, + { + "epoch": 1.2946412151973226, + "grad_norm": 0.0024424525909125805, + "learning_rate": 1.8948831126369135e-05, + "loss": 0.0001, + "step": 8800 + }, + { + "epoch": 1.298319173195042, + "grad_norm": 0.001993270590901375, + "learning_rate": 1.890796141899624e-05, + "loss": 0.0001, + "step": 8825 + }, + { + "epoch": 1.3019971311927618, + "grad_norm": 0.009992810897529125, + "learning_rate": 1.8867091711623346e-05, + "loss": 0.0001, + "step": 8850 + }, + { + "epoch": 1.3056750891904814, + "grad_norm": 0.003959705121815205, + "learning_rate": 1.8826222004250448e-05, + "loss": 0.0336, + "step": 8875 + }, + { + "epoch": 1.3093530471882011, + "grad_norm": 0.002648918190971017, + "learning_rate": 1.8785352296877554e-05, + "loss": 0.0002, + "step": 8900 + }, + { + "epoch": 1.3130310051859206, + "grad_norm": 0.001997936749830842, + "learning_rate": 1.874448258950466e-05, + "loss": 0.0001, + "step": 8925 + }, + { + "epoch": 1.3167089631836404, + "grad_norm": 0.0019702455028891563, + "learning_rate": 1.8703612882131764e-05, + "loss": 0.0001, + "step": 8950 + }, + { + "epoch": 1.3203869211813601, + "grad_norm": 0.0019666815642267466, + "learning_rate": 1.866274317475887e-05, + "loss": 0.015, + "step": 8975 + }, + { + "epoch": 1.3240648791790797, + "grad_norm": 0.016209330409765244, + "learning_rate": 1.8621873467385975e-05, + "loss": 0.0499, + "step": 9000 + }, + { + "epoch": 1.3277428371767994, + "grad_norm": 0.002770668361335993, + "learning_rate": 1.858100376001308e-05, + "loss": 0.0001, + "step": 9025 + }, + { + "epoch": 1.3314207951745192, + "grad_norm": 0.0025566229596734047, + "learning_rate": 1.8540134052640182e-05, + "loss": 0.0429, + "step": 9050 + }, + { + "epoch": 1.3350987531722387, + "grad_norm": 0.00490075396373868, + "learning_rate": 1.8499264345267288e-05, + "loss": 0.0391, + "step": 9075 + }, + { + "epoch": 1.3387767111699584, + "grad_norm": 0.002448379760608077, + "learning_rate": 1.8458394637894393e-05, + "loss": 0.0002, + "step": 9100 + }, + { + "epoch": 1.3424546691676782, + "grad_norm": 0.0027882566209882498, + "learning_rate": 1.84175249305215e-05, + "loss": 0.0001, + "step": 9125 + }, + { + "epoch": 1.3461326271653977, + "grad_norm": 0.0021890706848353148, + "learning_rate": 1.8376655223148604e-05, + "loss": 0.0001, + "step": 9150 + }, + { + "epoch": 1.3498105851631175, + "grad_norm": 0.002767590805888176, + "learning_rate": 1.833578551577571e-05, + "loss": 0.0001, + "step": 9175 + }, + { + "epoch": 1.3534885431608372, + "grad_norm": 
0.0018375491490587592, + "learning_rate": 1.8294915808402815e-05, + "loss": 0.0003, + "step": 9200 + }, + { + "epoch": 1.3571665011585567, + "grad_norm": 0.0020680581219494343, + "learning_rate": 1.825404610102992e-05, + "loss": 0.0001, + "step": 9225 + }, + { + "epoch": 1.3608444591562765, + "grad_norm": 0.001452911994419992, + "learning_rate": 1.8213176393657022e-05, + "loss": 0.0001, + "step": 9250 + }, + { + "epoch": 1.364522417153996, + "grad_norm": 0.011856326833367348, + "learning_rate": 1.8172306686284127e-05, + "loss": 0.0498, + "step": 9275 + }, + { + "epoch": 1.3682003751517158, + "grad_norm": 0.005070924758911133, + "learning_rate": 1.813143697891123e-05, + "loss": 0.0003, + "step": 9300 + }, + { + "epoch": 1.3718783331494353, + "grad_norm": 0.003941578324884176, + "learning_rate": 1.8090567271538335e-05, + "loss": 0.0001, + "step": 9325 + }, + { + "epoch": 1.375556291147155, + "grad_norm": 0.0044369762763381, + "learning_rate": 1.804969756416544e-05, + "loss": 0.0395, + "step": 9350 + }, + { + "epoch": 1.3792342491448748, + "grad_norm": 0.003973621409386396, + "learning_rate": 1.8008827856792546e-05, + "loss": 0.0002, + "step": 9375 + }, + { + "epoch": 1.3829122071425943, + "grad_norm": 0.00455184280872345, + "learning_rate": 1.796795814941965e-05, + "loss": 0.0001, + "step": 9400 + }, + { + "epoch": 1.386590165140314, + "grad_norm": 0.0031091428827494383, + "learning_rate": 1.7927088442046753e-05, + "loss": 0.0001, + "step": 9425 + }, + { + "epoch": 1.3902681231380338, + "grad_norm": 0.0024325144477188587, + "learning_rate": 1.7886218734673858e-05, + "loss": 0.0001, + "step": 9450 + }, + { + "epoch": 1.3939460811357534, + "grad_norm": 0.0036399061791598797, + "learning_rate": 1.7845349027300964e-05, + "loss": 0.0001, + "step": 9475 + }, + { + "epoch": 1.397624039133473, + "grad_norm": 0.0023723021149635315, + "learning_rate": 1.780447931992807e-05, + "loss": 0.0001, + "step": 9500 + }, + { + "epoch": 1.4013019971311929, + "grad_norm": 0.0027509965002536774, + "learning_rate": 1.7763609612555174e-05, + "loss": 0.0001, + "step": 9525 + }, + { + "epoch": 1.4049799551289124, + "grad_norm": 0.0033826676663011312, + "learning_rate": 1.772273990518228e-05, + "loss": 0.0001, + "step": 9550 + }, + { + "epoch": 1.4086579131266321, + "grad_norm": 0.011138912290334702, + "learning_rate": 1.7681870197809385e-05, + "loss": 0.0398, + "step": 9575 + }, + { + "epoch": 1.4123358711243519, + "grad_norm": 0.023271048441529274, + "learning_rate": 1.7641000490436487e-05, + "loss": 0.0747, + "step": 9600 + }, + { + "epoch": 1.4160138291220714, + "grad_norm": 0.18063010275363922, + "learning_rate": 1.7600130783063593e-05, + "loss": 0.0009, + "step": 9625 + }, + { + "epoch": 1.4196917871197912, + "grad_norm": 0.012859140522778034, + "learning_rate": 1.7559261075690698e-05, + "loss": 0.0444, + "step": 9650 + }, + { + "epoch": 1.4233697451175107, + "grad_norm": 0.003733620513230562, + "learning_rate": 1.7518391368317803e-05, + "loss": 0.0219, + "step": 9675 + }, + { + "epoch": 1.4270477031152304, + "grad_norm": 4.048089504241943, + "learning_rate": 1.747752166094491e-05, + "loss": 0.052, + "step": 9700 + }, + { + "epoch": 1.43072566111295, + "grad_norm": 0.02329842559993267, + "learning_rate": 1.7436651953572014e-05, + "loss": 0.0033, + "step": 9725 + }, + { + "epoch": 1.4344036191106697, + "grad_norm": 0.5609085559844971, + "learning_rate": 1.739578224619912e-05, + "loss": 0.0468, + "step": 9750 + }, + { + "epoch": 1.4380815771083895, + "grad_norm": 0.010268951766192913, + "learning_rate": 
1.735491253882622e-05, + "loss": 0.0004, + "step": 9775 + }, + { + "epoch": 1.441759535106109, + "grad_norm": 0.005183890461921692, + "learning_rate": 1.7314042831453327e-05, + "loss": 0.0002, + "step": 9800 + }, + { + "epoch": 1.4454374931038287, + "grad_norm": 0.006362477317452431, + "learning_rate": 1.7273173124080432e-05, + "loss": 0.0623, + "step": 9825 + }, + { + "epoch": 1.4491154511015485, + "grad_norm": 0.004158661235123873, + "learning_rate": 1.7232303416707537e-05, + "loss": 0.0002, + "step": 9850 + }, + { + "epoch": 1.452793409099268, + "grad_norm": 0.003037210088223219, + "learning_rate": 1.7191433709334643e-05, + "loss": 0.0001, + "step": 9875 + }, + { + "epoch": 1.4564713670969878, + "grad_norm": 0.006479774601757526, + "learning_rate": 1.7150564001961748e-05, + "loss": 0.0562, + "step": 9900 + }, + { + "epoch": 1.4601493250947075, + "grad_norm": 34.625465393066406, + "learning_rate": 1.7109694294588854e-05, + "loss": 0.0423, + "step": 9925 + }, + { + "epoch": 1.463827283092427, + "grad_norm": 0.003740801243111491, + "learning_rate": 1.706882458721596e-05, + "loss": 0.0001, + "step": 9950 + }, + { + "epoch": 1.4675052410901468, + "grad_norm": 0.06391607969999313, + "learning_rate": 1.702795487984306e-05, + "loss": 0.0211, + "step": 9975 + }, + { + "epoch": 1.4711831990878665, + "grad_norm": 0.0029998337849974632, + "learning_rate": 1.6987085172470166e-05, + "loss": 0.0012, + "step": 10000 + }, + { + "epoch": 1.474861157085586, + "grad_norm": 0.002598424442112446, + "learning_rate": 1.6946215465097272e-05, + "loss": 0.0056, + "step": 10025 + }, + { + "epoch": 1.4785391150833058, + "grad_norm": 0.0026498546358197927, + "learning_rate": 1.6905345757724374e-05, + "loss": 0.0003, + "step": 10050 + }, + { + "epoch": 1.4822170730810253, + "grad_norm": 0.002896289573982358, + "learning_rate": 1.686447605035148e-05, + "loss": 0.0244, + "step": 10075 + }, + { + "epoch": 1.485895031078745, + "grad_norm": 0.002737634815275669, + "learning_rate": 1.6823606342978584e-05, + "loss": 0.0002, + "step": 10100 + }, + { + "epoch": 1.4895729890764646, + "grad_norm": 0.002295145532116294, + "learning_rate": 1.678273663560569e-05, + "loss": 0.0001, + "step": 10125 + }, + { + "epoch": 1.4932509470741844, + "grad_norm": 0.0018749627051874995, + "learning_rate": 1.6741866928232792e-05, + "loss": 0.0001, + "step": 10150 + }, + { + "epoch": 1.4969289050719041, + "grad_norm": 0.002252426231279969, + "learning_rate": 1.6700997220859897e-05, + "loss": 0.0091, + "step": 10175 + }, + { + "epoch": 1.5006068630696237, + "grad_norm": 0.001987684750929475, + "learning_rate": 1.6660127513487003e-05, + "loss": 0.0059, + "step": 10200 + }, + { + "epoch": 1.5042848210673434, + "grad_norm": 0.0018681609071791172, + "learning_rate": 1.6619257806114108e-05, + "loss": 0.0036, + "step": 10225 + }, + { + "epoch": 1.5079627790650632, + "grad_norm": 0.002243634080514312, + "learning_rate": 1.6578388098741213e-05, + "loss": 0.0001, + "step": 10250 + }, + { + "epoch": 1.5116407370627827, + "grad_norm": 0.005282828118652105, + "learning_rate": 1.653751839136832e-05, + "loss": 0.0508, + "step": 10275 + }, + { + "epoch": 1.5153186950605024, + "grad_norm": 0.0033266160171478987, + "learning_rate": 1.6496648683995424e-05, + "loss": 0.0036, + "step": 10300 + }, + { + "epoch": 1.5189966530582222, + "grad_norm": 0.0024327326100319624, + "learning_rate": 1.6455778976622526e-05, + "loss": 0.0001, + "step": 10325 + }, + { + "epoch": 1.5226746110559417, + "grad_norm": 0.0037725428119301796, + "learning_rate": 1.641490926924963e-05, + 
"loss": 0.0859, + "step": 10350 + }, + { + "epoch": 1.5263525690536615, + "grad_norm": 0.01479677390307188, + "learning_rate": 1.6374039561876737e-05, + "loss": 0.0002, + "step": 10375 + }, + { + "epoch": 1.5300305270513812, + "grad_norm": 0.002465145429596305, + "learning_rate": 1.6333169854503842e-05, + "loss": 0.0009, + "step": 10400 + }, + { + "epoch": 1.5337084850491007, + "grad_norm": 0.002028359565883875, + "learning_rate": 1.6292300147130948e-05, + "loss": 0.0001, + "step": 10425 + }, + { + "epoch": 1.5373864430468203, + "grad_norm": 0.0017766653327271342, + "learning_rate": 1.6251430439758053e-05, + "loss": 0.0001, + "step": 10450 + }, + { + "epoch": 1.5410644010445402, + "grad_norm": 0.002013767370954156, + "learning_rate": 1.6210560732385158e-05, + "loss": 0.0255, + "step": 10475 + }, + { + "epoch": 1.5447423590422598, + "grad_norm": 0.0019861028995364904, + "learning_rate": 1.616969102501226e-05, + "loss": 0.0109, + "step": 10500 + }, + { + "epoch": 1.5484203170399793, + "grad_norm": 0.0017919589299708605, + "learning_rate": 1.6128821317639366e-05, + "loss": 0.0063, + "step": 10525 + }, + { + "epoch": 1.552098275037699, + "grad_norm": 0.001575242611579597, + "learning_rate": 1.608795161026647e-05, + "loss": 0.0001, + "step": 10550 + }, + { + "epoch": 1.5557762330354188, + "grad_norm": 0.0017625424079596996, + "learning_rate": 1.6047081902893576e-05, + "loss": 0.0001, + "step": 10575 + }, + { + "epoch": 1.5594541910331383, + "grad_norm": 0.0014293509302660823, + "learning_rate": 1.6006212195520682e-05, + "loss": 0.0001, + "step": 10600 + }, + { + "epoch": 1.563132149030858, + "grad_norm": 3.637284994125366, + "learning_rate": 1.5965342488147787e-05, + "loss": 0.0319, + "step": 10625 + }, + { + "epoch": 1.5668101070285778, + "grad_norm": 0.0015190584817901254, + "learning_rate": 1.5924472780774893e-05, + "loss": 0.1112, + "step": 10650 + }, + { + "epoch": 1.5704880650262973, + "grad_norm": 0.0019073854200541973, + "learning_rate": 1.5883603073401995e-05, + "loss": 0.0001, + "step": 10675 + }, + { + "epoch": 1.574166023024017, + "grad_norm": 0.15334878861904144, + "learning_rate": 1.58427333660291e-05, + "loss": 0.0001, + "step": 10700 + }, + { + "epoch": 1.5778439810217368, + "grad_norm": 0.0013233659556135535, + "learning_rate": 1.5801863658656205e-05, + "loss": 0.0006, + "step": 10725 + }, + { + "epoch": 1.5815219390194564, + "grad_norm": 60.88636779785156, + "learning_rate": 1.576099395128331e-05, + "loss": 0.0794, + "step": 10750 + }, + { + "epoch": 1.5851998970171761, + "grad_norm": 0.006810314953327179, + "learning_rate": 1.5720124243910413e-05, + "loss": 0.0272, + "step": 10775 + }, + { + "epoch": 1.5888778550148959, + "grad_norm": 0.006012595724314451, + "learning_rate": 1.5679254536537518e-05, + "loss": 0.0177, + "step": 10800 + }, + { + "epoch": 1.5925558130126154, + "grad_norm": 0.0041669500060379505, + "learning_rate": 1.5638384829164623e-05, + "loss": 0.0007, + "step": 10825 + }, + { + "epoch": 1.596233771010335, + "grad_norm": 0.0024410944897681475, + "learning_rate": 1.559751512179173e-05, + "loss": 0.0001, + "step": 10850 + }, + { + "epoch": 1.5999117290080547, + "grad_norm": 0.002287843730300665, + "learning_rate": 1.555664541441883e-05, + "loss": 0.0001, + "step": 10875 + }, + { + "epoch": 1.6035896870057744, + "grad_norm": 0.002450288040563464, + "learning_rate": 1.5515775707045936e-05, + "loss": 0.0001, + "step": 10900 + }, + { + "epoch": 1.607267645003494, + "grad_norm": 0.0017540917033329606, + "learning_rate": 1.547490599967304e-05, + "loss": 0.0001, + 
"step": 10925 + }, + { + "epoch": 1.6109456030012137, + "grad_norm": 0.0018945990595966578, + "learning_rate": 1.5434036292300147e-05, + "loss": 0.0001, + "step": 10950 + }, + { + "epoch": 1.6146235609989334, + "grad_norm": 0.38427916169166565, + "learning_rate": 1.5393166584927252e-05, + "loss": 0.0478, + "step": 10975 + }, + { + "epoch": 1.618301518996653, + "grad_norm": 0.005249540787190199, + "learning_rate": 1.5352296877554358e-05, + "loss": 0.0005, + "step": 11000 + }, + { + "epoch": 1.6219794769943727, + "grad_norm": 0.049626559019088745, + "learning_rate": 1.5311427170181463e-05, + "loss": 0.0803, + "step": 11025 + }, + { + "epoch": 1.6256574349920925, + "grad_norm": 0.006765100173652172, + "learning_rate": 1.5270557462808565e-05, + "loss": 0.021, + "step": 11050 + }, + { + "epoch": 1.629335392989812, + "grad_norm": 0.012057892046868801, + "learning_rate": 1.522968775543567e-05, + "loss": 0.0005, + "step": 11075 + }, + { + "epoch": 1.6330133509875318, + "grad_norm": 0.012171362526714802, + "learning_rate": 1.5188818048062776e-05, + "loss": 0.0171, + "step": 11100 + }, + { + "epoch": 1.6366913089852515, + "grad_norm": 0.006173169240355492, + "learning_rate": 1.5147948340689881e-05, + "loss": 0.0183, + "step": 11125 + }, + { + "epoch": 1.640369266982971, + "grad_norm": 0.025982793420553207, + "learning_rate": 1.5107078633316986e-05, + "loss": 0.041, + "step": 11150 + }, + { + "epoch": 1.6440472249806906, + "grad_norm": 0.0121184466406703, + "learning_rate": 1.5066208925944092e-05, + "loss": 0.0066, + "step": 11175 + }, + { + "epoch": 1.6477251829784105, + "grad_norm": 0.008928947150707245, + "learning_rate": 1.5025339218571197e-05, + "loss": 0.0013, + "step": 11200 + }, + { + "epoch": 1.65140314097613, + "grad_norm": 0.003572331042960286, + "learning_rate": 1.4984469511198301e-05, + "loss": 0.0448, + "step": 11225 + }, + { + "epoch": 1.6550810989738496, + "grad_norm": 0.012093408964574337, + "learning_rate": 1.4943599803825406e-05, + "loss": 0.0003, + "step": 11250 + }, + { + "epoch": 1.6587590569715693, + "grad_norm": 0.005746824201196432, + "learning_rate": 1.490273009645251e-05, + "loss": 0.0002, + "step": 11275 + }, + { + "epoch": 1.662437014969289, + "grad_norm": 0.005075458902865648, + "learning_rate": 1.4861860389079615e-05, + "loss": 0.0431, + "step": 11300 + }, + { + "epoch": 1.6661149729670086, + "grad_norm": 0.006644480861723423, + "learning_rate": 1.4820990681706719e-05, + "loss": 0.0003, + "step": 11325 + }, + { + "epoch": 1.6697929309647284, + "grad_norm": 0.016171354800462723, + "learning_rate": 1.4780120974333823e-05, + "loss": 0.0163, + "step": 11350 + }, + { + "epoch": 1.673470888962448, + "grad_norm": 0.005658384878188372, + "learning_rate": 1.4739251266960928e-05, + "loss": 0.0022, + "step": 11375 + }, + { + "epoch": 1.6771488469601676, + "grad_norm": 0.010968804359436035, + "learning_rate": 1.4698381559588033e-05, + "loss": 0.0804, + "step": 11400 + }, + { + "epoch": 1.6808268049578874, + "grad_norm": 0.029876096174120903, + "learning_rate": 1.4657511852215139e-05, + "loss": 0.067, + "step": 11425 + }, + { + "epoch": 1.6845047629556071, + "grad_norm": 0.03841656073927879, + "learning_rate": 1.4616642144842242e-05, + "loss": 0.0349, + "step": 11450 + }, + { + "epoch": 1.6881827209533267, + "grad_norm": 0.017025554552674294, + "learning_rate": 1.4575772437469348e-05, + "loss": 0.001, + "step": 11475 + }, + { + "epoch": 1.6918606789510464, + "grad_norm": 0.024776197969913483, + "learning_rate": 1.4534902730096453e-05, + "loss": 0.0356, + "step": 11500 + }, + { + 
"epoch": 1.6955386369487662, + "grad_norm": 0.018094466999173164, + "learning_rate": 1.4494033022723559e-05, + "loss": 0.0006, + "step": 11525 + }, + { + "epoch": 1.6992165949464857, + "grad_norm": 0.010948434472084045, + "learning_rate": 1.4453163315350662e-05, + "loss": 0.0566, + "step": 11550 + }, + { + "epoch": 1.7028945529442052, + "grad_norm": 0.06060256063938141, + "learning_rate": 1.4412293607977768e-05, + "loss": 0.087, + "step": 11575 + }, + { + "epoch": 1.7065725109419252, + "grad_norm": 0.0425218902528286, + "learning_rate": 1.4371423900604873e-05, + "loss": 0.0014, + "step": 11600 + }, + { + "epoch": 1.7102504689396447, + "grad_norm": 0.03931298479437828, + "learning_rate": 1.4330554193231977e-05, + "loss": 0.0329, + "step": 11625 + }, + { + "epoch": 1.7139284269373642, + "grad_norm": 0.05203554406762123, + "learning_rate": 1.4289684485859082e-05, + "loss": 0.0667, + "step": 11650 + }, + { + "epoch": 1.717606384935084, + "grad_norm": 0.059145841747522354, + "learning_rate": 1.4248814778486187e-05, + "loss": 0.0464, + "step": 11675 + }, + { + "epoch": 1.7212843429328037, + "grad_norm": 0.053441960364580154, + "learning_rate": 1.4207945071113291e-05, + "loss": 0.0598, + "step": 11700 + }, + { + "epoch": 1.7249623009305233, + "grad_norm": 0.0338728241622448, + "learning_rate": 1.4167075363740395e-05, + "loss": 0.0014, + "step": 11725 + }, + { + "epoch": 1.728640258928243, + "grad_norm": 0.03298606723546982, + "learning_rate": 1.41262056563675e-05, + "loss": 0.0011, + "step": 11750 + }, + { + "epoch": 1.7323182169259628, + "grad_norm": 0.007968394085764885, + "learning_rate": 1.4085335948994606e-05, + "loss": 0.0332, + "step": 11775 + }, + { + "epoch": 1.7359961749236823, + "grad_norm": 0.033015619963407516, + "learning_rate": 1.404446624162171e-05, + "loss": 0.0471, + "step": 11800 + }, + { + "epoch": 1.739674132921402, + "grad_norm": 0.03123684972524643, + "learning_rate": 1.4003596534248815e-05, + "loss": 0.0008, + "step": 11825 + }, + { + "epoch": 1.7433520909191218, + "grad_norm": 0.026270106434822083, + "learning_rate": 1.396272682687592e-05, + "loss": 0.027, + "step": 11850 + }, + { + "epoch": 1.7470300489168413, + "grad_norm": 0.025614146143198013, + "learning_rate": 1.3921857119503025e-05, + "loss": 0.0006, + "step": 11875 + }, + { + "epoch": 1.750708006914561, + "grad_norm": 0.011196363717317581, + "learning_rate": 1.3880987412130129e-05, + "loss": 0.0004, + "step": 11900 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 0.014085380360484123, + "learning_rate": 1.3840117704757234e-05, + "loss": 0.0007, + "step": 11925 + }, + { + "epoch": 1.7580639229100004, + "grad_norm": 0.2520334720611572, + "learning_rate": 1.379924799738434e-05, + "loss": 0.0012, + "step": 11950 + }, + { + "epoch": 1.7617418809077199, + "grad_norm": 0.0027042387519031763, + "learning_rate": 1.3758378290011445e-05, + "loss": 0.0657, + "step": 11975 + }, + { + "epoch": 1.7654198389054399, + "grad_norm": 0.007959190756082535, + "learning_rate": 1.3717508582638549e-05, + "loss": 0.0009, + "step": 12000 + }, + { + "epoch": 1.7690977969031594, + "grad_norm": 0.006802896503359079, + "learning_rate": 1.3676638875265654e-05, + "loss": 0.0002, + "step": 12025 + }, + { + "epoch": 1.772775754900879, + "grad_norm": 0.0037322076968848705, + "learning_rate": 1.3635769167892758e-05, + "loss": 0.0002, + "step": 12050 + }, + { + "epoch": 1.7764537128985987, + "grad_norm": 0.004444212652742863, + "learning_rate": 1.3594899460519862e-05, + "loss": 0.0113, + "step": 12075 + }, + { + "epoch": 1.7801316708963184, + 
"grad_norm": 0.0029294530395418406, + "learning_rate": 1.3554029753146967e-05, + "loss": 0.0024, + "step": 12100 + }, + { + "epoch": 1.783809628894038, + "grad_norm": 0.006351064890623093, + "learning_rate": 1.3513160045774072e-05, + "loss": 0.0339, + "step": 12125 + }, + { + "epoch": 1.7874875868917577, + "grad_norm": 0.0033591645769774914, + "learning_rate": 1.3472290338401178e-05, + "loss": 0.003, + "step": 12150 + }, + { + "epoch": 1.7911655448894774, + "grad_norm": 0.003340468741953373, + "learning_rate": 1.3431420631028281e-05, + "loss": 0.0002, + "step": 12175 + }, + { + "epoch": 1.794843502887197, + "grad_norm": 0.12212031334638596, + "learning_rate": 1.3390550923655387e-05, + "loss": 0.0836, + "step": 12200 + }, + { + "epoch": 1.7985214608849167, + "grad_norm": 0.014243889600038528, + "learning_rate": 1.3349681216282492e-05, + "loss": 0.0318, + "step": 12225 + }, + { + "epoch": 1.8021994188826365, + "grad_norm": 0.016160359606146812, + "learning_rate": 1.3308811508909596e-05, + "loss": 0.0003, + "step": 12250 + }, + { + "epoch": 1.805877376880356, + "grad_norm": 0.011376752518117428, + "learning_rate": 1.3267941801536701e-05, + "loss": 0.0003, + "step": 12275 + }, + { + "epoch": 1.8095553348780757, + "grad_norm": 0.00865715742111206, + "learning_rate": 1.3227072094163807e-05, + "loss": 0.0133, + "step": 12300 + }, + { + "epoch": 1.8132332928757955, + "grad_norm": 0.007116909604519606, + "learning_rate": 1.3186202386790912e-05, + "loss": 0.0003, + "step": 12325 + }, + { + "epoch": 1.816911250873515, + "grad_norm": 0.008155121468007565, + "learning_rate": 1.3145332679418016e-05, + "loss": 0.0385, + "step": 12350 + }, + { + "epoch": 1.8205892088712345, + "grad_norm": 0.013204419054090977, + "learning_rate": 1.3104462972045121e-05, + "loss": 0.0356, + "step": 12375 + }, + { + "epoch": 1.8242671668689545, + "grad_norm": 0.013173281215131283, + "learning_rate": 1.3063593264672226e-05, + "loss": 0.0004, + "step": 12400 + }, + { + "epoch": 1.827945124866674, + "grad_norm": 0.010820701718330383, + "learning_rate": 1.302272355729933e-05, + "loss": 0.0003, + "step": 12425 + }, + { + "epoch": 1.8316230828643936, + "grad_norm": 0.00571137759834528, + "learning_rate": 1.2981853849926434e-05, + "loss": 0.0011, + "step": 12450 + }, + { + "epoch": 1.8353010408621133, + "grad_norm": 0.007815693505108356, + "learning_rate": 1.2940984142553539e-05, + "loss": 0.0002, + "step": 12475 + }, + { + "epoch": 1.838978998859833, + "grad_norm": 0.04561807960271835, + "learning_rate": 1.2900114435180645e-05, + "loss": 0.0002, + "step": 12500 + }, + { + "epoch": 1.8426569568575526, + "grad_norm": 0.007523215841501951, + "learning_rate": 1.2859244727807748e-05, + "loss": 0.0203, + "step": 12525 + }, + { + "epoch": 1.8463349148552723, + "grad_norm": 0.007975575514137745, + "learning_rate": 1.2818375020434854e-05, + "loss": 0.0002, + "step": 12550 + }, + { + "epoch": 1.850012872852992, + "grad_norm": 0.007269065361469984, + "learning_rate": 1.2777505313061959e-05, + "loss": 0.0002, + "step": 12575 + }, + { + "epoch": 1.8536908308507116, + "grad_norm": 0.004501336719840765, + "learning_rate": 1.2736635605689064e-05, + "loss": 0.0001, + "step": 12600 + }, + { + "epoch": 1.8573687888484314, + "grad_norm": 0.004011464770883322, + "learning_rate": 1.2695765898316168e-05, + "loss": 0.0003, + "step": 12625 + }, + { + "epoch": 1.8610467468461511, + "grad_norm": 0.002334051998332143, + "learning_rate": 1.2654896190943273e-05, + "loss": 0.0234, + "step": 12650 + }, + { + "epoch": 1.8647247048438707, + "grad_norm": 
0.004475513007491827, + "learning_rate": 1.2614026483570379e-05, + "loss": 0.0002, + "step": 12675 + }, + { + "epoch": 1.8684026628415904, + "grad_norm": 0.003851409535855055, + "learning_rate": 1.2573156776197482e-05, + "loss": 0.0001, + "step": 12700 + }, + { + "epoch": 1.8720806208393101, + "grad_norm": 0.0028481779154390097, + "learning_rate": 1.2532287068824588e-05, + "loss": 0.0255, + "step": 12725 + }, + { + "epoch": 1.8757585788370297, + "grad_norm": 0.0030939916614443064, + "learning_rate": 1.2491417361451693e-05, + "loss": 0.0332, + "step": 12750 + }, + { + "epoch": 1.8794365368347492, + "grad_norm": 0.0065445504151284695, + "learning_rate": 1.2450547654078797e-05, + "loss": 0.0422, + "step": 12775 + }, + { + "epoch": 1.8831144948324692, + "grad_norm": 0.005459626670926809, + "learning_rate": 1.24096779467059e-05, + "loss": 0.0113, + "step": 12800 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 0.002942801220342517, + "learning_rate": 1.2368808239333006e-05, + "loss": 0.0002, + "step": 12825 + }, + { + "epoch": 1.8904704108279082, + "grad_norm": 0.0067766509018838406, + "learning_rate": 1.2327938531960111e-05, + "loss": 0.018, + "step": 12850 + }, + { + "epoch": 1.894148368825628, + "grad_norm": 0.005411918740719557, + "learning_rate": 1.2287068824587217e-05, + "loss": 0.0486, + "step": 12875 + }, + { + "epoch": 1.8978263268233477, + "grad_norm": 0.006009817123413086, + "learning_rate": 1.224619911721432e-05, + "loss": 0.0002, + "step": 12900 + }, + { + "epoch": 1.9015042848210673, + "grad_norm": 0.005595459137111902, + "learning_rate": 1.2205329409841426e-05, + "loss": 0.0408, + "step": 12925 + }, + { + "epoch": 1.905182242818787, + "grad_norm": 0.012987248599529266, + "learning_rate": 1.2164459702468531e-05, + "loss": 0.0823, + "step": 12950 + }, + { + "epoch": 1.9088602008165068, + "grad_norm": 0.16368244588375092, + "learning_rate": 1.2123589995095635e-05, + "loss": 0.0018, + "step": 12975 + }, + { + "epoch": 1.9125381588142263, + "grad_norm": 0.00949876382946968, + "learning_rate": 1.208272028772274e-05, + "loss": 0.0163, + "step": 13000 + }, + { + "epoch": 1.916216116811946, + "grad_norm": 14.246623039245605, + "learning_rate": 1.2041850580349846e-05, + "loss": 0.0342, + "step": 13025 + }, + { + "epoch": 1.9198940748096658, + "grad_norm": 0.00826562475413084, + "learning_rate": 1.2000980872976951e-05, + "loss": 0.0003, + "step": 13050 + }, + { + "epoch": 1.9235720328073853, + "grad_norm": 0.006868135649710894, + "learning_rate": 1.1960111165604055e-05, + "loss": 0.0003, + "step": 13075 + }, + { + "epoch": 1.9272499908051048, + "grad_norm": 0.00808362290263176, + "learning_rate": 1.191924145823116e-05, + "loss": 0.0402, + "step": 13100 + }, + { + "epoch": 1.9309279488028248, + "grad_norm": 0.010807299055159092, + "learning_rate": 1.1878371750858265e-05, + "loss": 0.078, + "step": 13125 + }, + { + "epoch": 1.9346059068005443, + "grad_norm": 0.01139509491622448, + "learning_rate": 1.1837502043485367e-05, + "loss": 0.0007, + "step": 13150 + }, + { + "epoch": 1.9382838647982639, + "grad_norm": 0.00977110955864191, + "learning_rate": 1.1796632336112473e-05, + "loss": 0.0004, + "step": 13175 + }, + { + "epoch": 1.9419618227959836, + "grad_norm": 0.006910570897161961, + "learning_rate": 1.1755762628739578e-05, + "loss": 0.0003, + "step": 13200 + }, + { + "epoch": 1.9456397807937034, + "grad_norm": 4.1620564460754395, + "learning_rate": 1.1714892921366683e-05, + "loss": 0.0667, + "step": 13225 + }, + { + "epoch": 1.949317738791423, + "grad_norm": 0.015238853171467781, + 
"learning_rate": 1.1674023213993787e-05, + "loss": 0.0015, + "step": 13250 + }, + { + "epoch": 1.9529956967891426, + "grad_norm": 0.007931707426905632, + "learning_rate": 1.1633153506620892e-05, + "loss": 0.0003, + "step": 13275 + }, + { + "epoch": 1.9566736547868624, + "grad_norm": 0.009560568258166313, + "learning_rate": 1.1592283799247998e-05, + "loss": 0.0003, + "step": 13300 + }, + { + "epoch": 1.960351612784582, + "grad_norm": 0.008578946813941002, + "learning_rate": 1.1551414091875103e-05, + "loss": 0.0008, + "step": 13325 + }, + { + "epoch": 1.9640295707823017, + "grad_norm": 0.011748207733035088, + "learning_rate": 1.1510544384502207e-05, + "loss": 0.0002, + "step": 13350 + }, + { + "epoch": 1.9677075287800214, + "grad_norm": 0.007073475047945976, + "learning_rate": 1.1469674677129312e-05, + "loss": 0.0002, + "step": 13375 + }, + { + "epoch": 1.971385486777741, + "grad_norm": 0.003219211706891656, + "learning_rate": 1.1428804969756418e-05, + "loss": 0.0323, + "step": 13400 + }, + { + "epoch": 1.9750634447754607, + "grad_norm": 0.0061137378215789795, + "learning_rate": 1.1387935262383521e-05, + "loss": 0.0002, + "step": 13425 + }, + { + "epoch": 1.9787414027731804, + "grad_norm": 0.006435078103095293, + "learning_rate": 1.1347065555010627e-05, + "loss": 0.0409, + "step": 13450 + }, + { + "epoch": 1.9824193607709, + "grad_norm": 0.002217411994934082, + "learning_rate": 1.1306195847637732e-05, + "loss": 0.0002, + "step": 13475 + }, + { + "epoch": 1.9860973187686195, + "grad_norm": 0.009155605919659138, + "learning_rate": 1.1265326140264837e-05, + "loss": 0.0487, + "step": 13500 + }, + { + "epoch": 1.9897752767663395, + "grad_norm": 0.011870177462697029, + "learning_rate": 1.122445643289194e-05, + "loss": 0.0004, + "step": 13525 + }, + { + "epoch": 1.993453234764059, + "grad_norm": 0.008746917359530926, + "learning_rate": 1.1183586725519045e-05, + "loss": 0.0411, + "step": 13550 + }, + { + "epoch": 1.9971311927617785, + "grad_norm": 0.005829541012644768, + "learning_rate": 1.114271701814615e-05, + "loss": 0.0003, + "step": 13575 + }, + { + "epoch": 1.9999264408400457, + "eval_accuracy": 0.9963221772710555, + "eval_auc": 0.9999360039904769, + "eval_f1": 0.9963186570460905, + "eval_loss": 0.023916827514767647, + "eval_precision": 0.9973466981132075, + "eval_recall": 0.9952927331568108, + "eval_runtime": 2353.5774, + "eval_samples_per_second": 5.776, + "eval_steps_per_second": 1.444, + "step": 13594 + }, + { + "epoch": 2.0008091507594985, + "grad_norm": 0.010372490622103214, + "learning_rate": 1.1101847310773254e-05, + "loss": 0.0003, + "step": 13600 + }, + { + "epoch": 2.004487108757218, + "grad_norm": 0.014902903698384762, + "learning_rate": 1.106097760340036e-05, + "loss": 0.0673, + "step": 13625 + }, + { + "epoch": 2.0081650667549376, + "grad_norm": 0.005416123196482658, + "learning_rate": 1.1020107896027465e-05, + "loss": 0.0003, + "step": 13650 + }, + { + "epoch": 2.0118430247526575, + "grad_norm": 0.007089643273502588, + "learning_rate": 1.097923818865457e-05, + "loss": 0.0003, + "step": 13675 + }, + { + "epoch": 2.015520982750377, + "grad_norm": 0.005935342982411385, + "learning_rate": 1.0938368481281674e-05, + "loss": 0.0004, + "step": 13700 + }, + { + "epoch": 2.0191989407480966, + "grad_norm": 0.004356461577117443, + "learning_rate": 1.0897498773908779e-05, + "loss": 0.0003, + "step": 13725 + }, + { + "epoch": 2.022876898745816, + "grad_norm": 0.010521539486944675, + "learning_rate": 1.0856629066535884e-05, + "loss": 0.0839, + "step": 13750 + }, + { + "epoch": 
2.026554856743536, + "grad_norm": 0.007211623247712851, + "learning_rate": 1.081575935916299e-05, + "loss": 0.0215, + "step": 13775 + }, + { + "epoch": 2.0302328147412556, + "grad_norm": 0.008732822723686695, + "learning_rate": 1.0774889651790093e-05, + "loss": 0.0002, + "step": 13800 + }, + { + "epoch": 2.033910772738975, + "grad_norm": 0.005103670991957188, + "learning_rate": 1.0734019944417199e-05, + "loss": 0.0002, + "step": 13825 + }, + { + "epoch": 2.037588730736695, + "grad_norm": 0.00569286709651351, + "learning_rate": 1.0693150237044304e-05, + "loss": 0.0002, + "step": 13850 + }, + { + "epoch": 2.0412666887344146, + "grad_norm": 0.004690663423389196, + "learning_rate": 1.0652280529671408e-05, + "loss": 0.0002, + "step": 13875 + }, + { + "epoch": 2.044944646732134, + "grad_norm": 0.003813117044046521, + "learning_rate": 1.0611410822298512e-05, + "loss": 0.0001, + "step": 13900 + }, + { + "epoch": 2.048622604729854, + "grad_norm": 0.0031241225078701973, + "learning_rate": 1.0570541114925617e-05, + "loss": 0.0001, + "step": 13925 + }, + { + "epoch": 2.0523005627275737, + "grad_norm": 0.001760639250278473, + "learning_rate": 1.0529671407552722e-05, + "loss": 0.0003, + "step": 13950 + }, + { + "epoch": 2.055978520725293, + "grad_norm": 0.00507943844422698, + "learning_rate": 1.0488801700179826e-05, + "loss": 0.0465, + "step": 13975 + }, + { + "epoch": 2.059656478723013, + "grad_norm": 0.005704312119632959, + "learning_rate": 1.0447931992806931e-05, + "loss": 0.0002, + "step": 14000 + }, + { + "epoch": 2.0633344367207327, + "grad_norm": 0.0037137740291655064, + "learning_rate": 1.0407062285434037e-05, + "loss": 0.0002, + "step": 14025 + }, + { + "epoch": 2.067012394718452, + "grad_norm": 0.004969414323568344, + "learning_rate": 1.036619257806114e-05, + "loss": 0.0002, + "step": 14050 + }, + { + "epoch": 2.070690352716172, + "grad_norm": 0.002151261083781719, + "learning_rate": 1.0325322870688246e-05, + "loss": 0.0001, + "step": 14075 + }, + { + "epoch": 2.0743683107138917, + "grad_norm": 0.004214055370539427, + "learning_rate": 1.0284453163315351e-05, + "loss": 0.0001, + "step": 14100 + }, + { + "epoch": 2.0780462687116112, + "grad_norm": 0.004696809686720371, + "learning_rate": 1.0243583455942457e-05, + "loss": 0.0001, + "step": 14125 + }, + { + "epoch": 2.0817242267093308, + "grad_norm": 8.668023109436035, + "learning_rate": 1.020271374856956e-05, + "loss": 0.0642, + "step": 14150 + }, + { + "epoch": 2.0854021847070507, + "grad_norm": 0.00823593232780695, + "learning_rate": 1.0161844041196666e-05, + "loss": 0.0004, + "step": 14175 + }, + { + "epoch": 2.0890801427047703, + "grad_norm": 0.006173284724354744, + "learning_rate": 1.0120974333823771e-05, + "loss": 0.0002, + "step": 14200 + }, + { + "epoch": 2.09275810070249, + "grad_norm": 0.004422744270414114, + "learning_rate": 1.0080104626450876e-05, + "loss": 0.0002, + "step": 14225 + }, + { + "epoch": 2.0964360587002098, + "grad_norm": 0.0038796046283096075, + "learning_rate": 1.0039234919077978e-05, + "loss": 0.0002, + "step": 14250 + }, + { + "epoch": 2.1001140166979293, + "grad_norm": 0.003889993764460087, + "learning_rate": 9.998365211705084e-06, + "loss": 0.0008, + "step": 14275 + }, + { + "epoch": 2.103791974695649, + "grad_norm": 0.0035641242284327745, + "learning_rate": 9.957495504332189e-06, + "loss": 0.0001, + "step": 14300 + }, + { + "epoch": 2.107469932693369, + "grad_norm": 0.0037507452070713043, + "learning_rate": 9.916625796959293e-06, + "loss": 0.0001, + "step": 14325 + }, + { + "epoch": 2.1111478906910883, + 
"grad_norm": 0.002810309175401926, + "learning_rate": 9.875756089586398e-06, + "loss": 0.0001, + "step": 14350 + }, + { + "epoch": 2.114825848688808, + "grad_norm": 0.0030445558950304985, + "learning_rate": 9.834886382213504e-06, + "loss": 0.0001, + "step": 14375 + }, + { + "epoch": 2.118503806686528, + "grad_norm": 0.0025213556364178658, + "learning_rate": 9.794016674840609e-06, + "loss": 0.0001, + "step": 14400 + }, + { + "epoch": 2.1221817646842474, + "grad_norm": 0.0027236223686486483, + "learning_rate": 9.753146967467713e-06, + "loss": 0.0001, + "step": 14425 + }, + { + "epoch": 2.125859722681967, + "grad_norm": 0.002416795352473855, + "learning_rate": 9.712277260094818e-06, + "loss": 0.0004, + "step": 14450 + }, + { + "epoch": 2.129537680679687, + "grad_norm": 0.0019158340292051435, + "learning_rate": 9.671407552721923e-06, + "loss": 0.0001, + "step": 14475 + }, + { + "epoch": 2.1332156386774064, + "grad_norm": 0.002519650151953101, + "learning_rate": 9.630537845349029e-06, + "loss": 0.0001, + "step": 14500 + }, + { + "epoch": 2.136893596675126, + "grad_norm": 0.002294061239808798, + "learning_rate": 9.589668137976132e-06, + "loss": 0.0001, + "step": 14525 + }, + { + "epoch": 2.1405715546728454, + "grad_norm": 0.0021358055528253317, + "learning_rate": 9.548798430603238e-06, + "loss": 0.0471, + "step": 14550 + }, + { + "epoch": 2.1442495126705654, + "grad_norm": 0.001824073726311326, + "learning_rate": 9.507928723230343e-06, + "loss": 0.0001, + "step": 14575 + }, + { + "epoch": 2.147927470668285, + "grad_norm": 0.001960406079888344, + "learning_rate": 9.467059015857447e-06, + "loss": 0.0001, + "step": 14600 + }, + { + "epoch": 2.1516054286660045, + "grad_norm": 0.0018290438456460834, + "learning_rate": 9.42618930848455e-06, + "loss": 0.0001, + "step": 14625 + }, + { + "epoch": 2.1552833866637244, + "grad_norm": 0.0019052918069064617, + "learning_rate": 9.385319601111656e-06, + "loss": 0.0001, + "step": 14650 + }, + { + "epoch": 2.158961344661444, + "grad_norm": 0.0018661071080714464, + "learning_rate": 9.344449893738761e-06, + "loss": 0.0001, + "step": 14675 + }, + { + "epoch": 2.1626393026591635, + "grad_norm": 0.0031746248714625835, + "learning_rate": 9.303580186365865e-06, + "loss": 0.049, + "step": 14700 + }, + { + "epoch": 2.1663172606568835, + "grad_norm": 0.003573804395273328, + "learning_rate": 9.26271047899297e-06, + "loss": 0.0001, + "step": 14725 + }, + { + "epoch": 2.169995218654603, + "grad_norm": 0.003289070213213563, + "learning_rate": 9.221840771620076e-06, + "loss": 0.0113, + "step": 14750 + }, + { + "epoch": 2.1736731766523225, + "grad_norm": 0.00257130921818316, + "learning_rate": 9.18097106424718e-06, + "loss": 0.0483, + "step": 14775 + }, + { + "epoch": 2.1773511346500425, + "grad_norm": 0.005980730522423983, + "learning_rate": 9.140101356874285e-06, + "loss": 0.0002, + "step": 14800 + }, + { + "epoch": 2.181029092647762, + "grad_norm": 0.005953842308372259, + "learning_rate": 9.09923164950139e-06, + "loss": 0.0002, + "step": 14825 + }, + { + "epoch": 2.1847070506454815, + "grad_norm": 0.037090156227350235, + "learning_rate": 9.058361942128496e-06, + "loss": 0.0785, + "step": 14850 + }, + { + "epoch": 2.188385008643201, + "grad_norm": 0.007919345051050186, + "learning_rate": 9.0174922347556e-06, + "loss": 0.0006, + "step": 14875 + }, + { + "epoch": 2.192062966640921, + "grad_norm": 0.021819893270730972, + "learning_rate": 8.976622527382705e-06, + "loss": 0.0376, + "step": 14900 + }, + { + "epoch": 2.1957409246386406, + "grad_norm": 0.024493372067809105, + 
"learning_rate": 8.93575282000981e-06, + "loss": 0.0439, + "step": 14925 + }, + { + "epoch": 2.19941888263636, + "grad_norm": 0.038370776921510696, + "learning_rate": 8.894883112636915e-06, + "loss": 0.0802, + "step": 14950 + }, + { + "epoch": 2.20309684063408, + "grad_norm": 0.019332151859998703, + "learning_rate": 8.854013405264019e-06, + "loss": 0.0012, + "step": 14975 + }, + { + "epoch": 2.2067747986317996, + "grad_norm": 0.03362823650240898, + "learning_rate": 8.813143697891123e-06, + "loss": 0.0369, + "step": 15000 + }, + { + "epoch": 2.210452756629519, + "grad_norm": 0.024772603064775467, + "learning_rate": 8.772273990518228e-06, + "loss": 0.0008, + "step": 15025 + }, + { + "epoch": 2.214130714627239, + "grad_norm": 0.02276591770350933, + "learning_rate": 8.731404283145332e-06, + "loss": 0.1007, + "step": 15050 + }, + { + "epoch": 2.2178086726249586, + "grad_norm": 0.016099456697702408, + "learning_rate": 8.690534575772437e-06, + "loss": 0.0009, + "step": 15075 + }, + { + "epoch": 2.221486630622678, + "grad_norm": 0.003277967683970928, + "learning_rate": 8.649664868399542e-06, + "loss": 0.0069, + "step": 15100 + }, + { + "epoch": 2.225164588620398, + "grad_norm": 0.011233772151172161, + "learning_rate": 8.608795161026648e-06, + "loss": 0.0386, + "step": 15125 + }, + { + "epoch": 2.2288425466181176, + "grad_norm": 0.007455474231392145, + "learning_rate": 8.567925453653752e-06, + "loss": 0.0003, + "step": 15150 + }, + { + "epoch": 2.232520504615837, + "grad_norm": 0.011497107334434986, + "learning_rate": 8.527055746280857e-06, + "loss": 0.0004, + "step": 15175 + }, + { + "epoch": 2.236198462613557, + "grad_norm": 0.003145186696201563, + "learning_rate": 8.486186038907962e-06, + "loss": 0.0003, + "step": 15200 + }, + { + "epoch": 2.2398764206112767, + "grad_norm": 0.00954380352050066, + "learning_rate": 8.445316331535066e-06, + "loss": 0.0595, + "step": 15225 + }, + { + "epoch": 2.243554378608996, + "grad_norm": 0.007323611527681351, + "learning_rate": 8.404446624162171e-06, + "loss": 0.0004, + "step": 15250 + }, + { + "epoch": 2.247232336606716, + "grad_norm": 0.011944909580051899, + "learning_rate": 8.363576916789277e-06, + "loss": 0.0003, + "step": 15275 + }, + { + "epoch": 2.2509102946044357, + "grad_norm": 0.01304931566119194, + "learning_rate": 8.322707209416382e-06, + "loss": 0.0389, + "step": 15300 + }, + { + "epoch": 2.2545882526021552, + "grad_norm": 0.008787041530013084, + "learning_rate": 8.281837502043486e-06, + "loss": 0.0004, + "step": 15325 + }, + { + "epoch": 2.2582662105998748, + "grad_norm": 0.011969480663537979, + "learning_rate": 8.24096779467059e-06, + "loss": 0.0004, + "step": 15350 + }, + { + "epoch": 2.2619441685975947, + "grad_norm": 0.011229045689105988, + "learning_rate": 8.200098087297695e-06, + "loss": 0.0003, + "step": 15375 + }, + { + "epoch": 2.2656221265953143, + "grad_norm": 0.00922977551817894, + "learning_rate": 8.1592283799248e-06, + "loss": 0.0004, + "step": 15400 + }, + { + "epoch": 2.269300084593034, + "grad_norm": 0.008094431832432747, + "learning_rate": 8.118358672551904e-06, + "loss": 0.0003, + "step": 15425 + }, + { + "epoch": 2.2729780425907538, + "grad_norm": 0.0032492594327777624, + "learning_rate": 8.07748896517901e-06, + "loss": 0.0002, + "step": 15450 + }, + { + "epoch": 2.2766560005884733, + "grad_norm": 0.004196746740490198, + "learning_rate": 8.036619257806115e-06, + "loss": 0.0002, + "step": 15475 + }, + { + "epoch": 2.280333958586193, + "grad_norm": 0.005214506760239601, + "learning_rate": 7.995749550433218e-06, + "loss": 0.0002, 
+ "step": 15500 + }, + { + "epoch": 2.284011916583913, + "grad_norm": 0.0034893976990133524, + "learning_rate": 7.954879843060324e-06, + "loss": 0.0002, + "step": 15525 + }, + { + "epoch": 2.2876898745816323, + "grad_norm": 0.0036745897959917784, + "learning_rate": 7.914010135687429e-06, + "loss": 0.0002, + "step": 15550 + }, + { + "epoch": 2.291367832579352, + "grad_norm": 0.0020664865151047707, + "learning_rate": 7.873140428314534e-06, + "loss": 0.0001, + "step": 15575 + }, + { + "epoch": 2.2950457905770714, + "grad_norm": 0.005072563886642456, + "learning_rate": 7.832270720941638e-06, + "loss": 0.0417, + "step": 15600 + }, + { + "epoch": 2.2987237485747913, + "grad_norm": 0.004465815611183643, + "learning_rate": 7.791401013568743e-06, + "loss": 0.0002, + "step": 15625 + }, + { + "epoch": 2.302401706572511, + "grad_norm": 0.005166616756469011, + "learning_rate": 7.750531306195849e-06, + "loss": 0.016, + "step": 15650 + }, + { + "epoch": 2.3060796645702304, + "grad_norm": 0.0010274857049807906, + "learning_rate": 7.709661598822953e-06, + "loss": 0.0002, + "step": 15675 + }, + { + "epoch": 2.3097576225679504, + "grad_norm": 0.006900500506162643, + "learning_rate": 7.668791891450058e-06, + "loss": 0.0002, + "step": 15700 + }, + { + "epoch": 2.31343558056567, + "grad_norm": 0.004663816653192043, + "learning_rate": 7.6279221840771624e-06, + "loss": 0.0001, + "step": 15725 + }, + { + "epoch": 2.3171135385633894, + "grad_norm": 0.006946474779397249, + "learning_rate": 7.587052476704268e-06, + "loss": 0.0001, + "step": 15750 + }, + { + "epoch": 2.3207914965611094, + "grad_norm": 0.003868917003273964, + "learning_rate": 7.5461827693313715e-06, + "loss": 0.0342, + "step": 15775 + }, + { + "epoch": 2.324469454558829, + "grad_norm": 0.0028817090205848217, + "learning_rate": 7.505313061958477e-06, + "loss": 0.0138, + "step": 15800 + }, + { + "epoch": 2.3281474125565484, + "grad_norm": 0.0059151784516870975, + "learning_rate": 7.464443354585581e-06, + "loss": 0.0733, + "step": 15825 + }, + { + "epoch": 2.3318253705542684, + "grad_norm": 0.004359770100563765, + "learning_rate": 7.423573647212686e-06, + "loss": 0.0421, + "step": 15850 + }, + { + "epoch": 2.335503328551988, + "grad_norm": 0.011809108778834343, + "learning_rate": 7.3827039398397904e-06, + "loss": 0.0003, + "step": 15875 + }, + { + "epoch": 2.3391812865497075, + "grad_norm": 0.005823772866278887, + "learning_rate": 7.341834232466896e-06, + "loss": 0.0003, + "step": 15900 + }, + { + "epoch": 2.3428592445474274, + "grad_norm": 0.003460386535152793, + "learning_rate": 7.300964525094e-06, + "loss": 0.0002, + "step": 15925 + }, + { + "epoch": 2.346537202545147, + "grad_norm": 0.008056416176259518, + "learning_rate": 7.260094817721106e-06, + "loss": 0.0381, + "step": 15950 + }, + { + "epoch": 2.3502151605428665, + "grad_norm": 0.007788171526044607, + "learning_rate": 7.21922511034821e-06, + "loss": 0.0002, + "step": 15975 + }, + { + "epoch": 2.3538931185405865, + "grad_norm": 0.0066045369021594524, + "learning_rate": 7.178355402975315e-06, + "loss": 0.0002, + "step": 16000 + }, + { + "epoch": 2.357571076538306, + "grad_norm": 0.004805906675755978, + "learning_rate": 7.137485695602419e-06, + "loss": 0.0053, + "step": 16025 + }, + { + "epoch": 2.3612490345360255, + "grad_norm": 0.010813217610120773, + "learning_rate": 7.096615988229525e-06, + "loss": 0.0381, + "step": 16050 + }, + { + "epoch": 2.3649269925337455, + "grad_norm": 0.009302555583417416, + "learning_rate": 7.055746280856629e-06, + "loss": 0.0393, + "step": 16075 + }, + { + "epoch": 
2.368604950531465, + "grad_norm": 0.011496507562696934, + "learning_rate": 7.014876573483734e-06, + "loss": 0.0386, + "step": 16100 + }, + { + "epoch": 2.3722829085291846, + "grad_norm": 0.025231193751096725, + "learning_rate": 6.974006866110839e-06, + "loss": 0.0367, + "step": 16125 + }, + { + "epoch": 2.375960866526904, + "grad_norm": 0.020235830917954445, + "learning_rate": 6.933137158737944e-06, + "loss": 0.0006, + "step": 16150 + }, + { + "epoch": 2.379638824524624, + "grad_norm": 0.006687480956315994, + "learning_rate": 6.892267451365048e-06, + "loss": 0.0004, + "step": 16175 + }, + { + "epoch": 2.3833167825223436, + "grad_norm": 0.003918817732483149, + "learning_rate": 6.851397743992153e-06, + "loss": 0.0003, + "step": 16200 + }, + { + "epoch": 2.386994740520063, + "grad_norm": 0.011175381019711494, + "learning_rate": 6.810528036619258e-06, + "loss": 0.0003, + "step": 16225 + }, + { + "epoch": 2.390672698517783, + "grad_norm": 0.007755937986075878, + "learning_rate": 6.769658329246363e-06, + "loss": 0.0002, + "step": 16250 + }, + { + "epoch": 2.3943506565155026, + "grad_norm": 0.004887331277132034, + "learning_rate": 6.728788621873468e-06, + "loss": 0.0002, + "step": 16275 + }, + { + "epoch": 2.398028614513222, + "grad_norm": 0.0048552751541137695, + "learning_rate": 6.6879189145005725e-06, + "loss": 0.0002, + "step": 16300 + }, + { + "epoch": 2.401706572510942, + "grad_norm": 0.011255592107772827, + "learning_rate": 6.647049207127677e-06, + "loss": 0.0002, + "step": 16325 + }, + { + "epoch": 2.4053845305086616, + "grad_norm": 0.009114415384829044, + "learning_rate": 6.6061794997547816e-06, + "loss": 0.0002, + "step": 16350 + }, + { + "epoch": 2.409062488506381, + "grad_norm": 0.009386932477355003, + "learning_rate": 6.565309792381886e-06, + "loss": 0.0395, + "step": 16375 + }, + { + "epoch": 2.4127404465041007, + "grad_norm": 0.005927698221057653, + "learning_rate": 6.5244400850089915e-06, + "loss": 0.0002, + "step": 16400 + }, + { + "epoch": 2.4164184045018207, + "grad_norm": 0.0084453159943223, + "learning_rate": 6.483570377636096e-06, + "loss": 0.0506, + "step": 16425 + }, + { + "epoch": 2.42009636249954, + "grad_norm": 0.008083072490990162, + "learning_rate": 6.442700670263201e-06, + "loss": 0.0003, + "step": 16450 + }, + { + "epoch": 2.4237743204972597, + "grad_norm": 0.00735598336905241, + "learning_rate": 6.401830962890306e-06, + "loss": 0.0003, + "step": 16475 + }, + { + "epoch": 2.4274522784949797, + "grad_norm": 0.007824303582310677, + "learning_rate": 6.360961255517411e-06, + "loss": 0.0398, + "step": 16500 + }, + { + "epoch": 2.431130236492699, + "grad_norm": 0.009155460633337498, + "learning_rate": 6.320091548144516e-06, + "loss": 0.0003, + "step": 16525 + }, + { + "epoch": 2.4348081944904187, + "grad_norm": 0.005739257670938969, + "learning_rate": 6.27922184077162e-06, + "loss": 0.0003, + "step": 16550 + }, + { + "epoch": 2.4384861524881387, + "grad_norm": 0.006940542254596949, + "learning_rate": 6.238352133398725e-06, + "loss": 0.0003, + "step": 16575 + }, + { + "epoch": 2.4421641104858582, + "grad_norm": 0.0053449515253305435, + "learning_rate": 6.197482426025829e-06, + "loss": 0.0002, + "step": 16600 + }, + { + "epoch": 2.4458420684835778, + "grad_norm": 0.005325790494680405, + "learning_rate": 6.156612718652935e-06, + "loss": 0.0002, + "step": 16625 + }, + { + "epoch": 2.4495200264812977, + "grad_norm": 0.006259521469473839, + "learning_rate": 6.115743011280039e-06, + "loss": 0.0002, + "step": 16650 + }, + { + "epoch": 2.4531979844790173, + "grad_norm": 
0.006854058708995581, + "learning_rate": 6.074873303907145e-06, + "loss": 0.0002, + "step": 16675 + }, + { + "epoch": 2.456875942476737, + "grad_norm": 0.004361658822745085, + "learning_rate": 6.034003596534249e-06, + "loss": 0.0002, + "step": 16700 + }, + { + "epoch": 2.4605539004744568, + "grad_norm": 0.0055083055049180984, + "learning_rate": 5.993133889161354e-06, + "loss": 0.0002, + "step": 16725 + }, + { + "epoch": 2.4642318584721763, + "grad_norm": 0.0033617918379604816, + "learning_rate": 5.952264181788458e-06, + "loss": 0.0002, + "step": 16750 + }, + { + "epoch": 2.467909816469896, + "grad_norm": 0.0048737069591879845, + "learning_rate": 5.911394474415564e-06, + "loss": 0.0001, + "step": 16775 + }, + { + "epoch": 2.471587774467616, + "grad_norm": 0.0036280914209783077, + "learning_rate": 5.870524767042668e-06, + "loss": 0.0001, + "step": 16800 + }, + { + "epoch": 2.4752657324653353, + "grad_norm": 0.003542742459103465, + "learning_rate": 5.829655059669773e-06, + "loss": 0.0001, + "step": 16825 + }, + { + "epoch": 2.478943690463055, + "grad_norm": 0.004226271994411945, + "learning_rate": 5.788785352296878e-06, + "loss": 0.0001, + "step": 16850 + }, + { + "epoch": 2.482621648460775, + "grad_norm": 0.0033333373721688986, + "learning_rate": 5.7479156449239826e-06, + "loss": 0.0001, + "step": 16875 + }, + { + "epoch": 2.4862996064584943, + "grad_norm": 0.003888545325025916, + "learning_rate": 5.707045937551087e-06, + "loss": 0.0001, + "step": 16900 + }, + { + "epoch": 2.489977564456214, + "grad_norm": 0.0031992702279239893, + "learning_rate": 5.666176230178192e-06, + "loss": 0.0001, + "step": 16925 + }, + { + "epoch": 2.4936555224539334, + "grad_norm": 0.0026705926284193993, + "learning_rate": 5.625306522805297e-06, + "loss": 0.0001, + "step": 16950 + }, + { + "epoch": 2.4973334804516534, + "grad_norm": 0.001754347002133727, + "learning_rate": 5.5844368154324015e-06, + "loss": 0.0001, + "step": 16975 + }, + { + "epoch": 2.501011438449373, + "grad_norm": 0.0018643263028934598, + "learning_rate": 5.543567108059507e-06, + "loss": 0.0006, + "step": 17000 + }, + { + "epoch": 2.5046893964470924, + "grad_norm": 0.002491478342562914, + "learning_rate": 5.502697400686611e-06, + "loss": 0.0001, + "step": 17025 + }, + { + "epoch": 2.5083673544448124, + "grad_norm": 0.002735487651079893, + "learning_rate": 5.461827693313716e-06, + "loss": 0.0001, + "step": 17050 + }, + { + "epoch": 2.512045312442532, + "grad_norm": 0.002121156081557274, + "learning_rate": 5.420957985940821e-06, + "loss": 0.0013, + "step": 17075 + }, + { + "epoch": 2.5157232704402515, + "grad_norm": 0.001368986559100449, + "learning_rate": 5.380088278567925e-06, + "loss": 0.0001, + "step": 17100 + }, + { + "epoch": 2.519401228437971, + "grad_norm": 0.0018654069863259792, + "learning_rate": 5.33921857119503e-06, + "loss": 0.0001, + "step": 17125 + }, + { + "epoch": 2.523079186435691, + "grad_norm": 0.0008688032394275069, + "learning_rate": 5.298348863822135e-06, + "loss": 0.0001, + "step": 17150 + }, + { + "epoch": 2.5267571444334105, + "grad_norm": 0.0014730022521689534, + "learning_rate": 5.25747915644924e-06, + "loss": 0.0001, + "step": 17175 + }, + { + "epoch": 2.53043510243113, + "grad_norm": 589.290283203125, + "learning_rate": 5.216609449076345e-06, + "loss": 0.0295, + "step": 17200 + }, + { + "epoch": 2.53411306042885, + "grad_norm": 0.0014689558884128928, + "learning_rate": 5.17573974170345e-06, + "loss": 0.0, + "step": 17225 + }, + { + "epoch": 2.5377910184265695, + "grad_norm": 0.001330269267782569, + "learning_rate": 
5.134870034330555e-06, + "loss": 0.0, + "step": 17250 + }, + { + "epoch": 2.541468976424289, + "grad_norm": 0.001491030678153038, + "learning_rate": 5.094000326957658e-06, + "loss": 0.0, + "step": 17275 + }, + { + "epoch": 2.545146934422009, + "grad_norm": 0.002089619869366288, + "learning_rate": 5.053130619584764e-06, + "loss": 0.0778, + "step": 17300 + }, + { + "epoch": 2.5488248924197285, + "grad_norm": 0.0015247270930558443, + "learning_rate": 5.012260912211868e-06, + "loss": 0.0188, + "step": 17325 + }, + { + "epoch": 2.552502850417448, + "grad_norm": 0.002242110203951597, + "learning_rate": 4.971391204838974e-06, + "loss": 0.0179, + "step": 17350 + }, + { + "epoch": 2.556180808415168, + "grad_norm": 0.0018629367696121335, + "learning_rate": 4.930521497466078e-06, + "loss": 0.0181, + "step": 17375 + }, + { + "epoch": 2.5598587664128876, + "grad_norm": 0.0014634733088314533, + "learning_rate": 4.8896517900931836e-06, + "loss": 0.0328, + "step": 17400 + }, + { + "epoch": 2.563536724410607, + "grad_norm": 0.001321232644841075, + "learning_rate": 4.848782082720288e-06, + "loss": 0.0, + "step": 17425 + }, + { + "epoch": 2.567214682408327, + "grad_norm": 0.0012456915574148297, + "learning_rate": 4.807912375347393e-06, + "loss": 0.0003, + "step": 17450 + }, + { + "epoch": 2.5708926404060466, + "grad_norm": 0.0009979073656722903, + "learning_rate": 4.767042667974497e-06, + "loss": 0.0001, + "step": 17475 + }, + { + "epoch": 2.574570598403766, + "grad_norm": 0.001377744134515524, + "learning_rate": 4.726172960601602e-06, + "loss": 0.0, + "step": 17500 + }, + { + "epoch": 2.578248556401486, + "grad_norm": 0.0022715404629707336, + "learning_rate": 4.685303253228707e-06, + "loss": 0.0498, + "step": 17525 + }, + { + "epoch": 2.5819265143992056, + "grad_norm": 0.002307375194504857, + "learning_rate": 4.644433545855812e-06, + "loss": 0.0001, + "step": 17550 + }, + { + "epoch": 2.585604472396925, + "grad_norm": 0.002744297729805112, + "learning_rate": 4.603563838482917e-06, + "loss": 0.0444, + "step": 17575 + }, + { + "epoch": 2.589282430394645, + "grad_norm": 0.004225959535688162, + "learning_rate": 4.5626941311100215e-06, + "loss": 0.0148, + "step": 17600 + }, + { + "epoch": 2.5929603883923646, + "grad_norm": 0.0028173536993563175, + "learning_rate": 4.521824423737127e-06, + "loss": 0.0033, + "step": 17625 + }, + { + "epoch": 2.596638346390084, + "grad_norm": 0.00215067807585001, + "learning_rate": 4.4809547163642305e-06, + "loss": 0.0001, + "step": 17650 + }, + { + "epoch": 2.600316304387804, + "grad_norm": 0.004402931313961744, + "learning_rate": 4.440085008991336e-06, + "loss": 0.0001, + "step": 17675 + }, + { + "epoch": 2.6039942623855237, + "grad_norm": 0.0019863785710185766, + "learning_rate": 4.3992153016184404e-06, + "loss": 0.0001, + "step": 17700 + }, + { + "epoch": 2.607672220383243, + "grad_norm": 0.0032948977313935757, + "learning_rate": 4.358345594245545e-06, + "loss": 0.0001, + "step": 17725 + }, + { + "epoch": 2.6113501783809627, + "grad_norm": 0.0017591605428606272, + "learning_rate": 4.31747588687265e-06, + "loss": 0.0001, + "step": 17750 + }, + { + "epoch": 2.6150281363786827, + "grad_norm": 0.5669000148773193, + "learning_rate": 4.276606179499755e-06, + "loss": 0.0002, + "step": 17775 + }, + { + "epoch": 2.6187060943764022, + "grad_norm": 0.0018617259338498116, + "learning_rate": 4.23573647212686e-06, + "loss": 0.044, + "step": 17800 + }, + { + "epoch": 2.6223840523741218, + "grad_norm": 0.004173843190073967, + "learning_rate": 4.194866764753964e-06, + "loss": 0.0001, + "step": 
17825 + }, + { + "epoch": 2.6260620103718413, + "grad_norm": 0.005529914982616901, + "learning_rate": 4.153997057381069e-06, + "loss": 0.0001, + "step": 17850 + }, + { + "epoch": 2.6297399683695613, + "grad_norm": 0.003100366098806262, + "learning_rate": 4.113127350008174e-06, + "loss": 0.0001, + "step": 17875 + }, + { + "epoch": 2.633417926367281, + "grad_norm": 0.0017961232224479318, + "learning_rate": 4.072257642635279e-06, + "loss": 0.012, + "step": 17900 + }, + { + "epoch": 2.6370958843650003, + "grad_norm": 0.0022237550001591444, + "learning_rate": 4.031387935262384e-06, + "loss": 0.0001, + "step": 17925 + }, + { + "epoch": 2.6407738423627203, + "grad_norm": 0.002973005408421159, + "learning_rate": 3.990518227889488e-06, + "loss": 0.0438, + "step": 17950 + }, + { + "epoch": 2.64445180036044, + "grad_norm": 0.003434759797528386, + "learning_rate": 3.949648520516594e-06, + "loss": 0.0003, + "step": 17975 + }, + { + "epoch": 2.6481297583581593, + "grad_norm": 0.003463399363681674, + "learning_rate": 3.908778813143697e-06, + "loss": 0.0001, + "step": 18000 + }, + { + "epoch": 2.6518077163558793, + "grad_norm": 0.003393635619431734, + "learning_rate": 3.867909105770803e-06, + "loss": 0.0002, + "step": 18025 + }, + { + "epoch": 2.655485674353599, + "grad_norm": 0.0027733049355447292, + "learning_rate": 3.827039398397907e-06, + "loss": 0.0001, + "step": 18050 + }, + { + "epoch": 2.6591636323513184, + "grad_norm": 0.0038054571487009525, + "learning_rate": 3.7861696910250126e-06, + "loss": 0.0001, + "step": 18075 + }, + { + "epoch": 2.6628415903490383, + "grad_norm": 0.0029823731165379286, + "learning_rate": 3.745299983652117e-06, + "loss": 0.0001, + "step": 18100 + }, + { + "epoch": 2.666519548346758, + "grad_norm": 0.0019862265326082706, + "learning_rate": 3.704430276279222e-06, + "loss": 0.0001, + "step": 18125 + }, + { + "epoch": 2.6701975063444774, + "grad_norm": 0.003500757971778512, + "learning_rate": 3.6635605689063266e-06, + "loss": 0.0001, + "step": 18150 + }, + { + "epoch": 2.6738754643421974, + "grad_norm": 0.002085187705233693, + "learning_rate": 3.6226908615334315e-06, + "loss": 0.0001, + "step": 18175 + }, + { + "epoch": 2.677553422339917, + "grad_norm": 0.0023257972206920385, + "learning_rate": 3.5818211541605365e-06, + "loss": 0.0001, + "step": 18200 + }, + { + "epoch": 2.6812313803376364, + "grad_norm": 0.0022203666158020496, + "learning_rate": 3.5409514467876414e-06, + "loss": 0.0001, + "step": 18225 + }, + { + "epoch": 2.6849093383353564, + "grad_norm": 0.0012388962786644697, + "learning_rate": 3.500081739414746e-06, + "loss": 0.0, + "step": 18250 + }, + { + "epoch": 2.688587296333076, + "grad_norm": 0.0008910479955375195, + "learning_rate": 3.4592120320418505e-06, + "loss": 0.0003, + "step": 18275 + }, + { + "epoch": 2.6922652543307954, + "grad_norm": 0.0010503758676350117, + "learning_rate": 3.4183423246689554e-06, + "loss": 0.0, + "step": 18300 + }, + { + "epoch": 2.6959432123285154, + "grad_norm": 0.000730241066776216, + "learning_rate": 3.37747261729606e-06, + "loss": 0.0001, + "step": 18325 + }, + { + "epoch": 2.699621170326235, + "grad_norm": 0.000822307774797082, + "learning_rate": 3.336602909923165e-06, + "loss": 0.0, + "step": 18350 + }, + { + "epoch": 2.7032991283239545, + "grad_norm": 1.4722820520401, + "learning_rate": 3.29573320255027e-06, + "loss": 0.1083, + "step": 18375 + }, + { + "epoch": 2.7069770863216744, + "grad_norm": 0.004885438829660416, + "learning_rate": 3.254863495177375e-06, + "loss": 0.0002, + "step": 18400 + }, + { + "epoch": 
2.710655044319394, + "grad_norm": 0.0033965399488806725, + "learning_rate": 3.2139937878044794e-06, + "loss": 0.0001, + "step": 18425 + }, + { + "epoch": 2.7143330023171135, + "grad_norm": 0.004250906407833099, + "learning_rate": 3.1731240804315843e-06, + "loss": 0.0231, + "step": 18450 + }, + { + "epoch": 2.7180109603148335, + "grad_norm": 0.003409018972888589, + "learning_rate": 3.1322543730586893e-06, + "loss": 0.0002, + "step": 18475 + }, + { + "epoch": 2.721688918312553, + "grad_norm": 0.0036356241907924414, + "learning_rate": 3.0913846656857938e-06, + "loss": 0.0409, + "step": 18500 + }, + { + "epoch": 2.7253668763102725, + "grad_norm": 0.006237304303795099, + "learning_rate": 3.0505149583128983e-06, + "loss": 0.0386, + "step": 18525 + }, + { + "epoch": 2.729044834307992, + "grad_norm": 0.006783687509596348, + "learning_rate": 3.0096452509400033e-06, + "loss": 0.0002, + "step": 18550 + }, + { + "epoch": 2.732722792305712, + "grad_norm": 0.04287054389715195, + "learning_rate": 2.9687755435671082e-06, + "loss": 0.0321, + "step": 18575 + }, + { + "epoch": 2.7364007503034316, + "grad_norm": 0.0038001120556145906, + "learning_rate": 2.9279058361942127e-06, + "loss": 0.0003, + "step": 18600 + }, + { + "epoch": 2.740078708301151, + "grad_norm": 0.003841620171442628, + "learning_rate": 2.8870361288213177e-06, + "loss": 0.0001, + "step": 18625 + }, + { + "epoch": 2.7437566662988706, + "grad_norm": 0.002676568925380707, + "learning_rate": 2.8461664214484226e-06, + "loss": 0.0001, + "step": 18650 + }, + { + "epoch": 2.7474346242965906, + "grad_norm": 0.008307211101055145, + "learning_rate": 2.8052967140755276e-06, + "loss": 0.0001, + "step": 18675 + }, + { + "epoch": 2.75111258229431, + "grad_norm": 0.0034743708092719316, + "learning_rate": 2.764427006702632e-06, + "loss": 0.0001, + "step": 18700 + }, + { + "epoch": 2.7547905402920296, + "grad_norm": 0.0020617684349417686, + "learning_rate": 2.7235572993297367e-06, + "loss": 0.0001, + "step": 18725 + }, + { + "epoch": 2.7584684982897496, + "grad_norm": 0.0017286173533648252, + "learning_rate": 2.6826875919568416e-06, + "loss": 0.0001, + "step": 18750 + }, + { + "epoch": 2.762146456287469, + "grad_norm": 0.001774169155396521, + "learning_rate": 2.6418178845839466e-06, + "loss": 0.0001, + "step": 18775 + }, + { + "epoch": 2.7658244142851887, + "grad_norm": 0.003061393741518259, + "learning_rate": 2.600948177211051e-06, + "loss": 0.0298, + "step": 18800 + }, + { + "epoch": 2.7695023722829086, + "grad_norm": 0.00195386353880167, + "learning_rate": 2.560078469838156e-06, + "loss": 0.0001, + "step": 18825 + }, + { + "epoch": 2.773180330280628, + "grad_norm": 0.0015053004026412964, + "learning_rate": 2.519208762465261e-06, + "loss": 0.0001, + "step": 18850 + }, + { + "epoch": 2.7768582882783477, + "grad_norm": 0.002827111864462495, + "learning_rate": 2.4783390550923655e-06, + "loss": 0.0001, + "step": 18875 + }, + { + "epoch": 2.7805362462760677, + "grad_norm": 0.0010932940058410168, + "learning_rate": 2.4374693477194705e-06, + "loss": 0.0001, + "step": 18900 + }, + { + "epoch": 2.784214204273787, + "grad_norm": 7.858973026275635, + "learning_rate": 2.3965996403465754e-06, + "loss": 0.0468, + "step": 18925 + }, + { + "epoch": 2.7878921622715067, + "grad_norm": 0.002107949461787939, + "learning_rate": 2.35572993297368e-06, + "loss": 0.0001, + "step": 18950 + }, + { + "epoch": 2.7915701202692267, + "grad_norm": 0.001860212185420096, + "learning_rate": 2.3148602256007845e-06, + "loss": 0.0001, + "step": 18975 + }, + { + "epoch": 2.795248078266946, + 
"grad_norm": 0.002180658746510744, + "learning_rate": 2.2739905182278894e-06, + "loss": 0.0001, + "step": 19000 + }, + { + "epoch": 2.7989260362646657, + "grad_norm": 0.001684672199189663, + "learning_rate": 2.2331208108549944e-06, + "loss": 0.0001, + "step": 19025 + }, + { + "epoch": 2.8026039942623857, + "grad_norm": 0.0015821090200915933, + "learning_rate": 2.1922511034820993e-06, + "loss": 0.0001, + "step": 19050 + }, + { + "epoch": 2.8062819522601052, + "grad_norm": 0.0031413165852427483, + "learning_rate": 2.151381396109204e-06, + "loss": 0.0222, + "step": 19075 + }, + { + "epoch": 2.8099599102578248, + "grad_norm": 0.001654456602409482, + "learning_rate": 2.110511688736309e-06, + "loss": 0.0001, + "step": 19100 + }, + { + "epoch": 2.8136378682555447, + "grad_norm": 0.0025208396837115288, + "learning_rate": 2.0696419813634138e-06, + "loss": 0.0297, + "step": 19125 + }, + { + "epoch": 2.8173158262532643, + "grad_norm": 0.0016039038309827447, + "learning_rate": 2.0287722739905183e-06, + "loss": 0.0001, + "step": 19150 + }, + { + "epoch": 2.820993784250984, + "grad_norm": 0.0015692878514528275, + "learning_rate": 1.987902566617623e-06, + "loss": 0.0001, + "step": 19175 + }, + { + "epoch": 2.8246717422487038, + "grad_norm": 0.0014573705848306417, + "learning_rate": 1.9470328592447278e-06, + "loss": 0.0001, + "step": 19200 + }, + { + "epoch": 2.8283497002464233, + "grad_norm": 0.005317123141139746, + "learning_rate": 1.9061631518718325e-06, + "loss": 0.0001, + "step": 19225 + }, + { + "epoch": 2.832027658244143, + "grad_norm": 0.0014695243444293737, + "learning_rate": 1.8652934444989374e-06, + "loss": 0.0312, + "step": 19250 + }, + { + "epoch": 2.835705616241863, + "grad_norm": 0.04826376587152481, + "learning_rate": 1.8244237371260422e-06, + "loss": 0.0263, + "step": 19275 + }, + { + "epoch": 2.8393835742395823, + "grad_norm": 0.0012747733853757381, + "learning_rate": 1.7835540297531471e-06, + "loss": 0.0001, + "step": 19300 + }, + { + "epoch": 2.843061532237302, + "grad_norm": 0.0011536297388374805, + "learning_rate": 1.7426843223802519e-06, + "loss": 0.025, + "step": 19325 + }, + { + "epoch": 2.8467394902350214, + "grad_norm": 0.00559173384681344, + "learning_rate": 1.7018146150073564e-06, + "loss": 0.0001, + "step": 19350 + }, + { + "epoch": 2.8504174482327413, + "grad_norm": 0.0011801973450928926, + "learning_rate": 1.6609449076344614e-06, + "loss": 0.0001, + "step": 19375 + }, + { + "epoch": 2.854095406230461, + "grad_norm": 0.020327366888523102, + "learning_rate": 1.620075200261566e-06, + "loss": 0.0001, + "step": 19400 + }, + { + "epoch": 2.8577733642281804, + "grad_norm": 0.0012536696158349514, + "learning_rate": 1.579205492888671e-06, + "loss": 0.0001, + "step": 19425 + }, + { + "epoch": 2.8614513222259, + "grad_norm": 0.0010541353840380907, + "learning_rate": 1.5383357855157758e-06, + "loss": 0.0, + "step": 19450 + }, + { + "epoch": 2.86512928022362, + "grad_norm": 0.0011492278426885605, + "learning_rate": 1.4974660781428805e-06, + "loss": 0.0001, + "step": 19475 + }, + { + "epoch": 2.8688072382213394, + "grad_norm": 0.002121875062584877, + "learning_rate": 1.4565963707699853e-06, + "loss": 0.0339, + "step": 19500 + }, + { + "epoch": 2.872485196219059, + "grad_norm": 0.0013062648940831423, + "learning_rate": 1.4157266633970902e-06, + "loss": 0.0001, + "step": 19525 + }, + { + "epoch": 2.876163154216779, + "grad_norm": 0.0012365735601633787, + "learning_rate": 1.374856956024195e-06, + "loss": 0.0001, + "step": 19550 + }, + { + "epoch": 2.8798411122144985, + "grad_norm": 
0.001490547088906169, + "learning_rate": 1.3339872486512997e-06, + "loss": 0.0389, + "step": 19575 + }, + { + "epoch": 2.883519070212218, + "grad_norm": 0.0010857345769181848, + "learning_rate": 1.2931175412784044e-06, + "loss": 0.0002, + "step": 19600 + }, + { + "epoch": 2.887197028209938, + "grad_norm": 0.0016767021734267473, + "learning_rate": 1.2522478339055092e-06, + "loss": 0.0001, + "step": 19625 + }, + { + "epoch": 2.8908749862076575, + "grad_norm": 0.004218839108943939, + "learning_rate": 1.2113781265326141e-06, + "loss": 0.0001, + "step": 19650 + }, + { + "epoch": 2.894552944205377, + "grad_norm": 0.0010596220381557941, + "learning_rate": 1.1705084191597189e-06, + "loss": 0.0001, + "step": 19675 + }, + { + "epoch": 2.898230902203097, + "grad_norm": 0.005758639425039291, + "learning_rate": 1.1296387117868236e-06, + "loss": 0.0001, + "step": 19700 + }, + { + "epoch": 2.9019088602008165, + "grad_norm": 0.004077006597071886, + "learning_rate": 1.0887690044139283e-06, + "loss": 0.0001, + "step": 19725 + }, + { + "epoch": 2.905586818198536, + "grad_norm": 0.023057300597429276, + "learning_rate": 1.0478992970410333e-06, + "loss": 0.0001, + "step": 19750 + }, + { + "epoch": 2.909264776196256, + "grad_norm": 0.0010171595495194197, + "learning_rate": 1.007029589668138e-06, + "loss": 0.0002, + "step": 19775 + }, + { + "epoch": 2.9129427341939755, + "grad_norm": 0.0021811590995639563, + "learning_rate": 9.661598822952428e-07, + "loss": 0.0018, + "step": 19800 + }, + { + "epoch": 2.916620692191695, + "grad_norm": 0.0007530258735641837, + "learning_rate": 9.252901749223475e-07, + "loss": 0.0, + "step": 19825 + }, + { + "epoch": 2.920298650189415, + "grad_norm": 0.0008248965605162084, + "learning_rate": 8.844204675494524e-07, + "loss": 0.0, + "step": 19850 + }, + { + "epoch": 2.9239766081871346, + "grad_norm": 0.0008437008364126086, + "learning_rate": 8.435507601765572e-07, + "loss": 0.0001, + "step": 19875 + }, + { + "epoch": 2.927654566184854, + "grad_norm": 0.0011598097626119852, + "learning_rate": 8.026810528036619e-07, + "loss": 0.029, + "step": 19900 + }, + { + "epoch": 2.931332524182574, + "grad_norm": 0.000989201944321394, + "learning_rate": 7.618113454307668e-07, + "loss": 0.0001, + "step": 19925 + }, + { + "epoch": 2.9350104821802936, + "grad_norm": 0.0009332878980785608, + "learning_rate": 7.209416380578715e-07, + "loss": 0.0001, + "step": 19950 + }, + { + "epoch": 2.938688440178013, + "grad_norm": 0.0010302929440513253, + "learning_rate": 6.800719306849764e-07, + "loss": 0.0316, + "step": 19975 + }, + { + "epoch": 2.942366398175733, + "grad_norm": 0.0011053696507588029, + "learning_rate": 6.392022233120811e-07, + "loss": 0.0001, + "step": 20000 + }, + { + "epoch": 2.9460443561734526, + "grad_norm": 0.001087658922187984, + "learning_rate": 5.983325159391858e-07, + "loss": 0.0, + "step": 20025 + }, + { + "epoch": 2.949722314171172, + "grad_norm": 0.0008900929242372513, + "learning_rate": 5.574628085662906e-07, + "loss": 0.0001, + "step": 20050 + }, + { + "epoch": 2.9534002721688917, + "grad_norm": 0.001053415471687913, + "learning_rate": 5.165931011933954e-07, + "loss": 0.0, + "step": 20075 + }, + { + "epoch": 2.9570782301666116, + "grad_norm": 0.0008429349982179701, + "learning_rate": 4.757233938205003e-07, + "loss": 0.0, + "step": 20100 + }, + { + "epoch": 2.960756188164331, + "grad_norm": 0.0009649925632402301, + "learning_rate": 4.34853686447605e-07, + "loss": 0.0, + "step": 20125 + }, + { + "epoch": 2.9644341461620507, + "grad_norm": 0.0009367198217660189, + "learning_rate": 
3.939839790747098e-07, + "loss": 0.0002, + "step": 20150 + }, + { + "epoch": 2.9681121041597702, + "grad_norm": 0.0008432368049398065, + "learning_rate": 3.5311427170181465e-07, + "loss": 0.0, + "step": 20175 + }, + { + "epoch": 2.97179006215749, + "grad_norm": 2.9367611408233643, + "learning_rate": 3.1224456432891944e-07, + "loss": 0.0002, + "step": 20200 + }, + { + "epoch": 2.9754680201552097, + "grad_norm": 0.0008842748356983066, + "learning_rate": 2.7137485695602424e-07, + "loss": 0.0001, + "step": 20225 + }, + { + "epoch": 2.9791459781529293, + "grad_norm": 0.3803035616874695, + "learning_rate": 2.30505149583129e-07, + "loss": 0.0303, + "step": 20250 + }, + { + "epoch": 2.9828239361506492, + "grad_norm": 0.001255788840353489, + "learning_rate": 1.8963544221023377e-07, + "loss": 0.0001, + "step": 20275 + }, + { + "epoch": 2.9865018941483688, + "grad_norm": 0.0012517735594883561, + "learning_rate": 1.4876573483733856e-07, + "loss": 0.0001, + "step": 20300 + }, + { + "epoch": 2.9901798521460883, + "grad_norm": 0.0008377633057534695, + "learning_rate": 1.0789602746444335e-07, + "loss": 0.0001, + "step": 20325 + }, + { + "epoch": 2.9938578101438083, + "grad_norm": 0.0008699085447005928, + "learning_rate": 6.702632009154815e-08, + "loss": 0.0001, + "step": 20350 + }, + { + "epoch": 2.997535768141528, + "grad_norm": 0.000927777262404561, + "learning_rate": 2.6156612718652934e-08, + "loss": 0.023, + "step": 20375 + }, + { + "epoch": 2.9998896612600685, + "eval_accuracy": 0.9969841853622655, + "eval_auc": 0.9999289486306174, + "eval_f1": 0.9969837416317222, + "eval_loss": 0.01777876727283001, + "eval_precision": 0.9972038263428992, + "eval_recall": 0.9967637540453075, + "eval_runtime": 2385.463, + "eval_samples_per_second": 5.699, + "eval_steps_per_second": 1.425, + "step": 20391 + } + ], + "logging_steps": 25, + "max_steps": 20391, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.01 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.073019505969152e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}
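
The final eval record above and the stateful_callbacks block summarize the best checkpoint's metrics and the early-stopping settings used for this run. As a minimal sketch (not part of the checkpoint itself), the state file can be inspected with the standard json module, and the same early-stopping settings can be reproduced with transformers' EarlyStoppingCallback; the checkpoint path below simply mirrors the directory named in this diff.

import json

# Load the trainer state saved alongside this checkpoint (path mirrors the diff header).
with open("checkpoint-20391/trainer_state.json") as f:
    state = json.load(f)

# Best eval loss and the final evaluation record (the last entry in log_history).
print("best_metric:", state["best_metric"])
final_eval = state["log_history"][-1]
print("eval_accuracy:", final_eval["eval_accuracy"], "eval_f1:", final_eval["eval_f1"])

# The stateful_callbacks block corresponds to this callback configuration:
from transformers import EarlyStoppingCallback

early_stopping = EarlyStoppingCallback(
    early_stopping_patience=5,      # matches "early_stopping_patience": 5
    early_stopping_threshold=0.01,  # matches "early_stopping_threshold": 0.01
)
# In a training script this would be passed as Trainer(..., callbacks=[early_stopping]).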