{ "best_metric": 0.33997730174492835, "best_model_checkpoint": "Mrohit01/1_lakh_cards-swinv2-base-patch4-window12to16-192to256-22kto1k-ft-finetuned/checkpoint-54461", "epoch": 9.99958686221855, "eval_steps": 500, "global_step": 60510, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016525511258004544, "grad_norm": 34.320098876953125, "learning_rate": 8.263097008758884e-08, "loss": 2.4664, "step": 10 }, { "epoch": 0.003305102251600909, "grad_norm": 319.06292724609375, "learning_rate": 1.6526194017517767e-07, "loss": 2.4352, "step": 20 }, { "epoch": 0.004957653377401363, "grad_norm": 57.93561553955078, "learning_rate": 2.478929102627665e-07, "loss": 2.3859, "step": 30 }, { "epoch": 0.006610204503201818, "grad_norm": 37.62372589111328, "learning_rate": 3.3052388035035534e-07, "loss": 2.3897, "step": 40 }, { "epoch": 0.008262755629002272, "grad_norm": 44.00041580200195, "learning_rate": 4.131548504379442e-07, "loss": 2.4616, "step": 50 }, { "epoch": 0.009915306754802726, "grad_norm": 25.988052368164062, "learning_rate": 4.95785820525533e-07, "loss": 2.427, "step": 60 }, { "epoch": 0.011567857880603181, "grad_norm": 105.19337463378906, "learning_rate": 5.784167906131218e-07, "loss": 2.4586, "step": 70 }, { "epoch": 0.013220409006403635, "grad_norm": 30.26386833190918, "learning_rate": 6.610477607007107e-07, "loss": 2.3847, "step": 80 }, { "epoch": 0.01487296013220409, "grad_norm": 35.36358642578125, "learning_rate": 7.436787307882995e-07, "loss": 2.3328, "step": 90 }, { "epoch": 0.016525511258004544, "grad_norm": 262.9168395996094, "learning_rate": 8.263097008758884e-07, "loss": 2.3209, "step": 100 }, { "epoch": 0.018178062383805, "grad_norm": 31.34343910217285, "learning_rate": 9.089406709634771e-07, "loss": 2.3854, "step": 110 }, { "epoch": 0.019830613509605452, "grad_norm": 21.935100555419922, "learning_rate": 9.91571641051066e-07, "loss": 2.3157, "step": 120 }, { "epoch": 0.021483164635405907, "grad_norm": 41.69834899902344, "learning_rate": 1.0742026111386547e-06, "loss": 2.3495, "step": 130 }, { "epoch": 0.023135715761206363, "grad_norm": 25.686676025390625, "learning_rate": 1.1568335812262436e-06, "loss": 2.2876, "step": 140 }, { "epoch": 0.02478826688700682, "grad_norm": 20.90071678161621, "learning_rate": 1.2394645513138325e-06, "loss": 2.333, "step": 150 }, { "epoch": 0.02644081801280727, "grad_norm": 30.158994674682617, "learning_rate": 1.3220955214014214e-06, "loss": 2.3064, "step": 160 }, { "epoch": 0.028093369138607726, "grad_norm": 29.495481491088867, "learning_rate": 1.4047264914890103e-06, "loss": 2.2628, "step": 170 }, { "epoch": 0.02974592026440818, "grad_norm": 20.178300857543945, "learning_rate": 1.487357461576599e-06, "loss": 2.2932, "step": 180 }, { "epoch": 0.031398471390208633, "grad_norm": 36.64740753173828, "learning_rate": 1.5699884316641878e-06, "loss": 2.3664, "step": 190 }, { "epoch": 0.03305102251600909, "grad_norm": 19.702604293823242, "learning_rate": 1.6526194017517767e-06, "loss": 2.3554, "step": 200 }, { "epoch": 0.034703573641809544, "grad_norm": 18.79967498779297, "learning_rate": 1.7352503718393656e-06, "loss": 2.32, "step": 210 }, { "epoch": 0.03635612476761, "grad_norm": 24.093412399291992, "learning_rate": 1.8178813419269543e-06, "loss": 2.2258, "step": 220 }, { "epoch": 0.038008675893410455, "grad_norm": 20.522418975830078, "learning_rate": 1.9005123120145432e-06, "loss": 2.2418, "step": 230 }, { "epoch": 0.039661227019210904, "grad_norm": 21.670164108276367, "learning_rate": 1.983143282102132e-06, "loss": 2.2425, "step": 240 }, { "epoch": 0.04131377814501136, "grad_norm": 24.221439361572266, "learning_rate": 2.0657742521897205e-06, "loss": 2.2311, "step": 250 }, { "epoch": 0.042966329270811815, "grad_norm": 26.810293197631836, "learning_rate": 2.1484052222773094e-06, "loss": 2.3007, "step": 260 }, { "epoch": 0.04461888039661227, "grad_norm": 33.146942138671875, "learning_rate": 2.2310361923648987e-06, "loss": 2.1901, "step": 270 }, { "epoch": 0.046271431522412726, "grad_norm": 24.769393920898438, "learning_rate": 2.313667162452487e-06, "loss": 2.2548, "step": 280 }, { "epoch": 0.04792398264821318, "grad_norm": 16.452192306518555, "learning_rate": 2.396298132540076e-06, "loss": 2.1442, "step": 290 }, { "epoch": 0.04957653377401364, "grad_norm": 17.729570388793945, "learning_rate": 2.478929102627665e-06, "loss": 2.1889, "step": 300 }, { "epoch": 0.051229084899814085, "grad_norm": 19.994186401367188, "learning_rate": 2.561560072715254e-06, "loss": 2.2463, "step": 310 }, { "epoch": 0.05288163602561454, "grad_norm": 27.13626480102539, "learning_rate": 2.6441910428028427e-06, "loss": 2.2024, "step": 320 }, { "epoch": 0.054534187151414996, "grad_norm": 20.60004997253418, "learning_rate": 2.7268220128904316e-06, "loss": 2.1578, "step": 330 }, { "epoch": 0.05618673827721545, "grad_norm": 20.759340286254883, "learning_rate": 2.8094529829780205e-06, "loss": 2.0856, "step": 340 }, { "epoch": 0.05783928940301591, "grad_norm": 34.48766326904297, "learning_rate": 2.892083953065609e-06, "loss": 2.1879, "step": 350 }, { "epoch": 0.05949184052881636, "grad_norm": 21.343008041381836, "learning_rate": 2.974714923153198e-06, "loss": 2.1215, "step": 360 }, { "epoch": 0.06114439165461681, "grad_norm": 20.634981155395508, "learning_rate": 3.0573458932407868e-06, "loss": 2.1139, "step": 370 }, { "epoch": 0.06279694278041727, "grad_norm": 23.69481086730957, "learning_rate": 3.1399768633283757e-06, "loss": 2.1182, "step": 380 }, { "epoch": 0.06444949390621772, "grad_norm": 21.073101043701172, "learning_rate": 3.2226078334159645e-06, "loss": 2.1002, "step": 390 }, { "epoch": 0.06610204503201818, "grad_norm": 74.09575653076172, "learning_rate": 3.3052388035035534e-06, "loss": 2.0813, "step": 400 }, { "epoch": 0.06775459615781863, "grad_norm": 18.449506759643555, "learning_rate": 3.3878697735911423e-06, "loss": 2.0566, "step": 410 }, { "epoch": 0.06940714728361909, "grad_norm": 28.852758407592773, "learning_rate": 3.470500743678731e-06, "loss": 1.9957, "step": 420 }, { "epoch": 0.07105969840941954, "grad_norm": 26.230737686157227, "learning_rate": 3.5531317137663197e-06, "loss": 1.9894, "step": 430 }, { "epoch": 0.07271224953522, "grad_norm": 24.988384246826172, "learning_rate": 3.6357626838539086e-06, "loss": 2.0197, "step": 440 }, { "epoch": 0.07436480066102046, "grad_norm": 101.62005615234375, "learning_rate": 3.7183936539414975e-06, "loss": 2.074, "step": 450 }, { "epoch": 0.07601735178682091, "grad_norm": 24.157245635986328, "learning_rate": 3.8010246240290863e-06, "loss": 2.1024, "step": 460 }, { "epoch": 0.07766990291262135, "grad_norm": 26.56095314025879, "learning_rate": 3.883655594116676e-06, "loss": 2.0546, "step": 470 }, { "epoch": 0.07932245403842181, "grad_norm": 59.286495208740234, "learning_rate": 3.966286564204264e-06, "loss": 2.0861, "step": 480 }, { "epoch": 0.08097500516422226, "grad_norm": 27.197782516479492, "learning_rate": 4.0489175342918534e-06, "loss": 2.0757, "step": 490 }, { "epoch": 0.08262755629002272, "grad_norm": 28.272613525390625, "learning_rate": 4.131548504379441e-06, "loss": 1.8954, "step": 500 }, { "epoch": 0.08428010741582317, "grad_norm": 23.627155303955078, "learning_rate": 4.21417947446703e-06, "loss": 1.9886, "step": 510 }, { "epoch": 0.08593265854162363, "grad_norm": 28.818801879882812, "learning_rate": 4.296810444554619e-06, "loss": 2.0049, "step": 520 }, { "epoch": 0.08758520966742409, "grad_norm": 31.500320434570312, "learning_rate": 4.379441414642208e-06, "loss": 1.9709, "step": 530 }, { "epoch": 0.08923776079322454, "grad_norm": 32.4036865234375, "learning_rate": 4.4620723847297975e-06, "loss": 1.957, "step": 540 }, { "epoch": 0.090890311919025, "grad_norm": 18.269474029541016, "learning_rate": 4.544703354817386e-06, "loss": 2.0274, "step": 550 }, { "epoch": 0.09254286304482545, "grad_norm": 25.879230499267578, "learning_rate": 4.627334324904974e-06, "loss": 1.982, "step": 560 }, { "epoch": 0.09419541417062591, "grad_norm": 78.83243560791016, "learning_rate": 4.709965294992563e-06, "loss": 1.9635, "step": 570 }, { "epoch": 0.09584796529642636, "grad_norm": 24.732942581176758, "learning_rate": 4.792596265080152e-06, "loss": 1.9254, "step": 580 }, { "epoch": 0.09750051642222682, "grad_norm": 23.506771087646484, "learning_rate": 4.8752272351677415e-06, "loss": 1.951, "step": 590 }, { "epoch": 0.09915306754802727, "grad_norm": 24.17493438720703, "learning_rate": 4.95785820525533e-06, "loss": 1.9573, "step": 600 }, { "epoch": 0.10080561867382772, "grad_norm": 25.89427375793457, "learning_rate": 5.040489175342919e-06, "loss": 2.0148, "step": 610 }, { "epoch": 0.10245816979962817, "grad_norm": 22.84876251220703, "learning_rate": 5.123120145430508e-06, "loss": 1.8556, "step": 620 }, { "epoch": 0.10411072092542863, "grad_norm": 229.06407165527344, "learning_rate": 5.205751115518096e-06, "loss": 1.9412, "step": 630 }, { "epoch": 0.10576327205122908, "grad_norm": 22.079832077026367, "learning_rate": 5.2883820856056855e-06, "loss": 1.8359, "step": 640 }, { "epoch": 0.10741582317702954, "grad_norm": 23.32947540283203, "learning_rate": 5.371013055693274e-06, "loss": 1.7962, "step": 650 }, { "epoch": 0.10906837430282999, "grad_norm": 25.554899215698242, "learning_rate": 5.453644025780863e-06, "loss": 1.8114, "step": 660 }, { "epoch": 0.11072092542863045, "grad_norm": 23.066320419311523, "learning_rate": 5.536274995868452e-06, "loss": 1.9601, "step": 670 }, { "epoch": 0.1123734765544309, "grad_norm": 29.641036987304688, "learning_rate": 5.618905965956041e-06, "loss": 1.8361, "step": 680 }, { "epoch": 0.11402602768023136, "grad_norm": 40.079246520996094, "learning_rate": 5.7015369360436295e-06, "loss": 1.7822, "step": 690 }, { "epoch": 0.11567857880603181, "grad_norm": 46.70073699951172, "learning_rate": 5.784167906131218e-06, "loss": 1.7975, "step": 700 }, { "epoch": 0.11733112993183227, "grad_norm": 27.957921981811523, "learning_rate": 5.866798876218807e-06, "loss": 1.9022, "step": 710 }, { "epoch": 0.11898368105763273, "grad_norm": 87.9314956665039, "learning_rate": 5.949429846306396e-06, "loss": 1.7631, "step": 720 }, { "epoch": 0.12063623218343318, "grad_norm": 28.518705368041992, "learning_rate": 6.032060816393985e-06, "loss": 1.8346, "step": 730 }, { "epoch": 0.12228878330923362, "grad_norm": 35.01418685913086, "learning_rate": 6.1146917864815735e-06, "loss": 1.7283, "step": 740 }, { "epoch": 0.12394133443503408, "grad_norm": 25.934650421142578, "learning_rate": 6.197322756569163e-06, "loss": 1.7489, "step": 750 }, { "epoch": 0.12559388556083453, "grad_norm": 49.37101364135742, "learning_rate": 6.279953726656751e-06, "loss": 1.7778, "step": 760 }, { "epoch": 0.127246436686635, "grad_norm": 40.169677734375, "learning_rate": 6.362584696744341e-06, "loss": 1.9885, "step": 770 }, { "epoch": 0.12889898781243544, "grad_norm": 39.495182037353516, "learning_rate": 6.445215666831929e-06, "loss": 1.8897, "step": 780 }, { "epoch": 0.1305515389382359, "grad_norm": 33.42755889892578, "learning_rate": 6.527846636919517e-06, "loss": 1.8193, "step": 790 }, { "epoch": 0.13220409006403636, "grad_norm": 53.90789794921875, "learning_rate": 6.610477607007107e-06, "loss": 1.7312, "step": 800 }, { "epoch": 0.1338566411898368, "grad_norm": 42.573890686035156, "learning_rate": 6.693108577094695e-06, "loss": 1.8778, "step": 810 }, { "epoch": 0.13550919231563727, "grad_norm": 20.9980525970459, "learning_rate": 6.775739547182285e-06, "loss": 1.7827, "step": 820 }, { "epoch": 0.13716174344143772, "grad_norm": 27.766277313232422, "learning_rate": 6.858370517269873e-06, "loss": 1.8191, "step": 830 }, { "epoch": 0.13881429456723818, "grad_norm": 26.448427200317383, "learning_rate": 6.941001487357462e-06, "loss": 1.8897, "step": 840 }, { "epoch": 0.14046684569303863, "grad_norm": 52.19208908081055, "learning_rate": 7.023632457445051e-06, "loss": 1.932, "step": 850 }, { "epoch": 0.1421193968188391, "grad_norm": 21.94788360595703, "learning_rate": 7.106263427532639e-06, "loss": 1.8672, "step": 860 }, { "epoch": 0.14377194794463954, "grad_norm": 40.3758430480957, "learning_rate": 7.188894397620229e-06, "loss": 1.8301, "step": 870 }, { "epoch": 0.14542449907044, "grad_norm": 22.807653427124023, "learning_rate": 7.271525367707817e-06, "loss": 1.8321, "step": 880 }, { "epoch": 0.14707705019624046, "grad_norm": 22.939292907714844, "learning_rate": 7.3541563377954064e-06, "loss": 1.7721, "step": 890 }, { "epoch": 0.1487296013220409, "grad_norm": 34.43024444580078, "learning_rate": 7.436787307882995e-06, "loss": 1.7898, "step": 900 }, { "epoch": 0.15038215244784137, "grad_norm": 19.193397521972656, "learning_rate": 7.519418277970584e-06, "loss": 1.8499, "step": 910 }, { "epoch": 0.15203470357364182, "grad_norm": 26.9038143157959, "learning_rate": 7.602049248058173e-06, "loss": 1.8749, "step": 920 }, { "epoch": 0.15368725469944228, "grad_norm": 30.20845603942871, "learning_rate": 7.684680218145761e-06, "loss": 1.7995, "step": 930 }, { "epoch": 0.1553398058252427, "grad_norm": 30.089397430419922, "learning_rate": 7.767311188233351e-06, "loss": 1.8802, "step": 940 }, { "epoch": 0.15699235695104316, "grad_norm": 18.006601333618164, "learning_rate": 7.849942158320938e-06, "loss": 1.8049, "step": 950 }, { "epoch": 0.15864490807684362, "grad_norm": 22.729703903198242, "learning_rate": 7.932573128408528e-06, "loss": 1.8031, "step": 960 }, { "epoch": 0.16029745920264407, "grad_norm": 22.711090087890625, "learning_rate": 8.015204098496117e-06, "loss": 1.7107, "step": 970 }, { "epoch": 0.16195001032844453, "grad_norm": 24.80642318725586, "learning_rate": 8.097835068583707e-06, "loss": 1.8531, "step": 980 }, { "epoch": 0.16360256145424498, "grad_norm": 28.32562255859375, "learning_rate": 8.180466038671294e-06, "loss": 1.927, "step": 990 }, { "epoch": 0.16525511258004544, "grad_norm": 20.05076789855957, "learning_rate": 8.263097008758882e-06, "loss": 1.7359, "step": 1000 }, { "epoch": 0.1669076637058459, "grad_norm": 32.90656661987305, "learning_rate": 8.345727978846472e-06, "loss": 1.7466, "step": 1010 }, { "epoch": 0.16856021483164635, "grad_norm": 44.9754524230957, "learning_rate": 8.42835894893406e-06, "loss": 1.8069, "step": 1020 }, { "epoch": 0.1702127659574468, "grad_norm": 201.73477172851562, "learning_rate": 8.510989919021651e-06, "loss": 1.7286, "step": 1030 }, { "epoch": 0.17186531708324726, "grad_norm": 18.8424015045166, "learning_rate": 8.593620889109238e-06, "loss": 1.8063, "step": 1040 }, { "epoch": 0.17351786820904772, "grad_norm": 26.58818244934082, "learning_rate": 8.676251859196826e-06, "loss": 1.9267, "step": 1050 }, { "epoch": 0.17517041933484817, "grad_norm": 37.0977897644043, "learning_rate": 8.758882829284416e-06, "loss": 1.7116, "step": 1060 }, { "epoch": 0.17682297046064863, "grad_norm": 19.845773696899414, "learning_rate": 8.841513799372005e-06, "loss": 1.7994, "step": 1070 }, { "epoch": 0.17847552158644908, "grad_norm": 93.75279235839844, "learning_rate": 8.924144769459595e-06, "loss": 1.8503, "step": 1080 }, { "epoch": 0.18012807271224954, "grad_norm": 50.5566291809082, "learning_rate": 9.006775739547182e-06, "loss": 1.7882, "step": 1090 }, { "epoch": 0.18178062383805, "grad_norm": 19.830806732177734, "learning_rate": 9.089406709634772e-06, "loss": 1.7492, "step": 1100 }, { "epoch": 0.18343317496385045, "grad_norm": 88.4405746459961, "learning_rate": 9.17203767972236e-06, "loss": 1.8902, "step": 1110 }, { "epoch": 0.1850857260896509, "grad_norm": 38.67186737060547, "learning_rate": 9.254668649809949e-06, "loss": 1.7064, "step": 1120 }, { "epoch": 0.18673827721545136, "grad_norm": 17.757261276245117, "learning_rate": 9.337299619897539e-06, "loss": 1.7849, "step": 1130 }, { "epoch": 0.18839082834125181, "grad_norm": 20.545978546142578, "learning_rate": 9.419930589985126e-06, "loss": 1.7574, "step": 1140 }, { "epoch": 0.19004337946705227, "grad_norm": 15.944082260131836, "learning_rate": 9.502561560072716e-06, "loss": 1.7188, "step": 1150 }, { "epoch": 0.19169593059285273, "grad_norm": 32.81608963012695, "learning_rate": 9.585192530160304e-06, "loss": 1.9046, "step": 1160 }, { "epoch": 0.19334848171865318, "grad_norm": 18.563947677612305, "learning_rate": 9.667823500247894e-06, "loss": 1.7162, "step": 1170 }, { "epoch": 0.19500103284445364, "grad_norm": 25.642000198364258, "learning_rate": 9.750454470335483e-06, "loss": 1.7451, "step": 1180 }, { "epoch": 0.1966535839702541, "grad_norm": 32.40825271606445, "learning_rate": 9.83308544042307e-06, "loss": 1.7892, "step": 1190 }, { "epoch": 0.19830613509605455, "grad_norm": 19.13682746887207, "learning_rate": 9.91571641051066e-06, "loss": 1.8015, "step": 1200 }, { "epoch": 0.19995868622185498, "grad_norm": 28.85736083984375, "learning_rate": 9.998347380598248e-06, "loss": 1.7506, "step": 1210 }, { "epoch": 0.20161123734765543, "grad_norm": 15.917028427124023, "learning_rate": 1.0080978350685838e-05, "loss": 1.7728, "step": 1220 }, { "epoch": 0.2032637884734559, "grad_norm": 29.365341186523438, "learning_rate": 1.0163609320773427e-05, "loss": 1.8862, "step": 1230 }, { "epoch": 0.20491633959925634, "grad_norm": 30.120737075805664, "learning_rate": 1.0246240290861015e-05, "loss": 1.7482, "step": 1240 }, { "epoch": 0.2065688907250568, "grad_norm": 19.36783790588379, "learning_rate": 1.0328871260948604e-05, "loss": 1.6657, "step": 1250 }, { "epoch": 0.20822144185085725, "grad_norm": 21.701364517211914, "learning_rate": 1.0411502231036192e-05, "loss": 1.8962, "step": 1260 }, { "epoch": 0.2098739929766577, "grad_norm": 23.340579986572266, "learning_rate": 1.0494133201123783e-05, "loss": 1.828, "step": 1270 }, { "epoch": 0.21152654410245816, "grad_norm": 22.94394874572754, "learning_rate": 1.0576764171211371e-05, "loss": 1.7493, "step": 1280 }, { "epoch": 0.21317909522825862, "grad_norm": 24.351810455322266, "learning_rate": 1.065939514129896e-05, "loss": 1.7058, "step": 1290 }, { "epoch": 0.21483164635405907, "grad_norm": 35.676246643066406, "learning_rate": 1.0742026111386548e-05, "loss": 1.6535, "step": 1300 }, { "epoch": 0.21648419747985953, "grad_norm": 19.702678680419922, "learning_rate": 1.0824657081474138e-05, "loss": 1.8294, "step": 1310 }, { "epoch": 0.21813674860565999, "grad_norm": 29.777185440063477, "learning_rate": 1.0907288051561727e-05, "loss": 1.7602, "step": 1320 }, { "epoch": 0.21978929973146044, "grad_norm": 41.81754684448242, "learning_rate": 1.0989919021649315e-05, "loss": 1.6949, "step": 1330 }, { "epoch": 0.2214418508572609, "grad_norm": 24.266855239868164, "learning_rate": 1.1072549991736903e-05, "loss": 1.7718, "step": 1340 }, { "epoch": 0.22309440198306135, "grad_norm": 36.88713073730469, "learning_rate": 1.1155180961824492e-05, "loss": 1.7773, "step": 1350 }, { "epoch": 0.2247469531088618, "grad_norm": 20.02907943725586, "learning_rate": 1.1237811931912082e-05, "loss": 1.8017, "step": 1360 }, { "epoch": 0.22639950423466226, "grad_norm": 32.76356887817383, "learning_rate": 1.132044290199967e-05, "loss": 1.6808, "step": 1370 }, { "epoch": 0.22805205536046272, "grad_norm": 28.06949234008789, "learning_rate": 1.1403073872087259e-05, "loss": 1.7669, "step": 1380 }, { "epoch": 0.22970460648626317, "grad_norm": 27.715717315673828, "learning_rate": 1.1485704842174847e-05, "loss": 1.8211, "step": 1390 }, { "epoch": 0.23135715761206363, "grad_norm": 14.42801570892334, "learning_rate": 1.1568335812262436e-05, "loss": 1.7236, "step": 1400 }, { "epoch": 0.23300970873786409, "grad_norm": 24.549047470092773, "learning_rate": 1.1650966782350026e-05, "loss": 1.7445, "step": 1410 }, { "epoch": 0.23466225986366454, "grad_norm": 19.45368003845215, "learning_rate": 1.1733597752437615e-05, "loss": 1.7746, "step": 1420 }, { "epoch": 0.236314810989465, "grad_norm": 25.40043830871582, "learning_rate": 1.1816228722525203e-05, "loss": 1.88, "step": 1430 }, { "epoch": 0.23796736211526545, "grad_norm": 80.87067413330078, "learning_rate": 1.1898859692612792e-05, "loss": 1.7234, "step": 1440 }, { "epoch": 0.2396199132410659, "grad_norm": 12.040018081665039, "learning_rate": 1.198149066270038e-05, "loss": 1.7625, "step": 1450 }, { "epoch": 0.24127246436686636, "grad_norm": 16.50065803527832, "learning_rate": 1.206412163278797e-05, "loss": 1.8548, "step": 1460 }, { "epoch": 0.24292501549266682, "grad_norm": 21.08123207092285, "learning_rate": 1.2146752602875559e-05, "loss": 1.7773, "step": 1470 }, { "epoch": 0.24457756661846725, "grad_norm": 13.436989784240723, "learning_rate": 1.2229383572963147e-05, "loss": 1.6914, "step": 1480 }, { "epoch": 0.2462301177442677, "grad_norm": 157.91610717773438, "learning_rate": 1.2312014543050736e-05, "loss": 1.7556, "step": 1490 }, { "epoch": 0.24788266887006816, "grad_norm": 37.71729278564453, "learning_rate": 1.2394645513138326e-05, "loss": 1.691, "step": 1500 }, { "epoch": 0.2495352199958686, "grad_norm": 73.44625854492188, "learning_rate": 1.2477276483225914e-05, "loss": 1.7093, "step": 1510 }, { "epoch": 0.25118777112166907, "grad_norm": 24.017974853515625, "learning_rate": 1.2559907453313503e-05, "loss": 1.691, "step": 1520 }, { "epoch": 0.25284032224746955, "grad_norm": 34.528289794921875, "learning_rate": 1.264253842340109e-05, "loss": 1.9174, "step": 1530 }, { "epoch": 0.25449287337327, "grad_norm": 16.658061981201172, "learning_rate": 1.2725169393488681e-05, "loss": 1.6134, "step": 1540 }, { "epoch": 0.25614542449907046, "grad_norm": 49.6403694152832, "learning_rate": 1.280780036357627e-05, "loss": 1.7812, "step": 1550 }, { "epoch": 0.2577979756248709, "grad_norm": 13.805188179016113, "learning_rate": 1.2890431333663858e-05, "loss": 1.7681, "step": 1560 }, { "epoch": 0.2594505267506714, "grad_norm": 51.3933219909668, "learning_rate": 1.2973062303751447e-05, "loss": 1.7339, "step": 1570 }, { "epoch": 0.2611030778764718, "grad_norm": 12.193310737609863, "learning_rate": 1.3055693273839033e-05, "loss": 1.8874, "step": 1580 }, { "epoch": 0.2627556290022723, "grad_norm": 19.450271606445312, "learning_rate": 1.3138324243926625e-05, "loss": 1.8667, "step": 1590 }, { "epoch": 0.2644081801280727, "grad_norm": 22.680065155029297, "learning_rate": 1.3220955214014214e-05, "loss": 1.8435, "step": 1600 }, { "epoch": 0.26606073125387314, "grad_norm": 43.32194900512695, "learning_rate": 1.3303586184101802e-05, "loss": 1.685, "step": 1610 }, { "epoch": 0.2677132823796736, "grad_norm": 14.092884063720703, "learning_rate": 1.338621715418939e-05, "loss": 1.7688, "step": 1620 }, { "epoch": 0.26936583350547405, "grad_norm": 20.905393600463867, "learning_rate": 1.346884812427698e-05, "loss": 1.8398, "step": 1630 }, { "epoch": 0.27101838463127453, "grad_norm": 12.114250183105469, "learning_rate": 1.355147909436457e-05, "loss": 1.7383, "step": 1640 }, { "epoch": 0.27267093575707496, "grad_norm": 16.64354133605957, "learning_rate": 1.3634110064452158e-05, "loss": 1.6308, "step": 1650 }, { "epoch": 0.27432348688287544, "grad_norm": 18.8261661529541, "learning_rate": 1.3716741034539746e-05, "loss": 1.7076, "step": 1660 }, { "epoch": 0.2759760380086759, "grad_norm": 11.279623031616211, "learning_rate": 1.3799372004627335e-05, "loss": 1.774, "step": 1670 }, { "epoch": 0.27762858913447636, "grad_norm": 12.221219062805176, "learning_rate": 1.3882002974714925e-05, "loss": 1.5898, "step": 1680 }, { "epoch": 0.2792811402602768, "grad_norm": 14.965773582458496, "learning_rate": 1.3964633944802513e-05, "loss": 1.6842, "step": 1690 }, { "epoch": 0.28093369138607727, "grad_norm": 28.31591796875, "learning_rate": 1.4047264914890102e-05, "loss": 1.6575, "step": 1700 }, { "epoch": 0.2825862425118777, "grad_norm": 12.584307670593262, "learning_rate": 1.412989588497769e-05, "loss": 1.6283, "step": 1710 }, { "epoch": 0.2842387936376782, "grad_norm": 30.381515502929688, "learning_rate": 1.4212526855065279e-05, "loss": 1.7115, "step": 1720 }, { "epoch": 0.2858913447634786, "grad_norm": 20.99606704711914, "learning_rate": 1.4295157825152869e-05, "loss": 1.7297, "step": 1730 }, { "epoch": 0.2875438958892791, "grad_norm": 91.88641357421875, "learning_rate": 1.4377788795240457e-05, "loss": 1.7907, "step": 1740 }, { "epoch": 0.2891964470150795, "grad_norm": 14.486876487731934, "learning_rate": 1.4460419765328046e-05, "loss": 1.763, "step": 1750 }, { "epoch": 0.29084899814088, "grad_norm": 32.83678436279297, "learning_rate": 1.4543050735415634e-05, "loss": 1.8511, "step": 1760 }, { "epoch": 0.2925015492666804, "grad_norm": 15.216378211975098, "learning_rate": 1.4625681705503224e-05, "loss": 1.8471, "step": 1770 }, { "epoch": 0.2941541003924809, "grad_norm": 34.238189697265625, "learning_rate": 1.4708312675590813e-05, "loss": 1.7476, "step": 1780 }, { "epoch": 0.29580665151828134, "grad_norm": 26.102210998535156, "learning_rate": 1.4790943645678401e-05, "loss": 1.5983, "step": 1790 }, { "epoch": 0.2974592026440818, "grad_norm": 23.120174407958984, "learning_rate": 1.487357461576599e-05, "loss": 1.7779, "step": 1800 }, { "epoch": 0.29911175376988225, "grad_norm": 14.618244171142578, "learning_rate": 1.4956205585853578e-05, "loss": 1.6681, "step": 1810 }, { "epoch": 0.30076430489568273, "grad_norm": 33.785186767578125, "learning_rate": 1.5038836555941168e-05, "loss": 1.7867, "step": 1820 }, { "epoch": 0.30241685602148316, "grad_norm": 11.777277946472168, "learning_rate": 1.5121467526028757e-05, "loss": 1.6726, "step": 1830 }, { "epoch": 0.30406940714728364, "grad_norm": 15.27955150604248, "learning_rate": 1.5204098496116345e-05, "loss": 1.7069, "step": 1840 }, { "epoch": 0.30572195827308407, "grad_norm": 25.062849044799805, "learning_rate": 1.5286729466203932e-05, "loss": 1.6681, "step": 1850 }, { "epoch": 0.30737450939888455, "grad_norm": 19.023670196533203, "learning_rate": 1.5369360436291522e-05, "loss": 1.7546, "step": 1860 }, { "epoch": 0.309027060524685, "grad_norm": 14.75047492980957, "learning_rate": 1.5451991406379112e-05, "loss": 1.7295, "step": 1870 }, { "epoch": 0.3106796116504854, "grad_norm": 12.37612247467041, "learning_rate": 1.5534622376466703e-05, "loss": 1.8116, "step": 1880 }, { "epoch": 0.3123321627762859, "grad_norm": 26.212312698364258, "learning_rate": 1.561725334655429e-05, "loss": 1.7477, "step": 1890 }, { "epoch": 0.3139847139020863, "grad_norm": 52.385616302490234, "learning_rate": 1.5699884316641876e-05, "loss": 1.6689, "step": 1900 }, { "epoch": 0.3156372650278868, "grad_norm": 15.535853385925293, "learning_rate": 1.5782515286729466e-05, "loss": 1.7874, "step": 1910 }, { "epoch": 0.31728981615368723, "grad_norm": 15.0889892578125, "learning_rate": 1.5865146256817056e-05, "loss": 1.812, "step": 1920 }, { "epoch": 0.3189423672794877, "grad_norm": 14.585269927978516, "learning_rate": 1.5947777226904643e-05, "loss": 1.7859, "step": 1930 }, { "epoch": 0.32059491840528814, "grad_norm": 26.54705810546875, "learning_rate": 1.6030408196992233e-05, "loss": 1.6718, "step": 1940 }, { "epoch": 0.3222474695310886, "grad_norm": 23.145902633666992, "learning_rate": 1.611303916707982e-05, "loss": 1.7847, "step": 1950 }, { "epoch": 0.32390002065688905, "grad_norm": 19.175052642822266, "learning_rate": 1.6195670137167414e-05, "loss": 1.895, "step": 1960 }, { "epoch": 0.32555257178268954, "grad_norm": 11.821075439453125, "learning_rate": 1.6278301107255e-05, "loss": 1.7336, "step": 1970 }, { "epoch": 0.32720512290848996, "grad_norm": 13.310040473937988, "learning_rate": 1.6360932077342587e-05, "loss": 1.6935, "step": 1980 }, { "epoch": 0.32885767403429045, "grad_norm": 20.371007919311523, "learning_rate": 1.6443563047430177e-05, "loss": 1.6693, "step": 1990 }, { "epoch": 0.3305102251600909, "grad_norm": 9.38290786743164, "learning_rate": 1.6526194017517764e-05, "loss": 1.8035, "step": 2000 }, { "epoch": 0.33216277628589136, "grad_norm": 11.759653091430664, "learning_rate": 1.6608824987605358e-05, "loss": 1.6504, "step": 2010 }, { "epoch": 0.3338153274116918, "grad_norm": 9.93994140625, "learning_rate": 1.6691455957692945e-05, "loss": 1.6363, "step": 2020 }, { "epoch": 0.33546787853749227, "grad_norm": 39.654541015625, "learning_rate": 1.677408692778053e-05, "loss": 1.6831, "step": 2030 }, { "epoch": 0.3371204296632927, "grad_norm": 15.763498306274414, "learning_rate": 1.685671789786812e-05, "loss": 1.7134, "step": 2040 }, { "epoch": 0.3387729807890932, "grad_norm": 13.894293785095215, "learning_rate": 1.6939348867955708e-05, "loss": 1.6449, "step": 2050 }, { "epoch": 0.3404255319148936, "grad_norm": 12.600088119506836, "learning_rate": 1.7021979838043302e-05, "loss": 1.8048, "step": 2060 }, { "epoch": 0.3420780830406941, "grad_norm": 49.38233947753906, "learning_rate": 1.710461080813089e-05, "loss": 1.669, "step": 2070 }, { "epoch": 0.3437306341664945, "grad_norm": 123.16156768798828, "learning_rate": 1.7187241778218475e-05, "loss": 1.6796, "step": 2080 }, { "epoch": 0.345383185292295, "grad_norm": 12.5492525100708, "learning_rate": 1.7269872748306065e-05, "loss": 1.5977, "step": 2090 }, { "epoch": 0.34703573641809543, "grad_norm": 27.242778778076172, "learning_rate": 1.7352503718393652e-05, "loss": 1.8755, "step": 2100 }, { "epoch": 0.3486882875438959, "grad_norm": 12.04410457611084, "learning_rate": 1.7435134688481246e-05, "loss": 1.7807, "step": 2110 }, { "epoch": 0.35034083866969634, "grad_norm": 37.36049270629883, "learning_rate": 1.7517765658568833e-05, "loss": 1.7438, "step": 2120 }, { "epoch": 0.3519933897954968, "grad_norm": 19.858129501342773, "learning_rate": 1.760039662865642e-05, "loss": 1.9511, "step": 2130 }, { "epoch": 0.35364594092129725, "grad_norm": 11.481547355651855, "learning_rate": 1.768302759874401e-05, "loss": 1.7014, "step": 2140 }, { "epoch": 0.3552984920470977, "grad_norm": 81.52596282958984, "learning_rate": 1.77656585688316e-05, "loss": 1.6589, "step": 2150 }, { "epoch": 0.35695104317289816, "grad_norm": 17.031217575073242, "learning_rate": 1.784828953891919e-05, "loss": 1.6914, "step": 2160 }, { "epoch": 0.3586035942986986, "grad_norm": 16.88812255859375, "learning_rate": 1.7930920509006777e-05, "loss": 1.8675, "step": 2170 }, { "epoch": 0.3602561454244991, "grad_norm": 10.55776309967041, "learning_rate": 1.8013551479094363e-05, "loss": 1.7589, "step": 2180 }, { "epoch": 0.3619086965502995, "grad_norm": 84.68812561035156, "learning_rate": 1.8096182449181954e-05, "loss": 1.6879, "step": 2190 }, { "epoch": 0.3635612476761, "grad_norm": 18.757328033447266, "learning_rate": 1.8178813419269544e-05, "loss": 1.7931, "step": 2200 }, { "epoch": 0.3652137988019004, "grad_norm": 23.67148780822754, "learning_rate": 1.8261444389357134e-05, "loss": 1.7531, "step": 2210 }, { "epoch": 0.3668663499277009, "grad_norm": 38.11347198486328, "learning_rate": 1.834407535944472e-05, "loss": 1.6541, "step": 2220 }, { "epoch": 0.3685189010535013, "grad_norm": 15.99669075012207, "learning_rate": 1.8426706329532307e-05, "loss": 1.7447, "step": 2230 }, { "epoch": 0.3701714521793018, "grad_norm": 16.732406616210938, "learning_rate": 1.8509337299619898e-05, "loss": 1.7383, "step": 2240 }, { "epoch": 0.37182400330510224, "grad_norm": 36.895389556884766, "learning_rate": 1.8591968269707488e-05, "loss": 1.7366, "step": 2250 }, { "epoch": 0.3734765544309027, "grad_norm": 14.426408767700195, "learning_rate": 1.8674599239795078e-05, "loss": 1.7979, "step": 2260 }, { "epoch": 0.37512910555670315, "grad_norm": 19.50206184387207, "learning_rate": 1.8757230209882665e-05, "loss": 1.7249, "step": 2270 }, { "epoch": 0.37678165668250363, "grad_norm": 33.446937561035156, "learning_rate": 1.883986117997025e-05, "loss": 1.8677, "step": 2280 }, { "epoch": 0.37843420780830406, "grad_norm": 16.420921325683594, "learning_rate": 1.8922492150057845e-05, "loss": 1.608, "step": 2290 }, { "epoch": 0.38008675893410454, "grad_norm": 33.9530029296875, "learning_rate": 1.9005123120145432e-05, "loss": 1.7731, "step": 2300 }, { "epoch": 0.38173931005990497, "grad_norm": 14.426531791687012, "learning_rate": 1.9087754090233022e-05, "loss": 1.7938, "step": 2310 }, { "epoch": 0.38339186118570545, "grad_norm": 18.52570152282715, "learning_rate": 1.917038506032061e-05, "loss": 1.7116, "step": 2320 }, { "epoch": 0.3850444123115059, "grad_norm": 11.881404876708984, "learning_rate": 1.9253016030408195e-05, "loss": 1.719, "step": 2330 }, { "epoch": 0.38669696343730636, "grad_norm": 13.673921585083008, "learning_rate": 1.933564700049579e-05, "loss": 1.6652, "step": 2340 }, { "epoch": 0.3883495145631068, "grad_norm": 12.703744888305664, "learning_rate": 1.9418277970583376e-05, "loss": 1.5334, "step": 2350 }, { "epoch": 0.3900020656889073, "grad_norm": 25.498388290405273, "learning_rate": 1.9500908940670966e-05, "loss": 1.7857, "step": 2360 }, { "epoch": 0.3916546168147077, "grad_norm": 10.993797302246094, "learning_rate": 1.9583539910758553e-05, "loss": 1.6038, "step": 2370 }, { "epoch": 0.3933071679405082, "grad_norm": 17.91847038269043, "learning_rate": 1.966617088084614e-05, "loss": 1.6546, "step": 2380 }, { "epoch": 0.3949597190663086, "grad_norm": 22.401233673095703, "learning_rate": 1.9748801850933733e-05, "loss": 1.7847, "step": 2390 }, { "epoch": 0.3966122701921091, "grad_norm": 20.50080108642578, "learning_rate": 1.983143282102132e-05, "loss": 1.6773, "step": 2400 }, { "epoch": 0.3982648213179095, "grad_norm": 22.017650604248047, "learning_rate": 1.991406379110891e-05, "loss": 1.7695, "step": 2410 }, { "epoch": 0.39991737244370995, "grad_norm": 10.337092399597168, "learning_rate": 1.9996694761196497e-05, "loss": 1.7029, "step": 2420 }, { "epoch": 0.40156992356951043, "grad_norm": 10.79664134979248, "learning_rate": 2.0079325731284083e-05, "loss": 1.6576, "step": 2430 }, { "epoch": 0.40322247469531086, "grad_norm": 33.853580474853516, "learning_rate": 2.0161956701371677e-05, "loss": 1.7565, "step": 2440 }, { "epoch": 0.40487502582111135, "grad_norm": 15.0186767578125, "learning_rate": 2.0244587671459264e-05, "loss": 1.7022, "step": 2450 }, { "epoch": 0.4065275769469118, "grad_norm": 21.504180908203125, "learning_rate": 2.0327218641546854e-05, "loss": 1.5559, "step": 2460 }, { "epoch": 0.40818012807271226, "grad_norm": 15.873148918151855, "learning_rate": 2.040984961163444e-05, "loss": 1.7129, "step": 2470 }, { "epoch": 0.4098326791985127, "grad_norm": 19.291330337524414, "learning_rate": 2.049248058172203e-05, "loss": 1.7669, "step": 2480 }, { "epoch": 0.41148523032431317, "grad_norm": 10.179187774658203, "learning_rate": 2.057511155180962e-05, "loss": 1.7552, "step": 2490 }, { "epoch": 0.4131377814501136, "grad_norm": 46.47639083862305, "learning_rate": 2.0657742521897208e-05, "loss": 1.757, "step": 2500 }, { "epoch": 0.4147903325759141, "grad_norm": 13.199604988098145, "learning_rate": 2.0740373491984798e-05, "loss": 1.6626, "step": 2510 }, { "epoch": 0.4164428837017145, "grad_norm": 9.901959419250488, "learning_rate": 2.0823004462072385e-05, "loss": 1.6169, "step": 2520 }, { "epoch": 0.418095434827515, "grad_norm": 21.13959312438965, "learning_rate": 2.0905635432159975e-05, "loss": 1.6024, "step": 2530 }, { "epoch": 0.4197479859533154, "grad_norm": 7.372568130493164, "learning_rate": 2.0988266402247565e-05, "loss": 1.7535, "step": 2540 }, { "epoch": 0.4214005370791159, "grad_norm": 61.1468620300293, "learning_rate": 2.1070897372335152e-05, "loss": 1.6967, "step": 2550 }, { "epoch": 0.4230530882049163, "grad_norm": 16.36248779296875, "learning_rate": 2.1153528342422742e-05, "loss": 1.6661, "step": 2560 }, { "epoch": 0.4247056393307168, "grad_norm": 13.132911682128906, "learning_rate": 2.123615931251033e-05, "loss": 1.7467, "step": 2570 }, { "epoch": 0.42635819045651724, "grad_norm": 17.8082218170166, "learning_rate": 2.131879028259792e-05, "loss": 1.7569, "step": 2580 }, { "epoch": 0.4280107415823177, "grad_norm": 48.110652923583984, "learning_rate": 2.140142125268551e-05, "loss": 1.6307, "step": 2590 }, { "epoch": 0.42966329270811815, "grad_norm": 32.73516082763672, "learning_rate": 2.1484052222773096e-05, "loss": 1.7427, "step": 2600 }, { "epoch": 0.43131584383391863, "grad_norm": 29.785804748535156, "learning_rate": 2.1566683192860686e-05, "loss": 1.8084, "step": 2610 }, { "epoch": 0.43296839495971906, "grad_norm": 21.294347763061523, "learning_rate": 2.1649314162948276e-05, "loss": 1.8063, "step": 2620 }, { "epoch": 0.43462094608551954, "grad_norm": 10.334474563598633, "learning_rate": 2.1731945133035863e-05, "loss": 1.7292, "step": 2630 }, { "epoch": 0.43627349721131997, "grad_norm": 37.316349029541016, "learning_rate": 2.1814576103123453e-05, "loss": 1.7178, "step": 2640 }, { "epoch": 0.43792604833712045, "grad_norm": 15.207347869873047, "learning_rate": 2.189720707321104e-05, "loss": 1.6157, "step": 2650 }, { "epoch": 0.4395785994629209, "grad_norm": 44.354366302490234, "learning_rate": 2.197983804329863e-05, "loss": 1.6401, "step": 2660 }, { "epoch": 0.44123115058872137, "grad_norm": 13.655917167663574, "learning_rate": 2.206246901338622e-05, "loss": 1.6388, "step": 2670 }, { "epoch": 0.4428837017145218, "grad_norm": 25.89215660095215, "learning_rate": 2.2145099983473807e-05, "loss": 1.7077, "step": 2680 }, { "epoch": 0.4445362528403222, "grad_norm": 9.225777626037598, "learning_rate": 2.2227730953561397e-05, "loss": 1.7651, "step": 2690 }, { "epoch": 0.4461888039661227, "grad_norm": 21.08796501159668, "learning_rate": 2.2310361923648984e-05, "loss": 1.6447, "step": 2700 }, { "epoch": 0.44784135509192313, "grad_norm": 16.045692443847656, "learning_rate": 2.2392992893736574e-05, "loss": 1.6521, "step": 2710 }, { "epoch": 0.4494939062177236, "grad_norm": 16.895357131958008, "learning_rate": 2.2475623863824164e-05, "loss": 1.5923, "step": 2720 }, { "epoch": 0.45114645734352404, "grad_norm": 19.075815200805664, "learning_rate": 2.255825483391175e-05, "loss": 1.6618, "step": 2730 }, { "epoch": 0.4527990084693245, "grad_norm": 12.131684303283691, "learning_rate": 2.264088580399934e-05, "loss": 1.7285, "step": 2740 }, { "epoch": 0.45445155959512495, "grad_norm": 60.209259033203125, "learning_rate": 2.2723516774086928e-05, "loss": 1.6896, "step": 2750 }, { "epoch": 0.45610411072092544, "grad_norm": 22.111814498901367, "learning_rate": 2.2806147744174518e-05, "loss": 1.7719, "step": 2760 }, { "epoch": 0.45775666184672587, "grad_norm": 9.202242851257324, "learning_rate": 2.2888778714262108e-05, "loss": 1.7777, "step": 2770 }, { "epoch": 0.45940921297252635, "grad_norm": 63.52800750732422, "learning_rate": 2.2971409684349695e-05, "loss": 1.6549, "step": 2780 }, { "epoch": 0.4610617640983268, "grad_norm": 12.614968299865723, "learning_rate": 2.3054040654437285e-05, "loss": 1.8158, "step": 2790 }, { "epoch": 0.46271431522412726, "grad_norm": 10.92754077911377, "learning_rate": 2.3136671624524872e-05, "loss": 1.6745, "step": 2800 }, { "epoch": 0.4643668663499277, "grad_norm": 57.359619140625, "learning_rate": 2.3219302594612462e-05, "loss": 1.7044, "step": 2810 }, { "epoch": 0.46601941747572817, "grad_norm": 19.017126083374023, "learning_rate": 2.3301933564700052e-05, "loss": 1.6981, "step": 2820 }, { "epoch": 0.4676719686015286, "grad_norm": 26.398160934448242, "learning_rate": 2.338456453478764e-05, "loss": 1.7188, "step": 2830 }, { "epoch": 0.4693245197273291, "grad_norm": 13.000535011291504, "learning_rate": 2.346719550487523e-05, "loss": 1.6572, "step": 2840 }, { "epoch": 0.4709770708531295, "grad_norm": 18.722341537475586, "learning_rate": 2.3549826474962816e-05, "loss": 1.8101, "step": 2850 }, { "epoch": 0.47262962197893, "grad_norm": 94.03982543945312, "learning_rate": 2.3632457445050406e-05, "loss": 1.5997, "step": 2860 }, { "epoch": 0.4742821731047304, "grad_norm": 45.00182342529297, "learning_rate": 2.3715088415137996e-05, "loss": 1.6486, "step": 2870 }, { "epoch": 0.4759347242305309, "grad_norm": 26.30855941772461, "learning_rate": 2.3797719385225583e-05, "loss": 1.7285, "step": 2880 }, { "epoch": 0.47758727535633133, "grad_norm": 127.1923599243164, "learning_rate": 2.3880350355313173e-05, "loss": 1.7588, "step": 2890 }, { "epoch": 0.4792398264821318, "grad_norm": 22.71497917175293, "learning_rate": 2.396298132540076e-05, "loss": 1.6556, "step": 2900 }, { "epoch": 0.48089237760793224, "grad_norm": 17.4209041595459, "learning_rate": 2.404561229548835e-05, "loss": 1.6399, "step": 2910 }, { "epoch": 0.4825449287337327, "grad_norm": 38.692264556884766, "learning_rate": 2.412824326557594e-05, "loss": 1.6147, "step": 2920 }, { "epoch": 0.48419747985953315, "grad_norm": 26.006572723388672, "learning_rate": 2.4210874235663527e-05, "loss": 1.6381, "step": 2930 }, { "epoch": 0.48585003098533364, "grad_norm": 13.967598915100098, "learning_rate": 2.4293505205751117e-05, "loss": 1.6556, "step": 2940 }, { "epoch": 0.48750258211113406, "grad_norm": 15.196572303771973, "learning_rate": 2.4376136175838707e-05, "loss": 1.6714, "step": 2950 }, { "epoch": 0.4891551332369345, "grad_norm": 23.98309326171875, "learning_rate": 2.4458767145926294e-05, "loss": 1.7578, "step": 2960 }, { "epoch": 0.490807684362735, "grad_norm": 19.997920989990234, "learning_rate": 2.4541398116013884e-05, "loss": 1.7737, "step": 2970 }, { "epoch": 0.4924602354885354, "grad_norm": 26.095102310180664, "learning_rate": 2.462402908610147e-05, "loss": 1.709, "step": 2980 }, { "epoch": 0.4941127866143359, "grad_norm": 60.75287628173828, "learning_rate": 2.470666005618906e-05, "loss": 1.7374, "step": 2990 }, { "epoch": 0.4957653377401363, "grad_norm": 12.193611145019531, "learning_rate": 2.478929102627665e-05, "loss": 1.7192, "step": 3000 }, { "epoch": 0.4974178888659368, "grad_norm": 21.721214294433594, "learning_rate": 2.4871921996364238e-05, "loss": 1.6808, "step": 3010 }, { "epoch": 0.4990704399917372, "grad_norm": 9.847155570983887, "learning_rate": 2.4954552966451828e-05, "loss": 1.6721, "step": 3020 }, { "epoch": 0.5007229911175377, "grad_norm": 12.325818061828613, "learning_rate": 2.5037183936539415e-05, "loss": 1.8163, "step": 3030 }, { "epoch": 0.5023755422433381, "grad_norm": 33.089630126953125, "learning_rate": 2.5119814906627005e-05, "loss": 1.8137, "step": 3040 }, { "epoch": 0.5040280933691386, "grad_norm": 47.662208557128906, "learning_rate": 2.5202445876714592e-05, "loss": 1.7631, "step": 3050 }, { "epoch": 0.5056806444949391, "grad_norm": 11.936186790466309, "learning_rate": 2.528507684680218e-05, "loss": 1.6799, "step": 3060 }, { "epoch": 0.5073331956207395, "grad_norm": 24.59088706970215, "learning_rate": 2.5367707816889776e-05, "loss": 1.7288, "step": 3070 }, { "epoch": 0.50898574674654, "grad_norm": 12.187572479248047, "learning_rate": 2.5450338786977362e-05, "loss": 1.7636, "step": 3080 }, { "epoch": 0.5106382978723404, "grad_norm": 12.487037658691406, "learning_rate": 2.553296975706495e-05, "loss": 1.6933, "step": 3090 }, { "epoch": 0.5122908489981409, "grad_norm": 16.490032196044922, "learning_rate": 2.561560072715254e-05, "loss": 1.8374, "step": 3100 }, { "epoch": 0.5139434001239414, "grad_norm": 17.2027530670166, "learning_rate": 2.5698231697240126e-05, "loss": 1.6313, "step": 3110 }, { "epoch": 0.5155959512497418, "grad_norm": 67.94872283935547, "learning_rate": 2.5780862667327716e-05, "loss": 1.6479, "step": 3120 }, { "epoch": 0.5172485023755422, "grad_norm": 17.46609878540039, "learning_rate": 2.5863493637415303e-05, "loss": 1.7751, "step": 3130 }, { "epoch": 0.5189010535013427, "grad_norm": 13.204253196716309, "learning_rate": 2.5946124607502893e-05, "loss": 1.6845, "step": 3140 }, { "epoch": 0.5205536046271432, "grad_norm": 10.950384140014648, "learning_rate": 2.602875557759048e-05, "loss": 1.6612, "step": 3150 }, { "epoch": 0.5222061557529436, "grad_norm": 25.03174591064453, "learning_rate": 2.6111386547678067e-05, "loss": 1.7298, "step": 3160 }, { "epoch": 0.523858706878744, "grad_norm": 41.82197952270508, "learning_rate": 2.6194017517765664e-05, "loss": 1.7301, "step": 3170 }, { "epoch": 0.5255112580045446, "grad_norm": 26.862686157226562, "learning_rate": 2.627664848785325e-05, "loss": 1.6881, "step": 3180 }, { "epoch": 0.527163809130345, "grad_norm": 13.66534423828125, "learning_rate": 2.6359279457940837e-05, "loss": 1.6658, "step": 3190 }, { "epoch": 0.5288163602561454, "grad_norm": 17.450504302978516, "learning_rate": 2.6441910428028427e-05, "loss": 1.6795, "step": 3200 }, { "epoch": 0.5304689113819459, "grad_norm": 12.588276863098145, "learning_rate": 2.6524541398116014e-05, "loss": 1.6239, "step": 3210 }, { "epoch": 0.5321214625077463, "grad_norm": 34.62938690185547, "learning_rate": 2.6607172368203604e-05, "loss": 1.5944, "step": 3220 }, { "epoch": 0.5337740136335468, "grad_norm": 22.4811954498291, "learning_rate": 2.668980333829119e-05, "loss": 1.6605, "step": 3230 }, { "epoch": 0.5354265647593472, "grad_norm": 13.54268741607666, "learning_rate": 2.677243430837878e-05, "loss": 1.6363, "step": 3240 }, { "epoch": 0.5370791158851477, "grad_norm": 13.956992149353027, "learning_rate": 2.6855065278466368e-05, "loss": 1.6435, "step": 3250 }, { "epoch": 0.5387316670109481, "grad_norm": 23.956907272338867, "learning_rate": 2.693769624855396e-05, "loss": 1.6956, "step": 3260 }, { "epoch": 0.5403842181367486, "grad_norm": 16.56192398071289, "learning_rate": 2.7020327218641552e-05, "loss": 1.6613, "step": 3270 }, { "epoch": 0.5420367692625491, "grad_norm": 21.902639389038086, "learning_rate": 2.710295818872914e-05, "loss": 1.7387, "step": 3280 }, { "epoch": 0.5436893203883495, "grad_norm": 24.28936004638672, "learning_rate": 2.7185589158816725e-05, "loss": 1.8334, "step": 3290 }, { "epoch": 0.5453418715141499, "grad_norm": 6.949965476989746, "learning_rate": 2.7268220128904315e-05, "loss": 1.6161, "step": 3300 }, { "epoch": 0.5469944226399505, "grad_norm": 10.36733627319336, "learning_rate": 2.7350851098991902e-05, "loss": 1.6539, "step": 3310 }, { "epoch": 0.5486469737657509, "grad_norm": 15.764904975891113, "learning_rate": 2.7433482069079492e-05, "loss": 1.7367, "step": 3320 }, { "epoch": 0.5502995248915513, "grad_norm": 30.997188568115234, "learning_rate": 2.751611303916708e-05, "loss": 1.7039, "step": 3330 }, { "epoch": 0.5519520760173517, "grad_norm": 63.2742919921875, "learning_rate": 2.759874400925467e-05, "loss": 1.6414, "step": 3340 }, { "epoch": 0.5536046271431523, "grad_norm": 13.54851245880127, "learning_rate": 2.7681374979342256e-05, "loss": 1.741, "step": 3350 }, { "epoch": 0.5552571782689527, "grad_norm": 31.921226501464844, "learning_rate": 2.776400594942985e-05, "loss": 1.65, "step": 3360 }, { "epoch": 0.5569097293947531, "grad_norm": 20.03881072998047, "learning_rate": 2.7846636919517436e-05, "loss": 1.6868, "step": 3370 }, { "epoch": 0.5585622805205536, "grad_norm": 11.299986839294434, "learning_rate": 2.7929267889605027e-05, "loss": 1.7552, "step": 3380 }, { "epoch": 0.5602148316463541, "grad_norm": 11.070747375488281, "learning_rate": 2.8011898859692613e-05, "loss": 1.6897, "step": 3390 }, { "epoch": 0.5618673827721545, "grad_norm": 8.979963302612305, "learning_rate": 2.8094529829780204e-05, "loss": 1.6199, "step": 3400 }, { "epoch": 0.563519933897955, "grad_norm": 11.309082984924316, "learning_rate": 2.817716079986779e-05, "loss": 1.8134, "step": 3410 }, { "epoch": 0.5651724850237554, "grad_norm": 17.12977409362793, "learning_rate": 2.825979176995538e-05, "loss": 1.6499, "step": 3420 }, { "epoch": 0.5668250361495559, "grad_norm": 6.682351589202881, "learning_rate": 2.8342422740042967e-05, "loss": 1.6524, "step": 3430 }, { "epoch": 0.5684775872753564, "grad_norm": 20.999141693115234, "learning_rate": 2.8425053710130557e-05, "loss": 1.6885, "step": 3440 }, { "epoch": 0.5701301384011568, "grad_norm": 10.989826202392578, "learning_rate": 2.850768468021815e-05, "loss": 1.5895, "step": 3450 }, { "epoch": 0.5717826895269572, "grad_norm": 11.188637733459473, "learning_rate": 2.8590315650305738e-05, "loss": 1.7236, "step": 3460 }, { "epoch": 0.5734352406527577, "grad_norm": 11.868768692016602, "learning_rate": 2.8672946620393324e-05, "loss": 1.7752, "step": 3470 }, { "epoch": 0.5750877917785582, "grad_norm": 13.934696197509766, "learning_rate": 2.8755577590480915e-05, "loss": 1.6321, "step": 3480 }, { "epoch": 0.5767403429043586, "grad_norm": 10.623023986816406, "learning_rate": 2.88382085605685e-05, "loss": 1.7567, "step": 3490 }, { "epoch": 0.578392894030159, "grad_norm": 21.444114685058594, "learning_rate": 2.892083953065609e-05, "loss": 1.7353, "step": 3500 }, { "epoch": 0.5800454451559595, "grad_norm": 22.53701400756836, "learning_rate": 2.900347050074368e-05, "loss": 1.6643, "step": 3510 }, { "epoch": 0.58169799628176, "grad_norm": 8.35075855255127, "learning_rate": 2.908610147083127e-05, "loss": 1.6099, "step": 3520 }, { "epoch": 0.5833505474075604, "grad_norm": 10.763632774353027, "learning_rate": 2.9168732440918855e-05, "loss": 1.7619, "step": 3530 }, { "epoch": 0.5850030985333609, "grad_norm": 10.737512588500977, "learning_rate": 2.925136341100645e-05, "loss": 1.6345, "step": 3540 }, { "epoch": 0.5866556496591613, "grad_norm": 15.459436416625977, "learning_rate": 2.933399438109404e-05, "loss": 1.6186, "step": 3550 }, { "epoch": 0.5883082007849618, "grad_norm": 16.63528823852539, "learning_rate": 2.9416625351181626e-05, "loss": 1.7349, "step": 3560 }, { "epoch": 0.5899607519107622, "grad_norm": 12.49666690826416, "learning_rate": 2.9499256321269213e-05, "loss": 1.7992, "step": 3570 }, { "epoch": 0.5916133030365627, "grad_norm": 30.694421768188477, "learning_rate": 2.9581887291356803e-05, "loss": 1.7004, "step": 3580 }, { "epoch": 0.5932658541623631, "grad_norm": 12.530024528503418, "learning_rate": 2.966451826144439e-05, "loss": 1.5653, "step": 3590 }, { "epoch": 0.5949184052881636, "grad_norm": 10.325366020202637, "learning_rate": 2.974714923153198e-05, "loss": 1.6872, "step": 3600 }, { "epoch": 0.5965709564139641, "grad_norm": 17.523683547973633, "learning_rate": 2.9829780201619566e-05, "loss": 1.5351, "step": 3610 }, { "epoch": 0.5982235075397645, "grad_norm": 12.359028816223145, "learning_rate": 2.9912411171707157e-05, "loss": 1.6894, "step": 3620 }, { "epoch": 0.5998760586655649, "grad_norm": 9.718979835510254, "learning_rate": 2.9995042141794743e-05, "loss": 1.4912, "step": 3630 }, { "epoch": 0.6015286097913655, "grad_norm": 12.050354957580566, "learning_rate": 3.0077673111882337e-05, "loss": 1.6147, "step": 3640 }, { "epoch": 0.6031811609171659, "grad_norm": 73.88978576660156, "learning_rate": 3.0160304081969927e-05, "loss": 1.639, "step": 3650 }, { "epoch": 0.6048337120429663, "grad_norm": 12.085061073303223, "learning_rate": 3.0242935052057514e-05, "loss": 1.7605, "step": 3660 }, { "epoch": 0.6064862631687667, "grad_norm": 12.490886688232422, "learning_rate": 3.03255660221451e-05, "loss": 1.6698, "step": 3670 }, { "epoch": 0.6081388142945673, "grad_norm": 11.048460006713867, "learning_rate": 3.040819699223269e-05, "loss": 1.7394, "step": 3680 }, { "epoch": 0.6097913654203677, "grad_norm": 9.499938011169434, "learning_rate": 3.0490827962320277e-05, "loss": 1.8048, "step": 3690 }, { "epoch": 0.6114439165461681, "grad_norm": 17.809703826904297, "learning_rate": 3.0573458932407864e-05, "loss": 1.768, "step": 3700 }, { "epoch": 0.6130964676719686, "grad_norm": 45.0641975402832, "learning_rate": 3.0656089902495454e-05, "loss": 1.7266, "step": 3710 }, { "epoch": 0.6147490187977691, "grad_norm": 37.341739654541016, "learning_rate": 3.0738720872583045e-05, "loss": 1.7567, "step": 3720 }, { "epoch": 0.6164015699235695, "grad_norm": 8.673922538757324, "learning_rate": 3.0821351842670635e-05, "loss": 1.8379, "step": 3730 }, { "epoch": 0.61805412104937, "grad_norm": 8.250535011291504, "learning_rate": 3.0903982812758225e-05, "loss": 1.7141, "step": 3740 }, { "epoch": 0.6197066721751704, "grad_norm": 22.169246673583984, "learning_rate": 3.0986613782845815e-05, "loss": 1.6403, "step": 3750 }, { "epoch": 0.6213592233009708, "grad_norm": 8.162503242492676, "learning_rate": 3.1069244752933405e-05, "loss": 1.7723, "step": 3760 }, { "epoch": 0.6230117744267714, "grad_norm": 20.341426849365234, "learning_rate": 3.115187572302099e-05, "loss": 1.7345, "step": 3770 }, { "epoch": 0.6246643255525718, "grad_norm": 10.553240776062012, "learning_rate": 3.123450669310858e-05, "loss": 1.8238, "step": 3780 }, { "epoch": 0.6263168766783722, "grad_norm": 15.004631042480469, "learning_rate": 3.131713766319617e-05, "loss": 1.5615, "step": 3790 }, { "epoch": 0.6279694278041726, "grad_norm": 9.84469223022461, "learning_rate": 3.139976863328375e-05, "loss": 1.724, "step": 3800 }, { "epoch": 0.6296219789299732, "grad_norm": 10.025922775268555, "learning_rate": 3.148239960337134e-05, "loss": 1.729, "step": 3810 }, { "epoch": 0.6312745300557736, "grad_norm": 11.77392292022705, "learning_rate": 3.156503057345893e-05, "loss": 1.7107, "step": 3820 }, { "epoch": 0.632927081181574, "grad_norm": 10.818305969238281, "learning_rate": 3.164766154354652e-05, "loss": 1.5897, "step": 3830 }, { "epoch": 0.6345796323073745, "grad_norm": 10.746159553527832, "learning_rate": 3.173029251363411e-05, "loss": 1.7638, "step": 3840 }, { "epoch": 0.636232183433175, "grad_norm": 13.713776588439941, "learning_rate": 3.18129234837217e-05, "loss": 1.7332, "step": 3850 }, { "epoch": 0.6378847345589754, "grad_norm": 10.594388008117676, "learning_rate": 3.1895554453809286e-05, "loss": 1.8471, "step": 3860 }, { "epoch": 0.6395372856847759, "grad_norm": 16.44990348815918, "learning_rate": 3.197818542389688e-05, "loss": 1.7023, "step": 3870 }, { "epoch": 0.6411898368105763, "grad_norm": 9.56428050994873, "learning_rate": 3.206081639398447e-05, "loss": 1.7214, "step": 3880 }, { "epoch": 0.6428423879363768, "grad_norm": 13.247419357299805, "learning_rate": 3.214344736407206e-05, "loss": 1.6734, "step": 3890 }, { "epoch": 0.6444949390621773, "grad_norm": 13.604035377502441, "learning_rate": 3.222607833415964e-05, "loss": 1.6755, "step": 3900 }, { "epoch": 0.6461474901879777, "grad_norm": 16.204240798950195, "learning_rate": 3.230870930424723e-05, "loss": 1.7442, "step": 3910 }, { "epoch": 0.6478000413137781, "grad_norm": 22.446441650390625, "learning_rate": 3.239134027433483e-05, "loss": 1.7222, "step": 3920 }, { "epoch": 0.6494525924395786, "grad_norm": 13.957686424255371, "learning_rate": 3.247397124442241e-05, "loss": 1.6194, "step": 3930 }, { "epoch": 0.6511051435653791, "grad_norm": 17.994192123413086, "learning_rate": 3.255660221451e-05, "loss": 1.7454, "step": 3940 }, { "epoch": 0.6527576946911795, "grad_norm": 10.868365287780762, "learning_rate": 3.263923318459759e-05, "loss": 1.707, "step": 3950 }, { "epoch": 0.6544102458169799, "grad_norm": 8.294291496276855, "learning_rate": 3.2721864154685175e-05, "loss": 1.7903, "step": 3960 }, { "epoch": 0.6560627969427805, "grad_norm": 10.930803298950195, "learning_rate": 3.2804495124772765e-05, "loss": 1.6172, "step": 3970 }, { "epoch": 0.6577153480685809, "grad_norm": 74.83100128173828, "learning_rate": 3.2887126094860355e-05, "loss": 1.7242, "step": 3980 }, { "epoch": 0.6593678991943813, "grad_norm": 10.721229553222656, "learning_rate": 3.2969757064947945e-05, "loss": 1.678, "step": 3990 }, { "epoch": 0.6610204503201818, "grad_norm": 22.046354293823242, "learning_rate": 3.305238803503553e-05, "loss": 1.5762, "step": 4000 }, { "epoch": 0.6626730014459823, "grad_norm": 8.417013168334961, "learning_rate": 3.313501900512312e-05, "loss": 1.6341, "step": 4010 }, { "epoch": 0.6643255525717827, "grad_norm": 9.018980026245117, "learning_rate": 3.3217649975210715e-05, "loss": 1.5761, "step": 4020 }, { "epoch": 0.6659781036975831, "grad_norm": 19.58799171447754, "learning_rate": 3.33002809452983e-05, "loss": 1.6891, "step": 4030 }, { "epoch": 0.6676306548233836, "grad_norm": 13.492208480834961, "learning_rate": 3.338291191538589e-05, "loss": 1.6399, "step": 4040 }, { "epoch": 0.669283205949184, "grad_norm": 10.820428848266602, "learning_rate": 3.346554288547348e-05, "loss": 1.7323, "step": 4050 }, { "epoch": 0.6709357570749845, "grad_norm": 24.102577209472656, "learning_rate": 3.354817385556106e-05, "loss": 1.7554, "step": 4060 }, { "epoch": 0.672588308200785, "grad_norm": 10.357276916503906, "learning_rate": 3.363080482564865e-05, "loss": 1.7642, "step": 4070 }, { "epoch": 0.6742408593265854, "grad_norm": 16.072227478027344, "learning_rate": 3.371343579573624e-05, "loss": 1.6417, "step": 4080 }, { "epoch": 0.6758934104523858, "grad_norm": 12.857048034667969, "learning_rate": 3.379606676582383e-05, "loss": 1.7107, "step": 4090 }, { "epoch": 0.6775459615781864, "grad_norm": 10.534919738769531, "learning_rate": 3.3878697735911416e-05, "loss": 1.7853, "step": 4100 }, { "epoch": 0.6791985127039868, "grad_norm": 22.40790367126465, "learning_rate": 3.396132870599901e-05, "loss": 1.7863, "step": 4110 }, { "epoch": 0.6808510638297872, "grad_norm": 19.036958694458008, "learning_rate": 3.4043959676086604e-05, "loss": 1.5541, "step": 4120 }, { "epoch": 0.6825036149555876, "grad_norm": 12.263296127319336, "learning_rate": 3.412659064617419e-05, "loss": 1.7194, "step": 4130 }, { "epoch": 0.6841561660813882, "grad_norm": 13.276509284973145, "learning_rate": 3.420922161626178e-05, "loss": 1.7551, "step": 4140 }, { "epoch": 0.6858087172071886, "grad_norm": 21.00751304626465, "learning_rate": 3.429185258634937e-05, "loss": 1.7179, "step": 4150 }, { "epoch": 0.687461268332989, "grad_norm": 27.047283172607422, "learning_rate": 3.437448355643695e-05, "loss": 1.7047, "step": 4160 }, { "epoch": 0.6891138194587895, "grad_norm": 12.669699668884277, "learning_rate": 3.445711452652454e-05, "loss": 1.6317, "step": 4170 }, { "epoch": 0.69076637058459, "grad_norm": 16.105510711669922, "learning_rate": 3.453974549661213e-05, "loss": 1.6754, "step": 4180 }, { "epoch": 0.6924189217103904, "grad_norm": 9.046321868896484, "learning_rate": 3.462237646669972e-05, "loss": 1.5878, "step": 4190 }, { "epoch": 0.6940714728361909, "grad_norm": 18.94280433654785, "learning_rate": 3.4705007436787304e-05, "loss": 1.6219, "step": 4200 }, { "epoch": 0.6957240239619913, "grad_norm": 11.86888313293457, "learning_rate": 3.47876384068749e-05, "loss": 1.867, "step": 4210 }, { "epoch": 0.6973765750877918, "grad_norm": 11.21365737915039, "learning_rate": 3.487026937696249e-05, "loss": 1.7011, "step": 4220 }, { "epoch": 0.6990291262135923, "grad_norm": 36.31560134887695, "learning_rate": 3.4952900347050075e-05, "loss": 1.6172, "step": 4230 }, { "epoch": 0.7006816773393927, "grad_norm": 7.454902648925781, "learning_rate": 3.5035531317137665e-05, "loss": 1.682, "step": 4240 }, { "epoch": 0.7023342284651931, "grad_norm": 8.024889945983887, "learning_rate": 3.5118162287225255e-05, "loss": 1.6474, "step": 4250 }, { "epoch": 0.7039867795909936, "grad_norm": 11.65849494934082, "learning_rate": 3.520079325731284e-05, "loss": 1.6873, "step": 4260 }, { "epoch": 0.7056393307167941, "grad_norm": 9.83031940460205, "learning_rate": 3.528342422740043e-05, "loss": 1.6696, "step": 4270 }, { "epoch": 0.7072918818425945, "grad_norm": 27.40468978881836, "learning_rate": 3.536605519748802e-05, "loss": 1.6058, "step": 4280 }, { "epoch": 0.7089444329683949, "grad_norm": 7.991322994232178, "learning_rate": 3.544868616757561e-05, "loss": 1.6647, "step": 4290 }, { "epoch": 0.7105969840941954, "grad_norm": 17.9750919342041, "learning_rate": 3.55313171376632e-05, "loss": 1.7095, "step": 4300 }, { "epoch": 0.7122495352199959, "grad_norm": 33.60334777832031, "learning_rate": 3.561394810775079e-05, "loss": 1.6818, "step": 4310 }, { "epoch": 0.7139020863457963, "grad_norm": 12.004261016845703, "learning_rate": 3.569657907783838e-05, "loss": 1.6787, "step": 4320 }, { "epoch": 0.7155546374715968, "grad_norm": 8.619003295898438, "learning_rate": 3.577921004792596e-05, "loss": 1.6197, "step": 4330 }, { "epoch": 0.7172071885973972, "grad_norm": 11.942529678344727, "learning_rate": 3.586184101801355e-05, "loss": 1.6431, "step": 4340 }, { "epoch": 0.7188597397231977, "grad_norm": 8.107340812683105, "learning_rate": 3.594447198810114e-05, "loss": 1.6827, "step": 4350 }, { "epoch": 0.7205122908489981, "grad_norm": 11.563468933105469, "learning_rate": 3.602710295818873e-05, "loss": 1.5576, "step": 4360 }, { "epoch": 0.7221648419747986, "grad_norm": 17.708297729492188, "learning_rate": 3.610973392827632e-05, "loss": 1.6501, "step": 4370 }, { "epoch": 0.723817393100599, "grad_norm": 9.903080940246582, "learning_rate": 3.619236489836391e-05, "loss": 1.604, "step": 4380 }, { "epoch": 0.7254699442263995, "grad_norm": 36.305904388427734, "learning_rate": 3.62749958684515e-05, "loss": 1.6622, "step": 4390 }, { "epoch": 0.7271224953522, "grad_norm": 17.942659378051758, "learning_rate": 3.635762683853909e-05, "loss": 1.5846, "step": 4400 }, { "epoch": 0.7287750464780004, "grad_norm": 15.10368537902832, "learning_rate": 3.644025780862668e-05, "loss": 1.6528, "step": 4410 }, { "epoch": 0.7304275976038008, "grad_norm": 11.274897575378418, "learning_rate": 3.652288877871427e-05, "loss": 1.6126, "step": 4420 }, { "epoch": 0.7320801487296014, "grad_norm": 9.983580589294434, "learning_rate": 3.660551974880185e-05, "loss": 1.7114, "step": 4430 }, { "epoch": 0.7337326998554018, "grad_norm": 6.765944480895996, "learning_rate": 3.668815071888944e-05, "loss": 1.6243, "step": 4440 }, { "epoch": 0.7353852509812022, "grad_norm": 6.97224760055542, "learning_rate": 3.677078168897703e-05, "loss": 1.6103, "step": 4450 }, { "epoch": 0.7370378021070026, "grad_norm": 18.04844856262207, "learning_rate": 3.6853412659064615e-05, "loss": 1.5527, "step": 4460 }, { "epoch": 0.7386903532328032, "grad_norm": 16.644474029541016, "learning_rate": 3.6936043629152205e-05, "loss": 1.7021, "step": 4470 }, { "epoch": 0.7403429043586036, "grad_norm": 12.994518280029297, "learning_rate": 3.7018674599239795e-05, "loss": 1.779, "step": 4480 }, { "epoch": 0.741995455484404, "grad_norm": 8.77868938446045, "learning_rate": 3.7101305569327385e-05, "loss": 1.6658, "step": 4490 }, { "epoch": 0.7436480066102045, "grad_norm": 16.25444793701172, "learning_rate": 3.7183936539414975e-05, "loss": 1.7969, "step": 4500 }, { "epoch": 0.745300557736005, "grad_norm": 15.957270622253418, "learning_rate": 3.7266567509502566e-05, "loss": 1.6981, "step": 4510 }, { "epoch": 0.7469531088618054, "grad_norm": 8.807640075683594, "learning_rate": 3.7349198479590156e-05, "loss": 1.8519, "step": 4520 }, { "epoch": 0.7486056599876059, "grad_norm": 13.07224178314209, "learning_rate": 3.743182944967774e-05, "loss": 1.8313, "step": 4530 }, { "epoch": 0.7502582111134063, "grad_norm": 14.359870910644531, "learning_rate": 3.751446041976533e-05, "loss": 1.7593, "step": 4540 }, { "epoch": 0.7519107622392067, "grad_norm": 10.050626754760742, "learning_rate": 3.759709138985292e-05, "loss": 1.7225, "step": 4550 }, { "epoch": 0.7535633133650073, "grad_norm": 20.963890075683594, "learning_rate": 3.76797223599405e-05, "loss": 1.7461, "step": 4560 }, { "epoch": 0.7552158644908077, "grad_norm": 14.734147071838379, "learning_rate": 3.776235333002809e-05, "loss": 1.7066, "step": 4570 }, { "epoch": 0.7568684156166081, "grad_norm": 12.787677764892578, "learning_rate": 3.784498430011569e-05, "loss": 1.7834, "step": 4580 }, { "epoch": 0.7585209667424085, "grad_norm": 11.134603500366211, "learning_rate": 3.792761527020327e-05, "loss": 1.6588, "step": 4590 }, { "epoch": 0.7601735178682091, "grad_norm": 7.746461391448975, "learning_rate": 3.8010246240290863e-05, "loss": 1.5581, "step": 4600 }, { "epoch": 0.7618260689940095, "grad_norm": 8.252470016479492, "learning_rate": 3.8092877210378454e-05, "loss": 1.7961, "step": 4610 }, { "epoch": 0.7634786201198099, "grad_norm": 18.933685302734375, "learning_rate": 3.8175508180466044e-05, "loss": 1.5873, "step": 4620 }, { "epoch": 0.7651311712456104, "grad_norm": 8.816574096679688, "learning_rate": 3.825813915055363e-05, "loss": 1.6589, "step": 4630 }, { "epoch": 0.7667837223714109, "grad_norm": 36.53535842895508, "learning_rate": 3.834077012064122e-05, "loss": 1.6683, "step": 4640 }, { "epoch": 0.7684362734972113, "grad_norm": 18.802400588989258, "learning_rate": 3.842340109072881e-05, "loss": 1.5741, "step": 4650 }, { "epoch": 0.7700888246230118, "grad_norm": 15.650837898254395, "learning_rate": 3.850603206081639e-05, "loss": 1.7178, "step": 4660 }, { "epoch": 0.7717413757488122, "grad_norm": 9.750154495239258, "learning_rate": 3.858866303090398e-05, "loss": 1.5336, "step": 4670 }, { "epoch": 0.7733939268746127, "grad_norm": 10.917224884033203, "learning_rate": 3.867129400099158e-05, "loss": 1.6023, "step": 4680 }, { "epoch": 0.7750464780004132, "grad_norm": 15.657227516174316, "learning_rate": 3.875392497107916e-05, "loss": 1.7192, "step": 4690 }, { "epoch": 0.7766990291262136, "grad_norm": 8.965598106384277, "learning_rate": 3.883655594116675e-05, "loss": 1.7072, "step": 4700 }, { "epoch": 0.778351580252014, "grad_norm": 5.406772613525391, "learning_rate": 3.891918691125434e-05, "loss": 1.617, "step": 4710 }, { "epoch": 0.7800041313778145, "grad_norm": 40.23686599731445, "learning_rate": 3.900181788134193e-05, "loss": 1.5586, "step": 4720 }, { "epoch": 0.781656682503615, "grad_norm": 6.773040294647217, "learning_rate": 3.9084448851429515e-05, "loss": 1.6325, "step": 4730 }, { "epoch": 0.7833092336294154, "grad_norm": 15.4874906539917, "learning_rate": 3.9167079821517105e-05, "loss": 1.8084, "step": 4740 }, { "epoch": 0.7849617847552158, "grad_norm": 9.330727577209473, "learning_rate": 3.9249710791604695e-05, "loss": 1.6506, "step": 4750 }, { "epoch": 0.7866143358810164, "grad_norm": 8.181884765625, "learning_rate": 3.933234176169228e-05, "loss": 1.74, "step": 4760 }, { "epoch": 0.7882668870068168, "grad_norm": 18.493993759155273, "learning_rate": 3.9414972731779876e-05, "loss": 1.5708, "step": 4770 }, { "epoch": 0.7899194381326172, "grad_norm": 8.724126815795898, "learning_rate": 3.9497603701867466e-05, "loss": 1.6966, "step": 4780 }, { "epoch": 0.7915719892584177, "grad_norm": 11.860855102539062, "learning_rate": 3.958023467195505e-05, "loss": 1.6671, "step": 4790 }, { "epoch": 0.7932245403842182, "grad_norm": 15.39936351776123, "learning_rate": 3.966286564204264e-05, "loss": 1.6334, "step": 4800 }, { "epoch": 0.7948770915100186, "grad_norm": 13.233650207519531, "learning_rate": 3.974549661213023e-05, "loss": 1.7003, "step": 4810 }, { "epoch": 0.796529642635819, "grad_norm": 13.286338806152344, "learning_rate": 3.982812758221782e-05, "loss": 1.6586, "step": 4820 }, { "epoch": 0.7981821937616195, "grad_norm": 70.44236755371094, "learning_rate": 3.99107585523054e-05, "loss": 1.7162, "step": 4830 }, { "epoch": 0.7998347448874199, "grad_norm": 38.03678894042969, "learning_rate": 3.999338952239299e-05, "loss": 1.6638, "step": 4840 }, { "epoch": 0.8014872960132204, "grad_norm": 9.280917167663574, "learning_rate": 4.0076020492480584e-05, "loss": 1.5643, "step": 4850 }, { "epoch": 0.8031398471390209, "grad_norm": 46.079708099365234, "learning_rate": 4.015865146256817e-05, "loss": 1.7017, "step": 4860 }, { "epoch": 0.8047923982648213, "grad_norm": 7.869788646697998, "learning_rate": 4.0241282432655764e-05, "loss": 1.6566, "step": 4870 }, { "epoch": 0.8064449493906217, "grad_norm": 13.424615859985352, "learning_rate": 4.0323913402743354e-05, "loss": 1.7328, "step": 4880 }, { "epoch": 0.8080975005164223, "grad_norm": 16.283950805664062, "learning_rate": 4.040654437283094e-05, "loss": 1.6783, "step": 4890 }, { "epoch": 0.8097500516422227, "grad_norm": 53.05949020385742, "learning_rate": 4.048917534291853e-05, "loss": 1.6176, "step": 4900 }, { "epoch": 0.8114026027680231, "grad_norm": 11.25322151184082, "learning_rate": 4.057180631300612e-05, "loss": 1.6248, "step": 4910 }, { "epoch": 0.8130551538938235, "grad_norm": 30.585168838500977, "learning_rate": 4.065443728309371e-05, "loss": 1.6711, "step": 4920 }, { "epoch": 0.8147077050196241, "grad_norm": 8.419675827026367, "learning_rate": 4.073706825318129e-05, "loss": 1.6889, "step": 4930 }, { "epoch": 0.8163602561454245, "grad_norm": 32.17693328857422, "learning_rate": 4.081969922326888e-05, "loss": 1.7163, "step": 4940 }, { "epoch": 0.8180128072712249, "grad_norm": 11.359280586242676, "learning_rate": 4.090233019335647e-05, "loss": 1.6454, "step": 4950 }, { "epoch": 0.8196653583970254, "grad_norm": 15.660289764404297, "learning_rate": 4.098496116344406e-05, "loss": 1.7507, "step": 4960 }, { "epoch": 0.8213179095228259, "grad_norm": 10.861181259155273, "learning_rate": 4.106759213353165e-05, "loss": 1.648, "step": 4970 }, { "epoch": 0.8229704606486263, "grad_norm": 8.45966625213623, "learning_rate": 4.115022310361924e-05, "loss": 1.6779, "step": 4980 }, { "epoch": 0.8246230117744268, "grad_norm": 8.247032165527344, "learning_rate": 4.1232854073706825e-05, "loss": 1.7897, "step": 4990 }, { "epoch": 0.8262755629002272, "grad_norm": 10.746397018432617, "learning_rate": 4.1315485043794416e-05, "loss": 1.6776, "step": 5000 }, { "epoch": 0.8279281140260277, "grad_norm": 11.24117660522461, "learning_rate": 4.1398116013882006e-05, "loss": 1.6349, "step": 5010 }, { "epoch": 0.8295806651518282, "grad_norm": 13.31236457824707, "learning_rate": 4.1480746983969596e-05, "loss": 1.5985, "step": 5020 }, { "epoch": 0.8312332162776286, "grad_norm": 32.01264572143555, "learning_rate": 4.156337795405718e-05, "loss": 1.7555, "step": 5030 }, { "epoch": 0.832885767403429, "grad_norm": 9.02853775024414, "learning_rate": 4.164600892414477e-05, "loss": 1.5571, "step": 5040 }, { "epoch": 0.8345383185292296, "grad_norm": 20.635509490966797, "learning_rate": 4.172863989423236e-05, "loss": 1.7462, "step": 5050 }, { "epoch": 0.83619086965503, "grad_norm": 11.947609901428223, "learning_rate": 4.181127086431995e-05, "loss": 1.6536, "step": 5060 }, { "epoch": 0.8378434207808304, "grad_norm": 7.472456932067871, "learning_rate": 4.189390183440754e-05, "loss": 1.8139, "step": 5070 }, { "epoch": 0.8394959719066308, "grad_norm": 11.594853401184082, "learning_rate": 4.197653280449513e-05, "loss": 1.5398, "step": 5080 }, { "epoch": 0.8411485230324313, "grad_norm": 22.656166076660156, "learning_rate": 4.2059163774582713e-05, "loss": 1.7391, "step": 5090 }, { "epoch": 0.8428010741582318, "grad_norm": 15.495370864868164, "learning_rate": 4.2141794744670304e-05, "loss": 1.8129, "step": 5100 }, { "epoch": 0.8444536252840322, "grad_norm": 6.661768436431885, "learning_rate": 4.2224425714757894e-05, "loss": 1.6834, "step": 5110 }, { "epoch": 0.8461061764098327, "grad_norm": 7.477358818054199, "learning_rate": 4.2307056684845484e-05, "loss": 1.5815, "step": 5120 }, { "epoch": 0.8477587275356331, "grad_norm": 11.148330688476562, "learning_rate": 4.238968765493307e-05, "loss": 1.6634, "step": 5130 }, { "epoch": 0.8494112786614336, "grad_norm": 9.076679229736328, "learning_rate": 4.247231862502066e-05, "loss": 1.6155, "step": 5140 }, { "epoch": 0.851063829787234, "grad_norm": 10.360162734985352, "learning_rate": 4.2554949595108254e-05, "loss": 1.5972, "step": 5150 }, { "epoch": 0.8527163809130345, "grad_norm": 10.570571899414062, "learning_rate": 4.263758056519584e-05, "loss": 1.7933, "step": 5160 }, { "epoch": 0.8543689320388349, "grad_norm": 7.64302396774292, "learning_rate": 4.272021153528343e-05, "loss": 1.7115, "step": 5170 }, { "epoch": 0.8560214831646354, "grad_norm": 11.577961921691895, "learning_rate": 4.280284250537102e-05, "loss": 1.7791, "step": 5180 }, { "epoch": 0.8576740342904359, "grad_norm": 9.828375816345215, "learning_rate": 4.28854734754586e-05, "loss": 1.7543, "step": 5190 }, { "epoch": 0.8593265854162363, "grad_norm": 10.680025100708008, "learning_rate": 4.296810444554619e-05, "loss": 1.569, "step": 5200 }, { "epoch": 0.8609791365420367, "grad_norm": 10.61168384552002, "learning_rate": 4.305073541563378e-05, "loss": 1.6397, "step": 5210 }, { "epoch": 0.8626316876678373, "grad_norm": 13.09295654296875, "learning_rate": 4.313336638572137e-05, "loss": 1.663, "step": 5220 }, { "epoch": 0.8642842387936377, "grad_norm": 10.244070053100586, "learning_rate": 4.3215997355808955e-05, "loss": 1.8205, "step": 5230 }, { "epoch": 0.8659367899194381, "grad_norm": 14.051331520080566, "learning_rate": 4.329862832589655e-05, "loss": 1.7455, "step": 5240 }, { "epoch": 0.8675893410452385, "grad_norm": 14.486235618591309, "learning_rate": 4.338125929598414e-05, "loss": 1.6745, "step": 5250 }, { "epoch": 0.8692418921710391, "grad_norm": 10.930896759033203, "learning_rate": 4.3463890266071726e-05, "loss": 1.6661, "step": 5260 }, { "epoch": 0.8708944432968395, "grad_norm": 23.3206729888916, "learning_rate": 4.3546521236159316e-05, "loss": 1.6653, "step": 5270 }, { "epoch": 0.8725469944226399, "grad_norm": 22.273244857788086, "learning_rate": 4.3629152206246906e-05, "loss": 1.7524, "step": 5280 }, { "epoch": 0.8741995455484404, "grad_norm": 11.886351585388184, "learning_rate": 4.371178317633449e-05, "loss": 1.7256, "step": 5290 }, { "epoch": 0.8758520966742409, "grad_norm": 8.98186206817627, "learning_rate": 4.379441414642208e-05, "loss": 1.7292, "step": 5300 }, { "epoch": 0.8775046478000413, "grad_norm": 10.950017929077148, "learning_rate": 4.387704511650967e-05, "loss": 1.7558, "step": 5310 }, { "epoch": 0.8791571989258418, "grad_norm": 10.48299789428711, "learning_rate": 4.395967608659726e-05, "loss": 1.7535, "step": 5320 }, { "epoch": 0.8808097500516422, "grad_norm": 22.365188598632812, "learning_rate": 4.4042307056684843e-05, "loss": 1.7458, "step": 5330 }, { "epoch": 0.8824623011774427, "grad_norm": 9.840352058410645, "learning_rate": 4.412493802677244e-05, "loss": 1.668, "step": 5340 }, { "epoch": 0.8841148523032432, "grad_norm": 7.298482894897461, "learning_rate": 4.4207568996860024e-05, "loss": 1.672, "step": 5350 }, { "epoch": 0.8857674034290436, "grad_norm": 26.250947952270508, "learning_rate": 4.4290199966947614e-05, "loss": 1.7415, "step": 5360 }, { "epoch": 0.887419954554844, "grad_norm": 15.844700813293457, "learning_rate": 4.4372830937035204e-05, "loss": 1.6139, "step": 5370 }, { "epoch": 0.8890725056806444, "grad_norm": 7.872781276702881, "learning_rate": 4.4455461907122794e-05, "loss": 1.6652, "step": 5380 }, { "epoch": 0.890725056806445, "grad_norm": 11.659095764160156, "learning_rate": 4.453809287721038e-05, "loss": 1.6276, "step": 5390 }, { "epoch": 0.8923776079322454, "grad_norm": 25.442895889282227, "learning_rate": 4.462072384729797e-05, "loss": 1.7076, "step": 5400 }, { "epoch": 0.8940301590580458, "grad_norm": 40.754371643066406, "learning_rate": 4.470335481738556e-05, "loss": 1.7066, "step": 5410 }, { "epoch": 0.8956827101838463, "grad_norm": 23.09174156188965, "learning_rate": 4.478598578747315e-05, "loss": 1.7866, "step": 5420 }, { "epoch": 0.8973352613096468, "grad_norm": 24.503602981567383, "learning_rate": 4.486861675756074e-05, "loss": 1.5919, "step": 5430 }, { "epoch": 0.8989878124354472, "grad_norm": 23.102312088012695, "learning_rate": 4.495124772764833e-05, "loss": 1.7015, "step": 5440 }, { "epoch": 0.9006403635612477, "grad_norm": 8.59858226776123, "learning_rate": 4.503387869773591e-05, "loss": 1.7018, "step": 5450 }, { "epoch": 0.9022929146870481, "grad_norm": 23.99196434020996, "learning_rate": 4.51165096678235e-05, "loss": 1.627, "step": 5460 }, { "epoch": 0.9039454658128486, "grad_norm": 14.380967140197754, "learning_rate": 4.519914063791109e-05, "loss": 1.5606, "step": 5470 }, { "epoch": 0.905598016938649, "grad_norm": 18.584856033325195, "learning_rate": 4.528177160799868e-05, "loss": 1.8168, "step": 5480 }, { "epoch": 0.9072505680644495, "grad_norm": 11.22840690612793, "learning_rate": 4.5364402578086266e-05, "loss": 1.612, "step": 5490 }, { "epoch": 0.9089031191902499, "grad_norm": 20.63224983215332, "learning_rate": 4.5447033548173856e-05, "loss": 1.6359, "step": 5500 }, { "epoch": 0.9105556703160504, "grad_norm": 9.386175155639648, "learning_rate": 4.5529664518261446e-05, "loss": 1.7047, "step": 5510 }, { "epoch": 0.9122082214418509, "grad_norm": 12.939002990722656, "learning_rate": 4.5612295488349036e-05, "loss": 1.6124, "step": 5520 }, { "epoch": 0.9138607725676513, "grad_norm": 10.644198417663574, "learning_rate": 4.5694926458436626e-05, "loss": 1.6824, "step": 5530 }, { "epoch": 0.9155133236934517, "grad_norm": 15.673925399780273, "learning_rate": 4.5777557428524216e-05, "loss": 1.7603, "step": 5540 }, { "epoch": 0.9171658748192523, "grad_norm": 10.065011024475098, "learning_rate": 4.58601883986118e-05, "loss": 1.804, "step": 5550 }, { "epoch": 0.9188184259450527, "grad_norm": 9.452924728393555, "learning_rate": 4.594281936869939e-05, "loss": 1.7851, "step": 5560 }, { "epoch": 0.9204709770708531, "grad_norm": 10.779854774475098, "learning_rate": 4.602545033878698e-05, "loss": 1.7237, "step": 5570 }, { "epoch": 0.9221235281966536, "grad_norm": 10.967293739318848, "learning_rate": 4.610808130887457e-05, "loss": 1.6173, "step": 5580 }, { "epoch": 0.9237760793224541, "grad_norm": 91.86517333984375, "learning_rate": 4.6190712278962154e-05, "loss": 1.6469, "step": 5590 }, { "epoch": 0.9254286304482545, "grad_norm": 4.923385143280029, "learning_rate": 4.6273343249049744e-05, "loss": 1.5892, "step": 5600 }, { "epoch": 0.927081181574055, "grad_norm": 19.811214447021484, "learning_rate": 4.6355974219137334e-05, "loss": 1.5998, "step": 5610 }, { "epoch": 0.9287337326998554, "grad_norm": 9.930047035217285, "learning_rate": 4.6438605189224924e-05, "loss": 1.6312, "step": 5620 }, { "epoch": 0.9303862838256558, "grad_norm": 20.98347282409668, "learning_rate": 4.6521236159312514e-05, "loss": 1.7513, "step": 5630 }, { "epoch": 0.9320388349514563, "grad_norm": 11.083711624145508, "learning_rate": 4.6603867129400104e-05, "loss": 1.7647, "step": 5640 }, { "epoch": 0.9336913860772568, "grad_norm": 15.540210723876953, "learning_rate": 4.668649809948769e-05, "loss": 1.606, "step": 5650 }, { "epoch": 0.9353439372030572, "grad_norm": 40.91162872314453, "learning_rate": 4.676912906957528e-05, "loss": 1.5821, "step": 5660 }, { "epoch": 0.9369964883288576, "grad_norm": 10.069815635681152, "learning_rate": 4.685176003966287e-05, "loss": 1.5963, "step": 5670 }, { "epoch": 0.9386490394546582, "grad_norm": 7.199189186096191, "learning_rate": 4.693439100975046e-05, "loss": 1.6151, "step": 5680 }, { "epoch": 0.9403015905804586, "grad_norm": 14.112994194030762, "learning_rate": 4.701702197983804e-05, "loss": 1.8498, "step": 5690 }, { "epoch": 0.941954141706259, "grad_norm": 17.025121688842773, "learning_rate": 4.709965294992563e-05, "loss": 1.5699, "step": 5700 }, { "epoch": 0.9436066928320594, "grad_norm": 22.465831756591797, "learning_rate": 4.718228392001322e-05, "loss": 1.7636, "step": 5710 }, { "epoch": 0.94525924395786, "grad_norm": 16.813859939575195, "learning_rate": 4.726491489010081e-05, "loss": 1.6191, "step": 5720 }, { "epoch": 0.9469117950836604, "grad_norm": 12.327750205993652, "learning_rate": 4.73475458601884e-05, "loss": 1.7167, "step": 5730 }, { "epoch": 0.9485643462094608, "grad_norm": 12.070796966552734, "learning_rate": 4.743017683027599e-05, "loss": 1.7759, "step": 5740 }, { "epoch": 0.9502168973352613, "grad_norm": 23.6120548248291, "learning_rate": 4.7512807800363576e-05, "loss": 1.781, "step": 5750 }, { "epoch": 0.9518694484610618, "grad_norm": 12.13965892791748, "learning_rate": 4.7595438770451166e-05, "loss": 1.4901, "step": 5760 }, { "epoch": 0.9535219995868622, "grad_norm": 10.353153228759766, "learning_rate": 4.7678069740538756e-05, "loss": 1.7373, "step": 5770 }, { "epoch": 0.9551745507126627, "grad_norm": 13.95853328704834, "learning_rate": 4.7760700710626346e-05, "loss": 1.771, "step": 5780 }, { "epoch": 0.9568271018384631, "grad_norm": 9.67155647277832, "learning_rate": 4.784333168071393e-05, "loss": 1.6026, "step": 5790 }, { "epoch": 0.9584796529642636, "grad_norm": 20.621294021606445, "learning_rate": 4.792596265080152e-05, "loss": 1.6695, "step": 5800 }, { "epoch": 0.9601322040900641, "grad_norm": 24.754667282104492, "learning_rate": 4.800859362088912e-05, "loss": 1.8208, "step": 5810 }, { "epoch": 0.9617847552158645, "grad_norm": 8.666321754455566, "learning_rate": 4.80912245909767e-05, "loss": 1.7137, "step": 5820 }, { "epoch": 0.9634373063416649, "grad_norm": 43.4861946105957, "learning_rate": 4.817385556106429e-05, "loss": 1.6497, "step": 5830 }, { "epoch": 0.9650898574674655, "grad_norm": 30.73326873779297, "learning_rate": 4.825648653115188e-05, "loss": 1.7121, "step": 5840 }, { "epoch": 0.9667424085932659, "grad_norm": 33.42872619628906, "learning_rate": 4.8339117501239464e-05, "loss": 1.8122, "step": 5850 }, { "epoch": 0.9683949597190663, "grad_norm": 27.492687225341797, "learning_rate": 4.8421748471327054e-05, "loss": 1.6811, "step": 5860 }, { "epoch": 0.9700475108448667, "grad_norm": 6.809346675872803, "learning_rate": 4.8504379441414644e-05, "loss": 1.6784, "step": 5870 }, { "epoch": 0.9717000619706673, "grad_norm": 6.992137432098389, "learning_rate": 4.8587010411502234e-05, "loss": 1.5981, "step": 5880 }, { "epoch": 0.9733526130964677, "grad_norm": 9.034411430358887, "learning_rate": 4.866964138158982e-05, "loss": 1.7404, "step": 5890 }, { "epoch": 0.9750051642222681, "grad_norm": 7.147427082061768, "learning_rate": 4.8752272351677415e-05, "loss": 1.719, "step": 5900 }, { "epoch": 0.9766577153480686, "grad_norm": 6.848790168762207, "learning_rate": 4.8834903321765005e-05, "loss": 1.6843, "step": 5910 }, { "epoch": 0.978310266473869, "grad_norm": 13.439620971679688, "learning_rate": 4.891753429185259e-05, "loss": 1.7116, "step": 5920 }, { "epoch": 0.9799628175996695, "grad_norm": 6.594333648681641, "learning_rate": 4.900016526194018e-05, "loss": 1.8479, "step": 5930 }, { "epoch": 0.98161536872547, "grad_norm": 7.724569320678711, "learning_rate": 4.908279623202777e-05, "loss": 1.6023, "step": 5940 }, { "epoch": 0.9832679198512704, "grad_norm": 41.57175064086914, "learning_rate": 4.916542720211535e-05, "loss": 1.7183, "step": 5950 }, { "epoch": 0.9849204709770708, "grad_norm": 16.64209747314453, "learning_rate": 4.924805817220294e-05, "loss": 1.6207, "step": 5960 }, { "epoch": 0.9865730221028713, "grad_norm": 13.659209251403809, "learning_rate": 4.933068914229053e-05, "loss": 1.6747, "step": 5970 }, { "epoch": 0.9882255732286718, "grad_norm": 7.966761589050293, "learning_rate": 4.941332011237812e-05, "loss": 1.6441, "step": 5980 }, { "epoch": 0.9898781243544722, "grad_norm": 5.628538131713867, "learning_rate": 4.9495951082465706e-05, "loss": 1.7499, "step": 5990 }, { "epoch": 0.9915306754802726, "grad_norm": 8.069196701049805, "learning_rate": 4.95785820525533e-05, "loss": 1.88, "step": 6000 }, { "epoch": 0.9931832266060732, "grad_norm": 51.40681457519531, "learning_rate": 4.966121302264089e-05, "loss": 1.6778, "step": 6010 }, { "epoch": 0.9948357777318736, "grad_norm": 6.667893886566162, "learning_rate": 4.9743843992728476e-05, "loss": 1.6331, "step": 6020 }, { "epoch": 0.996488328857674, "grad_norm": 8.195232391357422, "learning_rate": 4.9826474962816066e-05, "loss": 1.6024, "step": 6030 }, { "epoch": 0.9981408799834744, "grad_norm": 14.904173851013184, "learning_rate": 4.9909105932903657e-05, "loss": 1.7938, "step": 6040 }, { "epoch": 0.999793431109275, "grad_norm": 9.155719757080078, "learning_rate": 4.999173690299124e-05, "loss": 1.5715, "step": 6050 }, { "epoch": 0.999958686221855, "eval_accuracy": 0.2757837991204426, "eval_loss": 2.0155398845672607, "eval_runtime": 854.7614, "eval_samples_per_second": 32.987, "eval_steps_per_second": 8.247, "step": 6051 }, { "epoch": 1.0014459822350754, "grad_norm": 9.607635498046875, "learning_rate": 4.999173690299124e-05, "loss": 1.6654, "step": 6060 }, { "epoch": 1.003098533360876, "grad_norm": 5.138330459594727, "learning_rate": 4.998255568409262e-05, "loss": 1.5768, "step": 6070 }, { "epoch": 1.0047510844866763, "grad_norm": 28.794178009033203, "learning_rate": 4.9973374465194e-05, "loss": 1.6652, "step": 6080 }, { "epoch": 1.0064036356124768, "grad_norm": 14.260908126831055, "learning_rate": 4.996419324629538e-05, "loss": 1.5967, "step": 6090 }, { "epoch": 1.0080561867382771, "grad_norm": 11.221604347229004, "learning_rate": 4.995501202739676e-05, "loss": 1.6488, "step": 6100 }, { "epoch": 1.0097087378640777, "grad_norm": 8.829093933105469, "learning_rate": 4.994583080849814e-05, "loss": 1.6887, "step": 6110 }, { "epoch": 1.0113612889898782, "grad_norm": 10.191542625427246, "learning_rate": 4.9936649589599515e-05, "loss": 1.7274, "step": 6120 }, { "epoch": 1.0130138401156785, "grad_norm": 9.99532413482666, "learning_rate": 4.99274683707009e-05, "loss": 1.6879, "step": 6130 }, { "epoch": 1.014666391241479, "grad_norm": 26.136659622192383, "learning_rate": 4.9918287151802274e-05, "loss": 1.8429, "step": 6140 }, { "epoch": 1.0163189423672794, "grad_norm": 7.408060073852539, "learning_rate": 4.9909105932903657e-05, "loss": 1.6909, "step": 6150 }, { "epoch": 1.01797149349308, "grad_norm": 15.327078819274902, "learning_rate": 4.989992471400504e-05, "loss": 1.7461, "step": 6160 }, { "epoch": 1.0196240446188805, "grad_norm": 22.485273361206055, "learning_rate": 4.9890743495106415e-05, "loss": 1.767, "step": 6170 }, { "epoch": 1.0212765957446808, "grad_norm": 21.41472816467285, "learning_rate": 4.988156227620779e-05, "loss": 1.6902, "step": 6180 }, { "epoch": 1.0229291468704813, "grad_norm": 8.6235933303833, "learning_rate": 4.987238105730917e-05, "loss": 1.6762, "step": 6190 }, { "epoch": 1.0245816979962818, "grad_norm": 13.099306106567383, "learning_rate": 4.986319983841055e-05, "loss": 1.7881, "step": 6200 }, { "epoch": 1.0262342491220822, "grad_norm": 6.052161693572998, "learning_rate": 4.9854018619511925e-05, "loss": 1.5323, "step": 6210 }, { "epoch": 1.0278868002478827, "grad_norm": 9.501975059509277, "learning_rate": 4.984483740061331e-05, "loss": 1.6617, "step": 6220 }, { "epoch": 1.029539351373683, "grad_norm": 68.59492492675781, "learning_rate": 4.9835656181714684e-05, "loss": 1.655, "step": 6230 }, { "epoch": 1.0311919024994836, "grad_norm": 33.55989456176758, "learning_rate": 4.9826474962816066e-05, "loss": 1.7361, "step": 6240 }, { "epoch": 1.032844453625284, "grad_norm": 33.48898696899414, "learning_rate": 4.981729374391744e-05, "loss": 1.667, "step": 6250 }, { "epoch": 1.0344970047510844, "grad_norm": 21.978069305419922, "learning_rate": 4.9808112525018825e-05, "loss": 1.6726, "step": 6260 }, { "epoch": 1.036149555876885, "grad_norm": 27.184450149536133, "learning_rate": 4.979893130612021e-05, "loss": 1.6208, "step": 6270 }, { "epoch": 1.0378021070026855, "grad_norm": 12.480914115905762, "learning_rate": 4.9789750087221583e-05, "loss": 1.707, "step": 6280 }, { "epoch": 1.0394546581284858, "grad_norm": 9.19139575958252, "learning_rate": 4.9780568868322966e-05, "loss": 1.6405, "step": 6290 }, { "epoch": 1.0411072092542863, "grad_norm": 92.05270385742188, "learning_rate": 4.977138764942434e-05, "loss": 1.5893, "step": 6300 }, { "epoch": 1.0427597603800867, "grad_norm": 17.508121490478516, "learning_rate": 4.976220643052572e-05, "loss": 1.6037, "step": 6310 }, { "epoch": 1.0444123115058872, "grad_norm": 46.28389358520508, "learning_rate": 4.9753025211627094e-05, "loss": 1.5507, "step": 6320 }, { "epoch": 1.0460648626316877, "grad_norm": 26.94911003112793, "learning_rate": 4.9743843992728476e-05, "loss": 1.8003, "step": 6330 }, { "epoch": 1.047717413757488, "grad_norm": 7.172421932220459, "learning_rate": 4.973466277382985e-05, "loss": 1.7725, "step": 6340 }, { "epoch": 1.0493699648832886, "grad_norm": 13.676774978637695, "learning_rate": 4.9725481554931235e-05, "loss": 1.5975, "step": 6350 }, { "epoch": 1.0510225160090891, "grad_norm": 9.980374336242676, "learning_rate": 4.971630033603261e-05, "loss": 1.5381, "step": 6360 }, { "epoch": 1.0526750671348895, "grad_norm": 11.59304428100586, "learning_rate": 4.970711911713399e-05, "loss": 1.7605, "step": 6370 }, { "epoch": 1.05432761826069, "grad_norm": 5.923514366149902, "learning_rate": 4.969793789823537e-05, "loss": 1.6649, "step": 6380 }, { "epoch": 1.0559801693864903, "grad_norm": 10.02888298034668, "learning_rate": 4.968875667933675e-05, "loss": 1.7624, "step": 6390 }, { "epoch": 1.0576327205122908, "grad_norm": 24.110645294189453, "learning_rate": 4.9679575460438134e-05, "loss": 1.7263, "step": 6400 }, { "epoch": 1.0592852716380914, "grad_norm": 29.78762435913086, "learning_rate": 4.967039424153951e-05, "loss": 1.5858, "step": 6410 }, { "epoch": 1.0609378227638917, "grad_norm": 10.339710235595703, "learning_rate": 4.966121302264089e-05, "loss": 1.5688, "step": 6420 }, { "epoch": 1.0625903738896922, "grad_norm": 12.489291191101074, "learning_rate": 4.965203180374227e-05, "loss": 1.7094, "step": 6430 }, { "epoch": 1.0642429250154928, "grad_norm": 7.957202434539795, "learning_rate": 4.9642850584843645e-05, "loss": 1.6154, "step": 6440 }, { "epoch": 1.065895476141293, "grad_norm": 8.301098823547363, "learning_rate": 4.963366936594502e-05, "loss": 1.7572, "step": 6450 }, { "epoch": 1.0675480272670936, "grad_norm": 16.680442810058594, "learning_rate": 4.96244881470464e-05, "loss": 1.7111, "step": 6460 }, { "epoch": 1.069200578392894, "grad_norm": 8.970675468444824, "learning_rate": 4.961530692814778e-05, "loss": 1.7194, "step": 6470 }, { "epoch": 1.0708531295186945, "grad_norm": 16.222583770751953, "learning_rate": 4.960612570924916e-05, "loss": 1.6507, "step": 6480 }, { "epoch": 1.072505680644495, "grad_norm": 7.895381450653076, "learning_rate": 4.959694449035054e-05, "loss": 1.6932, "step": 6490 }, { "epoch": 1.0741582317702953, "grad_norm": 19.77764129638672, "learning_rate": 4.958776327145192e-05, "loss": 1.8433, "step": 6500 }, { "epoch": 1.0758107828960959, "grad_norm": 14.908355712890625, "learning_rate": 4.95785820525533e-05, "loss": 1.5765, "step": 6510 }, { "epoch": 1.0774633340218962, "grad_norm": 10.35196304321289, "learning_rate": 4.956940083365468e-05, "loss": 1.8107, "step": 6520 }, { "epoch": 1.0791158851476967, "grad_norm": 9.052248001098633, "learning_rate": 4.956021961475606e-05, "loss": 1.7856, "step": 6530 }, { "epoch": 1.0807684362734973, "grad_norm": 7.395838737487793, "learning_rate": 4.955103839585744e-05, "loss": 1.691, "step": 6540 }, { "epoch": 1.0824209873992976, "grad_norm": 36.31234359741211, "learning_rate": 4.954185717695882e-05, "loss": 1.6575, "step": 6550 }, { "epoch": 1.0840735385250981, "grad_norm": 19.741281509399414, "learning_rate": 4.9532675958060196e-05, "loss": 1.7273, "step": 6560 }, { "epoch": 1.0857260896508987, "grad_norm": 44.102500915527344, "learning_rate": 4.952349473916157e-05, "loss": 1.7415, "step": 6570 }, { "epoch": 1.087378640776699, "grad_norm": 5.051490783691406, "learning_rate": 4.951431352026295e-05, "loss": 1.6828, "step": 6580 }, { "epoch": 1.0890311919024995, "grad_norm": 6.60805606842041, "learning_rate": 4.950513230136433e-05, "loss": 1.6364, "step": 6590 }, { "epoch": 1.0906837430282998, "grad_norm": 18.499725341796875, "learning_rate": 4.9495951082465706e-05, "loss": 1.6844, "step": 6600 }, { "epoch": 1.0923362941541004, "grad_norm": 18.149259567260742, "learning_rate": 4.948676986356709e-05, "loss": 1.5962, "step": 6610 }, { "epoch": 1.093988845279901, "grad_norm": 17.02660369873047, "learning_rate": 4.947758864466847e-05, "loss": 1.6441, "step": 6620 }, { "epoch": 1.0956413964057012, "grad_norm": 13.559640884399414, "learning_rate": 4.946840742576985e-05, "loss": 1.5714, "step": 6630 }, { "epoch": 1.0972939475315018, "grad_norm": 12.437646865844727, "learning_rate": 4.945922620687123e-05, "loss": 1.7886, "step": 6640 }, { "epoch": 1.0989464986573023, "grad_norm": 10.138182640075684, "learning_rate": 4.9450044987972605e-05, "loss": 1.6831, "step": 6650 }, { "epoch": 1.1005990497831026, "grad_norm": 18.947307586669922, "learning_rate": 4.944086376907399e-05, "loss": 1.634, "step": 6660 }, { "epoch": 1.1022516009089032, "grad_norm": 7.246707439422607, "learning_rate": 4.9431682550175364e-05, "loss": 1.6895, "step": 6670 }, { "epoch": 1.1039041520347035, "grad_norm": 8.21375846862793, "learning_rate": 4.9422501331276747e-05, "loss": 1.6634, "step": 6680 }, { "epoch": 1.105556703160504, "grad_norm": 14.554459571838379, "learning_rate": 4.941332011237812e-05, "loss": 1.7784, "step": 6690 }, { "epoch": 1.1072092542863046, "grad_norm": 16.16337776184082, "learning_rate": 4.94041388934795e-05, "loss": 1.6209, "step": 6700 }, { "epoch": 1.1088618054121049, "grad_norm": 25.227270126342773, "learning_rate": 4.9394957674580874e-05, "loss": 1.5893, "step": 6710 }, { "epoch": 1.1105143565379054, "grad_norm": 11.184518814086914, "learning_rate": 4.938577645568226e-05, "loss": 1.7018, "step": 6720 }, { "epoch": 1.1121669076637057, "grad_norm": 7.978889465332031, "learning_rate": 4.937659523678364e-05, "loss": 1.7783, "step": 6730 }, { "epoch": 1.1138194587895063, "grad_norm": 11.834269523620605, "learning_rate": 4.9367414017885015e-05, "loss": 1.6588, "step": 6740 }, { "epoch": 1.1154720099153068, "grad_norm": 7.372729778289795, "learning_rate": 4.93582327989864e-05, "loss": 1.7051, "step": 6750 }, { "epoch": 1.1171245610411071, "grad_norm": 15.451234817504883, "learning_rate": 4.9349051580087774e-05, "loss": 1.5759, "step": 6760 }, { "epoch": 1.1187771121669077, "grad_norm": 7.165411949157715, "learning_rate": 4.9339870361189156e-05, "loss": 1.4484, "step": 6770 }, { "epoch": 1.1204296632927082, "grad_norm": 9.450922012329102, "learning_rate": 4.933068914229053e-05, "loss": 1.6673, "step": 6780 }, { "epoch": 1.1220822144185085, "grad_norm": 7.094033718109131, "learning_rate": 4.9321507923391915e-05, "loss": 1.7646, "step": 6790 }, { "epoch": 1.123734765544309, "grad_norm": 19.683202743530273, "learning_rate": 4.931232670449329e-05, "loss": 1.576, "step": 6800 }, { "epoch": 1.1253873166701096, "grad_norm": 7.054886341094971, "learning_rate": 4.9303145485594673e-05, "loss": 1.6599, "step": 6810 }, { "epoch": 1.12703986779591, "grad_norm": 11.178540229797363, "learning_rate": 4.929396426669605e-05, "loss": 1.7556, "step": 6820 }, { "epoch": 1.1286924189217105, "grad_norm": 7.649514198303223, "learning_rate": 4.9284783047797425e-05, "loss": 1.6763, "step": 6830 }, { "epoch": 1.1303449700475108, "grad_norm": 27.767852783203125, "learning_rate": 4.927560182889881e-05, "loss": 1.6954, "step": 6840 }, { "epoch": 1.1319975211733113, "grad_norm": 46.87465286254883, "learning_rate": 4.9266420610000184e-05, "loss": 1.6347, "step": 6850 }, { "epoch": 1.1336500722991119, "grad_norm": 11.488672256469727, "learning_rate": 4.9257239391101566e-05, "loss": 1.6059, "step": 6860 }, { "epoch": 1.1353026234249122, "grad_norm": 21.95306396484375, "learning_rate": 4.924805817220294e-05, "loss": 1.7236, "step": 6870 }, { "epoch": 1.1369551745507127, "grad_norm": 7.683815956115723, "learning_rate": 4.9238876953304325e-05, "loss": 1.7474, "step": 6880 }, { "epoch": 1.138607725676513, "grad_norm": 41.58155822753906, "learning_rate": 4.92296957344057e-05, "loss": 1.6803, "step": 6890 }, { "epoch": 1.1402602768023136, "grad_norm": 6.919406890869141, "learning_rate": 4.922051451550708e-05, "loss": 1.5835, "step": 6900 }, { "epoch": 1.141912827928114, "grad_norm": 11.925729751586914, "learning_rate": 4.921133329660846e-05, "loss": 1.52, "step": 6910 }, { "epoch": 1.1435653790539144, "grad_norm": 7.959477424621582, "learning_rate": 4.920215207770984e-05, "loss": 1.5885, "step": 6920 }, { "epoch": 1.145217930179715, "grad_norm": 8.630328178405762, "learning_rate": 4.919297085881122e-05, "loss": 1.616, "step": 6930 }, { "epoch": 1.1468704813055153, "grad_norm": 15.663494110107422, "learning_rate": 4.91837896399126e-05, "loss": 1.6099, "step": 6940 }, { "epoch": 1.1485230324313158, "grad_norm": 9.951146125793457, "learning_rate": 4.9174608421013976e-05, "loss": 1.6716, "step": 6950 }, { "epoch": 1.1501755835571164, "grad_norm": 7.788767337799072, "learning_rate": 4.916542720211535e-05, "loss": 1.7533, "step": 6960 }, { "epoch": 1.1518281346829167, "grad_norm": 18.3308048248291, "learning_rate": 4.9156245983216735e-05, "loss": 1.8288, "step": 6970 }, { "epoch": 1.1534806858087172, "grad_norm": 26.051170349121094, "learning_rate": 4.914706476431811e-05, "loss": 1.6644, "step": 6980 }, { "epoch": 1.1551332369345177, "grad_norm": 12.27104663848877, "learning_rate": 4.913788354541949e-05, "loss": 1.6927, "step": 6990 }, { "epoch": 1.156785788060318, "grad_norm": 18.62868881225586, "learning_rate": 4.912870232652087e-05, "loss": 1.6167, "step": 7000 }, { "epoch": 1.1584383391861186, "grad_norm": 5.9332122802734375, "learning_rate": 4.911952110762225e-05, "loss": 1.7339, "step": 7010 }, { "epoch": 1.1600908903119191, "grad_norm": 12.154903411865234, "learning_rate": 4.911033988872363e-05, "loss": 1.7497, "step": 7020 }, { "epoch": 1.1617434414377195, "grad_norm": 19.24437713623047, "learning_rate": 4.910115866982501e-05, "loss": 1.7414, "step": 7030 }, { "epoch": 1.16339599256352, "grad_norm": 26.495933532714844, "learning_rate": 4.9091977450926386e-05, "loss": 1.6141, "step": 7040 }, { "epoch": 1.1650485436893203, "grad_norm": 6.528138160705566, "learning_rate": 4.908279623202777e-05, "loss": 1.5709, "step": 7050 }, { "epoch": 1.1667010948151209, "grad_norm": 9.650068283081055, "learning_rate": 4.9073615013129144e-05, "loss": 1.5795, "step": 7060 }, { "epoch": 1.1683536459409214, "grad_norm": 8.132938385009766, "learning_rate": 4.906443379423053e-05, "loss": 1.6091, "step": 7070 }, { "epoch": 1.1700061970667217, "grad_norm": 6.303673267364502, "learning_rate": 4.90552525753319e-05, "loss": 1.5958, "step": 7080 }, { "epoch": 1.1716587481925222, "grad_norm": 8.630463600158691, "learning_rate": 4.904607135643328e-05, "loss": 1.5698, "step": 7090 }, { "epoch": 1.1733112993183226, "grad_norm": 12.576354026794434, "learning_rate": 4.903689013753466e-05, "loss": 1.7056, "step": 7100 }, { "epoch": 1.174963850444123, "grad_norm": 56.43805694580078, "learning_rate": 4.902770891863604e-05, "loss": 1.7336, "step": 7110 }, { "epoch": 1.1766164015699236, "grad_norm": 9.944549560546875, "learning_rate": 4.901852769973742e-05, "loss": 1.6921, "step": 7120 }, { "epoch": 1.178268952695724, "grad_norm": 7.986307621002197, "learning_rate": 4.9009346480838796e-05, "loss": 1.544, "step": 7130 }, { "epoch": 1.1799215038215245, "grad_norm": 6.309409141540527, "learning_rate": 4.900016526194018e-05, "loss": 1.6083, "step": 7140 }, { "epoch": 1.1815740549473248, "grad_norm": 36.086181640625, "learning_rate": 4.8990984043041554e-05, "loss": 1.6664, "step": 7150 }, { "epoch": 1.1832266060731254, "grad_norm": 8.67955493927002, "learning_rate": 4.898180282414294e-05, "loss": 1.574, "step": 7160 }, { "epoch": 1.184879157198926, "grad_norm": 7.240811347961426, "learning_rate": 4.897262160524431e-05, "loss": 1.6852, "step": 7170 }, { "epoch": 1.1865317083247262, "grad_norm": 15.414640426635742, "learning_rate": 4.8963440386345695e-05, "loss": 1.5493, "step": 7180 }, { "epoch": 1.1881842594505267, "grad_norm": 9.224071502685547, "learning_rate": 4.895425916744708e-05, "loss": 1.6202, "step": 7190 }, { "epoch": 1.1898368105763273, "grad_norm": 14.074639320373535, "learning_rate": 4.8945077948548454e-05, "loss": 1.5506, "step": 7200 }, { "epoch": 1.1914893617021276, "grad_norm": 11.120677947998047, "learning_rate": 4.893589672964983e-05, "loss": 1.6851, "step": 7210 }, { "epoch": 1.1931419128279281, "grad_norm": 10.850142478942871, "learning_rate": 4.8926715510751206e-05, "loss": 1.4931, "step": 7220 }, { "epoch": 1.1947944639537287, "grad_norm": 18.69784164428711, "learning_rate": 4.891753429185259e-05, "loss": 1.7118, "step": 7230 }, { "epoch": 1.196447015079529, "grad_norm": 8.451671600341797, "learning_rate": 4.8908353072953964e-05, "loss": 1.6661, "step": 7240 }, { "epoch": 1.1980995662053295, "grad_norm": 28.36526107788086, "learning_rate": 4.889917185405535e-05, "loss": 1.6517, "step": 7250 }, { "epoch": 1.1997521173311299, "grad_norm": 8.399593353271484, "learning_rate": 4.888999063515672e-05, "loss": 1.5306, "step": 7260 }, { "epoch": 1.2014046684569304, "grad_norm": 15.712769508361816, "learning_rate": 4.8880809416258105e-05, "loss": 1.6245, "step": 7270 }, { "epoch": 1.203057219582731, "grad_norm": 8.373371124267578, "learning_rate": 4.887162819735948e-05, "loss": 1.6816, "step": 7280 }, { "epoch": 1.2047097707085312, "grad_norm": 11.655881881713867, "learning_rate": 4.8862446978460864e-05, "loss": 1.76, "step": 7290 }, { "epoch": 1.2063623218343318, "grad_norm": 8.07646369934082, "learning_rate": 4.8853265759562246e-05, "loss": 1.7914, "step": 7300 }, { "epoch": 1.208014872960132, "grad_norm": 24.092042922973633, "learning_rate": 4.884408454066362e-05, "loss": 1.6793, "step": 7310 }, { "epoch": 1.2096674240859326, "grad_norm": 10.025934219360352, "learning_rate": 4.8834903321765005e-05, "loss": 1.7057, "step": 7320 }, { "epoch": 1.2113199752117332, "grad_norm": 9.85206413269043, "learning_rate": 4.882572210286638e-05, "loss": 1.5728, "step": 7330 }, { "epoch": 1.2129725263375335, "grad_norm": 13.544818878173828, "learning_rate": 4.8816540883967757e-05, "loss": 1.783, "step": 7340 }, { "epoch": 1.214625077463334, "grad_norm": 17.217933654785156, "learning_rate": 4.880735966506913e-05, "loss": 1.8045, "step": 7350 }, { "epoch": 1.2162776285891344, "grad_norm": 24.209354400634766, "learning_rate": 4.8798178446170515e-05, "loss": 1.6106, "step": 7360 }, { "epoch": 1.217930179714935, "grad_norm": 24.636571884155273, "learning_rate": 4.878899722727189e-05, "loss": 1.6975, "step": 7370 }, { "epoch": 1.2195827308407354, "grad_norm": 16.614473342895508, "learning_rate": 4.8779816008373274e-05, "loss": 1.7127, "step": 7380 }, { "epoch": 1.2212352819665357, "grad_norm": 57.88620376586914, "learning_rate": 4.877063478947465e-05, "loss": 1.7516, "step": 7390 }, { "epoch": 1.2228878330923363, "grad_norm": 28.026735305786133, "learning_rate": 4.876145357057603e-05, "loss": 1.7203, "step": 7400 }, { "epoch": 1.2245403842181368, "grad_norm": 8.524139404296875, "learning_rate": 4.8752272351677415e-05, "loss": 1.5154, "step": 7410 }, { "epoch": 1.2261929353439371, "grad_norm": 11.738126754760742, "learning_rate": 4.874309113277879e-05, "loss": 1.6059, "step": 7420 }, { "epoch": 1.2278454864697377, "grad_norm": 35.693504333496094, "learning_rate": 4.873390991388017e-05, "loss": 1.716, "step": 7430 }, { "epoch": 1.2294980375955382, "grad_norm": 16.629175186157227, "learning_rate": 4.872472869498155e-05, "loss": 1.7924, "step": 7440 }, { "epoch": 1.2311505887213385, "grad_norm": 9.903701782226562, "learning_rate": 4.871554747608293e-05, "loss": 1.8271, "step": 7450 }, { "epoch": 1.232803139847139, "grad_norm": 10.66246509552002, "learning_rate": 4.870636625718431e-05, "loss": 1.7185, "step": 7460 }, { "epoch": 1.2344556909729394, "grad_norm": 8.404884338378906, "learning_rate": 4.8697185038285683e-05, "loss": 1.7562, "step": 7470 }, { "epoch": 1.23610824209874, "grad_norm": 14.62071418762207, "learning_rate": 4.868800381938706e-05, "loss": 1.7265, "step": 7480 }, { "epoch": 1.2377607932245405, "grad_norm": 21.035802841186523, "learning_rate": 4.867882260048844e-05, "loss": 1.6935, "step": 7490 }, { "epoch": 1.2394133443503408, "grad_norm": 13.605401039123535, "learning_rate": 4.866964138158982e-05, "loss": 1.6896, "step": 7500 }, { "epoch": 1.2410658954761413, "grad_norm": 31.078174591064453, "learning_rate": 4.86604601626912e-05, "loss": 1.6084, "step": 7510 }, { "epoch": 1.2427184466019416, "grad_norm": 11.396821975708008, "learning_rate": 4.8651278943792576e-05, "loss": 1.6015, "step": 7520 }, { "epoch": 1.2443709977277422, "grad_norm": 10.65982723236084, "learning_rate": 4.864209772489396e-05, "loss": 1.6715, "step": 7530 }, { "epoch": 1.2460235488535427, "grad_norm": 34.76524353027344, "learning_rate": 4.863291650599534e-05, "loss": 1.7029, "step": 7540 }, { "epoch": 1.247676099979343, "grad_norm": 21.453048706054688, "learning_rate": 4.862373528709672e-05, "loss": 1.6779, "step": 7550 }, { "epoch": 1.2493286511051436, "grad_norm": 9.208992958068848, "learning_rate": 4.86145540681981e-05, "loss": 1.7251, "step": 7560 }, { "epoch": 1.250981202230944, "grad_norm": 9.565735816955566, "learning_rate": 4.8605372849299476e-05, "loss": 1.6829, "step": 7570 }, { "epoch": 1.2526337533567444, "grad_norm": 8.265256881713867, "learning_rate": 4.859619163040086e-05, "loss": 1.6577, "step": 7580 }, { "epoch": 1.254286304482545, "grad_norm": 14.750138282775879, "learning_rate": 4.8587010411502234e-05, "loss": 1.6659, "step": 7590 }, { "epoch": 1.2559388556083455, "grad_norm": 30.3494930267334, "learning_rate": 4.857782919260361e-05, "loss": 1.5829, "step": 7600 }, { "epoch": 1.2575914067341458, "grad_norm": 13.287981033325195, "learning_rate": 4.8568647973704986e-05, "loss": 1.6724, "step": 7610 }, { "epoch": 1.2592439578599464, "grad_norm": 29.33921241760254, "learning_rate": 4.855946675480637e-05, "loss": 1.5848, "step": 7620 }, { "epoch": 1.2608965089857467, "grad_norm": 7.407712459564209, "learning_rate": 4.8550285535907745e-05, "loss": 1.6882, "step": 7630 }, { "epoch": 1.2625490601115472, "grad_norm": 6.948355674743652, "learning_rate": 4.854110431700913e-05, "loss": 1.6093, "step": 7640 }, { "epoch": 1.2642016112373478, "grad_norm": 13.627409934997559, "learning_rate": 4.853192309811051e-05, "loss": 1.6143, "step": 7650 }, { "epoch": 1.265854162363148, "grad_norm": 10.397058486938477, "learning_rate": 4.8522741879211886e-05, "loss": 1.6205, "step": 7660 }, { "epoch": 1.2675067134889486, "grad_norm": 12.780577659606934, "learning_rate": 4.851356066031327e-05, "loss": 1.6489, "step": 7670 }, { "epoch": 1.269159264614749, "grad_norm": 36.843563079833984, "learning_rate": 4.8504379441414644e-05, "loss": 1.5179, "step": 7680 }, { "epoch": 1.2708118157405495, "grad_norm": 8.480043411254883, "learning_rate": 4.849519822251603e-05, "loss": 1.9982, "step": 7690 }, { "epoch": 1.27246436686635, "grad_norm": 8.302135467529297, "learning_rate": 4.84860170036174e-05, "loss": 1.7339, "step": 7700 }, { "epoch": 1.2741169179921503, "grad_norm": 7.312380790710449, "learning_rate": 4.8476835784718785e-05, "loss": 1.7085, "step": 7710 }, { "epoch": 1.2757694691179509, "grad_norm": 9.16114330291748, "learning_rate": 4.846765456582016e-05, "loss": 1.6013, "step": 7720 }, { "epoch": 1.2774220202437512, "grad_norm": 10.258267402648926, "learning_rate": 4.845847334692154e-05, "loss": 1.5444, "step": 7730 }, { "epoch": 1.2790745713695517, "grad_norm": 41.98063278198242, "learning_rate": 4.844929212802291e-05, "loss": 1.7147, "step": 7740 }, { "epoch": 1.2807271224953523, "grad_norm": 5.9125237464904785, "learning_rate": 4.8440110909124296e-05, "loss": 1.5708, "step": 7750 }, { "epoch": 1.2823796736211526, "grad_norm": 10.1311616897583, "learning_rate": 4.843092969022568e-05, "loss": 1.6326, "step": 7760 }, { "epoch": 1.284032224746953, "grad_norm": 9.376750946044922, "learning_rate": 4.8421748471327054e-05, "loss": 1.7641, "step": 7770 }, { "epoch": 1.2856847758727534, "grad_norm": 8.097874641418457, "learning_rate": 4.841256725242844e-05, "loss": 1.5963, "step": 7780 }, { "epoch": 1.287337326998554, "grad_norm": 8.448856353759766, "learning_rate": 4.840338603352981e-05, "loss": 1.5056, "step": 7790 }, { "epoch": 1.2889898781243545, "grad_norm": 10.571368217468262, "learning_rate": 4.8394204814631195e-05, "loss": 1.7242, "step": 7800 }, { "epoch": 1.290642429250155, "grad_norm": 28.546253204345703, "learning_rate": 4.838502359573257e-05, "loss": 1.7191, "step": 7810 }, { "epoch": 1.2922949803759554, "grad_norm": 20.1251220703125, "learning_rate": 4.8375842376833954e-05, "loss": 1.6916, "step": 7820 }, { "epoch": 1.293947531501756, "grad_norm": 10.026655197143555, "learning_rate": 4.836666115793533e-05, "loss": 1.6829, "step": 7830 }, { "epoch": 1.2956000826275562, "grad_norm": 21.374874114990234, "learning_rate": 4.835747993903671e-05, "loss": 1.5028, "step": 7840 }, { "epoch": 1.2972526337533568, "grad_norm": 42.24235916137695, "learning_rate": 4.834829872013809e-05, "loss": 1.631, "step": 7850 }, { "epoch": 1.2989051848791573, "grad_norm": 6.917150974273682, "learning_rate": 4.8339117501239464e-05, "loss": 1.7458, "step": 7860 }, { "epoch": 1.3005577360049576, "grad_norm": 26.261568069458008, "learning_rate": 4.8329936282340847e-05, "loss": 1.7711, "step": 7870 }, { "epoch": 1.3022102871307581, "grad_norm": 5.60324239730835, "learning_rate": 4.832075506344222e-05, "loss": 1.726, "step": 7880 }, { "epoch": 1.3038628382565585, "grad_norm": 12.937496185302734, "learning_rate": 4.8311573844543605e-05, "loss": 1.7153, "step": 7890 }, { "epoch": 1.305515389382359, "grad_norm": 14.33709716796875, "learning_rate": 4.830239262564498e-05, "loss": 1.713, "step": 7900 }, { "epoch": 1.3071679405081595, "grad_norm": 8.725417137145996, "learning_rate": 4.8293211406746364e-05, "loss": 1.7221, "step": 7910 }, { "epoch": 1.3088204916339599, "grad_norm": 13.017943382263184, "learning_rate": 4.828403018784774e-05, "loss": 1.5677, "step": 7920 }, { "epoch": 1.3104730427597604, "grad_norm": 10.267081260681152, "learning_rate": 4.827484896894912e-05, "loss": 1.6575, "step": 7930 }, { "epoch": 1.3121255938855607, "grad_norm": 4.439043045043945, "learning_rate": 4.82656677500505e-05, "loss": 1.6943, "step": 7940 }, { "epoch": 1.3137781450113613, "grad_norm": 9.938862800598145, "learning_rate": 4.825648653115188e-05, "loss": 1.6691, "step": 7950 }, { "epoch": 1.3154306961371618, "grad_norm": 7.4380669593811035, "learning_rate": 4.8247305312253256e-05, "loss": 1.6049, "step": 7960 }, { "epoch": 1.3170832472629623, "grad_norm": 9.574049949645996, "learning_rate": 4.823812409335464e-05, "loss": 1.5485, "step": 7970 }, { "epoch": 1.3187357983887626, "grad_norm": 6.69705057144165, "learning_rate": 4.8228942874456015e-05, "loss": 1.7664, "step": 7980 }, { "epoch": 1.3203883495145632, "grad_norm": 51.60087203979492, "learning_rate": 4.821976165555739e-05, "loss": 1.7427, "step": 7990 }, { "epoch": 1.3220409006403635, "grad_norm": 8.675333023071289, "learning_rate": 4.821058043665877e-05, "loss": 1.5976, "step": 8000 }, { "epoch": 1.323693451766164, "grad_norm": 11.984296798706055, "learning_rate": 4.820139921776015e-05, "loss": 1.655, "step": 8010 }, { "epoch": 1.3253460028919646, "grad_norm": 7.3102617263793945, "learning_rate": 4.819221799886153e-05, "loss": 1.6596, "step": 8020 }, { "epoch": 1.326998554017765, "grad_norm": 5.259336948394775, "learning_rate": 4.818303677996291e-05, "loss": 1.6986, "step": 8030 }, { "epoch": 1.3286511051435654, "grad_norm": 8.154091835021973, "learning_rate": 4.817385556106429e-05, "loss": 1.7009, "step": 8040 }, { "epoch": 1.3303036562693658, "grad_norm": 6.396356582641602, "learning_rate": 4.8164674342165666e-05, "loss": 1.5735, "step": 8050 }, { "epoch": 1.3319562073951663, "grad_norm": 7.4831647872924805, "learning_rate": 4.815549312326705e-05, "loss": 1.684, "step": 8060 }, { "epoch": 1.3336087585209668, "grad_norm": 9.999634742736816, "learning_rate": 4.8146311904368425e-05, "loss": 1.7047, "step": 8070 }, { "epoch": 1.3352613096467671, "grad_norm": 9.42754077911377, "learning_rate": 4.813713068546981e-05, "loss": 1.7498, "step": 8080 }, { "epoch": 1.3369138607725677, "grad_norm": 14.416214942932129, "learning_rate": 4.812794946657118e-05, "loss": 1.6447, "step": 8090 }, { "epoch": 1.338566411898368, "grad_norm": 6.572526454925537, "learning_rate": 4.8118768247672566e-05, "loss": 1.6883, "step": 8100 }, { "epoch": 1.3402189630241685, "grad_norm": 27.258047103881836, "learning_rate": 4.810958702877394e-05, "loss": 1.606, "step": 8110 }, { "epoch": 1.341871514149969, "grad_norm": 6.700786590576172, "learning_rate": 4.810040580987532e-05, "loss": 1.6751, "step": 8120 }, { "epoch": 1.3435240652757694, "grad_norm": 8.797626495361328, "learning_rate": 4.80912245909767e-05, "loss": 1.6453, "step": 8130 }, { "epoch": 1.34517661640157, "grad_norm": 14.59374713897705, "learning_rate": 4.8082043372078076e-05, "loss": 1.6201, "step": 8140 }, { "epoch": 1.3468291675273703, "grad_norm": 12.410117149353027, "learning_rate": 4.807286215317946e-05, "loss": 1.6923, "step": 8150 }, { "epoch": 1.3484817186531708, "grad_norm": 5.754236221313477, "learning_rate": 4.8063680934280835e-05, "loss": 1.5902, "step": 8160 }, { "epoch": 1.3501342697789713, "grad_norm": 21.738666534423828, "learning_rate": 4.805449971538222e-05, "loss": 1.7867, "step": 8170 }, { "epoch": 1.3517868209047719, "grad_norm": 11.151928901672363, "learning_rate": 4.804531849648359e-05, "loss": 1.6477, "step": 8180 }, { "epoch": 1.3534393720305722, "grad_norm": 7.115407466888428, "learning_rate": 4.8036137277584976e-05, "loss": 1.6847, "step": 8190 }, { "epoch": 1.3550919231563727, "grad_norm": 9.613741874694824, "learning_rate": 4.802695605868635e-05, "loss": 1.7105, "step": 8200 }, { "epoch": 1.356744474282173, "grad_norm": 9.931926727294922, "learning_rate": 4.8017774839787734e-05, "loss": 1.6286, "step": 8210 }, { "epoch": 1.3583970254079736, "grad_norm": 46.38539505004883, "learning_rate": 4.800859362088912e-05, "loss": 1.7295, "step": 8220 }, { "epoch": 1.3600495765337741, "grad_norm": 11.234518051147461, "learning_rate": 4.799941240199049e-05, "loss": 1.6839, "step": 8230 }, { "epoch": 1.3617021276595744, "grad_norm": 29.555355072021484, "learning_rate": 4.799023118309187e-05, "loss": 1.6353, "step": 8240 }, { "epoch": 1.363354678785375, "grad_norm": 13.247050285339355, "learning_rate": 4.7981049964193244e-05, "loss": 1.7446, "step": 8250 }, { "epoch": 1.3650072299111753, "grad_norm": 31.702011108398438, "learning_rate": 4.797186874529463e-05, "loss": 1.6522, "step": 8260 }, { "epoch": 1.3666597810369758, "grad_norm": 19.7655029296875, "learning_rate": 4.7962687526396e-05, "loss": 1.7344, "step": 8270 }, { "epoch": 1.3683123321627764, "grad_norm": 31.14563751220703, "learning_rate": 4.7953506307497386e-05, "loss": 1.6795, "step": 8280 }, { "epoch": 1.3699648832885767, "grad_norm": 10.664438247680664, "learning_rate": 4.794432508859876e-05, "loss": 1.7746, "step": 8290 }, { "epoch": 1.3716174344143772, "grad_norm": 14.965887069702148, "learning_rate": 4.7935143869700144e-05, "loss": 1.6984, "step": 8300 }, { "epoch": 1.3732699855401775, "grad_norm": 18.51651954650879, "learning_rate": 4.792596265080152e-05, "loss": 1.6421, "step": 8310 }, { "epoch": 1.374922536665978, "grad_norm": 8.034571647644043, "learning_rate": 4.79167814319029e-05, "loss": 1.6664, "step": 8320 }, { "epoch": 1.3765750877917786, "grad_norm": 11.700427055358887, "learning_rate": 4.7907600213004285e-05, "loss": 1.6122, "step": 8330 }, { "epoch": 1.378227638917579, "grad_norm": 7.673017501831055, "learning_rate": 4.789841899410566e-05, "loss": 1.6424, "step": 8340 }, { "epoch": 1.3798801900433795, "grad_norm": 4.827954292297363, "learning_rate": 4.7889237775207044e-05, "loss": 1.6598, "step": 8350 }, { "epoch": 1.3815327411691798, "grad_norm": 11.37001895904541, "learning_rate": 4.788005655630842e-05, "loss": 1.6502, "step": 8360 }, { "epoch": 1.3831852922949803, "grad_norm": 7.629409313201904, "learning_rate": 4.7870875337409795e-05, "loss": 1.7068, "step": 8370 }, { "epoch": 1.3848378434207809, "grad_norm": 21.615652084350586, "learning_rate": 4.786169411851117e-05, "loss": 1.5153, "step": 8380 }, { "epoch": 1.3864903945465814, "grad_norm": 9.277376174926758, "learning_rate": 4.7852512899612554e-05, "loss": 1.6594, "step": 8390 }, { "epoch": 1.3881429456723817, "grad_norm": 11.108504295349121, "learning_rate": 4.784333168071393e-05, "loss": 1.6291, "step": 8400 }, { "epoch": 1.3897954967981823, "grad_norm": 9.173980712890625, "learning_rate": 4.783415046181531e-05, "loss": 1.6998, "step": 8410 }, { "epoch": 1.3914480479239826, "grad_norm": 8.537729263305664, "learning_rate": 4.782496924291669e-05, "loss": 1.5601, "step": 8420 }, { "epoch": 1.3931005990497831, "grad_norm": 10.740826606750488, "learning_rate": 4.781578802401807e-05, "loss": 1.4999, "step": 8430 }, { "epoch": 1.3947531501755837, "grad_norm": 9.543370246887207, "learning_rate": 4.7806606805119453e-05, "loss": 1.6375, "step": 8440 }, { "epoch": 1.396405701301384, "grad_norm": 9.984679222106934, "learning_rate": 4.779742558622083e-05, "loss": 1.5013, "step": 8450 }, { "epoch": 1.3980582524271845, "grad_norm": 9.201363563537598, "learning_rate": 4.778824436732221e-05, "loss": 1.5612, "step": 8460 }, { "epoch": 1.3997108035529848, "grad_norm": 17.85106086730957, "learning_rate": 4.777906314842359e-05, "loss": 1.7215, "step": 8470 }, { "epoch": 1.4013633546787854, "grad_norm": 8.731996536254883, "learning_rate": 4.776988192952497e-05, "loss": 1.4592, "step": 8480 }, { "epoch": 1.403015905804586, "grad_norm": 12.443288803100586, "learning_rate": 4.7760700710626346e-05, "loss": 1.6517, "step": 8490 }, { "epoch": 1.4046684569303862, "grad_norm": 4.665842056274414, "learning_rate": 4.775151949172772e-05, "loss": 1.5992, "step": 8500 }, { "epoch": 1.4063210080561868, "grad_norm": 8.88791275024414, "learning_rate": 4.77423382728291e-05, "loss": 1.7198, "step": 8510 }, { "epoch": 1.407973559181987, "grad_norm": 7.840359687805176, "learning_rate": 4.773315705393048e-05, "loss": 1.633, "step": 8520 }, { "epoch": 1.4096261103077876, "grad_norm": 42.78237533569336, "learning_rate": 4.7723975835031857e-05, "loss": 1.7015, "step": 8530 }, { "epoch": 1.4112786614335882, "grad_norm": 31.31909942626953, "learning_rate": 4.771479461613324e-05, "loss": 1.8256, "step": 8540 }, { "epoch": 1.4129312125593887, "grad_norm": 9.952142715454102, "learning_rate": 4.7705613397234615e-05, "loss": 1.7117, "step": 8550 }, { "epoch": 1.414583763685189, "grad_norm": 10.509651184082031, "learning_rate": 4.7696432178336e-05, "loss": 1.6899, "step": 8560 }, { "epoch": 1.4162363148109893, "grad_norm": 13.65649700164795, "learning_rate": 4.768725095943738e-05, "loss": 1.6753, "step": 8570 }, { "epoch": 1.4178888659367899, "grad_norm": 9.891709327697754, "learning_rate": 4.7678069740538756e-05, "loss": 1.6893, "step": 8580 }, { "epoch": 1.4195414170625904, "grad_norm": 6.675931453704834, "learning_rate": 4.766888852164014e-05, "loss": 1.6134, "step": 8590 }, { "epoch": 1.421193968188391, "grad_norm": 8.769001007080078, "learning_rate": 4.7659707302741515e-05, "loss": 1.6775, "step": 8600 }, { "epoch": 1.4228465193141913, "grad_norm": 9.261868476867676, "learning_rate": 4.76505260838429e-05, "loss": 1.7286, "step": 8610 }, { "epoch": 1.4244990704399918, "grad_norm": 9.514866828918457, "learning_rate": 4.764134486494427e-05, "loss": 1.5825, "step": 8620 }, { "epoch": 1.4261516215657921, "grad_norm": 11.191073417663574, "learning_rate": 4.763216364604565e-05, "loss": 1.699, "step": 8630 }, { "epoch": 1.4278041726915927, "grad_norm": 7.584383487701416, "learning_rate": 4.7622982427147025e-05, "loss": 1.7905, "step": 8640 }, { "epoch": 1.4294567238173932, "grad_norm": 7.346413612365723, "learning_rate": 4.761380120824841e-05, "loss": 1.7111, "step": 8650 }, { "epoch": 1.4311092749431935, "grad_norm": 12.299200057983398, "learning_rate": 4.7604619989349783e-05, "loss": 1.6624, "step": 8660 }, { "epoch": 1.432761826068994, "grad_norm": 28.52010154724121, "learning_rate": 4.7595438770451166e-05, "loss": 1.7085, "step": 8670 }, { "epoch": 1.4344143771947944, "grad_norm": 20.789058685302734, "learning_rate": 4.758625755155255e-05, "loss": 1.7056, "step": 8680 }, { "epoch": 1.436066928320595, "grad_norm": 13.599698066711426, "learning_rate": 4.7577076332653925e-05, "loss": 1.5663, "step": 8690 }, { "epoch": 1.4377194794463954, "grad_norm": 15.90054702758789, "learning_rate": 4.756789511375531e-05, "loss": 1.7743, "step": 8700 }, { "epoch": 1.4393720305721958, "grad_norm": 21.3345890045166, "learning_rate": 4.755871389485668e-05, "loss": 1.7051, "step": 8710 }, { "epoch": 1.4410245816979963, "grad_norm": 10.17492389678955, "learning_rate": 4.7549532675958066e-05, "loss": 1.6872, "step": 8720 }, { "epoch": 1.4426771328237966, "grad_norm": 5.552209377288818, "learning_rate": 4.754035145705944e-05, "loss": 1.6925, "step": 8730 }, { "epoch": 1.4443296839495972, "grad_norm": 17.34671401977539, "learning_rate": 4.7531170238160824e-05, "loss": 1.5588, "step": 8740 }, { "epoch": 1.4459822350753977, "grad_norm": 7.345114231109619, "learning_rate": 4.75219890192622e-05, "loss": 1.6705, "step": 8750 }, { "epoch": 1.4476347862011982, "grad_norm": 7.885573387145996, "learning_rate": 4.7512807800363576e-05, "loss": 1.5831, "step": 8760 }, { "epoch": 1.4492873373269985, "grad_norm": 26.88873291015625, "learning_rate": 4.750362658146495e-05, "loss": 1.6182, "step": 8770 }, { "epoch": 1.450939888452799, "grad_norm": 6.879226207733154, "learning_rate": 4.7494445362566334e-05, "loss": 1.8231, "step": 8780 }, { "epoch": 1.4525924395785994, "grad_norm": 15.858061790466309, "learning_rate": 4.748526414366772e-05, "loss": 1.5887, "step": 8790 }, { "epoch": 1.4542449907044, "grad_norm": 6.855731010437012, "learning_rate": 4.747608292476909e-05, "loss": 1.7003, "step": 8800 }, { "epoch": 1.4558975418302005, "grad_norm": 32.11435317993164, "learning_rate": 4.7466901705870475e-05, "loss": 1.682, "step": 8810 }, { "epoch": 1.4575500929560008, "grad_norm": 10.053679466247559, "learning_rate": 4.745772048697185e-05, "loss": 1.6652, "step": 8820 }, { "epoch": 1.4592026440818013, "grad_norm": 9.131372451782227, "learning_rate": 4.7448539268073234e-05, "loss": 1.592, "step": 8830 }, { "epoch": 1.4608551952076017, "grad_norm": 7.688348293304443, "learning_rate": 4.743935804917461e-05, "loss": 1.6068, "step": 8840 }, { "epoch": 1.4625077463334022, "grad_norm": 6.706598281860352, "learning_rate": 4.743017683027599e-05, "loss": 1.5156, "step": 8850 }, { "epoch": 1.4641602974592027, "grad_norm": 10.917819023132324, "learning_rate": 4.742099561137737e-05, "loss": 1.5903, "step": 8860 }, { "epoch": 1.465812848585003, "grad_norm": 21.588254928588867, "learning_rate": 4.741181439247875e-05, "loss": 1.6745, "step": 8870 }, { "epoch": 1.4674653997108036, "grad_norm": 20.283750534057617, "learning_rate": 4.740263317358013e-05, "loss": 1.5973, "step": 8880 }, { "epoch": 1.469117950836604, "grad_norm": 21.922521591186523, "learning_rate": 4.73934519546815e-05, "loss": 1.5737, "step": 8890 }, { "epoch": 1.4707705019624044, "grad_norm": 8.129682540893555, "learning_rate": 4.7384270735782885e-05, "loss": 1.7585, "step": 8900 }, { "epoch": 1.472423053088205, "grad_norm": 24.920692443847656, "learning_rate": 4.737508951688426e-05, "loss": 1.6209, "step": 8910 }, { "epoch": 1.4740756042140053, "grad_norm": 10.06619930267334, "learning_rate": 4.7365908297985644e-05, "loss": 1.686, "step": 8920 }, { "epoch": 1.4757281553398058, "grad_norm": 13.576436042785645, "learning_rate": 4.735672707908702e-05, "loss": 1.6421, "step": 8930 }, { "epoch": 1.4773807064656062, "grad_norm": 13.374571800231934, "learning_rate": 4.73475458601884e-05, "loss": 1.7228, "step": 8940 }, { "epoch": 1.4790332575914067, "grad_norm": 7.154562473297119, "learning_rate": 4.733836464128978e-05, "loss": 1.6549, "step": 8950 }, { "epoch": 1.4806858087172072, "grad_norm": 15.508869171142578, "learning_rate": 4.732918342239116e-05, "loss": 1.675, "step": 8960 }, { "epoch": 1.4823383598430078, "grad_norm": 10.355504989624023, "learning_rate": 4.732000220349254e-05, "loss": 1.6165, "step": 8970 }, { "epoch": 1.483990910968808, "grad_norm": 9.313375473022461, "learning_rate": 4.731082098459392e-05, "loss": 1.6555, "step": 8980 }, { "epoch": 1.4856434620946086, "grad_norm": 11.76880168914795, "learning_rate": 4.7301639765695295e-05, "loss": 1.7011, "step": 8990 }, { "epoch": 1.487296013220409, "grad_norm": 9.017674446105957, "learning_rate": 4.729245854679668e-05, "loss": 1.696, "step": 9000 }, { "epoch": 1.4889485643462095, "grad_norm": 30.168582916259766, "learning_rate": 4.7283277327898054e-05, "loss": 1.6945, "step": 9010 }, { "epoch": 1.49060111547201, "grad_norm": 28.239177703857422, "learning_rate": 4.727409610899943e-05, "loss": 1.5782, "step": 9020 }, { "epoch": 1.4922536665978103, "grad_norm": 7.943751335144043, "learning_rate": 4.726491489010081e-05, "loss": 1.6587, "step": 9030 }, { "epoch": 1.4939062177236109, "grad_norm": 19.905715942382812, "learning_rate": 4.725573367120219e-05, "loss": 1.5575, "step": 9040 }, { "epoch": 1.4955587688494112, "grad_norm": 10.705276489257812, "learning_rate": 4.724655245230357e-05, "loss": 1.59, "step": 9050 }, { "epoch": 1.4972113199752117, "grad_norm": 5.611824035644531, "learning_rate": 4.7237371233404947e-05, "loss": 1.63, "step": 9060 }, { "epoch": 1.4988638711010123, "grad_norm": 14.214753150939941, "learning_rate": 4.722819001450633e-05, "loss": 1.6358, "step": 9070 }, { "epoch": 1.5005164222268128, "grad_norm": 12.516484260559082, "learning_rate": 4.7219008795607705e-05, "loss": 1.7702, "step": 9080 }, { "epoch": 1.5021689733526131, "grad_norm": 7.916662693023682, "learning_rate": 4.720982757670909e-05, "loss": 1.677, "step": 9090 }, { "epoch": 1.5038215244784134, "grad_norm": 8.450326919555664, "learning_rate": 4.7200646357810464e-05, "loss": 1.6053, "step": 9100 }, { "epoch": 1.505474075604214, "grad_norm": 15.917101860046387, "learning_rate": 4.7191465138911846e-05, "loss": 1.6568, "step": 9110 }, { "epoch": 1.5071266267300145, "grad_norm": 10.047342300415039, "learning_rate": 4.718228392001322e-05, "loss": 1.7409, "step": 9120 }, { "epoch": 1.508779177855815, "grad_norm": 8.033047676086426, "learning_rate": 4.7173102701114605e-05, "loss": 1.6499, "step": 9130 }, { "epoch": 1.5104317289816154, "grad_norm": 8.630041122436523, "learning_rate": 4.716392148221598e-05, "loss": 1.5798, "step": 9140 }, { "epoch": 1.5120842801074157, "grad_norm": 18.576356887817383, "learning_rate": 4.7154740263317356e-05, "loss": 1.7651, "step": 9150 }, { "epoch": 1.5137368312332162, "grad_norm": 11.440652847290039, "learning_rate": 4.714555904441874e-05, "loss": 1.6616, "step": 9160 }, { "epoch": 1.5153893823590168, "grad_norm": 11.976500511169434, "learning_rate": 4.7136377825520115e-05, "loss": 1.673, "step": 9170 }, { "epoch": 1.5170419334848173, "grad_norm": 7.650939464569092, "learning_rate": 4.71271966066215e-05, "loss": 1.5774, "step": 9180 }, { "epoch": 1.5186944846106176, "grad_norm": 8.787154197692871, "learning_rate": 4.711801538772287e-05, "loss": 1.581, "step": 9190 }, { "epoch": 1.520347035736418, "grad_norm": 11.962265014648438, "learning_rate": 4.7108834168824256e-05, "loss": 1.5368, "step": 9200 }, { "epoch": 1.5219995868622185, "grad_norm": 8.907034873962402, "learning_rate": 4.709965294992563e-05, "loss": 1.5875, "step": 9210 }, { "epoch": 1.523652137988019, "grad_norm": 13.364984512329102, "learning_rate": 4.7090471731027014e-05, "loss": 1.6646, "step": 9220 }, { "epoch": 1.5253046891138196, "grad_norm": 7.773966312408447, "learning_rate": 4.708129051212839e-05, "loss": 1.5193, "step": 9230 }, { "epoch": 1.5269572402396199, "grad_norm": 6.378410339355469, "learning_rate": 4.707210929322977e-05, "loss": 1.7244, "step": 9240 }, { "epoch": 1.5286097913654204, "grad_norm": 15.127907752990723, "learning_rate": 4.7062928074331156e-05, "loss": 1.6726, "step": 9250 }, { "epoch": 1.5302623424912207, "grad_norm": 16.456928253173828, "learning_rate": 4.705374685543253e-05, "loss": 1.7078, "step": 9260 }, { "epoch": 1.5319148936170213, "grad_norm": 11.239667892456055, "learning_rate": 4.704456563653391e-05, "loss": 1.6224, "step": 9270 }, { "epoch": 1.5335674447428218, "grad_norm": 26.4625186920166, "learning_rate": 4.703538441763528e-05, "loss": 1.6131, "step": 9280 }, { "epoch": 1.5352199958686223, "grad_norm": 7.001266956329346, "learning_rate": 4.7026203198736666e-05, "loss": 1.6851, "step": 9290 }, { "epoch": 1.5368725469944227, "grad_norm": 4.010128974914551, "learning_rate": 4.701702197983804e-05, "loss": 1.5748, "step": 9300 }, { "epoch": 1.538525098120223, "grad_norm": 5.630429267883301, "learning_rate": 4.7007840760939424e-05, "loss": 1.7095, "step": 9310 }, { "epoch": 1.5401776492460235, "grad_norm": 10.235610961914062, "learning_rate": 4.69986595420408e-05, "loss": 1.7108, "step": 9320 }, { "epoch": 1.541830200371824, "grad_norm": 27.188196182250977, "learning_rate": 4.698947832314218e-05, "loss": 1.6722, "step": 9330 }, { "epoch": 1.5434827514976246, "grad_norm": 9.516741752624512, "learning_rate": 4.698029710424356e-05, "loss": 1.5869, "step": 9340 }, { "epoch": 1.545135302623425, "grad_norm": 16.91329574584961, "learning_rate": 4.697111588534494e-05, "loss": 1.8084, "step": 9350 }, { "epoch": 1.5467878537492252, "grad_norm": 26.68416976928711, "learning_rate": 4.6961934666446324e-05, "loss": 1.5328, "step": 9360 }, { "epoch": 1.5484404048750258, "grad_norm": 7.9282073974609375, "learning_rate": 4.69527534475477e-05, "loss": 1.6512, "step": 9370 }, { "epoch": 1.5500929560008263, "grad_norm": 10.000015258789062, "learning_rate": 4.694357222864908e-05, "loss": 1.6211, "step": 9380 }, { "epoch": 1.5517455071266268, "grad_norm": 51.1877555847168, "learning_rate": 4.693439100975046e-05, "loss": 1.5976, "step": 9390 }, { "epoch": 1.5533980582524272, "grad_norm": 14.719481468200684, "learning_rate": 4.6925209790851834e-05, "loss": 1.5779, "step": 9400 }, { "epoch": 1.5550506093782275, "grad_norm": 9.398426055908203, "learning_rate": 4.691602857195321e-05, "loss": 1.5547, "step": 9410 }, { "epoch": 1.556703160504028, "grad_norm": 13.369629859924316, "learning_rate": 4.690684735305459e-05, "loss": 1.625, "step": 9420 }, { "epoch": 1.5583557116298286, "grad_norm": 7.2386674880981445, "learning_rate": 4.689766613415597e-05, "loss": 1.6962, "step": 9430 }, { "epoch": 1.560008262755629, "grad_norm": 6.752237319946289, "learning_rate": 4.688848491525735e-05, "loss": 1.6528, "step": 9440 }, { "epoch": 1.5616608138814294, "grad_norm": 16.95864486694336, "learning_rate": 4.687930369635873e-05, "loss": 1.6891, "step": 9450 }, { "epoch": 1.56331336500723, "grad_norm": 29.162368774414062, "learning_rate": 4.687012247746011e-05, "loss": 1.6036, "step": 9460 }, { "epoch": 1.5649659161330303, "grad_norm": 8.65707778930664, "learning_rate": 4.686094125856149e-05, "loss": 1.749, "step": 9470 }, { "epoch": 1.5666184672588308, "grad_norm": 8.36571979522705, "learning_rate": 4.685176003966287e-05, "loss": 1.597, "step": 9480 }, { "epoch": 1.5682710183846313, "grad_norm": 10.10719108581543, "learning_rate": 4.684257882076425e-05, "loss": 1.7746, "step": 9490 }, { "epoch": 1.5699235695104319, "grad_norm": 7.152252197265625, "learning_rate": 4.683339760186563e-05, "loss": 1.604, "step": 9500 }, { "epoch": 1.5715761206362322, "grad_norm": 7.251582145690918, "learning_rate": 4.682421638296701e-05, "loss": 1.599, "step": 9510 }, { "epoch": 1.5732286717620325, "grad_norm": 14.877774238586426, "learning_rate": 4.6815035164068385e-05, "loss": 1.5955, "step": 9520 }, { "epoch": 1.574881222887833, "grad_norm": 7.264001846313477, "learning_rate": 4.680585394516976e-05, "loss": 1.5446, "step": 9530 }, { "epoch": 1.5765337740136336, "grad_norm": 31.467344284057617, "learning_rate": 4.679667272627114e-05, "loss": 1.6975, "step": 9540 }, { "epoch": 1.5781863251394341, "grad_norm": 6.771696090698242, "learning_rate": 4.678749150737252e-05, "loss": 1.5352, "step": 9550 }, { "epoch": 1.5798388762652344, "grad_norm": 16.742000579833984, "learning_rate": 4.6778310288473895e-05, "loss": 1.7136, "step": 9560 }, { "epoch": 1.5814914273910348, "grad_norm": 29.50790023803711, "learning_rate": 4.676912906957528e-05, "loss": 1.6819, "step": 9570 }, { "epoch": 1.5831439785168353, "grad_norm": 15.069239616394043, "learning_rate": 4.675994785067666e-05, "loss": 1.5652, "step": 9580 }, { "epoch": 1.5847965296426358, "grad_norm": 10.359492301940918, "learning_rate": 4.6750766631778036e-05, "loss": 1.7212, "step": 9590 }, { "epoch": 1.5864490807684364, "grad_norm": 14.912342071533203, "learning_rate": 4.674158541287942e-05, "loss": 1.7613, "step": 9600 }, { "epoch": 1.5881016318942367, "grad_norm": 15.676534652709961, "learning_rate": 4.6732404193980795e-05, "loss": 1.5133, "step": 9610 }, { "epoch": 1.5897541830200372, "grad_norm": 6.48195743560791, "learning_rate": 4.672322297508218e-05, "loss": 1.6921, "step": 9620 }, { "epoch": 1.5914067341458376, "grad_norm": 10.377370834350586, "learning_rate": 4.6714041756183553e-05, "loss": 1.6365, "step": 9630 }, { "epoch": 1.593059285271638, "grad_norm": 6.733672142028809, "learning_rate": 4.6704860537284936e-05, "loss": 1.6149, "step": 9640 }, { "epoch": 1.5947118363974386, "grad_norm": 13.145439147949219, "learning_rate": 4.669567931838631e-05, "loss": 1.7273, "step": 9650 }, { "epoch": 1.596364387523239, "grad_norm": 23.76000213623047, "learning_rate": 4.668649809948769e-05, "loss": 1.5651, "step": 9660 }, { "epoch": 1.5980169386490395, "grad_norm": 10.361004829406738, "learning_rate": 4.6677316880589064e-05, "loss": 1.679, "step": 9670 }, { "epoch": 1.5996694897748398, "grad_norm": 7.273348808288574, "learning_rate": 4.6668135661690446e-05, "loss": 1.6333, "step": 9680 }, { "epoch": 1.6013220409006403, "grad_norm": 5.444534778594971, "learning_rate": 4.665895444279182e-05, "loss": 1.6189, "step": 9690 }, { "epoch": 1.6029745920264409, "grad_norm": 11.040206909179688, "learning_rate": 4.6649773223893205e-05, "loss": 1.5579, "step": 9700 }, { "epoch": 1.6046271431522414, "grad_norm": 8.984979629516602, "learning_rate": 4.664059200499459e-05, "loss": 1.5681, "step": 9710 }, { "epoch": 1.6062796942780417, "grad_norm": 8.338875770568848, "learning_rate": 4.663141078609596e-05, "loss": 1.6869, "step": 9720 }, { "epoch": 1.607932245403842, "grad_norm": 8.329351425170898, "learning_rate": 4.6622229567197346e-05, "loss": 1.4787, "step": 9730 }, { "epoch": 1.6095847965296426, "grad_norm": 9.657584190368652, "learning_rate": 4.661304834829872e-05, "loss": 1.7737, "step": 9740 }, { "epoch": 1.6112373476554431, "grad_norm": 9.302536964416504, "learning_rate": 4.6603867129400104e-05, "loss": 1.6792, "step": 9750 }, { "epoch": 1.6128898987812437, "grad_norm": 7.729322910308838, "learning_rate": 4.659468591050148e-05, "loss": 1.6573, "step": 9760 }, { "epoch": 1.614542449907044, "grad_norm": 8.147910118103027, "learning_rate": 4.658550469160286e-05, "loss": 1.5995, "step": 9770 }, { "epoch": 1.6161950010328443, "grad_norm": 7.201321601867676, "learning_rate": 4.657632347270424e-05, "loss": 1.6919, "step": 9780 }, { "epoch": 1.6178475521586448, "grad_norm": 8.503691673278809, "learning_rate": 4.6567142253805615e-05, "loss": 1.6173, "step": 9790 }, { "epoch": 1.6195001032844454, "grad_norm": 21.700334548950195, "learning_rate": 4.655796103490699e-05, "loss": 1.5879, "step": 9800 }, { "epoch": 1.621152654410246, "grad_norm": 18.57465171813965, "learning_rate": 4.654877981600837e-05, "loss": 1.6012, "step": 9810 }, { "epoch": 1.6228052055360462, "grad_norm": 6.389706611633301, "learning_rate": 4.6539598597109756e-05, "loss": 1.6895, "step": 9820 }, { "epoch": 1.6244577566618468, "grad_norm": 6.799575328826904, "learning_rate": 4.653041737821113e-05, "loss": 1.6661, "step": 9830 }, { "epoch": 1.626110307787647, "grad_norm": 6.362144947052002, "learning_rate": 4.6521236159312514e-05, "loss": 1.6193, "step": 9840 }, { "epoch": 1.6277628589134476, "grad_norm": 18.13498306274414, "learning_rate": 4.651205494041389e-05, "loss": 1.6063, "step": 9850 }, { "epoch": 1.6294154100392482, "grad_norm": 15.989859580993652, "learning_rate": 4.650287372151527e-05, "loss": 1.6043, "step": 9860 }, { "epoch": 1.6310679611650487, "grad_norm": 30.983396530151367, "learning_rate": 4.649369250261665e-05, "loss": 1.8636, "step": 9870 }, { "epoch": 1.632720512290849, "grad_norm": 9.010120391845703, "learning_rate": 4.648451128371803e-05, "loss": 1.6487, "step": 9880 }, { "epoch": 1.6343730634166493, "grad_norm": 6.68414831161499, "learning_rate": 4.647533006481941e-05, "loss": 1.6081, "step": 9890 }, { "epoch": 1.6360256145424499, "grad_norm": 9.62691593170166, "learning_rate": 4.646614884592079e-05, "loss": 1.5234, "step": 9900 }, { "epoch": 1.6376781656682504, "grad_norm": 7.007033348083496, "learning_rate": 4.6456967627022166e-05, "loss": 1.584, "step": 9910 }, { "epoch": 1.639330716794051, "grad_norm": 17.63177490234375, "learning_rate": 4.644778640812354e-05, "loss": 1.4388, "step": 9920 }, { "epoch": 1.6409832679198513, "grad_norm": 19.87029266357422, "learning_rate": 4.6438605189224924e-05, "loss": 1.5444, "step": 9930 }, { "epoch": 1.6426358190456516, "grad_norm": 28.15524673461914, "learning_rate": 4.64294239703263e-05, "loss": 1.6218, "step": 9940 }, { "epoch": 1.6442883701714521, "grad_norm": 22.747636795043945, "learning_rate": 4.642024275142768e-05, "loss": 1.5842, "step": 9950 }, { "epoch": 1.6459409212972527, "grad_norm": 12.167838096618652, "learning_rate": 4.641106153252906e-05, "loss": 1.5029, "step": 9960 }, { "epoch": 1.6475934724230532, "grad_norm": 9.144806861877441, "learning_rate": 4.640188031363044e-05, "loss": 1.6125, "step": 9970 }, { "epoch": 1.6492460235488535, "grad_norm": 11.655991554260254, "learning_rate": 4.639269909473182e-05, "loss": 1.5915, "step": 9980 }, { "epoch": 1.6508985746746538, "grad_norm": 29.67504119873047, "learning_rate": 4.63835178758332e-05, "loss": 1.6178, "step": 9990 }, { "epoch": 1.6525511258004544, "grad_norm": 18.424442291259766, "learning_rate": 4.6374336656934575e-05, "loss": 1.46, "step": 10000 }, { "epoch": 1.654203676926255, "grad_norm": 13.235391616821289, "learning_rate": 4.636515543803596e-05, "loss": 1.5958, "step": 10010 }, { "epoch": 1.6558562280520555, "grad_norm": 10.103015899658203, "learning_rate": 4.6355974219137334e-05, "loss": 1.7109, "step": 10020 }, { "epoch": 1.6575087791778558, "grad_norm": 10.416471481323242, "learning_rate": 4.6346793000238717e-05, "loss": 1.7097, "step": 10030 }, { "epoch": 1.6591613303036563, "grad_norm": 12.568252563476562, "learning_rate": 4.633761178134009e-05, "loss": 1.6269, "step": 10040 }, { "epoch": 1.6608138814294566, "grad_norm": 65.68162536621094, "learning_rate": 4.632843056244147e-05, "loss": 1.5952, "step": 10050 }, { "epoch": 1.6624664325552572, "grad_norm": 29.50044822692871, "learning_rate": 4.631924934354285e-05, "loss": 1.529, "step": 10060 }, { "epoch": 1.6641189836810577, "grad_norm": 33.595272064208984, "learning_rate": 4.631006812464423e-05, "loss": 1.5998, "step": 10070 }, { "epoch": 1.6657715348068582, "grad_norm": 8.986908912658691, "learning_rate": 4.630088690574561e-05, "loss": 1.7082, "step": 10080 }, { "epoch": 1.6674240859326586, "grad_norm": 12.171062469482422, "learning_rate": 4.6291705686846985e-05, "loss": 1.6, "step": 10090 }, { "epoch": 1.6690766370584589, "grad_norm": 28.181230545043945, "learning_rate": 4.628252446794837e-05, "loss": 1.6835, "step": 10100 }, { "epoch": 1.6707291881842594, "grad_norm": 13.5730562210083, "learning_rate": 4.6273343249049744e-05, "loss": 1.7877, "step": 10110 }, { "epoch": 1.67238173931006, "grad_norm": 13.940990447998047, "learning_rate": 4.6264162030151126e-05, "loss": 1.7061, "step": 10120 }, { "epoch": 1.6740342904358605, "grad_norm": 10.991562843322754, "learning_rate": 4.62549808112525e-05, "loss": 1.6114, "step": 10130 }, { "epoch": 1.6756868415616608, "grad_norm": 24.29729461669922, "learning_rate": 4.6245799592353885e-05, "loss": 1.6884, "step": 10140 }, { "epoch": 1.6773393926874611, "grad_norm": 11.747648239135742, "learning_rate": 4.623661837345526e-05, "loss": 1.6264, "step": 10150 }, { "epoch": 1.6789919438132617, "grad_norm": 10.77138614654541, "learning_rate": 4.6227437154556643e-05, "loss": 1.6901, "step": 10160 }, { "epoch": 1.6806444949390622, "grad_norm": 13.280838012695312, "learning_rate": 4.621825593565802e-05, "loss": 1.7122, "step": 10170 }, { "epoch": 1.6822970460648627, "grad_norm": 12.801493644714355, "learning_rate": 4.6209074716759395e-05, "loss": 1.5463, "step": 10180 }, { "epoch": 1.683949597190663, "grad_norm": 11.150125503540039, "learning_rate": 4.619989349786078e-05, "loss": 1.6348, "step": 10190 }, { "epoch": 1.6856021483164634, "grad_norm": 8.492911338806152, "learning_rate": 4.6190712278962154e-05, "loss": 1.545, "step": 10200 }, { "epoch": 1.687254699442264, "grad_norm": 7.204408168792725, "learning_rate": 4.6181531060063536e-05, "loss": 1.6983, "step": 10210 }, { "epoch": 1.6889072505680645, "grad_norm": 53.46291732788086, "learning_rate": 4.617234984116491e-05, "loss": 1.6098, "step": 10220 }, { "epoch": 1.690559801693865, "grad_norm": 5.803036212921143, "learning_rate": 4.6163168622266295e-05, "loss": 1.5576, "step": 10230 }, { "epoch": 1.6922123528196653, "grad_norm": 20.07267951965332, "learning_rate": 4.615398740336767e-05, "loss": 1.7001, "step": 10240 }, { "epoch": 1.6938649039454658, "grad_norm": 9.970727920532227, "learning_rate": 4.614480618446905e-05, "loss": 1.6336, "step": 10250 }, { "epoch": 1.6955174550712662, "grad_norm": 9.943961143493652, "learning_rate": 4.613562496557043e-05, "loss": 1.7672, "step": 10260 }, { "epoch": 1.6971700061970667, "grad_norm": 13.809309959411621, "learning_rate": 4.612644374667181e-05, "loss": 1.6802, "step": 10270 }, { "epoch": 1.6988225573228672, "grad_norm": 5.0867462158203125, "learning_rate": 4.6117262527773194e-05, "loss": 1.6411, "step": 10280 }, { "epoch": 1.7004751084486678, "grad_norm": 15.1895751953125, "learning_rate": 4.610808130887457e-05, "loss": 1.6637, "step": 10290 }, { "epoch": 1.702127659574468, "grad_norm": 5.628765106201172, "learning_rate": 4.6098900089975946e-05, "loss": 1.6516, "step": 10300 }, { "epoch": 1.7037802107002684, "grad_norm": 9.596723556518555, "learning_rate": 4.608971887107732e-05, "loss": 1.5923, "step": 10310 }, { "epoch": 1.705432761826069, "grad_norm": 6.9686455726623535, "learning_rate": 4.6080537652178705e-05, "loss": 1.5587, "step": 10320 }, { "epoch": 1.7070853129518695, "grad_norm": 12.421316146850586, "learning_rate": 4.607135643328008e-05, "loss": 1.6681, "step": 10330 }, { "epoch": 1.70873786407767, "grad_norm": 21.520915985107422, "learning_rate": 4.606217521438146e-05, "loss": 1.6138, "step": 10340 }, { "epoch": 1.7103904152034703, "grad_norm": 8.50918960571289, "learning_rate": 4.605299399548284e-05, "loss": 1.7301, "step": 10350 }, { "epoch": 1.7120429663292707, "grad_norm": 10.39163875579834, "learning_rate": 4.604381277658422e-05, "loss": 1.6752, "step": 10360 }, { "epoch": 1.7136955174550712, "grad_norm": 6.190021514892578, "learning_rate": 4.60346315576856e-05, "loss": 1.5676, "step": 10370 }, { "epoch": 1.7153480685808717, "grad_norm": 9.422684669494629, "learning_rate": 4.602545033878698e-05, "loss": 1.5699, "step": 10380 }, { "epoch": 1.7170006197066723, "grad_norm": 7.96755838394165, "learning_rate": 4.601626911988836e-05, "loss": 1.6369, "step": 10390 }, { "epoch": 1.7186531708324726, "grad_norm": 6.596837997436523, "learning_rate": 4.600708790098974e-05, "loss": 1.6232, "step": 10400 }, { "epoch": 1.7203057219582731, "grad_norm": 18.394948959350586, "learning_rate": 4.599790668209112e-05, "loss": 1.6457, "step": 10410 }, { "epoch": 1.7219582730840735, "grad_norm": 9.514167785644531, "learning_rate": 4.59887254631925e-05, "loss": 1.5002, "step": 10420 }, { "epoch": 1.723610824209874, "grad_norm": 15.90142822265625, "learning_rate": 4.597954424429387e-05, "loss": 1.6668, "step": 10430 }, { "epoch": 1.7252633753356745, "grad_norm": 14.313610076904297, "learning_rate": 4.597036302539525e-05, "loss": 1.6253, "step": 10440 }, { "epoch": 1.7269159264614748, "grad_norm": 5.3916144371032715, "learning_rate": 4.596118180649663e-05, "loss": 1.5947, "step": 10450 }, { "epoch": 1.7285684775872754, "grad_norm": 9.352823257446289, "learning_rate": 4.595200058759801e-05, "loss": 1.502, "step": 10460 }, { "epoch": 1.7302210287130757, "grad_norm": 27.035520553588867, "learning_rate": 4.594281936869939e-05, "loss": 1.7585, "step": 10470 }, { "epoch": 1.7318735798388762, "grad_norm": 5.941956520080566, "learning_rate": 4.5933638149800766e-05, "loss": 1.8005, "step": 10480 }, { "epoch": 1.7335261309646768, "grad_norm": 7.316133499145508, "learning_rate": 4.592445693090215e-05, "loss": 1.5243, "step": 10490 }, { "epoch": 1.7351786820904773, "grad_norm": 7.501877784729004, "learning_rate": 4.591527571200353e-05, "loss": 1.5457, "step": 10500 }, { "epoch": 1.7368312332162776, "grad_norm": 8.191024780273438, "learning_rate": 4.590609449310491e-05, "loss": 1.6192, "step": 10510 }, { "epoch": 1.738483784342078, "grad_norm": 7.398270130157471, "learning_rate": 4.589691327420629e-05, "loss": 1.713, "step": 10520 }, { "epoch": 1.7401363354678785, "grad_norm": 7.048802375793457, "learning_rate": 4.5887732055307665e-05, "loss": 1.7066, "step": 10530 }, { "epoch": 1.741788886593679, "grad_norm": 6.611818313598633, "learning_rate": 4.587855083640905e-05, "loss": 1.5422, "step": 10540 }, { "epoch": 1.7434414377194796, "grad_norm": 11.106283187866211, "learning_rate": 4.5869369617510424e-05, "loss": 1.6228, "step": 10550 }, { "epoch": 1.7450939888452799, "grad_norm": 17.441743850708008, "learning_rate": 4.58601883986118e-05, "loss": 1.6286, "step": 10560 }, { "epoch": 1.7467465399710802, "grad_norm": 17.710969924926758, "learning_rate": 4.585100717971318e-05, "loss": 1.5696, "step": 10570 }, { "epoch": 1.7483990910968807, "grad_norm": 12.676824569702148, "learning_rate": 4.584182596081456e-05, "loss": 1.6077, "step": 10580 }, { "epoch": 1.7500516422226813, "grad_norm": 8.125213623046875, "learning_rate": 4.5832644741915934e-05, "loss": 1.7088, "step": 10590 }, { "epoch": 1.7517041933484818, "grad_norm": 7.755391597747803, "learning_rate": 4.582346352301732e-05, "loss": 1.638, "step": 10600 }, { "epoch": 1.7533567444742821, "grad_norm": 19.214263916015625, "learning_rate": 4.58142823041187e-05, "loss": 1.5969, "step": 10610 }, { "epoch": 1.7550092956000827, "grad_norm": 32.08985900878906, "learning_rate": 4.5805101085220075e-05, "loss": 1.7992, "step": 10620 }, { "epoch": 1.756661846725883, "grad_norm": 6.143647193908691, "learning_rate": 4.579591986632146e-05, "loss": 1.6176, "step": 10630 }, { "epoch": 1.7583143978516835, "grad_norm": 12.872699737548828, "learning_rate": 4.5786738647422834e-05, "loss": 1.5866, "step": 10640 }, { "epoch": 1.759966948977484, "grad_norm": 7.934300899505615, "learning_rate": 4.5777557428524216e-05, "loss": 1.5976, "step": 10650 }, { "epoch": 1.7616195001032846, "grad_norm": 6.824174404144287, "learning_rate": 4.576837620962559e-05, "loss": 1.5911, "step": 10660 }, { "epoch": 1.763272051229085, "grad_norm": 7.65330696105957, "learning_rate": 4.5759194990726975e-05, "loss": 1.5306, "step": 10670 }, { "epoch": 1.7649246023548852, "grad_norm": 4.150641918182373, "learning_rate": 4.575001377182835e-05, "loss": 1.6023, "step": 10680 }, { "epoch": 1.7665771534806858, "grad_norm": 15.337532997131348, "learning_rate": 4.574083255292973e-05, "loss": 1.5932, "step": 10690 }, { "epoch": 1.7682297046064863, "grad_norm": 10.681439399719238, "learning_rate": 4.573165133403111e-05, "loss": 1.7345, "step": 10700 }, { "epoch": 1.7698822557322869, "grad_norm": 21.317352294921875, "learning_rate": 4.5722470115132485e-05, "loss": 1.6743, "step": 10710 }, { "epoch": 1.7715348068580872, "grad_norm": 25.704092025756836, "learning_rate": 4.571328889623386e-05, "loss": 1.3808, "step": 10720 }, { "epoch": 1.7731873579838875, "grad_norm": 7.380794525146484, "learning_rate": 4.5704107677335244e-05, "loss": 1.6079, "step": 10730 }, { "epoch": 1.774839909109688, "grad_norm": 12.925481796264648, "learning_rate": 4.5694926458436626e-05, "loss": 1.656, "step": 10740 }, { "epoch": 1.7764924602354886, "grad_norm": 62.927032470703125, "learning_rate": 4.5685745239538e-05, "loss": 1.7919, "step": 10750 }, { "epoch": 1.778145011361289, "grad_norm": 13.639461517333984, "learning_rate": 4.5676564020639385e-05, "loss": 1.6534, "step": 10760 }, { "epoch": 1.7797975624870894, "grad_norm": 7.046122074127197, "learning_rate": 4.566738280174076e-05, "loss": 1.549, "step": 10770 }, { "epoch": 1.7814501136128897, "grad_norm": 12.599806785583496, "learning_rate": 4.565820158284214e-05, "loss": 1.6526, "step": 10780 }, { "epoch": 1.7831026647386903, "grad_norm": 8.485870361328125, "learning_rate": 4.564902036394352e-05, "loss": 1.7176, "step": 10790 }, { "epoch": 1.7847552158644908, "grad_norm": 16.592004776000977, "learning_rate": 4.56398391450449e-05, "loss": 1.5627, "step": 10800 }, { "epoch": 1.7864077669902914, "grad_norm": 11.336044311523438, "learning_rate": 4.563065792614628e-05, "loss": 1.6818, "step": 10810 }, { "epoch": 1.7880603181160917, "grad_norm": 8.369160652160645, "learning_rate": 4.5621476707247653e-05, "loss": 1.6238, "step": 10820 }, { "epoch": 1.7897128692418922, "grad_norm": 9.106146812438965, "learning_rate": 4.5612295488349036e-05, "loss": 1.4657, "step": 10830 }, { "epoch": 1.7913654203676925, "grad_norm": 18.96662712097168, "learning_rate": 4.560311426945041e-05, "loss": 1.6315, "step": 10840 }, { "epoch": 1.793017971493493, "grad_norm": 9.854429244995117, "learning_rate": 4.5593933050551795e-05, "loss": 1.685, "step": 10850 }, { "epoch": 1.7946705226192936, "grad_norm": 20.518041610717773, "learning_rate": 4.558475183165317e-05, "loss": 1.6613, "step": 10860 }, { "epoch": 1.7963230737450941, "grad_norm": 16.10544204711914, "learning_rate": 4.557557061275455e-05, "loss": 1.6003, "step": 10870 }, { "epoch": 1.7979756248708945, "grad_norm": 28.675254821777344, "learning_rate": 4.556638939385593e-05, "loss": 1.5901, "step": 10880 }, { "epoch": 1.7996281759966948, "grad_norm": 27.472984313964844, "learning_rate": 4.555720817495731e-05, "loss": 1.6684, "step": 10890 }, { "epoch": 1.8012807271224953, "grad_norm": 7.602771759033203, "learning_rate": 4.554802695605869e-05, "loss": 1.8076, "step": 10900 }, { "epoch": 1.8029332782482959, "grad_norm": 8.444791793823242, "learning_rate": 4.553884573716007e-05, "loss": 1.7259, "step": 10910 }, { "epoch": 1.8045858293740964, "grad_norm": 8.157547950744629, "learning_rate": 4.5529664518261446e-05, "loss": 1.7186, "step": 10920 }, { "epoch": 1.8062383804998967, "grad_norm": 7.917896270751953, "learning_rate": 4.552048329936283e-05, "loss": 1.6898, "step": 10930 }, { "epoch": 1.807890931625697, "grad_norm": 12.761302947998047, "learning_rate": 4.5511302080464204e-05, "loss": 1.6634, "step": 10940 }, { "epoch": 1.8095434827514976, "grad_norm": 5.908732891082764, "learning_rate": 4.550212086156558e-05, "loss": 1.6357, "step": 10950 }, { "epoch": 1.811196033877298, "grad_norm": 6.7102766036987305, "learning_rate": 4.549293964266696e-05, "loss": 1.6942, "step": 10960 }, { "epoch": 1.8128485850030986, "grad_norm": 117.56930541992188, "learning_rate": 4.548375842376834e-05, "loss": 1.6321, "step": 10970 }, { "epoch": 1.814501136128899, "grad_norm": 9.186972618103027, "learning_rate": 4.547457720486972e-05, "loss": 1.6335, "step": 10980 }, { "epoch": 1.8161536872546993, "grad_norm": 12.602619171142578, "learning_rate": 4.54653959859711e-05, "loss": 1.6513, "step": 10990 }, { "epoch": 1.8178062383804998, "grad_norm": 6.2811279296875, "learning_rate": 4.545621476707248e-05, "loss": 1.5188, "step": 11000 }, { "epoch": 1.8194587895063004, "grad_norm": 15.940876960754395, "learning_rate": 4.5447033548173856e-05, "loss": 1.7214, "step": 11010 }, { "epoch": 1.821111340632101, "grad_norm": 8.485485076904297, "learning_rate": 4.543785232927524e-05, "loss": 1.7004, "step": 11020 }, { "epoch": 1.8227638917579012, "grad_norm": 37.532020568847656, "learning_rate": 4.5428671110376614e-05, "loss": 1.5909, "step": 11030 }, { "epoch": 1.8244164428837017, "grad_norm": 10.41656494140625, "learning_rate": 4.5419489891478e-05, "loss": 1.5895, "step": 11040 }, { "epoch": 1.826068994009502, "grad_norm": 5.756435394287109, "learning_rate": 4.541030867257937e-05, "loss": 1.6121, "step": 11050 }, { "epoch": 1.8277215451353026, "grad_norm": 16.178802490234375, "learning_rate": 4.5401127453680755e-05, "loss": 1.5903, "step": 11060 }, { "epoch": 1.8293740962611031, "grad_norm": 10.916646957397461, "learning_rate": 4.539194623478213e-05, "loss": 1.7007, "step": 11070 }, { "epoch": 1.8310266473869037, "grad_norm": 6.1422224044799805, "learning_rate": 4.538276501588351e-05, "loss": 1.6766, "step": 11080 }, { "epoch": 1.832679198512704, "grad_norm": 15.099671363830566, "learning_rate": 4.537358379698489e-05, "loss": 1.6593, "step": 11090 }, { "epoch": 1.8343317496385043, "grad_norm": 19.930587768554688, "learning_rate": 4.5364402578086266e-05, "loss": 1.6625, "step": 11100 }, { "epoch": 1.8359843007643049, "grad_norm": 9.389049530029297, "learning_rate": 4.535522135918765e-05, "loss": 1.6434, "step": 11110 }, { "epoch": 1.8376368518901054, "grad_norm": 10.457796096801758, "learning_rate": 4.5346040140289024e-05, "loss": 1.6743, "step": 11120 }, { "epoch": 1.839289403015906, "grad_norm": 9.54992961883545, "learning_rate": 4.533685892139041e-05, "loss": 1.6617, "step": 11130 }, { "epoch": 1.8409419541417062, "grad_norm": 19.115575790405273, "learning_rate": 4.532767770249178e-05, "loss": 1.6583, "step": 11140 }, { "epoch": 1.8425945052675066, "grad_norm": 9.527848243713379, "learning_rate": 4.5318496483593165e-05, "loss": 1.5821, "step": 11150 }, { "epoch": 1.844247056393307, "grad_norm": 9.40245246887207, "learning_rate": 4.530931526469454e-05, "loss": 1.6554, "step": 11160 }, { "epoch": 1.8458996075191076, "grad_norm": 18.67144012451172, "learning_rate": 4.5300134045795924e-05, "loss": 1.7357, "step": 11170 }, { "epoch": 1.8475521586449082, "grad_norm": 25.492530822753906, "learning_rate": 4.5290952826897306e-05, "loss": 1.6508, "step": 11180 }, { "epoch": 1.8492047097707085, "grad_norm": 13.621424674987793, "learning_rate": 4.528177160799868e-05, "loss": 1.7588, "step": 11190 }, { "epoch": 1.850857260896509, "grad_norm": 12.195111274719238, "learning_rate": 4.527259038910006e-05, "loss": 1.6934, "step": 11200 }, { "epoch": 1.8525098120223094, "grad_norm": 9.302632331848145, "learning_rate": 4.526340917020144e-05, "loss": 1.5907, "step": 11210 }, { "epoch": 1.85416236314811, "grad_norm": 6.693589687347412, "learning_rate": 4.5254227951302817e-05, "loss": 1.7577, "step": 11220 }, { "epoch": 1.8558149142739104, "grad_norm": 7.7038116455078125, "learning_rate": 4.524504673240419e-05, "loss": 1.574, "step": 11230 }, { "epoch": 1.8574674653997107, "grad_norm": 10.348893165588379, "learning_rate": 4.5235865513505575e-05, "loss": 1.5739, "step": 11240 }, { "epoch": 1.8591200165255113, "grad_norm": 12.050825119018555, "learning_rate": 4.522668429460695e-05, "loss": 1.658, "step": 11250 }, { "epoch": 1.8607725676513116, "grad_norm": 47.17829132080078, "learning_rate": 4.5217503075708334e-05, "loss": 1.4564, "step": 11260 }, { "epoch": 1.8624251187771121, "grad_norm": 8.023752212524414, "learning_rate": 4.520832185680971e-05, "loss": 1.5252, "step": 11270 }, { "epoch": 1.8640776699029127, "grad_norm": 11.899310111999512, "learning_rate": 4.519914063791109e-05, "loss": 1.7046, "step": 11280 }, { "epoch": 1.8657302210287132, "grad_norm": 7.167046546936035, "learning_rate": 4.518995941901247e-05, "loss": 1.6141, "step": 11290 }, { "epoch": 1.8673827721545135, "grad_norm": 42.9312858581543, "learning_rate": 4.518077820011385e-05, "loss": 1.6363, "step": 11300 }, { "epoch": 1.8690353232803139, "grad_norm": 12.356457710266113, "learning_rate": 4.517159698121523e-05, "loss": 1.7502, "step": 11310 }, { "epoch": 1.8706878744061144, "grad_norm": 5.740696430206299, "learning_rate": 4.516241576231661e-05, "loss": 1.6006, "step": 11320 }, { "epoch": 1.872340425531915, "grad_norm": 5.727828502655029, "learning_rate": 4.5153234543417985e-05, "loss": 1.6271, "step": 11330 }, { "epoch": 1.8739929766577155, "grad_norm": 10.743067741394043, "learning_rate": 4.514405332451937e-05, "loss": 1.5755, "step": 11340 }, { "epoch": 1.8756455277835158, "grad_norm": 14.965692520141602, "learning_rate": 4.5134872105620743e-05, "loss": 1.53, "step": 11350 }, { "epoch": 1.877298078909316, "grad_norm": 11.617084503173828, "learning_rate": 4.512569088672212e-05, "loss": 1.6813, "step": 11360 }, { "epoch": 1.8789506300351166, "grad_norm": 253.05593872070312, "learning_rate": 4.51165096678235e-05, "loss": 1.5946, "step": 11370 }, { "epoch": 1.8806031811609172, "grad_norm": 9.766687393188477, "learning_rate": 4.510732844892488e-05, "loss": 1.5899, "step": 11380 }, { "epoch": 1.8822557322867177, "grad_norm": 22.282573699951172, "learning_rate": 4.509814723002626e-05, "loss": 1.6214, "step": 11390 }, { "epoch": 1.883908283412518, "grad_norm": 7.993402481079102, "learning_rate": 4.5088966011127636e-05, "loss": 1.4566, "step": 11400 }, { "epoch": 1.8855608345383186, "grad_norm": 8.523614883422852, "learning_rate": 4.507978479222902e-05, "loss": 1.6375, "step": 11410 }, { "epoch": 1.887213385664119, "grad_norm": 32.4261474609375, "learning_rate": 4.50706035733304e-05, "loss": 1.6616, "step": 11420 }, { "epoch": 1.8888659367899194, "grad_norm": 9.417035102844238, "learning_rate": 4.506142235443178e-05, "loss": 1.5917, "step": 11430 }, { "epoch": 1.89051848791572, "grad_norm": 46.04423904418945, "learning_rate": 4.505224113553316e-05, "loss": 1.6917, "step": 11440 }, { "epoch": 1.8921710390415205, "grad_norm": 14.668797492980957, "learning_rate": 4.5043059916634536e-05, "loss": 1.5452, "step": 11450 }, { "epoch": 1.8938235901673208, "grad_norm": 10.886049270629883, "learning_rate": 4.503387869773591e-05, "loss": 1.5244, "step": 11460 }, { "epoch": 1.8954761412931211, "grad_norm": 10.956747055053711, "learning_rate": 4.5024697478837294e-05, "loss": 1.5967, "step": 11470 }, { "epoch": 1.8971286924189217, "grad_norm": 15.368989944458008, "learning_rate": 4.501551625993867e-05, "loss": 1.5721, "step": 11480 }, { "epoch": 1.8987812435447222, "grad_norm": 8.719017028808594, "learning_rate": 4.5006335041040046e-05, "loss": 1.6545, "step": 11490 }, { "epoch": 1.9004337946705228, "grad_norm": 10.52745246887207, "learning_rate": 4.499715382214143e-05, "loss": 1.6317, "step": 11500 }, { "epoch": 1.902086345796323, "grad_norm": 27.014604568481445, "learning_rate": 4.4987972603242805e-05, "loss": 1.4449, "step": 11510 }, { "epoch": 1.9037388969221234, "grad_norm": 14.527423858642578, "learning_rate": 4.497879138434419e-05, "loss": 1.6464, "step": 11520 }, { "epoch": 1.905391448047924, "grad_norm": 28.174190521240234, "learning_rate": 4.496961016544557e-05, "loss": 1.5219, "step": 11530 }, { "epoch": 1.9070439991737245, "grad_norm": 10.394495964050293, "learning_rate": 4.4960428946546946e-05, "loss": 1.6479, "step": 11540 }, { "epoch": 1.908696550299525, "grad_norm": 29.0770263671875, "learning_rate": 4.495124772764833e-05, "loss": 1.5932, "step": 11550 }, { "epoch": 1.9103491014253253, "grad_norm": 6.238829612731934, "learning_rate": 4.4942066508749704e-05, "loss": 1.5789, "step": 11560 }, { "epoch": 1.9120016525511256, "grad_norm": 7.41868257522583, "learning_rate": 4.493288528985109e-05, "loss": 1.5565, "step": 11570 }, { "epoch": 1.9136542036769262, "grad_norm": 18.375661849975586, "learning_rate": 4.492370407095246e-05, "loss": 1.6516, "step": 11580 }, { "epoch": 1.9153067548027267, "grad_norm": 11.252774238586426, "learning_rate": 4.491452285205384e-05, "loss": 1.4842, "step": 11590 }, { "epoch": 1.9169593059285273, "grad_norm": 13.860923767089844, "learning_rate": 4.490534163315522e-05, "loss": 1.6212, "step": 11600 }, { "epoch": 1.9186118570543276, "grad_norm": 6.298261642456055, "learning_rate": 4.48961604142566e-05, "loss": 1.5285, "step": 11610 }, { "epoch": 1.9202644081801281, "grad_norm": 19.169706344604492, "learning_rate": 4.488697919535797e-05, "loss": 1.6163, "step": 11620 }, { "epoch": 1.9219169593059284, "grad_norm": 15.898575782775879, "learning_rate": 4.4877797976459356e-05, "loss": 1.609, "step": 11630 }, { "epoch": 1.923569510431729, "grad_norm": 16.889341354370117, "learning_rate": 4.486861675756074e-05, "loss": 1.5827, "step": 11640 }, { "epoch": 1.9252220615575295, "grad_norm": 8.887215614318848, "learning_rate": 4.4859435538662114e-05, "loss": 1.581, "step": 11650 }, { "epoch": 1.92687461268333, "grad_norm": 27.01335906982422, "learning_rate": 4.48502543197635e-05, "loss": 1.7247, "step": 11660 }, { "epoch": 1.9285271638091304, "grad_norm": 7.634554862976074, "learning_rate": 4.484107310086487e-05, "loss": 1.5232, "step": 11670 }, { "epoch": 1.9301797149349307, "grad_norm": 17.882055282592773, "learning_rate": 4.4831891881966255e-05, "loss": 1.5038, "step": 11680 }, { "epoch": 1.9318322660607312, "grad_norm": 24.722803115844727, "learning_rate": 4.482271066306763e-05, "loss": 1.4617, "step": 11690 }, { "epoch": 1.9334848171865318, "grad_norm": 62.85787582397461, "learning_rate": 4.4813529444169014e-05, "loss": 1.6762, "step": 11700 }, { "epoch": 1.9351373683123323, "grad_norm": 20.34491729736328, "learning_rate": 4.480434822527039e-05, "loss": 1.7011, "step": 11710 }, { "epoch": 1.9367899194381326, "grad_norm": 8.389205932617188, "learning_rate": 4.4795167006371765e-05, "loss": 1.6886, "step": 11720 }, { "epoch": 1.938442470563933, "grad_norm": 7.912249565124512, "learning_rate": 4.478598578747315e-05, "loss": 1.6878, "step": 11730 }, { "epoch": 1.9400950216897335, "grad_norm": 26.233190536499023, "learning_rate": 4.4776804568574524e-05, "loss": 1.7822, "step": 11740 }, { "epoch": 1.941747572815534, "grad_norm": 24.81481170654297, "learning_rate": 4.4767623349675907e-05, "loss": 1.7393, "step": 11750 }, { "epoch": 1.9434001239413345, "grad_norm": 19.601308822631836, "learning_rate": 4.475844213077728e-05, "loss": 1.5535, "step": 11760 }, { "epoch": 1.9450526750671349, "grad_norm": 9.827634811401367, "learning_rate": 4.4749260911878665e-05, "loss": 1.5759, "step": 11770 }, { "epoch": 1.9467052261929352, "grad_norm": 10.674678802490234, "learning_rate": 4.474007969298004e-05, "loss": 1.6419, "step": 11780 }, { "epoch": 1.9483577773187357, "grad_norm": 13.497589111328125, "learning_rate": 4.4730898474081424e-05, "loss": 1.6282, "step": 11790 }, { "epoch": 1.9500103284445363, "grad_norm": 9.30100154876709, "learning_rate": 4.47217172551828e-05, "loss": 1.6183, "step": 11800 }, { "epoch": 1.9516628795703368, "grad_norm": 9.254987716674805, "learning_rate": 4.471253603628418e-05, "loss": 1.6552, "step": 11810 }, { "epoch": 1.9533154306961371, "grad_norm": 14.543704986572266, "learning_rate": 4.470335481738556e-05, "loss": 1.5962, "step": 11820 }, { "epoch": 1.9549679818219377, "grad_norm": 17.067127227783203, "learning_rate": 4.469417359848694e-05, "loss": 1.6292, "step": 11830 }, { "epoch": 1.956620532947738, "grad_norm": 49.91341018676758, "learning_rate": 4.4684992379588316e-05, "loss": 1.6171, "step": 11840 }, { "epoch": 1.9582730840735385, "grad_norm": 11.141582489013672, "learning_rate": 4.467581116068969e-05, "loss": 1.6286, "step": 11850 }, { "epoch": 1.959925635199339, "grad_norm": 148.42442321777344, "learning_rate": 4.4666629941791075e-05, "loss": 1.598, "step": 11860 }, { "epoch": 1.9615781863251396, "grad_norm": 7.553905010223389, "learning_rate": 4.465744872289245e-05, "loss": 1.7413, "step": 11870 }, { "epoch": 1.96323073745094, "grad_norm": 8.26811408996582, "learning_rate": 4.464826750399383e-05, "loss": 1.7399, "step": 11880 }, { "epoch": 1.9648832885767402, "grad_norm": 12.610078811645508, "learning_rate": 4.463908628509521e-05, "loss": 1.6173, "step": 11890 }, { "epoch": 1.9665358397025408, "grad_norm": 12.24221134185791, "learning_rate": 4.462990506619659e-05, "loss": 1.5597, "step": 11900 }, { "epoch": 1.9681883908283413, "grad_norm": 32.20631790161133, "learning_rate": 4.462072384729797e-05, "loss": 1.6357, "step": 11910 }, { "epoch": 1.9698409419541418, "grad_norm": 13.021040916442871, "learning_rate": 4.461154262839935e-05, "loss": 1.5987, "step": 11920 }, { "epoch": 1.9714934930799421, "grad_norm": 9.593809127807617, "learning_rate": 4.4602361409500726e-05, "loss": 1.5853, "step": 11930 }, { "epoch": 1.9731460442057425, "grad_norm": 6.821667194366455, "learning_rate": 4.459318019060211e-05, "loss": 1.5064, "step": 11940 }, { "epoch": 1.974798595331543, "grad_norm": 15.331339836120605, "learning_rate": 4.4583998971703485e-05, "loss": 1.6008, "step": 11950 }, { "epoch": 1.9764511464573435, "grad_norm": 8.000182151794434, "learning_rate": 4.457481775280487e-05, "loss": 1.655, "step": 11960 }, { "epoch": 1.978103697583144, "grad_norm": 8.434917449951172, "learning_rate": 4.456563653390624e-05, "loss": 1.753, "step": 11970 }, { "epoch": 1.9797562487089444, "grad_norm": 48.595760345458984, "learning_rate": 4.4556455315007626e-05, "loss": 1.685, "step": 11980 }, { "epoch": 1.981408799834745, "grad_norm": 9.696410179138184, "learning_rate": 4.4547274096109e-05, "loss": 1.6055, "step": 11990 }, { "epoch": 1.9830613509605453, "grad_norm": 9.626373291015625, "learning_rate": 4.453809287721038e-05, "loss": 1.7424, "step": 12000 }, { "epoch": 1.9847139020863458, "grad_norm": 62.46161651611328, "learning_rate": 4.452891165831176e-05, "loss": 1.596, "step": 12010 }, { "epoch": 1.9863664532121463, "grad_norm": 25.460296630859375, "learning_rate": 4.4519730439413136e-05, "loss": 1.7073, "step": 12020 }, { "epoch": 1.9880190043379469, "grad_norm": 6.691403865814209, "learning_rate": 4.451054922051452e-05, "loss": 1.61, "step": 12030 }, { "epoch": 1.9896715554637472, "grad_norm": 9.661092758178711, "learning_rate": 4.4501368001615895e-05, "loss": 1.6125, "step": 12040 }, { "epoch": 1.9913241065895475, "grad_norm": 26.178430557250977, "learning_rate": 4.449218678271728e-05, "loss": 1.5553, "step": 12050 }, { "epoch": 1.992976657715348, "grad_norm": 96.2996826171875, "learning_rate": 4.448300556381865e-05, "loss": 1.7229, "step": 12060 }, { "epoch": 1.9946292088411486, "grad_norm": 18.224470138549805, "learning_rate": 4.4473824344920036e-05, "loss": 1.6561, "step": 12070 }, { "epoch": 1.9962817599669491, "grad_norm": 12.857390403747559, "learning_rate": 4.446464312602141e-05, "loss": 1.5519, "step": 12080 }, { "epoch": 1.9979343110927494, "grad_norm": 65.5601577758789, "learning_rate": 4.4455461907122794e-05, "loss": 1.5376, "step": 12090 }, { "epoch": 1.9995868622185498, "grad_norm": 31.238407135009766, "learning_rate": 4.444628068822417e-05, "loss": 1.6433, "step": 12100 }, { "epoch": 1.99991737244371, "eval_accuracy": 0.2733721095190807, "eval_loss": 2.2641355991363525, "eval_runtime": 819.3221, "eval_samples_per_second": 34.414, "eval_steps_per_second": 8.603, "step": 12102 }, { "epoch": 2.0012394133443503, "grad_norm": 44.95764923095703, "learning_rate": 4.443709946932555e-05, "loss": 1.6572, "step": 12110 }, { "epoch": 2.002891964470151, "grad_norm": 28.433679580688477, "learning_rate": 4.442791825042693e-05, "loss": 1.5288, "step": 12120 }, { "epoch": 2.0045445155959514, "grad_norm": 20.113248825073242, "learning_rate": 4.4418737031528304e-05, "loss": 1.5625, "step": 12130 }, { "epoch": 2.006197066721752, "grad_norm": 18.262046813964844, "learning_rate": 4.440955581262969e-05, "loss": 1.6292, "step": 12140 }, { "epoch": 2.007849617847552, "grad_norm": 9.9489164352417, "learning_rate": 4.440037459373106e-05, "loss": 1.7362, "step": 12150 }, { "epoch": 2.0095021689733525, "grad_norm": 71.15792846679688, "learning_rate": 4.4391193374832446e-05, "loss": 1.7299, "step": 12160 }, { "epoch": 2.011154720099153, "grad_norm": 12.936304092407227, "learning_rate": 4.438201215593382e-05, "loss": 1.5459, "step": 12170 }, { "epoch": 2.0128072712249536, "grad_norm": 17.437440872192383, "learning_rate": 4.4372830937035204e-05, "loss": 1.604, "step": 12180 }, { "epoch": 2.014459822350754, "grad_norm": 12.949777603149414, "learning_rate": 4.436364971813658e-05, "loss": 1.5845, "step": 12190 }, { "epoch": 2.0161123734765543, "grad_norm": 34.75144958496094, "learning_rate": 4.435446849923796e-05, "loss": 1.7017, "step": 12200 }, { "epoch": 2.017764924602355, "grad_norm": 25.475833892822266, "learning_rate": 4.4345287280339345e-05, "loss": 1.6947, "step": 12210 }, { "epoch": 2.0194174757281553, "grad_norm": 10.156431198120117, "learning_rate": 4.433610606144072e-05, "loss": 1.6284, "step": 12220 }, { "epoch": 2.021070026853956, "grad_norm": 19.941617965698242, "learning_rate": 4.43269248425421e-05, "loss": 1.6042, "step": 12230 }, { "epoch": 2.0227225779797564, "grad_norm": 9.263725280761719, "learning_rate": 4.431774362364348e-05, "loss": 1.6048, "step": 12240 }, { "epoch": 2.0243751291055565, "grad_norm": 7.461373805999756, "learning_rate": 4.4308562404744855e-05, "loss": 1.6171, "step": 12250 }, { "epoch": 2.026027680231357, "grad_norm": 28.221065521240234, "learning_rate": 4.429938118584623e-05, "loss": 1.6176, "step": 12260 }, { "epoch": 2.0276802313571576, "grad_norm": 6.061464309692383, "learning_rate": 4.4290199966947614e-05, "loss": 1.5904, "step": 12270 }, { "epoch": 2.029332782482958, "grad_norm": 7.372241973876953, "learning_rate": 4.428101874804899e-05, "loss": 1.5756, "step": 12280 }, { "epoch": 2.0309853336087587, "grad_norm": 5.46327543258667, "learning_rate": 4.427183752915037e-05, "loss": 1.6985, "step": 12290 }, { "epoch": 2.0326378847345588, "grad_norm": 8.150008201599121, "learning_rate": 4.426265631025175e-05, "loss": 1.6157, "step": 12300 }, { "epoch": 2.0342904358603593, "grad_norm": 7.626075267791748, "learning_rate": 4.425347509135313e-05, "loss": 1.5679, "step": 12310 }, { "epoch": 2.03594298698616, "grad_norm": 14.908061981201172, "learning_rate": 4.424429387245451e-05, "loss": 1.548, "step": 12320 }, { "epoch": 2.0375955381119604, "grad_norm": 7.012806415557861, "learning_rate": 4.423511265355589e-05, "loss": 1.7013, "step": 12330 }, { "epoch": 2.039248089237761, "grad_norm": 5.985696792602539, "learning_rate": 4.422593143465727e-05, "loss": 1.543, "step": 12340 }, { "epoch": 2.0409006403635614, "grad_norm": 20.78647232055664, "learning_rate": 4.421675021575865e-05, "loss": 1.6405, "step": 12350 }, { "epoch": 2.0425531914893615, "grad_norm": 11.975393295288086, "learning_rate": 4.4207568996860024e-05, "loss": 1.6962, "step": 12360 }, { "epoch": 2.044205742615162, "grad_norm": 8.53466796875, "learning_rate": 4.4198387777961406e-05, "loss": 1.6913, "step": 12370 }, { "epoch": 2.0458582937409626, "grad_norm": 9.030325889587402, "learning_rate": 4.418920655906278e-05, "loss": 1.578, "step": 12380 }, { "epoch": 2.047510844866763, "grad_norm": 21.684886932373047, "learning_rate": 4.418002534016416e-05, "loss": 1.5179, "step": 12390 }, { "epoch": 2.0491633959925637, "grad_norm": 13.426318168640137, "learning_rate": 4.417084412126554e-05, "loss": 1.637, "step": 12400 }, { "epoch": 2.050815947118364, "grad_norm": 6.491002082824707, "learning_rate": 4.4161662902366917e-05, "loss": 1.5211, "step": 12410 }, { "epoch": 2.0524684982441643, "grad_norm": 8.91051197052002, "learning_rate": 4.41524816834683e-05, "loss": 1.6806, "step": 12420 }, { "epoch": 2.054121049369965, "grad_norm": 26.088380813598633, "learning_rate": 4.4143300464569675e-05, "loss": 1.6835, "step": 12430 }, { "epoch": 2.0557736004957654, "grad_norm": 9.795445442199707, "learning_rate": 4.413411924567106e-05, "loss": 1.7466, "step": 12440 }, { "epoch": 2.057426151621566, "grad_norm": 16.947595596313477, "learning_rate": 4.412493802677244e-05, "loss": 1.6179, "step": 12450 }, { "epoch": 2.059078702747366, "grad_norm": 14.341743469238281, "learning_rate": 4.4115756807873816e-05, "loss": 1.6692, "step": 12460 }, { "epoch": 2.0607312538731666, "grad_norm": 8.210740089416504, "learning_rate": 4.41065755889752e-05, "loss": 1.5425, "step": 12470 }, { "epoch": 2.062383804998967, "grad_norm": 16.021188735961914, "learning_rate": 4.4097394370076575e-05, "loss": 1.466, "step": 12480 }, { "epoch": 2.0640363561247677, "grad_norm": 25.511781692504883, "learning_rate": 4.408821315117795e-05, "loss": 1.6227, "step": 12490 }, { "epoch": 2.065688907250568, "grad_norm": 10.479546546936035, "learning_rate": 4.407903193227933e-05, "loss": 1.6323, "step": 12500 }, { "epoch": 2.0673414583763687, "grad_norm": 29.294937133789062, "learning_rate": 4.406985071338071e-05, "loss": 1.7135, "step": 12510 }, { "epoch": 2.068994009502169, "grad_norm": 9.334646224975586, "learning_rate": 4.4060669494482085e-05, "loss": 1.4985, "step": 12520 }, { "epoch": 2.0706465606279694, "grad_norm": 6.601807117462158, "learning_rate": 4.405148827558347e-05, "loss": 1.6179, "step": 12530 }, { "epoch": 2.07229911175377, "grad_norm": 39.724403381347656, "learning_rate": 4.4042307056684843e-05, "loss": 1.5593, "step": 12540 }, { "epoch": 2.0739516628795704, "grad_norm": 21.564411163330078, "learning_rate": 4.4033125837786226e-05, "loss": 1.6602, "step": 12550 }, { "epoch": 2.075604214005371, "grad_norm": 9.211846351623535, "learning_rate": 4.402394461888761e-05, "loss": 1.5664, "step": 12560 }, { "epoch": 2.077256765131171, "grad_norm": 86.83989715576172, "learning_rate": 4.4014763399988985e-05, "loss": 1.6355, "step": 12570 }, { "epoch": 2.0789093162569716, "grad_norm": 15.011252403259277, "learning_rate": 4.400558218109037e-05, "loss": 1.7233, "step": 12580 }, { "epoch": 2.080561867382772, "grad_norm": 22.9763240814209, "learning_rate": 4.399640096219174e-05, "loss": 1.6327, "step": 12590 }, { "epoch": 2.0822144185085727, "grad_norm": 9.363661766052246, "learning_rate": 4.3987219743293126e-05, "loss": 1.6782, "step": 12600 }, { "epoch": 2.0838669696343732, "grad_norm": 10.958879470825195, "learning_rate": 4.39780385243945e-05, "loss": 1.7072, "step": 12610 }, { "epoch": 2.0855195207601733, "grad_norm": 6.636941432952881, "learning_rate": 4.3968857305495884e-05, "loss": 1.5247, "step": 12620 }, { "epoch": 2.087172071885974, "grad_norm": 74.40653228759766, "learning_rate": 4.395967608659726e-05, "loss": 1.5546, "step": 12630 }, { "epoch": 2.0888246230117744, "grad_norm": 9.923933029174805, "learning_rate": 4.3950494867698636e-05, "loss": 1.5428, "step": 12640 }, { "epoch": 2.090477174137575, "grad_norm": 8.562093734741211, "learning_rate": 4.394131364880001e-05, "loss": 1.5351, "step": 12650 }, { "epoch": 2.0921297252633755, "grad_norm": 46.868709564208984, "learning_rate": 4.3932132429901394e-05, "loss": 1.6106, "step": 12660 }, { "epoch": 2.0937822763891756, "grad_norm": 5.111778259277344, "learning_rate": 4.392295121100278e-05, "loss": 1.5635, "step": 12670 }, { "epoch": 2.095434827514976, "grad_norm": 9.51055908203125, "learning_rate": 4.391376999210415e-05, "loss": 1.5957, "step": 12680 }, { "epoch": 2.0970873786407767, "grad_norm": 14.892036437988281, "learning_rate": 4.3904588773205535e-05, "loss": 1.6939, "step": 12690 }, { "epoch": 2.098739929766577, "grad_norm": 6.7437567710876465, "learning_rate": 4.389540755430691e-05, "loss": 1.5671, "step": 12700 }, { "epoch": 2.1003924808923777, "grad_norm": 34.27804946899414, "learning_rate": 4.3886226335408294e-05, "loss": 1.6071, "step": 12710 }, { "epoch": 2.1020450320181783, "grad_norm": 13.502313613891602, "learning_rate": 4.387704511650967e-05, "loss": 1.5568, "step": 12720 }, { "epoch": 2.1036975831439784, "grad_norm": 12.005770683288574, "learning_rate": 4.386786389761105e-05, "loss": 1.563, "step": 12730 }, { "epoch": 2.105350134269779, "grad_norm": 8.112605094909668, "learning_rate": 4.385868267871243e-05, "loss": 1.7326, "step": 12740 }, { "epoch": 2.1070026853955794, "grad_norm": 6.765664100646973, "learning_rate": 4.384950145981381e-05, "loss": 1.6562, "step": 12750 }, { "epoch": 2.10865523652138, "grad_norm": 12.570945739746094, "learning_rate": 4.384032024091519e-05, "loss": 1.6129, "step": 12760 }, { "epoch": 2.1103077876471805, "grad_norm": 10.410295486450195, "learning_rate": 4.383113902201656e-05, "loss": 1.6615, "step": 12770 }, { "epoch": 2.1119603387729806, "grad_norm": 25.10518455505371, "learning_rate": 4.3821957803117945e-05, "loss": 1.7272, "step": 12780 }, { "epoch": 2.113612889898781, "grad_norm": 19.222471237182617, "learning_rate": 4.381277658421932e-05, "loss": 1.5842, "step": 12790 }, { "epoch": 2.1152654410245817, "grad_norm": 9.53968334197998, "learning_rate": 4.3803595365320704e-05, "loss": 1.5341, "step": 12800 }, { "epoch": 2.1169179921503822, "grad_norm": 11.091026306152344, "learning_rate": 4.379441414642208e-05, "loss": 1.7523, "step": 12810 }, { "epoch": 2.1185705432761828, "grad_norm": 5.745179653167725, "learning_rate": 4.378523292752346e-05, "loss": 1.6201, "step": 12820 }, { "epoch": 2.120223094401983, "grad_norm": 21.147724151611328, "learning_rate": 4.377605170862484e-05, "loss": 1.5084, "step": 12830 }, { "epoch": 2.1218756455277834, "grad_norm": 5.363860130310059, "learning_rate": 4.376687048972622e-05, "loss": 1.6167, "step": 12840 }, { "epoch": 2.123528196653584, "grad_norm": 5.5015459060668945, "learning_rate": 4.37576892708276e-05, "loss": 1.5093, "step": 12850 }, { "epoch": 2.1251807477793845, "grad_norm": 6.91679573059082, "learning_rate": 4.374850805192898e-05, "loss": 1.5668, "step": 12860 }, { "epoch": 2.126833298905185, "grad_norm": 15.651091575622559, "learning_rate": 4.3739326833030355e-05, "loss": 1.6675, "step": 12870 }, { "epoch": 2.1284858500309856, "grad_norm": 9.818643569946289, "learning_rate": 4.373014561413174e-05, "loss": 1.5916, "step": 12880 }, { "epoch": 2.1301384011567857, "grad_norm": 11.20659065246582, "learning_rate": 4.3720964395233114e-05, "loss": 1.6256, "step": 12890 }, { "epoch": 2.131790952282586, "grad_norm": 9.242450714111328, "learning_rate": 4.371178317633449e-05, "loss": 1.5514, "step": 12900 }, { "epoch": 2.1334435034083867, "grad_norm": 11.752909660339355, "learning_rate": 4.370260195743587e-05, "loss": 1.6745, "step": 12910 }, { "epoch": 2.1350960545341873, "grad_norm": 19.809019088745117, "learning_rate": 4.369342073853725e-05, "loss": 1.537, "step": 12920 }, { "epoch": 2.136748605659988, "grad_norm": 13.41193962097168, "learning_rate": 4.368423951963863e-05, "loss": 1.5797, "step": 12930 }, { "epoch": 2.138401156785788, "grad_norm": 17.19065284729004, "learning_rate": 4.3675058300740007e-05, "loss": 1.6646, "step": 12940 }, { "epoch": 2.1400537079115884, "grad_norm": 32.37812423706055, "learning_rate": 4.366587708184139e-05, "loss": 1.6777, "step": 12950 }, { "epoch": 2.141706259037389, "grad_norm": 14.590213775634766, "learning_rate": 4.3656695862942765e-05, "loss": 1.582, "step": 12960 }, { "epoch": 2.1433588101631895, "grad_norm": 11.580479621887207, "learning_rate": 4.364751464404415e-05, "loss": 1.5257, "step": 12970 }, { "epoch": 2.14501136128899, "grad_norm": 13.083703994750977, "learning_rate": 4.3638333425145524e-05, "loss": 1.6449, "step": 12980 }, { "epoch": 2.14666391241479, "grad_norm": 44.948280334472656, "learning_rate": 4.3629152206246906e-05, "loss": 1.6575, "step": 12990 }, { "epoch": 2.1483164635405907, "grad_norm": 12.724278450012207, "learning_rate": 4.361997098734828e-05, "loss": 1.576, "step": 13000 }, { "epoch": 2.1499690146663912, "grad_norm": 14.588871002197266, "learning_rate": 4.3610789768449665e-05, "loss": 1.5038, "step": 13010 }, { "epoch": 2.1516215657921918, "grad_norm": 47.780120849609375, "learning_rate": 4.360160854955104e-05, "loss": 1.6169, "step": 13020 }, { "epoch": 2.1532741169179923, "grad_norm": 303.17950439453125, "learning_rate": 4.3592427330652416e-05, "loss": 1.4563, "step": 13030 }, { "epoch": 2.1549266680437924, "grad_norm": 8.716339111328125, "learning_rate": 4.35832461117538e-05, "loss": 1.5758, "step": 13040 }, { "epoch": 2.156579219169593, "grad_norm": 11.586296081542969, "learning_rate": 4.3574064892855175e-05, "loss": 1.6525, "step": 13050 }, { "epoch": 2.1582317702953935, "grad_norm": 20.37527847290039, "learning_rate": 4.356488367395656e-05, "loss": 1.6296, "step": 13060 }, { "epoch": 2.159884321421194, "grad_norm": 258.34210205078125, "learning_rate": 4.355570245505793e-05, "loss": 1.6202, "step": 13070 }, { "epoch": 2.1615368725469946, "grad_norm": 8.469282150268555, "learning_rate": 4.3546521236159316e-05, "loss": 1.6968, "step": 13080 }, { "epoch": 2.1631894236727947, "grad_norm": 16.505489349365234, "learning_rate": 4.353734001726069e-05, "loss": 1.6207, "step": 13090 }, { "epoch": 2.164841974798595, "grad_norm": 12.483043670654297, "learning_rate": 4.3528158798362074e-05, "loss": 1.674, "step": 13100 }, { "epoch": 2.1664945259243957, "grad_norm": 11.628996849060059, "learning_rate": 4.351897757946345e-05, "loss": 1.6864, "step": 13110 }, { "epoch": 2.1681470770501963, "grad_norm": 9.3779878616333, "learning_rate": 4.350979636056483e-05, "loss": 1.5968, "step": 13120 }, { "epoch": 2.169799628175997, "grad_norm": 7.350371360778809, "learning_rate": 4.350061514166621e-05, "loss": 1.5171, "step": 13130 }, { "epoch": 2.1714521793017973, "grad_norm": 17.027692794799805, "learning_rate": 4.349143392276759e-05, "loss": 1.7044, "step": 13140 }, { "epoch": 2.1731047304275974, "grad_norm": 6.943367958068848, "learning_rate": 4.348225270386897e-05, "loss": 1.585, "step": 13150 }, { "epoch": 2.174757281553398, "grad_norm": 14.061978340148926, "learning_rate": 4.347307148497034e-05, "loss": 1.5373, "step": 13160 }, { "epoch": 2.1764098326791985, "grad_norm": 10.38961410522461, "learning_rate": 4.3463890266071726e-05, "loss": 1.7171, "step": 13170 }, { "epoch": 2.178062383804999, "grad_norm": 33.14574432373047, "learning_rate": 4.34547090471731e-05, "loss": 1.5202, "step": 13180 }, { "epoch": 2.1797149349307996, "grad_norm": 34.48433303833008, "learning_rate": 4.3445527828274484e-05, "loss": 1.671, "step": 13190 }, { "epoch": 2.1813674860565997, "grad_norm": 8.59406566619873, "learning_rate": 4.343634660937586e-05, "loss": 1.6391, "step": 13200 }, { "epoch": 2.1830200371824002, "grad_norm": 9.573761940002441, "learning_rate": 4.342716539047724e-05, "loss": 1.6025, "step": 13210 }, { "epoch": 2.1846725883082008, "grad_norm": 11.121373176574707, "learning_rate": 4.341798417157862e-05, "loss": 1.5423, "step": 13220 }, { "epoch": 2.1863251394340013, "grad_norm": 8.483185768127441, "learning_rate": 4.340880295268e-05, "loss": 1.6423, "step": 13230 }, { "epoch": 2.187977690559802, "grad_norm": 11.853544235229492, "learning_rate": 4.3399621733781384e-05, "loss": 1.6377, "step": 13240 }, { "epoch": 2.189630241685602, "grad_norm": 6.522881031036377, "learning_rate": 4.339044051488276e-05, "loss": 1.5402, "step": 13250 }, { "epoch": 2.1912827928114025, "grad_norm": 14.069580078125, "learning_rate": 4.338125929598414e-05, "loss": 1.5593, "step": 13260 }, { "epoch": 2.192935343937203, "grad_norm": 7.207234859466553, "learning_rate": 4.337207807708552e-05, "loss": 1.6057, "step": 13270 }, { "epoch": 2.1945878950630036, "grad_norm": 11.689896583557129, "learning_rate": 4.3362896858186894e-05, "loss": 1.5226, "step": 13280 }, { "epoch": 2.196240446188804, "grad_norm": 57.798797607421875, "learning_rate": 4.335371563928827e-05, "loss": 1.5292, "step": 13290 }, { "epoch": 2.1978929973146046, "grad_norm": 15.47495174407959, "learning_rate": 4.334453442038965e-05, "loss": 1.5943, "step": 13300 }, { "epoch": 2.1995455484404047, "grad_norm": 6.988327980041504, "learning_rate": 4.333535320149103e-05, "loss": 1.5755, "step": 13310 }, { "epoch": 2.2011980995662053, "grad_norm": 10.670233726501465, "learning_rate": 4.332617198259241e-05, "loss": 1.6494, "step": 13320 }, { "epoch": 2.202850650692006, "grad_norm": 9.843840599060059, "learning_rate": 4.331699076369379e-05, "loss": 1.6159, "step": 13330 }, { "epoch": 2.2045032018178063, "grad_norm": 7.6408233642578125, "learning_rate": 4.330780954479517e-05, "loss": 1.6032, "step": 13340 }, { "epoch": 2.206155752943607, "grad_norm": 7.837189197540283, "learning_rate": 4.329862832589655e-05, "loss": 1.5953, "step": 13350 }, { "epoch": 2.207808304069407, "grad_norm": 14.017073631286621, "learning_rate": 4.328944710699793e-05, "loss": 1.5831, "step": 13360 }, { "epoch": 2.2094608551952075, "grad_norm": 19.07843017578125, "learning_rate": 4.328026588809931e-05, "loss": 1.5628, "step": 13370 }, { "epoch": 2.211113406321008, "grad_norm": 8.755454063415527, "learning_rate": 4.327108466920069e-05, "loss": 1.6262, "step": 13380 }, { "epoch": 2.2127659574468086, "grad_norm": 41.75136184692383, "learning_rate": 4.326190345030207e-05, "loss": 1.6595, "step": 13390 }, { "epoch": 2.214418508572609, "grad_norm": 10.803937911987305, "learning_rate": 4.3252722231403445e-05, "loss": 1.7103, "step": 13400 }, { "epoch": 2.2160710596984092, "grad_norm": 64.1683349609375, "learning_rate": 4.324354101250482e-05, "loss": 1.6162, "step": 13410 }, { "epoch": 2.2177236108242098, "grad_norm": 14.838408470153809, "learning_rate": 4.32343597936062e-05, "loss": 1.6336, "step": 13420 }, { "epoch": 2.2193761619500103, "grad_norm": 15.170886039733887, "learning_rate": 4.322517857470758e-05, "loss": 1.4815, "step": 13430 }, { "epoch": 2.221028713075811, "grad_norm": 13.091402053833008, "learning_rate": 4.3215997355808955e-05, "loss": 1.6045, "step": 13440 }, { "epoch": 2.2226812642016114, "grad_norm": 8.757658958435059, "learning_rate": 4.320681613691034e-05, "loss": 1.7558, "step": 13450 }, { "epoch": 2.2243338153274115, "grad_norm": 12.71592903137207, "learning_rate": 4.3197634918011714e-05, "loss": 1.581, "step": 13460 }, { "epoch": 2.225986366453212, "grad_norm": 18.19761085510254, "learning_rate": 4.3188453699113096e-05, "loss": 1.5034, "step": 13470 }, { "epoch": 2.2276389175790126, "grad_norm": 8.124272346496582, "learning_rate": 4.317927248021448e-05, "loss": 1.561, "step": 13480 }, { "epoch": 2.229291468704813, "grad_norm": 6.5509796142578125, "learning_rate": 4.3170091261315855e-05, "loss": 1.709, "step": 13490 }, { "epoch": 2.2309440198306136, "grad_norm": 34.16242599487305, "learning_rate": 4.316091004241724e-05, "loss": 1.5383, "step": 13500 }, { "epoch": 2.2325965709564137, "grad_norm": 60.66366195678711, "learning_rate": 4.3151728823518613e-05, "loss": 1.6962, "step": 13510 }, { "epoch": 2.2342491220822143, "grad_norm": 8.734251976013184, "learning_rate": 4.3142547604619996e-05, "loss": 1.5747, "step": 13520 }, { "epoch": 2.235901673208015, "grad_norm": 7.926821231842041, "learning_rate": 4.313336638572137e-05, "loss": 1.5384, "step": 13530 }, { "epoch": 2.2375542243338153, "grad_norm": 13.231529235839844, "learning_rate": 4.312418516682275e-05, "loss": 1.6822, "step": 13540 }, { "epoch": 2.239206775459616, "grad_norm": 19.836008071899414, "learning_rate": 4.3115003947924124e-05, "loss": 1.5861, "step": 13550 }, { "epoch": 2.2408593265854164, "grad_norm": 9.528962135314941, "learning_rate": 4.3105822729025506e-05, "loss": 1.7453, "step": 13560 }, { "epoch": 2.2425118777112165, "grad_norm": 25.385387420654297, "learning_rate": 4.309664151012688e-05, "loss": 1.5686, "step": 13570 }, { "epoch": 2.244164428837017, "grad_norm": 7.8074870109558105, "learning_rate": 4.3087460291228265e-05, "loss": 1.6753, "step": 13580 }, { "epoch": 2.2458169799628176, "grad_norm": 142.185302734375, "learning_rate": 4.307827907232965e-05, "loss": 1.5779, "step": 13590 }, { "epoch": 2.247469531088618, "grad_norm": 24.071073532104492, "learning_rate": 4.306909785343102e-05, "loss": 1.5333, "step": 13600 }, { "epoch": 2.2491220822144187, "grad_norm": 16.71168327331543, "learning_rate": 4.3059916634532406e-05, "loss": 1.6225, "step": 13610 }, { "epoch": 2.250774633340219, "grad_norm": 14.29904556274414, "learning_rate": 4.305073541563378e-05, "loss": 1.7723, "step": 13620 }, { "epoch": 2.2524271844660193, "grad_norm": 9.134810447692871, "learning_rate": 4.3041554196735164e-05, "loss": 1.5871, "step": 13630 }, { "epoch": 2.25407973559182, "grad_norm": 15.202738761901855, "learning_rate": 4.303237297783654e-05, "loss": 1.6463, "step": 13640 }, { "epoch": 2.2557322867176204, "grad_norm": 61.93202209472656, "learning_rate": 4.302319175893792e-05, "loss": 1.7039, "step": 13650 }, { "epoch": 2.257384837843421, "grad_norm": 48.4030647277832, "learning_rate": 4.30140105400393e-05, "loss": 1.6503, "step": 13660 }, { "epoch": 2.2590373889692215, "grad_norm": 17.932994842529297, "learning_rate": 4.3004829321140675e-05, "loss": 1.5064, "step": 13670 }, { "epoch": 2.2606899400950216, "grad_norm": 12.965964317321777, "learning_rate": 4.299564810224205e-05, "loss": 1.5113, "step": 13680 }, { "epoch": 2.262342491220822, "grad_norm": 34.27570724487305, "learning_rate": 4.298646688334343e-05, "loss": 1.5163, "step": 13690 }, { "epoch": 2.2639950423466226, "grad_norm": 31.845481872558594, "learning_rate": 4.2977285664444816e-05, "loss": 1.5038, "step": 13700 }, { "epoch": 2.265647593472423, "grad_norm": 10.161199569702148, "learning_rate": 4.296810444554619e-05, "loss": 1.4943, "step": 13710 }, { "epoch": 2.2673001445982237, "grad_norm": 14.812753677368164, "learning_rate": 4.2958923226647574e-05, "loss": 1.5093, "step": 13720 }, { "epoch": 2.268952695724024, "grad_norm": 8.536107063293457, "learning_rate": 4.294974200774895e-05, "loss": 1.5917, "step": 13730 }, { "epoch": 2.2706052468498243, "grad_norm": 12.14443302154541, "learning_rate": 4.294056078885033e-05, "loss": 1.6145, "step": 13740 }, { "epoch": 2.272257797975625, "grad_norm": 9.584919929504395, "learning_rate": 4.293137956995171e-05, "loss": 1.5674, "step": 13750 }, { "epoch": 2.2739103491014254, "grad_norm": 8.43072509765625, "learning_rate": 4.292219835105309e-05, "loss": 1.6024, "step": 13760 }, { "epoch": 2.275562900227226, "grad_norm": 10.97778606414795, "learning_rate": 4.291301713215447e-05, "loss": 1.7366, "step": 13770 }, { "epoch": 2.277215451353026, "grad_norm": 7.326660633087158, "learning_rate": 4.290383591325585e-05, "loss": 1.632, "step": 13780 }, { "epoch": 2.2788680024788266, "grad_norm": 6.518946170806885, "learning_rate": 4.2894654694357226e-05, "loss": 1.5364, "step": 13790 }, { "epoch": 2.280520553604627, "grad_norm": 28.530855178833008, "learning_rate": 4.28854734754586e-05, "loss": 1.7219, "step": 13800 }, { "epoch": 2.2821731047304277, "grad_norm": 14.148917198181152, "learning_rate": 4.2876292256559984e-05, "loss": 1.6488, "step": 13810 }, { "epoch": 2.283825655856228, "grad_norm": 40.1848030090332, "learning_rate": 4.286711103766136e-05, "loss": 1.6645, "step": 13820 }, { "epoch": 2.2854782069820283, "grad_norm": 7.235057830810547, "learning_rate": 4.285792981876274e-05, "loss": 1.584, "step": 13830 }, { "epoch": 2.287130758107829, "grad_norm": 10.927480697631836, "learning_rate": 4.284874859986412e-05, "loss": 1.612, "step": 13840 }, { "epoch": 2.2887833092336294, "grad_norm": 9.236492156982422, "learning_rate": 4.28395673809655e-05, "loss": 1.6889, "step": 13850 }, { "epoch": 2.29043586035943, "grad_norm": 13.05184555053711, "learning_rate": 4.283038616206688e-05, "loss": 1.564, "step": 13860 }, { "epoch": 2.2920884114852305, "grad_norm": 17.070354461669922, "learning_rate": 4.282120494316826e-05, "loss": 1.5551, "step": 13870 }, { "epoch": 2.2937409626110306, "grad_norm": 11.861235618591309, "learning_rate": 4.2812023724269635e-05, "loss": 1.763, "step": 13880 }, { "epoch": 2.295393513736831, "grad_norm": 10.376340866088867, "learning_rate": 4.280284250537102e-05, "loss": 1.6106, "step": 13890 }, { "epoch": 2.2970460648626316, "grad_norm": 77.84674835205078, "learning_rate": 4.2793661286472394e-05, "loss": 1.6489, "step": 13900 }, { "epoch": 2.298698615988432, "grad_norm": 8.317307472229004, "learning_rate": 4.2784480067573777e-05, "loss": 1.6459, "step": 13910 }, { "epoch": 2.3003511671142327, "grad_norm": 9.242462158203125, "learning_rate": 4.277529884867515e-05, "loss": 1.5626, "step": 13920 }, { "epoch": 2.302003718240033, "grad_norm": 9.533720016479492, "learning_rate": 4.276611762977653e-05, "loss": 1.6698, "step": 13930 }, { "epoch": 2.3036562693658333, "grad_norm": 8.523353576660156, "learning_rate": 4.275693641087791e-05, "loss": 1.6613, "step": 13940 }, { "epoch": 2.305308820491634, "grad_norm": 7.527460098266602, "learning_rate": 4.274775519197929e-05, "loss": 1.5093, "step": 13950 }, { "epoch": 2.3069613716174344, "grad_norm": 7.213209629058838, "learning_rate": 4.273857397308067e-05, "loss": 1.6411, "step": 13960 }, { "epoch": 2.308613922743235, "grad_norm": 8.763940811157227, "learning_rate": 4.2729392754182045e-05, "loss": 1.666, "step": 13970 }, { "epoch": 2.3102664738690355, "grad_norm": 26.79812240600586, "learning_rate": 4.272021153528343e-05, "loss": 1.5755, "step": 13980 }, { "epoch": 2.3119190249948356, "grad_norm": 20.0844669342041, "learning_rate": 4.2711030316384804e-05, "loss": 1.6604, "step": 13990 }, { "epoch": 2.313571576120636, "grad_norm": 9.126409530639648, "learning_rate": 4.2701849097486186e-05, "loss": 1.5742, "step": 14000 }, { "epoch": 2.3152241272464367, "grad_norm": 24.678070068359375, "learning_rate": 4.269266787858756e-05, "loss": 1.5018, "step": 14010 }, { "epoch": 2.316876678372237, "grad_norm": 6.996845245361328, "learning_rate": 4.2683486659688945e-05, "loss": 1.5528, "step": 14020 }, { "epoch": 2.3185292294980377, "grad_norm": 7.3137617111206055, "learning_rate": 4.267430544079032e-05, "loss": 1.54, "step": 14030 }, { "epoch": 2.3201817806238383, "grad_norm": 10.51693344116211, "learning_rate": 4.2665124221891703e-05, "loss": 1.7331, "step": 14040 }, { "epoch": 2.3218343317496384, "grad_norm": 9.240628242492676, "learning_rate": 4.265594300299308e-05, "loss": 1.5992, "step": 14050 }, { "epoch": 2.323486882875439, "grad_norm": 34.22092056274414, "learning_rate": 4.2646761784094455e-05, "loss": 1.5359, "step": 14060 }, { "epoch": 2.3251394340012395, "grad_norm": 8.278003692626953, "learning_rate": 4.263758056519584e-05, "loss": 1.587, "step": 14070 }, { "epoch": 2.32679198512704, "grad_norm": 9.079841613769531, "learning_rate": 4.2628399346297214e-05, "loss": 1.615, "step": 14080 }, { "epoch": 2.3284445362528405, "grad_norm": 130.9219512939453, "learning_rate": 4.2619218127398596e-05, "loss": 1.6647, "step": 14090 }, { "epoch": 2.3300970873786406, "grad_norm": 8.039260864257812, "learning_rate": 4.261003690849997e-05, "loss": 1.5117, "step": 14100 }, { "epoch": 2.331749638504441, "grad_norm": 20.554414749145508, "learning_rate": 4.2600855689601355e-05, "loss": 1.659, "step": 14110 }, { "epoch": 2.3334021896302417, "grad_norm": 16.2318172454834, "learning_rate": 4.259167447070273e-05, "loss": 1.7476, "step": 14120 }, { "epoch": 2.3350547407560422, "grad_norm": 14.719922065734863, "learning_rate": 4.258249325180411e-05, "loss": 1.6945, "step": 14130 }, { "epoch": 2.336707291881843, "grad_norm": 19.263084411621094, "learning_rate": 4.257331203290549e-05, "loss": 1.6284, "step": 14140 }, { "epoch": 2.338359843007643, "grad_norm": 8.643294334411621, "learning_rate": 4.256413081400687e-05, "loss": 1.568, "step": 14150 }, { "epoch": 2.3400123941334434, "grad_norm": 7.2459797859191895, "learning_rate": 4.2554949595108254e-05, "loss": 1.6159, "step": 14160 }, { "epoch": 2.341664945259244, "grad_norm": 9.25794792175293, "learning_rate": 4.254576837620963e-05, "loss": 1.548, "step": 14170 }, { "epoch": 2.3433174963850445, "grad_norm": 39.79923629760742, "learning_rate": 4.2536587157311006e-05, "loss": 1.5182, "step": 14180 }, { "epoch": 2.344970047510845, "grad_norm": 10.711408615112305, "learning_rate": 4.252740593841238e-05, "loss": 1.5869, "step": 14190 }, { "epoch": 2.346622598636645, "grad_norm": 4.972466945648193, "learning_rate": 4.2518224719513765e-05, "loss": 1.6212, "step": 14200 }, { "epoch": 2.3482751497624457, "grad_norm": 23.379873275756836, "learning_rate": 4.250904350061514e-05, "loss": 1.581, "step": 14210 }, { "epoch": 2.349927700888246, "grad_norm": 9.372620582580566, "learning_rate": 4.249986228171652e-05, "loss": 1.6083, "step": 14220 }, { "epoch": 2.3515802520140467, "grad_norm": 7.492948532104492, "learning_rate": 4.24906810628179e-05, "loss": 1.5183, "step": 14230 }, { "epoch": 2.3532328031398473, "grad_norm": 11.669918060302734, "learning_rate": 4.248149984391928e-05, "loss": 1.6055, "step": 14240 }, { "epoch": 2.3548853542656474, "grad_norm": 14.879387855529785, "learning_rate": 4.247231862502066e-05, "loss": 1.5403, "step": 14250 }, { "epoch": 2.356537905391448, "grad_norm": 10.409502029418945, "learning_rate": 4.246313740612204e-05, "loss": 1.6121, "step": 14260 }, { "epoch": 2.3581904565172485, "grad_norm": 12.416383743286133, "learning_rate": 4.245395618722342e-05, "loss": 1.7431, "step": 14270 }, { "epoch": 2.359843007643049, "grad_norm": 13.863862991333008, "learning_rate": 4.24447749683248e-05, "loss": 1.5425, "step": 14280 }, { "epoch": 2.3614955587688495, "grad_norm": 10.172608375549316, "learning_rate": 4.243559374942618e-05, "loss": 1.5315, "step": 14290 }, { "epoch": 2.3631481098946496, "grad_norm": 21.186439514160156, "learning_rate": 4.242641253052756e-05, "loss": 1.7509, "step": 14300 }, { "epoch": 2.36480066102045, "grad_norm": 14.802777290344238, "learning_rate": 4.241723131162893e-05, "loss": 1.6284, "step": 14310 }, { "epoch": 2.3664532121462507, "grad_norm": 7.647261142730713, "learning_rate": 4.240805009273031e-05, "loss": 1.6912, "step": 14320 }, { "epoch": 2.3681057632720512, "grad_norm": 12.083390235900879, "learning_rate": 4.239886887383169e-05, "loss": 1.5557, "step": 14330 }, { "epoch": 2.369758314397852, "grad_norm": 7.496702194213867, "learning_rate": 4.238968765493307e-05, "loss": 1.7399, "step": 14340 }, { "epoch": 2.3714108655236523, "grad_norm": 13.159797668457031, "learning_rate": 4.238050643603445e-05, "loss": 1.5162, "step": 14350 }, { "epoch": 2.3730634166494524, "grad_norm": 6.241695404052734, "learning_rate": 4.2371325217135826e-05, "loss": 1.6829, "step": 14360 }, { "epoch": 2.374715967775253, "grad_norm": 10.561747550964355, "learning_rate": 4.236214399823721e-05, "loss": 1.6487, "step": 14370 }, { "epoch": 2.3763685189010535, "grad_norm": 24.415855407714844, "learning_rate": 4.235296277933859e-05, "loss": 1.7504, "step": 14380 }, { "epoch": 2.378021070026854, "grad_norm": 6.549388885498047, "learning_rate": 4.234378156043997e-05, "loss": 1.4467, "step": 14390 }, { "epoch": 2.3796736211526546, "grad_norm": 8.12175464630127, "learning_rate": 4.233460034154135e-05, "loss": 1.6034, "step": 14400 }, { "epoch": 2.381326172278455, "grad_norm": 10.860891342163086, "learning_rate": 4.2325419122642725e-05, "loss": 1.5926, "step": 14410 }, { "epoch": 2.382978723404255, "grad_norm": 5.750729560852051, "learning_rate": 4.231623790374411e-05, "loss": 1.5773, "step": 14420 }, { "epoch": 2.3846312745300557, "grad_norm": 4.9887871742248535, "learning_rate": 4.2307056684845484e-05, "loss": 1.5499, "step": 14430 }, { "epoch": 2.3862838256558563, "grad_norm": 15.247535705566406, "learning_rate": 4.229787546594686e-05, "loss": 1.6275, "step": 14440 }, { "epoch": 2.387936376781657, "grad_norm": 5.501838207244873, "learning_rate": 4.2288694247048236e-05, "loss": 1.507, "step": 14450 }, { "epoch": 2.3895889279074574, "grad_norm": 6.640819549560547, "learning_rate": 4.227951302814962e-05, "loss": 1.5051, "step": 14460 }, { "epoch": 2.3912414790332575, "grad_norm": 6.430408477783203, "learning_rate": 4.2270331809250994e-05, "loss": 1.7827, "step": 14470 }, { "epoch": 2.392894030159058, "grad_norm": 10.69271183013916, "learning_rate": 4.226115059035238e-05, "loss": 1.6772, "step": 14480 }, { "epoch": 2.3945465812848585, "grad_norm": 12.075413703918457, "learning_rate": 4.225196937145376e-05, "loss": 1.586, "step": 14490 }, { "epoch": 2.396199132410659, "grad_norm": 12.302436828613281, "learning_rate": 4.2242788152555135e-05, "loss": 1.7124, "step": 14500 }, { "epoch": 2.3978516835364596, "grad_norm": 19.029186248779297, "learning_rate": 4.223360693365652e-05, "loss": 1.5647, "step": 14510 }, { "epoch": 2.3995042346622597, "grad_norm": 10.742748260498047, "learning_rate": 4.2224425714757894e-05, "loss": 1.6184, "step": 14520 }, { "epoch": 2.4011567857880602, "grad_norm": 7.062446594238281, "learning_rate": 4.2215244495859276e-05, "loss": 1.5524, "step": 14530 }, { "epoch": 2.402809336913861, "grad_norm": 9.456984519958496, "learning_rate": 4.220606327696065e-05, "loss": 1.4961, "step": 14540 }, { "epoch": 2.4044618880396613, "grad_norm": 17.99965476989746, "learning_rate": 4.2196882058062035e-05, "loss": 1.5853, "step": 14550 }, { "epoch": 2.406114439165462, "grad_norm": 56.12897491455078, "learning_rate": 4.218770083916341e-05, "loss": 1.6261, "step": 14560 }, { "epoch": 2.407766990291262, "grad_norm": 7.298854827880859, "learning_rate": 4.217851962026479e-05, "loss": 1.8109, "step": 14570 }, { "epoch": 2.4094195414170625, "grad_norm": 6.553171634674072, "learning_rate": 4.216933840136616e-05, "loss": 1.5705, "step": 14580 }, { "epoch": 2.411072092542863, "grad_norm": 18.439796447753906, "learning_rate": 4.2160157182467545e-05, "loss": 1.4941, "step": 14590 }, { "epoch": 2.4127246436686636, "grad_norm": 16.03473663330078, "learning_rate": 4.215097596356892e-05, "loss": 1.5356, "step": 14600 }, { "epoch": 2.414377194794464, "grad_norm": 23.698720932006836, "learning_rate": 4.2141794744670304e-05, "loss": 1.6713, "step": 14610 }, { "epoch": 2.416029745920264, "grad_norm": 12.044432640075684, "learning_rate": 4.2132613525771686e-05, "loss": 1.6328, "step": 14620 }, { "epoch": 2.4176822970460647, "grad_norm": 44.36164093017578, "learning_rate": 4.212343230687306e-05, "loss": 1.5393, "step": 14630 }, { "epoch": 2.4193348481718653, "grad_norm": 8.344030380249023, "learning_rate": 4.2114251087974445e-05, "loss": 1.7533, "step": 14640 }, { "epoch": 2.420987399297666, "grad_norm": 17.99836540222168, "learning_rate": 4.210506986907582e-05, "loss": 1.7182, "step": 14650 }, { "epoch": 2.4226399504234664, "grad_norm": 31.487668991088867, "learning_rate": 4.20958886501772e-05, "loss": 1.5901, "step": 14660 }, { "epoch": 2.4242925015492665, "grad_norm": 10.33210277557373, "learning_rate": 4.208670743127858e-05, "loss": 1.5997, "step": 14670 }, { "epoch": 2.425945052675067, "grad_norm": 7.618146896362305, "learning_rate": 4.207752621237996e-05, "loss": 1.5562, "step": 14680 }, { "epoch": 2.4275976038008675, "grad_norm": 39.23401641845703, "learning_rate": 4.206834499348134e-05, "loss": 1.5395, "step": 14690 }, { "epoch": 2.429250154926668, "grad_norm": 32.31549072265625, "learning_rate": 4.2059163774582713e-05, "loss": 1.5347, "step": 14700 }, { "epoch": 2.4309027060524686, "grad_norm": 17.82357406616211, "learning_rate": 4.204998255568409e-05, "loss": 1.5538, "step": 14710 }, { "epoch": 2.4325552571782687, "grad_norm": 9.20028305053711, "learning_rate": 4.204080133678547e-05, "loss": 1.6732, "step": 14720 }, { "epoch": 2.4342078083040692, "grad_norm": 5.321588039398193, "learning_rate": 4.2031620117886855e-05, "loss": 1.5215, "step": 14730 }, { "epoch": 2.43586035942987, "grad_norm": 21.302366256713867, "learning_rate": 4.202243889898823e-05, "loss": 1.4867, "step": 14740 }, { "epoch": 2.4375129105556703, "grad_norm": 6.692672252655029, "learning_rate": 4.201325768008961e-05, "loss": 1.5802, "step": 14750 }, { "epoch": 2.439165461681471, "grad_norm": 36.045379638671875, "learning_rate": 4.200407646119099e-05, "loss": 1.465, "step": 14760 }, { "epoch": 2.4408180128072714, "grad_norm": 11.485674858093262, "learning_rate": 4.199489524229237e-05, "loss": 1.5289, "step": 14770 }, { "epoch": 2.4424705639330715, "grad_norm": 11.762076377868652, "learning_rate": 4.198571402339375e-05, "loss": 1.5009, "step": 14780 }, { "epoch": 2.444123115058872, "grad_norm": 9.65092658996582, "learning_rate": 4.197653280449513e-05, "loss": 1.7058, "step": 14790 }, { "epoch": 2.4457756661846726, "grad_norm": 12.469741821289062, "learning_rate": 4.1967351585596506e-05, "loss": 1.5731, "step": 14800 }, { "epoch": 2.447428217310473, "grad_norm": 48.536380767822266, "learning_rate": 4.195817036669789e-05, "loss": 1.509, "step": 14810 }, { "epoch": 2.4490807684362736, "grad_norm": 9.138647079467773, "learning_rate": 4.1948989147799264e-05, "loss": 1.5453, "step": 14820 }, { "epoch": 2.450733319562074, "grad_norm": 13.465476036071777, "learning_rate": 4.193980792890064e-05, "loss": 1.4734, "step": 14830 }, { "epoch": 2.4523858706878743, "grad_norm": 9.56730842590332, "learning_rate": 4.193062671000202e-05, "loss": 1.6971, "step": 14840 }, { "epoch": 2.454038421813675, "grad_norm": 30.218923568725586, "learning_rate": 4.19214454911034e-05, "loss": 1.6432, "step": 14850 }, { "epoch": 2.4556909729394754, "grad_norm": 9.52951717376709, "learning_rate": 4.191226427220478e-05, "loss": 1.5145, "step": 14860 }, { "epoch": 2.457343524065276, "grad_norm": 15.97042179107666, "learning_rate": 4.190308305330616e-05, "loss": 1.5629, "step": 14870 }, { "epoch": 2.4589960751910764, "grad_norm": 7.123246669769287, "learning_rate": 4.189390183440754e-05, "loss": 1.6142, "step": 14880 }, { "epoch": 2.4606486263168765, "grad_norm": 10.944896697998047, "learning_rate": 4.1884720615508916e-05, "loss": 1.4761, "step": 14890 }, { "epoch": 2.462301177442677, "grad_norm": 14.6271333694458, "learning_rate": 4.18755393966103e-05, "loss": 1.6334, "step": 14900 }, { "epoch": 2.4639537285684776, "grad_norm": 13.755815505981445, "learning_rate": 4.1866358177711674e-05, "loss": 1.6396, "step": 14910 }, { "epoch": 2.465606279694278, "grad_norm": 5.795435428619385, "learning_rate": 4.185717695881306e-05, "loss": 1.6156, "step": 14920 }, { "epoch": 2.4672588308200787, "grad_norm": 8.652409553527832, "learning_rate": 4.184799573991443e-05, "loss": 1.5923, "step": 14930 }, { "epoch": 2.468911381945879, "grad_norm": 34.65216064453125, "learning_rate": 4.1838814521015815e-05, "loss": 1.6352, "step": 14940 }, { "epoch": 2.4705639330716793, "grad_norm": 8.559181213378906, "learning_rate": 4.182963330211719e-05, "loss": 1.567, "step": 14950 }, { "epoch": 2.47221648419748, "grad_norm": 9.969573020935059, "learning_rate": 4.182045208321857e-05, "loss": 1.5711, "step": 14960 }, { "epoch": 2.4738690353232804, "grad_norm": 18.179960250854492, "learning_rate": 4.181127086431995e-05, "loss": 1.6054, "step": 14970 }, { "epoch": 2.475521586449081, "grad_norm": 14.064473152160645, "learning_rate": 4.1802089645421326e-05, "loss": 1.6849, "step": 14980 }, { "epoch": 2.477174137574881, "grad_norm": 32.33445739746094, "learning_rate": 4.179290842652271e-05, "loss": 1.6486, "step": 14990 }, { "epoch": 2.4788266887006816, "grad_norm": 10.423996925354004, "learning_rate": 4.1783727207624084e-05, "loss": 1.5323, "step": 15000 }, { "epoch": 2.480479239826482, "grad_norm": 18.493953704833984, "learning_rate": 4.177454598872547e-05, "loss": 1.6395, "step": 15010 }, { "epoch": 2.4821317909522826, "grad_norm": 12.233598709106445, "learning_rate": 4.176536476982684e-05, "loss": 1.6386, "step": 15020 }, { "epoch": 2.483784342078083, "grad_norm": 6.207724571228027, "learning_rate": 4.1756183550928225e-05, "loss": 1.6277, "step": 15030 }, { "epoch": 2.4854368932038833, "grad_norm": 29.125558853149414, "learning_rate": 4.17470023320296e-05, "loss": 1.7678, "step": 15040 }, { "epoch": 2.487089444329684, "grad_norm": 155.11293029785156, "learning_rate": 4.1737821113130984e-05, "loss": 1.6569, "step": 15050 }, { "epoch": 2.4887419954554844, "grad_norm": 11.938981056213379, "learning_rate": 4.172863989423236e-05, "loss": 1.4698, "step": 15060 }, { "epoch": 2.490394546581285, "grad_norm": 17.687034606933594, "learning_rate": 4.171945867533374e-05, "loss": 1.4958, "step": 15070 }, { "epoch": 2.4920470977070854, "grad_norm": 18.96551513671875, "learning_rate": 4.171027745643512e-05, "loss": 1.5655, "step": 15080 }, { "epoch": 2.4936996488328855, "grad_norm": 151.6097869873047, "learning_rate": 4.1701096237536494e-05, "loss": 1.553, "step": 15090 }, { "epoch": 2.495352199958686, "grad_norm": 18.68946075439453, "learning_rate": 4.1691915018637877e-05, "loss": 1.7066, "step": 15100 }, { "epoch": 2.4970047510844866, "grad_norm": 6.90726375579834, "learning_rate": 4.168273379973925e-05, "loss": 1.5377, "step": 15110 }, { "epoch": 2.498657302210287, "grad_norm": 5.961545467376709, "learning_rate": 4.1673552580840635e-05, "loss": 1.6982, "step": 15120 }, { "epoch": 2.5003098533360877, "grad_norm": 14.853984832763672, "learning_rate": 4.166437136194201e-05, "loss": 1.4802, "step": 15130 }, { "epoch": 2.501962404461888, "grad_norm": 22.466632843017578, "learning_rate": 4.1655190143043394e-05, "loss": 1.7366, "step": 15140 }, { "epoch": 2.5036149555876888, "grad_norm": 17.365001678466797, "learning_rate": 4.164600892414477e-05, "loss": 1.574, "step": 15150 }, { "epoch": 2.505267506713489, "grad_norm": 14.483482360839844, "learning_rate": 4.163682770524615e-05, "loss": 1.7487, "step": 15160 }, { "epoch": 2.5069200578392894, "grad_norm": 16.953773498535156, "learning_rate": 4.162764648634753e-05, "loss": 1.6225, "step": 15170 }, { "epoch": 2.50857260896509, "grad_norm": 10.457685470581055, "learning_rate": 4.161846526744891e-05, "loss": 1.7095, "step": 15180 }, { "epoch": 2.5102251600908905, "grad_norm": 17.119020462036133, "learning_rate": 4.160928404855029e-05, "loss": 1.6546, "step": 15190 }, { "epoch": 2.511877711216691, "grad_norm": 5.457040309906006, "learning_rate": 4.160010282965167e-05, "loss": 1.5472, "step": 15200 }, { "epoch": 2.513530262342491, "grad_norm": 11.147836685180664, "learning_rate": 4.1590921610753045e-05, "loss": 1.6512, "step": 15210 }, { "epoch": 2.5151828134682916, "grad_norm": 9.593242645263672, "learning_rate": 4.158174039185442e-05, "loss": 1.6628, "step": 15220 }, { "epoch": 2.516835364594092, "grad_norm": 11.19896125793457, "learning_rate": 4.1572559172955803e-05, "loss": 1.6524, "step": 15230 }, { "epoch": 2.5184879157198927, "grad_norm": 7.335904121398926, "learning_rate": 4.156337795405718e-05, "loss": 1.5542, "step": 15240 }, { "epoch": 2.5201404668456933, "grad_norm": 15.49926471710205, "learning_rate": 4.155419673515856e-05, "loss": 1.5335, "step": 15250 }, { "epoch": 2.5217930179714934, "grad_norm": 8.069356918334961, "learning_rate": 4.154501551625994e-05, "loss": 1.498, "step": 15260 }, { "epoch": 2.523445569097294, "grad_norm": 12.631221771240234, "learning_rate": 4.153583429736132e-05, "loss": 1.5483, "step": 15270 }, { "epoch": 2.5250981202230944, "grad_norm": 10.724843978881836, "learning_rate": 4.1526653078462696e-05, "loss": 1.459, "step": 15280 }, { "epoch": 2.526750671348895, "grad_norm": 28.29610252380371, "learning_rate": 4.151747185956408e-05, "loss": 1.5287, "step": 15290 }, { "epoch": 2.5284032224746955, "grad_norm": 11.114155769348145, "learning_rate": 4.150829064066546e-05, "loss": 1.5773, "step": 15300 }, { "epoch": 2.5300557736004956, "grad_norm": 16.751001358032227, "learning_rate": 4.149910942176684e-05, "loss": 1.6019, "step": 15310 }, { "epoch": 2.531708324726296, "grad_norm": 12.741584777832031, "learning_rate": 4.148992820286822e-05, "loss": 1.602, "step": 15320 }, { "epoch": 2.5333608758520967, "grad_norm": 14.595974922180176, "learning_rate": 4.1480746983969596e-05, "loss": 1.5229, "step": 15330 }, { "epoch": 2.535013426977897, "grad_norm": 9.688117027282715, "learning_rate": 4.147156576507097e-05, "loss": 1.573, "step": 15340 }, { "epoch": 2.5366659781036978, "grad_norm": 5.253176212310791, "learning_rate": 4.146238454617235e-05, "loss": 1.6018, "step": 15350 }, { "epoch": 2.538318529229498, "grad_norm": 11.29525375366211, "learning_rate": 4.145320332727373e-05, "loss": 1.5689, "step": 15360 }, { "epoch": 2.5399710803552984, "grad_norm": 10.628171920776367, "learning_rate": 4.1444022108375106e-05, "loss": 1.4875, "step": 15370 }, { "epoch": 2.541623631481099, "grad_norm": 11.296927452087402, "learning_rate": 4.143484088947649e-05, "loss": 1.6718, "step": 15380 }, { "epoch": 2.5432761826068995, "grad_norm": 9.183626174926758, "learning_rate": 4.1425659670577865e-05, "loss": 1.6317, "step": 15390 }, { "epoch": 2.5449287337327, "grad_norm": 7.3979268074035645, "learning_rate": 4.141647845167925e-05, "loss": 1.6035, "step": 15400 }, { "epoch": 2.5465812848585, "grad_norm": 9.273728370666504, "learning_rate": 4.140729723278063e-05, "loss": 1.596, "step": 15410 }, { "epoch": 2.5482338359843006, "grad_norm": 16.016374588012695, "learning_rate": 4.1398116013882006e-05, "loss": 1.5189, "step": 15420 }, { "epoch": 2.549886387110101, "grad_norm": 7.42404317855835, "learning_rate": 4.138893479498339e-05, "loss": 1.6049, "step": 15430 }, { "epoch": 2.5515389382359017, "grad_norm": 14.383243560791016, "learning_rate": 4.1379753576084764e-05, "loss": 1.6131, "step": 15440 }, { "epoch": 2.5531914893617023, "grad_norm": 7.443404197692871, "learning_rate": 4.137057235718615e-05, "loss": 1.7355, "step": 15450 }, { "epoch": 2.5548440404875024, "grad_norm": 7.681070327758789, "learning_rate": 4.136139113828752e-05, "loss": 1.6039, "step": 15460 }, { "epoch": 2.556496591613303, "grad_norm": 17.475801467895508, "learning_rate": 4.13522099193889e-05, "loss": 1.5287, "step": 15470 }, { "epoch": 2.5581491427391034, "grad_norm": 11.472020149230957, "learning_rate": 4.1343028700490274e-05, "loss": 1.572, "step": 15480 }, { "epoch": 2.559801693864904, "grad_norm": 8.961161613464355, "learning_rate": 4.133384748159166e-05, "loss": 1.6715, "step": 15490 }, { "epoch": 2.5614542449907045, "grad_norm": 9.045692443847656, "learning_rate": 4.132466626269303e-05, "loss": 1.6285, "step": 15500 }, { "epoch": 2.5631067961165046, "grad_norm": 9.706321716308594, "learning_rate": 4.1315485043794416e-05, "loss": 1.6204, "step": 15510 }, { "epoch": 2.564759347242305, "grad_norm": 22.907732009887695, "learning_rate": 4.13063038248958e-05, "loss": 1.6672, "step": 15520 }, { "epoch": 2.5664118983681057, "grad_norm": 13.848122596740723, "learning_rate": 4.1297122605997174e-05, "loss": 1.7706, "step": 15530 }, { "epoch": 2.568064449493906, "grad_norm": 9.652771949768066, "learning_rate": 4.128794138709856e-05, "loss": 1.6259, "step": 15540 }, { "epoch": 2.5697170006197068, "grad_norm": 12.885273933410645, "learning_rate": 4.127876016819993e-05, "loss": 1.7457, "step": 15550 }, { "epoch": 2.571369551745507, "grad_norm": 6.977317810058594, "learning_rate": 4.1269578949301315e-05, "loss": 1.5125, "step": 15560 }, { "epoch": 2.573022102871308, "grad_norm": 34.75843811035156, "learning_rate": 4.126039773040269e-05, "loss": 1.6154, "step": 15570 }, { "epoch": 2.574674653997108, "grad_norm": 9.55295181274414, "learning_rate": 4.1251216511504074e-05, "loss": 1.5819, "step": 15580 }, { "epoch": 2.5763272051229085, "grad_norm": 13.800176620483398, "learning_rate": 4.124203529260545e-05, "loss": 1.5153, "step": 15590 }, { "epoch": 2.577979756248709, "grad_norm": 53.50123977661133, "learning_rate": 4.1232854073706825e-05, "loss": 1.6517, "step": 15600 }, { "epoch": 2.5796323073745095, "grad_norm": 24.61274528503418, "learning_rate": 4.12236728548082e-05, "loss": 1.6843, "step": 15610 }, { "epoch": 2.58128485850031, "grad_norm": 8.19063663482666, "learning_rate": 4.1214491635909584e-05, "loss": 1.5765, "step": 15620 }, { "epoch": 2.58293740962611, "grad_norm": 8.86671257019043, "learning_rate": 4.120531041701096e-05, "loss": 1.5666, "step": 15630 }, { "epoch": 2.5845899607519107, "grad_norm": 8.37794303894043, "learning_rate": 4.119612919811234e-05, "loss": 1.6589, "step": 15640 }, { "epoch": 2.5862425118777113, "grad_norm": 12.301908493041992, "learning_rate": 4.1186947979213725e-05, "loss": 1.5103, "step": 15650 }, { "epoch": 2.587895063003512, "grad_norm": 21.372512817382812, "learning_rate": 4.11777667603151e-05, "loss": 1.5983, "step": 15660 }, { "epoch": 2.5895476141293123, "grad_norm": 10.715096473693848, "learning_rate": 4.1168585541416484e-05, "loss": 1.5617, "step": 15670 }, { "epoch": 2.5912001652551124, "grad_norm": 24.612468719482422, "learning_rate": 4.115940432251786e-05, "loss": 1.6039, "step": 15680 }, { "epoch": 2.592852716380913, "grad_norm": 26.50051498413086, "learning_rate": 4.115022310361924e-05, "loss": 1.5615, "step": 15690 }, { "epoch": 2.5945052675067135, "grad_norm": 21.82634162902832, "learning_rate": 4.114104188472062e-05, "loss": 1.5786, "step": 15700 }, { "epoch": 2.596157818632514, "grad_norm": 7.950484275817871, "learning_rate": 4.1131860665822e-05, "loss": 1.6964, "step": 15710 }, { "epoch": 2.5978103697583146, "grad_norm": 30.824180603027344, "learning_rate": 4.1122679446923376e-05, "loss": 1.514, "step": 15720 }, { "epoch": 2.5994629208841147, "grad_norm": 7.380947113037109, "learning_rate": 4.111349822802475e-05, "loss": 1.6425, "step": 15730 }, { "epoch": 2.601115472009915, "grad_norm": 7.936986923217773, "learning_rate": 4.110431700912613e-05, "loss": 1.5766, "step": 15740 }, { "epoch": 2.6027680231357158, "grad_norm": 9.09585189819336, "learning_rate": 4.109513579022751e-05, "loss": 1.6068, "step": 15750 }, { "epoch": 2.6044205742615163, "grad_norm": 23.57267951965332, "learning_rate": 4.108595457132889e-05, "loss": 1.4544, "step": 15760 }, { "epoch": 2.606073125387317, "grad_norm": 8.182823181152344, "learning_rate": 4.107677335243027e-05, "loss": 1.5769, "step": 15770 }, { "epoch": 2.607725676513117, "grad_norm": 15.561851501464844, "learning_rate": 4.106759213353165e-05, "loss": 1.5978, "step": 15780 }, { "epoch": 2.6093782276389175, "grad_norm": 11.034695625305176, "learning_rate": 4.105841091463303e-05, "loss": 1.608, "step": 15790 }, { "epoch": 2.611030778764718, "grad_norm": 63.234703063964844, "learning_rate": 4.104922969573441e-05, "loss": 1.6947, "step": 15800 }, { "epoch": 2.6126833298905185, "grad_norm": 8.965785026550293, "learning_rate": 4.1040048476835786e-05, "loss": 1.6128, "step": 15810 }, { "epoch": 2.614335881016319, "grad_norm": 7.541203022003174, "learning_rate": 4.103086725793717e-05, "loss": 1.5494, "step": 15820 }, { "epoch": 2.615988432142119, "grad_norm": 12.703873634338379, "learning_rate": 4.1021686039038545e-05, "loss": 1.6318, "step": 15830 }, { "epoch": 2.6176409832679197, "grad_norm": 8.111274719238281, "learning_rate": 4.101250482013993e-05, "loss": 1.5468, "step": 15840 }, { "epoch": 2.6192935343937203, "grad_norm": 23.11355209350586, "learning_rate": 4.10033236012413e-05, "loss": 1.6287, "step": 15850 }, { "epoch": 2.620946085519521, "grad_norm": 12.819192886352539, "learning_rate": 4.099414238234268e-05, "loss": 1.6666, "step": 15860 }, { "epoch": 2.6225986366453213, "grad_norm": 9.071503639221191, "learning_rate": 4.098496116344406e-05, "loss": 1.6456, "step": 15870 }, { "epoch": 2.6242511877711214, "grad_norm": 14.59699821472168, "learning_rate": 4.097577994454544e-05, "loss": 1.5137, "step": 15880 }, { "epoch": 2.625903738896922, "grad_norm": 11.511642456054688, "learning_rate": 4.096659872564682e-05, "loss": 1.5802, "step": 15890 }, { "epoch": 2.6275562900227225, "grad_norm": 13.53958511352539, "learning_rate": 4.0957417506748196e-05, "loss": 1.704, "step": 15900 }, { "epoch": 2.629208841148523, "grad_norm": 12.743249893188477, "learning_rate": 4.094823628784958e-05, "loss": 1.567, "step": 15910 }, { "epoch": 2.6308613922743236, "grad_norm": 31.744903564453125, "learning_rate": 4.0939055068950955e-05, "loss": 1.4322, "step": 15920 }, { "epoch": 2.6325139434001237, "grad_norm": 12.78609848022461, "learning_rate": 4.092987385005234e-05, "loss": 1.7043, "step": 15930 }, { "epoch": 2.6341664945259247, "grad_norm": 19.535892486572266, "learning_rate": 4.092069263115371e-05, "loss": 1.569, "step": 15940 }, { "epoch": 2.6358190456517248, "grad_norm": 13.720259666442871, "learning_rate": 4.0911511412255096e-05, "loss": 1.6444, "step": 15950 }, { "epoch": 2.6374715967775253, "grad_norm": 7.525299549102783, "learning_rate": 4.090233019335647e-05, "loss": 1.511, "step": 15960 }, { "epoch": 2.639124147903326, "grad_norm": 12.380800247192383, "learning_rate": 4.0893148974457854e-05, "loss": 1.6673, "step": 15970 }, { "epoch": 2.6407766990291264, "grad_norm": 14.61955451965332, "learning_rate": 4.088396775555923e-05, "loss": 1.5086, "step": 15980 }, { "epoch": 2.642429250154927, "grad_norm": 11.027737617492676, "learning_rate": 4.0874786536660606e-05, "loss": 1.7163, "step": 15990 }, { "epoch": 2.644081801280727, "grad_norm": 19.45004653930664, "learning_rate": 4.086560531776199e-05, "loss": 1.6217, "step": 16000 }, { "epoch": 2.6457343524065275, "grad_norm": 13.399431228637695, "learning_rate": 4.0856424098863364e-05, "loss": 1.5046, "step": 16010 }, { "epoch": 2.647386903532328, "grad_norm": 11.20180892944336, "learning_rate": 4.084724287996475e-05, "loss": 1.5504, "step": 16020 }, { "epoch": 2.6490394546581286, "grad_norm": 24.36643409729004, "learning_rate": 4.083806166106612e-05, "loss": 1.633, "step": 16030 }, { "epoch": 2.650692005783929, "grad_norm": 9.366679191589355, "learning_rate": 4.0828880442167506e-05, "loss": 1.6683, "step": 16040 }, { "epoch": 2.6523445569097293, "grad_norm": 11.785042762756348, "learning_rate": 4.081969922326888e-05, "loss": 1.6522, "step": 16050 }, { "epoch": 2.65399710803553, "grad_norm": 5.310166358947754, "learning_rate": 4.0810518004370264e-05, "loss": 1.621, "step": 16060 }, { "epoch": 2.6556496591613303, "grad_norm": 39.18266677856445, "learning_rate": 4.080133678547164e-05, "loss": 1.6034, "step": 16070 }, { "epoch": 2.657302210287131, "grad_norm": 12.86594009399414, "learning_rate": 4.079215556657302e-05, "loss": 1.438, "step": 16080 }, { "epoch": 2.6589547614129314, "grad_norm": 13.897305488586426, "learning_rate": 4.0782974347674405e-05, "loss": 1.4932, "step": 16090 }, { "epoch": 2.6606073125387315, "grad_norm": 25.27904510498047, "learning_rate": 4.077379312877578e-05, "loss": 1.7065, "step": 16100 }, { "epoch": 2.662259863664532, "grad_norm": 10.040287017822266, "learning_rate": 4.076461190987716e-05, "loss": 1.571, "step": 16110 }, { "epoch": 2.6639124147903326, "grad_norm": 11.403328895568848, "learning_rate": 4.075543069097853e-05, "loss": 1.6483, "step": 16120 }, { "epoch": 2.665564965916133, "grad_norm": 10.510461807250977, "learning_rate": 4.0746249472079915e-05, "loss": 1.6841, "step": 16130 }, { "epoch": 2.6672175170419337, "grad_norm": 6.346688270568848, "learning_rate": 4.073706825318129e-05, "loss": 1.6634, "step": 16140 }, { "epoch": 2.6688700681677338, "grad_norm": 6.1756696701049805, "learning_rate": 4.0727887034282674e-05, "loss": 1.6516, "step": 16150 }, { "epoch": 2.6705226192935343, "grad_norm": 10.464988708496094, "learning_rate": 4.071870581538405e-05, "loss": 1.6066, "step": 16160 }, { "epoch": 2.672175170419335, "grad_norm": 18.168760299682617, "learning_rate": 4.070952459648543e-05, "loss": 1.6268, "step": 16170 }, { "epoch": 2.6738277215451354, "grad_norm": 31.59029197692871, "learning_rate": 4.070034337758681e-05, "loss": 1.593, "step": 16180 }, { "epoch": 2.675480272670936, "grad_norm": 13.246702194213867, "learning_rate": 4.069116215868819e-05, "loss": 1.6339, "step": 16190 }, { "epoch": 2.677132823796736, "grad_norm": 13.714371681213379, "learning_rate": 4.068198093978957e-05, "loss": 1.6681, "step": 16200 }, { "epoch": 2.6787853749225365, "grad_norm": 9.815982818603516, "learning_rate": 4.067279972089095e-05, "loss": 1.4579, "step": 16210 }, { "epoch": 2.680437926048337, "grad_norm": 11.735651016235352, "learning_rate": 4.066361850199233e-05, "loss": 1.6187, "step": 16220 }, { "epoch": 2.6820904771741376, "grad_norm": 10.747529983520508, "learning_rate": 4.065443728309371e-05, "loss": 1.5022, "step": 16230 }, { "epoch": 2.683743028299938, "grad_norm": 19.077959060668945, "learning_rate": 4.0645256064195084e-05, "loss": 1.5181, "step": 16240 }, { "epoch": 2.6853955794257383, "grad_norm": 23.054729461669922, "learning_rate": 4.063607484529646e-05, "loss": 1.6234, "step": 16250 }, { "epoch": 2.687048130551539, "grad_norm": 6.9372992515563965, "learning_rate": 4.062689362639784e-05, "loss": 1.5558, "step": 16260 }, { "epoch": 2.6887006816773393, "grad_norm": 9.9459867477417, "learning_rate": 4.061771240749922e-05, "loss": 1.4208, "step": 16270 }, { "epoch": 2.69035323280314, "grad_norm": 8.664091110229492, "learning_rate": 4.06085311886006e-05, "loss": 1.6421, "step": 16280 }, { "epoch": 2.6920057839289404, "grad_norm": 11.299137115478516, "learning_rate": 4.0599349969701977e-05, "loss": 1.6188, "step": 16290 }, { "epoch": 2.6936583350547405, "grad_norm": 8.935805320739746, "learning_rate": 4.059016875080336e-05, "loss": 1.6265, "step": 16300 }, { "epoch": 2.6953108861805415, "grad_norm": 8.099499702453613, "learning_rate": 4.0580987531904735e-05, "loss": 1.4238, "step": 16310 }, { "epoch": 2.6969634373063416, "grad_norm": 6.492707252502441, "learning_rate": 4.057180631300612e-05, "loss": 1.5873, "step": 16320 }, { "epoch": 2.698615988432142, "grad_norm": 6.262495517730713, "learning_rate": 4.05626250941075e-05, "loss": 1.639, "step": 16330 }, { "epoch": 2.7002685395579427, "grad_norm": 16.449399948120117, "learning_rate": 4.0553443875208876e-05, "loss": 1.5878, "step": 16340 }, { "epoch": 2.7019210906837428, "grad_norm": 21.750577926635742, "learning_rate": 4.054426265631026e-05, "loss": 1.7173, "step": 16350 }, { "epoch": 2.7035736418095437, "grad_norm": 10.185629844665527, "learning_rate": 4.0535081437411635e-05, "loss": 1.5977, "step": 16360 }, { "epoch": 2.705226192935344, "grad_norm": 66.03086853027344, "learning_rate": 4.052590021851301e-05, "loss": 1.7398, "step": 16370 }, { "epoch": 2.7068787440611444, "grad_norm": 6.185853481292725, "learning_rate": 4.0516718999614386e-05, "loss": 1.5928, "step": 16380 }, { "epoch": 2.708531295186945, "grad_norm": 5.825438499450684, "learning_rate": 4.050753778071577e-05, "loss": 1.5435, "step": 16390 }, { "epoch": 2.7101838463127454, "grad_norm": 19.0218505859375, "learning_rate": 4.0498356561817145e-05, "loss": 1.6741, "step": 16400 }, { "epoch": 2.711836397438546, "grad_norm": 12.254176139831543, "learning_rate": 4.048917534291853e-05, "loss": 1.5611, "step": 16410 }, { "epoch": 2.713488948564346, "grad_norm": 18.687061309814453, "learning_rate": 4.04799941240199e-05, "loss": 1.5091, "step": 16420 }, { "epoch": 2.7151414996901466, "grad_norm": 11.243393898010254, "learning_rate": 4.0470812905121286e-05, "loss": 1.5715, "step": 16430 }, { "epoch": 2.716794050815947, "grad_norm": 15.61385726928711, "learning_rate": 4.046163168622267e-05, "loss": 1.4761, "step": 16440 }, { "epoch": 2.7184466019417477, "grad_norm": 6.631096839904785, "learning_rate": 4.0452450467324045e-05, "loss": 1.6744, "step": 16450 }, { "epoch": 2.7200991530675482, "grad_norm": 31.41004753112793, "learning_rate": 4.044326924842543e-05, "loss": 1.4479, "step": 16460 }, { "epoch": 2.7217517041933483, "grad_norm": 8.312516212463379, "learning_rate": 4.04340880295268e-05, "loss": 1.4798, "step": 16470 }, { "epoch": 2.723404255319149, "grad_norm": 16.965839385986328, "learning_rate": 4.0424906810628186e-05, "loss": 1.5273, "step": 16480 }, { "epoch": 2.7250568064449494, "grad_norm": 22.01923179626465, "learning_rate": 4.041572559172956e-05, "loss": 1.5548, "step": 16490 }, { "epoch": 2.72670935757075, "grad_norm": 8.084673881530762, "learning_rate": 4.040654437283094e-05, "loss": 1.6711, "step": 16500 }, { "epoch": 2.7283619086965505, "grad_norm": 8.843356132507324, "learning_rate": 4.039736315393231e-05, "loss": 1.5687, "step": 16510 }, { "epoch": 2.7300144598223506, "grad_norm": 10.197770118713379, "learning_rate": 4.0388181935033696e-05, "loss": 1.5118, "step": 16520 }, { "epoch": 2.731667010948151, "grad_norm": 38.914649963378906, "learning_rate": 4.037900071613507e-05, "loss": 1.6277, "step": 16530 }, { "epoch": 2.7333195620739517, "grad_norm": 5.684174060821533, "learning_rate": 4.0369819497236454e-05, "loss": 1.7301, "step": 16540 }, { "epoch": 2.734972113199752, "grad_norm": 12.711173057556152, "learning_rate": 4.036063827833784e-05, "loss": 1.4475, "step": 16550 }, { "epoch": 2.7366246643255527, "grad_norm": 48.89714813232422, "learning_rate": 4.035145705943921e-05, "loss": 1.584, "step": 16560 }, { "epoch": 2.738277215451353, "grad_norm": 22.189855575561523, "learning_rate": 4.0342275840540595e-05, "loss": 1.6205, "step": 16570 }, { "epoch": 2.7399297665771534, "grad_norm": 7.027297019958496, "learning_rate": 4.033309462164197e-05, "loss": 1.6204, "step": 16580 }, { "epoch": 2.741582317702954, "grad_norm": 16.53078842163086, "learning_rate": 4.0323913402743354e-05, "loss": 1.5647, "step": 16590 }, { "epoch": 2.7432348688287544, "grad_norm": 14.532068252563477, "learning_rate": 4.031473218384473e-05, "loss": 1.5833, "step": 16600 }, { "epoch": 2.744887419954555, "grad_norm": 17.68378448486328, "learning_rate": 4.030555096494611e-05, "loss": 1.6564, "step": 16610 }, { "epoch": 2.746539971080355, "grad_norm": 8.336248397827148, "learning_rate": 4.029636974604749e-05, "loss": 1.6376, "step": 16620 }, { "epoch": 2.7481925222061556, "grad_norm": 8.331178665161133, "learning_rate": 4.0287188527148864e-05, "loss": 1.5466, "step": 16630 }, { "epoch": 2.749845073331956, "grad_norm": 9.454358100891113, "learning_rate": 4.027800730825024e-05, "loss": 1.6387, "step": 16640 }, { "epoch": 2.7514976244577567, "grad_norm": 19.310138702392578, "learning_rate": 4.026882608935162e-05, "loss": 1.6703, "step": 16650 }, { "epoch": 2.7531501755835572, "grad_norm": 7.6864705085754395, "learning_rate": 4.0259644870453005e-05, "loss": 1.5447, "step": 16660 }, { "epoch": 2.7548027267093573, "grad_norm": 5.934784412384033, "learning_rate": 4.025046365155438e-05, "loss": 1.4783, "step": 16670 }, { "epoch": 2.756455277835158, "grad_norm": 53.78288269042969, "learning_rate": 4.0241282432655764e-05, "loss": 1.7438, "step": 16680 }, { "epoch": 2.7581078289609584, "grad_norm": 9.907873153686523, "learning_rate": 4.023210121375714e-05, "loss": 1.5094, "step": 16690 }, { "epoch": 2.759760380086759, "grad_norm": 17.700838088989258, "learning_rate": 4.022291999485852e-05, "loss": 1.5519, "step": 16700 }, { "epoch": 2.7614129312125595, "grad_norm": 11.269119262695312, "learning_rate": 4.02137387759599e-05, "loss": 1.661, "step": 16710 }, { "epoch": 2.7630654823383596, "grad_norm": 12.648497581481934, "learning_rate": 4.020455755706128e-05, "loss": 1.7044, "step": 16720 }, { "epoch": 2.7647180334641606, "grad_norm": 7.257272720336914, "learning_rate": 4.019537633816266e-05, "loss": 1.5785, "step": 16730 }, { "epoch": 2.7663705845899607, "grad_norm": 10.091726303100586, "learning_rate": 4.018619511926404e-05, "loss": 1.5245, "step": 16740 }, { "epoch": 2.768023135715761, "grad_norm": 23.164775848388672, "learning_rate": 4.0177013900365415e-05, "loss": 1.5764, "step": 16750 }, { "epoch": 2.7696756868415617, "grad_norm": 21.203231811523438, "learning_rate": 4.016783268146679e-05, "loss": 1.5322, "step": 16760 }, { "epoch": 2.7713282379673623, "grad_norm": 8.188257217407227, "learning_rate": 4.015865146256817e-05, "loss": 1.5544, "step": 16770 }, { "epoch": 2.772980789093163, "grad_norm": 5.449336051940918, "learning_rate": 4.014947024366955e-05, "loss": 1.5638, "step": 16780 }, { "epoch": 2.774633340218963, "grad_norm": 30.01235008239746, "learning_rate": 4.014028902477093e-05, "loss": 1.6193, "step": 16790 }, { "epoch": 2.7762858913447634, "grad_norm": 7.071667194366455, "learning_rate": 4.013110780587231e-05, "loss": 1.5834, "step": 16800 }, { "epoch": 2.777938442470564, "grad_norm": 7.050570011138916, "learning_rate": 4.012192658697369e-05, "loss": 1.4917, "step": 16810 }, { "epoch": 2.7795909935963645, "grad_norm": 6.538548469543457, "learning_rate": 4.0112745368075067e-05, "loss": 1.6527, "step": 16820 }, { "epoch": 2.781243544722165, "grad_norm": 10.154224395751953, "learning_rate": 4.010356414917645e-05, "loss": 1.5722, "step": 16830 }, { "epoch": 2.782896095847965, "grad_norm": 13.937148094177246, "learning_rate": 4.0094382930277825e-05, "loss": 1.6655, "step": 16840 }, { "epoch": 2.7845486469737657, "grad_norm": 10.971857070922852, "learning_rate": 4.008520171137921e-05, "loss": 1.6278, "step": 16850 }, { "epoch": 2.7862011980995662, "grad_norm": 19.598770141601562, "learning_rate": 4.0076020492480584e-05, "loss": 1.6375, "step": 16860 }, { "epoch": 2.7878537492253668, "grad_norm": 13.447637557983398, "learning_rate": 4.0066839273581966e-05, "loss": 1.5709, "step": 16870 }, { "epoch": 2.7895063003511673, "grad_norm": 12.843521118164062, "learning_rate": 4.005765805468334e-05, "loss": 1.6029, "step": 16880 }, { "epoch": 2.7911588514769674, "grad_norm": 58.23180389404297, "learning_rate": 4.004847683578472e-05, "loss": 1.5457, "step": 16890 }, { "epoch": 2.792811402602768, "grad_norm": 9.176880836486816, "learning_rate": 4.00392956168861e-05, "loss": 1.5729, "step": 16900 }, { "epoch": 2.7944639537285685, "grad_norm": 5.380598545074463, "learning_rate": 4.0030114397987476e-05, "loss": 1.5439, "step": 16910 }, { "epoch": 2.796116504854369, "grad_norm": 8.638605117797852, "learning_rate": 4.002093317908886e-05, "loss": 1.5537, "step": 16920 }, { "epoch": 2.7977690559801696, "grad_norm": 5.76012659072876, "learning_rate": 4.0011751960190235e-05, "loss": 1.521, "step": 16930 }, { "epoch": 2.7994216071059697, "grad_norm": 8.071260452270508, "learning_rate": 4.000257074129162e-05, "loss": 1.559, "step": 16940 }, { "epoch": 2.80107415823177, "grad_norm": 13.014986991882324, "learning_rate": 3.999338952239299e-05, "loss": 1.6444, "step": 16950 }, { "epoch": 2.8027267093575707, "grad_norm": 5.6536431312561035, "learning_rate": 3.9984208303494376e-05, "loss": 1.6925, "step": 16960 }, { "epoch": 2.8043792604833713, "grad_norm": 6.50076961517334, "learning_rate": 3.997502708459575e-05, "loss": 1.5527, "step": 16970 }, { "epoch": 2.806031811609172, "grad_norm": 11.952144622802734, "learning_rate": 3.9965845865697134e-05, "loss": 1.6111, "step": 16980 }, { "epoch": 2.807684362734972, "grad_norm": 12.421814918518066, "learning_rate": 3.995666464679851e-05, "loss": 1.5704, "step": 16990 }, { "epoch": 2.8093369138607724, "grad_norm": 7.645977973937988, "learning_rate": 3.994748342789989e-05, "loss": 1.6001, "step": 17000 }, { "epoch": 2.810989464986573, "grad_norm": 9.946296691894531, "learning_rate": 3.993830220900127e-05, "loss": 1.5803, "step": 17010 }, { "epoch": 2.8126420161123735, "grad_norm": 61.43771743774414, "learning_rate": 3.9929120990102645e-05, "loss": 1.5299, "step": 17020 }, { "epoch": 2.814294567238174, "grad_norm": 13.09609317779541, "learning_rate": 3.991993977120403e-05, "loss": 1.5403, "step": 17030 }, { "epoch": 2.815947118363974, "grad_norm": 10.614474296569824, "learning_rate": 3.99107585523054e-05, "loss": 1.5166, "step": 17040 }, { "epoch": 2.8175996694897747, "grad_norm": 64.39093780517578, "learning_rate": 3.9901577333406786e-05, "loss": 1.6604, "step": 17050 }, { "epoch": 2.8192522206155752, "grad_norm": 27.681407928466797, "learning_rate": 3.989239611450816e-05, "loss": 1.6366, "step": 17060 }, { "epoch": 2.8209047717413758, "grad_norm": 9.592436790466309, "learning_rate": 3.9883214895609544e-05, "loss": 1.5281, "step": 17070 }, { "epoch": 2.8225573228671763, "grad_norm": 10.364959716796875, "learning_rate": 3.987403367671092e-05, "loss": 1.6947, "step": 17080 }, { "epoch": 2.8242098739929764, "grad_norm": 8.182236671447754, "learning_rate": 3.98648524578123e-05, "loss": 1.6698, "step": 17090 }, { "epoch": 2.8258624251187774, "grad_norm": 13.211899757385254, "learning_rate": 3.985567123891368e-05, "loss": 1.5847, "step": 17100 }, { "epoch": 2.8275149762445775, "grad_norm": 12.27530288696289, "learning_rate": 3.984649002001506e-05, "loss": 1.6346, "step": 17110 }, { "epoch": 2.829167527370378, "grad_norm": 27.445775985717773, "learning_rate": 3.9837308801116444e-05, "loss": 1.5896, "step": 17120 }, { "epoch": 2.8308200784961786, "grad_norm": 107.77072143554688, "learning_rate": 3.982812758221782e-05, "loss": 1.5398, "step": 17130 }, { "epoch": 2.8324726296219787, "grad_norm": 6.350532531738281, "learning_rate": 3.9818946363319196e-05, "loss": 1.6412, "step": 17140 }, { "epoch": 2.8341251807477796, "grad_norm": 10.610336303710938, "learning_rate": 3.980976514442057e-05, "loss": 1.5777, "step": 17150 }, { "epoch": 2.8357777318735797, "grad_norm": 6.003240585327148, "learning_rate": 3.9800583925521954e-05, "loss": 1.5696, "step": 17160 }, { "epoch": 2.8374302829993803, "grad_norm": 7.450899124145508, "learning_rate": 3.979140270662333e-05, "loss": 1.6595, "step": 17170 }, { "epoch": 2.839082834125181, "grad_norm": 10.081241607666016, "learning_rate": 3.978222148772471e-05, "loss": 1.5032, "step": 17180 }, { "epoch": 2.8407353852509813, "grad_norm": 10.23855972290039, "learning_rate": 3.977304026882609e-05, "loss": 1.5916, "step": 17190 }, { "epoch": 2.842387936376782, "grad_norm": 9.486077308654785, "learning_rate": 3.976385904992747e-05, "loss": 1.6483, "step": 17200 }, { "epoch": 2.844040487502582, "grad_norm": 22.05427360534668, "learning_rate": 3.975467783102885e-05, "loss": 1.6892, "step": 17210 }, { "epoch": 2.8456930386283825, "grad_norm": 16.26018524169922, "learning_rate": 3.974549661213023e-05, "loss": 1.5048, "step": 17220 }, { "epoch": 2.847345589754183, "grad_norm": 14.785170555114746, "learning_rate": 3.9736315393231606e-05, "loss": 1.7072, "step": 17230 }, { "epoch": 2.8489981408799836, "grad_norm": 7.535512924194336, "learning_rate": 3.972713417433299e-05, "loss": 1.6278, "step": 17240 }, { "epoch": 2.850650692005784, "grad_norm": 14.22430419921875, "learning_rate": 3.971795295543437e-05, "loss": 1.6181, "step": 17250 }, { "epoch": 2.8523032431315842, "grad_norm": 9.323792457580566, "learning_rate": 3.970877173653575e-05, "loss": 1.72, "step": 17260 }, { "epoch": 2.8539557942573848, "grad_norm": 9.750657081604004, "learning_rate": 3.969959051763712e-05, "loss": 1.4657, "step": 17270 }, { "epoch": 2.8556083453831853, "grad_norm": 16.884572982788086, "learning_rate": 3.96904092987385e-05, "loss": 1.6819, "step": 17280 }, { "epoch": 2.857260896508986, "grad_norm": 29.167482376098633, "learning_rate": 3.968122807983988e-05, "loss": 1.6451, "step": 17290 }, { "epoch": 2.8589134476347864, "grad_norm": 36.17399215698242, "learning_rate": 3.967204686094126e-05, "loss": 1.6083, "step": 17300 }, { "epoch": 2.8605659987605865, "grad_norm": 13.15809440612793, "learning_rate": 3.966286564204264e-05, "loss": 1.6859, "step": 17310 }, { "epoch": 2.862218549886387, "grad_norm": 9.510421752929688, "learning_rate": 3.9653684423144015e-05, "loss": 1.6486, "step": 17320 }, { "epoch": 2.8638711010121876, "grad_norm": 9.244708061218262, "learning_rate": 3.96445032042454e-05, "loss": 1.5877, "step": 17330 }, { "epoch": 2.865523652137988, "grad_norm": 6.563562393188477, "learning_rate": 3.9635321985346774e-05, "loss": 1.575, "step": 17340 }, { "epoch": 2.8671762032637886, "grad_norm": 15.244311332702637, "learning_rate": 3.9626140766448156e-05, "loss": 1.55, "step": 17350 }, { "epoch": 2.8688287543895887, "grad_norm": 5.563783645629883, "learning_rate": 3.961695954754954e-05, "loss": 1.5702, "step": 17360 }, { "epoch": 2.8704813055153893, "grad_norm": 11.255026817321777, "learning_rate": 3.9607778328650915e-05, "loss": 1.713, "step": 17370 }, { "epoch": 2.87213385664119, "grad_norm": 13.83602237701416, "learning_rate": 3.95985971097523e-05, "loss": 1.4584, "step": 17380 }, { "epoch": 2.8737864077669903, "grad_norm": 10.390022277832031, "learning_rate": 3.9589415890853673e-05, "loss": 1.5963, "step": 17390 }, { "epoch": 2.875438958892791, "grad_norm": 6.194431781768799, "learning_rate": 3.958023467195505e-05, "loss": 1.5349, "step": 17400 }, { "epoch": 2.877091510018591, "grad_norm": 6.946686744689941, "learning_rate": 3.9571053453056425e-05, "loss": 1.5731, "step": 17410 }, { "epoch": 2.8787440611443915, "grad_norm": 11.890841484069824, "learning_rate": 3.956187223415781e-05, "loss": 1.5738, "step": 17420 }, { "epoch": 2.880396612270192, "grad_norm": 4.596217632293701, "learning_rate": 3.9552691015259184e-05, "loss": 1.5325, "step": 17430 }, { "epoch": 2.8820491633959926, "grad_norm": 9.431676864624023, "learning_rate": 3.9543509796360566e-05, "loss": 1.6274, "step": 17440 }, { "epoch": 2.883701714521793, "grad_norm": 9.621706008911133, "learning_rate": 3.953432857746194e-05, "loss": 1.5976, "step": 17450 }, { "epoch": 2.8853542656475932, "grad_norm": 6.648024559020996, "learning_rate": 3.9525147358563325e-05, "loss": 1.5725, "step": 17460 }, { "epoch": 2.8870068167733938, "grad_norm": 33.377403259277344, "learning_rate": 3.951596613966471e-05, "loss": 1.5394, "step": 17470 }, { "epoch": 2.8886593678991943, "grad_norm": 7.782329082489014, "learning_rate": 3.950678492076608e-05, "loss": 1.5998, "step": 17480 }, { "epoch": 2.890311919024995, "grad_norm": 6.886441230773926, "learning_rate": 3.9497603701867466e-05, "loss": 1.6038, "step": 17490 }, { "epoch": 2.8919644701507954, "grad_norm": 7.424654006958008, "learning_rate": 3.948842248296884e-05, "loss": 1.5715, "step": 17500 }, { "epoch": 2.8936170212765955, "grad_norm": 35.0805549621582, "learning_rate": 3.9479241264070224e-05, "loss": 1.5405, "step": 17510 }, { "epoch": 2.8952695724023965, "grad_norm": 8.772028923034668, "learning_rate": 3.94700600451716e-05, "loss": 1.6665, "step": 17520 }, { "epoch": 2.8969221235281966, "grad_norm": 10.116393089294434, "learning_rate": 3.9460878826272976e-05, "loss": 1.4747, "step": 17530 }, { "epoch": 2.898574674653997, "grad_norm": 12.616483688354492, "learning_rate": 3.945169760737435e-05, "loss": 1.6141, "step": 17540 }, { "epoch": 2.9002272257797976, "grad_norm": 10.682483673095703, "learning_rate": 3.9442516388475735e-05, "loss": 1.526, "step": 17550 }, { "epoch": 2.901879776905598, "grad_norm": 12.046555519104004, "learning_rate": 3.943333516957711e-05, "loss": 1.5962, "step": 17560 }, { "epoch": 2.9035323280313987, "grad_norm": 9.173718452453613, "learning_rate": 3.942415395067849e-05, "loss": 1.5042, "step": 17570 }, { "epoch": 2.905184879157199, "grad_norm": 7.157196998596191, "learning_rate": 3.9414972731779876e-05, "loss": 1.5015, "step": 17580 }, { "epoch": 2.9068374302829993, "grad_norm": 9.088848114013672, "learning_rate": 3.940579151288125e-05, "loss": 1.5697, "step": 17590 }, { "epoch": 2.9084899814088, "grad_norm": 7.118719577789307, "learning_rate": 3.9396610293982634e-05, "loss": 1.5875, "step": 17600 }, { "epoch": 2.9101425325346004, "grad_norm": 4.978763103485107, "learning_rate": 3.938742907508401e-05, "loss": 1.4973, "step": 17610 }, { "epoch": 2.911795083660401, "grad_norm": 11.87973403930664, "learning_rate": 3.937824785618539e-05, "loss": 1.6336, "step": 17620 }, { "epoch": 2.913447634786201, "grad_norm": 23.558897018432617, "learning_rate": 3.936906663728677e-05, "loss": 1.4903, "step": 17630 }, { "epoch": 2.9151001859120016, "grad_norm": 8.35072135925293, "learning_rate": 3.935988541838815e-05, "loss": 1.5911, "step": 17640 }, { "epoch": 2.916752737037802, "grad_norm": 7.296316623687744, "learning_rate": 3.935070419948953e-05, "loss": 1.4861, "step": 17650 }, { "epoch": 2.9184052881636027, "grad_norm": 6.886360168457031, "learning_rate": 3.93415229805909e-05, "loss": 1.4605, "step": 17660 }, { "epoch": 2.920057839289403, "grad_norm": 9.66377067565918, "learning_rate": 3.933234176169228e-05, "loss": 1.4889, "step": 17670 }, { "epoch": 2.9217103904152033, "grad_norm": 12.230368614196777, "learning_rate": 3.932316054279366e-05, "loss": 1.6329, "step": 17680 }, { "epoch": 2.923362941541004, "grad_norm": 23.454057693481445, "learning_rate": 3.9313979323895044e-05, "loss": 1.5017, "step": 17690 }, { "epoch": 2.9250154926668044, "grad_norm": 10.6051025390625, "learning_rate": 3.930479810499642e-05, "loss": 1.6555, "step": 17700 }, { "epoch": 2.926668043792605, "grad_norm": 5.926885604858398, "learning_rate": 3.92956168860978e-05, "loss": 1.5747, "step": 17710 }, { "epoch": 2.9283205949184055, "grad_norm": 9.678142547607422, "learning_rate": 3.928643566719918e-05, "loss": 1.7058, "step": 17720 }, { "epoch": 2.9299731460442056, "grad_norm": 6.809134483337402, "learning_rate": 3.927725444830056e-05, "loss": 1.5812, "step": 17730 }, { "epoch": 2.931625697170006, "grad_norm": 25.03977394104004, "learning_rate": 3.926807322940194e-05, "loss": 1.5576, "step": 17740 }, { "epoch": 2.9332782482958066, "grad_norm": 6.897164344787598, "learning_rate": 3.925889201050332e-05, "loss": 1.5183, "step": 17750 }, { "epoch": 2.934930799421607, "grad_norm": 6.26050329208374, "learning_rate": 3.9249710791604695e-05, "loss": 1.7254, "step": 17760 }, { "epoch": 2.9365833505474077, "grad_norm": 7.9940385818481445, "learning_rate": 3.924052957270608e-05, "loss": 1.5969, "step": 17770 }, { "epoch": 2.938235901673208, "grad_norm": 13.388635635375977, "learning_rate": 3.9231348353807454e-05, "loss": 1.5729, "step": 17780 }, { "epoch": 2.9398884527990083, "grad_norm": 5.230599880218506, "learning_rate": 3.922216713490883e-05, "loss": 1.6222, "step": 17790 }, { "epoch": 2.941541003924809, "grad_norm": 13.589559555053711, "learning_rate": 3.9212985916010206e-05, "loss": 1.6322, "step": 17800 }, { "epoch": 2.9431935550506094, "grad_norm": 22.378572463989258, "learning_rate": 3.920380469711159e-05, "loss": 1.6475, "step": 17810 }, { "epoch": 2.94484610617641, "grad_norm": 6.242623805999756, "learning_rate": 3.919462347821297e-05, "loss": 1.6062, "step": 17820 }, { "epoch": 2.94649865730221, "grad_norm": 14.694461822509766, "learning_rate": 3.918544225931435e-05, "loss": 1.6167, "step": 17830 }, { "epoch": 2.9481512084280106, "grad_norm": 6.985626697540283, "learning_rate": 3.917626104041573e-05, "loss": 1.7133, "step": 17840 }, { "epoch": 2.949803759553811, "grad_norm": 30.49583625793457, "learning_rate": 3.9167079821517105e-05, "loss": 1.6603, "step": 17850 }, { "epoch": 2.9514563106796117, "grad_norm": 16.23304557800293, "learning_rate": 3.915789860261849e-05, "loss": 1.5594, "step": 17860 }, { "epoch": 2.953108861805412, "grad_norm": 8.873785972595215, "learning_rate": 3.9148717383719864e-05, "loss": 1.6136, "step": 17870 }, { "epoch": 2.9547614129312123, "grad_norm": 10.205568313598633, "learning_rate": 3.9139536164821246e-05, "loss": 1.5299, "step": 17880 }, { "epoch": 2.9564139640570133, "grad_norm": 7.908629894256592, "learning_rate": 3.913035494592262e-05, "loss": 1.5855, "step": 17890 }, { "epoch": 2.9580665151828134, "grad_norm": 14.08654499053955, "learning_rate": 3.9121173727024005e-05, "loss": 1.6594, "step": 17900 }, { "epoch": 2.959719066308614, "grad_norm": 6.650778293609619, "learning_rate": 3.911199250812538e-05, "loss": 1.5049, "step": 17910 }, { "epoch": 2.9613716174344145, "grad_norm": 14.497530937194824, "learning_rate": 3.910281128922676e-05, "loss": 1.5831, "step": 17920 }, { "epoch": 2.9630241685602146, "grad_norm": 9.797993659973145, "learning_rate": 3.909363007032814e-05, "loss": 1.6278, "step": 17930 }, { "epoch": 2.9646767196860155, "grad_norm": 8.208698272705078, "learning_rate": 3.9084448851429515e-05, "loss": 1.7192, "step": 17940 }, { "epoch": 2.9663292708118156, "grad_norm": 27.290040969848633, "learning_rate": 3.90752676325309e-05, "loss": 1.5312, "step": 17950 }, { "epoch": 2.967981821937616, "grad_norm": 7.827337741851807, "learning_rate": 3.9066086413632274e-05, "loss": 1.4619, "step": 17960 }, { "epoch": 2.9696343730634167, "grad_norm": 9.8374605178833, "learning_rate": 3.9056905194733656e-05, "loss": 1.6841, "step": 17970 }, { "epoch": 2.9712869241892172, "grad_norm": 6.857816219329834, "learning_rate": 3.904772397583503e-05, "loss": 1.6227, "step": 17980 }, { "epoch": 2.972939475315018, "grad_norm": 12.391343116760254, "learning_rate": 3.9038542756936415e-05, "loss": 1.6735, "step": 17990 }, { "epoch": 2.974592026440818, "grad_norm": 9.983665466308594, "learning_rate": 3.902936153803779e-05, "loss": 1.5625, "step": 18000 }, { "epoch": 2.9762445775666184, "grad_norm": 6.569074630737305, "learning_rate": 3.902018031913917e-05, "loss": 1.408, "step": 18010 }, { "epoch": 2.977897128692419, "grad_norm": 10.561470985412598, "learning_rate": 3.901099910024055e-05, "loss": 1.5342, "step": 18020 }, { "epoch": 2.9795496798182195, "grad_norm": 7.086844444274902, "learning_rate": 3.900181788134193e-05, "loss": 1.7286, "step": 18030 }, { "epoch": 2.98120223094402, "grad_norm": 15.396596908569336, "learning_rate": 3.899263666244331e-05, "loss": 1.4908, "step": 18040 }, { "epoch": 2.98285478206982, "grad_norm": 16.880029678344727, "learning_rate": 3.8983455443544683e-05, "loss": 1.623, "step": 18050 }, { "epoch": 2.9845073331956207, "grad_norm": 35.76949691772461, "learning_rate": 3.8974274224646066e-05, "loss": 1.6083, "step": 18060 }, { "epoch": 2.986159884321421, "grad_norm": 11.887407302856445, "learning_rate": 3.896509300574744e-05, "loss": 1.5319, "step": 18070 }, { "epoch": 2.9878124354472217, "grad_norm": 15.86429500579834, "learning_rate": 3.8955911786848825e-05, "loss": 1.6974, "step": 18080 }, { "epoch": 2.9894649865730223, "grad_norm": 45.645751953125, "learning_rate": 3.89467305679502e-05, "loss": 1.5547, "step": 18090 }, { "epoch": 2.9911175376988224, "grad_norm": 4.425729274749756, "learning_rate": 3.893754934905158e-05, "loss": 1.6069, "step": 18100 }, { "epoch": 2.992770088824623, "grad_norm": 14.894420623779297, "learning_rate": 3.892836813015296e-05, "loss": 1.4857, "step": 18110 }, { "epoch": 2.9944226399504235, "grad_norm": 12.32193660736084, "learning_rate": 3.891918691125434e-05, "loss": 1.7373, "step": 18120 }, { "epoch": 2.996075191076224, "grad_norm": 8.957423210144043, "learning_rate": 3.891000569235572e-05, "loss": 1.5757, "step": 18130 }, { "epoch": 2.9977277422020245, "grad_norm": 12.539956092834473, "learning_rate": 3.89008244734571e-05, "loss": 1.6558, "step": 18140 }, { "epoch": 2.9993802933278246, "grad_norm": 18.08677864074707, "learning_rate": 3.889164325455848e-05, "loss": 1.6866, "step": 18150 }, { "epoch": 2.999876058665565, "eval_accuracy": 0.2888352957866364, "eval_loss": 2.114047050476074, "eval_runtime": 817.3743, "eval_samples_per_second": 34.496, "eval_steps_per_second": 8.624, "step": 18153 }, { "epoch": 3.001032844453625, "grad_norm": 7.226766586303711, "learning_rate": 3.888246203565986e-05, "loss": 1.481, "step": 18160 }, { "epoch": 3.0026853955794257, "grad_norm": 6.795170307159424, "learning_rate": 3.8873280816761234e-05, "loss": 1.5827, "step": 18170 }, { "epoch": 3.0043379467052262, "grad_norm": 9.203761100769043, "learning_rate": 3.886409959786261e-05, "loss": 1.5858, "step": 18180 }, { "epoch": 3.005990497831027, "grad_norm": 8.978983879089355, "learning_rate": 3.885491837896399e-05, "loss": 1.6911, "step": 18190 }, { "epoch": 3.007643048956827, "grad_norm": 6.757550239562988, "learning_rate": 3.884573716006537e-05, "loss": 1.4933, "step": 18200 }, { "epoch": 3.0092956000826274, "grad_norm": 6.979025363922119, "learning_rate": 3.883655594116675e-05, "loss": 1.4942, "step": 18210 }, { "epoch": 3.010948151208428, "grad_norm": 8.403514862060547, "learning_rate": 3.882737472226813e-05, "loss": 1.5651, "step": 18220 }, { "epoch": 3.0126007023342285, "grad_norm": 16.475433349609375, "learning_rate": 3.881819350336951e-05, "loss": 1.5098, "step": 18230 }, { "epoch": 3.014253253460029, "grad_norm": 10.676935195922852, "learning_rate": 3.8809012284470886e-05, "loss": 1.5301, "step": 18240 }, { "epoch": 3.0159058045858296, "grad_norm": 80.31954193115234, "learning_rate": 3.879983106557227e-05, "loss": 1.5747, "step": 18250 }, { "epoch": 3.0175583557116297, "grad_norm": 9.030322074890137, "learning_rate": 3.879064984667365e-05, "loss": 1.6205, "step": 18260 }, { "epoch": 3.01921090683743, "grad_norm": 9.721772193908691, "learning_rate": 3.878146862777503e-05, "loss": 1.4312, "step": 18270 }, { "epoch": 3.0208634579632307, "grad_norm": 8.545890808105469, "learning_rate": 3.877228740887641e-05, "loss": 1.6293, "step": 18280 }, { "epoch": 3.0225160090890313, "grad_norm": 15.735355377197266, "learning_rate": 3.8763106189977785e-05, "loss": 1.5873, "step": 18290 }, { "epoch": 3.024168560214832, "grad_norm": 7.491411209106445, "learning_rate": 3.875392497107916e-05, "loss": 1.5171, "step": 18300 }, { "epoch": 3.025821111340632, "grad_norm": 8.309893608093262, "learning_rate": 3.874474375218054e-05, "loss": 1.4795, "step": 18310 }, { "epoch": 3.0274736624664325, "grad_norm": 19.71273422241211, "learning_rate": 3.873556253328192e-05, "loss": 1.5505, "step": 18320 }, { "epoch": 3.029126213592233, "grad_norm": 9.179100036621094, "learning_rate": 3.8726381314383296e-05, "loss": 1.6547, "step": 18330 }, { "epoch": 3.0307787647180335, "grad_norm": 6.885425090789795, "learning_rate": 3.871720009548468e-05, "loss": 1.5285, "step": 18340 }, { "epoch": 3.032431315843834, "grad_norm": 7.15316104888916, "learning_rate": 3.8708018876586054e-05, "loss": 1.4998, "step": 18350 }, { "epoch": 3.034083866969634, "grad_norm": 7.689057350158691, "learning_rate": 3.869883765768744e-05, "loss": 1.5134, "step": 18360 }, { "epoch": 3.0357364180954347, "grad_norm": 6.1142449378967285, "learning_rate": 3.868965643878881e-05, "loss": 1.5328, "step": 18370 }, { "epoch": 3.0373889692212352, "grad_norm": 19.263931274414062, "learning_rate": 3.8680475219890195e-05, "loss": 1.5652, "step": 18380 }, { "epoch": 3.039041520347036, "grad_norm": 17.659536361694336, "learning_rate": 3.867129400099158e-05, "loss": 1.6269, "step": 18390 }, { "epoch": 3.0406940714728363, "grad_norm": 8.499923706054688, "learning_rate": 3.8662112782092954e-05, "loss": 1.5685, "step": 18400 }, { "epoch": 3.042346622598637, "grad_norm": 7.187222480773926, "learning_rate": 3.8652931563194336e-05, "loss": 1.6992, "step": 18410 }, { "epoch": 3.043999173724437, "grad_norm": 5.891327381134033, "learning_rate": 3.864375034429571e-05, "loss": 1.4802, "step": 18420 }, { "epoch": 3.0456517248502375, "grad_norm": 17.82583999633789, "learning_rate": 3.863456912539709e-05, "loss": 1.6052, "step": 18430 }, { "epoch": 3.047304275976038, "grad_norm": 19.411766052246094, "learning_rate": 3.8625387906498464e-05, "loss": 1.4634, "step": 18440 }, { "epoch": 3.0489568271018386, "grad_norm": 9.680952072143555, "learning_rate": 3.861620668759985e-05, "loss": 1.6466, "step": 18450 }, { "epoch": 3.050609378227639, "grad_norm": 7.9444990158081055, "learning_rate": 3.860702546870122e-05, "loss": 1.5706, "step": 18460 }, { "epoch": 3.052261929353439, "grad_norm": 7.4515814781188965, "learning_rate": 3.8597844249802605e-05, "loss": 1.6711, "step": 18470 }, { "epoch": 3.0539144804792397, "grad_norm": 14.548356056213379, "learning_rate": 3.858866303090398e-05, "loss": 1.6145, "step": 18480 }, { "epoch": 3.0555670316050403, "grad_norm": 24.74275016784668, "learning_rate": 3.8579481812005364e-05, "loss": 1.542, "step": 18490 }, { "epoch": 3.057219582730841, "grad_norm": 5.011672496795654, "learning_rate": 3.8570300593106746e-05, "loss": 1.5655, "step": 18500 }, { "epoch": 3.0588721338566414, "grad_norm": 9.48055648803711, "learning_rate": 3.856111937420812e-05, "loss": 1.5704, "step": 18510 }, { "epoch": 3.0605246849824415, "grad_norm": 17.483966827392578, "learning_rate": 3.8551938155309505e-05, "loss": 1.5234, "step": 18520 }, { "epoch": 3.062177236108242, "grad_norm": 6.578785419464111, "learning_rate": 3.854275693641088e-05, "loss": 1.4352, "step": 18530 }, { "epoch": 3.0638297872340425, "grad_norm": 9.402719497680664, "learning_rate": 3.853357571751226e-05, "loss": 1.6183, "step": 18540 }, { "epoch": 3.065482338359843, "grad_norm": 17.517202377319336, "learning_rate": 3.852439449861364e-05, "loss": 1.6068, "step": 18550 }, { "epoch": 3.0671348894856436, "grad_norm": 8.648771286010742, "learning_rate": 3.8515213279715015e-05, "loss": 1.6782, "step": 18560 }, { "epoch": 3.0687874406114437, "grad_norm": 7.555728912353516, "learning_rate": 3.850603206081639e-05, "loss": 1.5315, "step": 18570 }, { "epoch": 3.0704399917372442, "grad_norm": 43.67970657348633, "learning_rate": 3.8496850841917773e-05, "loss": 1.5716, "step": 18580 }, { "epoch": 3.072092542863045, "grad_norm": 11.917938232421875, "learning_rate": 3.848766962301915e-05, "loss": 1.4323, "step": 18590 }, { "epoch": 3.0737450939888453, "grad_norm": 6.986249923706055, "learning_rate": 3.847848840412053e-05, "loss": 1.4702, "step": 18600 }, { "epoch": 3.075397645114646, "grad_norm": 12.20637035369873, "learning_rate": 3.8469307185221915e-05, "loss": 1.568, "step": 18610 }, { "epoch": 3.0770501962404464, "grad_norm": 29.53895378112793, "learning_rate": 3.846012596632329e-05, "loss": 1.5619, "step": 18620 }, { "epoch": 3.0787027473662465, "grad_norm": 19.85051918029785, "learning_rate": 3.845094474742467e-05, "loss": 1.6694, "step": 18630 }, { "epoch": 3.080355298492047, "grad_norm": 6.126530170440674, "learning_rate": 3.844176352852605e-05, "loss": 1.5997, "step": 18640 }, { "epoch": 3.0820078496178476, "grad_norm": 6.928379058837891, "learning_rate": 3.843258230962743e-05, "loss": 1.6335, "step": 18650 }, { "epoch": 3.083660400743648, "grad_norm": 6.429804801940918, "learning_rate": 3.842340109072881e-05, "loss": 1.6117, "step": 18660 }, { "epoch": 3.0853129518694486, "grad_norm": 6.85563850402832, "learning_rate": 3.841421987183019e-05, "loss": 1.5142, "step": 18670 }, { "epoch": 3.0869655029952487, "grad_norm": 8.979350090026855, "learning_rate": 3.8405038652931566e-05, "loss": 1.5052, "step": 18680 }, { "epoch": 3.0886180541210493, "grad_norm": 8.090424537658691, "learning_rate": 3.839585743403294e-05, "loss": 1.5765, "step": 18690 }, { "epoch": 3.09027060524685, "grad_norm": 12.26007080078125, "learning_rate": 3.838667621513432e-05, "loss": 1.6073, "step": 18700 }, { "epoch": 3.0919231563726504, "grad_norm": 13.63984489440918, "learning_rate": 3.83774949962357e-05, "loss": 1.6378, "step": 18710 }, { "epoch": 3.093575707498451, "grad_norm": 35.192535400390625, "learning_rate": 3.836831377733708e-05, "loss": 1.7231, "step": 18720 }, { "epoch": 3.095228258624251, "grad_norm": 10.192204475402832, "learning_rate": 3.835913255843846e-05, "loss": 1.5279, "step": 18730 }, { "epoch": 3.0968808097500515, "grad_norm": 5.712940216064453, "learning_rate": 3.834995133953984e-05, "loss": 1.5624, "step": 18740 }, { "epoch": 3.098533360875852, "grad_norm": 7.214580535888672, "learning_rate": 3.834077012064122e-05, "loss": 1.672, "step": 18750 }, { "epoch": 3.1001859120016526, "grad_norm": 19.402013778686523, "learning_rate": 3.83315889017426e-05, "loss": 1.6678, "step": 18760 }, { "epoch": 3.101838463127453, "grad_norm": 7.78207540512085, "learning_rate": 3.8322407682843976e-05, "loss": 1.6826, "step": 18770 }, { "epoch": 3.1034910142532537, "grad_norm": 6.110933303833008, "learning_rate": 3.831322646394536e-05, "loss": 1.5121, "step": 18780 }, { "epoch": 3.105143565379054, "grad_norm": 5.636849403381348, "learning_rate": 3.8304045245046734e-05, "loss": 1.4881, "step": 18790 }, { "epoch": 3.1067961165048543, "grad_norm": 8.485923767089844, "learning_rate": 3.829486402614812e-05, "loss": 1.6185, "step": 18800 }, { "epoch": 3.108448667630655, "grad_norm": 7.813500881195068, "learning_rate": 3.828568280724949e-05, "loss": 1.645, "step": 18810 }, { "epoch": 3.1101012187564554, "grad_norm": 8.218082427978516, "learning_rate": 3.827650158835087e-05, "loss": 1.601, "step": 18820 }, { "epoch": 3.111753769882256, "grad_norm": 11.229850769042969, "learning_rate": 3.826732036945225e-05, "loss": 1.5679, "step": 18830 }, { "epoch": 3.113406321008056, "grad_norm": 8.275188446044922, "learning_rate": 3.825813915055363e-05, "loss": 1.5716, "step": 18840 }, { "epoch": 3.1150588721338566, "grad_norm": 7.776087284088135, "learning_rate": 3.824895793165501e-05, "loss": 1.4384, "step": 18850 }, { "epoch": 3.116711423259657, "grad_norm": 37.47963333129883, "learning_rate": 3.8239776712756386e-05, "loss": 1.6417, "step": 18860 }, { "epoch": 3.1183639743854576, "grad_norm": 8.274860382080078, "learning_rate": 3.823059549385777e-05, "loss": 1.5703, "step": 18870 }, { "epoch": 3.120016525511258, "grad_norm": 8.022534370422363, "learning_rate": 3.8221414274959144e-05, "loss": 1.5707, "step": 18880 }, { "epoch": 3.1216690766370583, "grad_norm": 5.824304103851318, "learning_rate": 3.821223305606053e-05, "loss": 1.4785, "step": 18890 }, { "epoch": 3.123321627762859, "grad_norm": 9.422294616699219, "learning_rate": 3.82030518371619e-05, "loss": 1.5626, "step": 18900 }, { "epoch": 3.1249741788886594, "grad_norm": 6.564004898071289, "learning_rate": 3.8193870618263285e-05, "loss": 1.6056, "step": 18910 }, { "epoch": 3.12662673001446, "grad_norm": 10.041267395019531, "learning_rate": 3.818468939936466e-05, "loss": 1.4272, "step": 18920 }, { "epoch": 3.1282792811402604, "grad_norm": 36.88932800292969, "learning_rate": 3.8175508180466044e-05, "loss": 1.3849, "step": 18930 }, { "epoch": 3.1299318322660605, "grad_norm": 13.796693801879883, "learning_rate": 3.816632696156742e-05, "loss": 1.5946, "step": 18940 }, { "epoch": 3.131584383391861, "grad_norm": 14.519100189208984, "learning_rate": 3.8157145742668795e-05, "loss": 1.532, "step": 18950 }, { "epoch": 3.1332369345176616, "grad_norm": 9.376446723937988, "learning_rate": 3.814796452377018e-05, "loss": 1.683, "step": 18960 }, { "epoch": 3.134889485643462, "grad_norm": 9.751054763793945, "learning_rate": 3.8138783304871554e-05, "loss": 1.5191, "step": 18970 }, { "epoch": 3.1365420367692627, "grad_norm": 11.590496063232422, "learning_rate": 3.8129602085972937e-05, "loss": 1.4696, "step": 18980 }, { "epoch": 3.138194587895063, "grad_norm": 18.807069778442383, "learning_rate": 3.812042086707431e-05, "loss": 1.5602, "step": 18990 }, { "epoch": 3.1398471390208633, "grad_norm": 6.233638286590576, "learning_rate": 3.8111239648175695e-05, "loss": 1.5646, "step": 19000 }, { "epoch": 3.141499690146664, "grad_norm": 6.848485469818115, "learning_rate": 3.810205842927707e-05, "loss": 1.5728, "step": 19010 }, { "epoch": 3.1431522412724644, "grad_norm": 18.452594757080078, "learning_rate": 3.8092877210378454e-05, "loss": 1.6127, "step": 19020 }, { "epoch": 3.144804792398265, "grad_norm": 59.3503303527832, "learning_rate": 3.808369599147983e-05, "loss": 1.6197, "step": 19030 }, { "epoch": 3.1464573435240655, "grad_norm": 11.30215835571289, "learning_rate": 3.807451477258121e-05, "loss": 1.5164, "step": 19040 }, { "epoch": 3.1481098946498656, "grad_norm": 8.094250679016113, "learning_rate": 3.806533355368259e-05, "loss": 1.4852, "step": 19050 }, { "epoch": 3.149762445775666, "grad_norm": 8.159111022949219, "learning_rate": 3.805615233478397e-05, "loss": 1.6521, "step": 19060 }, { "epoch": 3.1514149969014666, "grad_norm": 27.37004280090332, "learning_rate": 3.8046971115885346e-05, "loss": 1.4812, "step": 19070 }, { "epoch": 3.153067548027267, "grad_norm": 9.601912498474121, "learning_rate": 3.803778989698672e-05, "loss": 1.6315, "step": 19080 }, { "epoch": 3.1547200991530677, "grad_norm": 10.574562072753906, "learning_rate": 3.8028608678088105e-05, "loss": 1.5586, "step": 19090 }, { "epoch": 3.156372650278868, "grad_norm": 5.611733913421631, "learning_rate": 3.801942745918948e-05, "loss": 1.5879, "step": 19100 }, { "epoch": 3.1580252014046684, "grad_norm": 24.429452896118164, "learning_rate": 3.8010246240290863e-05, "loss": 1.5259, "step": 19110 }, { "epoch": 3.159677752530469, "grad_norm": 6.225391864776611, "learning_rate": 3.800106502139224e-05, "loss": 1.5108, "step": 19120 }, { "epoch": 3.1613303036562694, "grad_norm": 8.502754211425781, "learning_rate": 3.799188380249362e-05, "loss": 1.5145, "step": 19130 }, { "epoch": 3.16298285478207, "grad_norm": 17.88811683654785, "learning_rate": 3.7982702583595e-05, "loss": 1.6389, "step": 19140 }, { "epoch": 3.16463540590787, "grad_norm": 13.118366241455078, "learning_rate": 3.797352136469638e-05, "loss": 1.5567, "step": 19150 }, { "epoch": 3.1662879570336706, "grad_norm": 8.860665321350098, "learning_rate": 3.7964340145797756e-05, "loss": 1.6773, "step": 19160 }, { "epoch": 3.167940508159471, "grad_norm": 6.436194896697998, "learning_rate": 3.795515892689914e-05, "loss": 1.4645, "step": 19170 }, { "epoch": 3.1695930592852717, "grad_norm": 11.198339462280273, "learning_rate": 3.794597770800052e-05, "loss": 1.5443, "step": 19180 }, { "epoch": 3.1712456104110722, "grad_norm": 15.714473724365234, "learning_rate": 3.79367964891019e-05, "loss": 1.5452, "step": 19190 }, { "epoch": 3.1728981615368728, "grad_norm": 17.801097869873047, "learning_rate": 3.792761527020327e-05, "loss": 1.5366, "step": 19200 }, { "epoch": 3.174550712662673, "grad_norm": 11.515276908874512, "learning_rate": 3.791843405130465e-05, "loss": 1.6625, "step": 19210 }, { "epoch": 3.1762032637884734, "grad_norm": 6.000487804412842, "learning_rate": 3.790925283240603e-05, "loss": 1.4931, "step": 19220 }, { "epoch": 3.177855814914274, "grad_norm": 8.129054069519043, "learning_rate": 3.790007161350741e-05, "loss": 1.7251, "step": 19230 }, { "epoch": 3.1795083660400745, "grad_norm": 14.319493293762207, "learning_rate": 3.789089039460879e-05, "loss": 1.6495, "step": 19240 }, { "epoch": 3.181160917165875, "grad_norm": 30.3303165435791, "learning_rate": 3.7881709175710166e-05, "loss": 1.4484, "step": 19250 }, { "epoch": 3.182813468291675, "grad_norm": 10.565496444702148, "learning_rate": 3.787252795681155e-05, "loss": 1.6871, "step": 19260 }, { "epoch": 3.1844660194174756, "grad_norm": 9.682084083557129, "learning_rate": 3.7863346737912925e-05, "loss": 1.6799, "step": 19270 }, { "epoch": 3.186118570543276, "grad_norm": 6.963440418243408, "learning_rate": 3.785416551901431e-05, "loss": 1.6251, "step": 19280 }, { "epoch": 3.1877711216690767, "grad_norm": 20.987451553344727, "learning_rate": 3.784498430011569e-05, "loss": 1.4718, "step": 19290 }, { "epoch": 3.1894236727948773, "grad_norm": 12.118252754211426, "learning_rate": 3.7835803081217066e-05, "loss": 1.5291, "step": 19300 }, { "epoch": 3.1910762239206774, "grad_norm": 8.367793083190918, "learning_rate": 3.782662186231845e-05, "loss": 1.5914, "step": 19310 }, { "epoch": 3.192728775046478, "grad_norm": 7.3973164558410645, "learning_rate": 3.7817440643419824e-05, "loss": 1.4778, "step": 19320 }, { "epoch": 3.1943813261722784, "grad_norm": 28.844614028930664, "learning_rate": 3.78082594245212e-05, "loss": 1.4982, "step": 19330 }, { "epoch": 3.196033877298079, "grad_norm": 14.039192199707031, "learning_rate": 3.7799078205622576e-05, "loss": 1.5076, "step": 19340 }, { "epoch": 3.1976864284238795, "grad_norm": 6.318631172180176, "learning_rate": 3.778989698672396e-05, "loss": 1.5983, "step": 19350 }, { "epoch": 3.1993389795496796, "grad_norm": 11.539376258850098, "learning_rate": 3.7780715767825334e-05, "loss": 1.6026, "step": 19360 }, { "epoch": 3.20099153067548, "grad_norm": 15.837471961975098, "learning_rate": 3.777153454892672e-05, "loss": 1.6435, "step": 19370 }, { "epoch": 3.2026440818012807, "grad_norm": 10.379616737365723, "learning_rate": 3.776235333002809e-05, "loss": 1.6636, "step": 19380 }, { "epoch": 3.2042966329270812, "grad_norm": 18.38787841796875, "learning_rate": 3.7753172111129476e-05, "loss": 1.5559, "step": 19390 }, { "epoch": 3.2059491840528818, "grad_norm": 22.003950119018555, "learning_rate": 3.774399089223085e-05, "loss": 1.5326, "step": 19400 }, { "epoch": 3.207601735178682, "grad_norm": 6.592588424682617, "learning_rate": 3.7734809673332234e-05, "loss": 1.4978, "step": 19410 }, { "epoch": 3.2092542863044824, "grad_norm": 23.952259063720703, "learning_rate": 3.772562845443362e-05, "loss": 1.5836, "step": 19420 }, { "epoch": 3.210906837430283, "grad_norm": 16.6333065032959, "learning_rate": 3.771644723553499e-05, "loss": 1.61, "step": 19430 }, { "epoch": 3.2125593885560835, "grad_norm": 11.940427780151367, "learning_rate": 3.7707266016636375e-05, "loss": 1.4965, "step": 19440 }, { "epoch": 3.214211939681884, "grad_norm": 49.953617095947266, "learning_rate": 3.769808479773775e-05, "loss": 1.3932, "step": 19450 }, { "epoch": 3.2158644908076846, "grad_norm": 11.84276008605957, "learning_rate": 3.768890357883913e-05, "loss": 1.5016, "step": 19460 }, { "epoch": 3.2175170419334846, "grad_norm": 17.914264678955078, "learning_rate": 3.76797223599405e-05, "loss": 1.5665, "step": 19470 }, { "epoch": 3.219169593059285, "grad_norm": 8.291590690612793, "learning_rate": 3.7670541141041885e-05, "loss": 1.5544, "step": 19480 }, { "epoch": 3.2208221441850857, "grad_norm": 11.181132316589355, "learning_rate": 3.766135992214326e-05, "loss": 1.6092, "step": 19490 }, { "epoch": 3.2224746953108863, "grad_norm": 7.685586452484131, "learning_rate": 3.7652178703244644e-05, "loss": 1.6567, "step": 19500 }, { "epoch": 3.224127246436687, "grad_norm": 9.388480186462402, "learning_rate": 3.764299748434602e-05, "loss": 1.7372, "step": 19510 }, { "epoch": 3.225779797562487, "grad_norm": 6.502718448638916, "learning_rate": 3.76338162654474e-05, "loss": 1.5202, "step": 19520 }, { "epoch": 3.2274323486882874, "grad_norm": 19.254741668701172, "learning_rate": 3.7624635046548785e-05, "loss": 1.5534, "step": 19530 }, { "epoch": 3.229084899814088, "grad_norm": 12.590047836303711, "learning_rate": 3.761545382765016e-05, "loss": 1.4716, "step": 19540 }, { "epoch": 3.2307374509398885, "grad_norm": 58.66154479980469, "learning_rate": 3.7606272608751544e-05, "loss": 1.5091, "step": 19550 }, { "epoch": 3.232390002065689, "grad_norm": 9.476922035217285, "learning_rate": 3.759709138985292e-05, "loss": 1.5869, "step": 19560 }, { "epoch": 3.2340425531914896, "grad_norm": 62.604740142822266, "learning_rate": 3.75879101709543e-05, "loss": 1.5135, "step": 19570 }, { "epoch": 3.2356951043172897, "grad_norm": 7.957396507263184, "learning_rate": 3.757872895205568e-05, "loss": 1.5217, "step": 19580 }, { "epoch": 3.23734765544309, "grad_norm": 6.035177230834961, "learning_rate": 3.7569547733157054e-05, "loss": 1.4088, "step": 19590 }, { "epoch": 3.2390002065688908, "grad_norm": 33.28099822998047, "learning_rate": 3.756036651425843e-05, "loss": 1.5084, "step": 19600 }, { "epoch": 3.2406527576946913, "grad_norm": 10.489121437072754, "learning_rate": 3.755118529535981e-05, "loss": 1.5133, "step": 19610 }, { "epoch": 3.242305308820492, "grad_norm": 9.935800552368164, "learning_rate": 3.754200407646119e-05, "loss": 1.5428, "step": 19620 }, { "epoch": 3.243957859946292, "grad_norm": 8.586844444274902, "learning_rate": 3.753282285756257e-05, "loss": 1.4201, "step": 19630 }, { "epoch": 3.2456104110720925, "grad_norm": 24.38202476501465, "learning_rate": 3.752364163866395e-05, "loss": 1.5493, "step": 19640 }, { "epoch": 3.247262962197893, "grad_norm": 9.439826965332031, "learning_rate": 3.751446041976533e-05, "loss": 1.5399, "step": 19650 }, { "epoch": 3.2489155133236935, "grad_norm": 12.375255584716797, "learning_rate": 3.750527920086671e-05, "loss": 1.511, "step": 19660 }, { "epoch": 3.250568064449494, "grad_norm": 9.748537063598633, "learning_rate": 3.749609798196809e-05, "loss": 1.5526, "step": 19670 }, { "epoch": 3.252220615575294, "grad_norm": 6.342364311218262, "learning_rate": 3.748691676306947e-05, "loss": 1.513, "step": 19680 }, { "epoch": 3.2538731667010947, "grad_norm": 23.94266700744629, "learning_rate": 3.7477735544170846e-05, "loss": 1.5372, "step": 19690 }, { "epoch": 3.2555257178268953, "grad_norm": 15.126330375671387, "learning_rate": 3.746855432527223e-05, "loss": 1.5006, "step": 19700 }, { "epoch": 3.257178268952696, "grad_norm": 23.147714614868164, "learning_rate": 3.7459373106373605e-05, "loss": 1.6257, "step": 19710 }, { "epoch": 3.2588308200784963, "grad_norm": 6.821142673492432, "learning_rate": 3.745019188747498e-05, "loss": 1.3936, "step": 19720 }, { "epoch": 3.2604833712042964, "grad_norm": 7.183291912078857, "learning_rate": 3.7441010668576356e-05, "loss": 1.4941, "step": 19730 }, { "epoch": 3.262135922330097, "grad_norm": 9.05080509185791, "learning_rate": 3.743182944967774e-05, "loss": 1.466, "step": 19740 }, { "epoch": 3.2637884734558975, "grad_norm": 30.209848403930664, "learning_rate": 3.742264823077912e-05, "loss": 1.6091, "step": 19750 }, { "epoch": 3.265441024581698, "grad_norm": 7.082402229309082, "learning_rate": 3.74134670118805e-05, "loss": 1.6003, "step": 19760 }, { "epoch": 3.2670935757074986, "grad_norm": 17.827167510986328, "learning_rate": 3.740428579298188e-05, "loss": 1.4811, "step": 19770 }, { "epoch": 3.2687461268332987, "grad_norm": 10.344867706298828, "learning_rate": 3.7395104574083256e-05, "loss": 1.5169, "step": 19780 }, { "epoch": 3.270398677959099, "grad_norm": 12.192914009094238, "learning_rate": 3.738592335518464e-05, "loss": 1.4465, "step": 19790 }, { "epoch": 3.2720512290848998, "grad_norm": 5.389448165893555, "learning_rate": 3.7376742136286015e-05, "loss": 1.4501, "step": 19800 }, { "epoch": 3.2737037802107003, "grad_norm": 10.874370574951172, "learning_rate": 3.73675609173874e-05, "loss": 1.5779, "step": 19810 }, { "epoch": 3.275356331336501, "grad_norm": 12.324655532836914, "learning_rate": 3.735837969848877e-05, "loss": 1.459, "step": 19820 }, { "epoch": 3.277008882462301, "grad_norm": 10.794198036193848, "learning_rate": 3.7349198479590156e-05, "loss": 1.4958, "step": 19830 }, { "epoch": 3.2786614335881015, "grad_norm": 14.633699417114258, "learning_rate": 3.734001726069153e-05, "loss": 1.5338, "step": 19840 }, { "epoch": 3.280313984713902, "grad_norm": 8.332018852233887, "learning_rate": 3.733083604179291e-05, "loss": 1.4962, "step": 19850 }, { "epoch": 3.2819665358397025, "grad_norm": 9.715315818786621, "learning_rate": 3.732165482289429e-05, "loss": 1.5335, "step": 19860 }, { "epoch": 3.283619086965503, "grad_norm": 13.441153526306152, "learning_rate": 3.7312473603995666e-05, "loss": 1.5538, "step": 19870 }, { "epoch": 3.2852716380913036, "grad_norm": 16.49589729309082, "learning_rate": 3.730329238509705e-05, "loss": 1.5301, "step": 19880 }, { "epoch": 3.2869241892171037, "grad_norm": 31.22637939453125, "learning_rate": 3.7294111166198424e-05, "loss": 1.6213, "step": 19890 }, { "epoch": 3.2885767403429043, "grad_norm": 8.242454528808594, "learning_rate": 3.728492994729981e-05, "loss": 1.6577, "step": 19900 }, { "epoch": 3.290229291468705, "grad_norm": 8.728832244873047, "learning_rate": 3.727574872840118e-05, "loss": 1.5487, "step": 19910 }, { "epoch": 3.2918818425945053, "grad_norm": 7.110820770263672, "learning_rate": 3.7266567509502566e-05, "loss": 1.6357, "step": 19920 }, { "epoch": 3.293534393720306, "grad_norm": 6.833790302276611, "learning_rate": 3.725738629060394e-05, "loss": 1.5007, "step": 19930 }, { "epoch": 3.2951869448461064, "grad_norm": 10.365285873413086, "learning_rate": 3.7248205071705324e-05, "loss": 1.6454, "step": 19940 }, { "epoch": 3.2968394959719065, "grad_norm": 10.187037467956543, "learning_rate": 3.72390238528067e-05, "loss": 1.4288, "step": 19950 }, { "epoch": 3.298492047097707, "grad_norm": 33.366790771484375, "learning_rate": 3.722984263390808e-05, "loss": 1.6844, "step": 19960 }, { "epoch": 3.3001445982235076, "grad_norm": 6.164004802703857, "learning_rate": 3.722066141500946e-05, "loss": 1.5483, "step": 19970 }, { "epoch": 3.301797149349308, "grad_norm": 14.696124076843262, "learning_rate": 3.7211480196110834e-05, "loss": 1.5542, "step": 19980 }, { "epoch": 3.3034497004751087, "grad_norm": 11.939413070678711, "learning_rate": 3.720229897721222e-05, "loss": 1.5707, "step": 19990 }, { "epoch": 3.3051022516009088, "grad_norm": 6.623092174530029, "learning_rate": 3.719311775831359e-05, "loss": 1.5703, "step": 20000 }, { "epoch": 3.3067548027267093, "grad_norm": 7.930966854095459, "learning_rate": 3.7183936539414975e-05, "loss": 1.4843, "step": 20010 }, { "epoch": 3.30840735385251, "grad_norm": 5.481695652008057, "learning_rate": 3.717475532051635e-05, "loss": 1.5541, "step": 20020 }, { "epoch": 3.3100599049783104, "grad_norm": 6.865151405334473, "learning_rate": 3.7165574101617734e-05, "loss": 1.5562, "step": 20030 }, { "epoch": 3.311712456104111, "grad_norm": 9.590388298034668, "learning_rate": 3.715639288271911e-05, "loss": 1.6177, "step": 20040 }, { "epoch": 3.313365007229911, "grad_norm": 7.312343120574951, "learning_rate": 3.714721166382049e-05, "loss": 1.7381, "step": 20050 }, { "epoch": 3.3150175583557115, "grad_norm": 9.11748218536377, "learning_rate": 3.713803044492187e-05, "loss": 1.5066, "step": 20060 }, { "epoch": 3.316670109481512, "grad_norm": 7.51953649520874, "learning_rate": 3.712884922602325e-05, "loss": 1.7568, "step": 20070 }, { "epoch": 3.3183226606073126, "grad_norm": 7.698461532592773, "learning_rate": 3.711966800712463e-05, "loss": 1.4384, "step": 20080 }, { "epoch": 3.319975211733113, "grad_norm": 4.527771949768066, "learning_rate": 3.711048678822601e-05, "loss": 1.5915, "step": 20090 }, { "epoch": 3.3216277628589133, "grad_norm": 7.3180131912231445, "learning_rate": 3.7101305569327385e-05, "loss": 1.5506, "step": 20100 }, { "epoch": 3.323280313984714, "grad_norm": 5.010993480682373, "learning_rate": 3.709212435042876e-05, "loss": 1.4742, "step": 20110 }, { "epoch": 3.3249328651105143, "grad_norm": 15.535231590270996, "learning_rate": 3.7082943131530144e-05, "loss": 1.5647, "step": 20120 }, { "epoch": 3.326585416236315, "grad_norm": 7.405104637145996, "learning_rate": 3.707376191263152e-05, "loss": 1.3815, "step": 20130 }, { "epoch": 3.3282379673621154, "grad_norm": 6.594423770904541, "learning_rate": 3.70645806937329e-05, "loss": 1.5013, "step": 20140 }, { "epoch": 3.3298905184879155, "grad_norm": 16.589862823486328, "learning_rate": 3.705539947483428e-05, "loss": 1.5586, "step": 20150 }, { "epoch": 3.331543069613716, "grad_norm": 21.43290138244629, "learning_rate": 3.704621825593566e-05, "loss": 1.4964, "step": 20160 }, { "epoch": 3.3331956207395166, "grad_norm": 13.260900497436523, "learning_rate": 3.7037037037037037e-05, "loss": 1.5953, "step": 20170 }, { "epoch": 3.334848171865317, "grad_norm": 7.0593037605285645, "learning_rate": 3.702785581813842e-05, "loss": 1.6032, "step": 20180 }, { "epoch": 3.3365007229911177, "grad_norm": 5.94556188583374, "learning_rate": 3.7018674599239795e-05, "loss": 1.677, "step": 20190 }, { "epoch": 3.3381532741169178, "grad_norm": 6.961794376373291, "learning_rate": 3.700949338034118e-05, "loss": 1.3566, "step": 20200 }, { "epoch": 3.3398058252427183, "grad_norm": 12.444746017456055, "learning_rate": 3.700031216144256e-05, "loss": 1.6006, "step": 20210 }, { "epoch": 3.341458376368519, "grad_norm": 7.811690330505371, "learning_rate": 3.6991130942543936e-05, "loss": 1.557, "step": 20220 }, { "epoch": 3.3431109274943194, "grad_norm": 58.86392593383789, "learning_rate": 3.698194972364531e-05, "loss": 1.6786, "step": 20230 }, { "epoch": 3.34476347862012, "grad_norm": 13.110153198242188, "learning_rate": 3.697276850474669e-05, "loss": 1.5049, "step": 20240 }, { "epoch": 3.3464160297459205, "grad_norm": 26.155275344848633, "learning_rate": 3.696358728584807e-05, "loss": 1.6196, "step": 20250 }, { "epoch": 3.3480685808717205, "grad_norm": 9.868633270263672, "learning_rate": 3.6954406066949446e-05, "loss": 1.52, "step": 20260 }, { "epoch": 3.349721131997521, "grad_norm": 6.902059078216553, "learning_rate": 3.694522484805083e-05, "loss": 1.6092, "step": 20270 }, { "epoch": 3.3513736831233216, "grad_norm": 13.972870826721191, "learning_rate": 3.6936043629152205e-05, "loss": 1.4675, "step": 20280 }, { "epoch": 3.353026234249122, "grad_norm": 4.600803852081299, "learning_rate": 3.692686241025359e-05, "loss": 1.5782, "step": 20290 }, { "epoch": 3.3546787853749227, "grad_norm": 29.91995620727539, "learning_rate": 3.691768119135496e-05, "loss": 1.6141, "step": 20300 }, { "epoch": 3.3563313365007232, "grad_norm": 13.344480514526367, "learning_rate": 3.6908499972456346e-05, "loss": 1.5646, "step": 20310 }, { "epoch": 3.3579838876265233, "grad_norm": 36.023399353027344, "learning_rate": 3.689931875355773e-05, "loss": 1.5179, "step": 20320 }, { "epoch": 3.359636438752324, "grad_norm": 14.964554786682129, "learning_rate": 3.6890137534659105e-05, "loss": 1.523, "step": 20330 }, { "epoch": 3.3612889898781244, "grad_norm": 10.848349571228027, "learning_rate": 3.688095631576049e-05, "loss": 1.8119, "step": 20340 }, { "epoch": 3.362941541003925, "grad_norm": 13.974653244018555, "learning_rate": 3.687177509686186e-05, "loss": 1.6156, "step": 20350 }, { "epoch": 3.3645940921297255, "grad_norm": 6.851309776306152, "learning_rate": 3.686259387796324e-05, "loss": 1.5522, "step": 20360 }, { "epoch": 3.3662466432555256, "grad_norm": 7.36599063873291, "learning_rate": 3.6853412659064615e-05, "loss": 1.6563, "step": 20370 }, { "epoch": 3.367899194381326, "grad_norm": 8.092686653137207, "learning_rate": 3.6844231440166e-05, "loss": 1.4674, "step": 20380 }, { "epoch": 3.3695517455071267, "grad_norm": 13.80720329284668, "learning_rate": 3.683505022126737e-05, "loss": 1.6301, "step": 20390 }, { "epoch": 3.371204296632927, "grad_norm": 15.420303344726562, "learning_rate": 3.6825869002368756e-05, "loss": 1.5869, "step": 20400 }, { "epoch": 3.3728568477587277, "grad_norm": 8.8609037399292, "learning_rate": 3.681668778347013e-05, "loss": 1.3465, "step": 20410 }, { "epoch": 3.374509398884528, "grad_norm": 12.477286338806152, "learning_rate": 3.6807506564571514e-05, "loss": 1.6163, "step": 20420 }, { "epoch": 3.3761619500103284, "grad_norm": 18.01897430419922, "learning_rate": 3.67983253456729e-05, "loss": 1.5364, "step": 20430 }, { "epoch": 3.377814501136129, "grad_norm": 13.501622200012207, "learning_rate": 3.678914412677427e-05, "loss": 1.4659, "step": 20440 }, { "epoch": 3.3794670522619294, "grad_norm": 16.007644653320312, "learning_rate": 3.6779962907875655e-05, "loss": 1.6269, "step": 20450 }, { "epoch": 3.38111960338773, "grad_norm": 6.28301477432251, "learning_rate": 3.677078168897703e-05, "loss": 1.4513, "step": 20460 }, { "epoch": 3.38277215451353, "grad_norm": 8.616754531860352, "learning_rate": 3.6761600470078414e-05, "loss": 1.4903, "step": 20470 }, { "epoch": 3.3844247056393306, "grad_norm": 9.9006986618042, "learning_rate": 3.675241925117979e-05, "loss": 1.5364, "step": 20480 }, { "epoch": 3.386077256765131, "grad_norm": 11.879949569702148, "learning_rate": 3.6743238032281166e-05, "loss": 1.6025, "step": 20490 }, { "epoch": 3.3877298078909317, "grad_norm": 28.69721221923828, "learning_rate": 3.673405681338254e-05, "loss": 1.5899, "step": 20500 }, { "epoch": 3.3893823590167322, "grad_norm": 10.507150650024414, "learning_rate": 3.6724875594483924e-05, "loss": 1.6434, "step": 20510 }, { "epoch": 3.3910349101425323, "grad_norm": 10.478775978088379, "learning_rate": 3.67156943755853e-05, "loss": 1.5265, "step": 20520 }, { "epoch": 3.392687461268333, "grad_norm": 24.88822364807129, "learning_rate": 3.670651315668668e-05, "loss": 1.6751, "step": 20530 }, { "epoch": 3.3943400123941334, "grad_norm": 9.983197212219238, "learning_rate": 3.669733193778806e-05, "loss": 1.5644, "step": 20540 }, { "epoch": 3.395992563519934, "grad_norm": 7.989924430847168, "learning_rate": 3.668815071888944e-05, "loss": 1.6401, "step": 20550 }, { "epoch": 3.3976451146457345, "grad_norm": 5.701867580413818, "learning_rate": 3.6678969499990824e-05, "loss": 1.5425, "step": 20560 }, { "epoch": 3.3992976657715346, "grad_norm": 14.750271797180176, "learning_rate": 3.66697882810922e-05, "loss": 1.585, "step": 20570 }, { "epoch": 3.400950216897335, "grad_norm": 21.3060245513916, "learning_rate": 3.666060706219358e-05, "loss": 1.5328, "step": 20580 }, { "epoch": 3.4026027680231357, "grad_norm": 14.30643367767334, "learning_rate": 3.665142584329496e-05, "loss": 1.5749, "step": 20590 }, { "epoch": 3.404255319148936, "grad_norm": 10.9791898727417, "learning_rate": 3.664224462439634e-05, "loss": 1.6841, "step": 20600 }, { "epoch": 3.4059078702747367, "grad_norm": 10.282464981079102, "learning_rate": 3.663306340549772e-05, "loss": 1.586, "step": 20610 }, { "epoch": 3.407560421400537, "grad_norm": 10.054344177246094, "learning_rate": 3.662388218659909e-05, "loss": 1.5445, "step": 20620 }, { "epoch": 3.4092129725263374, "grad_norm": 6.583102703094482, "learning_rate": 3.661470096770047e-05, "loss": 1.507, "step": 20630 }, { "epoch": 3.410865523652138, "grad_norm": 7.482510566711426, "learning_rate": 3.660551974880185e-05, "loss": 1.5865, "step": 20640 }, { "epoch": 3.4125180747779384, "grad_norm": 23.863901138305664, "learning_rate": 3.659633852990323e-05, "loss": 1.7097, "step": 20650 }, { "epoch": 3.414170625903739, "grad_norm": 5.8396830558776855, "learning_rate": 3.658715731100461e-05, "loss": 1.4904, "step": 20660 }, { "epoch": 3.4158231770295395, "grad_norm": 14.669455528259277, "learning_rate": 3.657797609210599e-05, "loss": 1.5476, "step": 20670 }, { "epoch": 3.4174757281553396, "grad_norm": 9.562002182006836, "learning_rate": 3.656879487320737e-05, "loss": 1.474, "step": 20680 }, { "epoch": 3.41912827928114, "grad_norm": 13.14786434173584, "learning_rate": 3.655961365430875e-05, "loss": 1.6046, "step": 20690 }, { "epoch": 3.4207808304069407, "grad_norm": 27.712749481201172, "learning_rate": 3.6550432435410127e-05, "loss": 1.6887, "step": 20700 }, { "epoch": 3.4224333815327412, "grad_norm": 8.249465942382812, "learning_rate": 3.654125121651151e-05, "loss": 1.5875, "step": 20710 }, { "epoch": 3.4240859326585418, "grad_norm": 34.13796615600586, "learning_rate": 3.6532069997612885e-05, "loss": 1.5839, "step": 20720 }, { "epoch": 3.4257384837843423, "grad_norm": 9.606948852539062, "learning_rate": 3.652288877871427e-05, "loss": 1.6466, "step": 20730 }, { "epoch": 3.4273910349101424, "grad_norm": 11.323090553283691, "learning_rate": 3.6513707559815644e-05, "loss": 1.4478, "step": 20740 }, { "epoch": 3.429043586035943, "grad_norm": 6.942873477935791, "learning_rate": 3.650452634091702e-05, "loss": 1.5668, "step": 20750 }, { "epoch": 3.4306961371617435, "grad_norm": 6.9645867347717285, "learning_rate": 3.6495345122018395e-05, "loss": 1.3887, "step": 20760 }, { "epoch": 3.432348688287544, "grad_norm": 9.70297622680664, "learning_rate": 3.648616390311978e-05, "loss": 1.5917, "step": 20770 }, { "epoch": 3.4340012394133446, "grad_norm": 19.935123443603516, "learning_rate": 3.647698268422116e-05, "loss": 1.6505, "step": 20780 }, { "epoch": 3.4356537905391447, "grad_norm": 8.549393653869629, "learning_rate": 3.6467801465322536e-05, "loss": 1.7168, "step": 20790 }, { "epoch": 3.437306341664945, "grad_norm": 7.79632568359375, "learning_rate": 3.645862024642392e-05, "loss": 1.5747, "step": 20800 }, { "epoch": 3.4389588927907457, "grad_norm": 15.404546737670898, "learning_rate": 3.6449439027525295e-05, "loss": 1.54, "step": 20810 }, { "epoch": 3.4406114439165463, "grad_norm": 7.910075664520264, "learning_rate": 3.644025780862668e-05, "loss": 1.4787, "step": 20820 }, { "epoch": 3.442263995042347, "grad_norm": 38.832923889160156, "learning_rate": 3.643107658972805e-05, "loss": 1.3986, "step": 20830 }, { "epoch": 3.443916546168147, "grad_norm": 15.27536392211914, "learning_rate": 3.6421895370829436e-05, "loss": 1.5966, "step": 20840 }, { "epoch": 3.4455690972939474, "grad_norm": 5.51830530166626, "learning_rate": 3.641271415193081e-05, "loss": 1.4576, "step": 20850 }, { "epoch": 3.447221648419748, "grad_norm": 9.22146987915039, "learning_rate": 3.6403532933032194e-05, "loss": 1.5793, "step": 20860 }, { "epoch": 3.4488741995455485, "grad_norm": 18.014942169189453, "learning_rate": 3.639435171413357e-05, "loss": 1.4621, "step": 20870 }, { "epoch": 3.450526750671349, "grad_norm": 11.17994499206543, "learning_rate": 3.6385170495234946e-05, "loss": 1.5456, "step": 20880 }, { "epoch": 3.452179301797149, "grad_norm": 7.204450607299805, "learning_rate": 3.637598927633633e-05, "loss": 1.5712, "step": 20890 }, { "epoch": 3.4538318529229497, "grad_norm": 9.417290687561035, "learning_rate": 3.6366808057437705e-05, "loss": 1.4765, "step": 20900 }, { "epoch": 3.4554844040487502, "grad_norm": 8.76371955871582, "learning_rate": 3.635762683853909e-05, "loss": 1.6961, "step": 20910 }, { "epoch": 3.4571369551745508, "grad_norm": 9.267240524291992, "learning_rate": 3.634844561964046e-05, "loss": 1.5875, "step": 20920 }, { "epoch": 3.4587895063003513, "grad_norm": 11.603541374206543, "learning_rate": 3.6339264400741846e-05, "loss": 1.6531, "step": 20930 }, { "epoch": 3.4604420574261514, "grad_norm": 8.296456336975098, "learning_rate": 3.633008318184322e-05, "loss": 1.6704, "step": 20940 }, { "epoch": 3.462094608551952, "grad_norm": 8.473135948181152, "learning_rate": 3.6320901962944604e-05, "loss": 1.4977, "step": 20950 }, { "epoch": 3.4637471596777525, "grad_norm": 18.611042022705078, "learning_rate": 3.631172074404598e-05, "loss": 1.475, "step": 20960 }, { "epoch": 3.465399710803553, "grad_norm": 7.0088090896606445, "learning_rate": 3.630253952514736e-05, "loss": 1.4484, "step": 20970 }, { "epoch": 3.4670522619293536, "grad_norm": 9.421944618225098, "learning_rate": 3.629335830624874e-05, "loss": 1.4781, "step": 20980 }, { "epoch": 3.4687048130551537, "grad_norm": 35.80681610107422, "learning_rate": 3.628417708735012e-05, "loss": 1.5584, "step": 20990 }, { "epoch": 3.470357364180954, "grad_norm": 7.813302040100098, "learning_rate": 3.62749958684515e-05, "loss": 1.5762, "step": 21000 }, { "epoch": 3.4720099153067547, "grad_norm": 21.3023681640625, "learning_rate": 3.626581464955287e-05, "loss": 1.6249, "step": 21010 }, { "epoch": 3.4736624664325553, "grad_norm": 62.12176513671875, "learning_rate": 3.6256633430654256e-05, "loss": 1.5029, "step": 21020 }, { "epoch": 3.475315017558356, "grad_norm": 8.91295337677002, "learning_rate": 3.624745221175563e-05, "loss": 1.5845, "step": 21030 }, { "epoch": 3.4769675686841564, "grad_norm": 10.726168632507324, "learning_rate": 3.6238270992857014e-05, "loss": 1.6008, "step": 21040 }, { "epoch": 3.4786201198099564, "grad_norm": 7.324353218078613, "learning_rate": 3.622908977395839e-05, "loss": 1.5174, "step": 21050 }, { "epoch": 3.480272670935757, "grad_norm": 14.042418479919434, "learning_rate": 3.621990855505977e-05, "loss": 1.6201, "step": 21060 }, { "epoch": 3.4819252220615575, "grad_norm": 22.140363693237305, "learning_rate": 3.621072733616115e-05, "loss": 1.6755, "step": 21070 }, { "epoch": 3.483577773187358, "grad_norm": 10.497157096862793, "learning_rate": 3.620154611726253e-05, "loss": 1.4485, "step": 21080 }, { "epoch": 3.4852303243131586, "grad_norm": 7.269160270690918, "learning_rate": 3.619236489836391e-05, "loss": 1.5914, "step": 21090 }, { "epoch": 3.486882875438959, "grad_norm": 11.066585540771484, "learning_rate": 3.618318367946529e-05, "loss": 1.4845, "step": 21100 }, { "epoch": 3.4885354265647592, "grad_norm": 16.51125144958496, "learning_rate": 3.6174002460566666e-05, "loss": 1.5517, "step": 21110 }, { "epoch": 3.4901879776905598, "grad_norm": 35.069183349609375, "learning_rate": 3.616482124166805e-05, "loss": 1.5281, "step": 21120 }, { "epoch": 3.4918405288163603, "grad_norm": 16.97264289855957, "learning_rate": 3.6155640022769424e-05, "loss": 1.469, "step": 21130 }, { "epoch": 3.493493079942161, "grad_norm": 21.5349178314209, "learning_rate": 3.61464588038708e-05, "loss": 1.6392, "step": 21140 }, { "epoch": 3.4951456310679614, "grad_norm": 8.533900260925293, "learning_rate": 3.613727758497218e-05, "loss": 1.5645, "step": 21150 }, { "epoch": 3.4967981821937615, "grad_norm": 59.57920455932617, "learning_rate": 3.612809636607356e-05, "loss": 1.598, "step": 21160 }, { "epoch": 3.498450733319562, "grad_norm": 34.87241744995117, "learning_rate": 3.611891514717494e-05, "loss": 1.6019, "step": 21170 }, { "epoch": 3.5001032844453626, "grad_norm": 12.19272232055664, "learning_rate": 3.610973392827632e-05, "loss": 1.5174, "step": 21180 }, { "epoch": 3.501755835571163, "grad_norm": 23.954174041748047, "learning_rate": 3.61005527093777e-05, "loss": 1.6589, "step": 21190 }, { "epoch": 3.5034083866969636, "grad_norm": 11.947124481201172, "learning_rate": 3.6091371490479075e-05, "loss": 1.6736, "step": 21200 }, { "epoch": 3.5050609378227637, "grad_norm": 30.422679901123047, "learning_rate": 3.608219027158046e-05, "loss": 1.5366, "step": 21210 }, { "epoch": 3.5067134889485643, "grad_norm": 5.95852518081665, "learning_rate": 3.6073009052681834e-05, "loss": 1.6103, "step": 21220 }, { "epoch": 3.508366040074365, "grad_norm": 7.431918144226074, "learning_rate": 3.6063827833783216e-05, "loss": 1.4234, "step": 21230 }, { "epoch": 3.5100185912001653, "grad_norm": 19.810720443725586, "learning_rate": 3.60546466148846e-05, "loss": 1.5673, "step": 21240 }, { "epoch": 3.511671142325966, "grad_norm": 9.25375747680664, "learning_rate": 3.6045465395985975e-05, "loss": 1.5157, "step": 21250 }, { "epoch": 3.513323693451766, "grad_norm": 15.374961853027344, "learning_rate": 3.603628417708735e-05, "loss": 1.5093, "step": 21260 }, { "epoch": 3.5149762445775665, "grad_norm": 15.200456619262695, "learning_rate": 3.602710295818873e-05, "loss": 1.6222, "step": 21270 }, { "epoch": 3.516628795703367, "grad_norm": 10.129278182983398, "learning_rate": 3.601792173929011e-05, "loss": 1.5672, "step": 21280 }, { "epoch": 3.5182813468291676, "grad_norm": 15.771454811096191, "learning_rate": 3.6008740520391485e-05, "loss": 1.6731, "step": 21290 }, { "epoch": 3.519933897954968, "grad_norm": 10.804917335510254, "learning_rate": 3.599955930149287e-05, "loss": 1.6508, "step": 21300 }, { "epoch": 3.5215864490807682, "grad_norm": 10.056406021118164, "learning_rate": 3.5990378082594244e-05, "loss": 1.527, "step": 21310 }, { "epoch": 3.5232390002065688, "grad_norm": 14.683306694030762, "learning_rate": 3.5981196863695626e-05, "loss": 1.6713, "step": 21320 }, { "epoch": 3.5248915513323693, "grad_norm": 12.720149040222168, "learning_rate": 3.5972015644797e-05, "loss": 1.5369, "step": 21330 }, { "epoch": 3.52654410245817, "grad_norm": 20.362594604492188, "learning_rate": 3.5962834425898385e-05, "loss": 1.4948, "step": 21340 }, { "epoch": 3.5281966535839704, "grad_norm": 9.18614387512207, "learning_rate": 3.595365320699977e-05, "loss": 1.5522, "step": 21350 }, { "epoch": 3.5298492047097705, "grad_norm": 15.578404426574707, "learning_rate": 3.594447198810114e-05, "loss": 1.5016, "step": 21360 }, { "epoch": 3.531501755835571, "grad_norm": 28.72294807434082, "learning_rate": 3.5935290769202526e-05, "loss": 1.5429, "step": 21370 }, { "epoch": 3.5331543069613716, "grad_norm": 8.129132270812988, "learning_rate": 3.59261095503039e-05, "loss": 1.6308, "step": 21380 }, { "epoch": 3.534806858087172, "grad_norm": 7.975843906402588, "learning_rate": 3.591692833140528e-05, "loss": 1.5086, "step": 21390 }, { "epoch": 3.5364594092129726, "grad_norm": 69.10977935791016, "learning_rate": 3.5907747112506654e-05, "loss": 1.6113, "step": 21400 }, { "epoch": 3.5381119603387727, "grad_norm": 28.328746795654297, "learning_rate": 3.5898565893608036e-05, "loss": 1.5441, "step": 21410 }, { "epoch": 3.5397645114645733, "grad_norm": 15.218031883239746, "learning_rate": 3.588938467470941e-05, "loss": 1.6617, "step": 21420 }, { "epoch": 3.541417062590374, "grad_norm": 7.890848636627197, "learning_rate": 3.5880203455810795e-05, "loss": 1.4507, "step": 21430 }, { "epoch": 3.5430696137161743, "grad_norm": 23.32213592529297, "learning_rate": 3.587102223691217e-05, "loss": 1.4829, "step": 21440 }, { "epoch": 3.544722164841975, "grad_norm": 9.921598434448242, "learning_rate": 3.586184101801355e-05, "loss": 1.5639, "step": 21450 }, { "epoch": 3.546374715967775, "grad_norm": 10.007643699645996, "learning_rate": 3.5852659799114936e-05, "loss": 1.6078, "step": 21460 }, { "epoch": 3.548027267093576, "grad_norm": 6.750604629516602, "learning_rate": 3.584347858021631e-05, "loss": 1.5142, "step": 21470 }, { "epoch": 3.549679818219376, "grad_norm": 12.836925506591797, "learning_rate": 3.5834297361317694e-05, "loss": 1.5415, "step": 21480 }, { "epoch": 3.5513323693451766, "grad_norm": 9.204744338989258, "learning_rate": 3.582511614241907e-05, "loss": 1.5137, "step": 21490 }, { "epoch": 3.552984920470977, "grad_norm": 22.862333297729492, "learning_rate": 3.581593492352045e-05, "loss": 1.5273, "step": 21500 }, { "epoch": 3.5546374715967777, "grad_norm": 8.714118003845215, "learning_rate": 3.580675370462183e-05, "loss": 1.6176, "step": 21510 }, { "epoch": 3.556290022722578, "grad_norm": 32.12099838256836, "learning_rate": 3.5797572485723204e-05, "loss": 1.5262, "step": 21520 }, { "epoch": 3.5579425738483783, "grad_norm": 33.427494049072266, "learning_rate": 3.578839126682458e-05, "loss": 1.6296, "step": 21530 }, { "epoch": 3.559595124974179, "grad_norm": 7.42344856262207, "learning_rate": 3.577921004792596e-05, "loss": 1.6682, "step": 21540 }, { "epoch": 3.5612476760999794, "grad_norm": 7.865045547485352, "learning_rate": 3.577002882902734e-05, "loss": 1.6207, "step": 21550 }, { "epoch": 3.56290022722578, "grad_norm": 8.04334831237793, "learning_rate": 3.576084761012872e-05, "loss": 1.5475, "step": 21560 }, { "epoch": 3.5645527783515805, "grad_norm": 9.874261856079102, "learning_rate": 3.5751666391230104e-05, "loss": 1.5455, "step": 21570 }, { "epoch": 3.5662053294773806, "grad_norm": 18.90970802307129, "learning_rate": 3.574248517233148e-05, "loss": 1.6134, "step": 21580 }, { "epoch": 3.567857880603181, "grad_norm": 7.690478324890137, "learning_rate": 3.573330395343286e-05, "loss": 1.6368, "step": 21590 }, { "epoch": 3.5695104317289816, "grad_norm": 12.998335838317871, "learning_rate": 3.572412273453424e-05, "loss": 1.6065, "step": 21600 }, { "epoch": 3.571162982854782, "grad_norm": 6.6522979736328125, "learning_rate": 3.571494151563562e-05, "loss": 1.3663, "step": 21610 }, { "epoch": 3.5728155339805827, "grad_norm": 9.580485343933105, "learning_rate": 3.5705760296737e-05, "loss": 1.5932, "step": 21620 }, { "epoch": 3.574468085106383, "grad_norm": 14.493224143981934, "learning_rate": 3.569657907783838e-05, "loss": 1.548, "step": 21630 }, { "epoch": 3.5761206362321833, "grad_norm": 9.22100830078125, "learning_rate": 3.5687397858939755e-05, "loss": 1.4881, "step": 21640 }, { "epoch": 3.577773187357984, "grad_norm": 32.90215301513672, "learning_rate": 3.567821664004113e-05, "loss": 1.5178, "step": 21650 }, { "epoch": 3.5794257384837844, "grad_norm": 10.091255187988281, "learning_rate": 3.566903542114251e-05, "loss": 1.5195, "step": 21660 }, { "epoch": 3.581078289609585, "grad_norm": 6.877391815185547, "learning_rate": 3.565985420224389e-05, "loss": 1.5511, "step": 21670 }, { "epoch": 3.582730840735385, "grad_norm": 15.701227188110352, "learning_rate": 3.5650672983345266e-05, "loss": 1.6004, "step": 21680 }, { "epoch": 3.5843833918611856, "grad_norm": 5.971979141235352, "learning_rate": 3.564149176444665e-05, "loss": 1.4233, "step": 21690 }, { "epoch": 3.586035942986986, "grad_norm": 28.64961051940918, "learning_rate": 3.563231054554803e-05, "loss": 1.5402, "step": 21700 }, { "epoch": 3.5876884941127867, "grad_norm": 18.97611427307129, "learning_rate": 3.562312932664941e-05, "loss": 1.6188, "step": 21710 }, { "epoch": 3.589341045238587, "grad_norm": 11.403149604797363, "learning_rate": 3.561394810775079e-05, "loss": 1.6067, "step": 21720 }, { "epoch": 3.5909935963643873, "grad_norm": 10.633176803588867, "learning_rate": 3.5604766888852165e-05, "loss": 1.5884, "step": 21730 }, { "epoch": 3.592646147490188, "grad_norm": 6.8395490646362305, "learning_rate": 3.559558566995355e-05, "loss": 1.5274, "step": 21740 }, { "epoch": 3.5942986986159884, "grad_norm": 7.640970706939697, "learning_rate": 3.5586404451054924e-05, "loss": 1.4888, "step": 21750 }, { "epoch": 3.595951249741789, "grad_norm": 27.678407669067383, "learning_rate": 3.5577223232156306e-05, "loss": 1.4142, "step": 21760 }, { "epoch": 3.5976038008675895, "grad_norm": 15.568962097167969, "learning_rate": 3.556804201325768e-05, "loss": 1.5022, "step": 21770 }, { "epoch": 3.5992563519933896, "grad_norm": 7.94047737121582, "learning_rate": 3.555886079435906e-05, "loss": 1.4938, "step": 21780 }, { "epoch": 3.60090890311919, "grad_norm": 8.355306625366211, "learning_rate": 3.5549679575460434e-05, "loss": 1.6414, "step": 21790 }, { "epoch": 3.6025614542449906, "grad_norm": 9.412763595581055, "learning_rate": 3.554049835656182e-05, "loss": 1.5618, "step": 21800 }, { "epoch": 3.604214005370791, "grad_norm": 8.5521240234375, "learning_rate": 3.55313171376632e-05, "loss": 1.5158, "step": 21810 }, { "epoch": 3.6058665564965917, "grad_norm": 14.980252265930176, "learning_rate": 3.5522135918764575e-05, "loss": 1.5169, "step": 21820 }, { "epoch": 3.607519107622392, "grad_norm": 5.556758403778076, "learning_rate": 3.551295469986596e-05, "loss": 1.5842, "step": 21830 }, { "epoch": 3.609171658748193, "grad_norm": 18.579702377319336, "learning_rate": 3.5503773480967334e-05, "loss": 1.5249, "step": 21840 }, { "epoch": 3.610824209873993, "grad_norm": 26.00187873840332, "learning_rate": 3.5494592262068716e-05, "loss": 1.5037, "step": 21850 }, { "epoch": 3.6124767609997934, "grad_norm": 34.09498977661133, "learning_rate": 3.548541104317009e-05, "loss": 1.4003, "step": 21860 }, { "epoch": 3.614129312125594, "grad_norm": 15.18822956085205, "learning_rate": 3.5476229824271475e-05, "loss": 1.6049, "step": 21870 }, { "epoch": 3.6157818632513945, "grad_norm": 8.645750999450684, "learning_rate": 3.546704860537285e-05, "loss": 1.5309, "step": 21880 }, { "epoch": 3.617434414377195, "grad_norm": 9.381811141967773, "learning_rate": 3.545786738647423e-05, "loss": 1.3985, "step": 21890 }, { "epoch": 3.619086965502995, "grad_norm": 10.318458557128906, "learning_rate": 3.544868616757561e-05, "loss": 1.5741, "step": 21900 }, { "epoch": 3.6207395166287957, "grad_norm": 7.140669345855713, "learning_rate": 3.5439504948676985e-05, "loss": 1.5998, "step": 21910 }, { "epoch": 3.622392067754596, "grad_norm": 11.092816352844238, "learning_rate": 3.543032372977837e-05, "loss": 1.6585, "step": 21920 }, { "epoch": 3.6240446188803968, "grad_norm": 14.348875045776367, "learning_rate": 3.5421142510879743e-05, "loss": 1.6889, "step": 21930 }, { "epoch": 3.6256971700061973, "grad_norm": 20.467607498168945, "learning_rate": 3.5411961291981126e-05, "loss": 1.5698, "step": 21940 }, { "epoch": 3.6273497211319974, "grad_norm": 6.864068508148193, "learning_rate": 3.54027800730825e-05, "loss": 1.3678, "step": 21950 }, { "epoch": 3.629002272257798, "grad_norm": 10.878697395324707, "learning_rate": 3.5393598854183885e-05, "loss": 1.5885, "step": 21960 }, { "epoch": 3.6306548233835985, "grad_norm": 11.186368942260742, "learning_rate": 3.538441763528526e-05, "loss": 1.4722, "step": 21970 }, { "epoch": 3.632307374509399, "grad_norm": 11.424324989318848, "learning_rate": 3.537523641638664e-05, "loss": 1.6041, "step": 21980 }, { "epoch": 3.6339599256351995, "grad_norm": 11.108640670776367, "learning_rate": 3.536605519748802e-05, "loss": 1.5555, "step": 21990 }, { "epoch": 3.6356124767609996, "grad_norm": 6.946378707885742, "learning_rate": 3.53568739785894e-05, "loss": 1.5627, "step": 22000 }, { "epoch": 3.6372650278868, "grad_norm": 17.414968490600586, "learning_rate": 3.534769275969078e-05, "loss": 1.5827, "step": 22010 }, { "epoch": 3.6389175790126007, "grad_norm": 8.051285743713379, "learning_rate": 3.533851154079216e-05, "loss": 1.561, "step": 22020 }, { "epoch": 3.6405701301384013, "grad_norm": 6.667621612548828, "learning_rate": 3.5329330321893536e-05, "loss": 1.4156, "step": 22030 }, { "epoch": 3.642222681264202, "grad_norm": 19.320453643798828, "learning_rate": 3.532014910299491e-05, "loss": 1.5852, "step": 22040 }, { "epoch": 3.643875232390002, "grad_norm": 8.498446464538574, "learning_rate": 3.5310967884096294e-05, "loss": 1.5934, "step": 22050 }, { "epoch": 3.6455277835158024, "grad_norm": 8.024924278259277, "learning_rate": 3.530178666519767e-05, "loss": 1.5443, "step": 22060 }, { "epoch": 3.647180334641603, "grad_norm": 14.608724594116211, "learning_rate": 3.529260544629905e-05, "loss": 1.6092, "step": 22070 }, { "epoch": 3.6488328857674035, "grad_norm": 8.343809127807617, "learning_rate": 3.528342422740043e-05, "loss": 1.6338, "step": 22080 }, { "epoch": 3.650485436893204, "grad_norm": 8.434662818908691, "learning_rate": 3.527424300850181e-05, "loss": 1.5697, "step": 22090 }, { "epoch": 3.652137988019004, "grad_norm": 7.201797008514404, "learning_rate": 3.526506178960319e-05, "loss": 1.4681, "step": 22100 }, { "epoch": 3.6537905391448047, "grad_norm": 7.6197896003723145, "learning_rate": 3.525588057070457e-05, "loss": 1.6327, "step": 22110 }, { "epoch": 3.655443090270605, "grad_norm": 6.695487022399902, "learning_rate": 3.5246699351805946e-05, "loss": 1.5547, "step": 22120 }, { "epoch": 3.6570956413964057, "grad_norm": 7.320518493652344, "learning_rate": 3.523751813290733e-05, "loss": 1.376, "step": 22130 }, { "epoch": 3.6587481925222063, "grad_norm": 10.218902587890625, "learning_rate": 3.5228336914008704e-05, "loss": 1.6282, "step": 22140 }, { "epoch": 3.6604007436480064, "grad_norm": 26.678739547729492, "learning_rate": 3.521915569511009e-05, "loss": 1.5529, "step": 22150 }, { "epoch": 3.662053294773807, "grad_norm": 7.635467529296875, "learning_rate": 3.520997447621146e-05, "loss": 1.5517, "step": 22160 }, { "epoch": 3.6637058458996075, "grad_norm": 21.92401123046875, "learning_rate": 3.520079325731284e-05, "loss": 1.5591, "step": 22170 }, { "epoch": 3.665358397025408, "grad_norm": 5.554649829864502, "learning_rate": 3.519161203841422e-05, "loss": 1.5443, "step": 22180 }, { "epoch": 3.6670109481512085, "grad_norm": 9.22087287902832, "learning_rate": 3.51824308195156e-05, "loss": 1.3653, "step": 22190 }, { "epoch": 3.6686634992770086, "grad_norm": 9.278251647949219, "learning_rate": 3.517324960061698e-05, "loss": 1.5557, "step": 22200 }, { "epoch": 3.670316050402809, "grad_norm": 9.113847732543945, "learning_rate": 3.5164068381718356e-05, "loss": 1.5877, "step": 22210 }, { "epoch": 3.6719686015286097, "grad_norm": 7.552914619445801, "learning_rate": 3.515488716281974e-05, "loss": 1.5065, "step": 22220 }, { "epoch": 3.6736211526544102, "grad_norm": 10.888140678405762, "learning_rate": 3.5145705943921114e-05, "loss": 1.4641, "step": 22230 }, { "epoch": 3.675273703780211, "grad_norm": 11.267471313476562, "learning_rate": 3.51365247250225e-05, "loss": 1.6272, "step": 22240 }, { "epoch": 3.676926254906011, "grad_norm": 7.349489688873291, "learning_rate": 3.512734350612387e-05, "loss": 1.5629, "step": 22250 }, { "epoch": 3.678578806031812, "grad_norm": 9.634526252746582, "learning_rate": 3.5118162287225255e-05, "loss": 1.5289, "step": 22260 }, { "epoch": 3.680231357157612, "grad_norm": 9.006583213806152, "learning_rate": 3.510898106832664e-05, "loss": 1.5232, "step": 22270 }, { "epoch": 3.6818839082834125, "grad_norm": 9.722323417663574, "learning_rate": 3.5099799849428014e-05, "loss": 1.4823, "step": 22280 }, { "epoch": 3.683536459409213, "grad_norm": 7.501804828643799, "learning_rate": 3.509061863052939e-05, "loss": 1.609, "step": 22290 }, { "epoch": 3.6851890105350136, "grad_norm": 62.3031120300293, "learning_rate": 3.5081437411630765e-05, "loss": 1.554, "step": 22300 }, { "epoch": 3.686841561660814, "grad_norm": 7.480673313140869, "learning_rate": 3.507225619273215e-05, "loss": 1.4209, "step": 22310 }, { "epoch": 3.688494112786614, "grad_norm": 8.595160484313965, "learning_rate": 3.5063074973833524e-05, "loss": 1.7512, "step": 22320 }, { "epoch": 3.6901466639124147, "grad_norm": 5.803942680358887, "learning_rate": 3.5053893754934907e-05, "loss": 1.5432, "step": 22330 }, { "epoch": 3.6917992150382153, "grad_norm": 13.61528491973877, "learning_rate": 3.504471253603628e-05, "loss": 1.617, "step": 22340 }, { "epoch": 3.693451766164016, "grad_norm": 60.33266067504883, "learning_rate": 3.5035531317137665e-05, "loss": 1.5357, "step": 22350 }, { "epoch": 3.6951043172898164, "grad_norm": 11.57695484161377, "learning_rate": 3.502635009823904e-05, "loss": 1.3428, "step": 22360 }, { "epoch": 3.6967568684156165, "grad_norm": 9.341504096984863, "learning_rate": 3.5017168879340424e-05, "loss": 1.5852, "step": 22370 }, { "epoch": 3.698409419541417, "grad_norm": 9.287238121032715, "learning_rate": 3.5007987660441806e-05, "loss": 1.4823, "step": 22380 }, { "epoch": 3.7000619706672175, "grad_norm": 11.68283462524414, "learning_rate": 3.499880644154318e-05, "loss": 1.5553, "step": 22390 }, { "epoch": 3.701714521793018, "grad_norm": 9.870018005371094, "learning_rate": 3.4989625222644565e-05, "loss": 1.5829, "step": 22400 }, { "epoch": 3.7033670729188186, "grad_norm": 41.26398849487305, "learning_rate": 3.498044400374594e-05, "loss": 1.4629, "step": 22410 }, { "epoch": 3.7050196240446187, "grad_norm": 13.389437675476074, "learning_rate": 3.4971262784847316e-05, "loss": 1.6009, "step": 22420 }, { "epoch": 3.7066721751704192, "grad_norm": 8.6530179977417, "learning_rate": 3.496208156594869e-05, "loss": 1.6486, "step": 22430 }, { "epoch": 3.70832472629622, "grad_norm": 8.932863235473633, "learning_rate": 3.4952900347050075e-05, "loss": 1.6876, "step": 22440 }, { "epoch": 3.7099772774220203, "grad_norm": 14.32821273803711, "learning_rate": 3.494371912815145e-05, "loss": 1.4702, "step": 22450 }, { "epoch": 3.711629828547821, "grad_norm": 10.528609275817871, "learning_rate": 3.4934537909252833e-05, "loss": 1.4973, "step": 22460 }, { "epoch": 3.713282379673621, "grad_norm": 13.962540626525879, "learning_rate": 3.492535669035421e-05, "loss": 1.6201, "step": 22470 }, { "epoch": 3.7149349307994215, "grad_norm": 15.784793853759766, "learning_rate": 3.491617547145559e-05, "loss": 1.6017, "step": 22480 }, { "epoch": 3.716587481925222, "grad_norm": 9.065518379211426, "learning_rate": 3.4906994252556975e-05, "loss": 1.5692, "step": 22490 }, { "epoch": 3.7182400330510226, "grad_norm": 10.009034156799316, "learning_rate": 3.489781303365835e-05, "loss": 1.5791, "step": 22500 }, { "epoch": 3.719892584176823, "grad_norm": 8.641139030456543, "learning_rate": 3.488863181475973e-05, "loss": 1.5955, "step": 22510 }, { "epoch": 3.721545135302623, "grad_norm": 10.301948547363281, "learning_rate": 3.487945059586111e-05, "loss": 1.5108, "step": 22520 }, { "epoch": 3.7231976864284237, "grad_norm": 28.2911319732666, "learning_rate": 3.487026937696249e-05, "loss": 1.5386, "step": 22530 }, { "epoch": 3.7248502375542243, "grad_norm": 129.94375610351562, "learning_rate": 3.486108815806387e-05, "loss": 1.7079, "step": 22540 }, { "epoch": 3.726502788680025, "grad_norm": 23.256444931030273, "learning_rate": 3.485190693916524e-05, "loss": 1.5878, "step": 22550 }, { "epoch": 3.7281553398058254, "grad_norm": 7.868270397186279, "learning_rate": 3.484272572026662e-05, "loss": 1.536, "step": 22560 }, { "epoch": 3.7298078909316255, "grad_norm": 14.722084045410156, "learning_rate": 3.4833544501368e-05, "loss": 1.4831, "step": 22570 }, { "epoch": 3.731460442057426, "grad_norm": 19.196247100830078, "learning_rate": 3.482436328246938e-05, "loss": 1.4927, "step": 22580 }, { "epoch": 3.7331129931832265, "grad_norm": 7.337265968322754, "learning_rate": 3.481518206357076e-05, "loss": 1.6621, "step": 22590 }, { "epoch": 3.734765544309027, "grad_norm": 21.33555793762207, "learning_rate": 3.480600084467214e-05, "loss": 1.4837, "step": 22600 }, { "epoch": 3.7364180954348276, "grad_norm": 33.45065689086914, "learning_rate": 3.479681962577352e-05, "loss": 1.5462, "step": 22610 }, { "epoch": 3.7380706465606277, "grad_norm": 13.384827613830566, "learning_rate": 3.47876384068749e-05, "loss": 1.5441, "step": 22620 }, { "epoch": 3.7397231976864287, "grad_norm": 75.69512939453125, "learning_rate": 3.477845718797628e-05, "loss": 1.4274, "step": 22630 }, { "epoch": 3.741375748812229, "grad_norm": 14.372223854064941, "learning_rate": 3.476927596907766e-05, "loss": 1.6275, "step": 22640 }, { "epoch": 3.7430282999380293, "grad_norm": 8.416259765625, "learning_rate": 3.4760094750179036e-05, "loss": 1.5561, "step": 22650 }, { "epoch": 3.74468085106383, "grad_norm": 11.804091453552246, "learning_rate": 3.475091353128042e-05, "loss": 1.6549, "step": 22660 }, { "epoch": 3.7463334021896304, "grad_norm": 11.802560806274414, "learning_rate": 3.4741732312381794e-05, "loss": 1.5372, "step": 22670 }, { "epoch": 3.747985953315431, "grad_norm": 8.755041122436523, "learning_rate": 3.473255109348317e-05, "loss": 1.6324, "step": 22680 }, { "epoch": 3.749638504441231, "grad_norm": 6.660466194152832, "learning_rate": 3.4723369874584546e-05, "loss": 1.537, "step": 22690 }, { "epoch": 3.7512910555670316, "grad_norm": 19.879173278808594, "learning_rate": 3.471418865568593e-05, "loss": 1.474, "step": 22700 }, { "epoch": 3.752943606692832, "grad_norm": 4.748521327972412, "learning_rate": 3.4705007436787304e-05, "loss": 1.5009, "step": 22710 }, { "epoch": 3.7545961578186327, "grad_norm": 30.82050895690918, "learning_rate": 3.469582621788869e-05, "loss": 1.5368, "step": 22720 }, { "epoch": 3.756248708944433, "grad_norm": 28.105743408203125, "learning_rate": 3.468664499899007e-05, "loss": 1.359, "step": 22730 }, { "epoch": 3.7579012600702333, "grad_norm": 20.436500549316406, "learning_rate": 3.4677463780091446e-05, "loss": 1.4738, "step": 22740 }, { "epoch": 3.759553811196034, "grad_norm": 18.063720703125, "learning_rate": 3.466828256119283e-05, "loss": 1.6101, "step": 22750 }, { "epoch": 3.7612063623218344, "grad_norm": 11.609806060791016, "learning_rate": 3.4659101342294204e-05, "loss": 1.4377, "step": 22760 }, { "epoch": 3.762858913447635, "grad_norm": 7.862545490264893, "learning_rate": 3.464992012339559e-05, "loss": 1.5208, "step": 22770 }, { "epoch": 3.7645114645734354, "grad_norm": 13.573543548583984, "learning_rate": 3.464073890449696e-05, "loss": 1.6314, "step": 22780 }, { "epoch": 3.7661640156992355, "grad_norm": 6.686908721923828, "learning_rate": 3.4631557685598345e-05, "loss": 1.5347, "step": 22790 }, { "epoch": 3.767816566825036, "grad_norm": 16.876447677612305, "learning_rate": 3.462237646669972e-05, "loss": 1.3998, "step": 22800 }, { "epoch": 3.7694691179508366, "grad_norm": 5.551175117492676, "learning_rate": 3.46131952478011e-05, "loss": 1.6144, "step": 22810 }, { "epoch": 3.771121669076637, "grad_norm": 12.1668701171875, "learning_rate": 3.460401402890247e-05, "loss": 1.5202, "step": 22820 }, { "epoch": 3.7727742202024377, "grad_norm": 7.639707565307617, "learning_rate": 3.4594832810003855e-05, "loss": 1.5545, "step": 22830 }, { "epoch": 3.774426771328238, "grad_norm": 27.100339889526367, "learning_rate": 3.458565159110524e-05, "loss": 1.5098, "step": 22840 }, { "epoch": 3.7760793224540383, "grad_norm": 6.334820747375488, "learning_rate": 3.4576470372206614e-05, "loss": 1.4167, "step": 22850 }, { "epoch": 3.777731873579839, "grad_norm": 56.109954833984375, "learning_rate": 3.4567289153307997e-05, "loss": 1.5223, "step": 22860 }, { "epoch": 3.7793844247056394, "grad_norm": 9.93230152130127, "learning_rate": 3.455810793440937e-05, "loss": 1.5619, "step": 22870 }, { "epoch": 3.78103697583144, "grad_norm": 9.03853702545166, "learning_rate": 3.4548926715510755e-05, "loss": 1.468, "step": 22880 }, { "epoch": 3.78268952695724, "grad_norm": 11.272610664367676, "learning_rate": 3.453974549661213e-05, "loss": 1.4569, "step": 22890 }, { "epoch": 3.7843420780830406, "grad_norm": 11.486401557922363, "learning_rate": 3.4530564277713514e-05, "loss": 1.5401, "step": 22900 }, { "epoch": 3.785994629208841, "grad_norm": 12.329487800598145, "learning_rate": 3.452138305881489e-05, "loss": 1.5879, "step": 22910 }, { "epoch": 3.7876471803346417, "grad_norm": 13.06454849243164, "learning_rate": 3.451220183991627e-05, "loss": 1.5854, "step": 22920 }, { "epoch": 3.789299731460442, "grad_norm": 17.574142456054688, "learning_rate": 3.450302062101765e-05, "loss": 1.5392, "step": 22930 }, { "epoch": 3.7909522825862423, "grad_norm": 17.874441146850586, "learning_rate": 3.4493839402119024e-05, "loss": 1.542, "step": 22940 }, { "epoch": 3.792604833712043, "grad_norm": 8.613436698913574, "learning_rate": 3.4484658183220406e-05, "loss": 1.5781, "step": 22950 }, { "epoch": 3.7942573848378434, "grad_norm": 9.75782585144043, "learning_rate": 3.447547696432178e-05, "loss": 1.5628, "step": 22960 }, { "epoch": 3.795909935963644, "grad_norm": 13.988187789916992, "learning_rate": 3.4466295745423165e-05, "loss": 1.5528, "step": 22970 }, { "epoch": 3.7975624870894444, "grad_norm": 15.687790870666504, "learning_rate": 3.445711452652454e-05, "loss": 1.555, "step": 22980 }, { "epoch": 3.7992150382152445, "grad_norm": 10.992048263549805, "learning_rate": 3.4447933307625923e-05, "loss": 1.7061, "step": 22990 }, { "epoch": 3.8008675893410455, "grad_norm": 10.867881774902344, "learning_rate": 3.44387520887273e-05, "loss": 1.6758, "step": 23000 }, { "epoch": 3.8025201404668456, "grad_norm": 10.184456825256348, "learning_rate": 3.442957086982868e-05, "loss": 1.5848, "step": 23010 }, { "epoch": 3.804172691592646, "grad_norm": 17.802675247192383, "learning_rate": 3.442038965093006e-05, "loss": 1.592, "step": 23020 }, { "epoch": 3.8058252427184467, "grad_norm": 12.011911392211914, "learning_rate": 3.441120843203144e-05, "loss": 1.4794, "step": 23030 }, { "epoch": 3.807477793844247, "grad_norm": 8.725467681884766, "learning_rate": 3.4402027213132816e-05, "loss": 1.5482, "step": 23040 }, { "epoch": 3.8091303449700478, "grad_norm": 10.62525749206543, "learning_rate": 3.43928459942342e-05, "loss": 1.4878, "step": 23050 }, { "epoch": 3.810782896095848, "grad_norm": 10.504483222961426, "learning_rate": 3.4383664775335575e-05, "loss": 1.4817, "step": 23060 }, { "epoch": 3.8124354472216484, "grad_norm": 12.407063484191895, "learning_rate": 3.437448355643695e-05, "loss": 1.5458, "step": 23070 }, { "epoch": 3.814087998347449, "grad_norm": 17.53013038635254, "learning_rate": 3.436530233753833e-05, "loss": 1.5761, "step": 23080 }, { "epoch": 3.8157405494732495, "grad_norm": 16.574785232543945, "learning_rate": 3.435612111863971e-05, "loss": 1.5797, "step": 23090 }, { "epoch": 3.81739310059905, "grad_norm": 11.763534545898438, "learning_rate": 3.434693989974109e-05, "loss": 1.5059, "step": 23100 }, { "epoch": 3.81904565172485, "grad_norm": 11.488256454467773, "learning_rate": 3.433775868084247e-05, "loss": 1.6877, "step": 23110 }, { "epoch": 3.8206982028506506, "grad_norm": 8.004593849182129, "learning_rate": 3.432857746194385e-05, "loss": 1.491, "step": 23120 }, { "epoch": 3.822350753976451, "grad_norm": 8.863856315612793, "learning_rate": 3.4319396243045226e-05, "loss": 1.5296, "step": 23130 }, { "epoch": 3.8240033051022517, "grad_norm": 9.865023612976074, "learning_rate": 3.431021502414661e-05, "loss": 1.5924, "step": 23140 }, { "epoch": 3.8256558562280523, "grad_norm": 5.761257171630859, "learning_rate": 3.4301033805247985e-05, "loss": 1.4003, "step": 23150 }, { "epoch": 3.8273084073538524, "grad_norm": 14.028019905090332, "learning_rate": 3.429185258634937e-05, "loss": 1.5787, "step": 23160 }, { "epoch": 3.828960958479653, "grad_norm": 11.172537803649902, "learning_rate": 3.428267136745075e-05, "loss": 1.5284, "step": 23170 }, { "epoch": 3.8306135096054534, "grad_norm": 8.914949417114258, "learning_rate": 3.4273490148552126e-05, "loss": 1.5841, "step": 23180 }, { "epoch": 3.832266060731254, "grad_norm": 11.068294525146484, "learning_rate": 3.42643089296535e-05, "loss": 1.5591, "step": 23190 }, { "epoch": 3.8339186118570545, "grad_norm": 7.558146953582764, "learning_rate": 3.425512771075488e-05, "loss": 1.6917, "step": 23200 }, { "epoch": 3.8355711629828546, "grad_norm": 11.731191635131836, "learning_rate": 3.424594649185626e-05, "loss": 1.5366, "step": 23210 }, { "epoch": 3.837223714108655, "grad_norm": 19.394287109375, "learning_rate": 3.4236765272957636e-05, "loss": 1.5022, "step": 23220 }, { "epoch": 3.8388762652344557, "grad_norm": 27.054391860961914, "learning_rate": 3.422758405405902e-05, "loss": 1.538, "step": 23230 }, { "epoch": 3.8405288163602562, "grad_norm": 15.95034408569336, "learning_rate": 3.4218402835160394e-05, "loss": 1.6006, "step": 23240 }, { "epoch": 3.8421813674860568, "grad_norm": 8.388917922973633, "learning_rate": 3.420922161626178e-05, "loss": 1.4591, "step": 23250 }, { "epoch": 3.843833918611857, "grad_norm": 11.338773727416992, "learning_rate": 3.420004039736315e-05, "loss": 1.6336, "step": 23260 }, { "epoch": 3.8454864697376574, "grad_norm": 12.91083812713623, "learning_rate": 3.4190859178464536e-05, "loss": 1.6015, "step": 23270 }, { "epoch": 3.847139020863458, "grad_norm": 10.893755912780762, "learning_rate": 3.418167795956591e-05, "loss": 1.5369, "step": 23280 }, { "epoch": 3.8487915719892585, "grad_norm": 10.60979175567627, "learning_rate": 3.4172496740667294e-05, "loss": 1.591, "step": 23290 }, { "epoch": 3.850444123115059, "grad_norm": 9.582144737243652, "learning_rate": 3.416331552176868e-05, "loss": 1.6382, "step": 23300 }, { "epoch": 3.852096674240859, "grad_norm": 14.1846923828125, "learning_rate": 3.415413430287005e-05, "loss": 1.5981, "step": 23310 }, { "epoch": 3.8537492253666596, "grad_norm": 10.367266654968262, "learning_rate": 3.414495308397143e-05, "loss": 1.5864, "step": 23320 }, { "epoch": 3.85540177649246, "grad_norm": 9.663958549499512, "learning_rate": 3.4135771865072804e-05, "loss": 1.544, "step": 23330 }, { "epoch": 3.8570543276182607, "grad_norm": 13.666308403015137, "learning_rate": 3.412659064617419e-05, "loss": 1.4994, "step": 23340 }, { "epoch": 3.8587068787440613, "grad_norm": 9.828655242919922, "learning_rate": 3.411740942727556e-05, "loss": 1.5709, "step": 23350 }, { "epoch": 3.8603594298698614, "grad_norm": 9.607194900512695, "learning_rate": 3.4108228208376945e-05, "loss": 1.54, "step": 23360 }, { "epoch": 3.862011980995662, "grad_norm": 12.873283386230469, "learning_rate": 3.409904698947832e-05, "loss": 1.3461, "step": 23370 }, { "epoch": 3.8636645321214624, "grad_norm": 8.448955535888672, "learning_rate": 3.4089865770579704e-05, "loss": 1.5732, "step": 23380 }, { "epoch": 3.865317083247263, "grad_norm": 36.90695571899414, "learning_rate": 3.408068455168108e-05, "loss": 1.539, "step": 23390 }, { "epoch": 3.8669696343730635, "grad_norm": 14.371119499206543, "learning_rate": 3.407150333278246e-05, "loss": 1.542, "step": 23400 }, { "epoch": 3.8686221854988636, "grad_norm": 6.606020450592041, "learning_rate": 3.4062322113883845e-05, "loss": 1.412, "step": 23410 }, { "epoch": 3.8702747366246646, "grad_norm": 7.5736083984375, "learning_rate": 3.405314089498522e-05, "loss": 1.6222, "step": 23420 }, { "epoch": 3.8719272877504647, "grad_norm": 184.7552490234375, "learning_rate": 3.4043959676086604e-05, "loss": 1.5178, "step": 23430 }, { "epoch": 3.8735798388762652, "grad_norm": 7.478529930114746, "learning_rate": 3.403477845718798e-05, "loss": 1.6738, "step": 23440 }, { "epoch": 3.8752323900020658, "grad_norm": 14.52708911895752, "learning_rate": 3.4025597238289355e-05, "loss": 1.6449, "step": 23450 }, { "epoch": 3.8768849411278663, "grad_norm": 8.332096099853516, "learning_rate": 3.401641601939073e-05, "loss": 1.5679, "step": 23460 }, { "epoch": 3.878537492253667, "grad_norm": 9.562748908996582, "learning_rate": 3.4007234800492114e-05, "loss": 1.5742, "step": 23470 }, { "epoch": 3.880190043379467, "grad_norm": 7.013890743255615, "learning_rate": 3.399805358159349e-05, "loss": 1.6466, "step": 23480 }, { "epoch": 3.8818425945052675, "grad_norm": 16.500028610229492, "learning_rate": 3.398887236269487e-05, "loss": 1.6899, "step": 23490 }, { "epoch": 3.883495145631068, "grad_norm": 8.28381633758545, "learning_rate": 3.397969114379625e-05, "loss": 1.5692, "step": 23500 }, { "epoch": 3.8851476967568686, "grad_norm": 35.62749099731445, "learning_rate": 3.397050992489763e-05, "loss": 1.4717, "step": 23510 }, { "epoch": 3.886800247882669, "grad_norm": 8.142434120178223, "learning_rate": 3.396132870599901e-05, "loss": 1.4372, "step": 23520 }, { "epoch": 3.888452799008469, "grad_norm": 13.488250732421875, "learning_rate": 3.395214748710039e-05, "loss": 1.6781, "step": 23530 }, { "epoch": 3.8901053501342697, "grad_norm": 8.421558380126953, "learning_rate": 3.394296626820177e-05, "loss": 1.4748, "step": 23540 }, { "epoch": 3.8917579012600703, "grad_norm": 14.573169708251953, "learning_rate": 3.393378504930315e-05, "loss": 1.5179, "step": 23550 }, { "epoch": 3.893410452385871, "grad_norm": 8.99917984008789, "learning_rate": 3.392460383040453e-05, "loss": 1.5438, "step": 23560 }, { "epoch": 3.8950630035116713, "grad_norm": 16.3277645111084, "learning_rate": 3.3915422611505906e-05, "loss": 1.5428, "step": 23570 }, { "epoch": 3.8967155546374714, "grad_norm": 7.9492878913879395, "learning_rate": 3.390624139260728e-05, "loss": 1.6256, "step": 23580 }, { "epoch": 3.898368105763272, "grad_norm": 7.273673057556152, "learning_rate": 3.389706017370866e-05, "loss": 1.6642, "step": 23590 }, { "epoch": 3.9000206568890725, "grad_norm": 8.6654634475708, "learning_rate": 3.388787895481004e-05, "loss": 1.5932, "step": 23600 }, { "epoch": 3.901673208014873, "grad_norm": 10.21167278289795, "learning_rate": 3.3878697735911416e-05, "loss": 1.5356, "step": 23610 }, { "epoch": 3.9033257591406736, "grad_norm": 16.551963806152344, "learning_rate": 3.38695165170128e-05, "loss": 1.6243, "step": 23620 }, { "epoch": 3.9049783102664737, "grad_norm": 28.581851959228516, "learning_rate": 3.386033529811418e-05, "loss": 1.6437, "step": 23630 }, { "epoch": 3.9066308613922742, "grad_norm": 22.335962295532227, "learning_rate": 3.385115407921556e-05, "loss": 1.4263, "step": 23640 }, { "epoch": 3.9082834125180748, "grad_norm": 7.5155158042907715, "learning_rate": 3.384197286031694e-05, "loss": 1.4715, "step": 23650 }, { "epoch": 3.9099359636438753, "grad_norm": 12.375370979309082, "learning_rate": 3.3832791641418316e-05, "loss": 1.5336, "step": 23660 }, { "epoch": 3.911588514769676, "grad_norm": 26.960378646850586, "learning_rate": 3.38236104225197e-05, "loss": 1.5445, "step": 23670 }, { "epoch": 3.913241065895476, "grad_norm": 17.1274471282959, "learning_rate": 3.3814429203621075e-05, "loss": 1.6372, "step": 23680 }, { "epoch": 3.9148936170212765, "grad_norm": 8.658958435058594, "learning_rate": 3.380524798472246e-05, "loss": 1.5278, "step": 23690 }, { "epoch": 3.916546168147077, "grad_norm": 8.599283218383789, "learning_rate": 3.379606676582383e-05, "loss": 1.5516, "step": 23700 }, { "epoch": 3.9181987192728776, "grad_norm": 17.065500259399414, "learning_rate": 3.378688554692521e-05, "loss": 1.5585, "step": 23710 }, { "epoch": 3.919851270398678, "grad_norm": 6.110587120056152, "learning_rate": 3.3777704328026585e-05, "loss": 1.4451, "step": 23720 }, { "epoch": 3.921503821524478, "grad_norm": 11.20459270477295, "learning_rate": 3.376852310912797e-05, "loss": 1.5366, "step": 23730 }, { "epoch": 3.9231563726502787, "grad_norm": 7.95195198059082, "learning_rate": 3.375934189022935e-05, "loss": 1.3379, "step": 23740 }, { "epoch": 3.9248089237760793, "grad_norm": 17.207660675048828, "learning_rate": 3.3750160671330726e-05, "loss": 1.5098, "step": 23750 }, { "epoch": 3.92646147490188, "grad_norm": 18.350038528442383, "learning_rate": 3.374097945243211e-05, "loss": 1.421, "step": 23760 }, { "epoch": 3.9281140260276803, "grad_norm": 21.70857048034668, "learning_rate": 3.3731798233533484e-05, "loss": 1.5718, "step": 23770 }, { "epoch": 3.9297665771534804, "grad_norm": 12.964973449707031, "learning_rate": 3.372261701463487e-05, "loss": 1.5394, "step": 23780 }, { "epoch": 3.9314191282792814, "grad_norm": 7.696381092071533, "learning_rate": 3.371343579573624e-05, "loss": 1.392, "step": 23790 }, { "epoch": 3.9330716794050815, "grad_norm": 8.877033233642578, "learning_rate": 3.3704254576837626e-05, "loss": 1.6341, "step": 23800 }, { "epoch": 3.934724230530882, "grad_norm": 6.651622772216797, "learning_rate": 3.3695073357939e-05, "loss": 1.3865, "step": 23810 }, { "epoch": 3.9363767816566826, "grad_norm": 13.928619384765625, "learning_rate": 3.3685892139040384e-05, "loss": 1.5814, "step": 23820 }, { "epoch": 3.9380293327824827, "grad_norm": 14.732012748718262, "learning_rate": 3.367671092014176e-05, "loss": 1.4712, "step": 23830 }, { "epoch": 3.9396818839082837, "grad_norm": 14.067058563232422, "learning_rate": 3.3667529701243136e-05, "loss": 1.5988, "step": 23840 }, { "epoch": 3.9413344350340838, "grad_norm": 10.48046875, "learning_rate": 3.365834848234452e-05, "loss": 1.6534, "step": 23850 }, { "epoch": 3.9429869861598843, "grad_norm": 25.77431297302246, "learning_rate": 3.3649167263445894e-05, "loss": 1.4357, "step": 23860 }, { "epoch": 3.944639537285685, "grad_norm": 39.13468551635742, "learning_rate": 3.363998604454728e-05, "loss": 1.5746, "step": 23870 }, { "epoch": 3.9462920884114854, "grad_norm": 12.897835731506348, "learning_rate": 3.363080482564865e-05, "loss": 1.435, "step": 23880 }, { "epoch": 3.947944639537286, "grad_norm": 18.15728187561035, "learning_rate": 3.3621623606750035e-05, "loss": 1.5303, "step": 23890 }, { "epoch": 3.949597190663086, "grad_norm": 20.04810905456543, "learning_rate": 3.361244238785141e-05, "loss": 1.5076, "step": 23900 }, { "epoch": 3.9512497417888865, "grad_norm": 6.630679607391357, "learning_rate": 3.3603261168952794e-05, "loss": 1.4879, "step": 23910 }, { "epoch": 3.952902292914687, "grad_norm": 30.30918312072754, "learning_rate": 3.359407995005417e-05, "loss": 1.5553, "step": 23920 }, { "epoch": 3.9545548440404876, "grad_norm": 7.379817008972168, "learning_rate": 3.358489873115555e-05, "loss": 1.5888, "step": 23930 }, { "epoch": 3.956207395166288, "grad_norm": 8.264845848083496, "learning_rate": 3.357571751225693e-05, "loss": 1.637, "step": 23940 }, { "epoch": 3.9578599462920883, "grad_norm": 14.769917488098145, "learning_rate": 3.356653629335831e-05, "loss": 1.5499, "step": 23950 }, { "epoch": 3.959512497417889, "grad_norm": 6.285022735595703, "learning_rate": 3.355735507445969e-05, "loss": 1.6016, "step": 23960 }, { "epoch": 3.9611650485436893, "grad_norm": 8.565293312072754, "learning_rate": 3.354817385556106e-05, "loss": 1.4791, "step": 23970 }, { "epoch": 3.96281759966949, "grad_norm": 13.735636711120605, "learning_rate": 3.3538992636662445e-05, "loss": 1.4269, "step": 23980 }, { "epoch": 3.9644701507952904, "grad_norm": 6.193691253662109, "learning_rate": 3.352981141776382e-05, "loss": 1.4687, "step": 23990 }, { "epoch": 3.9661227019210905, "grad_norm": 9.597599029541016, "learning_rate": 3.3520630198865204e-05, "loss": 1.4732, "step": 24000 }, { "epoch": 3.967775253046891, "grad_norm": 12.805542945861816, "learning_rate": 3.351144897996658e-05, "loss": 1.7671, "step": 24010 }, { "epoch": 3.9694278041726916, "grad_norm": 11.650763511657715, "learning_rate": 3.350226776106796e-05, "loss": 1.4774, "step": 24020 }, { "epoch": 3.971080355298492, "grad_norm": 32.57954406738281, "learning_rate": 3.349308654216934e-05, "loss": 1.6312, "step": 24030 }, { "epoch": 3.9727329064242927, "grad_norm": 10.600648880004883, "learning_rate": 3.348390532327072e-05, "loss": 1.4607, "step": 24040 }, { "epoch": 3.9743854575500928, "grad_norm": 7.16843843460083, "learning_rate": 3.3474724104372097e-05, "loss": 1.5808, "step": 24050 }, { "epoch": 3.9760380086758933, "grad_norm": 60.35005187988281, "learning_rate": 3.346554288547348e-05, "loss": 1.4766, "step": 24060 }, { "epoch": 3.977690559801694, "grad_norm": 8.780489921569824, "learning_rate": 3.3456361666574855e-05, "loss": 1.5446, "step": 24070 }, { "epoch": 3.9793431109274944, "grad_norm": 17.84938621520996, "learning_rate": 3.344718044767624e-05, "loss": 1.569, "step": 24080 }, { "epoch": 3.980995662053295, "grad_norm": 10.445038795471191, "learning_rate": 3.3437999228777614e-05, "loss": 1.6016, "step": 24090 }, { "epoch": 3.982648213179095, "grad_norm": 6.451947212219238, "learning_rate": 3.342881800987899e-05, "loss": 1.5652, "step": 24100 }, { "epoch": 3.9843007643048955, "grad_norm": 13.274555206298828, "learning_rate": 3.341963679098037e-05, "loss": 1.3444, "step": 24110 }, { "epoch": 3.985953315430696, "grad_norm": 92.55748748779297, "learning_rate": 3.341045557208175e-05, "loss": 1.5804, "step": 24120 }, { "epoch": 3.9876058665564966, "grad_norm": 6.433228015899658, "learning_rate": 3.340127435318313e-05, "loss": 1.5241, "step": 24130 }, { "epoch": 3.989258417682297, "grad_norm": 7.102395534515381, "learning_rate": 3.3392093134284506e-05, "loss": 1.4866, "step": 24140 }, { "epoch": 3.9909109688080973, "grad_norm": 7.430931568145752, "learning_rate": 3.338291191538589e-05, "loss": 1.5228, "step": 24150 }, { "epoch": 3.992563519933898, "grad_norm": 7.019854545593262, "learning_rate": 3.3373730696487265e-05, "loss": 1.6071, "step": 24160 }, { "epoch": 3.9942160710596983, "grad_norm": 5.7070512771606445, "learning_rate": 3.336454947758865e-05, "loss": 1.4218, "step": 24170 }, { "epoch": 3.995868622185499, "grad_norm": 15.087320327758789, "learning_rate": 3.335536825869002e-05, "loss": 1.4703, "step": 24180 }, { "epoch": 3.9975211733112994, "grad_norm": 10.09599781036377, "learning_rate": 3.3346187039791406e-05, "loss": 1.5294, "step": 24190 }, { "epoch": 3.9991737244370995, "grad_norm": 9.25766658782959, "learning_rate": 3.333700582089279e-05, "loss": 1.5693, "step": 24200 }, { "epoch": 4.0, "eval_accuracy": 0.3066392396084551, "eval_loss": 2.2003278732299805, "eval_runtime": 814.2753, "eval_samples_per_second": 34.627, "eval_steps_per_second": 8.657, "step": 24205 }, { "epoch": 4.0008262755629005, "grad_norm": 6.969991207122803, "learning_rate": 3.3327824601994165e-05, "loss": 1.5264, "step": 24210 }, { "epoch": 4.002478826688701, "grad_norm": 6.718509674072266, "learning_rate": 3.331864338309554e-05, "loss": 1.5123, "step": 24220 }, { "epoch": 4.004131377814502, "grad_norm": 13.605385780334473, "learning_rate": 3.3309462164196916e-05, "loss": 1.5449, "step": 24230 }, { "epoch": 4.005783928940302, "grad_norm": 8.160221099853516, "learning_rate": 3.33002809452983e-05, "loss": 1.6421, "step": 24240 }, { "epoch": 4.007436480066102, "grad_norm": 6.459905624389648, "learning_rate": 3.3291099726399675e-05, "loss": 1.6331, "step": 24250 }, { "epoch": 4.009089031191903, "grad_norm": 11.185013771057129, "learning_rate": 3.328191850750106e-05, "loss": 1.4834, "step": 24260 }, { "epoch": 4.010741582317703, "grad_norm": 11.707094192504883, "learning_rate": 3.327273728860243e-05, "loss": 1.5363, "step": 24270 }, { "epoch": 4.012394133443504, "grad_norm": 44.96635055541992, "learning_rate": 3.3263556069703816e-05, "loss": 1.4972, "step": 24280 }, { "epoch": 4.014046684569304, "grad_norm": 15.029733657836914, "learning_rate": 3.325437485080519e-05, "loss": 1.575, "step": 24290 }, { "epoch": 4.015699235695104, "grad_norm": 8.984262466430664, "learning_rate": 3.3245193631906574e-05, "loss": 1.6195, "step": 24300 }, { "epoch": 4.017351786820905, "grad_norm": 6.634871482849121, "learning_rate": 3.323601241300795e-05, "loss": 1.5722, "step": 24310 }, { "epoch": 4.019004337946705, "grad_norm": 13.543932914733887, "learning_rate": 3.322683119410933e-05, "loss": 1.6134, "step": 24320 }, { "epoch": 4.020656889072506, "grad_norm": 7.46361780166626, "learning_rate": 3.3217649975210715e-05, "loss": 1.41, "step": 24330 }, { "epoch": 4.022309440198306, "grad_norm": 16.312719345092773, "learning_rate": 3.320846875631209e-05, "loss": 1.4198, "step": 24340 }, { "epoch": 4.023961991324106, "grad_norm": 16.578506469726562, "learning_rate": 3.319928753741347e-05, "loss": 1.4846, "step": 24350 }, { "epoch": 4.025614542449907, "grad_norm": 25.313661575317383, "learning_rate": 3.319010631851484e-05, "loss": 1.4667, "step": 24360 }, { "epoch": 4.027267093575707, "grad_norm": 71.65775299072266, "learning_rate": 3.3180925099616226e-05, "loss": 1.6426, "step": 24370 }, { "epoch": 4.028919644701508, "grad_norm": 17.943498611450195, "learning_rate": 3.31717438807176e-05, "loss": 1.5965, "step": 24380 }, { "epoch": 4.030572195827308, "grad_norm": 12.518502235412598, "learning_rate": 3.3162562661818984e-05, "loss": 1.57, "step": 24390 }, { "epoch": 4.0322247469531085, "grad_norm": 64.42125701904297, "learning_rate": 3.315338144292036e-05, "loss": 1.5285, "step": 24400 }, { "epoch": 4.0338772980789095, "grad_norm": 12.844326972961426, "learning_rate": 3.314420022402174e-05, "loss": 1.5768, "step": 24410 }, { "epoch": 4.03552984920471, "grad_norm": 14.874149322509766, "learning_rate": 3.313501900512312e-05, "loss": 1.424, "step": 24420 }, { "epoch": 4.037182400330511, "grad_norm": 18.798112869262695, "learning_rate": 3.31258377862245e-05, "loss": 1.6189, "step": 24430 }, { "epoch": 4.038834951456311, "grad_norm": 8.762454986572266, "learning_rate": 3.3116656567325884e-05, "loss": 1.6398, "step": 24440 }, { "epoch": 4.040487502582111, "grad_norm": 47.06496047973633, "learning_rate": 3.310747534842726e-05, "loss": 1.4831, "step": 24450 }, { "epoch": 4.042140053707912, "grad_norm": 7.9549384117126465, "learning_rate": 3.309829412952864e-05, "loss": 1.5666, "step": 24460 }, { "epoch": 4.043792604833712, "grad_norm": 8.519120216369629, "learning_rate": 3.308911291063002e-05, "loss": 1.5574, "step": 24470 }, { "epoch": 4.045445155959513, "grad_norm": 16.42377471923828, "learning_rate": 3.3079931691731394e-05, "loss": 1.5932, "step": 24480 }, { "epoch": 4.047097707085313, "grad_norm": 6.637816429138184, "learning_rate": 3.307075047283277e-05, "loss": 1.5165, "step": 24490 }, { "epoch": 4.048750258211113, "grad_norm": 91.19857025146484, "learning_rate": 3.306156925393415e-05, "loss": 1.4989, "step": 24500 }, { "epoch": 4.050402809336914, "grad_norm": 8.961830139160156, "learning_rate": 3.305238803503553e-05, "loss": 1.5585, "step": 24510 }, { "epoch": 4.052055360462714, "grad_norm": 11.918331146240234, "learning_rate": 3.304320681613691e-05, "loss": 1.3883, "step": 24520 }, { "epoch": 4.053707911588515, "grad_norm": 15.587654113769531, "learning_rate": 3.303402559723829e-05, "loss": 1.5789, "step": 24530 }, { "epoch": 4.055360462714315, "grad_norm": 15.174756050109863, "learning_rate": 3.302484437833967e-05, "loss": 1.5297, "step": 24540 }, { "epoch": 4.057013013840115, "grad_norm": 8.750446319580078, "learning_rate": 3.301566315944105e-05, "loss": 1.4318, "step": 24550 }, { "epoch": 4.058665564965916, "grad_norm": 10.243573188781738, "learning_rate": 3.300648194054243e-05, "loss": 1.5251, "step": 24560 }, { "epoch": 4.060318116091716, "grad_norm": 21.074369430541992, "learning_rate": 3.299730072164381e-05, "loss": 1.7139, "step": 24570 }, { "epoch": 4.061970667217517, "grad_norm": 8.997513771057129, "learning_rate": 3.2988119502745187e-05, "loss": 1.7064, "step": 24580 }, { "epoch": 4.063623218343317, "grad_norm": 29.157140731811523, "learning_rate": 3.297893828384657e-05, "loss": 1.464, "step": 24590 }, { "epoch": 4.0652757694691175, "grad_norm": 11.638101577758789, "learning_rate": 3.2969757064947945e-05, "loss": 1.5183, "step": 24600 }, { "epoch": 4.0669283205949185, "grad_norm": 7.134934425354004, "learning_rate": 3.296057584604932e-05, "loss": 1.4353, "step": 24610 }, { "epoch": 4.068580871720719, "grad_norm": 4.920504093170166, "learning_rate": 3.2951394627150704e-05, "loss": 1.5342, "step": 24620 }, { "epoch": 4.07023342284652, "grad_norm": 8.68477725982666, "learning_rate": 3.294221340825208e-05, "loss": 1.49, "step": 24630 }, { "epoch": 4.07188597397232, "grad_norm": 9.163007736206055, "learning_rate": 3.2933032189353455e-05, "loss": 1.3947, "step": 24640 }, { "epoch": 4.073538525098121, "grad_norm": 32.37632751464844, "learning_rate": 3.292385097045484e-05, "loss": 1.4837, "step": 24650 }, { "epoch": 4.075191076223921, "grad_norm": 9.217835426330566, "learning_rate": 3.291466975155622e-05, "loss": 1.579, "step": 24660 }, { "epoch": 4.076843627349721, "grad_norm": 11.495637893676758, "learning_rate": 3.2905488532657596e-05, "loss": 1.6953, "step": 24670 }, { "epoch": 4.078496178475522, "grad_norm": 7.819090843200684, "learning_rate": 3.289630731375898e-05, "loss": 1.5833, "step": 24680 }, { "epoch": 4.080148729601322, "grad_norm": 17.969079971313477, "learning_rate": 3.2887126094860355e-05, "loss": 1.3082, "step": 24690 }, { "epoch": 4.081801280727123, "grad_norm": 11.410839080810547, "learning_rate": 3.287794487596174e-05, "loss": 1.5623, "step": 24700 }, { "epoch": 4.083453831852923, "grad_norm": 37.196895599365234, "learning_rate": 3.286876365706311e-05, "loss": 1.5952, "step": 24710 }, { "epoch": 4.085106382978723, "grad_norm": 7.64441442489624, "learning_rate": 3.2859582438164496e-05, "loss": 1.4717, "step": 24720 }, { "epoch": 4.086758934104524, "grad_norm": 8.817895889282227, "learning_rate": 3.285040121926587e-05, "loss": 1.4053, "step": 24730 }, { "epoch": 4.088411485230324, "grad_norm": 19.185001373291016, "learning_rate": 3.284122000036725e-05, "loss": 1.5218, "step": 24740 }, { "epoch": 4.090064036356125, "grad_norm": 15.323920249938965, "learning_rate": 3.283203878146863e-05, "loss": 1.5812, "step": 24750 }, { "epoch": 4.091716587481925, "grad_norm": 17.28553009033203, "learning_rate": 3.2822857562570006e-05, "loss": 1.4426, "step": 24760 }, { "epoch": 4.093369138607725, "grad_norm": 10.657970428466797, "learning_rate": 3.281367634367139e-05, "loss": 1.5622, "step": 24770 }, { "epoch": 4.095021689733526, "grad_norm": 10.3989896774292, "learning_rate": 3.2804495124772765e-05, "loss": 1.5272, "step": 24780 }, { "epoch": 4.096674240859326, "grad_norm": 8.306746482849121, "learning_rate": 3.279531390587415e-05, "loss": 1.6179, "step": 24790 }, { "epoch": 4.098326791985127, "grad_norm": 7.192846298217773, "learning_rate": 3.278613268697552e-05, "loss": 1.5878, "step": 24800 }, { "epoch": 4.0999793431109275, "grad_norm": 9.504925727844238, "learning_rate": 3.2776951468076906e-05, "loss": 1.4805, "step": 24810 }, { "epoch": 4.101631894236728, "grad_norm": 7.679511070251465, "learning_rate": 3.276777024917828e-05, "loss": 1.6694, "step": 24820 }, { "epoch": 4.103284445362529, "grad_norm": 8.45281982421875, "learning_rate": 3.2758589030279664e-05, "loss": 1.4295, "step": 24830 }, { "epoch": 4.104936996488329, "grad_norm": 33.410911560058594, "learning_rate": 3.274940781138104e-05, "loss": 1.6718, "step": 24840 }, { "epoch": 4.10658954761413, "grad_norm": 10.545844078063965, "learning_rate": 3.274022659248242e-05, "loss": 1.4676, "step": 24850 }, { "epoch": 4.10824209873993, "grad_norm": 9.887289047241211, "learning_rate": 3.27310453735838e-05, "loss": 1.4952, "step": 24860 }, { "epoch": 4.10989464986573, "grad_norm": 13.104098320007324, "learning_rate": 3.2721864154685175e-05, "loss": 1.6299, "step": 24870 }, { "epoch": 4.111547200991531, "grad_norm": 8.035774230957031, "learning_rate": 3.271268293578656e-05, "loss": 1.5514, "step": 24880 }, { "epoch": 4.113199752117331, "grad_norm": 7.870905876159668, "learning_rate": 3.270350171688793e-05, "loss": 1.4943, "step": 24890 }, { "epoch": 4.114852303243132, "grad_norm": 8.745378494262695, "learning_rate": 3.2694320497989316e-05, "loss": 1.4858, "step": 24900 }, { "epoch": 4.116504854368932, "grad_norm": 10.829198837280273, "learning_rate": 3.268513927909069e-05, "loss": 1.5582, "step": 24910 }, { "epoch": 4.118157405494732, "grad_norm": 19.55958366394043, "learning_rate": 3.2675958060192074e-05, "loss": 1.5017, "step": 24920 }, { "epoch": 4.119809956620533, "grad_norm": 12.78548812866211, "learning_rate": 3.266677684129345e-05, "loss": 1.4994, "step": 24930 }, { "epoch": 4.121462507746333, "grad_norm": 13.971933364868164, "learning_rate": 3.265759562239483e-05, "loss": 1.41, "step": 24940 }, { "epoch": 4.123115058872134, "grad_norm": 7.70231294631958, "learning_rate": 3.264841440349621e-05, "loss": 1.5179, "step": 24950 }, { "epoch": 4.124767609997934, "grad_norm": 11.750558853149414, "learning_rate": 3.263923318459759e-05, "loss": 1.6527, "step": 24960 }, { "epoch": 4.126420161123734, "grad_norm": 12.228842735290527, "learning_rate": 3.263005196569897e-05, "loss": 1.6643, "step": 24970 }, { "epoch": 4.128072712249535, "grad_norm": 8.996533393859863, "learning_rate": 3.262087074680035e-05, "loss": 1.4033, "step": 24980 }, { "epoch": 4.129725263375335, "grad_norm": 5.231132984161377, "learning_rate": 3.2611689527901726e-05, "loss": 1.4871, "step": 24990 }, { "epoch": 4.131377814501136, "grad_norm": 8.555381774902344, "learning_rate": 3.26025083090031e-05, "loss": 1.5646, "step": 25000 }, { "epoch": 4.1330303656269365, "grad_norm": 6.629762649536133, "learning_rate": 3.2593327090104484e-05, "loss": 1.5089, "step": 25010 }, { "epoch": 4.1346829167527375, "grad_norm": 12.015291213989258, "learning_rate": 3.258414587120586e-05, "loss": 1.5633, "step": 25020 }, { "epoch": 4.136335467878538, "grad_norm": 7.017959117889404, "learning_rate": 3.257496465230724e-05, "loss": 1.5515, "step": 25030 }, { "epoch": 4.137988019004338, "grad_norm": 9.60529899597168, "learning_rate": 3.256578343340862e-05, "loss": 1.4825, "step": 25040 }, { "epoch": 4.139640570130139, "grad_norm": 10.894176483154297, "learning_rate": 3.255660221451e-05, "loss": 1.4634, "step": 25050 }, { "epoch": 4.141293121255939, "grad_norm": 11.371975898742676, "learning_rate": 3.254742099561138e-05, "loss": 1.5382, "step": 25060 }, { "epoch": 4.14294567238174, "grad_norm": 6.190928936004639, "learning_rate": 3.253823977671276e-05, "loss": 1.3551, "step": 25070 }, { "epoch": 4.14459822350754, "grad_norm": 36.7093620300293, "learning_rate": 3.2529058557814135e-05, "loss": 1.4276, "step": 25080 }, { "epoch": 4.14625077463334, "grad_norm": 9.693130493164062, "learning_rate": 3.251987733891552e-05, "loss": 1.6435, "step": 25090 }, { "epoch": 4.147903325759141, "grad_norm": 25.358810424804688, "learning_rate": 3.2510696120016894e-05, "loss": 1.4884, "step": 25100 }, { "epoch": 4.149555876884941, "grad_norm": 16.18667984008789, "learning_rate": 3.2501514901118276e-05, "loss": 1.5553, "step": 25110 }, { "epoch": 4.151208428010742, "grad_norm": 7.252718925476074, "learning_rate": 3.249233368221965e-05, "loss": 1.4565, "step": 25120 }, { "epoch": 4.152860979136542, "grad_norm": 14.278046607971191, "learning_rate": 3.248315246332103e-05, "loss": 1.5002, "step": 25130 }, { "epoch": 4.154513530262342, "grad_norm": 33.27809524536133, "learning_rate": 3.247397124442241e-05, "loss": 1.5514, "step": 25140 }, { "epoch": 4.156166081388143, "grad_norm": 11.636425971984863, "learning_rate": 3.246479002552379e-05, "loss": 1.4199, "step": 25150 }, { "epoch": 4.157818632513943, "grad_norm": 13.851326942443848, "learning_rate": 3.245560880662517e-05, "loss": 1.4482, "step": 25160 }, { "epoch": 4.159471183639744, "grad_norm": 11.560588836669922, "learning_rate": 3.2446427587726545e-05, "loss": 1.3819, "step": 25170 }, { "epoch": 4.161123734765544, "grad_norm": 12.765569686889648, "learning_rate": 3.243724636882793e-05, "loss": 1.5236, "step": 25180 }, { "epoch": 4.162776285891344, "grad_norm": 30.970333099365234, "learning_rate": 3.2428065149929304e-05, "loss": 1.5637, "step": 25190 }, { "epoch": 4.164428837017145, "grad_norm": 13.522059440612793, "learning_rate": 3.2418883931030686e-05, "loss": 1.4642, "step": 25200 }, { "epoch": 4.1660813881429455, "grad_norm": 9.605454444885254, "learning_rate": 3.240970271213206e-05, "loss": 1.4922, "step": 25210 }, { "epoch": 4.1677339392687465, "grad_norm": 15.900501251220703, "learning_rate": 3.2400521493233445e-05, "loss": 1.5052, "step": 25220 }, { "epoch": 4.169386490394547, "grad_norm": 21.861635208129883, "learning_rate": 3.239134027433483e-05, "loss": 1.5523, "step": 25230 }, { "epoch": 4.171039041520347, "grad_norm": 10.886978149414062, "learning_rate": 3.23821590554362e-05, "loss": 1.5286, "step": 25240 }, { "epoch": 4.172691592646148, "grad_norm": 13.990205764770508, "learning_rate": 3.237297783653758e-05, "loss": 1.5499, "step": 25250 }, { "epoch": 4.174344143771948, "grad_norm": 17.176815032958984, "learning_rate": 3.236379661763896e-05, "loss": 1.5911, "step": 25260 }, { "epoch": 4.175996694897749, "grad_norm": 7.191830635070801, "learning_rate": 3.235461539874034e-05, "loss": 1.4213, "step": 25270 }, { "epoch": 4.177649246023549, "grad_norm": 8.074373245239258, "learning_rate": 3.2345434179841714e-05, "loss": 1.4857, "step": 25280 }, { "epoch": 4.179301797149349, "grad_norm": 26.866851806640625, "learning_rate": 3.2336252960943096e-05, "loss": 1.629, "step": 25290 }, { "epoch": 4.18095434827515, "grad_norm": 9.89456558227539, "learning_rate": 3.232707174204447e-05, "loss": 1.5325, "step": 25300 }, { "epoch": 4.18260689940095, "grad_norm": 9.872194290161133, "learning_rate": 3.2317890523145855e-05, "loss": 1.5802, "step": 25310 }, { "epoch": 4.184259450526751, "grad_norm": 8.017595291137695, "learning_rate": 3.230870930424723e-05, "loss": 1.4805, "step": 25320 }, { "epoch": 4.185912001652551, "grad_norm": 6.862139701843262, "learning_rate": 3.229952808534861e-05, "loss": 1.4683, "step": 25330 }, { "epoch": 4.187564552778351, "grad_norm": 10.27568244934082, "learning_rate": 3.2290346866449996e-05, "loss": 1.5286, "step": 25340 }, { "epoch": 4.189217103904152, "grad_norm": 26.036792755126953, "learning_rate": 3.228116564755137e-05, "loss": 1.5395, "step": 25350 }, { "epoch": 4.190869655029952, "grad_norm": 6.976101398468018, "learning_rate": 3.2271984428652754e-05, "loss": 1.5876, "step": 25360 }, { "epoch": 4.192522206155753, "grad_norm": 13.752799987792969, "learning_rate": 3.226280320975413e-05, "loss": 1.4922, "step": 25370 }, { "epoch": 4.194174757281553, "grad_norm": 12.30428409576416, "learning_rate": 3.2253621990855506e-05, "loss": 1.5245, "step": 25380 }, { "epoch": 4.195827308407354, "grad_norm": 8.185288429260254, "learning_rate": 3.224444077195689e-05, "loss": 1.4679, "step": 25390 }, { "epoch": 4.197479859533154, "grad_norm": 11.520121574401855, "learning_rate": 3.2235259553058264e-05, "loss": 1.5309, "step": 25400 }, { "epoch": 4.1991324106589545, "grad_norm": 23.830415725708008, "learning_rate": 3.222607833415964e-05, "loss": 1.521, "step": 25410 }, { "epoch": 4.2007849617847555, "grad_norm": 13.380767822265625, "learning_rate": 3.221689711526102e-05, "loss": 1.6148, "step": 25420 }, { "epoch": 4.202437512910556, "grad_norm": 11.789327621459961, "learning_rate": 3.22077158963624e-05, "loss": 1.5611, "step": 25430 }, { "epoch": 4.2040900640363565, "grad_norm": 8.8134765625, "learning_rate": 3.219853467746378e-05, "loss": 1.4858, "step": 25440 }, { "epoch": 4.205742615162157, "grad_norm": 9.195273399353027, "learning_rate": 3.218935345856516e-05, "loss": 1.457, "step": 25450 }, { "epoch": 4.207395166287957, "grad_norm": 12.645697593688965, "learning_rate": 3.218017223966654e-05, "loss": 1.5891, "step": 25460 }, { "epoch": 4.209047717413758, "grad_norm": 14.299896240234375, "learning_rate": 3.217099102076792e-05, "loss": 1.6082, "step": 25470 }, { "epoch": 4.210700268539558, "grad_norm": 16.324865341186523, "learning_rate": 3.21618098018693e-05, "loss": 1.5836, "step": 25480 }, { "epoch": 4.212352819665359, "grad_norm": 16.070384979248047, "learning_rate": 3.215262858297068e-05, "loss": 1.5849, "step": 25490 }, { "epoch": 4.214005370791159, "grad_norm": 20.05198097229004, "learning_rate": 3.214344736407206e-05, "loss": 1.6418, "step": 25500 }, { "epoch": 4.215657921916959, "grad_norm": 15.854426383972168, "learning_rate": 3.213426614517343e-05, "loss": 1.5086, "step": 25510 }, { "epoch": 4.21731047304276, "grad_norm": 13.941832542419434, "learning_rate": 3.2125084926274815e-05, "loss": 1.6784, "step": 25520 }, { "epoch": 4.21896302416856, "grad_norm": 11.559526443481445, "learning_rate": 3.211590370737619e-05, "loss": 1.5038, "step": 25530 }, { "epoch": 4.220615575294361, "grad_norm": 8.316058158874512, "learning_rate": 3.210672248847757e-05, "loss": 1.6032, "step": 25540 }, { "epoch": 4.222268126420161, "grad_norm": 9.870329856872559, "learning_rate": 3.209754126957895e-05, "loss": 1.4683, "step": 25550 }, { "epoch": 4.223920677545961, "grad_norm": 7.676739692687988, "learning_rate": 3.2088360050680326e-05, "loss": 1.499, "step": 25560 }, { "epoch": 4.225573228671762, "grad_norm": 10.399463653564453, "learning_rate": 3.207917883178171e-05, "loss": 1.4412, "step": 25570 }, { "epoch": 4.227225779797562, "grad_norm": 15.944662094116211, "learning_rate": 3.206999761288309e-05, "loss": 1.5239, "step": 25580 }, { "epoch": 4.228878330923363, "grad_norm": 10.55685806274414, "learning_rate": 3.206081639398447e-05, "loss": 1.4544, "step": 25590 }, { "epoch": 4.230530882049163, "grad_norm": 16.161338806152344, "learning_rate": 3.205163517508585e-05, "loss": 1.4372, "step": 25600 }, { "epoch": 4.2321834331749635, "grad_norm": 11.07420539855957, "learning_rate": 3.2042453956187225e-05, "loss": 1.5521, "step": 25610 }, { "epoch": 4.2338359843007645, "grad_norm": 19.852649688720703, "learning_rate": 3.203327273728861e-05, "loss": 1.5212, "step": 25620 }, { "epoch": 4.235488535426565, "grad_norm": 12.890336036682129, "learning_rate": 3.2024091518389984e-05, "loss": 1.6077, "step": 25630 }, { "epoch": 4.2371410865523655, "grad_norm": 17.776052474975586, "learning_rate": 3.201491029949136e-05, "loss": 1.6323, "step": 25640 }, { "epoch": 4.238793637678166, "grad_norm": 14.739234924316406, "learning_rate": 3.200572908059274e-05, "loss": 1.5555, "step": 25650 }, { "epoch": 4.240446188803966, "grad_norm": 9.28357982635498, "learning_rate": 3.199654786169412e-05, "loss": 1.5533, "step": 25660 }, { "epoch": 4.242098739929767, "grad_norm": 9.304981231689453, "learning_rate": 3.1987366642795494e-05, "loss": 1.5796, "step": 25670 }, { "epoch": 4.243751291055567, "grad_norm": 17.264741897583008, "learning_rate": 3.197818542389688e-05, "loss": 1.3938, "step": 25680 }, { "epoch": 4.245403842181368, "grad_norm": 6.289753437042236, "learning_rate": 3.196900420499826e-05, "loss": 1.3865, "step": 25690 }, { "epoch": 4.247056393307168, "grad_norm": 7.785802364349365, "learning_rate": 3.1959822986099635e-05, "loss": 1.4776, "step": 25700 }, { "epoch": 4.248708944432968, "grad_norm": 39.08346939086914, "learning_rate": 3.195064176720102e-05, "loss": 1.5611, "step": 25710 }, { "epoch": 4.250361495558769, "grad_norm": 29.96815299987793, "learning_rate": 3.1941460548302394e-05, "loss": 1.5694, "step": 25720 }, { "epoch": 4.252014046684569, "grad_norm": 17.154983520507812, "learning_rate": 3.1932279329403776e-05, "loss": 1.5357, "step": 25730 }, { "epoch": 4.25366659781037, "grad_norm": 11.302206993103027, "learning_rate": 3.192309811050515e-05, "loss": 1.6063, "step": 25740 }, { "epoch": 4.25531914893617, "grad_norm": 9.660160064697266, "learning_rate": 3.1913916891606535e-05, "loss": 1.5413, "step": 25750 }, { "epoch": 4.256971700061971, "grad_norm": 7.870529651641846, "learning_rate": 3.190473567270791e-05, "loss": 1.5043, "step": 25760 }, { "epoch": 4.258624251187771, "grad_norm": 16.945133209228516, "learning_rate": 3.1895554453809286e-05, "loss": 1.5214, "step": 25770 }, { "epoch": 4.260276802313571, "grad_norm": 11.31995677947998, "learning_rate": 3.188637323491067e-05, "loss": 1.4101, "step": 25780 }, { "epoch": 4.261929353439372, "grad_norm": 16.450275421142578, "learning_rate": 3.1877192016012045e-05, "loss": 1.4787, "step": 25790 }, { "epoch": 4.263581904565172, "grad_norm": 10.560013771057129, "learning_rate": 3.186801079711343e-05, "loss": 1.49, "step": 25800 }, { "epoch": 4.2652344556909725, "grad_norm": 27.106367111206055, "learning_rate": 3.1858829578214803e-05, "loss": 1.5307, "step": 25810 }, { "epoch": 4.2668870068167735, "grad_norm": 9.301347732543945, "learning_rate": 3.1849648359316186e-05, "loss": 1.4364, "step": 25820 }, { "epoch": 4.268539557942574, "grad_norm": 13.334724426269531, "learning_rate": 3.184046714041756e-05, "loss": 1.6001, "step": 25830 }, { "epoch": 4.2701921090683745, "grad_norm": 9.709102630615234, "learning_rate": 3.1831285921518945e-05, "loss": 1.4029, "step": 25840 }, { "epoch": 4.271844660194175, "grad_norm": 28.314720153808594, "learning_rate": 3.182210470262032e-05, "loss": 1.4576, "step": 25850 }, { "epoch": 4.273497211319976, "grad_norm": 9.064886093139648, "learning_rate": 3.18129234837217e-05, "loss": 1.5087, "step": 25860 }, { "epoch": 4.275149762445776, "grad_norm": 8.588004112243652, "learning_rate": 3.180374226482308e-05, "loss": 1.4572, "step": 25870 }, { "epoch": 4.276802313571576, "grad_norm": 9.804033279418945, "learning_rate": 3.179456104592446e-05, "loss": 1.4764, "step": 25880 }, { "epoch": 4.278454864697377, "grad_norm": 65.4351806640625, "learning_rate": 3.178537982702584e-05, "loss": 1.5435, "step": 25890 }, { "epoch": 4.280107415823177, "grad_norm": 6.215262413024902, "learning_rate": 3.177619860812722e-05, "loss": 1.6816, "step": 25900 }, { "epoch": 4.281759966948978, "grad_norm": 105.8828353881836, "learning_rate": 3.1767017389228596e-05, "loss": 1.5335, "step": 25910 }, { "epoch": 4.283412518074778, "grad_norm": 13.918194770812988, "learning_rate": 3.175783617032997e-05, "loss": 1.5267, "step": 25920 }, { "epoch": 4.285065069200578, "grad_norm": 8.10260009765625, "learning_rate": 3.1748654951431354e-05, "loss": 1.4447, "step": 25930 }, { "epoch": 4.286717620326379, "grad_norm": 16.183929443359375, "learning_rate": 3.173947373253273e-05, "loss": 1.4827, "step": 25940 }, { "epoch": 4.288370171452179, "grad_norm": 9.055156707763672, "learning_rate": 3.173029251363411e-05, "loss": 1.506, "step": 25950 }, { "epoch": 4.29002272257798, "grad_norm": 7.047090530395508, "learning_rate": 3.172111129473549e-05, "loss": 1.5904, "step": 25960 }, { "epoch": 4.29167527370378, "grad_norm": 6.232375144958496, "learning_rate": 3.171193007583687e-05, "loss": 1.5418, "step": 25970 }, { "epoch": 4.29332782482958, "grad_norm": 12.954999923706055, "learning_rate": 3.170274885693825e-05, "loss": 1.5424, "step": 25980 }, { "epoch": 4.294980375955381, "grad_norm": 15.10276985168457, "learning_rate": 3.169356763803963e-05, "loss": 1.5881, "step": 25990 }, { "epoch": 4.296632927081181, "grad_norm": 14.072202682495117, "learning_rate": 3.1684386419141006e-05, "loss": 1.6264, "step": 26000 }, { "epoch": 4.298285478206982, "grad_norm": 11.788676261901855, "learning_rate": 3.167520520024239e-05, "loss": 1.4439, "step": 26010 }, { "epoch": 4.2999380293327825, "grad_norm": 9.451334953308105, "learning_rate": 3.1666023981343764e-05, "loss": 1.5252, "step": 26020 }, { "epoch": 4.301590580458583, "grad_norm": 22.73940086364746, "learning_rate": 3.165684276244515e-05, "loss": 1.4381, "step": 26030 }, { "epoch": 4.3032431315843835, "grad_norm": 10.139789581298828, "learning_rate": 3.164766154354652e-05, "loss": 1.4498, "step": 26040 }, { "epoch": 4.304895682710184, "grad_norm": 8.749555587768555, "learning_rate": 3.16384803246479e-05, "loss": 1.4908, "step": 26050 }, { "epoch": 4.306548233835985, "grad_norm": 4.8961405754089355, "learning_rate": 3.162929910574928e-05, "loss": 1.5442, "step": 26060 }, { "epoch": 4.308200784961785, "grad_norm": 25.065303802490234, "learning_rate": 3.162011788685066e-05, "loss": 1.5435, "step": 26070 }, { "epoch": 4.309853336087585, "grad_norm": 8.996482849121094, "learning_rate": 3.161093666795204e-05, "loss": 1.5444, "step": 26080 }, { "epoch": 4.311505887213386, "grad_norm": 7.40595817565918, "learning_rate": 3.1601755449053416e-05, "loss": 1.6196, "step": 26090 }, { "epoch": 4.313158438339186, "grad_norm": 10.102790832519531, "learning_rate": 3.15925742301548e-05, "loss": 1.6037, "step": 26100 }, { "epoch": 4.314810989464987, "grad_norm": 7.849573612213135, "learning_rate": 3.1583393011256174e-05, "loss": 1.5106, "step": 26110 }, { "epoch": 4.316463540590787, "grad_norm": 8.720571517944336, "learning_rate": 3.157421179235756e-05, "loss": 1.5115, "step": 26120 }, { "epoch": 4.318116091716588, "grad_norm": 25.104869842529297, "learning_rate": 3.156503057345893e-05, "loss": 1.5522, "step": 26130 }, { "epoch": 4.319768642842388, "grad_norm": 8.44864559173584, "learning_rate": 3.1555849354560315e-05, "loss": 1.5257, "step": 26140 }, { "epoch": 4.321421193968188, "grad_norm": 9.844928741455078, "learning_rate": 3.154666813566169e-05, "loss": 1.5298, "step": 26150 }, { "epoch": 4.323073745093989, "grad_norm": 7.365574836730957, "learning_rate": 3.1537486916763074e-05, "loss": 1.4086, "step": 26160 }, { "epoch": 4.324726296219789, "grad_norm": 5.956984043121338, "learning_rate": 3.152830569786445e-05, "loss": 1.5345, "step": 26170 }, { "epoch": 4.326378847345589, "grad_norm": 18.497196197509766, "learning_rate": 3.1519124478965825e-05, "loss": 1.6195, "step": 26180 }, { "epoch": 4.32803139847139, "grad_norm": 9.667485237121582, "learning_rate": 3.150994326006721e-05, "loss": 1.5314, "step": 26190 }, { "epoch": 4.32968394959719, "grad_norm": 10.36966609954834, "learning_rate": 3.1500762041168584e-05, "loss": 1.5571, "step": 26200 }, { "epoch": 4.331336500722991, "grad_norm": 10.898240089416504, "learning_rate": 3.1491580822269967e-05, "loss": 1.4782, "step": 26210 }, { "epoch": 4.3329890518487915, "grad_norm": 19.933618545532227, "learning_rate": 3.148239960337134e-05, "loss": 1.4672, "step": 26220 }, { "epoch": 4.3346416029745924, "grad_norm": 16.246917724609375, "learning_rate": 3.1473218384472725e-05, "loss": 1.4771, "step": 26230 }, { "epoch": 4.3362941541003925, "grad_norm": 25.84589385986328, "learning_rate": 3.14640371655741e-05, "loss": 1.4655, "step": 26240 }, { "epoch": 4.337946705226193, "grad_norm": 34.56238555908203, "learning_rate": 3.1454855946675484e-05, "loss": 1.616, "step": 26250 }, { "epoch": 4.339599256351994, "grad_norm": 15.874077796936035, "learning_rate": 3.1445674727776866e-05, "loss": 1.5548, "step": 26260 }, { "epoch": 4.341251807477794, "grad_norm": 20.784616470336914, "learning_rate": 3.143649350887824e-05, "loss": 1.5053, "step": 26270 }, { "epoch": 4.342904358603595, "grad_norm": 8.26366138458252, "learning_rate": 3.142731228997962e-05, "loss": 1.5606, "step": 26280 }, { "epoch": 4.344556909729395, "grad_norm": 13.238356590270996, "learning_rate": 3.1418131071081e-05, "loss": 1.5958, "step": 26290 }, { "epoch": 4.346209460855195, "grad_norm": 14.99146556854248, "learning_rate": 3.1408949852182376e-05, "loss": 1.5942, "step": 26300 }, { "epoch": 4.347862011980996, "grad_norm": 20.563129425048828, "learning_rate": 3.139976863328375e-05, "loss": 1.5517, "step": 26310 }, { "epoch": 4.349514563106796, "grad_norm": 8.888270378112793, "learning_rate": 3.1390587414385135e-05, "loss": 1.48, "step": 26320 }, { "epoch": 4.351167114232597, "grad_norm": 12.94379997253418, "learning_rate": 3.138140619548651e-05, "loss": 1.473, "step": 26330 }, { "epoch": 4.352819665358397, "grad_norm": 17.277467727661133, "learning_rate": 3.1372224976587893e-05, "loss": 1.4717, "step": 26340 }, { "epoch": 4.354472216484197, "grad_norm": 5.993367671966553, "learning_rate": 3.136304375768927e-05, "loss": 1.4952, "step": 26350 }, { "epoch": 4.356124767609998, "grad_norm": 30.13902473449707, "learning_rate": 3.135386253879065e-05, "loss": 1.5322, "step": 26360 }, { "epoch": 4.357777318735798, "grad_norm": 18.45423126220703, "learning_rate": 3.1344681319892035e-05, "loss": 1.4632, "step": 26370 }, { "epoch": 4.359429869861599, "grad_norm": 13.011897087097168, "learning_rate": 3.133550010099341e-05, "loss": 1.5353, "step": 26380 }, { "epoch": 4.361082420987399, "grad_norm": 31.791933059692383, "learning_rate": 3.132631888209479e-05, "loss": 1.3787, "step": 26390 }, { "epoch": 4.362734972113199, "grad_norm": 10.626348495483398, "learning_rate": 3.131713766319617e-05, "loss": 1.4245, "step": 26400 }, { "epoch": 4.364387523239, "grad_norm": 8.767387390136719, "learning_rate": 3.1307956444297545e-05, "loss": 1.5493, "step": 26410 }, { "epoch": 4.3660400743648005, "grad_norm": 9.301674842834473, "learning_rate": 3.129877522539893e-05, "loss": 1.5448, "step": 26420 }, { "epoch": 4.367692625490601, "grad_norm": 7.154281139373779, "learning_rate": 3.12895940065003e-05, "loss": 1.6527, "step": 26430 }, { "epoch": 4.3693451766164015, "grad_norm": 10.634243965148926, "learning_rate": 3.128041278760168e-05, "loss": 1.4756, "step": 26440 }, { "epoch": 4.370997727742202, "grad_norm": 23.75998878479004, "learning_rate": 3.127123156870306e-05, "loss": 1.4276, "step": 26450 }, { "epoch": 4.372650278868003, "grad_norm": 11.77112102508545, "learning_rate": 3.126205034980444e-05, "loss": 1.3796, "step": 26460 }, { "epoch": 4.374302829993803, "grad_norm": 15.972389221191406, "learning_rate": 3.125286913090582e-05, "loss": 1.4398, "step": 26470 }, { "epoch": 4.375955381119604, "grad_norm": 16.23674964904785, "learning_rate": 3.1243687912007196e-05, "loss": 1.5868, "step": 26480 }, { "epoch": 4.377607932245404, "grad_norm": 9.995187759399414, "learning_rate": 3.123450669310858e-05, "loss": 1.5567, "step": 26490 }, { "epoch": 4.379260483371204, "grad_norm": 14.570067405700684, "learning_rate": 3.122532547420996e-05, "loss": 1.5392, "step": 26500 }, { "epoch": 4.380913034497005, "grad_norm": 15.810830116271973, "learning_rate": 3.121614425531134e-05, "loss": 1.5493, "step": 26510 }, { "epoch": 4.382565585622805, "grad_norm": 11.888526916503906, "learning_rate": 3.120696303641272e-05, "loss": 1.4854, "step": 26520 }, { "epoch": 4.384218136748606, "grad_norm": 6.8027663230896, "learning_rate": 3.1197781817514096e-05, "loss": 1.5692, "step": 26530 }, { "epoch": 4.385870687874406, "grad_norm": 12.093624114990234, "learning_rate": 3.118860059861548e-05, "loss": 1.409, "step": 26540 }, { "epoch": 4.387523239000206, "grad_norm": 28.978029251098633, "learning_rate": 3.1179419379716854e-05, "loss": 1.5128, "step": 26550 }, { "epoch": 4.389175790126007, "grad_norm": 7.327620506286621, "learning_rate": 3.117023816081823e-05, "loss": 1.5797, "step": 26560 }, { "epoch": 4.390828341251807, "grad_norm": 10.677287101745605, "learning_rate": 3.1161056941919606e-05, "loss": 1.5038, "step": 26570 }, { "epoch": 4.392480892377608, "grad_norm": 9.653717994689941, "learning_rate": 3.115187572302099e-05, "loss": 1.6638, "step": 26580 }, { "epoch": 4.394133443503408, "grad_norm": 6.950878620147705, "learning_rate": 3.1142694504122364e-05, "loss": 1.4129, "step": 26590 }, { "epoch": 4.395785994629209, "grad_norm": 11.13272476196289, "learning_rate": 3.113351328522375e-05, "loss": 1.5374, "step": 26600 }, { "epoch": 4.397438545755009, "grad_norm": 9.883722305297852, "learning_rate": 3.112433206632513e-05, "loss": 1.6651, "step": 26610 }, { "epoch": 4.3990910968808095, "grad_norm": 9.824180603027344, "learning_rate": 3.1115150847426506e-05, "loss": 1.5053, "step": 26620 }, { "epoch": 4.40074364800661, "grad_norm": 11.171745300292969, "learning_rate": 3.110596962852789e-05, "loss": 1.4877, "step": 26630 }, { "epoch": 4.4023961991324105, "grad_norm": 12.247041702270508, "learning_rate": 3.1096788409629264e-05, "loss": 1.6003, "step": 26640 }, { "epoch": 4.4040487502582115, "grad_norm": 7.303963661193848, "learning_rate": 3.108760719073065e-05, "loss": 1.442, "step": 26650 }, { "epoch": 4.405701301384012, "grad_norm": 20.831327438354492, "learning_rate": 3.107842597183202e-05, "loss": 1.5416, "step": 26660 }, { "epoch": 4.407353852509812, "grad_norm": 7.863111972808838, "learning_rate": 3.1069244752933405e-05, "loss": 1.6196, "step": 26670 }, { "epoch": 4.409006403635613, "grad_norm": 9.105256080627441, "learning_rate": 3.106006353403478e-05, "loss": 1.4094, "step": 26680 }, { "epoch": 4.410658954761413, "grad_norm": 54.83008575439453, "learning_rate": 3.105088231513616e-05, "loss": 1.528, "step": 26690 }, { "epoch": 4.412311505887214, "grad_norm": 10.44033432006836, "learning_rate": 3.104170109623753e-05, "loss": 1.5623, "step": 26700 }, { "epoch": 4.413964057013014, "grad_norm": 8.483716011047363, "learning_rate": 3.1032519877338915e-05, "loss": 1.4732, "step": 26710 }, { "epoch": 4.415616608138814, "grad_norm": 10.583048820495605, "learning_rate": 3.10233386584403e-05, "loss": 1.5956, "step": 26720 }, { "epoch": 4.417269159264615, "grad_norm": 12.964887619018555, "learning_rate": 3.1014157439541674e-05, "loss": 1.3888, "step": 26730 }, { "epoch": 4.418921710390415, "grad_norm": 7.349795818328857, "learning_rate": 3.1004976220643057e-05, "loss": 1.4753, "step": 26740 }, { "epoch": 4.420574261516216, "grad_norm": 17.131244659423828, "learning_rate": 3.099579500174443e-05, "loss": 1.4025, "step": 26750 }, { "epoch": 4.422226812642016, "grad_norm": 7.006505012512207, "learning_rate": 3.0986613782845815e-05, "loss": 1.5483, "step": 26760 }, { "epoch": 4.423879363767816, "grad_norm": 7.9716620445251465, "learning_rate": 3.097743256394719e-05, "loss": 1.5293, "step": 26770 }, { "epoch": 4.425531914893617, "grad_norm": 10.431143760681152, "learning_rate": 3.0968251345048574e-05, "loss": 1.4546, "step": 26780 }, { "epoch": 4.427184466019417, "grad_norm": 15.854337692260742, "learning_rate": 3.095907012614995e-05, "loss": 1.3846, "step": 26790 }, { "epoch": 4.428837017145218, "grad_norm": 8.748208999633789, "learning_rate": 3.094988890725133e-05, "loss": 1.4832, "step": 26800 }, { "epoch": 4.430489568271018, "grad_norm": 10.268428802490234, "learning_rate": 3.094070768835271e-05, "loss": 1.4754, "step": 26810 }, { "epoch": 4.4321421193968185, "grad_norm": 8.496047973632812, "learning_rate": 3.0931526469454084e-05, "loss": 1.5358, "step": 26820 }, { "epoch": 4.433794670522619, "grad_norm": 6.902714729309082, "learning_rate": 3.0922345250555466e-05, "loss": 1.5926, "step": 26830 }, { "epoch": 4.4354472216484195, "grad_norm": 8.306710243225098, "learning_rate": 3.091316403165684e-05, "loss": 1.5422, "step": 26840 }, { "epoch": 4.4370997727742205, "grad_norm": 8.802634239196777, "learning_rate": 3.0903982812758225e-05, "loss": 1.4948, "step": 26850 }, { "epoch": 4.438752323900021, "grad_norm": 13.463654518127441, "learning_rate": 3.08948015938596e-05, "loss": 1.598, "step": 26860 }, { "epoch": 4.440404875025821, "grad_norm": 16.30317497253418, "learning_rate": 3.0885620374960983e-05, "loss": 1.4191, "step": 26870 }, { "epoch": 4.442057426151622, "grad_norm": 19.12843894958496, "learning_rate": 3.087643915606236e-05, "loss": 1.5168, "step": 26880 }, { "epoch": 4.443709977277422, "grad_norm": 8.68060302734375, "learning_rate": 3.086725793716374e-05, "loss": 1.4416, "step": 26890 }, { "epoch": 4.445362528403223, "grad_norm": 26.926401138305664, "learning_rate": 3.085807671826512e-05, "loss": 1.5008, "step": 26900 }, { "epoch": 4.447015079529023, "grad_norm": 12.683781623840332, "learning_rate": 3.08488954993665e-05, "loss": 1.6921, "step": 26910 }, { "epoch": 4.448667630654823, "grad_norm": 6.749821186065674, "learning_rate": 3.0839714280467876e-05, "loss": 1.5364, "step": 26920 }, { "epoch": 4.450320181780624, "grad_norm": 7.689281463623047, "learning_rate": 3.083053306156926e-05, "loss": 1.5276, "step": 26930 }, { "epoch": 4.451972732906424, "grad_norm": 10.439373016357422, "learning_rate": 3.0821351842670635e-05, "loss": 1.4987, "step": 26940 }, { "epoch": 4.453625284032225, "grad_norm": 15.080227851867676, "learning_rate": 3.081217062377201e-05, "loss": 1.5277, "step": 26950 }, { "epoch": 4.455277835158025, "grad_norm": 7.284642219543457, "learning_rate": 3.080298940487339e-05, "loss": 1.6219, "step": 26960 }, { "epoch": 4.456930386283826, "grad_norm": 26.771888732910156, "learning_rate": 3.079380818597477e-05, "loss": 1.3984, "step": 26970 }, { "epoch": 4.458582937409626, "grad_norm": 11.829761505126953, "learning_rate": 3.078462696707615e-05, "loss": 1.4057, "step": 26980 }, { "epoch": 4.460235488535426, "grad_norm": 7.455525875091553, "learning_rate": 3.077544574817753e-05, "loss": 1.5112, "step": 26990 }, { "epoch": 4.461888039661227, "grad_norm": 18.92814826965332, "learning_rate": 3.076626452927891e-05, "loss": 1.5195, "step": 27000 }, { "epoch": 4.463540590787027, "grad_norm": 10.309431076049805, "learning_rate": 3.0757083310380286e-05, "loss": 1.5205, "step": 27010 }, { "epoch": 4.4651931419128275, "grad_norm": 6.530293941497803, "learning_rate": 3.074790209148167e-05, "loss": 1.5346, "step": 27020 }, { "epoch": 4.466845693038628, "grad_norm": 13.004956245422363, "learning_rate": 3.0738720872583045e-05, "loss": 1.5465, "step": 27030 }, { "epoch": 4.4684982441644285, "grad_norm": 9.708096504211426, "learning_rate": 3.072953965368443e-05, "loss": 1.5709, "step": 27040 }, { "epoch": 4.4701507952902295, "grad_norm": 32.74457931518555, "learning_rate": 3.07203584347858e-05, "loss": 1.6366, "step": 27050 }, { "epoch": 4.47180334641603, "grad_norm": 16.446645736694336, "learning_rate": 3.0711177215887186e-05, "loss": 1.456, "step": 27060 }, { "epoch": 4.473455897541831, "grad_norm": 11.239441871643066, "learning_rate": 3.070199599698856e-05, "loss": 1.4364, "step": 27070 }, { "epoch": 4.475108448667631, "grad_norm": 12.089319229125977, "learning_rate": 3.069281477808994e-05, "loss": 1.4886, "step": 27080 }, { "epoch": 4.476760999793431, "grad_norm": 8.01862621307373, "learning_rate": 3.068363355919132e-05, "loss": 1.5668, "step": 27090 }, { "epoch": 4.478413550919232, "grad_norm": 10.913430213928223, "learning_rate": 3.0674452340292696e-05, "loss": 1.5688, "step": 27100 }, { "epoch": 4.480066102045032, "grad_norm": 50.19123458862305, "learning_rate": 3.066527112139408e-05, "loss": 1.5296, "step": 27110 }, { "epoch": 4.481718653170833, "grad_norm": 7.208930015563965, "learning_rate": 3.0656089902495454e-05, "loss": 1.5731, "step": 27120 }, { "epoch": 4.483371204296633, "grad_norm": 8.685304641723633, "learning_rate": 3.064690868359684e-05, "loss": 1.4869, "step": 27130 }, { "epoch": 4.485023755422433, "grad_norm": 10.306434631347656, "learning_rate": 3.063772746469821e-05, "loss": 1.5608, "step": 27140 }, { "epoch": 4.486676306548234, "grad_norm": 15.503512382507324, "learning_rate": 3.0628546245799596e-05, "loss": 1.5302, "step": 27150 }, { "epoch": 4.488328857674034, "grad_norm": 11.12451171875, "learning_rate": 3.061936502690097e-05, "loss": 1.3706, "step": 27160 }, { "epoch": 4.489981408799835, "grad_norm": 8.692049026489258, "learning_rate": 3.0610183808002354e-05, "loss": 1.4919, "step": 27170 }, { "epoch": 4.491633959925635, "grad_norm": 12.720965385437012, "learning_rate": 3.060100258910373e-05, "loss": 1.63, "step": 27180 }, { "epoch": 4.493286511051435, "grad_norm": 7.457226753234863, "learning_rate": 3.059182137020511e-05, "loss": 1.7286, "step": 27190 }, { "epoch": 4.494939062177236, "grad_norm": 10.036781311035156, "learning_rate": 3.058264015130649e-05, "loss": 1.4945, "step": 27200 }, { "epoch": 4.496591613303036, "grad_norm": 9.686598777770996, "learning_rate": 3.0573458932407864e-05, "loss": 1.4577, "step": 27210 }, { "epoch": 4.498244164428837, "grad_norm": 10.819511413574219, "learning_rate": 3.056427771350925e-05, "loss": 1.4983, "step": 27220 }, { "epoch": 4.499896715554637, "grad_norm": 6.3481879234313965, "learning_rate": 3.055509649461062e-05, "loss": 1.5714, "step": 27230 }, { "epoch": 4.501549266680438, "grad_norm": 6.407865524291992, "learning_rate": 3.0545915275712005e-05, "loss": 1.4642, "step": 27240 }, { "epoch": 4.5032018178062385, "grad_norm": 22.46175193786621, "learning_rate": 3.053673405681338e-05, "loss": 1.5947, "step": 27250 }, { "epoch": 4.504854368932039, "grad_norm": 12.746009826660156, "learning_rate": 3.0527552837914764e-05, "loss": 1.4251, "step": 27260 }, { "epoch": 4.50650692005784, "grad_norm": 19.7728328704834, "learning_rate": 3.051837161901614e-05, "loss": 1.5405, "step": 27270 }, { "epoch": 4.50815947118364, "grad_norm": 15.723058700561523, "learning_rate": 3.0509190400117522e-05, "loss": 1.6367, "step": 27280 }, { "epoch": 4.50981202230944, "grad_norm": 8.419683456420898, "learning_rate": 3.05000091812189e-05, "loss": 1.5474, "step": 27290 }, { "epoch": 4.511464573435241, "grad_norm": 13.057724952697754, "learning_rate": 3.0490827962320277e-05, "loss": 1.4442, "step": 27300 }, { "epoch": 4.513117124561041, "grad_norm": 12.913500785827637, "learning_rate": 3.048164674342166e-05, "loss": 1.5384, "step": 27310 }, { "epoch": 4.514769675686842, "grad_norm": 7.9755659103393555, "learning_rate": 3.0472465524523036e-05, "loss": 1.5771, "step": 27320 }, { "epoch": 4.516422226812642, "grad_norm": 8.68862247467041, "learning_rate": 3.046328430562442e-05, "loss": 1.3936, "step": 27330 }, { "epoch": 4.518074777938443, "grad_norm": 10.90861701965332, "learning_rate": 3.0454103086725794e-05, "loss": 1.4325, "step": 27340 }, { "epoch": 4.519727329064243, "grad_norm": 10.610472679138184, "learning_rate": 3.0444921867827174e-05, "loss": 1.4947, "step": 27350 }, { "epoch": 4.521379880190043, "grad_norm": 14.162581443786621, "learning_rate": 3.043574064892855e-05, "loss": 1.4768, "step": 27360 }, { "epoch": 4.523032431315844, "grad_norm": 7.975509166717529, "learning_rate": 3.0426559430029932e-05, "loss": 1.3613, "step": 27370 }, { "epoch": 4.524684982441644, "grad_norm": 11.789381980895996, "learning_rate": 3.0417378211131308e-05, "loss": 1.5751, "step": 27380 }, { "epoch": 4.526337533567444, "grad_norm": 8.49382495880127, "learning_rate": 3.040819699223269e-05, "loss": 1.565, "step": 27390 }, { "epoch": 4.527990084693245, "grad_norm": 11.56302547454834, "learning_rate": 3.0399015773334073e-05, "loss": 1.6405, "step": 27400 }, { "epoch": 4.529642635819045, "grad_norm": 7.2713494300842285, "learning_rate": 3.038983455443545e-05, "loss": 1.5445, "step": 27410 }, { "epoch": 4.531295186944846, "grad_norm": 40.96128463745117, "learning_rate": 3.038065333553683e-05, "loss": 1.5608, "step": 27420 }, { "epoch": 4.532947738070646, "grad_norm": 6.933916091918945, "learning_rate": 3.0371472116638204e-05, "loss": 1.7051, "step": 27430 }, { "epoch": 4.534600289196447, "grad_norm": 9.174333572387695, "learning_rate": 3.0362290897739587e-05, "loss": 1.6537, "step": 27440 }, { "epoch": 4.5362528403222475, "grad_norm": 11.921561241149902, "learning_rate": 3.0353109678840963e-05, "loss": 1.458, "step": 27450 }, { "epoch": 4.537905391448048, "grad_norm": 11.688679695129395, "learning_rate": 3.0343928459942345e-05, "loss": 1.4882, "step": 27460 }, { "epoch": 4.539557942573849, "grad_norm": 9.94924545288086, "learning_rate": 3.033474724104372e-05, "loss": 1.3484, "step": 27470 }, { "epoch": 4.541210493699649, "grad_norm": 14.807235717773438, "learning_rate": 3.03255660221451e-05, "loss": 1.5368, "step": 27480 }, { "epoch": 4.54286304482545, "grad_norm": 7.9106597900390625, "learning_rate": 3.0316384803246476e-05, "loss": 1.5756, "step": 27490 }, { "epoch": 4.54451559595125, "grad_norm": 17.72800636291504, "learning_rate": 3.030720358434786e-05, "loss": 1.6303, "step": 27500 }, { "epoch": 4.54616814707705, "grad_norm": 14.3507080078125, "learning_rate": 3.0298022365449242e-05, "loss": 1.488, "step": 27510 }, { "epoch": 4.547820698202851, "grad_norm": 11.83436107635498, "learning_rate": 3.0288841146550618e-05, "loss": 1.4159, "step": 27520 }, { "epoch": 4.549473249328651, "grad_norm": 32.23595428466797, "learning_rate": 3.0279659927652e-05, "loss": 1.4765, "step": 27530 }, { "epoch": 4.551125800454452, "grad_norm": 7.518164157867432, "learning_rate": 3.0270478708753376e-05, "loss": 1.5396, "step": 27540 }, { "epoch": 4.552778351580252, "grad_norm": 12.867490768432617, "learning_rate": 3.0261297489854755e-05, "loss": 1.6184, "step": 27550 }, { "epoch": 4.554430902706052, "grad_norm": 59.21015548706055, "learning_rate": 3.025211627095613e-05, "loss": 1.3638, "step": 27560 }, { "epoch": 4.556083453831853, "grad_norm": 8.144857406616211, "learning_rate": 3.0242935052057514e-05, "loss": 1.6169, "step": 27570 }, { "epoch": 4.557736004957653, "grad_norm": 13.78732681274414, "learning_rate": 3.023375383315889e-05, "loss": 1.5019, "step": 27580 }, { "epoch": 4.559388556083454, "grad_norm": 12.740043640136719, "learning_rate": 3.0224572614260272e-05, "loss": 1.5669, "step": 27590 }, { "epoch": 4.561041107209254, "grad_norm": 7.079008102416992, "learning_rate": 3.0215391395361648e-05, "loss": 1.5509, "step": 27600 }, { "epoch": 4.562693658335054, "grad_norm": 13.054255485534668, "learning_rate": 3.0206210176463027e-05, "loss": 1.443, "step": 27610 }, { "epoch": 4.564346209460855, "grad_norm": 8.606029510498047, "learning_rate": 3.0197028957564407e-05, "loss": 1.5148, "step": 27620 }, { "epoch": 4.565998760586655, "grad_norm": 6.800489902496338, "learning_rate": 3.0187847738665786e-05, "loss": 1.5745, "step": 27630 }, { "epoch": 4.567651311712456, "grad_norm": 10.607884407043457, "learning_rate": 3.017866651976717e-05, "loss": 1.5926, "step": 27640 }, { "epoch": 4.5693038628382565, "grad_norm": 9.12733268737793, "learning_rate": 3.0169485300868544e-05, "loss": 1.5111, "step": 27650 }, { "epoch": 4.570956413964057, "grad_norm": 7.448328971862793, "learning_rate": 3.0160304081969927e-05, "loss": 1.6386, "step": 27660 }, { "epoch": 4.572608965089858, "grad_norm": 21.30801010131836, "learning_rate": 3.0151122863071303e-05, "loss": 1.4581, "step": 27670 }, { "epoch": 4.574261516215658, "grad_norm": 19.67100715637207, "learning_rate": 3.0141941644172682e-05, "loss": 1.4829, "step": 27680 }, { "epoch": 4.575914067341459, "grad_norm": 14.657584190368652, "learning_rate": 3.0132760425274058e-05, "loss": 1.5703, "step": 27690 }, { "epoch": 4.577566618467259, "grad_norm": 7.990481853485107, "learning_rate": 3.012357920637544e-05, "loss": 1.4242, "step": 27700 }, { "epoch": 4.57921916959306, "grad_norm": 8.869722366333008, "learning_rate": 3.0114397987476816e-05, "loss": 1.5055, "step": 27710 }, { "epoch": 4.58087172071886, "grad_norm": 11.212498664855957, "learning_rate": 3.01052167685782e-05, "loss": 1.4827, "step": 27720 }, { "epoch": 4.58252427184466, "grad_norm": 10.676173210144043, "learning_rate": 3.0096035549679575e-05, "loss": 1.5627, "step": 27730 }, { "epoch": 4.584176822970461, "grad_norm": 25.556760787963867, "learning_rate": 3.0086854330780954e-05, "loss": 1.5832, "step": 27740 }, { "epoch": 4.585829374096261, "grad_norm": 8.269768714904785, "learning_rate": 3.0077673111882337e-05, "loss": 1.5174, "step": 27750 }, { "epoch": 4.587481925222061, "grad_norm": 11.795088768005371, "learning_rate": 3.0068491892983713e-05, "loss": 1.5345, "step": 27760 }, { "epoch": 4.589134476347862, "grad_norm": 12.628785133361816, "learning_rate": 3.0059310674085095e-05, "loss": 1.4388, "step": 27770 }, { "epoch": 4.590787027473662, "grad_norm": 8.711159706115723, "learning_rate": 3.005012945518647e-05, "loss": 1.4389, "step": 27780 }, { "epoch": 4.592439578599463, "grad_norm": 5.210586071014404, "learning_rate": 3.0040948236287854e-05, "loss": 1.4444, "step": 27790 }, { "epoch": 4.594092129725263, "grad_norm": 11.018863677978516, "learning_rate": 3.003176701738923e-05, "loss": 1.4392, "step": 27800 }, { "epoch": 4.595744680851064, "grad_norm": 8.213711738586426, "learning_rate": 3.002258579849061e-05, "loss": 1.4233, "step": 27810 }, { "epoch": 4.597397231976864, "grad_norm": 22.795164108276367, "learning_rate": 3.0013404579591985e-05, "loss": 1.5016, "step": 27820 }, { "epoch": 4.599049783102664, "grad_norm": 10.903739929199219, "learning_rate": 3.0004223360693367e-05, "loss": 1.456, "step": 27830 }, { "epoch": 4.600702334228465, "grad_norm": 13.617247581481934, "learning_rate": 2.9995042141794743e-05, "loss": 1.6139, "step": 27840 }, { "epoch": 4.6023548853542655, "grad_norm": 11.749178886413574, "learning_rate": 2.9985860922896126e-05, "loss": 1.4379, "step": 27850 }, { "epoch": 4.604007436480066, "grad_norm": 11.892571449279785, "learning_rate": 2.9976679703997505e-05, "loss": 1.5134, "step": 27860 }, { "epoch": 4.605659987605867, "grad_norm": 16.06785011291504, "learning_rate": 2.996749848509888e-05, "loss": 1.5021, "step": 27870 }, { "epoch": 4.607312538731667, "grad_norm": 11.480622291564941, "learning_rate": 2.9958317266200264e-05, "loss": 1.5267, "step": 27880 }, { "epoch": 4.608965089857468, "grad_norm": 11.960569381713867, "learning_rate": 2.994913604730164e-05, "loss": 1.5191, "step": 27890 }, { "epoch": 4.610617640983268, "grad_norm": 10.92312240600586, "learning_rate": 2.9939954828403022e-05, "loss": 1.6442, "step": 27900 }, { "epoch": 4.612270192109069, "grad_norm": 8.785314559936523, "learning_rate": 2.9930773609504398e-05, "loss": 1.6221, "step": 27910 }, { "epoch": 4.613922743234869, "grad_norm": 10.37857723236084, "learning_rate": 2.992159239060578e-05, "loss": 1.4723, "step": 27920 }, { "epoch": 4.615575294360669, "grad_norm": 47.27583694458008, "learning_rate": 2.9912411171707157e-05, "loss": 1.5619, "step": 27930 }, { "epoch": 4.61722784548647, "grad_norm": 28.969919204711914, "learning_rate": 2.9903229952808536e-05, "loss": 1.3872, "step": 27940 }, { "epoch": 4.61888039661227, "grad_norm": 16.729145050048828, "learning_rate": 2.989404873390991e-05, "loss": 1.5894, "step": 27950 }, { "epoch": 4.620532947738071, "grad_norm": 6.659755229949951, "learning_rate": 2.9884867515011294e-05, "loss": 1.6566, "step": 27960 }, { "epoch": 4.622185498863871, "grad_norm": 15.295912742614746, "learning_rate": 2.9875686296112677e-05, "loss": 1.4874, "step": 27970 }, { "epoch": 4.623838049989671, "grad_norm": 9.182865142822266, "learning_rate": 2.9866505077214053e-05, "loss": 1.6185, "step": 27980 }, { "epoch": 4.625490601115472, "grad_norm": 12.22716999053955, "learning_rate": 2.9857323858315432e-05, "loss": 1.4013, "step": 27990 }, { "epoch": 4.627143152241272, "grad_norm": 14.52023983001709, "learning_rate": 2.9848142639416808e-05, "loss": 1.6018, "step": 28000 }, { "epoch": 4.628795703367073, "grad_norm": 9.691109657287598, "learning_rate": 2.983896142051819e-05, "loss": 1.4829, "step": 28010 }, { "epoch": 4.630448254492873, "grad_norm": 7.735055446624756, "learning_rate": 2.9829780201619566e-05, "loss": 1.5496, "step": 28020 }, { "epoch": 4.632100805618673, "grad_norm": 9.349820137023926, "learning_rate": 2.982059898272095e-05, "loss": 1.5716, "step": 28030 }, { "epoch": 4.633753356744474, "grad_norm": 47.04668426513672, "learning_rate": 2.9811417763822325e-05, "loss": 1.5508, "step": 28040 }, { "epoch": 4.6354059078702745, "grad_norm": 11.281471252441406, "learning_rate": 2.9802236544923708e-05, "loss": 1.5374, "step": 28050 }, { "epoch": 4.6370584589960755, "grad_norm": 10.420440673828125, "learning_rate": 2.9793055326025083e-05, "loss": 1.4952, "step": 28060 }, { "epoch": 4.638711010121876, "grad_norm": 7.602105140686035, "learning_rate": 2.9783874107126463e-05, "loss": 1.6077, "step": 28070 }, { "epoch": 4.640363561247677, "grad_norm": 11.72873306274414, "learning_rate": 2.9774692888227845e-05, "loss": 1.517, "step": 28080 }, { "epoch": 4.642016112373477, "grad_norm": 5.891235828399658, "learning_rate": 2.976551166932922e-05, "loss": 1.5004, "step": 28090 }, { "epoch": 4.643668663499277, "grad_norm": 8.504074096679688, "learning_rate": 2.9756330450430604e-05, "loss": 1.4433, "step": 28100 }, { "epoch": 4.645321214625078, "grad_norm": 11.097142219543457, "learning_rate": 2.974714923153198e-05, "loss": 1.517, "step": 28110 }, { "epoch": 4.646973765750878, "grad_norm": 25.318063735961914, "learning_rate": 2.973796801263336e-05, "loss": 1.475, "step": 28120 }, { "epoch": 4.648626316876678, "grad_norm": 12.631922721862793, "learning_rate": 2.9728786793734735e-05, "loss": 1.6466, "step": 28130 }, { "epoch": 4.650278868002479, "grad_norm": 18.910295486450195, "learning_rate": 2.9719605574836117e-05, "loss": 1.4223, "step": 28140 }, { "epoch": 4.651931419128279, "grad_norm": 10.944915771484375, "learning_rate": 2.9710424355937493e-05, "loss": 1.4081, "step": 28150 }, { "epoch": 4.65358397025408, "grad_norm": 36.19316482543945, "learning_rate": 2.9701243137038876e-05, "loss": 1.5777, "step": 28160 }, { "epoch": 4.65523652137988, "grad_norm": 11.189594268798828, "learning_rate": 2.9692061918140252e-05, "loss": 1.6023, "step": 28170 }, { "epoch": 4.656889072505681, "grad_norm": 8.958883285522461, "learning_rate": 2.9682880699241634e-05, "loss": 1.4499, "step": 28180 }, { "epoch": 4.658541623631481, "grad_norm": 21.078588485717773, "learning_rate": 2.967369948034301e-05, "loss": 1.5326, "step": 28190 }, { "epoch": 4.660194174757281, "grad_norm": 9.72829818725586, "learning_rate": 2.966451826144439e-05, "loss": 1.665, "step": 28200 }, { "epoch": 4.661846725883082, "grad_norm": 6.814817905426025, "learning_rate": 2.9655337042545772e-05, "loss": 1.4978, "step": 28210 }, { "epoch": 4.663499277008882, "grad_norm": 12.29881763458252, "learning_rate": 2.9646155823647148e-05, "loss": 1.5255, "step": 28220 }, { "epoch": 4.665151828134682, "grad_norm": 9.907049179077148, "learning_rate": 2.963697460474853e-05, "loss": 1.5061, "step": 28230 }, { "epoch": 4.666804379260483, "grad_norm": 8.112325668334961, "learning_rate": 2.9627793385849906e-05, "loss": 1.5927, "step": 28240 }, { "epoch": 4.6684569303862835, "grad_norm": 31.478939056396484, "learning_rate": 2.9618612166951286e-05, "loss": 1.4132, "step": 28250 }, { "epoch": 4.6701094815120845, "grad_norm": 8.31009292602539, "learning_rate": 2.9609430948052665e-05, "loss": 1.409, "step": 28260 }, { "epoch": 4.671762032637885, "grad_norm": 69.70064544677734, "learning_rate": 2.9600249729154044e-05, "loss": 1.646, "step": 28270 }, { "epoch": 4.673414583763686, "grad_norm": 8.266491889953613, "learning_rate": 2.959106851025542e-05, "loss": 1.5419, "step": 28280 }, { "epoch": 4.675067134889486, "grad_norm": 7.468424320220947, "learning_rate": 2.9581887291356803e-05, "loss": 1.4903, "step": 28290 }, { "epoch": 4.676719686015286, "grad_norm": 8.034427642822266, "learning_rate": 2.957270607245818e-05, "loss": 1.5049, "step": 28300 }, { "epoch": 4.678372237141087, "grad_norm": 6.364953517913818, "learning_rate": 2.956352485355956e-05, "loss": 1.4946, "step": 28310 }, { "epoch": 4.680024788266887, "grad_norm": 6.952188014984131, "learning_rate": 2.955434363466094e-05, "loss": 1.585, "step": 28320 }, { "epoch": 4.681677339392688, "grad_norm": 6.363594055175781, "learning_rate": 2.9545162415762316e-05, "loss": 1.4206, "step": 28330 }, { "epoch": 4.683329890518488, "grad_norm": 12.087472915649414, "learning_rate": 2.95359811968637e-05, "loss": 1.5401, "step": 28340 }, { "epoch": 4.684982441644288, "grad_norm": 7.647066593170166, "learning_rate": 2.9526799977965075e-05, "loss": 1.626, "step": 28350 }, { "epoch": 4.686634992770089, "grad_norm": 10.387514114379883, "learning_rate": 2.9517618759066457e-05, "loss": 1.5102, "step": 28360 }, { "epoch": 4.688287543895889, "grad_norm": 10.926636695861816, "learning_rate": 2.9508437540167833e-05, "loss": 1.511, "step": 28370 }, { "epoch": 4.68994009502169, "grad_norm": 10.67368221282959, "learning_rate": 2.9499256321269213e-05, "loss": 1.4165, "step": 28380 }, { "epoch": 4.69159264614749, "grad_norm": 9.141593933105469, "learning_rate": 2.9490075102370592e-05, "loss": 1.5694, "step": 28390 }, { "epoch": 4.69324519727329, "grad_norm": 6.583410739898682, "learning_rate": 2.948089388347197e-05, "loss": 1.5357, "step": 28400 }, { "epoch": 4.694897748399091, "grad_norm": 10.10991382598877, "learning_rate": 2.9471712664573347e-05, "loss": 1.5769, "step": 28410 }, { "epoch": 4.696550299524891, "grad_norm": 7.276322841644287, "learning_rate": 2.946253144567473e-05, "loss": 1.6132, "step": 28420 }, { "epoch": 4.698202850650692, "grad_norm": 6.623241901397705, "learning_rate": 2.9453350226776112e-05, "loss": 1.449, "step": 28430 }, { "epoch": 4.699855401776492, "grad_norm": 8.261223793029785, "learning_rate": 2.9444169007877488e-05, "loss": 1.4652, "step": 28440 }, { "epoch": 4.701507952902293, "grad_norm": 38.858604431152344, "learning_rate": 2.9434987788978867e-05, "loss": 1.5068, "step": 28450 }, { "epoch": 4.7031605040280935, "grad_norm": 15.038886070251465, "learning_rate": 2.9425806570080243e-05, "loss": 1.636, "step": 28460 }, { "epoch": 4.704813055153894, "grad_norm": 11.770708084106445, "learning_rate": 2.9416625351181626e-05, "loss": 1.5393, "step": 28470 }, { "epoch": 4.706465606279695, "grad_norm": 24.59950065612793, "learning_rate": 2.9407444132283e-05, "loss": 1.508, "step": 28480 }, { "epoch": 4.708118157405495, "grad_norm": 8.745185852050781, "learning_rate": 2.9398262913384384e-05, "loss": 1.6451, "step": 28490 }, { "epoch": 4.709770708531295, "grad_norm": 12.335895538330078, "learning_rate": 2.938908169448576e-05, "loss": 1.487, "step": 28500 }, { "epoch": 4.711423259657096, "grad_norm": 38.088829040527344, "learning_rate": 2.937990047558714e-05, "loss": 1.4954, "step": 28510 }, { "epoch": 4.713075810782896, "grad_norm": 12.632776260375977, "learning_rate": 2.937071925668852e-05, "loss": 1.5633, "step": 28520 }, { "epoch": 4.714728361908697, "grad_norm": 11.09786319732666, "learning_rate": 2.9361538037789898e-05, "loss": 1.4451, "step": 28530 }, { "epoch": 4.716380913034497, "grad_norm": 11.804417610168457, "learning_rate": 2.935235681889128e-05, "loss": 1.596, "step": 28540 }, { "epoch": 4.718033464160298, "grad_norm": 10.606738090515137, "learning_rate": 2.9343175599992656e-05, "loss": 1.4794, "step": 28550 }, { "epoch": 4.719686015286098, "grad_norm": 8.68057632446289, "learning_rate": 2.933399438109404e-05, "loss": 1.5829, "step": 28560 }, { "epoch": 4.721338566411898, "grad_norm": 9.055428504943848, "learning_rate": 2.9324813162195415e-05, "loss": 1.5983, "step": 28570 }, { "epoch": 4.722991117537699, "grad_norm": 18.714929580688477, "learning_rate": 2.9315631943296794e-05, "loss": 1.5333, "step": 28580 }, { "epoch": 4.724643668663499, "grad_norm": 11.138300895690918, "learning_rate": 2.930645072439817e-05, "loss": 1.5573, "step": 28590 }, { "epoch": 4.726296219789299, "grad_norm": 11.27102279663086, "learning_rate": 2.9297269505499553e-05, "loss": 1.4813, "step": 28600 }, { "epoch": 4.7279487709151, "grad_norm": 9.76452350616455, "learning_rate": 2.928808828660093e-05, "loss": 1.6399, "step": 28610 }, { "epoch": 4.7296013220409, "grad_norm": 9.003868103027344, "learning_rate": 2.927890706770231e-05, "loss": 1.5414, "step": 28620 }, { "epoch": 4.731253873166701, "grad_norm": 8.384162902832031, "learning_rate": 2.9269725848803687e-05, "loss": 1.4369, "step": 28630 }, { "epoch": 4.732906424292501, "grad_norm": 13.919119834899902, "learning_rate": 2.9260544629905066e-05, "loss": 1.4788, "step": 28640 }, { "epoch": 4.734558975418302, "grad_norm": 9.051968574523926, "learning_rate": 2.925136341100645e-05, "loss": 1.6114, "step": 28650 }, { "epoch": 4.7362115265441025, "grad_norm": 12.999975204467773, "learning_rate": 2.9242182192107825e-05, "loss": 1.4965, "step": 28660 }, { "epoch": 4.737864077669903, "grad_norm": 9.09343147277832, "learning_rate": 2.9233000973209207e-05, "loss": 1.5995, "step": 28670 }, { "epoch": 4.739516628795704, "grad_norm": 9.132031440734863, "learning_rate": 2.9223819754310583e-05, "loss": 1.4807, "step": 28680 }, { "epoch": 4.741169179921504, "grad_norm": 7.520742416381836, "learning_rate": 2.9214638535411966e-05, "loss": 1.5562, "step": 28690 }, { "epoch": 4.742821731047305, "grad_norm": 8.723919868469238, "learning_rate": 2.920545731651334e-05, "loss": 1.4716, "step": 28700 }, { "epoch": 4.744474282173105, "grad_norm": 22.840511322021484, "learning_rate": 2.919627609761472e-05, "loss": 1.5385, "step": 28710 }, { "epoch": 4.746126833298905, "grad_norm": 8.98953628540039, "learning_rate": 2.9187094878716097e-05, "loss": 1.4941, "step": 28720 }, { "epoch": 4.747779384424706, "grad_norm": 8.429163932800293, "learning_rate": 2.917791365981748e-05, "loss": 1.5198, "step": 28730 }, { "epoch": 4.749431935550506, "grad_norm": 12.46285629272461, "learning_rate": 2.9168732440918855e-05, "loss": 1.4711, "step": 28740 }, { "epoch": 4.751084486676307, "grad_norm": 10.32840633392334, "learning_rate": 2.9159551222020238e-05, "loss": 1.4423, "step": 28750 }, { "epoch": 4.752737037802107, "grad_norm": 5.583861827850342, "learning_rate": 2.9150370003121614e-05, "loss": 1.6284, "step": 28760 }, { "epoch": 4.754389588927907, "grad_norm": 9.908193588256836, "learning_rate": 2.9141188784222993e-05, "loss": 1.5041, "step": 28770 }, { "epoch": 4.756042140053708, "grad_norm": 11.036250114440918, "learning_rate": 2.9132007565324376e-05, "loss": 1.529, "step": 28780 }, { "epoch": 4.757694691179508, "grad_norm": 13.065105438232422, "learning_rate": 2.912282634642575e-05, "loss": 1.5542, "step": 28790 }, { "epoch": 4.759347242305309, "grad_norm": 11.913582801818848, "learning_rate": 2.9113645127527134e-05, "loss": 1.6069, "step": 28800 }, { "epoch": 4.760999793431109, "grad_norm": 9.790139198303223, "learning_rate": 2.910446390862851e-05, "loss": 1.5313, "step": 28810 }, { "epoch": 4.76265234455691, "grad_norm": 10.083866119384766, "learning_rate": 2.9095282689729893e-05, "loss": 1.5612, "step": 28820 }, { "epoch": 4.76430489568271, "grad_norm": 17.150331497192383, "learning_rate": 2.908610147083127e-05, "loss": 1.5995, "step": 28830 }, { "epoch": 4.76595744680851, "grad_norm": 13.593814849853516, "learning_rate": 2.9076920251932648e-05, "loss": 1.4406, "step": 28840 }, { "epoch": 4.767609997934311, "grad_norm": 8.73280143737793, "learning_rate": 2.9067739033034024e-05, "loss": 1.6393, "step": 28850 }, { "epoch": 4.7692625490601115, "grad_norm": 11.49311351776123, "learning_rate": 2.9058557814135406e-05, "loss": 1.5007, "step": 28860 }, { "epoch": 4.770915100185912, "grad_norm": 18.270648956298828, "learning_rate": 2.9049376595236782e-05, "loss": 1.4804, "step": 28870 }, { "epoch": 4.772567651311713, "grad_norm": 7.664954662322998, "learning_rate": 2.9040195376338165e-05, "loss": 1.5895, "step": 28880 }, { "epoch": 4.774220202437513, "grad_norm": 7.60715389251709, "learning_rate": 2.9031014157439544e-05, "loss": 1.5162, "step": 28890 }, { "epoch": 4.775872753563314, "grad_norm": 9.755659103393555, "learning_rate": 2.9021832938540923e-05, "loss": 1.4321, "step": 28900 }, { "epoch": 4.777525304689114, "grad_norm": 15.464385032653809, "learning_rate": 2.9012651719642302e-05, "loss": 1.4708, "step": 28910 }, { "epoch": 4.779177855814915, "grad_norm": 24.310163497924805, "learning_rate": 2.900347050074368e-05, "loss": 1.4532, "step": 28920 }, { "epoch": 4.780830406940715, "grad_norm": 15.663667678833008, "learning_rate": 2.899428928184506e-05, "loss": 1.4912, "step": 28930 }, { "epoch": 4.782482958066515, "grad_norm": 12.316214561462402, "learning_rate": 2.8985108062946437e-05, "loss": 1.5429, "step": 28940 }, { "epoch": 4.784135509192316, "grad_norm": 14.147255897521973, "learning_rate": 2.897592684404782e-05, "loss": 1.4476, "step": 28950 }, { "epoch": 4.785788060318116, "grad_norm": 9.104997634887695, "learning_rate": 2.8966745625149195e-05, "loss": 1.4273, "step": 28960 }, { "epoch": 4.787440611443916, "grad_norm": 11.243693351745605, "learning_rate": 2.8957564406250575e-05, "loss": 1.5976, "step": 28970 }, { "epoch": 4.789093162569717, "grad_norm": 15.91396427154541, "learning_rate": 2.894838318735195e-05, "loss": 1.5532, "step": 28980 }, { "epoch": 4.790745713695517, "grad_norm": 23.990699768066406, "learning_rate": 2.8939201968453333e-05, "loss": 1.5387, "step": 28990 }, { "epoch": 4.792398264821318, "grad_norm": 13.71623706817627, "learning_rate": 2.8930020749554716e-05, "loss": 1.5653, "step": 29000 }, { "epoch": 4.794050815947118, "grad_norm": 7.524801731109619, "learning_rate": 2.892083953065609e-05, "loss": 1.5804, "step": 29010 }, { "epoch": 4.795703367072919, "grad_norm": 31.6929874420166, "learning_rate": 2.891165831175747e-05, "loss": 1.5599, "step": 29020 }, { "epoch": 4.797355918198719, "grad_norm": 10.225500106811523, "learning_rate": 2.890247709285885e-05, "loss": 1.3953, "step": 29030 }, { "epoch": 4.799008469324519, "grad_norm": 11.53248405456543, "learning_rate": 2.889329587396023e-05, "loss": 1.5633, "step": 29040 }, { "epoch": 4.80066102045032, "grad_norm": 9.548233985900879, "learning_rate": 2.8884114655061605e-05, "loss": 1.4989, "step": 29050 }, { "epoch": 4.8023135715761205, "grad_norm": 5.22447395324707, "learning_rate": 2.8874933436162988e-05, "loss": 1.4038, "step": 29060 }, { "epoch": 4.8039661227019215, "grad_norm": 15.642428398132324, "learning_rate": 2.8865752217264364e-05, "loss": 1.4773, "step": 29070 }, { "epoch": 4.805618673827722, "grad_norm": 7.29582405090332, "learning_rate": 2.8856570998365746e-05, "loss": 1.5097, "step": 29080 }, { "epoch": 4.807271224953522, "grad_norm": 14.306315422058105, "learning_rate": 2.8847389779467122e-05, "loss": 1.4798, "step": 29090 }, { "epoch": 4.808923776079323, "grad_norm": 9.41856861114502, "learning_rate": 2.88382085605685e-05, "loss": 1.5179, "step": 29100 }, { "epoch": 4.810576327205123, "grad_norm": 14.0534086227417, "learning_rate": 2.8829027341669884e-05, "loss": 1.4123, "step": 29110 }, { "epoch": 4.812228878330924, "grad_norm": 73.24070739746094, "learning_rate": 2.881984612277126e-05, "loss": 1.5063, "step": 29120 }, { "epoch": 4.813881429456724, "grad_norm": 11.320660591125488, "learning_rate": 2.8810664903872643e-05, "loss": 1.5333, "step": 29130 }, { "epoch": 4.815533980582524, "grad_norm": 10.174736976623535, "learning_rate": 2.880148368497402e-05, "loss": 1.4865, "step": 29140 }, { "epoch": 4.817186531708325, "grad_norm": 13.212072372436523, "learning_rate": 2.8792302466075398e-05, "loss": 1.486, "step": 29150 }, { "epoch": 4.818839082834125, "grad_norm": 14.246745109558105, "learning_rate": 2.8783121247176777e-05, "loss": 1.5052, "step": 29160 }, { "epoch": 4.820491633959926, "grad_norm": 10.12642765045166, "learning_rate": 2.8773940028278156e-05, "loss": 1.3828, "step": 29170 }, { "epoch": 4.822144185085726, "grad_norm": 8.442862510681152, "learning_rate": 2.8764758809379532e-05, "loss": 1.5126, "step": 29180 }, { "epoch": 4.823796736211526, "grad_norm": 18.026945114135742, "learning_rate": 2.8755577590480915e-05, "loss": 1.5676, "step": 29190 }, { "epoch": 4.825449287337327, "grad_norm": 7.779977321624756, "learning_rate": 2.874639637158229e-05, "loss": 1.4621, "step": 29200 }, { "epoch": 4.827101838463127, "grad_norm": 21.727739334106445, "learning_rate": 2.8737215152683673e-05, "loss": 1.551, "step": 29210 }, { "epoch": 4.828754389588928, "grad_norm": 9.406710624694824, "learning_rate": 2.872803393378505e-05, "loss": 1.4938, "step": 29220 }, { "epoch": 4.830406940714728, "grad_norm": 9.602388381958008, "learning_rate": 2.8718852714886428e-05, "loss": 1.5477, "step": 29230 }, { "epoch": 4.832059491840528, "grad_norm": 13.313919067382812, "learning_rate": 2.870967149598781e-05, "loss": 1.5438, "step": 29240 }, { "epoch": 4.833712042966329, "grad_norm": 6.431525707244873, "learning_rate": 2.8700490277089187e-05, "loss": 1.5184, "step": 29250 }, { "epoch": 4.8353645940921295, "grad_norm": 6.1765313148498535, "learning_rate": 2.869130905819057e-05, "loss": 1.4919, "step": 29260 }, { "epoch": 4.8370171452179305, "grad_norm": 9.743697166442871, "learning_rate": 2.8682127839291945e-05, "loss": 1.4928, "step": 29270 }, { "epoch": 4.838669696343731, "grad_norm": 9.40673542022705, "learning_rate": 2.8672946620393324e-05, "loss": 1.4567, "step": 29280 }, { "epoch": 4.8403222474695315, "grad_norm": 9.90953254699707, "learning_rate": 2.8663765401494704e-05, "loss": 1.4985, "step": 29290 }, { "epoch": 4.841974798595332, "grad_norm": 15.230709075927734, "learning_rate": 2.8654584182596083e-05, "loss": 1.5292, "step": 29300 }, { "epoch": 4.843627349721132, "grad_norm": 8.818098068237305, "learning_rate": 2.864540296369746e-05, "loss": 1.4527, "step": 29310 }, { "epoch": 4.845279900846933, "grad_norm": 7.097944736480713, "learning_rate": 2.863622174479884e-05, "loss": 1.627, "step": 29320 }, { "epoch": 4.846932451972733, "grad_norm": 9.903128623962402, "learning_rate": 2.8627040525900217e-05, "loss": 1.6714, "step": 29330 }, { "epoch": 4.848585003098533, "grad_norm": 11.924121856689453, "learning_rate": 2.86178593070016e-05, "loss": 1.2808, "step": 29340 }, { "epoch": 4.850237554224334, "grad_norm": 15.464925765991211, "learning_rate": 2.860867808810298e-05, "loss": 1.5392, "step": 29350 }, { "epoch": 4.851890105350134, "grad_norm": 9.023946762084961, "learning_rate": 2.8599496869204355e-05, "loss": 1.5101, "step": 29360 }, { "epoch": 4.853542656475935, "grad_norm": 13.63484001159668, "learning_rate": 2.8590315650305738e-05, "loss": 1.4716, "step": 29370 }, { "epoch": 4.855195207601735, "grad_norm": 8.59793472290039, "learning_rate": 2.8581134431407114e-05, "loss": 1.4911, "step": 29380 }, { "epoch": 4.856847758727536, "grad_norm": 10.655295372009277, "learning_rate": 2.8571953212508496e-05, "loss": 1.4926, "step": 29390 }, { "epoch": 4.858500309853336, "grad_norm": 44.5798225402832, "learning_rate": 2.8562771993609872e-05, "loss": 1.4292, "step": 29400 }, { "epoch": 4.860152860979136, "grad_norm": 10.469300270080566, "learning_rate": 2.855359077471125e-05, "loss": 1.3958, "step": 29410 }, { "epoch": 4.861805412104937, "grad_norm": 125.58094024658203, "learning_rate": 2.854440955581263e-05, "loss": 1.5653, "step": 29420 }, { "epoch": 4.863457963230737, "grad_norm": 9.53484058380127, "learning_rate": 2.853522833691401e-05, "loss": 1.5401, "step": 29430 }, { "epoch": 4.865110514356537, "grad_norm": 8.112886428833008, "learning_rate": 2.8526047118015386e-05, "loss": 1.4984, "step": 29440 }, { "epoch": 4.866763065482338, "grad_norm": 19.068845748901367, "learning_rate": 2.851686589911677e-05, "loss": 1.4704, "step": 29450 }, { "epoch": 4.8684156166081385, "grad_norm": 14.235884666442871, "learning_rate": 2.850768468021815e-05, "loss": 1.4131, "step": 29460 }, { "epoch": 4.8700681677339395, "grad_norm": 8.707880973815918, "learning_rate": 2.8498503461319527e-05, "loss": 1.3683, "step": 29470 }, { "epoch": 4.87172071885974, "grad_norm": 8.955572128295898, "learning_rate": 2.8489322242420906e-05, "loss": 1.4594, "step": 29480 }, { "epoch": 4.8733732699855405, "grad_norm": 14.345805168151855, "learning_rate": 2.8480141023522282e-05, "loss": 1.5525, "step": 29490 }, { "epoch": 4.875025821111341, "grad_norm": 20.3609619140625, "learning_rate": 2.8470959804623665e-05, "loss": 1.5474, "step": 29500 }, { "epoch": 4.876678372237141, "grad_norm": 9.365708351135254, "learning_rate": 2.846177858572504e-05, "loss": 1.5406, "step": 29510 }, { "epoch": 4.878330923362942, "grad_norm": 8.881852149963379, "learning_rate": 2.8452597366826423e-05, "loss": 1.3521, "step": 29520 }, { "epoch": 4.879983474488742, "grad_norm": 46.270851135253906, "learning_rate": 2.84434161479278e-05, "loss": 1.6585, "step": 29530 }, { "epoch": 4.881636025614543, "grad_norm": 6.4464240074157715, "learning_rate": 2.843423492902918e-05, "loss": 1.4865, "step": 29540 }, { "epoch": 4.883288576740343, "grad_norm": 12.958675384521484, "learning_rate": 2.8425053710130557e-05, "loss": 1.5643, "step": 29550 }, { "epoch": 4.884941127866143, "grad_norm": 9.830168724060059, "learning_rate": 2.8415872491231937e-05, "loss": 1.3667, "step": 29560 }, { "epoch": 4.886593678991944, "grad_norm": 15.14609146118164, "learning_rate": 2.840669127233332e-05, "loss": 1.4713, "step": 29570 }, { "epoch": 4.888246230117744, "grad_norm": 10.329089164733887, "learning_rate": 2.8397510053434695e-05, "loss": 1.5543, "step": 29580 }, { "epoch": 4.889898781243545, "grad_norm": 6.754423141479492, "learning_rate": 2.8388328834536078e-05, "loss": 1.3762, "step": 29590 }, { "epoch": 4.891551332369345, "grad_norm": 23.90659523010254, "learning_rate": 2.8379147615637454e-05, "loss": 1.5471, "step": 29600 }, { "epoch": 4.893203883495145, "grad_norm": 14.84753131866455, "learning_rate": 2.8369966396738833e-05, "loss": 1.6663, "step": 29610 }, { "epoch": 4.894856434620946, "grad_norm": 8.822469711303711, "learning_rate": 2.836078517784021e-05, "loss": 1.3787, "step": 29620 }, { "epoch": 4.896508985746746, "grad_norm": 7.7165913581848145, "learning_rate": 2.835160395894159e-05, "loss": 1.4144, "step": 29630 }, { "epoch": 4.898161536872547, "grad_norm": 15.842850685119629, "learning_rate": 2.8342422740042967e-05, "loss": 1.4095, "step": 29640 }, { "epoch": 4.899814087998347, "grad_norm": 6.203104019165039, "learning_rate": 2.833324152114435e-05, "loss": 1.4753, "step": 29650 }, { "epoch": 4.901466639124148, "grad_norm": 9.8826904296875, "learning_rate": 2.8324060302245726e-05, "loss": 1.4384, "step": 29660 }, { "epoch": 4.9031191902499485, "grad_norm": 12.302288055419922, "learning_rate": 2.831487908334711e-05, "loss": 1.5356, "step": 29670 }, { "epoch": 4.904771741375749, "grad_norm": 11.829875946044922, "learning_rate": 2.8305697864448488e-05, "loss": 1.4053, "step": 29680 }, { "epoch": 4.9064242925015495, "grad_norm": 91.15592956542969, "learning_rate": 2.8296516645549863e-05, "loss": 1.5348, "step": 29690 }, { "epoch": 4.90807684362735, "grad_norm": 8.566816329956055, "learning_rate": 2.8287335426651246e-05, "loss": 1.4056, "step": 29700 }, { "epoch": 4.90972939475315, "grad_norm": 14.642806053161621, "learning_rate": 2.8278154207752622e-05, "loss": 1.5218, "step": 29710 }, { "epoch": 4.911381945878951, "grad_norm": 11.299127578735352, "learning_rate": 2.8268972988854005e-05, "loss": 1.5021, "step": 29720 }, { "epoch": 4.913034497004751, "grad_norm": 7.661716938018799, "learning_rate": 2.825979176995538e-05, "loss": 1.4599, "step": 29730 }, { "epoch": 4.914687048130552, "grad_norm": 16.636594772338867, "learning_rate": 2.825061055105676e-05, "loss": 1.4983, "step": 29740 }, { "epoch": 4.916339599256352, "grad_norm": 11.943872451782227, "learning_rate": 2.8241429332158136e-05, "loss": 1.6077, "step": 29750 }, { "epoch": 4.917992150382153, "grad_norm": 8.194153785705566, "learning_rate": 2.8232248113259518e-05, "loss": 1.4847, "step": 29760 }, { "epoch": 4.919644701507953, "grad_norm": 9.320284843444824, "learning_rate": 2.8223066894360894e-05, "loss": 1.5322, "step": 29770 }, { "epoch": 4.921297252633753, "grad_norm": 6.6243391036987305, "learning_rate": 2.8213885675462277e-05, "loss": 1.5005, "step": 29780 }, { "epoch": 4.922949803759554, "grad_norm": 6.813876628875732, "learning_rate": 2.8204704456563653e-05, "loss": 1.6577, "step": 29790 }, { "epoch": 4.924602354885354, "grad_norm": 7.4178876876831055, "learning_rate": 2.8195523237665035e-05, "loss": 1.4614, "step": 29800 }, { "epoch": 4.926254906011154, "grad_norm": 5.743941783905029, "learning_rate": 2.8186342018766414e-05, "loss": 1.448, "step": 29810 }, { "epoch": 4.927907457136955, "grad_norm": 17.722078323364258, "learning_rate": 2.817716079986779e-05, "loss": 1.5352, "step": 29820 }, { "epoch": 4.929560008262755, "grad_norm": 8.234187126159668, "learning_rate": 2.8167979580969173e-05, "loss": 1.4319, "step": 29830 }, { "epoch": 4.931212559388556, "grad_norm": 14.171329498291016, "learning_rate": 2.815879836207055e-05, "loss": 1.6987, "step": 29840 }, { "epoch": 4.932865110514356, "grad_norm": 18.609638214111328, "learning_rate": 2.814961714317193e-05, "loss": 1.4605, "step": 29850 }, { "epoch": 4.934517661640157, "grad_norm": 10.389782905578613, "learning_rate": 2.8140435924273307e-05, "loss": 1.571, "step": 29860 }, { "epoch": 4.9361702127659575, "grad_norm": 206.19345092773438, "learning_rate": 2.8131254705374687e-05, "loss": 1.4592, "step": 29870 }, { "epoch": 4.937822763891758, "grad_norm": 8.415513038635254, "learning_rate": 2.8122073486476062e-05, "loss": 1.4644, "step": 29880 }, { "epoch": 4.9394753150175585, "grad_norm": 13.903251647949219, "learning_rate": 2.8112892267577445e-05, "loss": 1.4461, "step": 29890 }, { "epoch": 4.941127866143359, "grad_norm": 7.16209602355957, "learning_rate": 2.810371104867882e-05, "loss": 1.5743, "step": 29900 }, { "epoch": 4.94278041726916, "grad_norm": 11.736125946044922, "learning_rate": 2.8094529829780204e-05, "loss": 1.6183, "step": 29910 }, { "epoch": 4.94443296839496, "grad_norm": 12.971633911132812, "learning_rate": 2.8085348610881583e-05, "loss": 1.4226, "step": 29920 }, { "epoch": 4.94608551952076, "grad_norm": 6.504984378814697, "learning_rate": 2.8076167391982962e-05, "loss": 1.4956, "step": 29930 }, { "epoch": 4.947738070646561, "grad_norm": 7.893176555633545, "learning_rate": 2.806698617308434e-05, "loss": 1.4615, "step": 29940 }, { "epoch": 4.949390621772361, "grad_norm": 8.680290222167969, "learning_rate": 2.8057804954185717e-05, "loss": 1.4396, "step": 29950 }, { "epoch": 4.951043172898162, "grad_norm": 11.371504783630371, "learning_rate": 2.80486237352871e-05, "loss": 1.4316, "step": 29960 }, { "epoch": 4.952695724023962, "grad_norm": 11.4708251953125, "learning_rate": 2.8039442516388476e-05, "loss": 1.5216, "step": 29970 }, { "epoch": 4.954348275149762, "grad_norm": 7.596912860870361, "learning_rate": 2.8030261297489858e-05, "loss": 1.538, "step": 29980 }, { "epoch": 4.956000826275563, "grad_norm": 12.889708518981934, "learning_rate": 2.8021080078591234e-05, "loss": 1.4701, "step": 29990 }, { "epoch": 4.957653377401363, "grad_norm": 23.94351577758789, "learning_rate": 2.8011898859692613e-05, "loss": 1.4761, "step": 30000 }, { "epoch": 4.959305928527164, "grad_norm": 10.514241218566895, "learning_rate": 2.800271764079399e-05, "loss": 1.5601, "step": 30010 }, { "epoch": 4.960958479652964, "grad_norm": 134.59942626953125, "learning_rate": 2.7993536421895372e-05, "loss": 1.5578, "step": 30020 }, { "epoch": 4.962611030778765, "grad_norm": 8.633646965026855, "learning_rate": 2.7984355202996755e-05, "loss": 1.3932, "step": 30030 }, { "epoch": 4.964263581904565, "grad_norm": 8.6370849609375, "learning_rate": 2.797517398409813e-05, "loss": 1.4556, "step": 30040 }, { "epoch": 4.965916133030365, "grad_norm": 7.444171905517578, "learning_rate": 2.796599276519951e-05, "loss": 1.4341, "step": 30050 }, { "epoch": 4.967568684156166, "grad_norm": 14.582514762878418, "learning_rate": 2.795681154630089e-05, "loss": 1.5428, "step": 30060 }, { "epoch": 4.9692212352819665, "grad_norm": 10.481239318847656, "learning_rate": 2.7947630327402268e-05, "loss": 1.4816, "step": 30070 }, { "epoch": 4.970873786407767, "grad_norm": 9.104494094848633, "learning_rate": 2.7938449108503644e-05, "loss": 1.4667, "step": 30080 }, { "epoch": 4.9725263375335675, "grad_norm": 14.128423690795898, "learning_rate": 2.7929267889605027e-05, "loss": 1.4932, "step": 30090 }, { "epoch": 4.974178888659368, "grad_norm": 8.712443351745605, "learning_rate": 2.7920086670706402e-05, "loss": 1.57, "step": 30100 }, { "epoch": 4.975831439785169, "grad_norm": 8.335115432739258, "learning_rate": 2.7910905451807785e-05, "loss": 1.4954, "step": 30110 }, { "epoch": 4.977483990910969, "grad_norm": 47.504981994628906, "learning_rate": 2.790172423290916e-05, "loss": 1.5175, "step": 30120 }, { "epoch": 4.97913654203677, "grad_norm": 8.322532653808594, "learning_rate": 2.789254301401054e-05, "loss": 1.5074, "step": 30130 }, { "epoch": 4.98078909316257, "grad_norm": 7.302855014801025, "learning_rate": 2.7883361795111923e-05, "loss": 1.6079, "step": 30140 }, { "epoch": 4.98244164428837, "grad_norm": 17.811246871948242, "learning_rate": 2.78741805762133e-05, "loss": 1.4465, "step": 30150 }, { "epoch": 4.984094195414171, "grad_norm": 16.895263671875, "learning_rate": 2.786499935731468e-05, "loss": 1.5104, "step": 30160 }, { "epoch": 4.985746746539971, "grad_norm": 31.71302604675293, "learning_rate": 2.7855818138416057e-05, "loss": 1.5411, "step": 30170 }, { "epoch": 4.987399297665771, "grad_norm": 19.637605667114258, "learning_rate": 2.7846636919517436e-05, "loss": 1.4842, "step": 30180 }, { "epoch": 4.989051848791572, "grad_norm": 134.47610473632812, "learning_rate": 2.7837455700618816e-05, "loss": 1.6898, "step": 30190 }, { "epoch": 4.990704399917372, "grad_norm": 18.40015983581543, "learning_rate": 2.7828274481720195e-05, "loss": 1.4774, "step": 30200 }, { "epoch": 4.992356951043173, "grad_norm": 8.392579078674316, "learning_rate": 2.781909326282157e-05, "loss": 1.3739, "step": 30210 }, { "epoch": 4.994009502168973, "grad_norm": 12.9806547164917, "learning_rate": 2.7809912043922953e-05, "loss": 1.4431, "step": 30220 }, { "epoch": 4.995662053294774, "grad_norm": 9.284590721130371, "learning_rate": 2.780073082502433e-05, "loss": 1.5775, "step": 30230 }, { "epoch": 4.997314604420574, "grad_norm": 13.390445709228516, "learning_rate": 2.7791549606125712e-05, "loss": 1.5908, "step": 30240 }, { "epoch": 4.998967155546374, "grad_norm": 9.49555778503418, "learning_rate": 2.778236838722709e-05, "loss": 1.5371, "step": 30250 }, { "epoch": 4.999958686221855, "eval_accuracy": 0.2968151510852603, "eval_loss": 2.206881046295166, "eval_runtime": 816.4999, "eval_samples_per_second": 34.533, "eval_steps_per_second": 8.633, "step": 30256 }, { "epoch": 5.000619706672175, "grad_norm": 15.309430122375488, "learning_rate": 2.7773187168328467e-05, "loss": 1.5734, "step": 30260 }, { "epoch": 5.0022722577979755, "grad_norm": 11.606006622314453, "learning_rate": 2.776400594942985e-05, "loss": 1.4502, "step": 30270 }, { "epoch": 5.0039248089237764, "grad_norm": 13.329957962036133, "learning_rate": 2.7754824730531226e-05, "loss": 1.3861, "step": 30280 }, { "epoch": 5.0055773600495765, "grad_norm": 35.278804779052734, "learning_rate": 2.7745643511632608e-05, "loss": 1.468, "step": 30290 }, { "epoch": 5.007229911175377, "grad_norm": 27.012672424316406, "learning_rate": 2.7736462292733984e-05, "loss": 1.6023, "step": 30300 }, { "epoch": 5.008882462301178, "grad_norm": 7.429107189178467, "learning_rate": 2.7727281073835367e-05, "loss": 1.4019, "step": 30310 }, { "epoch": 5.010535013426978, "grad_norm": 8.644866943359375, "learning_rate": 2.7718099854936743e-05, "loss": 1.3994, "step": 30320 }, { "epoch": 5.012187564552779, "grad_norm": 8.626925468444824, "learning_rate": 2.7708918636038122e-05, "loss": 1.6237, "step": 30330 }, { "epoch": 5.013840115678579, "grad_norm": 23.41725730895996, "learning_rate": 2.7699737417139498e-05, "loss": 1.3434, "step": 30340 }, { "epoch": 5.015492666804379, "grad_norm": 12.992168426513672, "learning_rate": 2.769055619824088e-05, "loss": 1.4989, "step": 30350 }, { "epoch": 5.01714521793018, "grad_norm": 19.700824737548828, "learning_rate": 2.7681374979342256e-05, "loss": 1.6157, "step": 30360 }, { "epoch": 5.01879776905598, "grad_norm": 35.20869827270508, "learning_rate": 2.767219376044364e-05, "loss": 1.3213, "step": 30370 }, { "epoch": 5.020450320181781, "grad_norm": 10.241279602050781, "learning_rate": 2.7663012541545018e-05, "loss": 1.5054, "step": 30380 }, { "epoch": 5.022102871307581, "grad_norm": 7.736096382141113, "learning_rate": 2.7653831322646394e-05, "loss": 1.5098, "step": 30390 }, { "epoch": 5.023755422433381, "grad_norm": 16.00613784790039, "learning_rate": 2.7644650103747777e-05, "loss": 1.4232, "step": 30400 }, { "epoch": 5.025407973559182, "grad_norm": 9.938050270080566, "learning_rate": 2.7635468884849152e-05, "loss": 1.472, "step": 30410 }, { "epoch": 5.027060524684982, "grad_norm": 7.0826640129089355, "learning_rate": 2.7626287665950535e-05, "loss": 1.4146, "step": 30420 }, { "epoch": 5.028713075810783, "grad_norm": 8.384482383728027, "learning_rate": 2.761710644705191e-05, "loss": 1.503, "step": 30430 }, { "epoch": 5.030365626936583, "grad_norm": 8.713443756103516, "learning_rate": 2.7607925228153293e-05, "loss": 1.2782, "step": 30440 }, { "epoch": 5.032018178062383, "grad_norm": 11.568880081176758, "learning_rate": 2.759874400925467e-05, "loss": 1.4832, "step": 30450 }, { "epoch": 5.033670729188184, "grad_norm": 11.942070960998535, "learning_rate": 2.758956279035605e-05, "loss": 1.4826, "step": 30460 }, { "epoch": 5.0353232803139845, "grad_norm": 8.392718315124512, "learning_rate": 2.7580381571457424e-05, "loss": 1.5613, "step": 30470 }, { "epoch": 5.0369758314397854, "grad_norm": 8.625317573547363, "learning_rate": 2.7571200352558807e-05, "loss": 1.5039, "step": 30480 }, { "epoch": 5.0386283825655855, "grad_norm": 34.13085174560547, "learning_rate": 2.756201913366019e-05, "loss": 1.5013, "step": 30490 }, { "epoch": 5.0402809336913865, "grad_norm": 9.937477111816406, "learning_rate": 2.7552837914761566e-05, "loss": 1.4878, "step": 30500 }, { "epoch": 5.041933484817187, "grad_norm": 10.615894317626953, "learning_rate": 2.7543656695862945e-05, "loss": 1.5141, "step": 30510 }, { "epoch": 5.043586035942987, "grad_norm": 9.882899284362793, "learning_rate": 2.753447547696432e-05, "loss": 1.4341, "step": 30520 }, { "epoch": 5.045238587068788, "grad_norm": 13.298185348510742, "learning_rate": 2.7525294258065703e-05, "loss": 1.3448, "step": 30530 }, { "epoch": 5.046891138194588, "grad_norm": 10.530019760131836, "learning_rate": 2.751611303916708e-05, "loss": 1.4876, "step": 30540 }, { "epoch": 5.048543689320389, "grad_norm": 7.8333892822265625, "learning_rate": 2.7506931820268462e-05, "loss": 1.3568, "step": 30550 }, { "epoch": 5.050196240446189, "grad_norm": 8.093653678894043, "learning_rate": 2.7497750601369838e-05, "loss": 1.4622, "step": 30560 }, { "epoch": 5.051848791571989, "grad_norm": 15.520524978637695, "learning_rate": 2.748856938247122e-05, "loss": 1.5338, "step": 30570 }, { "epoch": 5.05350134269779, "grad_norm": 7.4860382080078125, "learning_rate": 2.7479388163572596e-05, "loss": 1.3255, "step": 30580 }, { "epoch": 5.05515389382359, "grad_norm": 24.090126037597656, "learning_rate": 2.7470206944673975e-05, "loss": 1.4614, "step": 30590 }, { "epoch": 5.056806444949391, "grad_norm": 6.862220287322998, "learning_rate": 2.7461025725775358e-05, "loss": 1.4854, "step": 30600 }, { "epoch": 5.058458996075191, "grad_norm": 28.299789428710938, "learning_rate": 2.7451844506876734e-05, "loss": 1.4952, "step": 30610 }, { "epoch": 5.060111547200991, "grad_norm": 9.547993659973145, "learning_rate": 2.7442663287978117e-05, "loss": 1.4909, "step": 30620 }, { "epoch": 5.061764098326792, "grad_norm": 14.898249626159668, "learning_rate": 2.7433482069079492e-05, "loss": 1.4672, "step": 30630 }, { "epoch": 5.063416649452592, "grad_norm": 8.26840591430664, "learning_rate": 2.742430085018087e-05, "loss": 1.5186, "step": 30640 }, { "epoch": 5.065069200578393, "grad_norm": 7.622718811035156, "learning_rate": 2.7415119631282248e-05, "loss": 1.3791, "step": 30650 }, { "epoch": 5.066721751704193, "grad_norm": 9.911038398742676, "learning_rate": 2.740593841238363e-05, "loss": 1.3867, "step": 30660 }, { "epoch": 5.0683743028299935, "grad_norm": 21.617273330688477, "learning_rate": 2.7396757193485006e-05, "loss": 1.4964, "step": 30670 }, { "epoch": 5.070026853955794, "grad_norm": 10.754417419433594, "learning_rate": 2.738757597458639e-05, "loss": 1.4902, "step": 30680 }, { "epoch": 5.0716794050815945, "grad_norm": 15.153556823730469, "learning_rate": 2.7378394755687765e-05, "loss": 1.4118, "step": 30690 }, { "epoch": 5.0733319562073955, "grad_norm": 10.339966773986816, "learning_rate": 2.7369213536789147e-05, "loss": 1.5785, "step": 30700 }, { "epoch": 5.074984507333196, "grad_norm": 16.77724838256836, "learning_rate": 2.7360032317890526e-05, "loss": 1.3934, "step": 30710 }, { "epoch": 5.076637058458996, "grad_norm": 11.809764862060547, "learning_rate": 2.7350851098991902e-05, "loss": 1.4605, "step": 30720 }, { "epoch": 5.078289609584797, "grad_norm": 16.061250686645508, "learning_rate": 2.7341669880093285e-05, "loss": 1.5512, "step": 30730 }, { "epoch": 5.079942160710597, "grad_norm": 8.934623718261719, "learning_rate": 2.733248866119466e-05, "loss": 1.4989, "step": 30740 }, { "epoch": 5.081594711836398, "grad_norm": 8.88762378692627, "learning_rate": 2.7323307442296043e-05, "loss": 1.3776, "step": 30750 }, { "epoch": 5.083247262962198, "grad_norm": 14.036128044128418, "learning_rate": 2.731412622339742e-05, "loss": 1.5089, "step": 30760 }, { "epoch": 5.084899814087998, "grad_norm": 12.190908432006836, "learning_rate": 2.73049450044988e-05, "loss": 1.4895, "step": 30770 }, { "epoch": 5.086552365213799, "grad_norm": 10.279311180114746, "learning_rate": 2.7295763785600174e-05, "loss": 1.5135, "step": 30780 }, { "epoch": 5.088204916339599, "grad_norm": 7.8421549797058105, "learning_rate": 2.7286582566701557e-05, "loss": 1.4824, "step": 30790 }, { "epoch": 5.0898574674654, "grad_norm": 33.02604293823242, "learning_rate": 2.7277401347802933e-05, "loss": 1.4672, "step": 30800 }, { "epoch": 5.0915100185912, "grad_norm": 9.192763328552246, "learning_rate": 2.7268220128904315e-05, "loss": 1.5224, "step": 30810 }, { "epoch": 5.093162569717, "grad_norm": 14.467486381530762, "learning_rate": 2.7259038910005695e-05, "loss": 1.5533, "step": 30820 }, { "epoch": 5.094815120842801, "grad_norm": 12.854667663574219, "learning_rate": 2.7249857691107074e-05, "loss": 1.5342, "step": 30830 }, { "epoch": 5.096467671968601, "grad_norm": 8.334192276000977, "learning_rate": 2.7240676472208453e-05, "loss": 1.5695, "step": 30840 }, { "epoch": 5.098120223094402, "grad_norm": 7.920324325561523, "learning_rate": 2.723149525330983e-05, "loss": 1.5243, "step": 30850 }, { "epoch": 5.099772774220202, "grad_norm": 15.681721687316895, "learning_rate": 2.7222314034411212e-05, "loss": 1.5742, "step": 30860 }, { "epoch": 5.1014253253460025, "grad_norm": 10.732522010803223, "learning_rate": 2.7213132815512588e-05, "loss": 1.451, "step": 30870 }, { "epoch": 5.103077876471803, "grad_norm": 9.83503532409668, "learning_rate": 2.720395159661397e-05, "loss": 1.5081, "step": 30880 }, { "epoch": 5.1047304275976035, "grad_norm": 15.027039527893066, "learning_rate": 2.7194770377715346e-05, "loss": 1.5076, "step": 30890 }, { "epoch": 5.1063829787234045, "grad_norm": 6.365040302276611, "learning_rate": 2.7185589158816725e-05, "loss": 1.4941, "step": 30900 }, { "epoch": 5.108035529849205, "grad_norm": 14.915818214416504, "learning_rate": 2.71764079399181e-05, "loss": 1.5407, "step": 30910 }, { "epoch": 5.109688080975006, "grad_norm": 6.7997236251831055, "learning_rate": 2.7167226721019484e-05, "loss": 1.5268, "step": 30920 }, { "epoch": 5.111340632100806, "grad_norm": 9.963415145874023, "learning_rate": 2.715804550212086e-05, "loss": 1.3845, "step": 30930 }, { "epoch": 5.112993183226606, "grad_norm": 10.191669464111328, "learning_rate": 2.7148864283222242e-05, "loss": 1.4634, "step": 30940 }, { "epoch": 5.114645734352407, "grad_norm": 12.32084846496582, "learning_rate": 2.7139683064323625e-05, "loss": 1.4858, "step": 30950 }, { "epoch": 5.116298285478207, "grad_norm": 5.869612216949463, "learning_rate": 2.7130501845425e-05, "loss": 1.4629, "step": 30960 }, { "epoch": 5.117950836604008, "grad_norm": 17.75486183166504, "learning_rate": 2.712132062652638e-05, "loss": 1.4306, "step": 30970 }, { "epoch": 5.119603387729808, "grad_norm": 15.648809432983398, "learning_rate": 2.7112139407627756e-05, "loss": 1.4946, "step": 30980 }, { "epoch": 5.121255938855608, "grad_norm": 10.89876651763916, "learning_rate": 2.710295818872914e-05, "loss": 1.6039, "step": 30990 }, { "epoch": 5.122908489981409, "grad_norm": 13.772411346435547, "learning_rate": 2.7093776969830514e-05, "loss": 1.5348, "step": 31000 }, { "epoch": 5.124561041107209, "grad_norm": 11.758411407470703, "learning_rate": 2.7084595750931897e-05, "loss": 1.4812, "step": 31010 }, { "epoch": 5.12621359223301, "grad_norm": 27.372583389282227, "learning_rate": 2.7075414532033273e-05, "loss": 1.4999, "step": 31020 }, { "epoch": 5.12786614335881, "grad_norm": 9.395577430725098, "learning_rate": 2.7066233313134652e-05, "loss": 1.4854, "step": 31030 }, { "epoch": 5.12951869448461, "grad_norm": 24.79270362854004, "learning_rate": 2.7057052094236028e-05, "loss": 1.4465, "step": 31040 }, { "epoch": 5.131171245610411, "grad_norm": 10.64253044128418, "learning_rate": 2.704787087533741e-05, "loss": 1.5273, "step": 31050 }, { "epoch": 5.132823796736211, "grad_norm": 15.544374465942383, "learning_rate": 2.7038689656438793e-05, "loss": 1.4328, "step": 31060 }, { "epoch": 5.134476347862012, "grad_norm": 13.274250030517578, "learning_rate": 2.702950843754017e-05, "loss": 1.3941, "step": 31070 }, { "epoch": 5.136128898987812, "grad_norm": 7.694459438323975, "learning_rate": 2.7020327218641552e-05, "loss": 1.2814, "step": 31080 }, { "epoch": 5.1377814501136125, "grad_norm": 10.96912956237793, "learning_rate": 2.7011145999742928e-05, "loss": 1.5482, "step": 31090 }, { "epoch": 5.1394340012394135, "grad_norm": 107.31693267822266, "learning_rate": 2.7001964780844307e-05, "loss": 1.6004, "step": 31100 }, { "epoch": 5.141086552365214, "grad_norm": 10.470490455627441, "learning_rate": 2.6992783561945683e-05, "loss": 1.4453, "step": 31110 }, { "epoch": 5.142739103491015, "grad_norm": 8.99507999420166, "learning_rate": 2.6983602343047065e-05, "loss": 1.6031, "step": 31120 }, { "epoch": 5.144391654616815, "grad_norm": 7.556997776031494, "learning_rate": 2.697442112414844e-05, "loss": 1.6775, "step": 31130 }, { "epoch": 5.146044205742615, "grad_norm": 6.91307258605957, "learning_rate": 2.6965239905249824e-05, "loss": 1.4369, "step": 31140 }, { "epoch": 5.147696756868416, "grad_norm": 11.001294136047363, "learning_rate": 2.69560586863512e-05, "loss": 1.5253, "step": 31150 }, { "epoch": 5.149349307994216, "grad_norm": 11.226962089538574, "learning_rate": 2.694687746745258e-05, "loss": 1.4318, "step": 31160 }, { "epoch": 5.151001859120017, "grad_norm": 9.484665870666504, "learning_rate": 2.693769624855396e-05, "loss": 1.6958, "step": 31170 }, { "epoch": 5.152654410245817, "grad_norm": 11.47225570678711, "learning_rate": 2.6928515029655337e-05, "loss": 1.4713, "step": 31180 }, { "epoch": 5.154306961371617, "grad_norm": 13.216545104980469, "learning_rate": 2.691933381075672e-05, "loss": 1.462, "step": 31190 }, { "epoch": 5.155959512497418, "grad_norm": 14.571917533874512, "learning_rate": 2.6910152591858096e-05, "loss": 1.5572, "step": 31200 }, { "epoch": 5.157612063623218, "grad_norm": 6.982619285583496, "learning_rate": 2.690097137295948e-05, "loss": 1.5357, "step": 31210 }, { "epoch": 5.159264614749019, "grad_norm": 7.8256988525390625, "learning_rate": 2.6891790154060854e-05, "loss": 1.4597, "step": 31220 }, { "epoch": 5.160917165874819, "grad_norm": 11.033082962036133, "learning_rate": 2.6882608935162234e-05, "loss": 1.3216, "step": 31230 }, { "epoch": 5.162569717000619, "grad_norm": 9.309578895568848, "learning_rate": 2.687342771626361e-05, "loss": 1.3888, "step": 31240 }, { "epoch": 5.16422226812642, "grad_norm": 36.22593307495117, "learning_rate": 2.6864246497364992e-05, "loss": 1.3437, "step": 31250 }, { "epoch": 5.16587481925222, "grad_norm": 24.026330947875977, "learning_rate": 2.6855065278466368e-05, "loss": 1.4197, "step": 31260 }, { "epoch": 5.167527370378021, "grad_norm": 11.367629051208496, "learning_rate": 2.684588405956775e-05, "loss": 1.4089, "step": 31270 }, { "epoch": 5.169179921503821, "grad_norm": 8.361258506774902, "learning_rate": 2.683670284066913e-05, "loss": 1.4635, "step": 31280 }, { "epoch": 5.1708324726296215, "grad_norm": 25.029769897460938, "learning_rate": 2.6827521621770506e-05, "loss": 1.446, "step": 31290 }, { "epoch": 5.1724850237554225, "grad_norm": 11.334177017211914, "learning_rate": 2.681834040287189e-05, "loss": 1.4217, "step": 31300 }, { "epoch": 5.174137574881223, "grad_norm": 8.15962028503418, "learning_rate": 2.6809159183973264e-05, "loss": 1.4519, "step": 31310 }, { "epoch": 5.175790126007024, "grad_norm": 10.570094108581543, "learning_rate": 2.6799977965074647e-05, "loss": 1.3329, "step": 31320 }, { "epoch": 5.177442677132824, "grad_norm": 9.071272850036621, "learning_rate": 2.6790796746176023e-05, "loss": 1.558, "step": 31330 }, { "epoch": 5.179095228258625, "grad_norm": 9.73098373413086, "learning_rate": 2.6781615527277405e-05, "loss": 1.4698, "step": 31340 }, { "epoch": 5.180747779384425, "grad_norm": 13.9347505569458, "learning_rate": 2.677243430837878e-05, "loss": 1.5003, "step": 31350 }, { "epoch": 5.182400330510225, "grad_norm": 10.36252498626709, "learning_rate": 2.676325308948016e-05, "loss": 1.4587, "step": 31360 }, { "epoch": 5.184052881636026, "grad_norm": 16.580341339111328, "learning_rate": 2.6754071870581536e-05, "loss": 1.4521, "step": 31370 }, { "epoch": 5.185705432761826, "grad_norm": 14.264723777770996, "learning_rate": 2.674489065168292e-05, "loss": 1.3813, "step": 31380 }, { "epoch": 5.187357983887627, "grad_norm": 13.60984992980957, "learning_rate": 2.6735709432784295e-05, "loss": 1.6065, "step": 31390 }, { "epoch": 5.189010535013427, "grad_norm": 8.337096214294434, "learning_rate": 2.6726528213885678e-05, "loss": 1.4814, "step": 31400 }, { "epoch": 5.190663086139227, "grad_norm": 8.234798431396484, "learning_rate": 2.6717346994987057e-05, "loss": 1.5305, "step": 31410 }, { "epoch": 5.192315637265028, "grad_norm": 12.562371253967285, "learning_rate": 2.6708165776088433e-05, "loss": 1.5045, "step": 31420 }, { "epoch": 5.193968188390828, "grad_norm": 11.537344932556152, "learning_rate": 2.6698984557189815e-05, "loss": 1.5556, "step": 31430 }, { "epoch": 5.195620739516629, "grad_norm": 34.379127502441406, "learning_rate": 2.668980333829119e-05, "loss": 1.4086, "step": 31440 }, { "epoch": 5.197273290642429, "grad_norm": 10.825907707214355, "learning_rate": 2.6680622119392574e-05, "loss": 1.6994, "step": 31450 }, { "epoch": 5.198925841768229, "grad_norm": 9.560426712036133, "learning_rate": 2.667144090049395e-05, "loss": 1.606, "step": 31460 }, { "epoch": 5.20057839289403, "grad_norm": 7.058825492858887, "learning_rate": 2.6662259681595332e-05, "loss": 1.5243, "step": 31470 }, { "epoch": 5.20223094401983, "grad_norm": 8.447346687316895, "learning_rate": 2.6653078462696708e-05, "loss": 1.4428, "step": 31480 }, { "epoch": 5.203883495145631, "grad_norm": 8.007600784301758, "learning_rate": 2.6643897243798087e-05, "loss": 1.419, "step": 31490 }, { "epoch": 5.2055360462714315, "grad_norm": 10.7009916305542, "learning_rate": 2.6634716024899463e-05, "loss": 1.5057, "step": 31500 }, { "epoch": 5.207188597397232, "grad_norm": 10.528794288635254, "learning_rate": 2.6625534806000846e-05, "loss": 1.3964, "step": 31510 }, { "epoch": 5.208841148523033, "grad_norm": 11.727227210998535, "learning_rate": 2.661635358710223e-05, "loss": 1.5043, "step": 31520 }, { "epoch": 5.210493699648833, "grad_norm": 7.113905429840088, "learning_rate": 2.6607172368203604e-05, "loss": 1.4591, "step": 31530 }, { "epoch": 5.212146250774634, "grad_norm": 8.97098159790039, "learning_rate": 2.6597991149304984e-05, "loss": 1.5621, "step": 31540 }, { "epoch": 5.213798801900434, "grad_norm": 10.305506706237793, "learning_rate": 2.658880993040636e-05, "loss": 1.4602, "step": 31550 }, { "epoch": 5.215451353026234, "grad_norm": 13.641674995422363, "learning_rate": 2.6579628711507742e-05, "loss": 1.4489, "step": 31560 }, { "epoch": 5.217103904152035, "grad_norm": 9.43690299987793, "learning_rate": 2.6570447492609118e-05, "loss": 1.5889, "step": 31570 }, { "epoch": 5.218756455277835, "grad_norm": 6.873445510864258, "learning_rate": 2.65612662737105e-05, "loss": 1.5353, "step": 31580 }, { "epoch": 5.220409006403636, "grad_norm": 28.370594024658203, "learning_rate": 2.6552085054811876e-05, "loss": 1.5797, "step": 31590 }, { "epoch": 5.222061557529436, "grad_norm": 11.743301391601562, "learning_rate": 2.654290383591326e-05, "loss": 1.379, "step": 31600 }, { "epoch": 5.223714108655236, "grad_norm": 8.087701797485352, "learning_rate": 2.6533722617014635e-05, "loss": 1.5028, "step": 31610 }, { "epoch": 5.225366659781037, "grad_norm": 49.10771179199219, "learning_rate": 2.6524541398116014e-05, "loss": 1.4961, "step": 31620 }, { "epoch": 5.227019210906837, "grad_norm": 12.940389633178711, "learning_rate": 2.6515360179217397e-05, "loss": 1.5188, "step": 31630 }, { "epoch": 5.228671762032638, "grad_norm": 9.253148078918457, "learning_rate": 2.6506178960318773e-05, "loss": 1.5103, "step": 31640 }, { "epoch": 5.230324313158438, "grad_norm": 7.253983497619629, "learning_rate": 2.6496997741420155e-05, "loss": 1.5957, "step": 31650 }, { "epoch": 5.231976864284238, "grad_norm": 7.679978370666504, "learning_rate": 2.648781652252153e-05, "loss": 1.4185, "step": 31660 }, { "epoch": 5.233629415410039, "grad_norm": 13.257478713989258, "learning_rate": 2.647863530362291e-05, "loss": 1.5459, "step": 31670 }, { "epoch": 5.235281966535839, "grad_norm": 11.092429161071777, "learning_rate": 2.6469454084724286e-05, "loss": 1.4213, "step": 31680 }, { "epoch": 5.23693451766164, "grad_norm": 13.886821746826172, "learning_rate": 2.646027286582567e-05, "loss": 1.4144, "step": 31690 }, { "epoch": 5.2385870687874405, "grad_norm": 9.925250053405762, "learning_rate": 2.6451091646927045e-05, "loss": 1.3833, "step": 31700 }, { "epoch": 5.2402396199132415, "grad_norm": 8.703375816345215, "learning_rate": 2.6441910428028427e-05, "loss": 1.4315, "step": 31710 }, { "epoch": 5.241892171039042, "grad_norm": 91.34703826904297, "learning_rate": 2.6432729209129803e-05, "loss": 1.3998, "step": 31720 }, { "epoch": 5.243544722164842, "grad_norm": 15.27595043182373, "learning_rate": 2.6423547990231186e-05, "loss": 1.5865, "step": 31730 }, { "epoch": 5.245197273290643, "grad_norm": 11.300504684448242, "learning_rate": 2.6414366771332565e-05, "loss": 1.4581, "step": 31740 }, { "epoch": 5.246849824416443, "grad_norm": 10.756152153015137, "learning_rate": 2.640518555243394e-05, "loss": 1.5089, "step": 31750 }, { "epoch": 5.248502375542244, "grad_norm": 32.1532096862793, "learning_rate": 2.6396004333535324e-05, "loss": 1.4962, "step": 31760 }, { "epoch": 5.250154926668044, "grad_norm": 9.208897590637207, "learning_rate": 2.63868231146367e-05, "loss": 1.4471, "step": 31770 }, { "epoch": 5.251807477793844, "grad_norm": 17.185142517089844, "learning_rate": 2.6377641895738082e-05, "loss": 1.5346, "step": 31780 }, { "epoch": 5.253460028919645, "grad_norm": 15.642415046691895, "learning_rate": 2.6368460676839458e-05, "loss": 1.4897, "step": 31790 }, { "epoch": 5.255112580045445, "grad_norm": 11.783285140991211, "learning_rate": 2.6359279457940837e-05, "loss": 1.4354, "step": 31800 }, { "epoch": 5.256765131171246, "grad_norm": 10.766020774841309, "learning_rate": 2.6350098239042213e-05, "loss": 1.4611, "step": 31810 }, { "epoch": 5.258417682297046, "grad_norm": 9.75991439819336, "learning_rate": 2.6340917020143596e-05, "loss": 1.397, "step": 31820 }, { "epoch": 5.260070233422846, "grad_norm": 9.640849113464355, "learning_rate": 2.633173580124497e-05, "loss": 1.4211, "step": 31830 }, { "epoch": 5.261722784548647, "grad_norm": 49.9780387878418, "learning_rate": 2.6322554582346354e-05, "loss": 1.4722, "step": 31840 }, { "epoch": 5.263375335674447, "grad_norm": 11.358599662780762, "learning_rate": 2.6313373363447737e-05, "loss": 1.4281, "step": 31850 }, { "epoch": 5.265027886800248, "grad_norm": 6.540308475494385, "learning_rate": 2.6304192144549113e-05, "loss": 1.4257, "step": 31860 }, { "epoch": 5.266680437926048, "grad_norm": 11.856473922729492, "learning_rate": 2.6295010925650492e-05, "loss": 1.4528, "step": 31870 }, { "epoch": 5.268332989051848, "grad_norm": 6.613503456115723, "learning_rate": 2.6285829706751868e-05, "loss": 1.2896, "step": 31880 }, { "epoch": 5.269985540177649, "grad_norm": 10.662419319152832, "learning_rate": 2.627664848785325e-05, "loss": 1.4951, "step": 31890 }, { "epoch": 5.2716380913034495, "grad_norm": 13.057232856750488, "learning_rate": 2.6267467268954626e-05, "loss": 1.5273, "step": 31900 }, { "epoch": 5.2732906424292505, "grad_norm": 11.477456092834473, "learning_rate": 2.625828605005601e-05, "loss": 1.6712, "step": 31910 }, { "epoch": 5.274943193555051, "grad_norm": 7.808887481689453, "learning_rate": 2.6249104831157385e-05, "loss": 1.4828, "step": 31920 }, { "epoch": 5.276595744680851, "grad_norm": 8.068138122558594, "learning_rate": 2.6239923612258764e-05, "loss": 1.5149, "step": 31930 }, { "epoch": 5.278248295806652, "grad_norm": 11.689132690429688, "learning_rate": 2.623074239336014e-05, "loss": 1.4828, "step": 31940 }, { "epoch": 5.279900846932452, "grad_norm": 11.739019393920898, "learning_rate": 2.6221561174461523e-05, "loss": 1.5513, "step": 31950 }, { "epoch": 5.281553398058253, "grad_norm": 9.972562789916992, "learning_rate": 2.62123799555629e-05, "loss": 1.4665, "step": 31960 }, { "epoch": 5.283205949184053, "grad_norm": 9.730892181396484, "learning_rate": 2.620319873666428e-05, "loss": 1.6372, "step": 31970 }, { "epoch": 5.284858500309853, "grad_norm": 7.058382034301758, "learning_rate": 2.6194017517765664e-05, "loss": 1.3208, "step": 31980 }, { "epoch": 5.286511051435654, "grad_norm": 21.382532119750977, "learning_rate": 2.618483629886704e-05, "loss": 1.6161, "step": 31990 }, { "epoch": 5.288163602561454, "grad_norm": 12.467560768127441, "learning_rate": 2.617565507996842e-05, "loss": 1.5576, "step": 32000 }, { "epoch": 5.289816153687255, "grad_norm": 8.383124351501465, "learning_rate": 2.6166473861069795e-05, "loss": 1.5683, "step": 32010 }, { "epoch": 5.291468704813055, "grad_norm": 11.486825942993164, "learning_rate": 2.6157292642171177e-05, "loss": 1.5452, "step": 32020 }, { "epoch": 5.293121255938855, "grad_norm": 10.539969444274902, "learning_rate": 2.6148111423272553e-05, "loss": 1.5899, "step": 32030 }, { "epoch": 5.294773807064656, "grad_norm": 6.630085468292236, "learning_rate": 2.6138930204373936e-05, "loss": 1.6407, "step": 32040 }, { "epoch": 5.296426358190456, "grad_norm": 18.89915657043457, "learning_rate": 2.6129748985475312e-05, "loss": 1.4234, "step": 32050 }, { "epoch": 5.298078909316257, "grad_norm": 6.79443883895874, "learning_rate": 2.612056776657669e-05, "loss": 1.428, "step": 32060 }, { "epoch": 5.299731460442057, "grad_norm": 6.143259048461914, "learning_rate": 2.6111386547678067e-05, "loss": 1.4715, "step": 32070 }, { "epoch": 5.301384011567858, "grad_norm": 11.97968864440918, "learning_rate": 2.610220532877945e-05, "loss": 1.3861, "step": 32080 }, { "epoch": 5.303036562693658, "grad_norm": 6.659870624542236, "learning_rate": 2.6093024109880832e-05, "loss": 1.3774, "step": 32090 }, { "epoch": 5.3046891138194585, "grad_norm": 19.500179290771484, "learning_rate": 2.6083842890982208e-05, "loss": 1.4016, "step": 32100 }, { "epoch": 5.3063416649452595, "grad_norm": 10.958991050720215, "learning_rate": 2.607466167208359e-05, "loss": 1.2945, "step": 32110 }, { "epoch": 5.30799421607106, "grad_norm": 12.997007369995117, "learning_rate": 2.6065480453184966e-05, "loss": 1.4438, "step": 32120 }, { "epoch": 5.30964676719686, "grad_norm": 6.068414688110352, "learning_rate": 2.6056299234286346e-05, "loss": 1.4711, "step": 32130 }, { "epoch": 5.311299318322661, "grad_norm": 11.195246696472168, "learning_rate": 2.604711801538772e-05, "loss": 1.4093, "step": 32140 }, { "epoch": 5.312951869448461, "grad_norm": 40.626102447509766, "learning_rate": 2.6037936796489104e-05, "loss": 1.575, "step": 32150 }, { "epoch": 5.314604420574262, "grad_norm": 9.044241905212402, "learning_rate": 2.602875557759048e-05, "loss": 1.3712, "step": 32160 }, { "epoch": 5.316256971700062, "grad_norm": 13.25019359588623, "learning_rate": 2.6019574358691863e-05, "loss": 1.4445, "step": 32170 }, { "epoch": 5.317909522825863, "grad_norm": 20.42643165588379, "learning_rate": 2.601039313979324e-05, "loss": 1.3569, "step": 32180 }, { "epoch": 5.319562073951663, "grad_norm": 9.683372497558594, "learning_rate": 2.6001211920894618e-05, "loss": 1.5151, "step": 32190 }, { "epoch": 5.321214625077463, "grad_norm": 11.241543769836426, "learning_rate": 2.5992030701996e-05, "loss": 1.4425, "step": 32200 }, { "epoch": 5.322867176203264, "grad_norm": 12.246685981750488, "learning_rate": 2.5982849483097376e-05, "loss": 1.6109, "step": 32210 }, { "epoch": 5.324519727329064, "grad_norm": 30.807636260986328, "learning_rate": 2.597366826419876e-05, "loss": 1.4588, "step": 32220 }, { "epoch": 5.326172278454865, "grad_norm": 19.119007110595703, "learning_rate": 2.5964487045300135e-05, "loss": 1.4311, "step": 32230 }, { "epoch": 5.327824829580665, "grad_norm": 12.0513277053833, "learning_rate": 2.5955305826401517e-05, "loss": 1.5163, "step": 32240 }, { "epoch": 5.329477380706465, "grad_norm": 11.274496078491211, "learning_rate": 2.5946124607502893e-05, "loss": 1.3246, "step": 32250 }, { "epoch": 5.331129931832266, "grad_norm": 56.508384704589844, "learning_rate": 2.5936943388604273e-05, "loss": 1.5643, "step": 32260 }, { "epoch": 5.332782482958066, "grad_norm": 14.02528190612793, "learning_rate": 2.592776216970565e-05, "loss": 1.5164, "step": 32270 }, { "epoch": 5.334435034083867, "grad_norm": 16.86151123046875, "learning_rate": 2.591858095080703e-05, "loss": 1.4939, "step": 32280 }, { "epoch": 5.336087585209667, "grad_norm": 8.078414916992188, "learning_rate": 2.5909399731908407e-05, "loss": 1.4786, "step": 32290 }, { "epoch": 5.3377401363354675, "grad_norm": 11.507722854614258, "learning_rate": 2.590021851300979e-05, "loss": 1.4795, "step": 32300 }, { "epoch": 5.3393926874612685, "grad_norm": 20.504240036010742, "learning_rate": 2.589103729411117e-05, "loss": 1.5318, "step": 32310 }, { "epoch": 5.341045238587069, "grad_norm": 8.86858081817627, "learning_rate": 2.5881856075212545e-05, "loss": 1.5606, "step": 32320 }, { "epoch": 5.34269778971287, "grad_norm": 6.756838321685791, "learning_rate": 2.5872674856313927e-05, "loss": 1.4607, "step": 32330 }, { "epoch": 5.34435034083867, "grad_norm": 17.09296989440918, "learning_rate": 2.5863493637415303e-05, "loss": 1.5001, "step": 32340 }, { "epoch": 5.34600289196447, "grad_norm": 7.092482566833496, "learning_rate": 2.5854312418516686e-05, "loss": 1.5361, "step": 32350 }, { "epoch": 5.347655443090271, "grad_norm": 10.106451988220215, "learning_rate": 2.584513119961806e-05, "loss": 1.4626, "step": 32360 }, { "epoch": 5.349307994216071, "grad_norm": 7.64391565322876, "learning_rate": 2.5835949980719444e-05, "loss": 1.5781, "step": 32370 }, { "epoch": 5.350960545341872, "grad_norm": 10.09816837310791, "learning_rate": 2.582676876182082e-05, "loss": 1.5339, "step": 32380 }, { "epoch": 5.352613096467672, "grad_norm": 12.53437328338623, "learning_rate": 2.58175875429222e-05, "loss": 1.4137, "step": 32390 }, { "epoch": 5.354265647593472, "grad_norm": 9.417365074157715, "learning_rate": 2.5808406324023575e-05, "loss": 1.4312, "step": 32400 }, { "epoch": 5.355918198719273, "grad_norm": 8.617480278015137, "learning_rate": 2.5799225105124958e-05, "loss": 1.527, "step": 32410 }, { "epoch": 5.357570749845073, "grad_norm": 13.1574068069458, "learning_rate": 2.579004388622634e-05, "loss": 1.6434, "step": 32420 }, { "epoch": 5.359223300970874, "grad_norm": 7.331067085266113, "learning_rate": 2.5780862667327716e-05, "loss": 1.4821, "step": 32430 }, { "epoch": 5.360875852096674, "grad_norm": 9.73812484741211, "learning_rate": 2.5771681448429096e-05, "loss": 1.4768, "step": 32440 }, { "epoch": 5.362528403222475, "grad_norm": 9.941301345825195, "learning_rate": 2.576250022953047e-05, "loss": 1.5879, "step": 32450 }, { "epoch": 5.364180954348275, "grad_norm": 12.169427871704102, "learning_rate": 2.5753319010631854e-05, "loss": 1.4923, "step": 32460 }, { "epoch": 5.365833505474075, "grad_norm": 12.982132911682129, "learning_rate": 2.574413779173323e-05, "loss": 1.3733, "step": 32470 }, { "epoch": 5.367486056599876, "grad_norm": 7.65261173248291, "learning_rate": 2.5734956572834613e-05, "loss": 1.436, "step": 32480 }, { "epoch": 5.369138607725676, "grad_norm": 9.102943420410156, "learning_rate": 2.572577535393599e-05, "loss": 1.515, "step": 32490 }, { "epoch": 5.3707911588514765, "grad_norm": 5.440056800842285, "learning_rate": 2.571659413503737e-05, "loss": 1.4411, "step": 32500 }, { "epoch": 5.3724437099772775, "grad_norm": 6.815498352050781, "learning_rate": 2.5707412916138747e-05, "loss": 1.4195, "step": 32510 }, { "epoch": 5.374096261103078, "grad_norm": 9.051548957824707, "learning_rate": 2.5698231697240126e-05, "loss": 1.4627, "step": 32520 }, { "epoch": 5.375748812228879, "grad_norm": 10.097661972045898, "learning_rate": 2.5689050478341502e-05, "loss": 1.5081, "step": 32530 }, { "epoch": 5.377401363354679, "grad_norm": 8.773504257202148, "learning_rate": 2.5679869259442885e-05, "loss": 1.5666, "step": 32540 }, { "epoch": 5.37905391448048, "grad_norm": 9.091877937316895, "learning_rate": 2.5670688040544267e-05, "loss": 1.5096, "step": 32550 }, { "epoch": 5.38070646560628, "grad_norm": 6.017947196960449, "learning_rate": 2.5661506821645643e-05, "loss": 1.4604, "step": 32560 }, { "epoch": 5.38235901673208, "grad_norm": 8.248211860656738, "learning_rate": 2.5652325602747022e-05, "loss": 1.4782, "step": 32570 }, { "epoch": 5.384011567857881, "grad_norm": 8.899044036865234, "learning_rate": 2.5643144383848398e-05, "loss": 1.4193, "step": 32580 }, { "epoch": 5.385664118983681, "grad_norm": 10.997679710388184, "learning_rate": 2.563396316494978e-05, "loss": 1.469, "step": 32590 }, { "epoch": 5.387316670109482, "grad_norm": 14.518561363220215, "learning_rate": 2.5624781946051157e-05, "loss": 1.447, "step": 32600 }, { "epoch": 5.388969221235282, "grad_norm": 12.715332984924316, "learning_rate": 2.561560072715254e-05, "loss": 1.55, "step": 32610 }, { "epoch": 5.390621772361082, "grad_norm": 74.79273223876953, "learning_rate": 2.5606419508253915e-05, "loss": 1.3885, "step": 32620 }, { "epoch": 5.392274323486883, "grad_norm": 13.330283164978027, "learning_rate": 2.5597238289355298e-05, "loss": 1.5348, "step": 32630 }, { "epoch": 5.393926874612683, "grad_norm": 11.709308624267578, "learning_rate": 2.5588057070456674e-05, "loss": 1.5496, "step": 32640 }, { "epoch": 5.395579425738484, "grad_norm": 7.356724739074707, "learning_rate": 2.5578875851558053e-05, "loss": 1.3092, "step": 32650 }, { "epoch": 5.397231976864284, "grad_norm": 7.576127529144287, "learning_rate": 2.5569694632659436e-05, "loss": 1.3922, "step": 32660 }, { "epoch": 5.398884527990084, "grad_norm": 8.769586563110352, "learning_rate": 2.556051341376081e-05, "loss": 1.4997, "step": 32670 }, { "epoch": 5.400537079115885, "grad_norm": 8.266812324523926, "learning_rate": 2.5551332194862194e-05, "loss": 1.504, "step": 32680 }, { "epoch": 5.402189630241685, "grad_norm": 11.372197151184082, "learning_rate": 2.554215097596357e-05, "loss": 1.4409, "step": 32690 }, { "epoch": 5.403842181367486, "grad_norm": 19.209890365600586, "learning_rate": 2.553296975706495e-05, "loss": 1.5131, "step": 32700 }, { "epoch": 5.4054947324932865, "grad_norm": 11.207270622253418, "learning_rate": 2.5523788538166325e-05, "loss": 1.4532, "step": 32710 }, { "epoch": 5.407147283619087, "grad_norm": 7.834964752197266, "learning_rate": 2.5514607319267708e-05, "loss": 1.4809, "step": 32720 }, { "epoch": 5.408799834744888, "grad_norm": 10.717996597290039, "learning_rate": 2.5505426100369084e-05, "loss": 1.4442, "step": 32730 }, { "epoch": 5.410452385870688, "grad_norm": 7.571774005889893, "learning_rate": 2.5496244881470466e-05, "loss": 1.5859, "step": 32740 }, { "epoch": 5.412104936996489, "grad_norm": 9.474056243896484, "learning_rate": 2.5487063662571842e-05, "loss": 1.5151, "step": 32750 }, { "epoch": 5.413757488122289, "grad_norm": 8.241455078125, "learning_rate": 2.5477882443673225e-05, "loss": 1.4181, "step": 32760 }, { "epoch": 5.415410039248089, "grad_norm": 8.962974548339844, "learning_rate": 2.5468701224774604e-05, "loss": 1.4505, "step": 32770 }, { "epoch": 5.41706259037389, "grad_norm": 9.275091171264648, "learning_rate": 2.545952000587598e-05, "loss": 1.4723, "step": 32780 }, { "epoch": 5.41871514149969, "grad_norm": 9.588011741638184, "learning_rate": 2.5450338786977362e-05, "loss": 1.5176, "step": 32790 }, { "epoch": 5.420367692625491, "grad_norm": 6.4946441650390625, "learning_rate": 2.544115756807874e-05, "loss": 1.4376, "step": 32800 }, { "epoch": 5.422020243751291, "grad_norm": 13.272873878479004, "learning_rate": 2.543197634918012e-05, "loss": 1.5343, "step": 32810 }, { "epoch": 5.423672794877092, "grad_norm": 9.417807579040527, "learning_rate": 2.5422795130281497e-05, "loss": 1.4349, "step": 32820 }, { "epoch": 5.425325346002892, "grad_norm": 16.028039932250977, "learning_rate": 2.5413613911382876e-05, "loss": 1.464, "step": 32830 }, { "epoch": 5.426977897128692, "grad_norm": 6.217875003814697, "learning_rate": 2.5404432692484252e-05, "loss": 1.4224, "step": 32840 }, { "epoch": 5.428630448254493, "grad_norm": 4.6173529624938965, "learning_rate": 2.5395251473585635e-05, "loss": 1.389, "step": 32850 }, { "epoch": 5.430282999380293, "grad_norm": 16.59287452697754, "learning_rate": 2.538607025468701e-05, "loss": 1.406, "step": 32860 }, { "epoch": 5.431935550506093, "grad_norm": 28.710718154907227, "learning_rate": 2.5376889035788393e-05, "loss": 1.5599, "step": 32870 }, { "epoch": 5.433588101631894, "grad_norm": 6.974539756774902, "learning_rate": 2.5367707816889776e-05, "loss": 1.5168, "step": 32880 }, { "epoch": 5.435240652757694, "grad_norm": 18.166751861572266, "learning_rate": 2.535852659799115e-05, "loss": 1.4325, "step": 32890 }, { "epoch": 5.436893203883495, "grad_norm": 13.46557331085205, "learning_rate": 2.534934537909253e-05, "loss": 1.3934, "step": 32900 }, { "epoch": 5.4385457550092955, "grad_norm": 11.49219036102295, "learning_rate": 2.5340164160193907e-05, "loss": 1.4228, "step": 32910 }, { "epoch": 5.4401983061350965, "grad_norm": 12.140527725219727, "learning_rate": 2.533098294129529e-05, "loss": 1.5574, "step": 32920 }, { "epoch": 5.441850857260897, "grad_norm": 13.28624153137207, "learning_rate": 2.5321801722396665e-05, "loss": 1.4256, "step": 32930 }, { "epoch": 5.443503408386697, "grad_norm": 19.24834632873535, "learning_rate": 2.5312620503498048e-05, "loss": 1.4024, "step": 32940 }, { "epoch": 5.445155959512498, "grad_norm": 12.811565399169922, "learning_rate": 2.5303439284599424e-05, "loss": 1.5038, "step": 32950 }, { "epoch": 5.446808510638298, "grad_norm": 135.61062622070312, "learning_rate": 2.5294258065700803e-05, "loss": 1.5554, "step": 32960 }, { "epoch": 5.448461061764099, "grad_norm": 13.871460914611816, "learning_rate": 2.528507684680218e-05, "loss": 1.5125, "step": 32970 }, { "epoch": 5.450113612889899, "grad_norm": 7.033801078796387, "learning_rate": 2.527589562790356e-05, "loss": 1.6154, "step": 32980 }, { "epoch": 5.451766164015699, "grad_norm": 17.240575790405273, "learning_rate": 2.5266714409004944e-05, "loss": 1.4924, "step": 32990 }, { "epoch": 5.4534187151415, "grad_norm": 11.401257514953613, "learning_rate": 2.525753319010632e-05, "loss": 1.449, "step": 33000 }, { "epoch": 5.4550712662673, "grad_norm": 23.046972274780273, "learning_rate": 2.5248351971207703e-05, "loss": 1.3946, "step": 33010 }, { "epoch": 5.456723817393101, "grad_norm": 13.394057273864746, "learning_rate": 2.523917075230908e-05, "loss": 1.4518, "step": 33020 }, { "epoch": 5.458376368518901, "grad_norm": 7.417761325836182, "learning_rate": 2.5229989533410458e-05, "loss": 1.5553, "step": 33030 }, { "epoch": 5.460028919644701, "grad_norm": 15.697338104248047, "learning_rate": 2.5220808314511834e-05, "loss": 1.3501, "step": 33040 }, { "epoch": 5.461681470770502, "grad_norm": 13.63474178314209, "learning_rate": 2.5211627095613216e-05, "loss": 1.5208, "step": 33050 }, { "epoch": 5.463334021896302, "grad_norm": 15.581018447875977, "learning_rate": 2.5202445876714592e-05, "loss": 1.3335, "step": 33060 }, { "epoch": 5.464986573022103, "grad_norm": 15.496585845947266, "learning_rate": 2.5193264657815975e-05, "loss": 1.3965, "step": 33070 }, { "epoch": 5.466639124147903, "grad_norm": 11.815439224243164, "learning_rate": 2.518408343891735e-05, "loss": 1.4156, "step": 33080 }, { "epoch": 5.468291675273703, "grad_norm": 15.371979713439941, "learning_rate": 2.517490222001873e-05, "loss": 1.452, "step": 33090 }, { "epoch": 5.469944226399504, "grad_norm": 14.599471092224121, "learning_rate": 2.5165721001120106e-05, "loss": 1.4849, "step": 33100 }, { "epoch": 5.4715967775253045, "grad_norm": 16.83903694152832, "learning_rate": 2.5156539782221488e-05, "loss": 1.3825, "step": 33110 }, { "epoch": 5.4732493286511055, "grad_norm": 11.542963027954102, "learning_rate": 2.514735856332287e-05, "loss": 1.4628, "step": 33120 }, { "epoch": 5.474901879776906, "grad_norm": 44.345481872558594, "learning_rate": 2.5138177344424247e-05, "loss": 1.3834, "step": 33130 }, { "epoch": 5.476554430902706, "grad_norm": 13.720561981201172, "learning_rate": 2.512899612552563e-05, "loss": 1.3351, "step": 33140 }, { "epoch": 5.478206982028507, "grad_norm": 11.957843780517578, "learning_rate": 2.5119814906627005e-05, "loss": 1.4726, "step": 33150 }, { "epoch": 5.479859533154307, "grad_norm": 11.13582706451416, "learning_rate": 2.5110633687728384e-05, "loss": 1.3245, "step": 33160 }, { "epoch": 5.481512084280108, "grad_norm": 33.86077117919922, "learning_rate": 2.510145246882976e-05, "loss": 1.6398, "step": 33170 }, { "epoch": 5.483164635405908, "grad_norm": 19.655872344970703, "learning_rate": 2.5092271249931143e-05, "loss": 1.5043, "step": 33180 }, { "epoch": 5.484817186531708, "grad_norm": 10.309099197387695, "learning_rate": 2.508309003103252e-05, "loss": 1.4599, "step": 33190 }, { "epoch": 5.486469737657509, "grad_norm": 11.701153755187988, "learning_rate": 2.50739088121339e-05, "loss": 1.4324, "step": 33200 }, { "epoch": 5.488122288783309, "grad_norm": 12.09907054901123, "learning_rate": 2.5064727593235277e-05, "loss": 1.4819, "step": 33210 }, { "epoch": 5.48977483990911, "grad_norm": 11.532211303710938, "learning_rate": 2.5055546374336657e-05, "loss": 1.4336, "step": 33220 }, { "epoch": 5.49142739103491, "grad_norm": 7.926141738891602, "learning_rate": 2.504636515543804e-05, "loss": 1.4861, "step": 33230 }, { "epoch": 5.49307994216071, "grad_norm": 9.352069854736328, "learning_rate": 2.5037183936539415e-05, "loss": 1.403, "step": 33240 }, { "epoch": 5.494732493286511, "grad_norm": 8.540033340454102, "learning_rate": 2.5028002717640798e-05, "loss": 1.4863, "step": 33250 }, { "epoch": 5.496385044412311, "grad_norm": 119.00782775878906, "learning_rate": 2.5018821498742174e-05, "loss": 1.4733, "step": 33260 }, { "epoch": 5.498037595538112, "grad_norm": 9.270196914672852, "learning_rate": 2.5009640279843556e-05, "loss": 1.5347, "step": 33270 }, { "epoch": 5.499690146663912, "grad_norm": 7.050109386444092, "learning_rate": 2.5000459060944932e-05, "loss": 1.4698, "step": 33280 }, { "epoch": 5.501342697789713, "grad_norm": 15.336756706237793, "learning_rate": 2.499127784204631e-05, "loss": 1.3084, "step": 33290 }, { "epoch": 5.502995248915513, "grad_norm": 7.765586853027344, "learning_rate": 2.498209662314769e-05, "loss": 1.46, "step": 33300 }, { "epoch": 5.5046478000413135, "grad_norm": 22.45989227294922, "learning_rate": 2.497291540424907e-05, "loss": 1.5559, "step": 33310 }, { "epoch": 5.5063003511671145, "grad_norm": 16.461627960205078, "learning_rate": 2.496373418535045e-05, "loss": 1.5358, "step": 33320 }, { "epoch": 5.507952902292915, "grad_norm": 13.17129898071289, "learning_rate": 2.4954552966451828e-05, "loss": 1.4679, "step": 33330 }, { "epoch": 5.509605453418715, "grad_norm": 8.983606338500977, "learning_rate": 2.4945371747553208e-05, "loss": 1.4101, "step": 33340 }, { "epoch": 5.511258004544516, "grad_norm": 16.77591323852539, "learning_rate": 2.4936190528654583e-05, "loss": 1.4592, "step": 33350 }, { "epoch": 5.512910555670316, "grad_norm": 20.5835018157959, "learning_rate": 2.4927009309755963e-05, "loss": 1.4214, "step": 33360 }, { "epoch": 5.514563106796117, "grad_norm": 14.100784301757812, "learning_rate": 2.4917828090857342e-05, "loss": 1.4042, "step": 33370 }, { "epoch": 5.516215657921917, "grad_norm": 8.669175148010254, "learning_rate": 2.490864687195872e-05, "loss": 1.3463, "step": 33380 }, { "epoch": 5.517868209047718, "grad_norm": 19.604291915893555, "learning_rate": 2.4899465653060104e-05, "loss": 1.3455, "step": 33390 }, { "epoch": 5.519520760173518, "grad_norm": 13.911442756652832, "learning_rate": 2.4890284434161483e-05, "loss": 1.5867, "step": 33400 }, { "epoch": 5.521173311299318, "grad_norm": 9.792315483093262, "learning_rate": 2.488110321526286e-05, "loss": 1.4837, "step": 33410 }, { "epoch": 5.522825862425119, "grad_norm": 11.161901473999023, "learning_rate": 2.4871921996364238e-05, "loss": 1.4366, "step": 33420 }, { "epoch": 5.524478413550919, "grad_norm": 12.950854301452637, "learning_rate": 2.4862740777465617e-05, "loss": 1.3821, "step": 33430 }, { "epoch": 5.52613096467672, "grad_norm": 49.98655700683594, "learning_rate": 2.4853559558566997e-05, "loss": 1.5996, "step": 33440 }, { "epoch": 5.52778351580252, "grad_norm": 12.21288013458252, "learning_rate": 2.4844378339668376e-05, "loss": 1.5804, "step": 33450 }, { "epoch": 5.52943606692832, "grad_norm": 7.666236400604248, "learning_rate": 2.4835197120769755e-05, "loss": 1.665, "step": 33460 }, { "epoch": 5.531088618054121, "grad_norm": 11.325262069702148, "learning_rate": 2.4826015901871134e-05, "loss": 1.5233, "step": 33470 }, { "epoch": 5.532741169179921, "grad_norm": 9.03773021697998, "learning_rate": 2.481683468297251e-05, "loss": 1.5621, "step": 33480 }, { "epoch": 5.534393720305722, "grad_norm": 9.082817077636719, "learning_rate": 2.480765346407389e-05, "loss": 1.5058, "step": 33490 }, { "epoch": 5.536046271431522, "grad_norm": 7.868617534637451, "learning_rate": 2.479847224517527e-05, "loss": 1.4172, "step": 33500 }, { "epoch": 5.5376988225573225, "grad_norm": 16.261873245239258, "learning_rate": 2.478929102627665e-05, "loss": 1.4935, "step": 33510 }, { "epoch": 5.5393513736831235, "grad_norm": 8.341659545898438, "learning_rate": 2.478010980737803e-05, "loss": 1.6001, "step": 33520 }, { "epoch": 5.541003924808924, "grad_norm": 8.13469123840332, "learning_rate": 2.477092858847941e-05, "loss": 1.5278, "step": 33530 }, { "epoch": 5.5426564759347245, "grad_norm": 8.067828178405762, "learning_rate": 2.4761747369580786e-05, "loss": 1.3339, "step": 33540 }, { "epoch": 5.544309027060525, "grad_norm": 9.125925064086914, "learning_rate": 2.4752566150682165e-05, "loss": 1.4147, "step": 33550 }, { "epoch": 5.545961578186326, "grad_norm": 8.341336250305176, "learning_rate": 2.4743384931783544e-05, "loss": 1.4189, "step": 33560 }, { "epoch": 5.547614129312126, "grad_norm": 10.543508529663086, "learning_rate": 2.4734203712884923e-05, "loss": 1.4079, "step": 33570 }, { "epoch": 5.549266680437926, "grad_norm": 16.04317283630371, "learning_rate": 2.4725022493986303e-05, "loss": 1.6624, "step": 33580 }, { "epoch": 5.550919231563727, "grad_norm": 10.512799263000488, "learning_rate": 2.4715841275087682e-05, "loss": 1.4323, "step": 33590 }, { "epoch": 5.552571782689527, "grad_norm": 8.222831726074219, "learning_rate": 2.470666005618906e-05, "loss": 1.604, "step": 33600 }, { "epoch": 5.554224333815327, "grad_norm": 8.435620307922363, "learning_rate": 2.4697478837290437e-05, "loss": 1.5879, "step": 33610 }, { "epoch": 5.555876884941128, "grad_norm": 16.17806625366211, "learning_rate": 2.468829761839182e-05, "loss": 1.4183, "step": 33620 }, { "epoch": 5.557529436066928, "grad_norm": 10.076927185058594, "learning_rate": 2.46791163994932e-05, "loss": 1.332, "step": 33630 }, { "epoch": 5.559181987192729, "grad_norm": 8.407470703125, "learning_rate": 2.4669935180594578e-05, "loss": 1.5482, "step": 33640 }, { "epoch": 5.560834538318529, "grad_norm": 5.60792350769043, "learning_rate": 2.4660753961695957e-05, "loss": 1.4146, "step": 33650 }, { "epoch": 5.56248708944433, "grad_norm": 9.023984909057617, "learning_rate": 2.4651572742797337e-05, "loss": 1.4729, "step": 33660 }, { "epoch": 5.56413964057013, "grad_norm": 9.371933937072754, "learning_rate": 2.4642391523898713e-05, "loss": 1.4933, "step": 33670 }, { "epoch": 5.56579219169593, "grad_norm": 11.463881492614746, "learning_rate": 2.4633210305000092e-05, "loss": 1.3909, "step": 33680 }, { "epoch": 5.567444742821731, "grad_norm": 22.128860473632812, "learning_rate": 2.462402908610147e-05, "loss": 1.5586, "step": 33690 }, { "epoch": 5.569097293947531, "grad_norm": 7.879482746124268, "learning_rate": 2.461484786720285e-05, "loss": 1.4793, "step": 33700 }, { "epoch": 5.5707498450733315, "grad_norm": 16.057891845703125, "learning_rate": 2.460566664830423e-05, "loss": 1.4825, "step": 33710 }, { "epoch": 5.5724023961991325, "grad_norm": 21.956098556518555, "learning_rate": 2.459648542940561e-05, "loss": 1.4789, "step": 33720 }, { "epoch": 5.574054947324933, "grad_norm": 10.628647804260254, "learning_rate": 2.4587304210506988e-05, "loss": 1.4485, "step": 33730 }, { "epoch": 5.5757074984507335, "grad_norm": 22.851346969604492, "learning_rate": 2.4578122991608367e-05, "loss": 1.5087, "step": 33740 }, { "epoch": 5.577360049576534, "grad_norm": 7.7167744636535645, "learning_rate": 2.4568941772709747e-05, "loss": 1.4648, "step": 33750 }, { "epoch": 5.579012600702335, "grad_norm": 32.65045166015625, "learning_rate": 2.4559760553811126e-05, "loss": 1.4757, "step": 33760 }, { "epoch": 5.580665151828135, "grad_norm": 9.209506034851074, "learning_rate": 2.4550579334912505e-05, "loss": 1.4859, "step": 33770 }, { "epoch": 5.582317702953935, "grad_norm": 10.900243759155273, "learning_rate": 2.4541398116013884e-05, "loss": 1.5101, "step": 33780 }, { "epoch": 5.583970254079736, "grad_norm": 13.830432891845703, "learning_rate": 2.4532216897115264e-05, "loss": 1.5847, "step": 33790 }, { "epoch": 5.585622805205536, "grad_norm": 13.913232803344727, "learning_rate": 2.452303567821664e-05, "loss": 1.419, "step": 33800 }, { "epoch": 5.587275356331337, "grad_norm": 19.409276962280273, "learning_rate": 2.451385445931802e-05, "loss": 1.5777, "step": 33810 }, { "epoch": 5.588927907457137, "grad_norm": 14.467093467712402, "learning_rate": 2.4504673240419398e-05, "loss": 1.635, "step": 33820 }, { "epoch": 5.590580458582937, "grad_norm": 9.140510559082031, "learning_rate": 2.4495492021520777e-05, "loss": 1.5292, "step": 33830 }, { "epoch": 5.592233009708738, "grad_norm": 7.822348594665527, "learning_rate": 2.4486310802622156e-05, "loss": 1.3846, "step": 33840 }, { "epoch": 5.593885560834538, "grad_norm": 9.017069816589355, "learning_rate": 2.447712958372354e-05, "loss": 1.4916, "step": 33850 }, { "epoch": 5.595538111960339, "grad_norm": 11.385196685791016, "learning_rate": 2.4467948364824915e-05, "loss": 1.508, "step": 33860 }, { "epoch": 5.597190663086139, "grad_norm": 7.629830837249756, "learning_rate": 2.4458767145926294e-05, "loss": 1.4346, "step": 33870 }, { "epoch": 5.598843214211939, "grad_norm": 7.943394184112549, "learning_rate": 2.4449585927027673e-05, "loss": 1.4446, "step": 33880 }, { "epoch": 5.60049576533774, "grad_norm": 12.818558692932129, "learning_rate": 2.4440404708129053e-05, "loss": 1.4871, "step": 33890 }, { "epoch": 5.60214831646354, "grad_norm": 8.883275985717773, "learning_rate": 2.4431223489230432e-05, "loss": 1.4815, "step": 33900 }, { "epoch": 5.603800867589341, "grad_norm": 12.792842864990234, "learning_rate": 2.442204227033181e-05, "loss": 1.4886, "step": 33910 }, { "epoch": 5.6054534187151415, "grad_norm": 9.756999015808105, "learning_rate": 2.441286105143319e-05, "loss": 1.4582, "step": 33920 }, { "epoch": 5.6071059698409424, "grad_norm": 11.55717658996582, "learning_rate": 2.4403679832534566e-05, "loss": 1.6197, "step": 33930 }, { "epoch": 5.6087585209667425, "grad_norm": 6.848897933959961, "learning_rate": 2.4394498613635945e-05, "loss": 1.4918, "step": 33940 }, { "epoch": 5.610411072092543, "grad_norm": 7.045119762420654, "learning_rate": 2.4385317394737325e-05, "loss": 1.6231, "step": 33950 }, { "epoch": 5.612063623218344, "grad_norm": 12.93820571899414, "learning_rate": 2.4376136175838707e-05, "loss": 1.4263, "step": 33960 }, { "epoch": 5.613716174344144, "grad_norm": 11.486026763916016, "learning_rate": 2.4366954956940087e-05, "loss": 1.4914, "step": 33970 }, { "epoch": 5.615368725469944, "grad_norm": 9.217084884643555, "learning_rate": 2.4357773738041466e-05, "loss": 1.4765, "step": 33980 }, { "epoch": 5.617021276595745, "grad_norm": 20.32526206970215, "learning_rate": 2.4348592519142842e-05, "loss": 1.493, "step": 33990 }, { "epoch": 5.618673827721545, "grad_norm": 12.086780548095703, "learning_rate": 2.433941130024422e-05, "loss": 1.5825, "step": 34000 }, { "epoch": 5.620326378847346, "grad_norm": 11.141498565673828, "learning_rate": 2.43302300813456e-05, "loss": 1.4825, "step": 34010 }, { "epoch": 5.621978929973146, "grad_norm": 6.86226224899292, "learning_rate": 2.432104886244698e-05, "loss": 1.5988, "step": 34020 }, { "epoch": 5.623631481098947, "grad_norm": 7.528665542602539, "learning_rate": 2.431186764354836e-05, "loss": 1.5774, "step": 34030 }, { "epoch": 5.625284032224747, "grad_norm": 11.368343353271484, "learning_rate": 2.4302686424649738e-05, "loss": 1.3792, "step": 34040 }, { "epoch": 5.626936583350547, "grad_norm": 5.986963272094727, "learning_rate": 2.4293505205751117e-05, "loss": 1.4285, "step": 34050 }, { "epoch": 5.628589134476348, "grad_norm": 9.011881828308105, "learning_rate": 2.4284323986852493e-05, "loss": 1.476, "step": 34060 }, { "epoch": 5.630241685602148, "grad_norm": 8.9256010055542, "learning_rate": 2.4275142767953872e-05, "loss": 1.4706, "step": 34070 }, { "epoch": 5.631894236727948, "grad_norm": 34.386566162109375, "learning_rate": 2.4265961549055255e-05, "loss": 1.3625, "step": 34080 }, { "epoch": 5.633546787853749, "grad_norm": 15.83523178100586, "learning_rate": 2.4256780330156634e-05, "loss": 1.3465, "step": 34090 }, { "epoch": 5.635199338979549, "grad_norm": 9.349414825439453, "learning_rate": 2.4247599111258013e-05, "loss": 1.4396, "step": 34100 }, { "epoch": 5.63685189010535, "grad_norm": 13.072619438171387, "learning_rate": 2.4238417892359393e-05, "loss": 1.5748, "step": 34110 }, { "epoch": 5.6385044412311505, "grad_norm": 8.265012741088867, "learning_rate": 2.422923667346077e-05, "loss": 1.4326, "step": 34120 }, { "epoch": 5.6401569923569514, "grad_norm": 14.93253231048584, "learning_rate": 2.4220055454562148e-05, "loss": 1.4903, "step": 34130 }, { "epoch": 5.6418095434827515, "grad_norm": 8.096601486206055, "learning_rate": 2.4210874235663527e-05, "loss": 1.384, "step": 34140 }, { "epoch": 5.643462094608552, "grad_norm": 7.2971343994140625, "learning_rate": 2.4201693016764906e-05, "loss": 1.4753, "step": 34150 }, { "epoch": 5.645114645734353, "grad_norm": 7.540863990783691, "learning_rate": 2.4192511797866286e-05, "loss": 1.3667, "step": 34160 }, { "epoch": 5.646767196860153, "grad_norm": 5.347985744476318, "learning_rate": 2.4183330578967665e-05, "loss": 1.5807, "step": 34170 }, { "epoch": 5.648419747985954, "grad_norm": 10.622434616088867, "learning_rate": 2.4174149360069044e-05, "loss": 1.5232, "step": 34180 }, { "epoch": 5.650072299111754, "grad_norm": 17.396007537841797, "learning_rate": 2.4164968141170423e-05, "loss": 1.4468, "step": 34190 }, { "epoch": 5.651724850237554, "grad_norm": 17.4361629486084, "learning_rate": 2.4155786922271803e-05, "loss": 1.5216, "step": 34200 }, { "epoch": 5.653377401363355, "grad_norm": 12.362852096557617, "learning_rate": 2.4146605703373182e-05, "loss": 1.5796, "step": 34210 }, { "epoch": 5.655029952489155, "grad_norm": 8.334657669067383, "learning_rate": 2.413742448447456e-05, "loss": 1.5931, "step": 34220 }, { "epoch": 5.656682503614956, "grad_norm": 15.316583633422852, "learning_rate": 2.412824326557594e-05, "loss": 1.5869, "step": 34230 }, { "epoch": 5.658335054740756, "grad_norm": 17.858875274658203, "learning_rate": 2.411906204667732e-05, "loss": 1.5354, "step": 34240 }, { "epoch": 5.659987605866556, "grad_norm": 15.654756546020508, "learning_rate": 2.4109880827778695e-05, "loss": 1.5952, "step": 34250 }, { "epoch": 5.661640156992357, "grad_norm": 5.612504005432129, "learning_rate": 2.4100699608880075e-05, "loss": 1.3402, "step": 34260 }, { "epoch": 5.663292708118157, "grad_norm": 12.783700942993164, "learning_rate": 2.4091518389981454e-05, "loss": 1.4176, "step": 34270 }, { "epoch": 5.664945259243958, "grad_norm": 7.64247989654541, "learning_rate": 2.4082337171082833e-05, "loss": 1.4333, "step": 34280 }, { "epoch": 5.666597810369758, "grad_norm": 7.911691665649414, "learning_rate": 2.4073155952184212e-05, "loss": 1.3645, "step": 34290 }, { "epoch": 5.668250361495558, "grad_norm": 8.747984886169434, "learning_rate": 2.406397473328559e-05, "loss": 1.5304, "step": 34300 }, { "epoch": 5.669902912621359, "grad_norm": 12.237468719482422, "learning_rate": 2.405479351438697e-05, "loss": 1.3955, "step": 34310 }, { "epoch": 5.6715554637471595, "grad_norm": 9.102463722229004, "learning_rate": 2.404561229548835e-05, "loss": 1.4228, "step": 34320 }, { "epoch": 5.6732080148729604, "grad_norm": 11.882768630981445, "learning_rate": 2.403643107658973e-05, "loss": 1.3596, "step": 34330 }, { "epoch": 5.6748605659987605, "grad_norm": 10.731307029724121, "learning_rate": 2.402724985769111e-05, "loss": 1.6394, "step": 34340 }, { "epoch": 5.676513117124561, "grad_norm": 13.682147026062012, "learning_rate": 2.4018068638792488e-05, "loss": 1.4626, "step": 34350 }, { "epoch": 5.678165668250362, "grad_norm": 14.700006484985352, "learning_rate": 2.4008887419893867e-05, "loss": 1.5381, "step": 34360 }, { "epoch": 5.679818219376162, "grad_norm": 8.677534103393555, "learning_rate": 2.3999706200995246e-05, "loss": 1.3861, "step": 34370 }, { "epoch": 5.681470770501963, "grad_norm": 9.908595085144043, "learning_rate": 2.3990524982096622e-05, "loss": 1.3655, "step": 34380 }, { "epoch": 5.683123321627763, "grad_norm": 22.74381446838379, "learning_rate": 2.3981343763198e-05, "loss": 1.479, "step": 34390 }, { "epoch": 5.684775872753564, "grad_norm": 13.582764625549316, "learning_rate": 2.397216254429938e-05, "loss": 1.3896, "step": 34400 }, { "epoch": 5.686428423879364, "grad_norm": 21.719499588012695, "learning_rate": 2.396298132540076e-05, "loss": 1.6054, "step": 34410 }, { "epoch": 5.688080975005164, "grad_norm": 12.603348731994629, "learning_rate": 2.3953800106502143e-05, "loss": 1.4277, "step": 34420 }, { "epoch": 5.689733526130965, "grad_norm": 9.63592529296875, "learning_rate": 2.3944618887603522e-05, "loss": 1.485, "step": 34430 }, { "epoch": 5.691386077256765, "grad_norm": 13.966578483581543, "learning_rate": 2.3935437668704898e-05, "loss": 1.5003, "step": 34440 }, { "epoch": 5.693038628382565, "grad_norm": 13.241275787353516, "learning_rate": 2.3926256449806277e-05, "loss": 1.5059, "step": 34450 }, { "epoch": 5.694691179508366, "grad_norm": 18.274063110351562, "learning_rate": 2.3917075230907656e-05, "loss": 1.4191, "step": 34460 }, { "epoch": 5.696343730634166, "grad_norm": 6.728181838989258, "learning_rate": 2.3907894012009035e-05, "loss": 1.5287, "step": 34470 }, { "epoch": 5.697996281759967, "grad_norm": 9.49169921875, "learning_rate": 2.3898712793110415e-05, "loss": 1.5762, "step": 34480 }, { "epoch": 5.699648832885767, "grad_norm": 13.1586332321167, "learning_rate": 2.3889531574211794e-05, "loss": 1.4773, "step": 34490 }, { "epoch": 5.701301384011568, "grad_norm": 36.238487243652344, "learning_rate": 2.3880350355313173e-05, "loss": 1.4757, "step": 34500 }, { "epoch": 5.702953935137368, "grad_norm": 14.713821411132812, "learning_rate": 2.387116913641455e-05, "loss": 1.3241, "step": 34510 }, { "epoch": 5.7046064862631685, "grad_norm": 8.172168731689453, "learning_rate": 2.3861987917515928e-05, "loss": 1.4352, "step": 34520 }, { "epoch": 5.7062590373889694, "grad_norm": 12.12569808959961, "learning_rate": 2.3852806698617308e-05, "loss": 1.6369, "step": 34530 }, { "epoch": 5.7079115885147695, "grad_norm": 8.741676330566406, "learning_rate": 2.384362547971869e-05, "loss": 1.4149, "step": 34540 }, { "epoch": 5.70956413964057, "grad_norm": 15.845980644226074, "learning_rate": 2.383444426082007e-05, "loss": 1.4781, "step": 34550 }, { "epoch": 5.711216690766371, "grad_norm": 10.60511302947998, "learning_rate": 2.382526304192145e-05, "loss": 1.4277, "step": 34560 }, { "epoch": 5.712869241892171, "grad_norm": 12.912707328796387, "learning_rate": 2.3816081823022825e-05, "loss": 1.4863, "step": 34570 }, { "epoch": 5.714521793017972, "grad_norm": 7.532742977142334, "learning_rate": 2.3806900604124204e-05, "loss": 1.3594, "step": 34580 }, { "epoch": 5.716174344143772, "grad_norm": 109.51636505126953, "learning_rate": 2.3797719385225583e-05, "loss": 1.4491, "step": 34590 }, { "epoch": 5.717826895269573, "grad_norm": 25.65382957458496, "learning_rate": 2.3788538166326962e-05, "loss": 1.5347, "step": 34600 }, { "epoch": 5.719479446395373, "grad_norm": 6.745087146759033, "learning_rate": 2.377935694742834e-05, "loss": 1.5059, "step": 34610 }, { "epoch": 5.721131997521173, "grad_norm": 8.282033920288086, "learning_rate": 2.377017572852972e-05, "loss": 1.4472, "step": 34620 }, { "epoch": 5.722784548646974, "grad_norm": 12.051273345947266, "learning_rate": 2.37609945096311e-05, "loss": 1.4371, "step": 34630 }, { "epoch": 5.724437099772774, "grad_norm": 7.050853252410889, "learning_rate": 2.3751813290732476e-05, "loss": 1.5519, "step": 34640 }, { "epoch": 5.726089650898575, "grad_norm": 29.751380920410156, "learning_rate": 2.374263207183386e-05, "loss": 1.38, "step": 34650 }, { "epoch": 5.727742202024375, "grad_norm": 17.988632202148438, "learning_rate": 2.3733450852935238e-05, "loss": 1.4125, "step": 34660 }, { "epoch": 5.729394753150175, "grad_norm": 19.773149490356445, "learning_rate": 2.3724269634036617e-05, "loss": 1.3835, "step": 34670 }, { "epoch": 5.731047304275976, "grad_norm": 23.994436264038086, "learning_rate": 2.3715088415137996e-05, "loss": 1.4931, "step": 34680 }, { "epoch": 5.732699855401776, "grad_norm": 24.97893714904785, "learning_rate": 2.3705907196239375e-05, "loss": 1.3949, "step": 34690 }, { "epoch": 5.734352406527577, "grad_norm": 6.368709564208984, "learning_rate": 2.369672597734075e-05, "loss": 1.3729, "step": 34700 }, { "epoch": 5.736004957653377, "grad_norm": 14.183878898620605, "learning_rate": 2.368754475844213e-05, "loss": 1.5165, "step": 34710 }, { "epoch": 5.7376575087791775, "grad_norm": 11.956380844116211, "learning_rate": 2.367836353954351e-05, "loss": 1.4883, "step": 34720 }, { "epoch": 5.7393100599049784, "grad_norm": 16.989643096923828, "learning_rate": 2.366918232064489e-05, "loss": 1.5531, "step": 34730 }, { "epoch": 5.7409626110307785, "grad_norm": 14.136730194091797, "learning_rate": 2.366000110174627e-05, "loss": 1.4212, "step": 34740 }, { "epoch": 5.7426151621565795, "grad_norm": 10.436883926391602, "learning_rate": 2.3650819882847648e-05, "loss": 1.5093, "step": 34750 }, { "epoch": 5.74426771328238, "grad_norm": 23.723215103149414, "learning_rate": 2.3641638663949027e-05, "loss": 1.5963, "step": 34760 }, { "epoch": 5.745920264408181, "grad_norm": 29.90015983581543, "learning_rate": 2.3632457445050406e-05, "loss": 1.5902, "step": 34770 }, { "epoch": 5.747572815533981, "grad_norm": 20.19734764099121, "learning_rate": 2.3623276226151785e-05, "loss": 1.3964, "step": 34780 }, { "epoch": 5.749225366659781, "grad_norm": 7.617761611938477, "learning_rate": 2.3614095007253165e-05, "loss": 1.4068, "step": 34790 }, { "epoch": 5.750877917785582, "grad_norm": 12.926080703735352, "learning_rate": 2.3604913788354544e-05, "loss": 1.4668, "step": 34800 }, { "epoch": 5.752530468911382, "grad_norm": 10.092638969421387, "learning_rate": 2.3595732569455923e-05, "loss": 1.526, "step": 34810 }, { "epoch": 5.754183020037182, "grad_norm": 16.717660903930664, "learning_rate": 2.3586551350557302e-05, "loss": 1.5049, "step": 34820 }, { "epoch": 5.755835571162983, "grad_norm": 9.57276439666748, "learning_rate": 2.3577370131658678e-05, "loss": 1.4629, "step": 34830 }, { "epoch": 5.757488122288783, "grad_norm": 8.01769733428955, "learning_rate": 2.3568188912760057e-05, "loss": 1.4832, "step": 34840 }, { "epoch": 5.759140673414584, "grad_norm": 11.659310340881348, "learning_rate": 2.3559007693861437e-05, "loss": 1.4063, "step": 34850 }, { "epoch": 5.760793224540384, "grad_norm": 11.5115327835083, "learning_rate": 2.3549826474962816e-05, "loss": 1.4312, "step": 34860 }, { "epoch": 5.762445775666185, "grad_norm": 8.317569732666016, "learning_rate": 2.3540645256064195e-05, "loss": 1.4929, "step": 34870 }, { "epoch": 5.764098326791985, "grad_norm": 12.994110107421875, "learning_rate": 2.3531464037165578e-05, "loss": 1.4944, "step": 34880 }, { "epoch": 5.765750877917785, "grad_norm": 8.37218189239502, "learning_rate": 2.3522282818266954e-05, "loss": 1.4133, "step": 34890 }, { "epoch": 5.767403429043586, "grad_norm": 9.333100318908691, "learning_rate": 2.3513101599368333e-05, "loss": 1.4405, "step": 34900 }, { "epoch": 5.769055980169386, "grad_norm": 17.137237548828125, "learning_rate": 2.3503920380469712e-05, "loss": 1.513, "step": 34910 }, { "epoch": 5.7707085312951865, "grad_norm": 12.368675231933594, "learning_rate": 2.349473916157109e-05, "loss": 1.3646, "step": 34920 }, { "epoch": 5.772361082420987, "grad_norm": 14.37025260925293, "learning_rate": 2.348555794267247e-05, "loss": 1.5304, "step": 34930 }, { "epoch": 5.7740136335467875, "grad_norm": 40.79022216796875, "learning_rate": 2.347637672377385e-05, "loss": 1.3919, "step": 34940 }, { "epoch": 5.7756661846725885, "grad_norm": 8.884847640991211, "learning_rate": 2.346719550487523e-05, "loss": 1.4815, "step": 34950 }, { "epoch": 5.777318735798389, "grad_norm": 9.6146879196167, "learning_rate": 2.3458014285976605e-05, "loss": 1.4746, "step": 34960 }, { "epoch": 5.77897128692419, "grad_norm": 18.530406951904297, "learning_rate": 2.3448833067077984e-05, "loss": 1.644, "step": 34970 }, { "epoch": 5.78062383804999, "grad_norm": 26.01064109802246, "learning_rate": 2.3439651848179364e-05, "loss": 1.5561, "step": 34980 }, { "epoch": 5.78227638917579, "grad_norm": 6.7076239585876465, "learning_rate": 2.3430470629280746e-05, "loss": 1.441, "step": 34990 }, { "epoch": 5.783928940301591, "grad_norm": 58.96223831176758, "learning_rate": 2.3421289410382125e-05, "loss": 1.4621, "step": 35000 }, { "epoch": 5.785581491427391, "grad_norm": 12.242283821105957, "learning_rate": 2.3412108191483505e-05, "loss": 1.4991, "step": 35010 }, { "epoch": 5.787234042553192, "grad_norm": 60.0484619140625, "learning_rate": 2.340292697258488e-05, "loss": 1.4799, "step": 35020 }, { "epoch": 5.788886593678992, "grad_norm": 15.15187931060791, "learning_rate": 2.339374575368626e-05, "loss": 1.5587, "step": 35030 }, { "epoch": 5.790539144804792, "grad_norm": 14.363666534423828, "learning_rate": 2.338456453478764e-05, "loss": 1.5531, "step": 35040 }, { "epoch": 5.792191695930593, "grad_norm": 9.732187271118164, "learning_rate": 2.3375383315889018e-05, "loss": 1.5367, "step": 35050 }, { "epoch": 5.793844247056393, "grad_norm": 8.981781005859375, "learning_rate": 2.3366202096990397e-05, "loss": 1.4947, "step": 35060 }, { "epoch": 5.795496798182194, "grad_norm": 10.503220558166504, "learning_rate": 2.3357020878091777e-05, "loss": 1.3545, "step": 35070 }, { "epoch": 5.797149349307994, "grad_norm": 8.12704849243164, "learning_rate": 2.3347839659193156e-05, "loss": 1.5702, "step": 35080 }, { "epoch": 5.798801900433794, "grad_norm": 13.842459678649902, "learning_rate": 2.3338658440294532e-05, "loss": 1.4373, "step": 35090 }, { "epoch": 5.800454451559595, "grad_norm": 11.6895170211792, "learning_rate": 2.332947722139591e-05, "loss": 1.3984, "step": 35100 }, { "epoch": 5.802107002685395, "grad_norm": 55.09220886230469, "learning_rate": 2.3320296002497294e-05, "loss": 1.5888, "step": 35110 }, { "epoch": 5.803759553811196, "grad_norm": 10.366085052490234, "learning_rate": 2.3311114783598673e-05, "loss": 1.4724, "step": 35120 }, { "epoch": 5.805412104936996, "grad_norm": 12.669951438903809, "learning_rate": 2.3301933564700052e-05, "loss": 1.595, "step": 35130 }, { "epoch": 5.807064656062797, "grad_norm": 9.713029861450195, "learning_rate": 2.329275234580143e-05, "loss": 1.3403, "step": 35140 }, { "epoch": 5.8087172071885975, "grad_norm": 10.147992134094238, "learning_rate": 2.3283571126902807e-05, "loss": 1.4291, "step": 35150 }, { "epoch": 5.810369758314398, "grad_norm": 8.817466735839844, "learning_rate": 2.3274389908004187e-05, "loss": 1.4716, "step": 35160 }, { "epoch": 5.812022309440199, "grad_norm": 17.47516441345215, "learning_rate": 2.3265208689105566e-05, "loss": 1.4725, "step": 35170 }, { "epoch": 5.813674860565999, "grad_norm": 33.834625244140625, "learning_rate": 2.3256027470206945e-05, "loss": 1.5657, "step": 35180 }, { "epoch": 5.815327411691799, "grad_norm": 14.494001388549805, "learning_rate": 2.3246846251308324e-05, "loss": 1.486, "step": 35190 }, { "epoch": 5.8169799628176, "grad_norm": 10.042065620422363, "learning_rate": 2.3237665032409704e-05, "loss": 1.4782, "step": 35200 }, { "epoch": 5.8186325139434, "grad_norm": 8.86281681060791, "learning_rate": 2.3228483813511083e-05, "loss": 1.5148, "step": 35210 }, { "epoch": 5.820285065069201, "grad_norm": 17.04343605041504, "learning_rate": 2.3219302594612462e-05, "loss": 1.3849, "step": 35220 }, { "epoch": 5.821937616195001, "grad_norm": 11.681707382202148, "learning_rate": 2.321012137571384e-05, "loss": 1.4524, "step": 35230 }, { "epoch": 5.823590167320802, "grad_norm": 15.696507453918457, "learning_rate": 2.320094015681522e-05, "loss": 1.4256, "step": 35240 }, { "epoch": 5.825242718446602, "grad_norm": 21.752723693847656, "learning_rate": 2.31917589379166e-05, "loss": 1.376, "step": 35250 }, { "epoch": 5.826895269572402, "grad_norm": 16.060373306274414, "learning_rate": 2.318257771901798e-05, "loss": 1.514, "step": 35260 }, { "epoch": 5.828547820698203, "grad_norm": 51.685455322265625, "learning_rate": 2.3173396500119358e-05, "loss": 1.433, "step": 35270 }, { "epoch": 5.830200371824003, "grad_norm": 8.148418426513672, "learning_rate": 2.3164215281220734e-05, "loss": 1.4283, "step": 35280 }, { "epoch": 5.831852922949803, "grad_norm": 7.561429977416992, "learning_rate": 2.3155034062322113e-05, "loss": 1.4483, "step": 35290 }, { "epoch": 5.833505474075604, "grad_norm": 13.371988296508789, "learning_rate": 2.3145852843423493e-05, "loss": 1.4895, "step": 35300 }, { "epoch": 5.835158025201404, "grad_norm": 15.92422103881836, "learning_rate": 2.3136671624524872e-05, "loss": 1.3263, "step": 35310 }, { "epoch": 5.836810576327205, "grad_norm": 49.8977165222168, "learning_rate": 2.312749040562625e-05, "loss": 1.5131, "step": 35320 }, { "epoch": 5.838463127453005, "grad_norm": 11.140594482421875, "learning_rate": 2.311830918672763e-05, "loss": 1.5284, "step": 35330 }, { "epoch": 5.840115678578806, "grad_norm": 9.05160903930664, "learning_rate": 2.310912796782901e-05, "loss": 1.515, "step": 35340 }, { "epoch": 5.8417682297046065, "grad_norm": 11.951143264770508, "learning_rate": 2.309994674893039e-05, "loss": 1.5105, "step": 35350 }, { "epoch": 5.843420780830407, "grad_norm": 10.511120796203613, "learning_rate": 2.3090765530031768e-05, "loss": 1.4417, "step": 35360 }, { "epoch": 5.845073331956208, "grad_norm": 18.432544708251953, "learning_rate": 2.3081584311133147e-05, "loss": 1.5113, "step": 35370 }, { "epoch": 5.846725883082008, "grad_norm": 12.873449325561523, "learning_rate": 2.3072403092234527e-05, "loss": 1.4663, "step": 35380 }, { "epoch": 5.848378434207809, "grad_norm": 7.534992218017578, "learning_rate": 2.3063221873335906e-05, "loss": 1.4698, "step": 35390 }, { "epoch": 5.850030985333609, "grad_norm": 6.996005535125732, "learning_rate": 2.3054040654437285e-05, "loss": 1.3313, "step": 35400 }, { "epoch": 5.851683536459409, "grad_norm": 122.31879425048828, "learning_rate": 2.304485943553866e-05, "loss": 1.3871, "step": 35410 }, { "epoch": 5.85333608758521, "grad_norm": 10.604686737060547, "learning_rate": 2.303567821664004e-05, "loss": 1.5325, "step": 35420 }, { "epoch": 5.85498863871101, "grad_norm": 16.7020263671875, "learning_rate": 2.302649699774142e-05, "loss": 1.5011, "step": 35430 }, { "epoch": 5.856641189836811, "grad_norm": 17.591352462768555, "learning_rate": 2.30173157788428e-05, "loss": 1.4792, "step": 35440 }, { "epoch": 5.858293740962611, "grad_norm": 10.369803428649902, "learning_rate": 2.300813455994418e-05, "loss": 1.4176, "step": 35450 }, { "epoch": 5.859946292088411, "grad_norm": 41.722450256347656, "learning_rate": 2.299895334104556e-05, "loss": 1.3517, "step": 35460 }, { "epoch": 5.861598843214212, "grad_norm": 13.766722679138184, "learning_rate": 2.2989772122146936e-05, "loss": 1.5139, "step": 35470 }, { "epoch": 5.863251394340012, "grad_norm": 11.477117538452148, "learning_rate": 2.2980590903248316e-05, "loss": 1.3467, "step": 35480 }, { "epoch": 5.864903945465813, "grad_norm": 7.684751033782959, "learning_rate": 2.2971409684349695e-05, "loss": 1.3977, "step": 35490 }, { "epoch": 5.866556496591613, "grad_norm": 19.736038208007812, "learning_rate": 2.2962228465451074e-05, "loss": 1.4578, "step": 35500 }, { "epoch": 5.868209047717414, "grad_norm": 9.840457916259766, "learning_rate": 2.2953047246552453e-05, "loss": 1.5332, "step": 35510 }, { "epoch": 5.869861598843214, "grad_norm": 10.037358283996582, "learning_rate": 2.2943866027653833e-05, "loss": 1.4446, "step": 35520 }, { "epoch": 5.871514149969014, "grad_norm": 12.01092529296875, "learning_rate": 2.2934684808755212e-05, "loss": 1.4489, "step": 35530 }, { "epoch": 5.873166701094815, "grad_norm": 11.730144500732422, "learning_rate": 2.292550358985659e-05, "loss": 1.4665, "step": 35540 }, { "epoch": 5.8748192522206155, "grad_norm": 9.427178382873535, "learning_rate": 2.2916322370957967e-05, "loss": 1.4231, "step": 35550 }, { "epoch": 5.876471803346416, "grad_norm": 9.14688491821289, "learning_rate": 2.290714115205935e-05, "loss": 1.4917, "step": 35560 }, { "epoch": 5.878124354472217, "grad_norm": 8.813817977905273, "learning_rate": 2.289795993316073e-05, "loss": 1.386, "step": 35570 }, { "epoch": 5.879776905598017, "grad_norm": 13.4219970703125, "learning_rate": 2.2888778714262108e-05, "loss": 1.6158, "step": 35580 }, { "epoch": 5.881429456723818, "grad_norm": 18.55006217956543, "learning_rate": 2.2879597495363487e-05, "loss": 1.5049, "step": 35590 }, { "epoch": 5.883082007849618, "grad_norm": 10.324186325073242, "learning_rate": 2.2870416276464863e-05, "loss": 1.4228, "step": 35600 }, { "epoch": 5.884734558975419, "grad_norm": 14.822036743164062, "learning_rate": 2.2861235057566243e-05, "loss": 1.4369, "step": 35610 }, { "epoch": 5.886387110101219, "grad_norm": 9.402215003967285, "learning_rate": 2.2852053838667622e-05, "loss": 1.5191, "step": 35620 }, { "epoch": 5.888039661227019, "grad_norm": 15.71996021270752, "learning_rate": 2.2842872619769e-05, "loss": 1.4482, "step": 35630 }, { "epoch": 5.88969221235282, "grad_norm": 6.093387126922607, "learning_rate": 2.283369140087038e-05, "loss": 1.4423, "step": 35640 }, { "epoch": 5.89134476347862, "grad_norm": 9.502908706665039, "learning_rate": 2.282451018197176e-05, "loss": 1.4896, "step": 35650 }, { "epoch": 5.89299731460442, "grad_norm": 10.822142601013184, "learning_rate": 2.281532896307314e-05, "loss": 1.5373, "step": 35660 }, { "epoch": 5.894649865730221, "grad_norm": 12.028356552124023, "learning_rate": 2.2806147744174518e-05, "loss": 1.6253, "step": 35670 }, { "epoch": 5.896302416856021, "grad_norm": 9.975237846374512, "learning_rate": 2.2796966525275897e-05, "loss": 1.57, "step": 35680 }, { "epoch": 5.897954967981822, "grad_norm": 8.391766548156738, "learning_rate": 2.2787785306377277e-05, "loss": 1.5282, "step": 35690 }, { "epoch": 5.899607519107622, "grad_norm": 13.360061645507812, "learning_rate": 2.2778604087478656e-05, "loss": 1.5431, "step": 35700 }, { "epoch": 5.901260070233423, "grad_norm": 6.314427375793457, "learning_rate": 2.2769422868580035e-05, "loss": 1.5607, "step": 35710 }, { "epoch": 5.902912621359223, "grad_norm": 7.808109283447266, "learning_rate": 2.2760241649681414e-05, "loss": 1.4794, "step": 35720 }, { "epoch": 5.904565172485023, "grad_norm": 9.234506607055664, "learning_rate": 2.275106043078279e-05, "loss": 1.3842, "step": 35730 }, { "epoch": 5.906217723610824, "grad_norm": 35.93871307373047, "learning_rate": 2.274187921188417e-05, "loss": 1.3852, "step": 35740 }, { "epoch": 5.9078702747366245, "grad_norm": 10.612404823303223, "learning_rate": 2.273269799298555e-05, "loss": 1.5685, "step": 35750 }, { "epoch": 5.9095228258624255, "grad_norm": 43.04104232788086, "learning_rate": 2.2723516774086928e-05, "loss": 1.3909, "step": 35760 }, { "epoch": 5.911175376988226, "grad_norm": 16.839719772338867, "learning_rate": 2.2714335555188307e-05, "loss": 1.4255, "step": 35770 }, { "epoch": 5.912827928114026, "grad_norm": 16.71356773376465, "learning_rate": 2.2705154336289686e-05, "loss": 1.6229, "step": 35780 }, { "epoch": 5.914480479239827, "grad_norm": 28.990924835205078, "learning_rate": 2.2695973117391066e-05, "loss": 1.5036, "step": 35790 }, { "epoch": 5.916133030365627, "grad_norm": 8.961358070373535, "learning_rate": 2.2686791898492445e-05, "loss": 1.4317, "step": 35800 }, { "epoch": 5.917785581491428, "grad_norm": 9.525544166564941, "learning_rate": 2.2677610679593824e-05, "loss": 1.5382, "step": 35810 }, { "epoch": 5.919438132617228, "grad_norm": 16.726709365844727, "learning_rate": 2.2668429460695203e-05, "loss": 1.4789, "step": 35820 }, { "epoch": 5.921090683743028, "grad_norm": 26.31761360168457, "learning_rate": 2.2659248241796583e-05, "loss": 1.53, "step": 35830 }, { "epoch": 5.922743234868829, "grad_norm": 18.97273826599121, "learning_rate": 2.2650067022897962e-05, "loss": 1.4395, "step": 35840 }, { "epoch": 5.924395785994629, "grad_norm": 6.1228928565979, "learning_rate": 2.264088580399934e-05, "loss": 1.3988, "step": 35850 }, { "epoch": 5.92604833712043, "grad_norm": 16.22532081604004, "learning_rate": 2.263170458510072e-05, "loss": 1.5295, "step": 35860 }, { "epoch": 5.92770088824623, "grad_norm": 7.841466426849365, "learning_rate": 2.2622523366202096e-05, "loss": 1.2468, "step": 35870 }, { "epoch": 5.92935343937203, "grad_norm": 9.497376441955566, "learning_rate": 2.2613342147303475e-05, "loss": 1.5498, "step": 35880 }, { "epoch": 5.931005990497831, "grad_norm": 13.135749816894531, "learning_rate": 2.2604160928404855e-05, "loss": 1.3572, "step": 35890 }, { "epoch": 5.932658541623631, "grad_norm": 12.444751739501953, "learning_rate": 2.2594979709506234e-05, "loss": 1.4146, "step": 35900 }, { "epoch": 5.934311092749432, "grad_norm": 10.811918258666992, "learning_rate": 2.2585798490607617e-05, "loss": 1.5179, "step": 35910 }, { "epoch": 5.935963643875232, "grad_norm": 14.414387702941895, "learning_rate": 2.2576617271708992e-05, "loss": 1.4109, "step": 35920 }, { "epoch": 5.937616195001032, "grad_norm": 10.46023178100586, "learning_rate": 2.2567436052810372e-05, "loss": 1.4051, "step": 35930 }, { "epoch": 5.939268746126833, "grad_norm": 46.42742156982422, "learning_rate": 2.255825483391175e-05, "loss": 1.4066, "step": 35940 }, { "epoch": 5.9409212972526335, "grad_norm": 18.5954532623291, "learning_rate": 2.254907361501313e-05, "loss": 1.4041, "step": 35950 }, { "epoch": 5.9425738483784345, "grad_norm": 8.255085945129395, "learning_rate": 2.253989239611451e-05, "loss": 1.4775, "step": 35960 }, { "epoch": 5.944226399504235, "grad_norm": 16.39137840270996, "learning_rate": 2.253071117721589e-05, "loss": 1.495, "step": 35970 }, { "epoch": 5.945878950630036, "grad_norm": 22.78189468383789, "learning_rate": 2.2521529958317268e-05, "loss": 1.5671, "step": 35980 }, { "epoch": 5.947531501755836, "grad_norm": 10.982939720153809, "learning_rate": 2.2512348739418647e-05, "loss": 1.5311, "step": 35990 }, { "epoch": 5.949184052881636, "grad_norm": 8.388236045837402, "learning_rate": 2.2503167520520023e-05, "loss": 1.5759, "step": 36000 }, { "epoch": 5.950836604007437, "grad_norm": 19.174211502075195, "learning_rate": 2.2493986301621402e-05, "loss": 1.4469, "step": 36010 }, { "epoch": 5.952489155133237, "grad_norm": 9.777421951293945, "learning_rate": 2.2484805082722785e-05, "loss": 1.4646, "step": 36020 }, { "epoch": 5.954141706259037, "grad_norm": 33.97954559326172, "learning_rate": 2.2475623863824164e-05, "loss": 1.2839, "step": 36030 }, { "epoch": 5.955794257384838, "grad_norm": 10.13986587524414, "learning_rate": 2.2466442644925543e-05, "loss": 1.4924, "step": 36040 }, { "epoch": 5.957446808510638, "grad_norm": 11.5507173538208, "learning_rate": 2.245726142602692e-05, "loss": 1.53, "step": 36050 }, { "epoch": 5.959099359636439, "grad_norm": 7.484362602233887, "learning_rate": 2.24480802071283e-05, "loss": 1.3517, "step": 36060 }, { "epoch": 5.960751910762239, "grad_norm": 49.174068450927734, "learning_rate": 2.2438898988229678e-05, "loss": 1.4585, "step": 36070 }, { "epoch": 5.96240446188804, "grad_norm": 9.194089889526367, "learning_rate": 2.2429717769331057e-05, "loss": 1.5553, "step": 36080 }, { "epoch": 5.96405701301384, "grad_norm": 20.3260440826416, "learning_rate": 2.2420536550432436e-05, "loss": 1.5759, "step": 36090 }, { "epoch": 5.96570956413964, "grad_norm": 10.215295791625977, "learning_rate": 2.2411355331533816e-05, "loss": 1.4843, "step": 36100 }, { "epoch": 5.967362115265441, "grad_norm": 12.862039566040039, "learning_rate": 2.2402174112635195e-05, "loss": 1.5045, "step": 36110 }, { "epoch": 5.969014666391241, "grad_norm": 22.09560775756836, "learning_rate": 2.2392992893736574e-05, "loss": 1.4349, "step": 36120 }, { "epoch": 5.970667217517041, "grad_norm": 11.517844200134277, "learning_rate": 2.2383811674837953e-05, "loss": 1.3689, "step": 36130 }, { "epoch": 5.972319768642842, "grad_norm": 14.348694801330566, "learning_rate": 2.2374630455939333e-05, "loss": 1.4601, "step": 36140 }, { "epoch": 5.9739723197686425, "grad_norm": 9.823168754577637, "learning_rate": 2.2365449237040712e-05, "loss": 1.4595, "step": 36150 }, { "epoch": 5.9756248708944435, "grad_norm": 19.39566421508789, "learning_rate": 2.235626801814209e-05, "loss": 1.4817, "step": 36160 }, { "epoch": 5.977277422020244, "grad_norm": 12.078182220458984, "learning_rate": 2.234708679924347e-05, "loss": 1.5023, "step": 36170 }, { "epoch": 5.978929973146045, "grad_norm": 27.949277877807617, "learning_rate": 2.2337905580344846e-05, "loss": 1.4331, "step": 36180 }, { "epoch": 5.980582524271845, "grad_norm": 12.060958862304688, "learning_rate": 2.2328724361446225e-05, "loss": 1.4613, "step": 36190 }, { "epoch": 5.982235075397645, "grad_norm": 12.409534454345703, "learning_rate": 2.2319543142547605e-05, "loss": 1.3824, "step": 36200 }, { "epoch": 5.983887626523446, "grad_norm": 11.61839485168457, "learning_rate": 2.2310361923648984e-05, "loss": 1.4633, "step": 36210 }, { "epoch": 5.985540177649246, "grad_norm": 17.775489807128906, "learning_rate": 2.2301180704750363e-05, "loss": 1.5373, "step": 36220 }, { "epoch": 5.987192728775047, "grad_norm": 13.473231315612793, "learning_rate": 2.2291999485851742e-05, "loss": 1.282, "step": 36230 }, { "epoch": 5.988845279900847, "grad_norm": 11.443365097045898, "learning_rate": 2.228281826695312e-05, "loss": 1.4227, "step": 36240 }, { "epoch": 5.990497831026647, "grad_norm": 9.717024803161621, "learning_rate": 2.22736370480545e-05, "loss": 1.5218, "step": 36250 }, { "epoch": 5.992150382152448, "grad_norm": 17.857152938842773, "learning_rate": 2.226445582915588e-05, "loss": 1.5035, "step": 36260 }, { "epoch": 5.993802933278248, "grad_norm": 10.381898880004883, "learning_rate": 2.225527461025726e-05, "loss": 1.4807, "step": 36270 }, { "epoch": 5.995455484404049, "grad_norm": 16.007429122924805, "learning_rate": 2.224609339135864e-05, "loss": 1.6168, "step": 36280 }, { "epoch": 5.997108035529849, "grad_norm": 8.93297290802002, "learning_rate": 2.2236912172460018e-05, "loss": 1.4956, "step": 36290 }, { "epoch": 5.998760586655649, "grad_norm": 10.40009880065918, "learning_rate": 2.2227730953561397e-05, "loss": 1.4969, "step": 36300 }, { "epoch": 5.99991737244371, "eval_accuracy": 0.3295502908213931, "eval_loss": 2.154688835144043, "eval_runtime": 818.3335, "eval_samples_per_second": 34.455, "eval_steps_per_second": 8.614, "step": 36307 }, { "epoch": 6.00041313778145, "grad_norm": 9.945453643798828, "learning_rate": 2.2218549734662776e-05, "loss": 1.5087, "step": 36310 }, { "epoch": 6.00206568890725, "grad_norm": 9.44550609588623, "learning_rate": 2.2209368515764152e-05, "loss": 1.2805, "step": 36320 }, { "epoch": 6.003718240033051, "grad_norm": 52.20871353149414, "learning_rate": 2.220018729686553e-05, "loss": 1.32, "step": 36330 }, { "epoch": 6.005370791158851, "grad_norm": 9.028908729553223, "learning_rate": 2.219100607796691e-05, "loss": 1.3254, "step": 36340 }, { "epoch": 6.0070233422846515, "grad_norm": 10.057682991027832, "learning_rate": 2.218182485906829e-05, "loss": 1.3984, "step": 36350 }, { "epoch": 6.0086758934104525, "grad_norm": 13.148024559020996, "learning_rate": 2.2172643640169673e-05, "loss": 1.4118, "step": 36360 }, { "epoch": 6.010328444536253, "grad_norm": 7.585305690765381, "learning_rate": 2.216346242127105e-05, "loss": 1.4126, "step": 36370 }, { "epoch": 6.011980995662054, "grad_norm": 16.880674362182617, "learning_rate": 2.2154281202372428e-05, "loss": 1.4296, "step": 36380 }, { "epoch": 6.013633546787854, "grad_norm": 9.740089416503906, "learning_rate": 2.2145099983473807e-05, "loss": 1.4044, "step": 36390 }, { "epoch": 6.015286097913654, "grad_norm": 16.309249877929688, "learning_rate": 2.2135918764575186e-05, "loss": 1.3367, "step": 36400 }, { "epoch": 6.016938649039455, "grad_norm": 88.99636840820312, "learning_rate": 2.2126737545676565e-05, "loss": 1.4252, "step": 36410 }, { "epoch": 6.018591200165255, "grad_norm": 20.119401931762695, "learning_rate": 2.2117556326777945e-05, "loss": 1.5106, "step": 36420 }, { "epoch": 6.020243751291056, "grad_norm": 26.70507049560547, "learning_rate": 2.2108375107879324e-05, "loss": 1.4082, "step": 36430 }, { "epoch": 6.021896302416856, "grad_norm": 10.413867950439453, "learning_rate": 2.2099193888980703e-05, "loss": 1.4838, "step": 36440 }, { "epoch": 6.023548853542657, "grad_norm": 20.84398078918457, "learning_rate": 2.209001267008208e-05, "loss": 1.4627, "step": 36450 }, { "epoch": 6.025201404668457, "grad_norm": 7.607133388519287, "learning_rate": 2.2080831451183458e-05, "loss": 1.5133, "step": 36460 }, { "epoch": 6.026853955794257, "grad_norm": 9.936842918395996, "learning_rate": 2.2071650232284838e-05, "loss": 1.4638, "step": 36470 }, { "epoch": 6.028506506920058, "grad_norm": 9.79660415649414, "learning_rate": 2.206246901338622e-05, "loss": 1.3432, "step": 36480 }, { "epoch": 6.030159058045858, "grad_norm": 14.434847831726074, "learning_rate": 2.20532877944876e-05, "loss": 1.4072, "step": 36490 }, { "epoch": 6.031811609171659, "grad_norm": 11.613893508911133, "learning_rate": 2.2044106575588975e-05, "loss": 1.3748, "step": 36500 }, { "epoch": 6.033464160297459, "grad_norm": 14.374581336975098, "learning_rate": 2.2034925356690355e-05, "loss": 1.5375, "step": 36510 }, { "epoch": 6.035116711423259, "grad_norm": 10.08636474609375, "learning_rate": 2.2025744137791734e-05, "loss": 1.4055, "step": 36520 }, { "epoch": 6.03676926254906, "grad_norm": 11.865565299987793, "learning_rate": 2.2016562918893113e-05, "loss": 1.3775, "step": 36530 }, { "epoch": 6.03842181367486, "grad_norm": 88.45521545410156, "learning_rate": 2.2007381699994492e-05, "loss": 1.5659, "step": 36540 }, { "epoch": 6.040074364800661, "grad_norm": 9.256975173950195, "learning_rate": 2.199820048109587e-05, "loss": 1.3508, "step": 36550 }, { "epoch": 6.0417269159264615, "grad_norm": 14.002177238464355, "learning_rate": 2.198901926219725e-05, "loss": 1.4102, "step": 36560 }, { "epoch": 6.043379467052262, "grad_norm": 21.786453247070312, "learning_rate": 2.197983804329863e-05, "loss": 1.5863, "step": 36570 }, { "epoch": 6.045032018178063, "grad_norm": 9.75375747680664, "learning_rate": 2.1970656824400006e-05, "loss": 1.3894, "step": 36580 }, { "epoch": 6.046684569303863, "grad_norm": 7.86843729019165, "learning_rate": 2.196147560550139e-05, "loss": 1.3645, "step": 36590 }, { "epoch": 6.048337120429664, "grad_norm": 7.051876068115234, "learning_rate": 2.1952294386602768e-05, "loss": 1.3968, "step": 36600 }, { "epoch": 6.049989671555464, "grad_norm": 53.84910202026367, "learning_rate": 2.1943113167704147e-05, "loss": 1.4543, "step": 36610 }, { "epoch": 6.051642222681264, "grad_norm": 8.938406944274902, "learning_rate": 2.1933931948805526e-05, "loss": 1.3795, "step": 36620 }, { "epoch": 6.053294773807065, "grad_norm": 10.21081829071045, "learning_rate": 2.1924750729906905e-05, "loss": 1.5895, "step": 36630 }, { "epoch": 6.054947324932865, "grad_norm": 11.11033821105957, "learning_rate": 2.191556951100828e-05, "loss": 1.4376, "step": 36640 }, { "epoch": 6.056599876058666, "grad_norm": 12.69739818572998, "learning_rate": 2.190638829210966e-05, "loss": 1.4503, "step": 36650 }, { "epoch": 6.058252427184466, "grad_norm": 14.584735870361328, "learning_rate": 2.189720707321104e-05, "loss": 1.3427, "step": 36660 }, { "epoch": 6.059904978310266, "grad_norm": 10.828611373901367, "learning_rate": 2.188802585431242e-05, "loss": 1.3451, "step": 36670 }, { "epoch": 6.061557529436067, "grad_norm": 7.586915493011475, "learning_rate": 2.18788446354138e-05, "loss": 1.3483, "step": 36680 }, { "epoch": 6.063210080561867, "grad_norm": 11.18470573425293, "learning_rate": 2.1869663416515178e-05, "loss": 1.4135, "step": 36690 }, { "epoch": 6.064862631687668, "grad_norm": 12.794292449951172, "learning_rate": 2.1860482197616557e-05, "loss": 1.3996, "step": 36700 }, { "epoch": 6.066515182813468, "grad_norm": 10.54262924194336, "learning_rate": 2.1851300978717936e-05, "loss": 1.5032, "step": 36710 }, { "epoch": 6.068167733939268, "grad_norm": 16.40874671936035, "learning_rate": 2.1842119759819315e-05, "loss": 1.4283, "step": 36720 }, { "epoch": 6.069820285065069, "grad_norm": 10.831640243530273, "learning_rate": 2.1832938540920695e-05, "loss": 1.472, "step": 36730 }, { "epoch": 6.071472836190869, "grad_norm": 10.649149894714355, "learning_rate": 2.1823757322022074e-05, "loss": 1.294, "step": 36740 }, { "epoch": 6.07312538731667, "grad_norm": 30.559856414794922, "learning_rate": 2.1814576103123453e-05, "loss": 1.4514, "step": 36750 }, { "epoch": 6.0747779384424705, "grad_norm": 10.63766860961914, "learning_rate": 2.1805394884224832e-05, "loss": 1.4639, "step": 36760 }, { "epoch": 6.076430489568271, "grad_norm": 14.730107307434082, "learning_rate": 2.1796213665326208e-05, "loss": 1.4646, "step": 36770 }, { "epoch": 6.078083040694072, "grad_norm": 15.621095657348633, "learning_rate": 2.1787032446427587e-05, "loss": 1.3344, "step": 36780 }, { "epoch": 6.079735591819872, "grad_norm": 15.240421295166016, "learning_rate": 2.1777851227528967e-05, "loss": 1.4834, "step": 36790 }, { "epoch": 6.081388142945673, "grad_norm": 8.048723220825195, "learning_rate": 2.1768670008630346e-05, "loss": 1.3243, "step": 36800 }, { "epoch": 6.083040694071473, "grad_norm": 14.045221328735352, "learning_rate": 2.1759488789731725e-05, "loss": 1.4613, "step": 36810 }, { "epoch": 6.084693245197274, "grad_norm": 9.798041343688965, "learning_rate": 2.1750307570833104e-05, "loss": 1.4506, "step": 36820 }, { "epoch": 6.086345796323074, "grad_norm": 9.147562980651855, "learning_rate": 2.1741126351934484e-05, "loss": 1.4779, "step": 36830 }, { "epoch": 6.087998347448874, "grad_norm": 7.114458084106445, "learning_rate": 2.1731945133035863e-05, "loss": 1.3534, "step": 36840 }, { "epoch": 6.089650898574675, "grad_norm": 15.428686141967773, "learning_rate": 2.1722763914137242e-05, "loss": 1.4559, "step": 36850 }, { "epoch": 6.091303449700475, "grad_norm": 8.084449768066406, "learning_rate": 2.171358269523862e-05, "loss": 1.4538, "step": 36860 }, { "epoch": 6.092956000826276, "grad_norm": 13.73076343536377, "learning_rate": 2.170440147634e-05, "loss": 1.3203, "step": 36870 }, { "epoch": 6.094608551952076, "grad_norm": 12.01843547821045, "learning_rate": 2.169522025744138e-05, "loss": 1.3937, "step": 36880 }, { "epoch": 6.096261103077876, "grad_norm": 11.336709022521973, "learning_rate": 2.168603903854276e-05, "loss": 1.4382, "step": 36890 }, { "epoch": 6.097913654203677, "grad_norm": 20.88669776916504, "learning_rate": 2.1676857819644135e-05, "loss": 1.3326, "step": 36900 }, { "epoch": 6.099566205329477, "grad_norm": 12.064225196838379, "learning_rate": 2.1667676600745514e-05, "loss": 1.4931, "step": 36910 }, { "epoch": 6.101218756455278, "grad_norm": 10.633503913879395, "learning_rate": 2.1658495381846894e-05, "loss": 1.5304, "step": 36920 }, { "epoch": 6.102871307581078, "grad_norm": 11.907389640808105, "learning_rate": 2.1649314162948276e-05, "loss": 1.4893, "step": 36930 }, { "epoch": 6.104523858706878, "grad_norm": 11.007550239562988, "learning_rate": 2.1640132944049655e-05, "loss": 1.4218, "step": 36940 }, { "epoch": 6.106176409832679, "grad_norm": 7.1882405281066895, "learning_rate": 2.1630951725151035e-05, "loss": 1.347, "step": 36950 }, { "epoch": 6.1078289609584795, "grad_norm": 21.802770614624023, "learning_rate": 2.162177050625241e-05, "loss": 1.3269, "step": 36960 }, { "epoch": 6.1094815120842805, "grad_norm": 28.251907348632812, "learning_rate": 2.161258928735379e-05, "loss": 1.5196, "step": 36970 }, { "epoch": 6.111134063210081, "grad_norm": 13.803258895874023, "learning_rate": 2.160340806845517e-05, "loss": 1.45, "step": 36980 }, { "epoch": 6.112786614335881, "grad_norm": 26.397541046142578, "learning_rate": 2.1594226849556548e-05, "loss": 1.4174, "step": 36990 }, { "epoch": 6.114439165461682, "grad_norm": 16.90498924255371, "learning_rate": 2.1585045630657927e-05, "loss": 1.4125, "step": 37000 }, { "epoch": 6.116091716587482, "grad_norm": 17.150217056274414, "learning_rate": 2.1575864411759307e-05, "loss": 1.411, "step": 37010 }, { "epoch": 6.117744267713283, "grad_norm": 11.79568862915039, "learning_rate": 2.1566683192860686e-05, "loss": 1.4386, "step": 37020 }, { "epoch": 6.119396818839083, "grad_norm": 10.997562408447266, "learning_rate": 2.1557501973962062e-05, "loss": 1.4482, "step": 37030 }, { "epoch": 6.121049369964883, "grad_norm": 14.64095687866211, "learning_rate": 2.154832075506344e-05, "loss": 1.4078, "step": 37040 }, { "epoch": 6.122701921090684, "grad_norm": 7.124085426330566, "learning_rate": 2.1539139536164824e-05, "loss": 1.489, "step": 37050 }, { "epoch": 6.124354472216484, "grad_norm": 16.14177894592285, "learning_rate": 2.1529958317266203e-05, "loss": 1.449, "step": 37060 }, { "epoch": 6.126007023342285, "grad_norm": 14.747572898864746, "learning_rate": 2.1520777098367582e-05, "loss": 1.4958, "step": 37070 }, { "epoch": 6.127659574468085, "grad_norm": 15.584579467773438, "learning_rate": 2.151159587946896e-05, "loss": 1.4513, "step": 37080 }, { "epoch": 6.129312125593885, "grad_norm": 28.18446922302246, "learning_rate": 2.1502414660570337e-05, "loss": 1.3798, "step": 37090 }, { "epoch": 6.130964676719686, "grad_norm": 37.316654205322266, "learning_rate": 2.1493233441671717e-05, "loss": 1.5111, "step": 37100 }, { "epoch": 6.132617227845486, "grad_norm": 8.39609432220459, "learning_rate": 2.1484052222773096e-05, "loss": 1.5339, "step": 37110 }, { "epoch": 6.134269778971287, "grad_norm": 14.053455352783203, "learning_rate": 2.1474871003874475e-05, "loss": 1.4811, "step": 37120 }, { "epoch": 6.135922330097087, "grad_norm": 10.345773696899414, "learning_rate": 2.1465689784975854e-05, "loss": 1.4089, "step": 37130 }, { "epoch": 6.137574881222887, "grad_norm": 7.838363170623779, "learning_rate": 2.1456508566077234e-05, "loss": 1.4742, "step": 37140 }, { "epoch": 6.139227432348688, "grad_norm": 22.944093704223633, "learning_rate": 2.1447327347178613e-05, "loss": 1.4821, "step": 37150 }, { "epoch": 6.1408799834744885, "grad_norm": 8.07544994354248, "learning_rate": 2.1438146128279992e-05, "loss": 1.3206, "step": 37160 }, { "epoch": 6.1425325346002895, "grad_norm": 99.47127532958984, "learning_rate": 2.142896490938137e-05, "loss": 1.3522, "step": 37170 }, { "epoch": 6.14418508572609, "grad_norm": 10.434372901916504, "learning_rate": 2.141978369048275e-05, "loss": 1.4843, "step": 37180 }, { "epoch": 6.1458376368518906, "grad_norm": 9.524206161499023, "learning_rate": 2.141060247158413e-05, "loss": 1.4098, "step": 37190 }, { "epoch": 6.147490187977691, "grad_norm": 7.026861667633057, "learning_rate": 2.140142125268551e-05, "loss": 1.3957, "step": 37200 }, { "epoch": 6.149142739103491, "grad_norm": 12.716541290283203, "learning_rate": 2.1392240033786888e-05, "loss": 1.5463, "step": 37210 }, { "epoch": 6.150795290229292, "grad_norm": 38.672367095947266, "learning_rate": 2.1383058814888264e-05, "loss": 1.4292, "step": 37220 }, { "epoch": 6.152447841355092, "grad_norm": 14.989725112915039, "learning_rate": 2.1373877595989643e-05, "loss": 1.4267, "step": 37230 }, { "epoch": 6.154100392480893, "grad_norm": 9.272750854492188, "learning_rate": 2.1364696377091023e-05, "loss": 1.4773, "step": 37240 }, { "epoch": 6.155752943606693, "grad_norm": 13.461223602294922, "learning_rate": 2.1355515158192402e-05, "loss": 1.3765, "step": 37250 }, { "epoch": 6.157405494732493, "grad_norm": 11.42111587524414, "learning_rate": 2.134633393929378e-05, "loss": 1.399, "step": 37260 }, { "epoch": 6.159058045858294, "grad_norm": 9.016616821289062, "learning_rate": 2.133715272039516e-05, "loss": 1.596, "step": 37270 }, { "epoch": 6.160710596984094, "grad_norm": 8.332453727722168, "learning_rate": 2.132797150149654e-05, "loss": 1.388, "step": 37280 }, { "epoch": 6.162363148109895, "grad_norm": 17.8102970123291, "learning_rate": 2.131879028259792e-05, "loss": 1.3235, "step": 37290 }, { "epoch": 6.164015699235695, "grad_norm": 11.08936882019043, "learning_rate": 2.1309609063699298e-05, "loss": 1.3742, "step": 37300 }, { "epoch": 6.165668250361495, "grad_norm": 8.341262817382812, "learning_rate": 2.1300427844800677e-05, "loss": 1.4266, "step": 37310 }, { "epoch": 6.167320801487296, "grad_norm": 18.066402435302734, "learning_rate": 2.1291246625902057e-05, "loss": 1.6203, "step": 37320 }, { "epoch": 6.168973352613096, "grad_norm": 19.090791702270508, "learning_rate": 2.1282065407003436e-05, "loss": 1.4881, "step": 37330 }, { "epoch": 6.170625903738897, "grad_norm": 7.528576850891113, "learning_rate": 2.1272884188104815e-05, "loss": 1.3519, "step": 37340 }, { "epoch": 6.172278454864697, "grad_norm": 19.304250717163086, "learning_rate": 2.126370296920619e-05, "loss": 1.4546, "step": 37350 }, { "epoch": 6.1739310059904975, "grad_norm": 7.075794219970703, "learning_rate": 2.125452175030757e-05, "loss": 1.4554, "step": 37360 }, { "epoch": 6.1755835571162985, "grad_norm": 12.549452781677246, "learning_rate": 2.124534053140895e-05, "loss": 1.5573, "step": 37370 }, { "epoch": 6.177236108242099, "grad_norm": 9.695381164550781, "learning_rate": 2.123615931251033e-05, "loss": 1.3744, "step": 37380 }, { "epoch": 6.1788886593678995, "grad_norm": 21.600099563598633, "learning_rate": 2.122697809361171e-05, "loss": 1.5132, "step": 37390 }, { "epoch": 6.1805412104937, "grad_norm": 9.879249572753906, "learning_rate": 2.121779687471309e-05, "loss": 1.4469, "step": 37400 }, { "epoch": 6.1821937616195, "grad_norm": 12.330972671508789, "learning_rate": 2.1208615655814466e-05, "loss": 1.5801, "step": 37410 }, { "epoch": 6.183846312745301, "grad_norm": 12.194676399230957, "learning_rate": 2.1199434436915846e-05, "loss": 1.3992, "step": 37420 }, { "epoch": 6.185498863871101, "grad_norm": 14.351995468139648, "learning_rate": 2.1190253218017225e-05, "loss": 1.5508, "step": 37430 }, { "epoch": 6.187151414996902, "grad_norm": 9.946985244750977, "learning_rate": 2.1181071999118604e-05, "loss": 1.5336, "step": 37440 }, { "epoch": 6.188803966122702, "grad_norm": 10.623150825500488, "learning_rate": 2.1171890780219983e-05, "loss": 1.4028, "step": 37450 }, { "epoch": 6.190456517248502, "grad_norm": 12.726574897766113, "learning_rate": 2.1162709561321363e-05, "loss": 1.4419, "step": 37460 }, { "epoch": 6.192109068374303, "grad_norm": 11.567181587219238, "learning_rate": 2.1153528342422742e-05, "loss": 1.3462, "step": 37470 }, { "epoch": 6.193761619500103, "grad_norm": 15.38206672668457, "learning_rate": 2.1144347123524118e-05, "loss": 1.4159, "step": 37480 }, { "epoch": 6.195414170625904, "grad_norm": 9.006597518920898, "learning_rate": 2.1135165904625497e-05, "loss": 1.3282, "step": 37490 }, { "epoch": 6.197066721751704, "grad_norm": 16.817569732666016, "learning_rate": 2.112598468572688e-05, "loss": 1.3807, "step": 37500 }, { "epoch": 6.198719272877504, "grad_norm": 14.372156143188477, "learning_rate": 2.111680346682826e-05, "loss": 1.5622, "step": 37510 }, { "epoch": 6.200371824003305, "grad_norm": 10.266152381896973, "learning_rate": 2.1107622247929638e-05, "loss": 1.4466, "step": 37520 }, { "epoch": 6.202024375129105, "grad_norm": 15.889741897583008, "learning_rate": 2.1098441029031017e-05, "loss": 1.4577, "step": 37530 }, { "epoch": 6.203676926254906, "grad_norm": 11.235896110534668, "learning_rate": 2.1089259810132393e-05, "loss": 1.4971, "step": 37540 }, { "epoch": 6.205329477380706, "grad_norm": 10.462620735168457, "learning_rate": 2.1080078591233773e-05, "loss": 1.3453, "step": 37550 }, { "epoch": 6.206982028506507, "grad_norm": 9.256996154785156, "learning_rate": 2.1070897372335152e-05, "loss": 1.4294, "step": 37560 }, { "epoch": 6.2086345796323075, "grad_norm": 9.670038223266602, "learning_rate": 2.106171615343653e-05, "loss": 1.498, "step": 37570 }, { "epoch": 6.210287130758108, "grad_norm": 10.330695152282715, "learning_rate": 2.105253493453791e-05, "loss": 1.4457, "step": 37580 }, { "epoch": 6.2119396818839085, "grad_norm": 13.623739242553711, "learning_rate": 2.104335371563929e-05, "loss": 1.52, "step": 37590 }, { "epoch": 6.213592233009709, "grad_norm": 8.723990440368652, "learning_rate": 2.103417249674067e-05, "loss": 1.4207, "step": 37600 }, { "epoch": 6.215244784135509, "grad_norm": 10.08510684967041, "learning_rate": 2.1024991277842045e-05, "loss": 1.4864, "step": 37610 }, { "epoch": 6.21689733526131, "grad_norm": 13.55770206451416, "learning_rate": 2.1015810058943427e-05, "loss": 1.3481, "step": 37620 }, { "epoch": 6.21854988638711, "grad_norm": 26.282560348510742, "learning_rate": 2.1006628840044807e-05, "loss": 1.4423, "step": 37630 }, { "epoch": 6.220202437512911, "grad_norm": 13.998091697692871, "learning_rate": 2.0997447621146186e-05, "loss": 1.5324, "step": 37640 }, { "epoch": 6.221854988638711, "grad_norm": 10.550564765930176, "learning_rate": 2.0988266402247565e-05, "loss": 1.4653, "step": 37650 }, { "epoch": 6.223507539764512, "grad_norm": 9.701068878173828, "learning_rate": 2.0979085183348944e-05, "loss": 1.4896, "step": 37660 }, { "epoch": 6.225160090890312, "grad_norm": 19.163440704345703, "learning_rate": 2.096990396445032e-05, "loss": 1.3897, "step": 37670 }, { "epoch": 6.226812642016112, "grad_norm": 21.417760848999023, "learning_rate": 2.09607227455517e-05, "loss": 1.3892, "step": 37680 }, { "epoch": 6.228465193141913, "grad_norm": 13.961776733398438, "learning_rate": 2.095154152665308e-05, "loss": 1.3522, "step": 37690 }, { "epoch": 6.230117744267713, "grad_norm": 7.012969493865967, "learning_rate": 2.0942360307754458e-05, "loss": 1.3648, "step": 37700 }, { "epoch": 6.231770295393514, "grad_norm": 15.90501594543457, "learning_rate": 2.0933179088855837e-05, "loss": 1.4158, "step": 37710 }, { "epoch": 6.233422846519314, "grad_norm": 16.21110725402832, "learning_rate": 2.0923997869957216e-05, "loss": 1.5681, "step": 37720 }, { "epoch": 6.235075397645114, "grad_norm": 11.805452346801758, "learning_rate": 2.0914816651058596e-05, "loss": 1.387, "step": 37730 }, { "epoch": 6.236727948770915, "grad_norm": 13.68657398223877, "learning_rate": 2.0905635432159975e-05, "loss": 1.3717, "step": 37740 }, { "epoch": 6.238380499896715, "grad_norm": 9.461950302124023, "learning_rate": 2.0896454213261354e-05, "loss": 1.3141, "step": 37750 }, { "epoch": 6.240033051022516, "grad_norm": 9.578817367553711, "learning_rate": 2.0887272994362733e-05, "loss": 1.3581, "step": 37760 }, { "epoch": 6.2416856021483165, "grad_norm": 11.16032600402832, "learning_rate": 2.0878091775464113e-05, "loss": 1.4463, "step": 37770 }, { "epoch": 6.243338153274117, "grad_norm": 36.12056350708008, "learning_rate": 2.0868910556565492e-05, "loss": 1.3249, "step": 37780 }, { "epoch": 6.2449907043999175, "grad_norm": 17.16036605834961, "learning_rate": 2.085972933766687e-05, "loss": 1.4926, "step": 37790 }, { "epoch": 6.246643255525718, "grad_norm": 8.647866249084473, "learning_rate": 2.0850548118768247e-05, "loss": 1.3782, "step": 37800 }, { "epoch": 6.248295806651519, "grad_norm": 8.385272026062012, "learning_rate": 2.0841366899869626e-05, "loss": 1.4198, "step": 37810 }, { "epoch": 6.249948357777319, "grad_norm": 14.405436515808105, "learning_rate": 2.0832185680971005e-05, "loss": 1.3887, "step": 37820 }, { "epoch": 6.251600908903119, "grad_norm": 7.519594669342041, "learning_rate": 2.0823004462072385e-05, "loss": 1.3571, "step": 37830 }, { "epoch": 6.25325346002892, "grad_norm": 27.60797882080078, "learning_rate": 2.0813823243173764e-05, "loss": 1.5992, "step": 37840 }, { "epoch": 6.25490601115472, "grad_norm": 11.991599082946777, "learning_rate": 2.0804642024275147e-05, "loss": 1.4627, "step": 37850 }, { "epoch": 6.256558562280521, "grad_norm": 12.27054500579834, "learning_rate": 2.0795460805376522e-05, "loss": 1.3893, "step": 37860 }, { "epoch": 6.258211113406321, "grad_norm": 33.28194808959961, "learning_rate": 2.0786279586477902e-05, "loss": 1.4389, "step": 37870 }, { "epoch": 6.259863664532121, "grad_norm": 22.726533889770508, "learning_rate": 2.077709836757928e-05, "loss": 1.5418, "step": 37880 }, { "epoch": 6.261516215657922, "grad_norm": 13.192795753479004, "learning_rate": 2.076791714868066e-05, "loss": 1.3905, "step": 37890 }, { "epoch": 6.263168766783722, "grad_norm": 14.053077697753906, "learning_rate": 2.075873592978204e-05, "loss": 1.4147, "step": 37900 }, { "epoch": 6.264821317909523, "grad_norm": 16.81157875061035, "learning_rate": 2.074955471088342e-05, "loss": 1.3977, "step": 37910 }, { "epoch": 6.266473869035323, "grad_norm": 7.91226863861084, "learning_rate": 2.0740373491984798e-05, "loss": 1.3966, "step": 37920 }, { "epoch": 6.268126420161124, "grad_norm": 13.343158721923828, "learning_rate": 2.0731192273086174e-05, "loss": 1.4248, "step": 37930 }, { "epoch": 6.269778971286924, "grad_norm": 10.185911178588867, "learning_rate": 2.0722011054187553e-05, "loss": 1.4735, "step": 37940 }, { "epoch": 6.271431522412724, "grad_norm": 8.653183937072754, "learning_rate": 2.0712829835288932e-05, "loss": 1.4104, "step": 37950 }, { "epoch": 6.273084073538525, "grad_norm": 16.011350631713867, "learning_rate": 2.0703648616390315e-05, "loss": 1.4787, "step": 37960 }, { "epoch": 6.2747366246643255, "grad_norm": 11.425875663757324, "learning_rate": 2.0694467397491694e-05, "loss": 1.5623, "step": 37970 }, { "epoch": 6.276389175790126, "grad_norm": 10.677700996398926, "learning_rate": 2.0685286178593073e-05, "loss": 1.4261, "step": 37980 }, { "epoch": 6.2780417269159265, "grad_norm": 19.281002044677734, "learning_rate": 2.067610495969445e-05, "loss": 1.7146, "step": 37990 }, { "epoch": 6.279694278041727, "grad_norm": 16.140913009643555, "learning_rate": 2.066692374079583e-05, "loss": 1.4536, "step": 38000 }, { "epoch": 6.281346829167528, "grad_norm": 11.096719741821289, "learning_rate": 2.0657742521897208e-05, "loss": 1.3705, "step": 38010 }, { "epoch": 6.282999380293328, "grad_norm": 18.13045883178711, "learning_rate": 2.0648561302998587e-05, "loss": 1.4394, "step": 38020 }, { "epoch": 6.284651931419129, "grad_norm": 14.277056694030762, "learning_rate": 2.0639380084099966e-05, "loss": 1.46, "step": 38030 }, { "epoch": 6.286304482544929, "grad_norm": 12.03947925567627, "learning_rate": 2.0630198865201346e-05, "loss": 1.2961, "step": 38040 }, { "epoch": 6.287957033670729, "grad_norm": 11.471107482910156, "learning_rate": 2.0621017646302725e-05, "loss": 1.3797, "step": 38050 }, { "epoch": 6.28960958479653, "grad_norm": 14.753610610961914, "learning_rate": 2.06118364274041e-05, "loss": 1.5292, "step": 38060 }, { "epoch": 6.29126213592233, "grad_norm": 9.602330207824707, "learning_rate": 2.060265520850548e-05, "loss": 1.3428, "step": 38070 }, { "epoch": 6.292914687048131, "grad_norm": 12.456007957458496, "learning_rate": 2.0593473989606863e-05, "loss": 1.4892, "step": 38080 }, { "epoch": 6.294567238173931, "grad_norm": 8.093803405761719, "learning_rate": 2.0584292770708242e-05, "loss": 1.4064, "step": 38090 }, { "epoch": 6.296219789299731, "grad_norm": 10.36359691619873, "learning_rate": 2.057511155180962e-05, "loss": 1.4547, "step": 38100 }, { "epoch": 6.297872340425532, "grad_norm": 10.041078567504883, "learning_rate": 2.0565930332911e-05, "loss": 1.4408, "step": 38110 }, { "epoch": 6.299524891551332, "grad_norm": 12.253533363342285, "learning_rate": 2.0556749114012376e-05, "loss": 1.589, "step": 38120 }, { "epoch": 6.301177442677133, "grad_norm": 10.570530891418457, "learning_rate": 2.0547567895113755e-05, "loss": 1.3889, "step": 38130 }, { "epoch": 6.302829993802933, "grad_norm": 13.251456260681152, "learning_rate": 2.0538386676215135e-05, "loss": 1.5385, "step": 38140 }, { "epoch": 6.304482544928733, "grad_norm": 27.627914428710938, "learning_rate": 2.0529205457316514e-05, "loss": 1.5747, "step": 38150 }, { "epoch": 6.306135096054534, "grad_norm": 9.486604690551758, "learning_rate": 2.0520024238417893e-05, "loss": 1.4378, "step": 38160 }, { "epoch": 6.3077876471803345, "grad_norm": 6.460193157196045, "learning_rate": 2.0510843019519272e-05, "loss": 1.444, "step": 38170 }, { "epoch": 6.3094401983061354, "grad_norm": 48.902530670166016, "learning_rate": 2.050166180062065e-05, "loss": 1.3742, "step": 38180 }, { "epoch": 6.3110927494319355, "grad_norm": 8.491707801818848, "learning_rate": 2.049248058172203e-05, "loss": 1.4993, "step": 38190 }, { "epoch": 6.312745300557736, "grad_norm": 14.18362808227539, "learning_rate": 2.048329936282341e-05, "loss": 1.5047, "step": 38200 }, { "epoch": 6.314397851683537, "grad_norm": 15.740824699401855, "learning_rate": 2.047411814392479e-05, "loss": 1.3403, "step": 38210 }, { "epoch": 6.316050402809337, "grad_norm": 11.892169952392578, "learning_rate": 2.046493692502617e-05, "loss": 1.5607, "step": 38220 }, { "epoch": 6.317702953935138, "grad_norm": 8.451154708862305, "learning_rate": 2.0455755706127548e-05, "loss": 1.3493, "step": 38230 }, { "epoch": 6.319355505060938, "grad_norm": 14.7618408203125, "learning_rate": 2.0446574487228927e-05, "loss": 1.504, "step": 38240 }, { "epoch": 6.321008056186738, "grad_norm": 10.903997421264648, "learning_rate": 2.0437393268330303e-05, "loss": 1.3993, "step": 38250 }, { "epoch": 6.322660607312539, "grad_norm": 12.13855266571045, "learning_rate": 2.0428212049431682e-05, "loss": 1.478, "step": 38260 }, { "epoch": 6.324313158438339, "grad_norm": 13.594361305236816, "learning_rate": 2.041903083053306e-05, "loss": 1.5079, "step": 38270 }, { "epoch": 6.32596570956414, "grad_norm": 9.111920356750488, "learning_rate": 2.040984961163444e-05, "loss": 1.5341, "step": 38280 }, { "epoch": 6.32761826068994, "grad_norm": 8.151607513427734, "learning_rate": 2.040066839273582e-05, "loss": 1.4063, "step": 38290 }, { "epoch": 6.32927081181574, "grad_norm": 9.332478523254395, "learning_rate": 2.0391487173837203e-05, "loss": 1.5013, "step": 38300 }, { "epoch": 6.330923362941541, "grad_norm": 20.929075241088867, "learning_rate": 2.038230595493858e-05, "loss": 1.3706, "step": 38310 }, { "epoch": 6.332575914067341, "grad_norm": 16.67856788635254, "learning_rate": 2.0373124736039958e-05, "loss": 1.4256, "step": 38320 }, { "epoch": 6.334228465193142, "grad_norm": 9.978659629821777, "learning_rate": 2.0363943517141337e-05, "loss": 1.5506, "step": 38330 }, { "epoch": 6.335881016318942, "grad_norm": 7.140017032623291, "learning_rate": 2.0354762298242716e-05, "loss": 1.3496, "step": 38340 }, { "epoch": 6.337533567444742, "grad_norm": 7.583073616027832, "learning_rate": 2.0345581079344095e-05, "loss": 1.2338, "step": 38350 }, { "epoch": 6.339186118570543, "grad_norm": 27.26841163635254, "learning_rate": 2.0336399860445475e-05, "loss": 1.3861, "step": 38360 }, { "epoch": 6.3408386696963435, "grad_norm": 22.0368595123291, "learning_rate": 2.0327218641546854e-05, "loss": 1.4712, "step": 38370 }, { "epoch": 6.3424912208221444, "grad_norm": 13.927277565002441, "learning_rate": 2.031803742264823e-05, "loss": 1.5026, "step": 38380 }, { "epoch": 6.3441437719479445, "grad_norm": 13.221909523010254, "learning_rate": 2.030885620374961e-05, "loss": 1.3972, "step": 38390 }, { "epoch": 6.3457963230737455, "grad_norm": 13.610274314880371, "learning_rate": 2.0299674984850988e-05, "loss": 1.5108, "step": 38400 }, { "epoch": 6.347448874199546, "grad_norm": 16.176225662231445, "learning_rate": 2.0290493765952368e-05, "loss": 1.3653, "step": 38410 }, { "epoch": 6.349101425325346, "grad_norm": 21.335020065307617, "learning_rate": 2.028131254705375e-05, "loss": 1.5748, "step": 38420 }, { "epoch": 6.350753976451147, "grad_norm": 17.27007293701172, "learning_rate": 2.027213132815513e-05, "loss": 1.4135, "step": 38430 }, { "epoch": 6.352406527576947, "grad_norm": 19.558349609375, "learning_rate": 2.0262950109256505e-05, "loss": 1.398, "step": 38440 }, { "epoch": 6.354059078702748, "grad_norm": 7.04690408706665, "learning_rate": 2.0253768890357885e-05, "loss": 1.3721, "step": 38450 }, { "epoch": 6.355711629828548, "grad_norm": 9.508496284484863, "learning_rate": 2.0244587671459264e-05, "loss": 1.3303, "step": 38460 }, { "epoch": 6.357364180954348, "grad_norm": 21.893522262573242, "learning_rate": 2.0235406452560643e-05, "loss": 1.3006, "step": 38470 }, { "epoch": 6.359016732080149, "grad_norm": 10.605900764465332, "learning_rate": 2.0226225233662022e-05, "loss": 1.5355, "step": 38480 }, { "epoch": 6.360669283205949, "grad_norm": 43.09657669067383, "learning_rate": 2.02170440147634e-05, "loss": 1.4263, "step": 38490 }, { "epoch": 6.36232183433175, "grad_norm": 11.316243171691895, "learning_rate": 2.020786279586478e-05, "loss": 1.435, "step": 38500 }, { "epoch": 6.36397438545755, "grad_norm": 13.281632423400879, "learning_rate": 2.0198681576966157e-05, "loss": 1.496, "step": 38510 }, { "epoch": 6.36562693658335, "grad_norm": 11.309965133666992, "learning_rate": 2.0189500358067536e-05, "loss": 1.4598, "step": 38520 }, { "epoch": 6.367279487709151, "grad_norm": 13.029101371765137, "learning_rate": 2.018031913916892e-05, "loss": 1.5399, "step": 38530 }, { "epoch": 6.368932038834951, "grad_norm": 112.3958740234375, "learning_rate": 2.0171137920270298e-05, "loss": 1.3838, "step": 38540 }, { "epoch": 6.370584589960752, "grad_norm": 9.24577808380127, "learning_rate": 2.0161956701371677e-05, "loss": 1.3194, "step": 38550 }, { "epoch": 6.372237141086552, "grad_norm": 14.375222206115723, "learning_rate": 2.0152775482473056e-05, "loss": 1.1996, "step": 38560 }, { "epoch": 6.3738896922123525, "grad_norm": 19.481122970581055, "learning_rate": 2.0143594263574432e-05, "loss": 1.5943, "step": 38570 }, { "epoch": 6.3755422433381534, "grad_norm": 13.95479679107666, "learning_rate": 2.013441304467581e-05, "loss": 1.4706, "step": 38580 }, { "epoch": 6.3771947944639535, "grad_norm": 13.144954681396484, "learning_rate": 2.012523182577719e-05, "loss": 1.3673, "step": 38590 }, { "epoch": 6.3788473455897545, "grad_norm": 12.10051155090332, "learning_rate": 2.011605060687857e-05, "loss": 1.2891, "step": 38600 }, { "epoch": 6.380499896715555, "grad_norm": 27.080062866210938, "learning_rate": 2.010686938797995e-05, "loss": 1.4499, "step": 38610 }, { "epoch": 6.382152447841355, "grad_norm": 14.484885215759277, "learning_rate": 2.009768816908133e-05, "loss": 1.3468, "step": 38620 }, { "epoch": 6.383804998967156, "grad_norm": 10.011687278747559, "learning_rate": 2.0088506950182708e-05, "loss": 1.4775, "step": 38630 }, { "epoch": 6.385457550092956, "grad_norm": 16.136798858642578, "learning_rate": 2.0079325731284083e-05, "loss": 1.4223, "step": 38640 }, { "epoch": 6.387110101218757, "grad_norm": 14.336216926574707, "learning_rate": 2.0070144512385466e-05, "loss": 1.4514, "step": 38650 }, { "epoch": 6.388762652344557, "grad_norm": 19.590391159057617, "learning_rate": 2.0060963293486845e-05, "loss": 1.5656, "step": 38660 }, { "epoch": 6.390415203470357, "grad_norm": 12.93675422668457, "learning_rate": 2.0051782074588225e-05, "loss": 1.4404, "step": 38670 }, { "epoch": 6.392067754596158, "grad_norm": 12.619584083557129, "learning_rate": 2.0042600855689604e-05, "loss": 1.4569, "step": 38680 }, { "epoch": 6.393720305721958, "grad_norm": 9.457258224487305, "learning_rate": 2.0033419636790983e-05, "loss": 1.3339, "step": 38690 }, { "epoch": 6.395372856847759, "grad_norm": 13.199435234069824, "learning_rate": 2.002423841789236e-05, "loss": 1.3885, "step": 38700 }, { "epoch": 6.397025407973559, "grad_norm": 19.85384178161621, "learning_rate": 2.0015057198993738e-05, "loss": 1.5067, "step": 38710 }, { "epoch": 6.398677959099359, "grad_norm": 9.798479080200195, "learning_rate": 2.0005875980095117e-05, "loss": 1.4391, "step": 38720 }, { "epoch": 6.40033051022516, "grad_norm": 14.563024520874023, "learning_rate": 1.9996694761196497e-05, "loss": 1.4709, "step": 38730 }, { "epoch": 6.40198306135096, "grad_norm": 13.90085506439209, "learning_rate": 1.9987513542297876e-05, "loss": 1.5339, "step": 38740 }, { "epoch": 6.403635612476761, "grad_norm": 7.516844272613525, "learning_rate": 1.9978332323399255e-05, "loss": 1.3254, "step": 38750 }, { "epoch": 6.405288163602561, "grad_norm": 10.863202095031738, "learning_rate": 1.9969151104500634e-05, "loss": 1.3872, "step": 38760 }, { "epoch": 6.406940714728362, "grad_norm": 21.591825485229492, "learning_rate": 1.9959969885602014e-05, "loss": 1.5489, "step": 38770 }, { "epoch": 6.4085932658541624, "grad_norm": 17.203176498413086, "learning_rate": 1.9950788666703393e-05, "loss": 1.4613, "step": 38780 }, { "epoch": 6.4102458169799625, "grad_norm": 13.360187530517578, "learning_rate": 1.9941607447804772e-05, "loss": 1.4316, "step": 38790 }, { "epoch": 6.4118983681057635, "grad_norm": 30.37958526611328, "learning_rate": 1.993242622890615e-05, "loss": 1.4888, "step": 38800 }, { "epoch": 6.413550919231564, "grad_norm": 21.938684463500977, "learning_rate": 1.992324501000753e-05, "loss": 1.5436, "step": 38810 }, { "epoch": 6.415203470357364, "grad_norm": 9.949905395507812, "learning_rate": 1.991406379110891e-05, "loss": 1.45, "step": 38820 }, { "epoch": 6.416856021483165, "grad_norm": 15.345038414001465, "learning_rate": 1.9904882572210286e-05, "loss": 1.4534, "step": 38830 }, { "epoch": 6.418508572608965, "grad_norm": 11.959953308105469, "learning_rate": 1.9895701353311665e-05, "loss": 1.4728, "step": 38840 }, { "epoch": 6.420161123734766, "grad_norm": 10.856972694396973, "learning_rate": 1.9886520134413044e-05, "loss": 1.4479, "step": 38850 }, { "epoch": 6.421813674860566, "grad_norm": 10.778406143188477, "learning_rate": 1.9877338915514424e-05, "loss": 1.4299, "step": 38860 }, { "epoch": 6.423466225986367, "grad_norm": 10.005019187927246, "learning_rate": 1.9868157696615803e-05, "loss": 1.3999, "step": 38870 }, { "epoch": 6.425118777112167, "grad_norm": 8.668060302734375, "learning_rate": 1.9858976477717185e-05, "loss": 1.425, "step": 38880 }, { "epoch": 6.426771328237967, "grad_norm": 13.997286796569824, "learning_rate": 1.984979525881856e-05, "loss": 1.3868, "step": 38890 }, { "epoch": 6.428423879363768, "grad_norm": 15.211793899536133, "learning_rate": 1.984061403991994e-05, "loss": 1.5317, "step": 38900 }, { "epoch": 6.430076430489568, "grad_norm": 7.7943034172058105, "learning_rate": 1.983143282102132e-05, "loss": 1.299, "step": 38910 }, { "epoch": 6.431728981615369, "grad_norm": 7.966866493225098, "learning_rate": 1.98222516021227e-05, "loss": 1.3767, "step": 38920 }, { "epoch": 6.433381532741169, "grad_norm": 29.43184471130371, "learning_rate": 1.9813070383224078e-05, "loss": 1.3973, "step": 38930 }, { "epoch": 6.435034083866969, "grad_norm": 15.053828239440918, "learning_rate": 1.9803889164325457e-05, "loss": 1.4583, "step": 38940 }, { "epoch": 6.43668663499277, "grad_norm": 6.739525318145752, "learning_rate": 1.9794707945426837e-05, "loss": 1.2457, "step": 38950 }, { "epoch": 6.43833918611857, "grad_norm": 15.069477081298828, "learning_rate": 1.9785526726528213e-05, "loss": 1.3477, "step": 38960 }, { "epoch": 6.439991737244371, "grad_norm": 14.493611335754395, "learning_rate": 1.9776345507629592e-05, "loss": 1.4433, "step": 38970 }, { "epoch": 6.4416442883701714, "grad_norm": 9.453902244567871, "learning_rate": 1.976716428873097e-05, "loss": 1.5945, "step": 38980 }, { "epoch": 6.4432968394959715, "grad_norm": 42.37503433227539, "learning_rate": 1.9757983069832354e-05, "loss": 1.4771, "step": 38990 }, { "epoch": 6.4449493906217725, "grad_norm": 13.014341354370117, "learning_rate": 1.9748801850933733e-05, "loss": 1.4682, "step": 39000 }, { "epoch": 6.446601941747573, "grad_norm": 13.779463768005371, "learning_rate": 1.9739620632035112e-05, "loss": 1.3348, "step": 39010 }, { "epoch": 6.448254492873374, "grad_norm": 27.29125213623047, "learning_rate": 1.9730439413136488e-05, "loss": 1.3392, "step": 39020 }, { "epoch": 6.449907043999174, "grad_norm": 23.10088539123535, "learning_rate": 1.9721258194237867e-05, "loss": 1.4222, "step": 39030 }, { "epoch": 6.451559595124974, "grad_norm": 9.619447708129883, "learning_rate": 1.9712076975339247e-05, "loss": 1.4682, "step": 39040 }, { "epoch": 6.453212146250775, "grad_norm": 12.745689392089844, "learning_rate": 1.9702895756440626e-05, "loss": 1.4813, "step": 39050 }, { "epoch": 6.454864697376575, "grad_norm": 13.98746109008789, "learning_rate": 1.9693714537542005e-05, "loss": 1.4522, "step": 39060 }, { "epoch": 6.456517248502376, "grad_norm": 11.557574272155762, "learning_rate": 1.9684533318643384e-05, "loss": 1.4221, "step": 39070 }, { "epoch": 6.458169799628176, "grad_norm": 10.058650016784668, "learning_rate": 1.9675352099744764e-05, "loss": 1.5567, "step": 39080 }, { "epoch": 6.459822350753976, "grad_norm": 13.266910552978516, "learning_rate": 1.966617088084614e-05, "loss": 1.5052, "step": 39090 }, { "epoch": 6.461474901879777, "grad_norm": 13.270020484924316, "learning_rate": 1.9656989661947522e-05, "loss": 1.4501, "step": 39100 }, { "epoch": 6.463127453005577, "grad_norm": 9.27397346496582, "learning_rate": 1.96478084430489e-05, "loss": 1.5234, "step": 39110 }, { "epoch": 6.464780004131378, "grad_norm": 12.057477951049805, "learning_rate": 1.963862722415028e-05, "loss": 1.3752, "step": 39120 }, { "epoch": 6.466432555257178, "grad_norm": 8.3319730758667, "learning_rate": 1.962944600525166e-05, "loss": 1.4345, "step": 39130 }, { "epoch": 6.468085106382979, "grad_norm": 9.282336235046387, "learning_rate": 1.962026478635304e-05, "loss": 1.3447, "step": 39140 }, { "epoch": 6.469737657508779, "grad_norm": 30.67858123779297, "learning_rate": 1.9611083567454415e-05, "loss": 1.4171, "step": 39150 }, { "epoch": 6.471390208634579, "grad_norm": 13.620590209960938, "learning_rate": 1.9601902348555794e-05, "loss": 1.4623, "step": 39160 }, { "epoch": 6.47304275976038, "grad_norm": 10.19704818725586, "learning_rate": 1.9592721129657173e-05, "loss": 1.4476, "step": 39170 }, { "epoch": 6.47469531088618, "grad_norm": 8.967962265014648, "learning_rate": 1.9583539910758553e-05, "loss": 1.502, "step": 39180 }, { "epoch": 6.4763478620119805, "grad_norm": 68.98345184326172, "learning_rate": 1.9574358691859932e-05, "loss": 1.4766, "step": 39190 }, { "epoch": 6.4780004131377815, "grad_norm": 13.379420280456543, "learning_rate": 1.956517747296131e-05, "loss": 1.4901, "step": 39200 }, { "epoch": 6.479652964263582, "grad_norm": 10.130301475524902, "learning_rate": 1.955599625406269e-05, "loss": 1.4408, "step": 39210 }, { "epoch": 6.481305515389383, "grad_norm": 9.031115531921387, "learning_rate": 1.954681503516407e-05, "loss": 1.4477, "step": 39220 }, { "epoch": 6.482958066515183, "grad_norm": 10.593781471252441, "learning_rate": 1.953763381626545e-05, "loss": 1.3162, "step": 39230 }, { "epoch": 6.484610617640984, "grad_norm": 10.19831657409668, "learning_rate": 1.9528452597366828e-05, "loss": 1.393, "step": 39240 }, { "epoch": 6.486263168766784, "grad_norm": 28.23524284362793, "learning_rate": 1.9519271378468207e-05, "loss": 1.4287, "step": 39250 }, { "epoch": 6.487915719892584, "grad_norm": 10.968947410583496, "learning_rate": 1.9510090159569587e-05, "loss": 1.4497, "step": 39260 }, { "epoch": 6.489568271018385, "grad_norm": 11.46053409576416, "learning_rate": 1.9500908940670966e-05, "loss": 1.511, "step": 39270 }, { "epoch": 6.491220822144185, "grad_norm": 11.82854175567627, "learning_rate": 1.9491727721772342e-05, "loss": 1.3231, "step": 39280 }, { "epoch": 6.492873373269986, "grad_norm": 14.076804161071777, "learning_rate": 1.948254650287372e-05, "loss": 1.4442, "step": 39290 }, { "epoch": 6.494525924395786, "grad_norm": 8.95998477935791, "learning_rate": 1.94733652839751e-05, "loss": 1.4034, "step": 39300 }, { "epoch": 6.496178475521586, "grad_norm": 18.003774642944336, "learning_rate": 1.946418406507648e-05, "loss": 1.5485, "step": 39310 }, { "epoch": 6.497831026647387, "grad_norm": 44.281982421875, "learning_rate": 1.945500284617786e-05, "loss": 1.4739, "step": 39320 }, { "epoch": 6.499483577773187, "grad_norm": 8.51934814453125, "learning_rate": 1.944582162727924e-05, "loss": 1.5113, "step": 39330 }, { "epoch": 6.501136128898988, "grad_norm": 12.045053482055664, "learning_rate": 1.9436640408380617e-05, "loss": 1.4032, "step": 39340 }, { "epoch": 6.502788680024788, "grad_norm": 9.922347068786621, "learning_rate": 1.9427459189481996e-05, "loss": 1.3881, "step": 39350 }, { "epoch": 6.504441231150588, "grad_norm": 17.480976104736328, "learning_rate": 1.9418277970583376e-05, "loss": 1.4731, "step": 39360 }, { "epoch": 6.506093782276389, "grad_norm": 7.395370006561279, "learning_rate": 1.9409096751684755e-05, "loss": 1.3666, "step": 39370 }, { "epoch": 6.507746333402189, "grad_norm": 23.064151763916016, "learning_rate": 1.9399915532786134e-05, "loss": 1.4598, "step": 39380 }, { "epoch": 6.50939888452799, "grad_norm": 8.329906463623047, "learning_rate": 1.9390734313887513e-05, "loss": 1.367, "step": 39390 }, { "epoch": 6.5110514356537905, "grad_norm": 8.619010925292969, "learning_rate": 1.9381553094988893e-05, "loss": 1.4067, "step": 39400 }, { "epoch": 6.512703986779591, "grad_norm": 9.927018165588379, "learning_rate": 1.937237187609027e-05, "loss": 1.4338, "step": 39410 }, { "epoch": 6.514356537905392, "grad_norm": 13.935831069946289, "learning_rate": 1.9363190657191648e-05, "loss": 1.4524, "step": 39420 }, { "epoch": 6.516009089031192, "grad_norm": 8.87132453918457, "learning_rate": 1.9354009438293027e-05, "loss": 1.3038, "step": 39430 }, { "epoch": 6.517661640156993, "grad_norm": 9.636089324951172, "learning_rate": 1.9344828219394406e-05, "loss": 1.4682, "step": 39440 }, { "epoch": 6.519314191282793, "grad_norm": 21.424053192138672, "learning_rate": 1.933564700049579e-05, "loss": 1.3425, "step": 39450 }, { "epoch": 6.520966742408593, "grad_norm": 53.614646911621094, "learning_rate": 1.9326465781597168e-05, "loss": 1.4947, "step": 39460 }, { "epoch": 6.522619293534394, "grad_norm": 8.673842430114746, "learning_rate": 1.9317284562698544e-05, "loss": 1.4588, "step": 39470 }, { "epoch": 6.524271844660194, "grad_norm": 47.06062316894531, "learning_rate": 1.9308103343799923e-05, "loss": 1.2771, "step": 39480 }, { "epoch": 6.525924395785995, "grad_norm": 11.706918716430664, "learning_rate": 1.9298922124901303e-05, "loss": 1.4553, "step": 39490 }, { "epoch": 6.527576946911795, "grad_norm": 9.595712661743164, "learning_rate": 1.9289740906002682e-05, "loss": 1.4247, "step": 39500 }, { "epoch": 6.529229498037596, "grad_norm": 9.61540412902832, "learning_rate": 1.928055968710406e-05, "loss": 1.3642, "step": 39510 }, { "epoch": 6.530882049163396, "grad_norm": 34.72698974609375, "learning_rate": 1.927137846820544e-05, "loss": 1.4828, "step": 39520 }, { "epoch": 6.532534600289196, "grad_norm": 14.31153678894043, "learning_rate": 1.926219724930682e-05, "loss": 1.5188, "step": 39530 }, { "epoch": 6.534187151414997, "grad_norm": 12.134452819824219, "learning_rate": 1.9253016030408195e-05, "loss": 1.5148, "step": 39540 }, { "epoch": 6.535839702540797, "grad_norm": 8.781564712524414, "learning_rate": 1.9243834811509575e-05, "loss": 1.5178, "step": 39550 }, { "epoch": 6.537492253666597, "grad_norm": 16.55754852294922, "learning_rate": 1.9234653592610957e-05, "loss": 1.4356, "step": 39560 }, { "epoch": 6.539144804792398, "grad_norm": 14.849857330322266, "learning_rate": 1.9225472373712337e-05, "loss": 1.405, "step": 39570 }, { "epoch": 6.540797355918198, "grad_norm": 12.2333402633667, "learning_rate": 1.9216291154813716e-05, "loss": 1.4223, "step": 39580 }, { "epoch": 6.542449907043999, "grad_norm": 8.129743576049805, "learning_rate": 1.9207109935915095e-05, "loss": 1.4793, "step": 39590 }, { "epoch": 6.5441024581697995, "grad_norm": 10.34343433380127, "learning_rate": 1.919792871701647e-05, "loss": 1.4236, "step": 39600 }, { "epoch": 6.5457550092956005, "grad_norm": 18.99115753173828, "learning_rate": 1.918874749811785e-05, "loss": 1.4893, "step": 39610 }, { "epoch": 6.547407560421401, "grad_norm": 13.560120582580566, "learning_rate": 1.917956627921923e-05, "loss": 1.5145, "step": 39620 }, { "epoch": 6.549060111547201, "grad_norm": 20.85279655456543, "learning_rate": 1.917038506032061e-05, "loss": 1.3886, "step": 39630 }, { "epoch": 6.550712662673002, "grad_norm": 11.048171997070312, "learning_rate": 1.9161203841421988e-05, "loss": 1.4902, "step": 39640 }, { "epoch": 6.552365213798802, "grad_norm": 11.270243644714355, "learning_rate": 1.9152022622523367e-05, "loss": 1.4862, "step": 39650 }, { "epoch": 6.554017764924602, "grad_norm": 10.38618278503418, "learning_rate": 1.9142841403624746e-05, "loss": 1.478, "step": 39660 }, { "epoch": 6.555670316050403, "grad_norm": 11.706856727600098, "learning_rate": 1.9133660184726126e-05, "loss": 1.4313, "step": 39670 }, { "epoch": 6.557322867176203, "grad_norm": 19.566499710083008, "learning_rate": 1.9124478965827505e-05, "loss": 1.4815, "step": 39680 }, { "epoch": 6.558975418302004, "grad_norm": 10.643657684326172, "learning_rate": 1.9115297746928884e-05, "loss": 1.2795, "step": 39690 }, { "epoch": 6.560627969427804, "grad_norm": 12.18120288848877, "learning_rate": 1.9106116528030263e-05, "loss": 1.2629, "step": 39700 }, { "epoch": 6.562280520553605, "grad_norm": 9.885913848876953, "learning_rate": 1.9096935309131643e-05, "loss": 1.4513, "step": 39710 }, { "epoch": 6.563933071679405, "grad_norm": 7.771469593048096, "learning_rate": 1.9087754090233022e-05, "loss": 1.4502, "step": 39720 }, { "epoch": 6.565585622805205, "grad_norm": 8.415877342224121, "learning_rate": 1.9078572871334398e-05, "loss": 1.5465, "step": 39730 }, { "epoch": 6.567238173931006, "grad_norm": 30.650428771972656, "learning_rate": 1.9069391652435777e-05, "loss": 1.3623, "step": 39740 }, { "epoch": 6.568890725056806, "grad_norm": 19.83210563659668, "learning_rate": 1.9060210433537156e-05, "loss": 1.3031, "step": 39750 }, { "epoch": 6.570543276182607, "grad_norm": 11.888619422912598, "learning_rate": 1.9051029214638535e-05, "loss": 1.55, "step": 39760 }, { "epoch": 6.572195827308407, "grad_norm": 8.220413208007812, "learning_rate": 1.9041847995739915e-05, "loss": 1.395, "step": 39770 }, { "epoch": 6.573848378434207, "grad_norm": 10.484262466430664, "learning_rate": 1.9032666776841294e-05, "loss": 1.3441, "step": 39780 }, { "epoch": 6.575500929560008, "grad_norm": 8.3544921875, "learning_rate": 1.9023485557942673e-05, "loss": 1.4837, "step": 39790 }, { "epoch": 6.5771534806858085, "grad_norm": 10.449460983276367, "learning_rate": 1.9014304339044052e-05, "loss": 1.4106, "step": 39800 }, { "epoch": 6.5788060318116095, "grad_norm": 9.053803443908691, "learning_rate": 1.9005123120145432e-05, "loss": 1.309, "step": 39810 }, { "epoch": 6.58045858293741, "grad_norm": 26.58787727355957, "learning_rate": 1.899594190124681e-05, "loss": 1.4609, "step": 39820 }, { "epoch": 6.58211113406321, "grad_norm": 12.057016372680664, "learning_rate": 1.898676068234819e-05, "loss": 1.3254, "step": 39830 }, { "epoch": 6.583763685189011, "grad_norm": 7.667273998260498, "learning_rate": 1.897757946344957e-05, "loss": 1.4452, "step": 39840 }, { "epoch": 6.585416236314811, "grad_norm": 12.83427619934082, "learning_rate": 1.896839824455095e-05, "loss": 1.4146, "step": 39850 }, { "epoch": 6.587068787440612, "grad_norm": 13.283857345581055, "learning_rate": 1.8959217025652325e-05, "loss": 1.4297, "step": 39860 }, { "epoch": 6.588721338566412, "grad_norm": 13.097709655761719, "learning_rate": 1.8950035806753704e-05, "loss": 1.443, "step": 39870 }, { "epoch": 6.590373889692213, "grad_norm": 10.823288917541504, "learning_rate": 1.8940854587855083e-05, "loss": 1.4325, "step": 39880 }, { "epoch": 6.592026440818013, "grad_norm": 13.02402400970459, "learning_rate": 1.8931673368956462e-05, "loss": 1.4653, "step": 39890 }, { "epoch": 6.593678991943813, "grad_norm": 11.030945777893066, "learning_rate": 1.8922492150057845e-05, "loss": 1.2672, "step": 39900 }, { "epoch": 6.595331543069614, "grad_norm": 8.03734302520752, "learning_rate": 1.8913310931159224e-05, "loss": 1.3499, "step": 39910 }, { "epoch": 6.596984094195414, "grad_norm": 12.094477653503418, "learning_rate": 1.89041297122606e-05, "loss": 1.4214, "step": 39920 }, { "epoch": 6.598636645321214, "grad_norm": 10.550955772399902, "learning_rate": 1.889494849336198e-05, "loss": 1.4045, "step": 39930 }, { "epoch": 6.600289196447015, "grad_norm": 10.761007308959961, "learning_rate": 1.888576727446336e-05, "loss": 1.5118, "step": 39940 }, { "epoch": 6.601941747572815, "grad_norm": 19.835403442382812, "learning_rate": 1.8876586055564738e-05, "loss": 1.4687, "step": 39950 }, { "epoch": 6.603594298698616, "grad_norm": 11.144922256469727, "learning_rate": 1.8867404836666117e-05, "loss": 1.4642, "step": 39960 }, { "epoch": 6.605246849824416, "grad_norm": 12.04179573059082, "learning_rate": 1.8858223617767496e-05, "loss": 1.6522, "step": 39970 }, { "epoch": 6.606899400950217, "grad_norm": 8.051116943359375, "learning_rate": 1.8849042398868876e-05, "loss": 1.4801, "step": 39980 }, { "epoch": 6.608551952076017, "grad_norm": 13.712382316589355, "learning_rate": 1.883986117997025e-05, "loss": 1.4568, "step": 39990 }, { "epoch": 6.6102045032018175, "grad_norm": 7.790043830871582, "learning_rate": 1.883067996107163e-05, "loss": 1.3776, "step": 40000 }, { "epoch": 6.6118570543276185, "grad_norm": 16.25408172607422, "learning_rate": 1.882149874217301e-05, "loss": 1.3283, "step": 40010 }, { "epoch": 6.613509605453419, "grad_norm": 6.5616774559021, "learning_rate": 1.8812317523274393e-05, "loss": 1.3177, "step": 40020 }, { "epoch": 6.615162156579219, "grad_norm": 8.824763298034668, "learning_rate": 1.8803136304375772e-05, "loss": 1.3024, "step": 40030 }, { "epoch": 6.61681470770502, "grad_norm": 14.049736976623535, "learning_rate": 1.879395508547715e-05, "loss": 1.3356, "step": 40040 }, { "epoch": 6.61846725883082, "grad_norm": 16.632484436035156, "learning_rate": 1.8784773866578527e-05, "loss": 1.5723, "step": 40050 }, { "epoch": 6.620119809956621, "grad_norm": 21.031719207763672, "learning_rate": 1.8775592647679906e-05, "loss": 1.2901, "step": 40060 }, { "epoch": 6.621772361082421, "grad_norm": 15.199872016906738, "learning_rate": 1.8766411428781285e-05, "loss": 1.4082, "step": 40070 }, { "epoch": 6.623424912208222, "grad_norm": 10.436044692993164, "learning_rate": 1.8757230209882665e-05, "loss": 1.4145, "step": 40080 }, { "epoch": 6.625077463334022, "grad_norm": 8.306492805480957, "learning_rate": 1.8748048990984044e-05, "loss": 1.3091, "step": 40090 }, { "epoch": 6.626730014459822, "grad_norm": 10.09487247467041, "learning_rate": 1.8738867772085423e-05, "loss": 1.3551, "step": 40100 }, { "epoch": 6.628382565585623, "grad_norm": 13.294191360473633, "learning_rate": 1.8729686553186802e-05, "loss": 1.6028, "step": 40110 }, { "epoch": 6.630035116711423, "grad_norm": 11.726201057434082, "learning_rate": 1.8720505334288178e-05, "loss": 1.4088, "step": 40120 }, { "epoch": 6.631687667837224, "grad_norm": 9.069192886352539, "learning_rate": 1.871132411538956e-05, "loss": 1.4661, "step": 40130 }, { "epoch": 6.633340218963024, "grad_norm": 14.839059829711914, "learning_rate": 1.870214289649094e-05, "loss": 1.5112, "step": 40140 }, { "epoch": 6.634992770088824, "grad_norm": 15.229697227478027, "learning_rate": 1.869296167759232e-05, "loss": 1.289, "step": 40150 }, { "epoch": 6.636645321214625, "grad_norm": 9.121678352355957, "learning_rate": 1.86837804586937e-05, "loss": 1.5177, "step": 40160 }, { "epoch": 6.638297872340425, "grad_norm": 9.134458541870117, "learning_rate": 1.8674599239795078e-05, "loss": 1.3569, "step": 40170 }, { "epoch": 6.639950423466226, "grad_norm": 9.136309623718262, "learning_rate": 1.8665418020896454e-05, "loss": 1.4439, "step": 40180 }, { "epoch": 6.641602974592026, "grad_norm": 13.449373245239258, "learning_rate": 1.8656236801997833e-05, "loss": 1.4349, "step": 40190 }, { "epoch": 6.6432555257178265, "grad_norm": 14.80142879486084, "learning_rate": 1.8647055583099212e-05, "loss": 1.379, "step": 40200 }, { "epoch": 6.6449080768436275, "grad_norm": 12.268270492553711, "learning_rate": 1.863787436420059e-05, "loss": 1.4978, "step": 40210 }, { "epoch": 6.646560627969428, "grad_norm": 20.85672378540039, "learning_rate": 1.862869314530197e-05, "loss": 1.3803, "step": 40220 }, { "epoch": 6.648213179095229, "grad_norm": 15.535070419311523, "learning_rate": 1.861951192640335e-05, "loss": 1.383, "step": 40230 }, { "epoch": 6.649865730221029, "grad_norm": 9.89908504486084, "learning_rate": 1.861033070750473e-05, "loss": 1.4263, "step": 40240 }, { "epoch": 6.65151828134683, "grad_norm": 15.000544548034668, "learning_rate": 1.860114948860611e-05, "loss": 1.5781, "step": 40250 }, { "epoch": 6.65317083247263, "grad_norm": 11.549114227294922, "learning_rate": 1.8591968269707488e-05, "loss": 1.4254, "step": 40260 }, { "epoch": 6.65482338359843, "grad_norm": 16.330419540405273, "learning_rate": 1.8582787050808867e-05, "loss": 1.4272, "step": 40270 }, { "epoch": 6.656475934724231, "grad_norm": 12.878795623779297, "learning_rate": 1.8573605831910246e-05, "loss": 1.4654, "step": 40280 }, { "epoch": 6.658128485850031, "grad_norm": 35.6964111328125, "learning_rate": 1.8564424613011625e-05, "loss": 1.3814, "step": 40290 }, { "epoch": 6.659781036975831, "grad_norm": 14.159383773803711, "learning_rate": 1.8555243394113005e-05, "loss": 1.5742, "step": 40300 }, { "epoch": 6.661433588101632, "grad_norm": 8.858052253723145, "learning_rate": 1.854606217521438e-05, "loss": 1.4659, "step": 40310 }, { "epoch": 6.663086139227432, "grad_norm": 9.946170806884766, "learning_rate": 1.853688095631576e-05, "loss": 1.2612, "step": 40320 }, { "epoch": 6.664738690353233, "grad_norm": 9.768632888793945, "learning_rate": 1.852769973741714e-05, "loss": 1.4816, "step": 40330 }, { "epoch": 6.666391241479033, "grad_norm": 11.27400016784668, "learning_rate": 1.8518518518518518e-05, "loss": 1.466, "step": 40340 }, { "epoch": 6.668043792604834, "grad_norm": 13.281896591186523, "learning_rate": 1.8509337299619898e-05, "loss": 1.356, "step": 40350 }, { "epoch": 6.669696343730634, "grad_norm": 14.230932235717773, "learning_rate": 1.850015608072128e-05, "loss": 1.3526, "step": 40360 }, { "epoch": 6.671348894856434, "grad_norm": 9.921380043029785, "learning_rate": 1.8490974861822656e-05, "loss": 1.4302, "step": 40370 }, { "epoch": 6.673001445982235, "grad_norm": 13.660683631896973, "learning_rate": 1.8481793642924035e-05, "loss": 1.3974, "step": 40380 }, { "epoch": 6.674653997108035, "grad_norm": 15.629735946655273, "learning_rate": 1.8472612424025415e-05, "loss": 1.4577, "step": 40390 }, { "epoch": 6.6763065482338355, "grad_norm": 10.573319435119629, "learning_rate": 1.8463431205126794e-05, "loss": 1.425, "step": 40400 }, { "epoch": 6.6779590993596365, "grad_norm": 9.454117774963379, "learning_rate": 1.8454249986228173e-05, "loss": 1.374, "step": 40410 }, { "epoch": 6.679611650485437, "grad_norm": 21.72432518005371, "learning_rate": 1.8445068767329552e-05, "loss": 1.499, "step": 40420 }, { "epoch": 6.681264201611238, "grad_norm": 10.876113891601562, "learning_rate": 1.843588754843093e-05, "loss": 1.5718, "step": 40430 }, { "epoch": 6.682916752737038, "grad_norm": 21.28350067138672, "learning_rate": 1.8426706329532307e-05, "loss": 1.399, "step": 40440 }, { "epoch": 6.684569303862839, "grad_norm": 10.992979049682617, "learning_rate": 1.8417525110633687e-05, "loss": 1.3749, "step": 40450 }, { "epoch": 6.686221854988639, "grad_norm": 11.09716510772705, "learning_rate": 1.8408343891735066e-05, "loss": 1.3887, "step": 40460 }, { "epoch": 6.687874406114439, "grad_norm": 14.070152282714844, "learning_rate": 1.839916267283645e-05, "loss": 1.541, "step": 40470 }, { "epoch": 6.68952695724024, "grad_norm": 16.888887405395508, "learning_rate": 1.8389981453937828e-05, "loss": 1.4876, "step": 40480 }, { "epoch": 6.69117950836604, "grad_norm": 13.392816543579102, "learning_rate": 1.8380800235039207e-05, "loss": 1.4925, "step": 40490 }, { "epoch": 6.692832059491841, "grad_norm": 12.64699649810791, "learning_rate": 1.8371619016140583e-05, "loss": 1.4564, "step": 40500 }, { "epoch": 6.694484610617641, "grad_norm": 48.2119026184082, "learning_rate": 1.8362437797241962e-05, "loss": 1.4863, "step": 40510 }, { "epoch": 6.696137161743441, "grad_norm": 10.135701179504395, "learning_rate": 1.835325657834334e-05, "loss": 1.4801, "step": 40520 }, { "epoch": 6.697789712869242, "grad_norm": 13.071991920471191, "learning_rate": 1.834407535944472e-05, "loss": 1.5195, "step": 40530 }, { "epoch": 6.699442263995042, "grad_norm": 10.40569019317627, "learning_rate": 1.83348941405461e-05, "loss": 1.4392, "step": 40540 }, { "epoch": 6.701094815120843, "grad_norm": 8.229533195495605, "learning_rate": 1.832571292164748e-05, "loss": 1.4115, "step": 40550 }, { "epoch": 6.702747366246643, "grad_norm": 10.888386726379395, "learning_rate": 1.831653170274886e-05, "loss": 1.3083, "step": 40560 }, { "epoch": 6.704399917372443, "grad_norm": 10.805375099182129, "learning_rate": 1.8307350483850234e-05, "loss": 1.4312, "step": 40570 }, { "epoch": 6.706052468498244, "grad_norm": 21.973468780517578, "learning_rate": 1.8298169264951613e-05, "loss": 1.4178, "step": 40580 }, { "epoch": 6.707705019624044, "grad_norm": 12.331151008605957, "learning_rate": 1.8288988046052996e-05, "loss": 1.361, "step": 40590 }, { "epoch": 6.709357570749845, "grad_norm": 17.735149383544922, "learning_rate": 1.8279806827154375e-05, "loss": 1.5563, "step": 40600 }, { "epoch": 6.7110101218756455, "grad_norm": 9.35239028930664, "learning_rate": 1.8270625608255755e-05, "loss": 1.2544, "step": 40610 }, { "epoch": 6.7126626730014465, "grad_norm": 10.340348243713379, "learning_rate": 1.8261444389357134e-05, "loss": 1.5049, "step": 40620 }, { "epoch": 6.714315224127247, "grad_norm": 7.2036871910095215, "learning_rate": 1.825226317045851e-05, "loss": 1.3728, "step": 40630 }, { "epoch": 6.715967775253047, "grad_norm": 21.527620315551758, "learning_rate": 1.824308195155989e-05, "loss": 1.4878, "step": 40640 }, { "epoch": 6.717620326378848, "grad_norm": 16.127460479736328, "learning_rate": 1.8233900732661268e-05, "loss": 1.4064, "step": 40650 }, { "epoch": 6.719272877504648, "grad_norm": 27.192962646484375, "learning_rate": 1.8224719513762647e-05, "loss": 1.3923, "step": 40660 }, { "epoch": 6.720925428630448, "grad_norm": 13.89322280883789, "learning_rate": 1.8215538294864027e-05, "loss": 1.4246, "step": 40670 }, { "epoch": 6.722577979756249, "grad_norm": 12.22103214263916, "learning_rate": 1.8206357075965406e-05, "loss": 1.3845, "step": 40680 }, { "epoch": 6.724230530882049, "grad_norm": 10.441896438598633, "learning_rate": 1.8197175857066785e-05, "loss": 1.6125, "step": 40690 }, { "epoch": 6.72588308200785, "grad_norm": 10.106922149658203, "learning_rate": 1.8187994638168164e-05, "loss": 1.3981, "step": 40700 }, { "epoch": 6.72753563313365, "grad_norm": 10.453673362731934, "learning_rate": 1.8178813419269544e-05, "loss": 1.4177, "step": 40710 }, { "epoch": 6.729188184259451, "grad_norm": 14.970731735229492, "learning_rate": 1.8169632200370923e-05, "loss": 1.4991, "step": 40720 }, { "epoch": 6.730840735385251, "grad_norm": 8.335892677307129, "learning_rate": 1.8160450981472302e-05, "loss": 1.354, "step": 40730 }, { "epoch": 6.732493286511051, "grad_norm": 15.514796257019043, "learning_rate": 1.815126976257368e-05, "loss": 1.4697, "step": 40740 }, { "epoch": 6.734145837636852, "grad_norm": 11.07535457611084, "learning_rate": 1.814208854367506e-05, "loss": 1.4608, "step": 40750 }, { "epoch": 6.735798388762652, "grad_norm": 14.842573165893555, "learning_rate": 1.8132907324776437e-05, "loss": 1.4753, "step": 40760 }, { "epoch": 6.737450939888452, "grad_norm": 9.721745491027832, "learning_rate": 1.8123726105877816e-05, "loss": 1.5479, "step": 40770 }, { "epoch": 6.739103491014253, "grad_norm": 8.41561222076416, "learning_rate": 1.8114544886979195e-05, "loss": 1.3348, "step": 40780 }, { "epoch": 6.740756042140053, "grad_norm": 9.427742004394531, "learning_rate": 1.8105363668080574e-05, "loss": 1.4775, "step": 40790 }, { "epoch": 6.742408593265854, "grad_norm": 11.878392219543457, "learning_rate": 1.8096182449181954e-05, "loss": 1.492, "step": 40800 }, { "epoch": 6.7440611443916545, "grad_norm": 14.520692825317383, "learning_rate": 1.8087001230283333e-05, "loss": 1.5121, "step": 40810 }, { "epoch": 6.7457136955174555, "grad_norm": 5.713458061218262, "learning_rate": 1.8077820011384712e-05, "loss": 1.4465, "step": 40820 }, { "epoch": 6.747366246643256, "grad_norm": 11.001376152038574, "learning_rate": 1.806863879248609e-05, "loss": 1.3831, "step": 40830 }, { "epoch": 6.749018797769056, "grad_norm": 11.784494400024414, "learning_rate": 1.805945757358747e-05, "loss": 1.4752, "step": 40840 }, { "epoch": 6.750671348894857, "grad_norm": 16.32159996032715, "learning_rate": 1.805027635468885e-05, "loss": 1.3969, "step": 40850 }, { "epoch": 6.752323900020657, "grad_norm": 8.51651382446289, "learning_rate": 1.804109513579023e-05, "loss": 1.4283, "step": 40860 }, { "epoch": 6.753976451146458, "grad_norm": 17.39286231994629, "learning_rate": 1.8031913916891608e-05, "loss": 1.482, "step": 40870 }, { "epoch": 6.755629002272258, "grad_norm": 19.771865844726562, "learning_rate": 1.8022732697992987e-05, "loss": 1.3334, "step": 40880 }, { "epoch": 6.757281553398058, "grad_norm": 8.667237281799316, "learning_rate": 1.8013551479094363e-05, "loss": 1.3109, "step": 40890 }, { "epoch": 6.758934104523859, "grad_norm": 16.53960609436035, "learning_rate": 1.8004370260195743e-05, "loss": 1.4221, "step": 40900 }, { "epoch": 6.760586655649659, "grad_norm": 18.91925048828125, "learning_rate": 1.7995189041297122e-05, "loss": 1.376, "step": 40910 }, { "epoch": 6.76223920677546, "grad_norm": 14.152482986450195, "learning_rate": 1.79860078223985e-05, "loss": 1.4648, "step": 40920 }, { "epoch": 6.76389175790126, "grad_norm": 10.383070945739746, "learning_rate": 1.7976826603499884e-05, "loss": 1.3391, "step": 40930 }, { "epoch": 6.76554430902706, "grad_norm": 15.106592178344727, "learning_rate": 1.7967645384601263e-05, "loss": 1.4528, "step": 40940 }, { "epoch": 6.767196860152861, "grad_norm": 13.787833213806152, "learning_rate": 1.795846416570264e-05, "loss": 1.3813, "step": 40950 }, { "epoch": 6.768849411278661, "grad_norm": 6.263724327087402, "learning_rate": 1.7949282946804018e-05, "loss": 1.5428, "step": 40960 }, { "epoch": 6.770501962404462, "grad_norm": 10.366127014160156, "learning_rate": 1.7940101727905397e-05, "loss": 1.4737, "step": 40970 }, { "epoch": 6.772154513530262, "grad_norm": 27.11163902282715, "learning_rate": 1.7930920509006777e-05, "loss": 1.4403, "step": 40980 }, { "epoch": 6.773807064656062, "grad_norm": 10.552127838134766, "learning_rate": 1.7921739290108156e-05, "loss": 1.4511, "step": 40990 }, { "epoch": 6.775459615781863, "grad_norm": 17.98050308227539, "learning_rate": 1.7912558071209535e-05, "loss": 1.5128, "step": 41000 }, { "epoch": 6.7771121669076635, "grad_norm": 9.192895889282227, "learning_rate": 1.7903376852310914e-05, "loss": 1.334, "step": 41010 }, { "epoch": 6.7787647180334645, "grad_norm": 11.638321876525879, "learning_rate": 1.789419563341229e-05, "loss": 1.3764, "step": 41020 }, { "epoch": 6.780417269159265, "grad_norm": 12.00686264038086, "learning_rate": 1.788501441451367e-05, "loss": 1.4813, "step": 41030 }, { "epoch": 6.782069820285065, "grad_norm": 11.396140098571777, "learning_rate": 1.7875833195615052e-05, "loss": 1.438, "step": 41040 }, { "epoch": 6.783722371410866, "grad_norm": 55.01451873779297, "learning_rate": 1.786665197671643e-05, "loss": 1.4997, "step": 41050 }, { "epoch": 6.785374922536666, "grad_norm": 9.244829177856445, "learning_rate": 1.785747075781781e-05, "loss": 1.3627, "step": 41060 }, { "epoch": 6.787027473662467, "grad_norm": 9.327887535095215, "learning_rate": 1.784828953891919e-05, "loss": 1.3883, "step": 41070 }, { "epoch": 6.788680024788267, "grad_norm": 9.212366104125977, "learning_rate": 1.7839108320020566e-05, "loss": 1.4265, "step": 41080 }, { "epoch": 6.790332575914068, "grad_norm": 41.623443603515625, "learning_rate": 1.7829927101121945e-05, "loss": 1.4342, "step": 41090 }, { "epoch": 6.791985127039868, "grad_norm": 10.537240982055664, "learning_rate": 1.7820745882223324e-05, "loss": 1.5631, "step": 41100 }, { "epoch": 6.793637678165668, "grad_norm": 14.747405052185059, "learning_rate": 1.7811564663324703e-05, "loss": 1.4803, "step": 41110 }, { "epoch": 6.795290229291469, "grad_norm": 10.192240715026855, "learning_rate": 1.7802383444426083e-05, "loss": 1.3918, "step": 41120 }, { "epoch": 6.796942780417269, "grad_norm": 9.440613746643066, "learning_rate": 1.7793202225527462e-05, "loss": 1.4808, "step": 41130 }, { "epoch": 6.798595331543069, "grad_norm": 14.262847900390625, "learning_rate": 1.778402100662884e-05, "loss": 1.3783, "step": 41140 }, { "epoch": 6.80024788266887, "grad_norm": 10.99431324005127, "learning_rate": 1.7774839787730217e-05, "loss": 1.4737, "step": 41150 }, { "epoch": 6.80190043379467, "grad_norm": 8.806312561035156, "learning_rate": 1.77656585688316e-05, "loss": 1.3779, "step": 41160 }, { "epoch": 6.803552984920471, "grad_norm": 39.46382141113281, "learning_rate": 1.775647734993298e-05, "loss": 1.4395, "step": 41170 }, { "epoch": 6.805205536046271, "grad_norm": 14.97314453125, "learning_rate": 1.7747296131034358e-05, "loss": 1.4276, "step": 41180 }, { "epoch": 6.806858087172072, "grad_norm": 9.130488395690918, "learning_rate": 1.7738114912135737e-05, "loss": 1.3061, "step": 41190 }, { "epoch": 6.808510638297872, "grad_norm": 9.451000213623047, "learning_rate": 1.7728933693237117e-05, "loss": 1.5621, "step": 41200 }, { "epoch": 6.8101631894236725, "grad_norm": 10.108766555786133, "learning_rate": 1.7719752474338492e-05, "loss": 1.387, "step": 41210 }, { "epoch": 6.8118157405494735, "grad_norm": 7.01558780670166, "learning_rate": 1.7710571255439872e-05, "loss": 1.3172, "step": 41220 }, { "epoch": 6.813468291675274, "grad_norm": 10.980785369873047, "learning_rate": 1.770139003654125e-05, "loss": 1.4921, "step": 41230 }, { "epoch": 6.815120842801074, "grad_norm": 12.635100364685059, "learning_rate": 1.769220881764263e-05, "loss": 1.5118, "step": 41240 }, { "epoch": 6.816773393926875, "grad_norm": 21.64072036743164, "learning_rate": 1.768302759874401e-05, "loss": 1.3484, "step": 41250 }, { "epoch": 6.818425945052675, "grad_norm": 10.687591552734375, "learning_rate": 1.767384637984539e-05, "loss": 1.3822, "step": 41260 }, { "epoch": 6.820078496178476, "grad_norm": 31.336519241333008, "learning_rate": 1.7664665160946768e-05, "loss": 1.4516, "step": 41270 }, { "epoch": 6.821731047304276, "grad_norm": 14.67437744140625, "learning_rate": 1.7655483942048147e-05, "loss": 1.4765, "step": 41280 }, { "epoch": 6.823383598430077, "grad_norm": 18.006410598754883, "learning_rate": 1.7646302723149526e-05, "loss": 1.3861, "step": 41290 }, { "epoch": 6.825036149555877, "grad_norm": 10.844664573669434, "learning_rate": 1.7637121504250906e-05, "loss": 1.4537, "step": 41300 }, { "epoch": 6.826688700681677, "grad_norm": 11.941879272460938, "learning_rate": 1.7627940285352285e-05, "loss": 1.4432, "step": 41310 }, { "epoch": 6.828341251807478, "grad_norm": 16.71599769592285, "learning_rate": 1.7618759066453664e-05, "loss": 1.4506, "step": 41320 }, { "epoch": 6.829993802933278, "grad_norm": 11.480480194091797, "learning_rate": 1.7609577847555043e-05, "loss": 1.5141, "step": 41330 }, { "epoch": 6.831646354059079, "grad_norm": 13.7313814163208, "learning_rate": 1.760039662865642e-05, "loss": 1.3246, "step": 41340 }, { "epoch": 6.833298905184879, "grad_norm": 6.67985725402832, "learning_rate": 1.75912154097578e-05, "loss": 1.5389, "step": 41350 }, { "epoch": 6.834951456310679, "grad_norm": 35.11752700805664, "learning_rate": 1.7582034190859178e-05, "loss": 1.3679, "step": 41360 }, { "epoch": 6.83660400743648, "grad_norm": 16.82154083251953, "learning_rate": 1.7572852971960557e-05, "loss": 1.5087, "step": 41370 }, { "epoch": 6.83825655856228, "grad_norm": 11.635119438171387, "learning_rate": 1.7563671753061936e-05, "loss": 1.5188, "step": 41380 }, { "epoch": 6.839909109688081, "grad_norm": 9.211424827575684, "learning_rate": 1.755449053416332e-05, "loss": 1.3765, "step": 41390 }, { "epoch": 6.841561660813881, "grad_norm": 7.994271755218506, "learning_rate": 1.7545309315264695e-05, "loss": 1.4092, "step": 41400 }, { "epoch": 6.8432142119396815, "grad_norm": 9.016519546508789, "learning_rate": 1.7536128096366074e-05, "loss": 1.5425, "step": 41410 }, { "epoch": 6.8448667630654825, "grad_norm": 15.688495635986328, "learning_rate": 1.7526946877467453e-05, "loss": 1.4526, "step": 41420 }, { "epoch": 6.846519314191283, "grad_norm": 11.424908638000488, "learning_rate": 1.7517765658568833e-05, "loss": 1.4075, "step": 41430 }, { "epoch": 6.8481718653170836, "grad_norm": 11.070535659790039, "learning_rate": 1.7508584439670212e-05, "loss": 1.3919, "step": 41440 }, { "epoch": 6.849824416442884, "grad_norm": 7.652846336364746, "learning_rate": 1.749940322077159e-05, "loss": 1.328, "step": 41450 }, { "epoch": 6.851476967568685, "grad_norm": 9.631972312927246, "learning_rate": 1.749022200187297e-05, "loss": 1.4173, "step": 41460 }, { "epoch": 6.853129518694485, "grad_norm": 41.709320068359375, "learning_rate": 1.7481040782974346e-05, "loss": 1.5274, "step": 41470 }, { "epoch": 6.854782069820285, "grad_norm": 11.905040740966797, "learning_rate": 1.7471859564075725e-05, "loss": 1.4682, "step": 41480 }, { "epoch": 6.856434620946086, "grad_norm": 20.44754981994629, "learning_rate": 1.7462678345177105e-05, "loss": 1.553, "step": 41490 }, { "epoch": 6.858087172071886, "grad_norm": 8.729151725769043, "learning_rate": 1.7453497126278487e-05, "loss": 1.4357, "step": 41500 }, { "epoch": 6.859739723197686, "grad_norm": 13.73019027709961, "learning_rate": 1.7444315907379867e-05, "loss": 1.4612, "step": 41510 }, { "epoch": 6.861392274323487, "grad_norm": 9.946063995361328, "learning_rate": 1.7435134688481246e-05, "loss": 1.4276, "step": 41520 }, { "epoch": 6.863044825449287, "grad_norm": 17.463390350341797, "learning_rate": 1.742595346958262e-05, "loss": 1.2973, "step": 41530 }, { "epoch": 6.864697376575088, "grad_norm": 25.70206642150879, "learning_rate": 1.7416772250684e-05, "loss": 1.5196, "step": 41540 }, { "epoch": 6.866349927700888, "grad_norm": 9.276856422424316, "learning_rate": 1.740759103178538e-05, "loss": 1.5073, "step": 41550 }, { "epoch": 6.868002478826689, "grad_norm": 9.336898803710938, "learning_rate": 1.739840981288676e-05, "loss": 1.4767, "step": 41560 }, { "epoch": 6.869655029952489, "grad_norm": 10.633883476257324, "learning_rate": 1.738922859398814e-05, "loss": 1.532, "step": 41570 }, { "epoch": 6.871307581078289, "grad_norm": 9.510002136230469, "learning_rate": 1.7380047375089518e-05, "loss": 1.4982, "step": 41580 }, { "epoch": 6.87296013220409, "grad_norm": 9.511191368103027, "learning_rate": 1.7370866156190897e-05, "loss": 1.4293, "step": 41590 }, { "epoch": 6.87461268332989, "grad_norm": 12.487457275390625, "learning_rate": 1.7361684937292273e-05, "loss": 1.5126, "step": 41600 }, { "epoch": 6.8762652344556905, "grad_norm": 11.388182640075684, "learning_rate": 1.7352503718393652e-05, "loss": 1.387, "step": 41610 }, { "epoch": 6.8779177855814915, "grad_norm": 12.775744438171387, "learning_rate": 1.7343322499495035e-05, "loss": 1.5719, "step": 41620 }, { "epoch": 6.879570336707292, "grad_norm": 9.344951629638672, "learning_rate": 1.7334141280596414e-05, "loss": 1.4063, "step": 41630 }, { "epoch": 6.8812228878330925, "grad_norm": 13.491334915161133, "learning_rate": 1.7324960061697793e-05, "loss": 1.2582, "step": 41640 }, { "epoch": 6.882875438958893, "grad_norm": 78.28813934326172, "learning_rate": 1.7315778842799173e-05, "loss": 1.5862, "step": 41650 }, { "epoch": 6.884527990084694, "grad_norm": 9.13769245147705, "learning_rate": 1.730659762390055e-05, "loss": 1.4646, "step": 41660 }, { "epoch": 6.886180541210494, "grad_norm": 7.546828269958496, "learning_rate": 1.7297416405001928e-05, "loss": 1.4563, "step": 41670 }, { "epoch": 6.887833092336294, "grad_norm": 8.716988563537598, "learning_rate": 1.7288235186103307e-05, "loss": 1.4902, "step": 41680 }, { "epoch": 6.889485643462095, "grad_norm": 11.223580360412598, "learning_rate": 1.7279053967204686e-05, "loss": 1.3886, "step": 41690 }, { "epoch": 6.891138194587895, "grad_norm": 9.644621849060059, "learning_rate": 1.7269872748306065e-05, "loss": 1.398, "step": 41700 }, { "epoch": 6.892790745713696, "grad_norm": 8.582539558410645, "learning_rate": 1.7260691529407445e-05, "loss": 1.3557, "step": 41710 }, { "epoch": 6.894443296839496, "grad_norm": 31.303890228271484, "learning_rate": 1.7251510310508824e-05, "loss": 1.4995, "step": 41720 }, { "epoch": 6.896095847965296, "grad_norm": 9.271944046020508, "learning_rate": 1.7242329091610203e-05, "loss": 1.2297, "step": 41730 }, { "epoch": 6.897748399091097, "grad_norm": 12.003678321838379, "learning_rate": 1.7233147872711582e-05, "loss": 1.3033, "step": 41740 }, { "epoch": 6.899400950216897, "grad_norm": 12.872206687927246, "learning_rate": 1.7223966653812962e-05, "loss": 1.5084, "step": 41750 }, { "epoch": 6.901053501342698, "grad_norm": 12.925690650939941, "learning_rate": 1.721478543491434e-05, "loss": 1.476, "step": 41760 }, { "epoch": 6.902706052468498, "grad_norm": 16.326433181762695, "learning_rate": 1.720560421601572e-05, "loss": 1.4717, "step": 41770 }, { "epoch": 6.904358603594298, "grad_norm": 10.100279808044434, "learning_rate": 1.71964229971171e-05, "loss": 1.5529, "step": 41780 }, { "epoch": 6.906011154720099, "grad_norm": 13.078716278076172, "learning_rate": 1.7187241778218475e-05, "loss": 1.4674, "step": 41790 }, { "epoch": 6.907663705845899, "grad_norm": 11.755579948425293, "learning_rate": 1.7178060559319855e-05, "loss": 1.3926, "step": 41800 }, { "epoch": 6.9093162569717, "grad_norm": 11.553741455078125, "learning_rate": 1.7168879340421234e-05, "loss": 1.352, "step": 41810 }, { "epoch": 6.9109688080975005, "grad_norm": 12.865326881408691, "learning_rate": 1.7159698121522613e-05, "loss": 1.3908, "step": 41820 }, { "epoch": 6.9126213592233015, "grad_norm": 9.914249420166016, "learning_rate": 1.7150516902623992e-05, "loss": 1.3497, "step": 41830 }, { "epoch": 6.9142739103491015, "grad_norm": 6.659285068511963, "learning_rate": 1.7141335683725375e-05, "loss": 1.3643, "step": 41840 }, { "epoch": 6.915926461474902, "grad_norm": 9.710965156555176, "learning_rate": 1.713215446482675e-05, "loss": 1.4596, "step": 41850 }, { "epoch": 6.917579012600703, "grad_norm": 9.427946090698242, "learning_rate": 1.712297324592813e-05, "loss": 1.4356, "step": 41860 }, { "epoch": 6.919231563726503, "grad_norm": 10.323373794555664, "learning_rate": 1.711379202702951e-05, "loss": 1.4465, "step": 41870 }, { "epoch": 6.920884114852303, "grad_norm": 19.84877586364746, "learning_rate": 1.710461080813089e-05, "loss": 1.4025, "step": 41880 }, { "epoch": 6.922536665978104, "grad_norm": 9.801685333251953, "learning_rate": 1.7095429589232268e-05, "loss": 1.3275, "step": 41890 }, { "epoch": 6.924189217103904, "grad_norm": 43.13996887207031, "learning_rate": 1.7086248370333647e-05, "loss": 1.4675, "step": 41900 }, { "epoch": 6.925841768229705, "grad_norm": 37.639007568359375, "learning_rate": 1.7077067151435026e-05, "loss": 1.2324, "step": 41910 }, { "epoch": 6.927494319355505, "grad_norm": 12.643664360046387, "learning_rate": 1.7067885932536402e-05, "loss": 1.3959, "step": 41920 }, { "epoch": 6.929146870481306, "grad_norm": 30.085914611816406, "learning_rate": 1.705870471363778e-05, "loss": 1.3784, "step": 41930 }, { "epoch": 6.930799421607106, "grad_norm": 17.809532165527344, "learning_rate": 1.704952349473916e-05, "loss": 1.4568, "step": 41940 }, { "epoch": 6.932451972732906, "grad_norm": 9.029561042785645, "learning_rate": 1.704034227584054e-05, "loss": 1.3388, "step": 41950 }, { "epoch": 6.934104523858707, "grad_norm": 12.235958099365234, "learning_rate": 1.7031161056941923e-05, "loss": 1.4447, "step": 41960 }, { "epoch": 6.935757074984507, "grad_norm": 12.634254455566406, "learning_rate": 1.7021979838043302e-05, "loss": 1.3982, "step": 41970 }, { "epoch": 6.937409626110307, "grad_norm": 15.774003982543945, "learning_rate": 1.7012798619144678e-05, "loss": 1.3463, "step": 41980 }, { "epoch": 6.939062177236108, "grad_norm": 8.786571502685547, "learning_rate": 1.7003617400246057e-05, "loss": 1.4888, "step": 41990 }, { "epoch": 6.940714728361908, "grad_norm": 20.433345794677734, "learning_rate": 1.6994436181347436e-05, "loss": 1.3355, "step": 42000 }, { "epoch": 6.942367279487709, "grad_norm": 7.788454055786133, "learning_rate": 1.6985254962448815e-05, "loss": 1.3896, "step": 42010 }, { "epoch": 6.9440198306135095, "grad_norm": 7.458805084228516, "learning_rate": 1.6976073743550195e-05, "loss": 1.393, "step": 42020 }, { "epoch": 6.9456723817393105, "grad_norm": 14.492402076721191, "learning_rate": 1.6966892524651574e-05, "loss": 1.5976, "step": 42030 }, { "epoch": 6.9473249328651105, "grad_norm": 8.02368450164795, "learning_rate": 1.6957711305752953e-05, "loss": 1.3769, "step": 42040 }, { "epoch": 6.948977483990911, "grad_norm": 8.434112548828125, "learning_rate": 1.694853008685433e-05, "loss": 1.4697, "step": 42050 }, { "epoch": 6.950630035116712, "grad_norm": 10.042356491088867, "learning_rate": 1.6939348867955708e-05, "loss": 1.4125, "step": 42060 }, { "epoch": 6.952282586242512, "grad_norm": 77.89820098876953, "learning_rate": 1.693016764905709e-05, "loss": 1.4607, "step": 42070 }, { "epoch": 6.953935137368313, "grad_norm": 48.598690032958984, "learning_rate": 1.692098643015847e-05, "loss": 1.3837, "step": 42080 }, { "epoch": 6.955587688494113, "grad_norm": 47.6584587097168, "learning_rate": 1.691180521125985e-05, "loss": 1.6172, "step": 42090 }, { "epoch": 6.957240239619913, "grad_norm": 7.663386344909668, "learning_rate": 1.690262399236123e-05, "loss": 1.383, "step": 42100 }, { "epoch": 6.958892790745714, "grad_norm": 7.526758670806885, "learning_rate": 1.6893442773462604e-05, "loss": 1.4312, "step": 42110 }, { "epoch": 6.960545341871514, "grad_norm": 16.803466796875, "learning_rate": 1.6884261554563984e-05, "loss": 1.4376, "step": 42120 }, { "epoch": 6.962197892997315, "grad_norm": 13.658576011657715, "learning_rate": 1.6875080335665363e-05, "loss": 1.5319, "step": 42130 }, { "epoch": 6.963850444123115, "grad_norm": 8.935698509216309, "learning_rate": 1.6865899116766742e-05, "loss": 1.439, "step": 42140 }, { "epoch": 6.965502995248915, "grad_norm": 11.672993659973145, "learning_rate": 1.685671789786812e-05, "loss": 1.4362, "step": 42150 }, { "epoch": 6.967155546374716, "grad_norm": 13.540828704833984, "learning_rate": 1.68475366789695e-05, "loss": 1.4189, "step": 42160 }, { "epoch": 6.968808097500516, "grad_norm": 11.229409217834473, "learning_rate": 1.683835546007088e-05, "loss": 1.4317, "step": 42170 }, { "epoch": 6.970460648626317, "grad_norm": 12.30543327331543, "learning_rate": 1.682917424117226e-05, "loss": 1.4359, "step": 42180 }, { "epoch": 6.972113199752117, "grad_norm": 11.764647483825684, "learning_rate": 1.681999302227364e-05, "loss": 1.5617, "step": 42190 }, { "epoch": 6.973765750877918, "grad_norm": 10.844428062438965, "learning_rate": 1.6810811803375018e-05, "loss": 1.4419, "step": 42200 }, { "epoch": 6.975418302003718, "grad_norm": 10.631561279296875, "learning_rate": 1.6801630584476397e-05, "loss": 1.3841, "step": 42210 }, { "epoch": 6.9770708531295185, "grad_norm": 13.280062675476074, "learning_rate": 1.6792449365577776e-05, "loss": 1.35, "step": 42220 }, { "epoch": 6.9787234042553195, "grad_norm": 15.788164138793945, "learning_rate": 1.6783268146679155e-05, "loss": 1.334, "step": 42230 }, { "epoch": 6.9803759553811195, "grad_norm": 12.476130485534668, "learning_rate": 1.677408692778053e-05, "loss": 1.5234, "step": 42240 }, { "epoch": 6.98202850650692, "grad_norm": 24.663625717163086, "learning_rate": 1.676490570888191e-05, "loss": 1.3422, "step": 42250 }, { "epoch": 6.983681057632721, "grad_norm": 16.133569717407227, "learning_rate": 1.675572448998329e-05, "loss": 1.4352, "step": 42260 }, { "epoch": 6.985333608758521, "grad_norm": 11.581048965454102, "learning_rate": 1.674654327108467e-05, "loss": 1.4443, "step": 42270 }, { "epoch": 6.986986159884322, "grad_norm": 10.542519569396973, "learning_rate": 1.6737362052186048e-05, "loss": 1.3883, "step": 42280 }, { "epoch": 6.988638711010122, "grad_norm": 7.865337371826172, "learning_rate": 1.6728180833287428e-05, "loss": 1.493, "step": 42290 }, { "epoch": 6.990291262135923, "grad_norm": 12.757125854492188, "learning_rate": 1.6718999614388807e-05, "loss": 1.4717, "step": 42300 }, { "epoch": 6.991943813261723, "grad_norm": 13.490144729614258, "learning_rate": 1.6709818395490186e-05, "loss": 1.4115, "step": 42310 }, { "epoch": 6.993596364387523, "grad_norm": 14.82015609741211, "learning_rate": 1.6700637176591565e-05, "loss": 1.4881, "step": 42320 }, { "epoch": 6.995248915513324, "grad_norm": 23.44951057434082, "learning_rate": 1.6691455957692945e-05, "loss": 1.4348, "step": 42330 }, { "epoch": 6.996901466639124, "grad_norm": 8.522165298461914, "learning_rate": 1.6682274738794324e-05, "loss": 1.4091, "step": 42340 }, { "epoch": 6.998554017764924, "grad_norm": 26.870582580566406, "learning_rate": 1.6673093519895703e-05, "loss": 1.4368, "step": 42350 }, { "epoch": 6.999876058665565, "eval_accuracy": 0.32497517378351537, "eval_loss": 2.2579329013824463, "eval_runtime": 821.7981, "eval_samples_per_second": 34.31, "eval_steps_per_second": 8.578, "step": 42358 }, { "epoch": 7.000206568890725, "grad_norm": 7.999766826629639, "learning_rate": 1.6663912300997082e-05, "loss": 1.289, "step": 42360 }, { "epoch": 7.001859120016525, "grad_norm": 8.853545188903809, "learning_rate": 1.6654731082098458e-05, "loss": 1.2426, "step": 42370 }, { "epoch": 7.003511671142326, "grad_norm": 9.978639602661133, "learning_rate": 1.6645549863199837e-05, "loss": 1.4188, "step": 42380 }, { "epoch": 7.005164222268126, "grad_norm": 12.857678413391113, "learning_rate": 1.6636368644301217e-05, "loss": 1.3589, "step": 42390 }, { "epoch": 7.006816773393927, "grad_norm": 12.013607025146484, "learning_rate": 1.6627187425402596e-05, "loss": 1.4138, "step": 42400 }, { "epoch": 7.008469324519727, "grad_norm": 6.538505554199219, "learning_rate": 1.6618006206503975e-05, "loss": 1.4131, "step": 42410 }, { "epoch": 7.0101218756455275, "grad_norm": 7.0642595291137695, "learning_rate": 1.6608824987605358e-05, "loss": 1.3476, "step": 42420 }, { "epoch": 7.0117744267713285, "grad_norm": 17.850061416625977, "learning_rate": 1.6599643768706734e-05, "loss": 1.4026, "step": 42430 }, { "epoch": 7.0134269778971285, "grad_norm": 12.280431747436523, "learning_rate": 1.6590462549808113e-05, "loss": 1.3851, "step": 42440 }, { "epoch": 7.0150795290229295, "grad_norm": 9.028722763061523, "learning_rate": 1.6581281330909492e-05, "loss": 1.3367, "step": 42450 }, { "epoch": 7.01673208014873, "grad_norm": 15.112545013427734, "learning_rate": 1.657210011201087e-05, "loss": 1.4602, "step": 42460 }, { "epoch": 7.01838463127453, "grad_norm": 13.523941993713379, "learning_rate": 1.656291889311225e-05, "loss": 1.4501, "step": 42470 }, { "epoch": 7.020037182400331, "grad_norm": 8.846076011657715, "learning_rate": 1.655373767421363e-05, "loss": 1.4001, "step": 42480 }, { "epoch": 7.021689733526131, "grad_norm": 7.614100456237793, "learning_rate": 1.654455645531501e-05, "loss": 1.3485, "step": 42490 }, { "epoch": 7.023342284651932, "grad_norm": 11.066787719726562, "learning_rate": 1.6535375236416385e-05, "loss": 1.4282, "step": 42500 }, { "epoch": 7.024994835777732, "grad_norm": 11.001472473144531, "learning_rate": 1.6526194017517764e-05, "loss": 1.3722, "step": 42510 }, { "epoch": 7.026647386903532, "grad_norm": 12.738889694213867, "learning_rate": 1.6517012798619143e-05, "loss": 1.4098, "step": 42520 }, { "epoch": 7.028299938029333, "grad_norm": 11.663647651672363, "learning_rate": 1.6507831579720526e-05, "loss": 1.4311, "step": 42530 }, { "epoch": 7.029952489155133, "grad_norm": 22.87346076965332, "learning_rate": 1.6498650360821905e-05, "loss": 1.316, "step": 42540 }, { "epoch": 7.031605040280934, "grad_norm": 8.73465347290039, "learning_rate": 1.6489469141923285e-05, "loss": 1.4358, "step": 42550 }, { "epoch": 7.033257591406734, "grad_norm": 10.015544891357422, "learning_rate": 1.648028792302466e-05, "loss": 1.3549, "step": 42560 }, { "epoch": 7.034910142532534, "grad_norm": 10.383803367614746, "learning_rate": 1.647110670412604e-05, "loss": 1.2933, "step": 42570 }, { "epoch": 7.036562693658335, "grad_norm": 6.737658977508545, "learning_rate": 1.646192548522742e-05, "loss": 1.3442, "step": 42580 }, { "epoch": 7.038215244784135, "grad_norm": 20.091114044189453, "learning_rate": 1.6452744266328798e-05, "loss": 1.4975, "step": 42590 }, { "epoch": 7.039867795909936, "grad_norm": 27.708118438720703, "learning_rate": 1.6443563047430177e-05, "loss": 1.5107, "step": 42600 }, { "epoch": 7.041520347035736, "grad_norm": 15.870881080627441, "learning_rate": 1.6434381828531557e-05, "loss": 1.4584, "step": 42610 }, { "epoch": 7.0431728981615365, "grad_norm": 20.80101203918457, "learning_rate": 1.6425200609632936e-05, "loss": 1.4254, "step": 42620 }, { "epoch": 7.0448254492873374, "grad_norm": 17.202617645263672, "learning_rate": 1.6416019390734315e-05, "loss": 1.3899, "step": 42630 }, { "epoch": 7.0464780004131375, "grad_norm": 10.804183006286621, "learning_rate": 1.6406838171835694e-05, "loss": 1.3267, "step": 42640 }, { "epoch": 7.0481305515389385, "grad_norm": 26.866113662719727, "learning_rate": 1.6397656952937074e-05, "loss": 1.4871, "step": 42650 }, { "epoch": 7.049783102664739, "grad_norm": 13.534538269042969, "learning_rate": 1.6388475734038453e-05, "loss": 1.5625, "step": 42660 }, { "epoch": 7.051435653790539, "grad_norm": 9.568449020385742, "learning_rate": 1.6379294515139832e-05, "loss": 1.3345, "step": 42670 }, { "epoch": 7.05308820491634, "grad_norm": 6.132664680480957, "learning_rate": 1.637011329624121e-05, "loss": 1.4646, "step": 42680 }, { "epoch": 7.05474075604214, "grad_norm": 13.083625793457031, "learning_rate": 1.6360932077342587e-05, "loss": 1.3717, "step": 42690 }, { "epoch": 7.056393307167941, "grad_norm": 22.688730239868164, "learning_rate": 1.6351750858443967e-05, "loss": 1.321, "step": 42700 }, { "epoch": 7.058045858293741, "grad_norm": 13.621474266052246, "learning_rate": 1.6342569639545346e-05, "loss": 1.4109, "step": 42710 }, { "epoch": 7.059698409419542, "grad_norm": 16.44967269897461, "learning_rate": 1.6333388420646725e-05, "loss": 1.3616, "step": 42720 }, { "epoch": 7.061350960545342, "grad_norm": 16.369356155395508, "learning_rate": 1.6324207201748104e-05, "loss": 1.4134, "step": 42730 }, { "epoch": 7.063003511671142, "grad_norm": 11.338850975036621, "learning_rate": 1.6315025982849484e-05, "loss": 1.3899, "step": 42740 }, { "epoch": 7.064656062796943, "grad_norm": 13.041457176208496, "learning_rate": 1.6305844763950863e-05, "loss": 1.3229, "step": 42750 }, { "epoch": 7.066308613922743, "grad_norm": 18.309200286865234, "learning_rate": 1.6296663545052242e-05, "loss": 1.4822, "step": 42760 }, { "epoch": 7.067961165048544, "grad_norm": 14.844568252563477, "learning_rate": 1.628748232615362e-05, "loss": 1.5301, "step": 42770 }, { "epoch": 7.069613716174344, "grad_norm": 8.260869979858398, "learning_rate": 1.6278301107255e-05, "loss": 1.3789, "step": 42780 }, { "epoch": 7.071266267300144, "grad_norm": 20.73078727722168, "learning_rate": 1.626911988835638e-05, "loss": 1.3875, "step": 42790 }, { "epoch": 7.072918818425945, "grad_norm": 7.812931537628174, "learning_rate": 1.625993866945776e-05, "loss": 1.3335, "step": 42800 }, { "epoch": 7.074571369551745, "grad_norm": 10.699761390686035, "learning_rate": 1.6250757450559138e-05, "loss": 1.2201, "step": 42810 }, { "epoch": 7.076223920677546, "grad_norm": 13.993300437927246, "learning_rate": 1.6241576231660514e-05, "loss": 1.3617, "step": 42820 }, { "epoch": 7.0778764718033464, "grad_norm": 7.880179405212402, "learning_rate": 1.6232395012761893e-05, "loss": 1.3303, "step": 42830 }, { "epoch": 7.0795290229291465, "grad_norm": 14.917696952819824, "learning_rate": 1.6223213793863273e-05, "loss": 1.383, "step": 42840 }, { "epoch": 7.0811815740549475, "grad_norm": 62.78154373168945, "learning_rate": 1.6214032574964652e-05, "loss": 1.5036, "step": 42850 }, { "epoch": 7.082834125180748, "grad_norm": 50.27464294433594, "learning_rate": 1.620485135606603e-05, "loss": 1.3522, "step": 42860 }, { "epoch": 7.084486676306549, "grad_norm": 19.16926383972168, "learning_rate": 1.6195670137167414e-05, "loss": 1.4757, "step": 42870 }, { "epoch": 7.086139227432349, "grad_norm": 13.433306694030762, "learning_rate": 1.618648891826879e-05, "loss": 1.3875, "step": 42880 }, { "epoch": 7.087791778558149, "grad_norm": 13.95352554321289, "learning_rate": 1.617730769937017e-05, "loss": 1.4077, "step": 42890 }, { "epoch": 7.08944432968395, "grad_norm": 11.004186630249023, "learning_rate": 1.6168126480471548e-05, "loss": 1.2435, "step": 42900 }, { "epoch": 7.09109688080975, "grad_norm": 9.82064151763916, "learning_rate": 1.6158945261572927e-05, "loss": 1.3743, "step": 42910 }, { "epoch": 7.092749431935551, "grad_norm": 11.183409690856934, "learning_rate": 1.6149764042674307e-05, "loss": 1.2179, "step": 42920 }, { "epoch": 7.094401983061351, "grad_norm": 12.038310050964355, "learning_rate": 1.6140582823775686e-05, "loss": 1.4433, "step": 42930 }, { "epoch": 7.096054534187151, "grad_norm": 14.014615058898926, "learning_rate": 1.6131401604877065e-05, "loss": 1.4148, "step": 42940 }, { "epoch": 7.097707085312952, "grad_norm": 11.169940948486328, "learning_rate": 1.6122220385978444e-05, "loss": 1.4004, "step": 42950 }, { "epoch": 7.099359636438752, "grad_norm": 12.410701751708984, "learning_rate": 1.611303916707982e-05, "loss": 1.4521, "step": 42960 }, { "epoch": 7.101012187564553, "grad_norm": 13.273151397705078, "learning_rate": 1.61038579481812e-05, "loss": 1.5151, "step": 42970 }, { "epoch": 7.102664738690353, "grad_norm": 8.128931999206543, "learning_rate": 1.609467672928258e-05, "loss": 1.4828, "step": 42980 }, { "epoch": 7.104317289816153, "grad_norm": 26.178449630737305, "learning_rate": 1.608549551038396e-05, "loss": 1.3804, "step": 42990 }, { "epoch": 7.105969840941954, "grad_norm": 9.145550727844238, "learning_rate": 1.607631429148534e-05, "loss": 1.3358, "step": 43000 }, { "epoch": 7.107622392067754, "grad_norm": 10.598913192749023, "learning_rate": 1.6067133072586716e-05, "loss": 1.4074, "step": 43010 }, { "epoch": 7.109274943193555, "grad_norm": 13.98646068572998, "learning_rate": 1.6057951853688096e-05, "loss": 1.4561, "step": 43020 }, { "epoch": 7.1109274943193554, "grad_norm": 15.383737564086914, "learning_rate": 1.6048770634789475e-05, "loss": 1.3876, "step": 43030 }, { "epoch": 7.1125800454451555, "grad_norm": 14.499407768249512, "learning_rate": 1.6039589415890854e-05, "loss": 1.3743, "step": 43040 }, { "epoch": 7.1142325965709565, "grad_norm": 21.623125076293945, "learning_rate": 1.6030408196992233e-05, "loss": 1.3869, "step": 43050 }, { "epoch": 7.115885147696757, "grad_norm": 19.47028350830078, "learning_rate": 1.6021226978093613e-05, "loss": 1.4454, "step": 43060 }, { "epoch": 7.117537698822558, "grad_norm": 12.078246116638184, "learning_rate": 1.6012045759194992e-05, "loss": 1.396, "step": 43070 }, { "epoch": 7.119190249948358, "grad_norm": 11.143622398376465, "learning_rate": 1.600286454029637e-05, "loss": 1.5037, "step": 43080 }, { "epoch": 7.120842801074158, "grad_norm": 35.760719299316406, "learning_rate": 1.5993683321397747e-05, "loss": 1.4383, "step": 43090 }, { "epoch": 7.122495352199959, "grad_norm": 23.67930030822754, "learning_rate": 1.598450210249913e-05, "loss": 1.415, "step": 43100 }, { "epoch": 7.124147903325759, "grad_norm": 8.731281280517578, "learning_rate": 1.597532088360051e-05, "loss": 1.4276, "step": 43110 }, { "epoch": 7.12580045445156, "grad_norm": 16.28318214416504, "learning_rate": 1.5966139664701888e-05, "loss": 1.4406, "step": 43120 }, { "epoch": 7.12745300557736, "grad_norm": 15.253238677978516, "learning_rate": 1.5956958445803267e-05, "loss": 1.4224, "step": 43130 }, { "epoch": 7.129105556703161, "grad_norm": 9.880094528198242, "learning_rate": 1.5947777226904643e-05, "loss": 1.289, "step": 43140 }, { "epoch": 7.130758107828961, "grad_norm": 45.78981399536133, "learning_rate": 1.5938596008006022e-05, "loss": 1.4237, "step": 43150 }, { "epoch": 7.132410658954761, "grad_norm": 15.530732154846191, "learning_rate": 1.5929414789107402e-05, "loss": 1.3385, "step": 43160 }, { "epoch": 7.134063210080562, "grad_norm": 7.943971157073975, "learning_rate": 1.592023357020878e-05, "loss": 1.3658, "step": 43170 }, { "epoch": 7.135715761206362, "grad_norm": 8.042201042175293, "learning_rate": 1.591105235131016e-05, "loss": 1.3703, "step": 43180 }, { "epoch": 7.137368312332163, "grad_norm": 14.290609359741211, "learning_rate": 1.590187113241154e-05, "loss": 1.433, "step": 43190 }, { "epoch": 7.139020863457963, "grad_norm": 12.84965991973877, "learning_rate": 1.589268991351292e-05, "loss": 1.3675, "step": 43200 }, { "epoch": 7.140673414583763, "grad_norm": 9.75621509552002, "learning_rate": 1.5883508694614298e-05, "loss": 1.3423, "step": 43210 }, { "epoch": 7.142325965709564, "grad_norm": 16.249305725097656, "learning_rate": 1.5874327475715677e-05, "loss": 1.5104, "step": 43220 }, { "epoch": 7.1439785168353644, "grad_norm": 9.561155319213867, "learning_rate": 1.5865146256817056e-05, "loss": 1.4316, "step": 43230 }, { "epoch": 7.145631067961165, "grad_norm": 9.332865715026855, "learning_rate": 1.5855965037918436e-05, "loss": 1.3956, "step": 43240 }, { "epoch": 7.1472836190869655, "grad_norm": 14.026449203491211, "learning_rate": 1.5846783819019815e-05, "loss": 1.3913, "step": 43250 }, { "epoch": 7.148936170212766, "grad_norm": 16.361709594726562, "learning_rate": 1.5837602600121194e-05, "loss": 1.4388, "step": 43260 }, { "epoch": 7.150588721338567, "grad_norm": 34.313411712646484, "learning_rate": 1.5828421381222573e-05, "loss": 1.4463, "step": 43270 }, { "epoch": 7.152241272464367, "grad_norm": 62.70059585571289, "learning_rate": 1.581924016232395e-05, "loss": 1.5965, "step": 43280 }, { "epoch": 7.153893823590168, "grad_norm": 12.427000999450684, "learning_rate": 1.581005894342533e-05, "loss": 1.4107, "step": 43290 }, { "epoch": 7.155546374715968, "grad_norm": 38.85627365112305, "learning_rate": 1.5800877724526708e-05, "loss": 1.4769, "step": 43300 }, { "epoch": 7.157198925841768, "grad_norm": 13.122133255004883, "learning_rate": 1.5791696505628087e-05, "loss": 1.3648, "step": 43310 }, { "epoch": 7.158851476967569, "grad_norm": 44.62786102294922, "learning_rate": 1.5782515286729466e-05, "loss": 1.5429, "step": 43320 }, { "epoch": 7.160504028093369, "grad_norm": 12.482089042663574, "learning_rate": 1.5773334067830846e-05, "loss": 1.3703, "step": 43330 }, { "epoch": 7.16215657921917, "grad_norm": 10.335782051086426, "learning_rate": 1.5764152848932225e-05, "loss": 1.3795, "step": 43340 }, { "epoch": 7.16380913034497, "grad_norm": 11.581258773803711, "learning_rate": 1.5754971630033604e-05, "loss": 1.3594, "step": 43350 }, { "epoch": 7.16546168147077, "grad_norm": 9.168930053710938, "learning_rate": 1.5745790411134983e-05, "loss": 1.3988, "step": 43360 }, { "epoch": 7.167114232596571, "grad_norm": 35.34038543701172, "learning_rate": 1.5736609192236363e-05, "loss": 1.367, "step": 43370 }, { "epoch": 7.168766783722371, "grad_norm": 13.737029075622559, "learning_rate": 1.5727427973337742e-05, "loss": 1.4333, "step": 43380 }, { "epoch": 7.170419334848172, "grad_norm": 11.130227088928223, "learning_rate": 1.571824675443912e-05, "loss": 1.3601, "step": 43390 }, { "epoch": 7.172071885973972, "grad_norm": 14.647340774536133, "learning_rate": 1.57090655355405e-05, "loss": 1.4556, "step": 43400 }, { "epoch": 7.173724437099772, "grad_norm": 11.120412826538086, "learning_rate": 1.5699884316641876e-05, "loss": 1.4372, "step": 43410 }, { "epoch": 7.175376988225573, "grad_norm": 51.608375549316406, "learning_rate": 1.5690703097743255e-05, "loss": 1.4654, "step": 43420 }, { "epoch": 7.177029539351373, "grad_norm": 10.668750762939453, "learning_rate": 1.5681521878844635e-05, "loss": 1.2519, "step": 43430 }, { "epoch": 7.178682090477174, "grad_norm": 12.485282897949219, "learning_rate": 1.5672340659946017e-05, "loss": 1.322, "step": 43440 }, { "epoch": 7.1803346416029745, "grad_norm": 12.183808326721191, "learning_rate": 1.5663159441047397e-05, "loss": 1.4264, "step": 43450 }, { "epoch": 7.181987192728775, "grad_norm": 13.21921443939209, "learning_rate": 1.5653978222148772e-05, "loss": 1.3703, "step": 43460 }, { "epoch": 7.183639743854576, "grad_norm": 10.9345064163208, "learning_rate": 1.564479700325015e-05, "loss": 1.2281, "step": 43470 }, { "epoch": 7.185292294980376, "grad_norm": 14.405402183532715, "learning_rate": 1.563561578435153e-05, "loss": 1.4111, "step": 43480 }, { "epoch": 7.186944846106177, "grad_norm": 17.535526275634766, "learning_rate": 1.562643456545291e-05, "loss": 1.3301, "step": 43490 }, { "epoch": 7.188597397231977, "grad_norm": 15.74835205078125, "learning_rate": 1.561725334655429e-05, "loss": 1.5023, "step": 43500 }, { "epoch": 7.190249948357778, "grad_norm": 29.075929641723633, "learning_rate": 1.560807212765567e-05, "loss": 1.443, "step": 43510 }, { "epoch": 7.191902499483578, "grad_norm": 8.865483283996582, "learning_rate": 1.5598890908757048e-05, "loss": 1.3097, "step": 43520 }, { "epoch": 7.193555050609378, "grad_norm": 14.490361213684082, "learning_rate": 1.5589709689858427e-05, "loss": 1.3695, "step": 43530 }, { "epoch": 7.195207601735179, "grad_norm": 21.584367752075195, "learning_rate": 1.5580528470959803e-05, "loss": 1.3773, "step": 43540 }, { "epoch": 7.196860152860979, "grad_norm": 10.539822578430176, "learning_rate": 1.5571347252061182e-05, "loss": 1.4, "step": 43550 }, { "epoch": 7.19851270398678, "grad_norm": 12.214553833007812, "learning_rate": 1.5562166033162565e-05, "loss": 1.3988, "step": 43560 }, { "epoch": 7.20016525511258, "grad_norm": 13.141803741455078, "learning_rate": 1.5552984814263944e-05, "loss": 1.3828, "step": 43570 }, { "epoch": 7.20181780623838, "grad_norm": 12.777125358581543, "learning_rate": 1.5543803595365323e-05, "loss": 1.3906, "step": 43580 }, { "epoch": 7.203470357364181, "grad_norm": 15.37719440460205, "learning_rate": 1.5534622376466703e-05, "loss": 1.4042, "step": 43590 }, { "epoch": 7.205122908489981, "grad_norm": 12.876169204711914, "learning_rate": 1.552544115756808e-05, "loss": 1.582, "step": 43600 }, { "epoch": 7.206775459615782, "grad_norm": 21.468862533569336, "learning_rate": 1.5516259938669458e-05, "loss": 1.4693, "step": 43610 }, { "epoch": 7.208428010741582, "grad_norm": 10.771493911743164, "learning_rate": 1.5507078719770837e-05, "loss": 1.4106, "step": 43620 }, { "epoch": 7.210080561867382, "grad_norm": 16.791486740112305, "learning_rate": 1.5497897500872216e-05, "loss": 1.4429, "step": 43630 }, { "epoch": 7.211733112993183, "grad_norm": 16.35462760925293, "learning_rate": 1.5488716281973595e-05, "loss": 1.3854, "step": 43640 }, { "epoch": 7.2133856641189835, "grad_norm": 12.617788314819336, "learning_rate": 1.5479535063074975e-05, "loss": 1.3077, "step": 43650 }, { "epoch": 7.2150382152447845, "grad_norm": 12.411581993103027, "learning_rate": 1.5470353844176354e-05, "loss": 1.3623, "step": 43660 }, { "epoch": 7.216690766370585, "grad_norm": 19.033109664916992, "learning_rate": 1.5461172625277733e-05, "loss": 1.4943, "step": 43670 }, { "epoch": 7.218343317496385, "grad_norm": 8.353110313415527, "learning_rate": 1.5451991406379112e-05, "loss": 1.3783, "step": 43680 }, { "epoch": 7.219995868622186, "grad_norm": 8.418465614318848, "learning_rate": 1.5442810187480492e-05, "loss": 1.3545, "step": 43690 }, { "epoch": 7.221648419747986, "grad_norm": 8.319928169250488, "learning_rate": 1.543362896858187e-05, "loss": 1.3794, "step": 43700 }, { "epoch": 7.223300970873787, "grad_norm": 13.188324928283691, "learning_rate": 1.542444774968325e-05, "loss": 1.2745, "step": 43710 }, { "epoch": 7.224953521999587, "grad_norm": 10.732872009277344, "learning_rate": 1.541526653078463e-05, "loss": 1.3477, "step": 43720 }, { "epoch": 7.226606073125387, "grad_norm": 47.982398986816406, "learning_rate": 1.5406085311886005e-05, "loss": 1.4172, "step": 43730 }, { "epoch": 7.228258624251188, "grad_norm": 44.85279846191406, "learning_rate": 1.5396904092987385e-05, "loss": 1.4017, "step": 43740 }, { "epoch": 7.229911175376988, "grad_norm": 9.955687522888184, "learning_rate": 1.5387722874088764e-05, "loss": 1.349, "step": 43750 }, { "epoch": 7.231563726502789, "grad_norm": 8.512934684753418, "learning_rate": 1.5378541655190143e-05, "loss": 1.4837, "step": 43760 }, { "epoch": 7.233216277628589, "grad_norm": 17.035743713378906, "learning_rate": 1.5369360436291522e-05, "loss": 1.4043, "step": 43770 }, { "epoch": 7.234868828754389, "grad_norm": 45.515899658203125, "learning_rate": 1.53601792173929e-05, "loss": 1.2978, "step": 43780 }, { "epoch": 7.23652137988019, "grad_norm": 13.9974946975708, "learning_rate": 1.535099799849428e-05, "loss": 1.4702, "step": 43790 }, { "epoch": 7.23817393100599, "grad_norm": 16.096546173095703, "learning_rate": 1.534181677959566e-05, "loss": 1.4132, "step": 43800 }, { "epoch": 7.239826482131791, "grad_norm": 11.778483390808105, "learning_rate": 1.533263556069704e-05, "loss": 1.3275, "step": 43810 }, { "epoch": 7.241479033257591, "grad_norm": 7.991126537322998, "learning_rate": 1.532345434179842e-05, "loss": 1.2556, "step": 43820 }, { "epoch": 7.243131584383391, "grad_norm": 12.626638412475586, "learning_rate": 1.5314273122899798e-05, "loss": 1.38, "step": 43830 }, { "epoch": 7.244784135509192, "grad_norm": 7.913274765014648, "learning_rate": 1.5305091904001177e-05, "loss": 1.253, "step": 43840 }, { "epoch": 7.2464366866349925, "grad_norm": 11.45130729675293, "learning_rate": 1.5295910685102556e-05, "loss": 1.4744, "step": 43850 }, { "epoch": 7.2480892377607935, "grad_norm": 7.596304416656494, "learning_rate": 1.5286729466203932e-05, "loss": 1.3495, "step": 43860 }, { "epoch": 7.249741788886594, "grad_norm": 16.072736740112305, "learning_rate": 1.527754824730531e-05, "loss": 1.3415, "step": 43870 }, { "epoch": 7.251394340012395, "grad_norm": 11.912080764770508, "learning_rate": 1.526836702840669e-05, "loss": 1.4074, "step": 43880 }, { "epoch": 7.253046891138195, "grad_norm": 14.602355003356934, "learning_rate": 1.525918580950807e-05, "loss": 1.4189, "step": 43890 }, { "epoch": 7.254699442263995, "grad_norm": 9.636602401733398, "learning_rate": 1.525000459060945e-05, "loss": 1.37, "step": 43900 }, { "epoch": 7.256351993389796, "grad_norm": 22.76931381225586, "learning_rate": 1.524082337171083e-05, "loss": 1.3947, "step": 43910 }, { "epoch": 7.258004544515596, "grad_norm": 15.332125663757324, "learning_rate": 1.523164215281221e-05, "loss": 1.433, "step": 43920 }, { "epoch": 7.259657095641396, "grad_norm": 13.930269241333008, "learning_rate": 1.5222460933913587e-05, "loss": 1.4699, "step": 43930 }, { "epoch": 7.261309646767197, "grad_norm": 13.538143157958984, "learning_rate": 1.5213279715014966e-05, "loss": 1.398, "step": 43940 }, { "epoch": 7.262962197892997, "grad_norm": 12.98928451538086, "learning_rate": 1.5204098496116345e-05, "loss": 1.2564, "step": 43950 }, { "epoch": 7.264614749018798, "grad_norm": 17.9935359954834, "learning_rate": 1.5194917277217725e-05, "loss": 1.4769, "step": 43960 }, { "epoch": 7.266267300144598, "grad_norm": 13.365751266479492, "learning_rate": 1.5185736058319102e-05, "loss": 1.4451, "step": 43970 }, { "epoch": 7.267919851270399, "grad_norm": 16.52039337158203, "learning_rate": 1.5176554839420481e-05, "loss": 1.4016, "step": 43980 }, { "epoch": 7.269572402396199, "grad_norm": 15.098515510559082, "learning_rate": 1.516737362052186e-05, "loss": 1.369, "step": 43990 }, { "epoch": 7.271224953521999, "grad_norm": 8.429466247558594, "learning_rate": 1.5158192401623238e-05, "loss": 1.4621, "step": 44000 }, { "epoch": 7.2728775046478, "grad_norm": 13.037130355834961, "learning_rate": 1.5149011182724621e-05, "loss": 1.4036, "step": 44010 }, { "epoch": 7.2745300557736, "grad_norm": 11.745615005493164, "learning_rate": 1.5139829963826e-05, "loss": 1.3655, "step": 44020 }, { "epoch": 7.276182606899401, "grad_norm": 25.232398986816406, "learning_rate": 1.5130648744927378e-05, "loss": 1.424, "step": 44030 }, { "epoch": 7.277835158025201, "grad_norm": 10.532912254333496, "learning_rate": 1.5121467526028757e-05, "loss": 1.4246, "step": 44040 }, { "epoch": 7.2794877091510015, "grad_norm": 9.035202026367188, "learning_rate": 1.5112286307130136e-05, "loss": 1.3259, "step": 44050 }, { "epoch": 7.2811402602768025, "grad_norm": 10.761393547058105, "learning_rate": 1.5103105088231514e-05, "loss": 1.4091, "step": 44060 }, { "epoch": 7.282792811402603, "grad_norm": 21.04129409790039, "learning_rate": 1.5093923869332893e-05, "loss": 1.3684, "step": 44070 }, { "epoch": 7.284445362528404, "grad_norm": 6.939519882202148, "learning_rate": 1.5084742650434272e-05, "loss": 1.5599, "step": 44080 }, { "epoch": 7.286097913654204, "grad_norm": 74.00979614257812, "learning_rate": 1.5075561431535651e-05, "loss": 1.4212, "step": 44090 }, { "epoch": 7.287750464780004, "grad_norm": 11.299165725708008, "learning_rate": 1.5066380212637029e-05, "loss": 1.3967, "step": 44100 }, { "epoch": 7.289403015905805, "grad_norm": 9.076863288879395, "learning_rate": 1.5057198993738408e-05, "loss": 1.3786, "step": 44110 }, { "epoch": 7.291055567031605, "grad_norm": 10.693045616149902, "learning_rate": 1.5048017774839787e-05, "loss": 1.4005, "step": 44120 }, { "epoch": 7.292708118157406, "grad_norm": 11.115889549255371, "learning_rate": 1.5038836555941168e-05, "loss": 1.512, "step": 44130 }, { "epoch": 7.294360669283206, "grad_norm": 10.336817741394043, "learning_rate": 1.5029655337042548e-05, "loss": 1.3774, "step": 44140 }, { "epoch": 7.296013220409006, "grad_norm": 11.22537612915039, "learning_rate": 1.5020474118143927e-05, "loss": 1.4058, "step": 44150 }, { "epoch": 7.297665771534807, "grad_norm": 43.39937210083008, "learning_rate": 1.5011292899245304e-05, "loss": 1.3649, "step": 44160 }, { "epoch": 7.299318322660607, "grad_norm": 22.82792091369629, "learning_rate": 1.5002111680346684e-05, "loss": 1.5085, "step": 44170 }, { "epoch": 7.300970873786408, "grad_norm": 30.476633071899414, "learning_rate": 1.4992930461448063e-05, "loss": 1.32, "step": 44180 }, { "epoch": 7.302623424912208, "grad_norm": 10.81251049041748, "learning_rate": 1.498374924254944e-05, "loss": 1.3552, "step": 44190 }, { "epoch": 7.304275976038008, "grad_norm": 17.479726791381836, "learning_rate": 1.497456802365082e-05, "loss": 1.437, "step": 44200 }, { "epoch": 7.305928527163809, "grad_norm": 14.184757232666016, "learning_rate": 1.4965386804752199e-05, "loss": 1.3092, "step": 44210 }, { "epoch": 7.307581078289609, "grad_norm": 16.718236923217773, "learning_rate": 1.4956205585853578e-05, "loss": 1.4491, "step": 44220 }, { "epoch": 7.30923362941541, "grad_norm": 13.425004005432129, "learning_rate": 1.4947024366954956e-05, "loss": 1.4319, "step": 44230 }, { "epoch": 7.31088618054121, "grad_norm": 12.716981887817383, "learning_rate": 1.4937843148056338e-05, "loss": 1.365, "step": 44240 }, { "epoch": 7.312538731667011, "grad_norm": 11.309150695800781, "learning_rate": 1.4928661929157716e-05, "loss": 1.3449, "step": 44250 }, { "epoch": 7.3141912827928115, "grad_norm": 9.994693756103516, "learning_rate": 1.4919480710259095e-05, "loss": 1.3902, "step": 44260 }, { "epoch": 7.315843833918612, "grad_norm": 12.569499015808105, "learning_rate": 1.4910299491360475e-05, "loss": 1.2932, "step": 44270 }, { "epoch": 7.317496385044413, "grad_norm": 14.061833381652832, "learning_rate": 1.4901118272461854e-05, "loss": 1.3171, "step": 44280 }, { "epoch": 7.319148936170213, "grad_norm": 17.114004135131836, "learning_rate": 1.4891937053563231e-05, "loss": 1.4204, "step": 44290 }, { "epoch": 7.320801487296013, "grad_norm": 20.603870391845703, "learning_rate": 1.488275583466461e-05, "loss": 1.4114, "step": 44300 }, { "epoch": 7.322454038421814, "grad_norm": 11.82662582397461, "learning_rate": 1.487357461576599e-05, "loss": 1.3978, "step": 44310 }, { "epoch": 7.324106589547614, "grad_norm": 8.087780952453613, "learning_rate": 1.4864393396867367e-05, "loss": 1.4033, "step": 44320 }, { "epoch": 7.325759140673415, "grad_norm": 14.538772583007812, "learning_rate": 1.4855212177968747e-05, "loss": 1.4692, "step": 44330 }, { "epoch": 7.327411691799215, "grad_norm": 11.816965103149414, "learning_rate": 1.4846030959070126e-05, "loss": 1.4003, "step": 44340 }, { "epoch": 7.329064242925016, "grad_norm": 11.297624588012695, "learning_rate": 1.4836849740171505e-05, "loss": 1.3688, "step": 44350 }, { "epoch": 7.330716794050816, "grad_norm": 29.10480308532715, "learning_rate": 1.4827668521272886e-05, "loss": 1.3417, "step": 44360 }, { "epoch": 7.332369345176616, "grad_norm": 16.348495483398438, "learning_rate": 1.4818487302374265e-05, "loss": 1.3799, "step": 44370 }, { "epoch": 7.334021896302417, "grad_norm": 15.20343017578125, "learning_rate": 1.4809306083475643e-05, "loss": 1.3703, "step": 44380 }, { "epoch": 7.335674447428217, "grad_norm": 10.832907676696777, "learning_rate": 1.4800124864577022e-05, "loss": 1.5541, "step": 44390 }, { "epoch": 7.337326998554018, "grad_norm": 13.253135681152344, "learning_rate": 1.4790943645678401e-05, "loss": 1.346, "step": 44400 }, { "epoch": 7.338979549679818, "grad_norm": 12.137068748474121, "learning_rate": 1.478176242677978e-05, "loss": 1.4282, "step": 44410 }, { "epoch": 7.340632100805618, "grad_norm": 17.6302547454834, "learning_rate": 1.4772581207881158e-05, "loss": 1.2844, "step": 44420 }, { "epoch": 7.342284651931419, "grad_norm": 8.67888355255127, "learning_rate": 1.4763399988982537e-05, "loss": 1.2272, "step": 44430 }, { "epoch": 7.343937203057219, "grad_norm": 12.13434886932373, "learning_rate": 1.4754218770083917e-05, "loss": 1.4605, "step": 44440 }, { "epoch": 7.34558975418302, "grad_norm": 13.558122634887695, "learning_rate": 1.4745037551185296e-05, "loss": 1.3914, "step": 44450 }, { "epoch": 7.3472423053088205, "grad_norm": 11.792350769042969, "learning_rate": 1.4735856332286673e-05, "loss": 1.3819, "step": 44460 }, { "epoch": 7.348894856434621, "grad_norm": 15.687477111816406, "learning_rate": 1.4726675113388056e-05, "loss": 1.4004, "step": 44470 }, { "epoch": 7.350547407560422, "grad_norm": 14.300453186035156, "learning_rate": 1.4717493894489434e-05, "loss": 1.3687, "step": 44480 }, { "epoch": 7.352199958686222, "grad_norm": 12.354840278625488, "learning_rate": 1.4708312675590813e-05, "loss": 1.3209, "step": 44490 }, { "epoch": 7.353852509812023, "grad_norm": 14.35772705078125, "learning_rate": 1.4699131456692192e-05, "loss": 1.428, "step": 44500 }, { "epoch": 7.355505060937823, "grad_norm": 16.01991844177246, "learning_rate": 1.468995023779357e-05, "loss": 1.3319, "step": 44510 }, { "epoch": 7.357157612063623, "grad_norm": 14.260498046875, "learning_rate": 1.4680769018894949e-05, "loss": 1.3704, "step": 44520 }, { "epoch": 7.358810163189424, "grad_norm": 17.772220611572266, "learning_rate": 1.4671587799996328e-05, "loss": 1.552, "step": 44530 }, { "epoch": 7.360462714315224, "grad_norm": 13.813607215881348, "learning_rate": 1.4662406581097707e-05, "loss": 1.4813, "step": 44540 }, { "epoch": 7.362115265441025, "grad_norm": 11.745838165283203, "learning_rate": 1.4653225362199085e-05, "loss": 1.396, "step": 44550 }, { "epoch": 7.363767816566825, "grad_norm": 24.77311897277832, "learning_rate": 1.4644044143300464e-05, "loss": 1.4673, "step": 44560 }, { "epoch": 7.365420367692625, "grad_norm": 12.793754577636719, "learning_rate": 1.4634862924401843e-05, "loss": 1.3443, "step": 44570 }, { "epoch": 7.367072918818426, "grad_norm": 10.67249870300293, "learning_rate": 1.4625681705503224e-05, "loss": 1.3569, "step": 44580 }, { "epoch": 7.368725469944226, "grad_norm": 15.902300834655762, "learning_rate": 1.4616500486604604e-05, "loss": 1.5068, "step": 44590 }, { "epoch": 7.370378021070027, "grad_norm": 9.883173942565918, "learning_rate": 1.4607319267705983e-05, "loss": 1.2706, "step": 44600 }, { "epoch": 7.372030572195827, "grad_norm": 17.580493927001953, "learning_rate": 1.459813804880736e-05, "loss": 1.3805, "step": 44610 }, { "epoch": 7.373683123321628, "grad_norm": 20.290407180786133, "learning_rate": 1.458895682990874e-05, "loss": 1.5256, "step": 44620 }, { "epoch": 7.375335674447428, "grad_norm": 11.272130012512207, "learning_rate": 1.4579775611010119e-05, "loss": 1.4099, "step": 44630 }, { "epoch": 7.376988225573228, "grad_norm": 14.19814395904541, "learning_rate": 1.4570594392111497e-05, "loss": 1.4281, "step": 44640 }, { "epoch": 7.378640776699029, "grad_norm": 13.382776260375977, "learning_rate": 1.4561413173212876e-05, "loss": 1.3977, "step": 44650 }, { "epoch": 7.3802933278248295, "grad_norm": 13.100677490234375, "learning_rate": 1.4552231954314255e-05, "loss": 1.3892, "step": 44660 }, { "epoch": 7.38194587895063, "grad_norm": 23.83889389038086, "learning_rate": 1.4543050735415634e-05, "loss": 1.3258, "step": 44670 }, { "epoch": 7.383598430076431, "grad_norm": 15.009169578552246, "learning_rate": 1.4533869516517012e-05, "loss": 1.3531, "step": 44680 }, { "epoch": 7.385250981202231, "grad_norm": 12.051997184753418, "learning_rate": 1.4524688297618391e-05, "loss": 1.3819, "step": 44690 }, { "epoch": 7.386903532328032, "grad_norm": 15.543947219848633, "learning_rate": 1.4515507078719772e-05, "loss": 1.3308, "step": 44700 }, { "epoch": 7.388556083453832, "grad_norm": 10.858241081237793, "learning_rate": 1.4506325859821151e-05, "loss": 1.32, "step": 44710 }, { "epoch": 7.390208634579633, "grad_norm": 11.575108528137207, "learning_rate": 1.449714464092253e-05, "loss": 1.3483, "step": 44720 }, { "epoch": 7.391861185705433, "grad_norm": 14.963944435119629, "learning_rate": 1.448796342202391e-05, "loss": 1.5092, "step": 44730 }, { "epoch": 7.393513736831233, "grad_norm": 24.254188537597656, "learning_rate": 1.4478782203125287e-05, "loss": 1.5216, "step": 44740 }, { "epoch": 7.395166287957034, "grad_norm": 11.076996803283691, "learning_rate": 1.4469600984226667e-05, "loss": 1.3176, "step": 44750 }, { "epoch": 7.396818839082834, "grad_norm": 17.357913970947266, "learning_rate": 1.4460419765328046e-05, "loss": 1.4716, "step": 44760 }, { "epoch": 7.398471390208635, "grad_norm": 15.097737312316895, "learning_rate": 1.4451238546429425e-05, "loss": 1.328, "step": 44770 }, { "epoch": 7.400123941334435, "grad_norm": 30.251018524169922, "learning_rate": 1.4442057327530803e-05, "loss": 1.3193, "step": 44780 }, { "epoch": 7.401776492460235, "grad_norm": 11.265031814575195, "learning_rate": 1.4432876108632182e-05, "loss": 1.3426, "step": 44790 }, { "epoch": 7.403429043586036, "grad_norm": 13.859474182128906, "learning_rate": 1.4423694889733561e-05, "loss": 1.4333, "step": 44800 }, { "epoch": 7.405081594711836, "grad_norm": 9.234789848327637, "learning_rate": 1.4414513670834942e-05, "loss": 1.3847, "step": 44810 }, { "epoch": 7.406734145837637, "grad_norm": 7.183691024780273, "learning_rate": 1.4405332451936321e-05, "loss": 1.3156, "step": 44820 }, { "epoch": 7.408386696963437, "grad_norm": 16.125627517700195, "learning_rate": 1.4396151233037699e-05, "loss": 1.4502, "step": 44830 }, { "epoch": 7.410039248089237, "grad_norm": 12.971248626708984, "learning_rate": 1.4386970014139078e-05, "loss": 1.4037, "step": 44840 }, { "epoch": 7.411691799215038, "grad_norm": 13.183744430541992, "learning_rate": 1.4377788795240457e-05, "loss": 1.3773, "step": 44850 }, { "epoch": 7.4133443503408385, "grad_norm": 12.956613540649414, "learning_rate": 1.4368607576341837e-05, "loss": 1.4324, "step": 44860 }, { "epoch": 7.4149969014666395, "grad_norm": 13.061296463012695, "learning_rate": 1.4359426357443214e-05, "loss": 1.3835, "step": 44870 }, { "epoch": 7.41664945259244, "grad_norm": 16.211750030517578, "learning_rate": 1.4350245138544593e-05, "loss": 1.2608, "step": 44880 }, { "epoch": 7.41830200371824, "grad_norm": 27.55082893371582, "learning_rate": 1.4341063919645973e-05, "loss": 1.3529, "step": 44890 }, { "epoch": 7.419954554844041, "grad_norm": 10.643492698669434, "learning_rate": 1.4331882700747352e-05, "loss": 1.3314, "step": 44900 }, { "epoch": 7.421607105969841, "grad_norm": 17.30533790588379, "learning_rate": 1.432270148184873e-05, "loss": 1.426, "step": 44910 }, { "epoch": 7.423259657095642, "grad_norm": 14.52156925201416, "learning_rate": 1.4313520262950109e-05, "loss": 1.3467, "step": 44920 }, { "epoch": 7.424912208221442, "grad_norm": 14.510608673095703, "learning_rate": 1.430433904405149e-05, "loss": 1.3018, "step": 44930 }, { "epoch": 7.426564759347242, "grad_norm": 15.366539001464844, "learning_rate": 1.4295157825152869e-05, "loss": 1.4342, "step": 44940 }, { "epoch": 7.428217310473043, "grad_norm": 66.76383972167969, "learning_rate": 1.4285976606254248e-05, "loss": 1.2281, "step": 44950 }, { "epoch": 7.429869861598843, "grad_norm": 12.815417289733887, "learning_rate": 1.4276795387355626e-05, "loss": 1.3599, "step": 44960 }, { "epoch": 7.431522412724644, "grad_norm": 11.356534004211426, "learning_rate": 1.4267614168457005e-05, "loss": 1.3074, "step": 44970 }, { "epoch": 7.433174963850444, "grad_norm": 12.682281494140625, "learning_rate": 1.4258432949558384e-05, "loss": 1.4034, "step": 44980 }, { "epoch": 7.434827514976244, "grad_norm": 217.0580596923828, "learning_rate": 1.4249251730659763e-05, "loss": 1.495, "step": 44990 }, { "epoch": 7.436480066102045, "grad_norm": 13.032721519470215, "learning_rate": 1.4240070511761141e-05, "loss": 1.373, "step": 45000 }, { "epoch": 7.438132617227845, "grad_norm": 13.059027671813965, "learning_rate": 1.423088929286252e-05, "loss": 1.4275, "step": 45010 }, { "epoch": 7.439785168353646, "grad_norm": 21.048809051513672, "learning_rate": 1.42217080739639e-05, "loss": 1.4316, "step": 45020 }, { "epoch": 7.441437719479446, "grad_norm": 15.526588439941406, "learning_rate": 1.4212526855065279e-05, "loss": 1.4637, "step": 45030 }, { "epoch": 7.443090270605246, "grad_norm": 12.340840339660645, "learning_rate": 1.420334563616666e-05, "loss": 1.4365, "step": 45040 }, { "epoch": 7.444742821731047, "grad_norm": 13.411223411560059, "learning_rate": 1.4194164417268039e-05, "loss": 1.4682, "step": 45050 }, { "epoch": 7.4463953728568475, "grad_norm": 9.226353645324707, "learning_rate": 1.4184983198369416e-05, "loss": 1.2974, "step": 45060 }, { "epoch": 7.4480479239826485, "grad_norm": 11.580143928527832, "learning_rate": 1.4175801979470796e-05, "loss": 1.517, "step": 45070 }, { "epoch": 7.449700475108449, "grad_norm": 20.38673973083496, "learning_rate": 1.4166620760572175e-05, "loss": 1.3698, "step": 45080 }, { "epoch": 7.4513530262342496, "grad_norm": 8.279924392700195, "learning_rate": 1.4157439541673554e-05, "loss": 1.5279, "step": 45090 }, { "epoch": 7.45300557736005, "grad_norm": 12.932108879089355, "learning_rate": 1.4148258322774932e-05, "loss": 1.5431, "step": 45100 }, { "epoch": 7.45465812848585, "grad_norm": 16.382911682128906, "learning_rate": 1.4139077103876311e-05, "loss": 1.3692, "step": 45110 }, { "epoch": 7.456310679611651, "grad_norm": 9.782100677490234, "learning_rate": 1.412989588497769e-05, "loss": 1.3877, "step": 45120 }, { "epoch": 7.457963230737451, "grad_norm": 13.853610038757324, "learning_rate": 1.4120714666079068e-05, "loss": 1.3601, "step": 45130 }, { "epoch": 7.459615781863252, "grad_norm": 15.717327117919922, "learning_rate": 1.4111533447180447e-05, "loss": 1.5425, "step": 45140 }, { "epoch": 7.461268332989052, "grad_norm": 11.447760581970215, "learning_rate": 1.4102352228281826e-05, "loss": 1.3657, "step": 45150 }, { "epoch": 7.462920884114852, "grad_norm": 15.681619644165039, "learning_rate": 1.4093171009383207e-05, "loss": 1.3663, "step": 45160 }, { "epoch": 7.464573435240653, "grad_norm": 11.067183494567871, "learning_rate": 1.4083989790484586e-05, "loss": 1.4642, "step": 45170 }, { "epoch": 7.466225986366453, "grad_norm": 21.230554580688477, "learning_rate": 1.4074808571585966e-05, "loss": 1.2956, "step": 45180 }, { "epoch": 7.467878537492254, "grad_norm": 12.558381080627441, "learning_rate": 1.4065627352687343e-05, "loss": 1.48, "step": 45190 }, { "epoch": 7.469531088618054, "grad_norm": 8.757758140563965, "learning_rate": 1.4056446133788723e-05, "loss": 1.1943, "step": 45200 }, { "epoch": 7.471183639743854, "grad_norm": 16.4716796875, "learning_rate": 1.4047264914890102e-05, "loss": 1.3091, "step": 45210 }, { "epoch": 7.472836190869655, "grad_norm": 22.595905303955078, "learning_rate": 1.4038083695991481e-05, "loss": 1.5014, "step": 45220 }, { "epoch": 7.474488741995455, "grad_norm": 12.184328079223633, "learning_rate": 1.4028902477092859e-05, "loss": 1.2568, "step": 45230 }, { "epoch": 7.476141293121256, "grad_norm": 14.307938575744629, "learning_rate": 1.4019721258194238e-05, "loss": 1.366, "step": 45240 }, { "epoch": 7.477793844247056, "grad_norm": 15.05473518371582, "learning_rate": 1.4010540039295617e-05, "loss": 1.3869, "step": 45250 }, { "epoch": 7.4794463953728565, "grad_norm": 12.070158958435059, "learning_rate": 1.4001358820396995e-05, "loss": 1.296, "step": 45260 }, { "epoch": 7.4810989464986575, "grad_norm": 23.16536521911621, "learning_rate": 1.3992177601498377e-05, "loss": 1.3242, "step": 45270 }, { "epoch": 7.482751497624458, "grad_norm": 11.528528213500977, "learning_rate": 1.3982996382599755e-05, "loss": 1.4554, "step": 45280 }, { "epoch": 7.4844040487502586, "grad_norm": 10.557455062866211, "learning_rate": 1.3973815163701134e-05, "loss": 1.3526, "step": 45290 }, { "epoch": 7.486056599876059, "grad_norm": 6.784778594970703, "learning_rate": 1.3964633944802513e-05, "loss": 1.3421, "step": 45300 }, { "epoch": 7.487709151001859, "grad_norm": 10.750931739807129, "learning_rate": 1.3955452725903893e-05, "loss": 1.305, "step": 45310 }, { "epoch": 7.48936170212766, "grad_norm": 16.1955623626709, "learning_rate": 1.394627150700527e-05, "loss": 1.4377, "step": 45320 }, { "epoch": 7.49101425325346, "grad_norm": 14.3265962600708, "learning_rate": 1.393709028810665e-05, "loss": 1.3428, "step": 45330 }, { "epoch": 7.492666804379261, "grad_norm": 31.921222686767578, "learning_rate": 1.3927909069208029e-05, "loss": 1.4143, "step": 45340 }, { "epoch": 7.494319355505061, "grad_norm": 20.369794845581055, "learning_rate": 1.3918727850309408e-05, "loss": 1.3892, "step": 45350 }, { "epoch": 7.495971906630861, "grad_norm": 14.980151176452637, "learning_rate": 1.3909546631410785e-05, "loss": 1.3714, "step": 45360 }, { "epoch": 7.497624457756662, "grad_norm": 21.37506675720215, "learning_rate": 1.3900365412512165e-05, "loss": 1.3442, "step": 45370 }, { "epoch": 7.499277008882462, "grad_norm": 9.15767765045166, "learning_rate": 1.3891184193613546e-05, "loss": 1.2651, "step": 45380 }, { "epoch": 7.500929560008263, "grad_norm": 19.919265747070312, "learning_rate": 1.3882002974714925e-05, "loss": 1.4399, "step": 45390 }, { "epoch": 7.502582111134063, "grad_norm": 18.08030891418457, "learning_rate": 1.3872821755816304e-05, "loss": 1.2924, "step": 45400 }, { "epoch": 7.504234662259863, "grad_norm": 14.912238121032715, "learning_rate": 1.3863640536917683e-05, "loss": 1.356, "step": 45410 }, { "epoch": 7.505887213385664, "grad_norm": 18.654098510742188, "learning_rate": 1.3854459318019061e-05, "loss": 1.3739, "step": 45420 }, { "epoch": 7.507539764511464, "grad_norm": 37.90325164794922, "learning_rate": 1.384527809912044e-05, "loss": 1.4239, "step": 45430 }, { "epoch": 7.509192315637265, "grad_norm": 10.297859191894531, "learning_rate": 1.383609688022182e-05, "loss": 1.3303, "step": 45440 }, { "epoch": 7.510844866763065, "grad_norm": 16.992605209350586, "learning_rate": 1.3826915661323197e-05, "loss": 1.4006, "step": 45450 }, { "epoch": 7.512497417888866, "grad_norm": 11.61043930053711, "learning_rate": 1.3817734442424576e-05, "loss": 1.3269, "step": 45460 }, { "epoch": 7.5141499690146665, "grad_norm": 11.616755485534668, "learning_rate": 1.3808553223525955e-05, "loss": 1.2896, "step": 45470 }, { "epoch": 7.515802520140467, "grad_norm": 13.678464889526367, "learning_rate": 1.3799372004627335e-05, "loss": 1.4308, "step": 45480 }, { "epoch": 7.5174550712662676, "grad_norm": 15.36746597290039, "learning_rate": 1.3790190785728712e-05, "loss": 1.2909, "step": 45490 }, { "epoch": 7.519107622392068, "grad_norm": 17.820960998535156, "learning_rate": 1.3781009566830095e-05, "loss": 1.273, "step": 45500 }, { "epoch": 7.520760173517868, "grad_norm": 34.905059814453125, "learning_rate": 1.3771828347931472e-05, "loss": 1.3707, "step": 45510 }, { "epoch": 7.522412724643669, "grad_norm": 9.051143646240234, "learning_rate": 1.3762647129032852e-05, "loss": 1.3682, "step": 45520 }, { "epoch": 7.524065275769469, "grad_norm": 12.66955852508545, "learning_rate": 1.3753465910134231e-05, "loss": 1.361, "step": 45530 }, { "epoch": 7.52571782689527, "grad_norm": 12.091620445251465, "learning_rate": 1.374428469123561e-05, "loss": 1.3856, "step": 45540 }, { "epoch": 7.52737037802107, "grad_norm": 9.986820220947266, "learning_rate": 1.3735103472336988e-05, "loss": 1.362, "step": 45550 }, { "epoch": 7.529022929146871, "grad_norm": 40.41029357910156, "learning_rate": 1.3725922253438367e-05, "loss": 1.4566, "step": 45560 }, { "epoch": 7.530675480272671, "grad_norm": 51.40917205810547, "learning_rate": 1.3716741034539746e-05, "loss": 1.4305, "step": 45570 }, { "epoch": 7.532328031398471, "grad_norm": 12.97027587890625, "learning_rate": 1.3707559815641124e-05, "loss": 1.444, "step": 45580 }, { "epoch": 7.533980582524272, "grad_norm": 10.593807220458984, "learning_rate": 1.3698378596742503e-05, "loss": 1.4671, "step": 45590 }, { "epoch": 7.535633133650072, "grad_norm": 14.09102725982666, "learning_rate": 1.3689197377843882e-05, "loss": 1.2288, "step": 45600 }, { "epoch": 7.537285684775873, "grad_norm": 13.20085620880127, "learning_rate": 1.3680016158945263e-05, "loss": 1.5422, "step": 45610 }, { "epoch": 7.538938235901673, "grad_norm": 13.723567008972168, "learning_rate": 1.3670834940046642e-05, "loss": 1.3606, "step": 45620 }, { "epoch": 7.540590787027473, "grad_norm": 12.14212703704834, "learning_rate": 1.3661653721148022e-05, "loss": 1.3494, "step": 45630 }, { "epoch": 7.542243338153274, "grad_norm": 18.38138771057129, "learning_rate": 1.36524725022494e-05, "loss": 1.3763, "step": 45640 }, { "epoch": 7.543895889279074, "grad_norm": 9.355786323547363, "learning_rate": 1.3643291283350778e-05, "loss": 1.4087, "step": 45650 }, { "epoch": 7.545548440404875, "grad_norm": 17.49809455871582, "learning_rate": 1.3634110064452158e-05, "loss": 1.3953, "step": 45660 }, { "epoch": 7.5472009915306755, "grad_norm": 29.25096893310547, "learning_rate": 1.3624928845553537e-05, "loss": 1.3762, "step": 45670 }, { "epoch": 7.548853542656476, "grad_norm": 21.453706741333008, "learning_rate": 1.3615747626654915e-05, "loss": 1.4828, "step": 45680 }, { "epoch": 7.5505060937822766, "grad_norm": 9.639589309692383, "learning_rate": 1.3606566407756294e-05, "loss": 1.4189, "step": 45690 }, { "epoch": 7.552158644908077, "grad_norm": 18.783720016479492, "learning_rate": 1.3597385188857673e-05, "loss": 1.3584, "step": 45700 }, { "epoch": 7.553811196033878, "grad_norm": 10.537060737609863, "learning_rate": 1.358820396995905e-05, "loss": 1.3764, "step": 45710 }, { "epoch": 7.555463747159678, "grad_norm": 17.940895080566406, "learning_rate": 1.357902275106043e-05, "loss": 1.5005, "step": 45720 }, { "epoch": 7.557116298285479, "grad_norm": 13.430806159973145, "learning_rate": 1.3569841532161812e-05, "loss": 1.4229, "step": 45730 }, { "epoch": 7.558768849411279, "grad_norm": 37.275325775146484, "learning_rate": 1.356066031326319e-05, "loss": 1.332, "step": 45740 }, { "epoch": 7.560421400537079, "grad_norm": 14.86922550201416, "learning_rate": 1.355147909436457e-05, "loss": 1.3686, "step": 45750 }, { "epoch": 7.56207395166288, "grad_norm": 87.05618286132812, "learning_rate": 1.3542297875465949e-05, "loss": 1.4918, "step": 45760 }, { "epoch": 7.56372650278868, "grad_norm": 11.52795696258545, "learning_rate": 1.3533116656567326e-05, "loss": 1.3878, "step": 45770 }, { "epoch": 7.56537905391448, "grad_norm": 13.127462387084961, "learning_rate": 1.3523935437668705e-05, "loss": 1.3601, "step": 45780 }, { "epoch": 7.567031605040281, "grad_norm": 14.65783977508545, "learning_rate": 1.3514754218770085e-05, "loss": 1.3726, "step": 45790 }, { "epoch": 7.568684156166081, "grad_norm": 12.145511627197266, "learning_rate": 1.3505572999871464e-05, "loss": 1.4446, "step": 45800 }, { "epoch": 7.570336707291882, "grad_norm": 20.19838523864746, "learning_rate": 1.3496391780972841e-05, "loss": 1.472, "step": 45810 }, { "epoch": 7.571989258417682, "grad_norm": 8.90953254699707, "learning_rate": 1.348721056207422e-05, "loss": 1.3158, "step": 45820 }, { "epoch": 7.573641809543483, "grad_norm": 14.088125228881836, "learning_rate": 1.34780293431756e-05, "loss": 1.2182, "step": 45830 }, { "epoch": 7.575294360669283, "grad_norm": 14.895604133605957, "learning_rate": 1.346884812427698e-05, "loss": 1.501, "step": 45840 }, { "epoch": 7.576946911795083, "grad_norm": 11.624994277954102, "learning_rate": 1.345966690537836e-05, "loss": 1.4957, "step": 45850 }, { "epoch": 7.578599462920884, "grad_norm": 19.928787231445312, "learning_rate": 1.345048568647974e-05, "loss": 1.2905, "step": 45860 }, { "epoch": 7.5802520140466845, "grad_norm": 12.225404739379883, "learning_rate": 1.3441304467581117e-05, "loss": 1.5096, "step": 45870 }, { "epoch": 7.581904565172485, "grad_norm": 12.218575477600098, "learning_rate": 1.3432123248682496e-05, "loss": 1.289, "step": 45880 }, { "epoch": 7.5835571162982855, "grad_norm": 17.526824951171875, "learning_rate": 1.3422942029783875e-05, "loss": 1.4192, "step": 45890 }, { "epoch": 7.585209667424086, "grad_norm": 13.405013084411621, "learning_rate": 1.3413760810885253e-05, "loss": 1.4625, "step": 45900 }, { "epoch": 7.586862218549887, "grad_norm": 10.898892402648926, "learning_rate": 1.3404579591986632e-05, "loss": 1.3915, "step": 45910 }, { "epoch": 7.588514769675687, "grad_norm": 17.272350311279297, "learning_rate": 1.3395398373088011e-05, "loss": 1.409, "step": 45920 }, { "epoch": 7.590167320801488, "grad_norm": 13.411367416381836, "learning_rate": 1.338621715418939e-05, "loss": 1.4557, "step": 45930 }, { "epoch": 7.591819871927288, "grad_norm": 33.857017517089844, "learning_rate": 1.3377035935290768e-05, "loss": 1.2233, "step": 45940 }, { "epoch": 7.593472423053088, "grad_norm": 18.17672348022461, "learning_rate": 1.3367854716392147e-05, "loss": 1.3755, "step": 45950 }, { "epoch": 7.595124974178889, "grad_norm": 11.827006340026855, "learning_rate": 1.3358673497493528e-05, "loss": 1.3968, "step": 45960 }, { "epoch": 7.596777525304689, "grad_norm": 10.973575592041016, "learning_rate": 1.3349492278594908e-05, "loss": 1.4225, "step": 45970 }, { "epoch": 7.59843007643049, "grad_norm": 13.734282493591309, "learning_rate": 1.3340311059696287e-05, "loss": 1.3545, "step": 45980 }, { "epoch": 7.60008262755629, "grad_norm": 20.4477596282959, "learning_rate": 1.3331129840797666e-05, "loss": 1.4661, "step": 45990 }, { "epoch": 7.60173517868209, "grad_norm": 11.852956771850586, "learning_rate": 1.3321948621899044e-05, "loss": 1.28, "step": 46000 }, { "epoch": 7.603387729807891, "grad_norm": 11.33363151550293, "learning_rate": 1.3312767403000423e-05, "loss": 1.4586, "step": 46010 }, { "epoch": 7.605040280933691, "grad_norm": 10.724862098693848, "learning_rate": 1.3303586184101802e-05, "loss": 1.5499, "step": 46020 }, { "epoch": 7.606692832059492, "grad_norm": 12.029391288757324, "learning_rate": 1.329440496520318e-05, "loss": 1.3334, "step": 46030 }, { "epoch": 7.608345383185292, "grad_norm": 27.66431999206543, "learning_rate": 1.3285223746304559e-05, "loss": 1.4114, "step": 46040 }, { "epoch": 7.609997934311092, "grad_norm": 11.550406455993652, "learning_rate": 1.3276042527405938e-05, "loss": 1.3458, "step": 46050 }, { "epoch": 7.611650485436893, "grad_norm": 19.206697463989258, "learning_rate": 1.3266861308507317e-05, "loss": 1.4311, "step": 46060 }, { "epoch": 7.6133030365626935, "grad_norm": 13.49929428100586, "learning_rate": 1.3257680089608698e-05, "loss": 1.4536, "step": 46070 }, { "epoch": 7.6149555876884945, "grad_norm": 12.506552696228027, "learning_rate": 1.3248498870710078e-05, "loss": 1.4649, "step": 46080 }, { "epoch": 7.6166081388142945, "grad_norm": 16.31134605407715, "learning_rate": 1.3239317651811455e-05, "loss": 1.4549, "step": 46090 }, { "epoch": 7.618260689940095, "grad_norm": 9.935333251953125, "learning_rate": 1.3230136432912834e-05, "loss": 1.387, "step": 46100 }, { "epoch": 7.619913241065896, "grad_norm": 14.679750442504883, "learning_rate": 1.3220955214014214e-05, "loss": 1.4526, "step": 46110 }, { "epoch": 7.621565792191696, "grad_norm": 11.663803100585938, "learning_rate": 1.3211773995115593e-05, "loss": 1.4375, "step": 46120 }, { "epoch": 7.623218343317497, "grad_norm": 333.21502685546875, "learning_rate": 1.320259277621697e-05, "loss": 1.5769, "step": 46130 }, { "epoch": 7.624870894443297, "grad_norm": 17.140153884887695, "learning_rate": 1.319341155731835e-05, "loss": 1.3195, "step": 46140 }, { "epoch": 7.626523445569097, "grad_norm": 7.242269039154053, "learning_rate": 1.3184230338419729e-05, "loss": 1.4281, "step": 46150 }, { "epoch": 7.628175996694898, "grad_norm": 13.725931167602539, "learning_rate": 1.3175049119521107e-05, "loss": 1.3323, "step": 46160 }, { "epoch": 7.629828547820698, "grad_norm": 14.671615600585938, "learning_rate": 1.3165867900622486e-05, "loss": 1.4224, "step": 46170 }, { "epoch": 7.631481098946499, "grad_norm": 12.745248794555664, "learning_rate": 1.3156686681723868e-05, "loss": 1.3534, "step": 46180 }, { "epoch": 7.633133650072299, "grad_norm": 24.55314064025879, "learning_rate": 1.3147505462825246e-05, "loss": 1.2832, "step": 46190 }, { "epoch": 7.6347862011981, "grad_norm": 17.937116622924805, "learning_rate": 1.3138324243926625e-05, "loss": 1.2754, "step": 46200 }, { "epoch": 7.6364387523239, "grad_norm": 26.932025909423828, "learning_rate": 1.3129143025028005e-05, "loss": 1.3059, "step": 46210 }, { "epoch": 7.6380913034497, "grad_norm": 11.607211112976074, "learning_rate": 1.3119961806129382e-05, "loss": 1.4191, "step": 46220 }, { "epoch": 7.639743854575501, "grad_norm": 30.498897552490234, "learning_rate": 1.3110780587230761e-05, "loss": 1.6439, "step": 46230 }, { "epoch": 7.641396405701301, "grad_norm": 9.308640480041504, "learning_rate": 1.310159936833214e-05, "loss": 1.3958, "step": 46240 }, { "epoch": 7.643048956827101, "grad_norm": 12.674652099609375, "learning_rate": 1.309241814943352e-05, "loss": 1.3947, "step": 46250 }, { "epoch": 7.644701507952902, "grad_norm": 18.442964553833008, "learning_rate": 1.3083236930534897e-05, "loss": 1.3829, "step": 46260 }, { "epoch": 7.6463540590787025, "grad_norm": 9.009620666503906, "learning_rate": 1.3074055711636277e-05, "loss": 1.382, "step": 46270 }, { "epoch": 7.6480066102045035, "grad_norm": 11.820587158203125, "learning_rate": 1.3064874492737656e-05, "loss": 1.2625, "step": 46280 }, { "epoch": 7.6496591613303035, "grad_norm": 13.428104400634766, "learning_rate": 1.3055693273839033e-05, "loss": 1.2081, "step": 46290 }, { "epoch": 7.6513117124561045, "grad_norm": 226.61851501464844, "learning_rate": 1.3046512054940416e-05, "loss": 1.3743, "step": 46300 }, { "epoch": 7.652964263581905, "grad_norm": 19.351016998291016, "learning_rate": 1.3037330836041795e-05, "loss": 1.3831, "step": 46310 }, { "epoch": 7.654616814707705, "grad_norm": 9.168930053710938, "learning_rate": 1.3028149617143173e-05, "loss": 1.3555, "step": 46320 }, { "epoch": 7.656269365833506, "grad_norm": 17.819650650024414, "learning_rate": 1.3018968398244552e-05, "loss": 1.4432, "step": 46330 }, { "epoch": 7.657921916959306, "grad_norm": 57.033180236816406, "learning_rate": 1.3009787179345931e-05, "loss": 1.3021, "step": 46340 }, { "epoch": 7.659574468085106, "grad_norm": 25.294410705566406, "learning_rate": 1.3000605960447309e-05, "loss": 1.4219, "step": 46350 }, { "epoch": 7.661227019210907, "grad_norm": 22.175817489624023, "learning_rate": 1.2991424741548688e-05, "loss": 1.4499, "step": 46360 }, { "epoch": 7.662879570336707, "grad_norm": 12.002792358398438, "learning_rate": 1.2982243522650067e-05, "loss": 1.3654, "step": 46370 }, { "epoch": 7.664532121462508, "grad_norm": 9.988709449768066, "learning_rate": 1.2973062303751447e-05, "loss": 1.4325, "step": 46380 }, { "epoch": 7.666184672588308, "grad_norm": 14.610112190246582, "learning_rate": 1.2963881084852824e-05, "loss": 1.2868, "step": 46390 }, { "epoch": 7.667837223714109, "grad_norm": 14.141910552978516, "learning_rate": 1.2954699865954203e-05, "loss": 1.3978, "step": 46400 }, { "epoch": 7.669489774839909, "grad_norm": 24.699037551879883, "learning_rate": 1.2945518647055584e-05, "loss": 1.3868, "step": 46410 }, { "epoch": 7.671142325965709, "grad_norm": 13.397317886352539, "learning_rate": 1.2936337428156964e-05, "loss": 1.329, "step": 46420 }, { "epoch": 7.67279487709151, "grad_norm": 13.475739479064941, "learning_rate": 1.2927156209258343e-05, "loss": 1.4917, "step": 46430 }, { "epoch": 7.67444742821731, "grad_norm": 14.52394962310791, "learning_rate": 1.2917974990359722e-05, "loss": 1.3478, "step": 46440 }, { "epoch": 7.676099979343111, "grad_norm": 20.19123077392578, "learning_rate": 1.29087937714611e-05, "loss": 1.5876, "step": 46450 }, { "epoch": 7.677752530468911, "grad_norm": 10.427851676940918, "learning_rate": 1.2899612552562479e-05, "loss": 1.3178, "step": 46460 }, { "epoch": 7.6794050815947115, "grad_norm": 15.901928901672363, "learning_rate": 1.2890431333663858e-05, "loss": 1.4723, "step": 46470 }, { "epoch": 7.6810576327205125, "grad_norm": 16.793914794921875, "learning_rate": 1.2881250114765236e-05, "loss": 1.3749, "step": 46480 }, { "epoch": 7.6827101838463125, "grad_norm": 9.323737144470215, "learning_rate": 1.2872068895866615e-05, "loss": 1.3149, "step": 46490 }, { "epoch": 7.6843627349721135, "grad_norm": 13.843609809875488, "learning_rate": 1.2862887676967994e-05, "loss": 1.4424, "step": 46500 }, { "epoch": 7.686015286097914, "grad_norm": 16.021211624145508, "learning_rate": 1.2853706458069373e-05, "loss": 1.3349, "step": 46510 }, { "epoch": 7.687667837223714, "grad_norm": 12.02914810180664, "learning_rate": 1.2844525239170751e-05, "loss": 1.3076, "step": 46520 }, { "epoch": 7.689320388349515, "grad_norm": 9.748140335083008, "learning_rate": 1.2835344020272134e-05, "loss": 1.3444, "step": 46530 }, { "epoch": 7.690972939475315, "grad_norm": 17.1309757232666, "learning_rate": 1.2826162801373511e-05, "loss": 1.3255, "step": 46540 }, { "epoch": 7.692625490601116, "grad_norm": 10.874885559082031, "learning_rate": 1.281698158247489e-05, "loss": 1.3793, "step": 46550 }, { "epoch": 7.694278041726916, "grad_norm": 13.18276596069336, "learning_rate": 1.280780036357627e-05, "loss": 1.399, "step": 46560 }, { "epoch": 7.695930592852717, "grad_norm": 12.042601585388184, "learning_rate": 1.2798619144677649e-05, "loss": 1.3615, "step": 46570 }, { "epoch": 7.697583143978517, "grad_norm": 25.975971221923828, "learning_rate": 1.2789437925779027e-05, "loss": 1.5137, "step": 46580 }, { "epoch": 7.699235695104317, "grad_norm": 16.21059226989746, "learning_rate": 1.2780256706880406e-05, "loss": 1.3351, "step": 46590 }, { "epoch": 7.700888246230118, "grad_norm": 13.045463562011719, "learning_rate": 1.2771075487981785e-05, "loss": 1.3001, "step": 46600 }, { "epoch": 7.702540797355918, "grad_norm": 11.22043228149414, "learning_rate": 1.2761894269083163e-05, "loss": 1.4261, "step": 46610 }, { "epoch": 7.704193348481718, "grad_norm": 36.950538635253906, "learning_rate": 1.2752713050184542e-05, "loss": 1.3339, "step": 46620 }, { "epoch": 7.705845899607519, "grad_norm": 9.229752540588379, "learning_rate": 1.2743531831285921e-05, "loss": 1.3431, "step": 46630 }, { "epoch": 7.707498450733319, "grad_norm": 11.805033683776855, "learning_rate": 1.2734350612387302e-05, "loss": 1.3491, "step": 46640 }, { "epoch": 7.70915100185912, "grad_norm": 23.916683197021484, "learning_rate": 1.2725169393488681e-05, "loss": 1.3354, "step": 46650 }, { "epoch": 7.71080355298492, "grad_norm": 15.571776390075684, "learning_rate": 1.271598817459006e-05, "loss": 1.4053, "step": 46660 }, { "epoch": 7.712456104110721, "grad_norm": 16.37409782409668, "learning_rate": 1.2706806955691438e-05, "loss": 1.2922, "step": 46670 }, { "epoch": 7.7141086552365215, "grad_norm": 10.659138679504395, "learning_rate": 1.2697625736792817e-05, "loss": 1.3695, "step": 46680 }, { "epoch": 7.7157612063623215, "grad_norm": 20.0853328704834, "learning_rate": 1.2688444517894197e-05, "loss": 1.4891, "step": 46690 }, { "epoch": 7.7174137574881225, "grad_norm": 16.797853469848633, "learning_rate": 1.2679263298995576e-05, "loss": 1.4373, "step": 46700 }, { "epoch": 7.719066308613923, "grad_norm": 14.921588897705078, "learning_rate": 1.2670082080096953e-05, "loss": 1.4576, "step": 46710 }, { "epoch": 7.720718859739723, "grad_norm": 9.122919082641602, "learning_rate": 1.2660900861198333e-05, "loss": 1.3505, "step": 46720 }, { "epoch": 7.722371410865524, "grad_norm": 18.881772994995117, "learning_rate": 1.2651719642299712e-05, "loss": 1.5416, "step": 46730 }, { "epoch": 7.724023961991324, "grad_norm": 9.798796653747559, "learning_rate": 1.264253842340109e-05, "loss": 1.3828, "step": 46740 }, { "epoch": 7.725676513117125, "grad_norm": 21.321590423583984, "learning_rate": 1.2633357204502472e-05, "loss": 1.4042, "step": 46750 }, { "epoch": 7.727329064242925, "grad_norm": 16.886615753173828, "learning_rate": 1.2624175985603851e-05, "loss": 1.3997, "step": 46760 }, { "epoch": 7.728981615368726, "grad_norm": 16.772817611694336, "learning_rate": 1.2614994766705229e-05, "loss": 1.4211, "step": 46770 }, { "epoch": 7.730634166494526, "grad_norm": 19.1043758392334, "learning_rate": 1.2605813547806608e-05, "loss": 1.3403, "step": 46780 }, { "epoch": 7.732286717620326, "grad_norm": 17.032512664794922, "learning_rate": 1.2596632328907987e-05, "loss": 1.3278, "step": 46790 }, { "epoch": 7.733939268746127, "grad_norm": 9.3512544631958, "learning_rate": 1.2587451110009365e-05, "loss": 1.4196, "step": 46800 }, { "epoch": 7.735591819871927, "grad_norm": 14.732444763183594, "learning_rate": 1.2578269891110744e-05, "loss": 1.4069, "step": 46810 }, { "epoch": 7.737244370997728, "grad_norm": 19.343429565429688, "learning_rate": 1.2569088672212123e-05, "loss": 1.3892, "step": 46820 }, { "epoch": 7.738896922123528, "grad_norm": 15.403067588806152, "learning_rate": 1.2559907453313503e-05, "loss": 1.433, "step": 46830 }, { "epoch": 7.740549473249328, "grad_norm": 14.680621147155762, "learning_rate": 1.255072623441488e-05, "loss": 1.5072, "step": 46840 }, { "epoch": 7.742202024375129, "grad_norm": 21.64932632446289, "learning_rate": 1.254154501551626e-05, "loss": 1.4388, "step": 46850 }, { "epoch": 7.743854575500929, "grad_norm": 12.506025314331055, "learning_rate": 1.2532363796617639e-05, "loss": 1.3015, "step": 46860 }, { "epoch": 7.74550712662673, "grad_norm": 18.126253128051758, "learning_rate": 1.252318257771902e-05, "loss": 1.2921, "step": 46870 }, { "epoch": 7.7471596777525304, "grad_norm": 8.994416236877441, "learning_rate": 1.2514001358820399e-05, "loss": 1.3792, "step": 46880 }, { "epoch": 7.7488122288783305, "grad_norm": 14.529839515686035, "learning_rate": 1.2504820139921778e-05, "loss": 1.3501, "step": 46890 }, { "epoch": 7.7504647800041315, "grad_norm": 8.697839736938477, "learning_rate": 1.2495638921023156e-05, "loss": 1.3575, "step": 46900 }, { "epoch": 7.752117331129932, "grad_norm": 14.368610382080078, "learning_rate": 1.2486457702124535e-05, "loss": 1.3781, "step": 46910 }, { "epoch": 7.753769882255733, "grad_norm": 11.302849769592285, "learning_rate": 1.2477276483225914e-05, "loss": 1.4043, "step": 46920 }, { "epoch": 7.755422433381533, "grad_norm": 13.063495635986328, "learning_rate": 1.2468095264327292e-05, "loss": 1.3099, "step": 46930 }, { "epoch": 7.757074984507334, "grad_norm": 11.363663673400879, "learning_rate": 1.2458914045428671e-05, "loss": 1.2627, "step": 46940 }, { "epoch": 7.758727535633134, "grad_norm": 16.601898193359375, "learning_rate": 1.2449732826530052e-05, "loss": 1.2975, "step": 46950 }, { "epoch": 7.760380086758934, "grad_norm": 15.275944709777832, "learning_rate": 1.244055160763143e-05, "loss": 1.3798, "step": 46960 }, { "epoch": 7.762032637884735, "grad_norm": 15.088699340820312, "learning_rate": 1.2431370388732809e-05, "loss": 1.4236, "step": 46970 }, { "epoch": 7.763685189010535, "grad_norm": 49.174373626708984, "learning_rate": 1.2422189169834188e-05, "loss": 1.446, "step": 46980 }, { "epoch": 7.765337740136335, "grad_norm": 10.12486457824707, "learning_rate": 1.2413007950935567e-05, "loss": 1.3574, "step": 46990 }, { "epoch": 7.766990291262136, "grad_norm": 13.83552074432373, "learning_rate": 1.2403826732036945e-05, "loss": 1.3296, "step": 47000 }, { "epoch": 7.768642842387936, "grad_norm": 7.309926986694336, "learning_rate": 1.2394645513138326e-05, "loss": 1.2785, "step": 47010 }, { "epoch": 7.770295393513737, "grad_norm": 11.483392715454102, "learning_rate": 1.2385464294239705e-05, "loss": 1.3437, "step": 47020 }, { "epoch": 7.771947944639537, "grad_norm": 14.15250301361084, "learning_rate": 1.2376283075341082e-05, "loss": 1.1891, "step": 47030 }, { "epoch": 7.773600495765338, "grad_norm": 8.438793182373047, "learning_rate": 1.2367101856442462e-05, "loss": 1.2479, "step": 47040 }, { "epoch": 7.775253046891138, "grad_norm": 11.011880874633789, "learning_rate": 1.2357920637543841e-05, "loss": 1.2589, "step": 47050 }, { "epoch": 7.776905598016938, "grad_norm": 12.733864784240723, "learning_rate": 1.2348739418645219e-05, "loss": 1.3427, "step": 47060 }, { "epoch": 7.778558149142739, "grad_norm": 11.575126647949219, "learning_rate": 1.23395581997466e-05, "loss": 1.3099, "step": 47070 }, { "epoch": 7.7802107002685394, "grad_norm": 9.306955337524414, "learning_rate": 1.2330376980847979e-05, "loss": 1.3208, "step": 47080 }, { "epoch": 7.7818632513943395, "grad_norm": 21.35601234436035, "learning_rate": 1.2321195761949356e-05, "loss": 1.4267, "step": 47090 }, { "epoch": 7.7835158025201405, "grad_norm": 9.708170890808105, "learning_rate": 1.2312014543050736e-05, "loss": 1.3281, "step": 47100 }, { "epoch": 7.785168353645941, "grad_norm": 11.418980598449707, "learning_rate": 1.2302833324152115e-05, "loss": 1.4658, "step": 47110 }, { "epoch": 7.786820904771742, "grad_norm": 33.159698486328125, "learning_rate": 1.2293652105253494e-05, "loss": 1.4893, "step": 47120 }, { "epoch": 7.788473455897542, "grad_norm": 16.78141212463379, "learning_rate": 1.2284470886354873e-05, "loss": 1.5537, "step": 47130 }, { "epoch": 7.790126007023343, "grad_norm": 8.051682472229004, "learning_rate": 1.2275289667456253e-05, "loss": 1.3073, "step": 47140 }, { "epoch": 7.791778558149143, "grad_norm": 10.035185813903809, "learning_rate": 1.2266108448557632e-05, "loss": 1.3287, "step": 47150 }, { "epoch": 7.793431109274943, "grad_norm": 16.550947189331055, "learning_rate": 1.225692722965901e-05, "loss": 1.3027, "step": 47160 }, { "epoch": 7.795083660400744, "grad_norm": 13.783156394958496, "learning_rate": 1.2247746010760389e-05, "loss": 1.5014, "step": 47170 }, { "epoch": 7.796736211526544, "grad_norm": 11.391876220703125, "learning_rate": 1.223856479186177e-05, "loss": 1.3918, "step": 47180 }, { "epoch": 7.798388762652345, "grad_norm": 16.179780960083008, "learning_rate": 1.2229383572963147e-05, "loss": 1.3255, "step": 47190 }, { "epoch": 7.800041313778145, "grad_norm": 20.307268142700195, "learning_rate": 1.2220202354064526e-05, "loss": 1.3893, "step": 47200 }, { "epoch": 7.801693864903945, "grad_norm": 32.367427825927734, "learning_rate": 1.2211021135165906e-05, "loss": 1.4139, "step": 47210 }, { "epoch": 7.803346416029746, "grad_norm": 9.915732383728027, "learning_rate": 1.2201839916267283e-05, "loss": 1.5902, "step": 47220 }, { "epoch": 7.804998967155546, "grad_norm": 10.596333503723145, "learning_rate": 1.2192658697368662e-05, "loss": 1.4343, "step": 47230 }, { "epoch": 7.806651518281347, "grad_norm": 16.008501052856445, "learning_rate": 1.2183477478470043e-05, "loss": 1.3559, "step": 47240 }, { "epoch": 7.808304069407147, "grad_norm": 14.380460739135742, "learning_rate": 1.2174296259571421e-05, "loss": 1.4441, "step": 47250 }, { "epoch": 7.809956620532947, "grad_norm": 18.642454147338867, "learning_rate": 1.21651150406728e-05, "loss": 1.3636, "step": 47260 }, { "epoch": 7.811609171658748, "grad_norm": 10.425969123840332, "learning_rate": 1.215593382177418e-05, "loss": 1.4597, "step": 47270 }, { "epoch": 7.8132617227845484, "grad_norm": 15.50149917602539, "learning_rate": 1.2146752602875559e-05, "loss": 1.3136, "step": 47280 }, { "epoch": 7.814914273910349, "grad_norm": 7.004604816436768, "learning_rate": 1.2137571383976936e-05, "loss": 1.3023, "step": 47290 }, { "epoch": 7.8165668250361495, "grad_norm": 10.380492210388184, "learning_rate": 1.2128390165078317e-05, "loss": 1.2798, "step": 47300 }, { "epoch": 7.8182193761619505, "grad_norm": 13.373892784118652, "learning_rate": 1.2119208946179696e-05, "loss": 1.4054, "step": 47310 }, { "epoch": 7.819871927287751, "grad_norm": 18.053157806396484, "learning_rate": 1.2110027727281074e-05, "loss": 1.5367, "step": 47320 }, { "epoch": 7.821524478413551, "grad_norm": 8.14160442352295, "learning_rate": 1.2100846508382453e-05, "loss": 1.2874, "step": 47330 }, { "epoch": 7.823177029539352, "grad_norm": 13.822787284851074, "learning_rate": 1.2091665289483832e-05, "loss": 1.3485, "step": 47340 }, { "epoch": 7.824829580665152, "grad_norm": 44.869102478027344, "learning_rate": 1.2082484070585212e-05, "loss": 1.4024, "step": 47350 }, { "epoch": 7.826482131790952, "grad_norm": 12.257972717285156, "learning_rate": 1.2073302851686591e-05, "loss": 1.3067, "step": 47360 }, { "epoch": 7.828134682916753, "grad_norm": 13.056764602661133, "learning_rate": 1.206412163278797e-05, "loss": 1.3647, "step": 47370 }, { "epoch": 7.829787234042553, "grad_norm": 9.314446449279785, "learning_rate": 1.2054940413889348e-05, "loss": 1.4241, "step": 47380 }, { "epoch": 7.831439785168354, "grad_norm": 8.149978637695312, "learning_rate": 1.2045759194990727e-05, "loss": 1.3955, "step": 47390 }, { "epoch": 7.833092336294154, "grad_norm": 25.435317993164062, "learning_rate": 1.2036577976092106e-05, "loss": 1.4185, "step": 47400 }, { "epoch": 7.834744887419955, "grad_norm": 15.4861478805542, "learning_rate": 1.2027396757193485e-05, "loss": 1.4882, "step": 47410 }, { "epoch": 7.836397438545755, "grad_norm": 8.784843444824219, "learning_rate": 1.2018215538294865e-05, "loss": 1.33, "step": 47420 }, { "epoch": 7.838049989671555, "grad_norm": 14.153306007385254, "learning_rate": 1.2009034319396244e-05, "loss": 1.3612, "step": 47430 }, { "epoch": 7.839702540797356, "grad_norm": 10.80195426940918, "learning_rate": 1.1999853100497623e-05, "loss": 1.48, "step": 47440 }, { "epoch": 7.841355091923156, "grad_norm": 8.831650733947754, "learning_rate": 1.1990671881599e-05, "loss": 1.3938, "step": 47450 }, { "epoch": 7.843007643048956, "grad_norm": 8.44402027130127, "learning_rate": 1.198149066270038e-05, "loss": 1.4246, "step": 47460 }, { "epoch": 7.844660194174757, "grad_norm": 10.85715389251709, "learning_rate": 1.1972309443801761e-05, "loss": 1.3261, "step": 47470 }, { "epoch": 7.8463127453005574, "grad_norm": 17.19044303894043, "learning_rate": 1.1963128224903138e-05, "loss": 1.5413, "step": 47480 }, { "epoch": 7.847965296426358, "grad_norm": 7.725858211517334, "learning_rate": 1.1953947006004518e-05, "loss": 1.4338, "step": 47490 }, { "epoch": 7.8496178475521585, "grad_norm": 9.736605644226074, "learning_rate": 1.1944765787105897e-05, "loss": 1.2932, "step": 47500 }, { "epoch": 7.8512703986779595, "grad_norm": 14.61507511138916, "learning_rate": 1.1935584568207275e-05, "loss": 1.4587, "step": 47510 }, { "epoch": 7.85292294980376, "grad_norm": 12.031658172607422, "learning_rate": 1.1926403349308654e-05, "loss": 1.4079, "step": 47520 }, { "epoch": 7.85457550092956, "grad_norm": 16.91095542907715, "learning_rate": 1.1917222130410035e-05, "loss": 1.3452, "step": 47530 }, { "epoch": 7.856228052055361, "grad_norm": 15.591407775878906, "learning_rate": 1.1908040911511412e-05, "loss": 1.4318, "step": 47540 }, { "epoch": 7.857880603181161, "grad_norm": 15.299722671508789, "learning_rate": 1.1898859692612792e-05, "loss": 1.3371, "step": 47550 }, { "epoch": 7.859533154306962, "grad_norm": 8.688148498535156, "learning_rate": 1.188967847371417e-05, "loss": 1.4927, "step": 47560 }, { "epoch": 7.861185705432762, "grad_norm": 14.59116268157959, "learning_rate": 1.188049725481555e-05, "loss": 1.3456, "step": 47570 }, { "epoch": 7.862838256558562, "grad_norm": 30.086807250976562, "learning_rate": 1.187131603591693e-05, "loss": 1.2876, "step": 47580 }, { "epoch": 7.864490807684363, "grad_norm": 25.886877059936523, "learning_rate": 1.1862134817018308e-05, "loss": 1.491, "step": 47590 }, { "epoch": 7.866143358810163, "grad_norm": 11.027227401733398, "learning_rate": 1.1852953598119688e-05, "loss": 1.4246, "step": 47600 }, { "epoch": 7.867795909935964, "grad_norm": 10.795921325683594, "learning_rate": 1.1843772379221065e-05, "loss": 1.3966, "step": 47610 }, { "epoch": 7.869448461061764, "grad_norm": 9.404576301574707, "learning_rate": 1.1834591160322445e-05, "loss": 1.3629, "step": 47620 }, { "epoch": 7.871101012187564, "grad_norm": 11.95871353149414, "learning_rate": 1.1825409941423824e-05, "loss": 1.4068, "step": 47630 }, { "epoch": 7.872753563313365, "grad_norm": 11.290194511413574, "learning_rate": 1.1816228722525203e-05, "loss": 1.4184, "step": 47640 }, { "epoch": 7.874406114439165, "grad_norm": 32.461849212646484, "learning_rate": 1.1807047503626582e-05, "loss": 1.4621, "step": 47650 }, { "epoch": 7.876058665564966, "grad_norm": 10.396666526794434, "learning_rate": 1.1797866284727962e-05, "loss": 1.3957, "step": 47660 }, { "epoch": 7.877711216690766, "grad_norm": 12.307036399841309, "learning_rate": 1.1788685065829339e-05, "loss": 1.3876, "step": 47670 }, { "epoch": 7.879363767816566, "grad_norm": 10.1054048538208, "learning_rate": 1.1779503846930718e-05, "loss": 1.4137, "step": 47680 }, { "epoch": 7.881016318942367, "grad_norm": 16.99799346923828, "learning_rate": 1.1770322628032098e-05, "loss": 1.3882, "step": 47690 }, { "epoch": 7.8826688700681675, "grad_norm": 14.859070777893066, "learning_rate": 1.1761141409133477e-05, "loss": 1.3793, "step": 47700 }, { "epoch": 7.8843214211939685, "grad_norm": 13.93166446685791, "learning_rate": 1.1751960190234856e-05, "loss": 1.4492, "step": 47710 }, { "epoch": 7.885973972319769, "grad_norm": 12.025364875793457, "learning_rate": 1.1742778971336235e-05, "loss": 1.5158, "step": 47720 }, { "epoch": 7.887626523445569, "grad_norm": 15.031390190124512, "learning_rate": 1.1733597752437615e-05, "loss": 1.3162, "step": 47730 }, { "epoch": 7.88927907457137, "grad_norm": 13.956084251403809, "learning_rate": 1.1724416533538992e-05, "loss": 1.2951, "step": 47740 }, { "epoch": 7.89093162569717, "grad_norm": 12.361295700073242, "learning_rate": 1.1715235314640373e-05, "loss": 1.5406, "step": 47750 }, { "epoch": 7.892584176822971, "grad_norm": 8.774741172790527, "learning_rate": 1.1706054095741752e-05, "loss": 1.2487, "step": 47760 }, { "epoch": 7.894236727948771, "grad_norm": 31.390975952148438, "learning_rate": 1.169687287684313e-05, "loss": 1.3818, "step": 47770 }, { "epoch": 7.895889279074572, "grad_norm": 7.857132911682129, "learning_rate": 1.1687691657944509e-05, "loss": 1.4169, "step": 47780 }, { "epoch": 7.897541830200372, "grad_norm": 27.877517700195312, "learning_rate": 1.1678510439045888e-05, "loss": 1.4917, "step": 47790 }, { "epoch": 7.899194381326172, "grad_norm": 21.92901611328125, "learning_rate": 1.1669329220147266e-05, "loss": 1.4045, "step": 47800 }, { "epoch": 7.900846932451973, "grad_norm": 13.53458023071289, "learning_rate": 1.1660148001248647e-05, "loss": 1.4553, "step": 47810 }, { "epoch": 7.902499483577773, "grad_norm": 13.368227005004883, "learning_rate": 1.1650966782350026e-05, "loss": 1.3294, "step": 47820 }, { "epoch": 7.904152034703573, "grad_norm": 9.063852310180664, "learning_rate": 1.1641785563451404e-05, "loss": 1.4244, "step": 47830 }, { "epoch": 7.905804585829374, "grad_norm": 62.96859359741211, "learning_rate": 1.1632604344552783e-05, "loss": 1.3679, "step": 47840 }, { "epoch": 7.907457136955174, "grad_norm": 11.219415664672852, "learning_rate": 1.1623423125654162e-05, "loss": 1.4244, "step": 47850 }, { "epoch": 7.909109688080975, "grad_norm": 13.625591278076172, "learning_rate": 1.1614241906755541e-05, "loss": 1.4893, "step": 47860 }, { "epoch": 7.910762239206775, "grad_norm": 7.387325763702393, "learning_rate": 1.160506068785692e-05, "loss": 1.3991, "step": 47870 }, { "epoch": 7.912414790332576, "grad_norm": 11.21784782409668, "learning_rate": 1.15958794689583e-05, "loss": 1.3294, "step": 47880 }, { "epoch": 7.914067341458376, "grad_norm": 17.778669357299805, "learning_rate": 1.1586698250059679e-05, "loss": 1.4258, "step": 47890 }, { "epoch": 7.9157198925841765, "grad_norm": 12.979781150817871, "learning_rate": 1.1577517031161057e-05, "loss": 1.2635, "step": 47900 }, { "epoch": 7.9173724437099775, "grad_norm": 13.952737808227539, "learning_rate": 1.1568335812262436e-05, "loss": 1.4153, "step": 47910 }, { "epoch": 7.919024994835778, "grad_norm": 10.576740264892578, "learning_rate": 1.1559154593363815e-05, "loss": 1.455, "step": 47920 }, { "epoch": 7.920677545961578, "grad_norm": 9.399680137634277, "learning_rate": 1.1549973374465194e-05, "loss": 1.3299, "step": 47930 }, { "epoch": 7.922330097087379, "grad_norm": 18.13553237915039, "learning_rate": 1.1540792155566574e-05, "loss": 1.3659, "step": 47940 }, { "epoch": 7.923982648213179, "grad_norm": 17.09833335876465, "learning_rate": 1.1531610936667953e-05, "loss": 1.3799, "step": 47950 }, { "epoch": 7.92563519933898, "grad_norm": 11.579339027404785, "learning_rate": 1.152242971776933e-05, "loss": 1.4604, "step": 47960 }, { "epoch": 7.92728775046478, "grad_norm": 17.407424926757812, "learning_rate": 1.151324849887071e-05, "loss": 1.4101, "step": 47970 }, { "epoch": 7.928940301590581, "grad_norm": 17.47738265991211, "learning_rate": 1.150406727997209e-05, "loss": 1.3574, "step": 47980 }, { "epoch": 7.930592852716381, "grad_norm": 10.31318473815918, "learning_rate": 1.1494886061073468e-05, "loss": 1.457, "step": 47990 }, { "epoch": 7.932245403842181, "grad_norm": 14.748434066772461, "learning_rate": 1.1485704842174847e-05, "loss": 1.3569, "step": 48000 }, { "epoch": 7.933897954967982, "grad_norm": 10.245794296264648, "learning_rate": 1.1476523623276227e-05, "loss": 1.4359, "step": 48010 }, { "epoch": 7.935550506093782, "grad_norm": 14.848341941833496, "learning_rate": 1.1467342404377606e-05, "loss": 1.365, "step": 48020 }, { "epoch": 7.937203057219583, "grad_norm": 11.871755599975586, "learning_rate": 1.1458161185478984e-05, "loss": 1.4103, "step": 48030 }, { "epoch": 7.938855608345383, "grad_norm": 13.944067001342773, "learning_rate": 1.1448979966580364e-05, "loss": 1.4093, "step": 48040 }, { "epoch": 7.940508159471183, "grad_norm": 67.12018585205078, "learning_rate": 1.1439798747681744e-05, "loss": 1.3977, "step": 48050 }, { "epoch": 7.942160710596984, "grad_norm": 13.307299613952637, "learning_rate": 1.1430617528783121e-05, "loss": 1.2964, "step": 48060 }, { "epoch": 7.943813261722784, "grad_norm": 7.652238845825195, "learning_rate": 1.14214363098845e-05, "loss": 1.221, "step": 48070 }, { "epoch": 7.945465812848585, "grad_norm": 11.270803451538086, "learning_rate": 1.141225509098588e-05, "loss": 1.3174, "step": 48080 }, { "epoch": 7.947118363974385, "grad_norm": 16.249711990356445, "learning_rate": 1.1403073872087259e-05, "loss": 1.3507, "step": 48090 }, { "epoch": 7.9487709151001855, "grad_norm": 12.031152725219727, "learning_rate": 1.1393892653188638e-05, "loss": 1.4643, "step": 48100 }, { "epoch": 7.9504234662259865, "grad_norm": 9.80186939239502, "learning_rate": 1.1384711434290018e-05, "loss": 1.2806, "step": 48110 }, { "epoch": 7.952076017351787, "grad_norm": 8.662875175476074, "learning_rate": 1.1375530215391395e-05, "loss": 1.3472, "step": 48120 }, { "epoch": 7.953728568477588, "grad_norm": 10.309189796447754, "learning_rate": 1.1366348996492774e-05, "loss": 1.3387, "step": 48130 }, { "epoch": 7.955381119603388, "grad_norm": 16.453157424926758, "learning_rate": 1.1357167777594154e-05, "loss": 1.3604, "step": 48140 }, { "epoch": 7.957033670729189, "grad_norm": 16.2752628326416, "learning_rate": 1.1347986558695533e-05, "loss": 1.4941, "step": 48150 }, { "epoch": 7.958686221854989, "grad_norm": 9.665366172790527, "learning_rate": 1.1338805339796912e-05, "loss": 1.3735, "step": 48160 }, { "epoch": 7.960338772980789, "grad_norm": 12.39187240600586, "learning_rate": 1.1329624120898291e-05, "loss": 1.3077, "step": 48170 }, { "epoch": 7.96199132410659, "grad_norm": 12.81519603729248, "learning_rate": 1.132044290199967e-05, "loss": 1.3662, "step": 48180 }, { "epoch": 7.96364387523239, "grad_norm": 23.483640670776367, "learning_rate": 1.1311261683101048e-05, "loss": 1.4731, "step": 48190 }, { "epoch": 7.96529642635819, "grad_norm": 26.04778480529785, "learning_rate": 1.1302080464202427e-05, "loss": 1.5032, "step": 48200 }, { "epoch": 7.966948977483991, "grad_norm": 7.779325485229492, "learning_rate": 1.1292899245303808e-05, "loss": 1.412, "step": 48210 }, { "epoch": 7.968601528609791, "grad_norm": 9.814796447753906, "learning_rate": 1.1283718026405186e-05, "loss": 1.4677, "step": 48220 }, { "epoch": 7.970254079735592, "grad_norm": 12.185991287231445, "learning_rate": 1.1274536807506565e-05, "loss": 1.3061, "step": 48230 }, { "epoch": 7.971906630861392, "grad_norm": 15.006294250488281, "learning_rate": 1.1265355588607944e-05, "loss": 1.3275, "step": 48240 }, { "epoch": 7.973559181987193, "grad_norm": 12.14893627166748, "learning_rate": 1.1256174369709324e-05, "loss": 1.3091, "step": 48250 }, { "epoch": 7.975211733112993, "grad_norm": 14.654484748840332, "learning_rate": 1.1246993150810701e-05, "loss": 1.4328, "step": 48260 }, { "epoch": 7.976864284238793, "grad_norm": 13.88418960571289, "learning_rate": 1.1237811931912082e-05, "loss": 1.3212, "step": 48270 }, { "epoch": 7.978516835364594, "grad_norm": 8.910808563232422, "learning_rate": 1.122863071301346e-05, "loss": 1.2907, "step": 48280 }, { "epoch": 7.980169386490394, "grad_norm": 9.435023307800293, "learning_rate": 1.1219449494114839e-05, "loss": 1.3946, "step": 48290 }, { "epoch": 7.9818219376161945, "grad_norm": 23.680673599243164, "learning_rate": 1.1210268275216218e-05, "loss": 1.401, "step": 48300 }, { "epoch": 7.9834744887419955, "grad_norm": 14.382912635803223, "learning_rate": 1.1201087056317597e-05, "loss": 1.367, "step": 48310 }, { "epoch": 7.985127039867796, "grad_norm": 18.664398193359375, "learning_rate": 1.1191905837418977e-05, "loss": 1.4019, "step": 48320 }, { "epoch": 7.986779590993597, "grad_norm": 13.360762596130371, "learning_rate": 1.1182724618520356e-05, "loss": 1.5117, "step": 48330 }, { "epoch": 7.988432142119397, "grad_norm": 13.729475975036621, "learning_rate": 1.1173543399621735e-05, "loss": 1.5005, "step": 48340 }, { "epoch": 7.990084693245198, "grad_norm": 9.402355194091797, "learning_rate": 1.1164362180723113e-05, "loss": 1.4522, "step": 48350 }, { "epoch": 7.991737244370998, "grad_norm": 10.234284400939941, "learning_rate": 1.1155180961824492e-05, "loss": 1.4535, "step": 48360 }, { "epoch": 7.993389795496798, "grad_norm": 14.257830619812012, "learning_rate": 1.1145999742925871e-05, "loss": 1.3578, "step": 48370 }, { "epoch": 7.995042346622599, "grad_norm": 16.891815185546875, "learning_rate": 1.113681852402725e-05, "loss": 1.46, "step": 48380 }, { "epoch": 7.996694897748399, "grad_norm": 11.595968246459961, "learning_rate": 1.112763730512863e-05, "loss": 1.3274, "step": 48390 }, { "epoch": 7.9983474488742, "grad_norm": 12.209437370300293, "learning_rate": 1.1118456086230009e-05, "loss": 1.3268, "step": 48400 }, { "epoch": 8.0, "grad_norm": 10.217020988464355, "learning_rate": 1.1109274867331388e-05, "loss": 1.3077, "step": 48410 }, { "epoch": 8.0, "eval_accuracy": 0.3359696410838417, "eval_loss": 2.232658863067627, "eval_runtime": 833.0289, "eval_samples_per_second": 33.848, "eval_steps_per_second": 8.462, "step": 48410 }, { "epoch": 8.001652551125801, "grad_norm": 13.642919540405273, "learning_rate": 1.1100093648432766e-05, "loss": 1.2788, "step": 48420 }, { "epoch": 8.0033051022516, "grad_norm": 11.345486640930176, "learning_rate": 1.1090912429534145e-05, "loss": 1.3731, "step": 48430 }, { "epoch": 8.004957653377401, "grad_norm": 13.825784683227539, "learning_rate": 1.1081731210635524e-05, "loss": 1.398, "step": 48440 }, { "epoch": 8.006610204503202, "grad_norm": 22.2542667388916, "learning_rate": 1.1072549991736903e-05, "loss": 1.3321, "step": 48450 }, { "epoch": 8.008262755629003, "grad_norm": 12.9199800491333, "learning_rate": 1.1063368772838283e-05, "loss": 1.3493, "step": 48460 }, { "epoch": 8.009915306754802, "grad_norm": 9.398885726928711, "learning_rate": 1.1054187553939662e-05, "loss": 1.292, "step": 48470 }, { "epoch": 8.011567857880603, "grad_norm": 17.111080169677734, "learning_rate": 1.104500633504104e-05, "loss": 1.3765, "step": 48480 }, { "epoch": 8.013220409006404, "grad_norm": 268.5393981933594, "learning_rate": 1.1035825116142419e-05, "loss": 1.3245, "step": 48490 }, { "epoch": 8.014872960132204, "grad_norm": 11.041481971740723, "learning_rate": 1.10266438972438e-05, "loss": 1.3954, "step": 48500 }, { "epoch": 8.016525511258004, "grad_norm": 13.948131561279297, "learning_rate": 1.1017462678345177e-05, "loss": 1.2804, "step": 48510 }, { "epoch": 8.018178062383805, "grad_norm": 10.275325775146484, "learning_rate": 1.1008281459446557e-05, "loss": 1.4867, "step": 48520 }, { "epoch": 8.019830613509605, "grad_norm": 9.993414878845215, "learning_rate": 1.0999100240547936e-05, "loss": 1.4039, "step": 48530 }, { "epoch": 8.021483164635406, "grad_norm": 10.115413665771484, "learning_rate": 1.0989919021649315e-05, "loss": 1.2562, "step": 48540 }, { "epoch": 8.023135715761207, "grad_norm": 12.741392135620117, "learning_rate": 1.0980737802750694e-05, "loss": 1.1463, "step": 48550 }, { "epoch": 8.024788266887008, "grad_norm": 21.065656661987305, "learning_rate": 1.0971556583852073e-05, "loss": 1.3402, "step": 48560 }, { "epoch": 8.026440818012807, "grad_norm": 13.77077865600586, "learning_rate": 1.0962375364953453e-05, "loss": 1.4906, "step": 48570 }, { "epoch": 8.028093369138608, "grad_norm": 10.071250915527344, "learning_rate": 1.095319414605483e-05, "loss": 1.2456, "step": 48580 }, { "epoch": 8.029745920264409, "grad_norm": 41.76918029785156, "learning_rate": 1.094401292715621e-05, "loss": 1.3994, "step": 48590 }, { "epoch": 8.031398471390208, "grad_norm": 12.07766342163086, "learning_rate": 1.0934831708257589e-05, "loss": 1.5174, "step": 48600 }, { "epoch": 8.033051022516009, "grad_norm": 12.890829086303711, "learning_rate": 1.0925650489358968e-05, "loss": 1.398, "step": 48610 }, { "epoch": 8.03470357364181, "grad_norm": 18.99813461303711, "learning_rate": 1.0916469270460347e-05, "loss": 1.4505, "step": 48620 }, { "epoch": 8.03635612476761, "grad_norm": 13.51767635345459, "learning_rate": 1.0907288051561727e-05, "loss": 1.2901, "step": 48630 }, { "epoch": 8.03800867589341, "grad_norm": 11.078587532043457, "learning_rate": 1.0898106832663104e-05, "loss": 1.273, "step": 48640 }, { "epoch": 8.039661227019211, "grad_norm": 9.125107765197754, "learning_rate": 1.0888925613764483e-05, "loss": 1.2565, "step": 48650 }, { "epoch": 8.041313778145012, "grad_norm": 17.25629997253418, "learning_rate": 1.0879744394865863e-05, "loss": 1.4344, "step": 48660 }, { "epoch": 8.042966329270811, "grad_norm": 10.981740951538086, "learning_rate": 1.0870563175967242e-05, "loss": 1.4435, "step": 48670 }, { "epoch": 8.044618880396612, "grad_norm": 24.005199432373047, "learning_rate": 1.0861381957068621e-05, "loss": 1.3133, "step": 48680 }, { "epoch": 8.046271431522413, "grad_norm": 10.016214370727539, "learning_rate": 1.085220073817e-05, "loss": 1.3351, "step": 48690 }, { "epoch": 8.047923982648213, "grad_norm": 19.678804397583008, "learning_rate": 1.084301951927138e-05, "loss": 1.5142, "step": 48700 }, { "epoch": 8.049576533774013, "grad_norm": 12.911602020263672, "learning_rate": 1.0833838300372757e-05, "loss": 1.3535, "step": 48710 }, { "epoch": 8.051229084899814, "grad_norm": 28.89436149597168, "learning_rate": 1.0824657081474138e-05, "loss": 1.3765, "step": 48720 }, { "epoch": 8.052881636025614, "grad_norm": 9.610832214355469, "learning_rate": 1.0815475862575517e-05, "loss": 1.3013, "step": 48730 }, { "epoch": 8.054534187151415, "grad_norm": 10.988263130187988, "learning_rate": 1.0806294643676895e-05, "loss": 1.4339, "step": 48740 }, { "epoch": 8.056186738277216, "grad_norm": 8.778138160705566, "learning_rate": 1.0797113424778274e-05, "loss": 1.4137, "step": 48750 }, { "epoch": 8.057839289403017, "grad_norm": 14.911858558654785, "learning_rate": 1.0787932205879653e-05, "loss": 1.4565, "step": 48760 }, { "epoch": 8.059491840528816, "grad_norm": 11.017518043518066, "learning_rate": 1.0778750986981031e-05, "loss": 1.2947, "step": 48770 }, { "epoch": 8.061144391654617, "grad_norm": 10.435925483703613, "learning_rate": 1.0769569768082412e-05, "loss": 1.2544, "step": 48780 }, { "epoch": 8.062796942780418, "grad_norm": 15.238306999206543, "learning_rate": 1.0760388549183791e-05, "loss": 1.4135, "step": 48790 }, { "epoch": 8.064449493906217, "grad_norm": 19.062891006469727, "learning_rate": 1.0751207330285169e-05, "loss": 1.375, "step": 48800 }, { "epoch": 8.066102045032018, "grad_norm": 32.4384651184082, "learning_rate": 1.0742026111386548e-05, "loss": 1.5354, "step": 48810 }, { "epoch": 8.067754596157819, "grad_norm": 12.400315284729004, "learning_rate": 1.0732844892487927e-05, "loss": 1.3206, "step": 48820 }, { "epoch": 8.06940714728362, "grad_norm": 21.19147491455078, "learning_rate": 1.0723663673589306e-05, "loss": 1.4587, "step": 48830 }, { "epoch": 8.07105969840942, "grad_norm": 14.054428100585938, "learning_rate": 1.0714482454690686e-05, "loss": 1.3973, "step": 48840 }, { "epoch": 8.07271224953522, "grad_norm": 9.821606636047363, "learning_rate": 1.0705301235792065e-05, "loss": 1.3624, "step": 48850 }, { "epoch": 8.074364800661021, "grad_norm": 17.55989646911621, "learning_rate": 1.0696120016893444e-05, "loss": 1.4144, "step": 48860 }, { "epoch": 8.07601735178682, "grad_norm": 14.98707103729248, "learning_rate": 1.0686938797994822e-05, "loss": 1.3011, "step": 48870 }, { "epoch": 8.077669902912621, "grad_norm": 20.7563419342041, "learning_rate": 1.0677757579096201e-05, "loss": 1.282, "step": 48880 }, { "epoch": 8.079322454038422, "grad_norm": 12.075446128845215, "learning_rate": 1.066857636019758e-05, "loss": 1.3504, "step": 48890 }, { "epoch": 8.080975005164222, "grad_norm": 41.509918212890625, "learning_rate": 1.065939514129896e-05, "loss": 1.2301, "step": 48900 }, { "epoch": 8.082627556290022, "grad_norm": 14.281842231750488, "learning_rate": 1.0650213922400339e-05, "loss": 1.3586, "step": 48910 }, { "epoch": 8.084280107415823, "grad_norm": 11.7456693649292, "learning_rate": 1.0641032703501718e-05, "loss": 1.3069, "step": 48920 }, { "epoch": 8.085932658541624, "grad_norm": 19.099252700805664, "learning_rate": 1.0631851484603095e-05, "loss": 1.4793, "step": 48930 }, { "epoch": 8.087585209667424, "grad_norm": 24.131771087646484, "learning_rate": 1.0622670265704475e-05, "loss": 1.2569, "step": 48940 }, { "epoch": 8.089237760793225, "grad_norm": 11.209100723266602, "learning_rate": 1.0613489046805856e-05, "loss": 1.2394, "step": 48950 }, { "epoch": 8.090890311919026, "grad_norm": 13.452157020568848, "learning_rate": 1.0604307827907233e-05, "loss": 1.2686, "step": 48960 }, { "epoch": 8.092542863044825, "grad_norm": 15.363450050354004, "learning_rate": 1.0595126609008612e-05, "loss": 1.4572, "step": 48970 }, { "epoch": 8.094195414170626, "grad_norm": 10.773612022399902, "learning_rate": 1.0585945390109992e-05, "loss": 1.3411, "step": 48980 }, { "epoch": 8.095847965296427, "grad_norm": 142.82435607910156, "learning_rate": 1.0576764171211371e-05, "loss": 1.3155, "step": 48990 }, { "epoch": 8.097500516422226, "grad_norm": 67.02763366699219, "learning_rate": 1.0567582952312749e-05, "loss": 1.39, "step": 49000 }, { "epoch": 8.099153067548027, "grad_norm": 38.37979507446289, "learning_rate": 1.055840173341413e-05, "loss": 1.3099, "step": 49010 }, { "epoch": 8.100805618673828, "grad_norm": 12.719371795654297, "learning_rate": 1.0549220514515509e-05, "loss": 1.3926, "step": 49020 }, { "epoch": 8.102458169799629, "grad_norm": 18.666179656982422, "learning_rate": 1.0540039295616886e-05, "loss": 1.3513, "step": 49030 }, { "epoch": 8.104110720925428, "grad_norm": 11.054634094238281, "learning_rate": 1.0530858076718266e-05, "loss": 1.3234, "step": 49040 }, { "epoch": 8.10576327205123, "grad_norm": 14.501553535461426, "learning_rate": 1.0521676857819645e-05, "loss": 1.352, "step": 49050 }, { "epoch": 8.10741582317703, "grad_norm": 12.57413101196289, "learning_rate": 1.0512495638921022e-05, "loss": 1.2829, "step": 49060 }, { "epoch": 8.10906837430283, "grad_norm": 19.41373062133789, "learning_rate": 1.0503314420022403e-05, "loss": 1.2808, "step": 49070 }, { "epoch": 8.11072092542863, "grad_norm": 12.489398956298828, "learning_rate": 1.0494133201123783e-05, "loss": 1.3782, "step": 49080 }, { "epoch": 8.112373476554431, "grad_norm": 17.002321243286133, "learning_rate": 1.048495198222516e-05, "loss": 1.4326, "step": 49090 }, { "epoch": 8.11402602768023, "grad_norm": 10.533157348632812, "learning_rate": 1.047577076332654e-05, "loss": 1.2274, "step": 49100 }, { "epoch": 8.115678578806031, "grad_norm": 14.494053840637207, "learning_rate": 1.0466589544427919e-05, "loss": 1.3649, "step": 49110 }, { "epoch": 8.117331129931832, "grad_norm": 8.390114784240723, "learning_rate": 1.0457408325529298e-05, "loss": 1.3052, "step": 49120 }, { "epoch": 8.118983681057633, "grad_norm": 23.97467803955078, "learning_rate": 1.0448227106630677e-05, "loss": 1.3362, "step": 49130 }, { "epoch": 8.120636232183433, "grad_norm": 18.547996520996094, "learning_rate": 1.0439045887732056e-05, "loss": 1.3471, "step": 49140 }, { "epoch": 8.122288783309234, "grad_norm": 17.564769744873047, "learning_rate": 1.0429864668833436e-05, "loss": 1.3934, "step": 49150 }, { "epoch": 8.123941334435035, "grad_norm": 25.809839248657227, "learning_rate": 1.0420683449934813e-05, "loss": 1.5178, "step": 49160 }, { "epoch": 8.125593885560834, "grad_norm": 24.299781799316406, "learning_rate": 1.0411502231036192e-05, "loss": 1.352, "step": 49170 }, { "epoch": 8.127246436686635, "grad_norm": 13.150251388549805, "learning_rate": 1.0402321012137573e-05, "loss": 1.4525, "step": 49180 }, { "epoch": 8.128898987812436, "grad_norm": 14.097673416137695, "learning_rate": 1.0393139793238951e-05, "loss": 1.4137, "step": 49190 }, { "epoch": 8.130551538938235, "grad_norm": 13.112128257751465, "learning_rate": 1.038395857434033e-05, "loss": 1.2533, "step": 49200 }, { "epoch": 8.132204090064036, "grad_norm": 12.603325843811035, "learning_rate": 1.037477735544171e-05, "loss": 1.4444, "step": 49210 }, { "epoch": 8.133856641189837, "grad_norm": 41.19782638549805, "learning_rate": 1.0365596136543087e-05, "loss": 1.45, "step": 49220 }, { "epoch": 8.135509192315638, "grad_norm": 15.660469055175781, "learning_rate": 1.0356414917644466e-05, "loss": 1.436, "step": 49230 }, { "epoch": 8.137161743441437, "grad_norm": 12.267075538635254, "learning_rate": 1.0347233698745847e-05, "loss": 1.4064, "step": 49240 }, { "epoch": 8.138814294567238, "grad_norm": 12.525456428527832, "learning_rate": 1.0338052479847225e-05, "loss": 1.3886, "step": 49250 }, { "epoch": 8.14046684569304, "grad_norm": 12.728673934936523, "learning_rate": 1.0328871260948604e-05, "loss": 1.3165, "step": 49260 }, { "epoch": 8.142119396818838, "grad_norm": 14.614742279052734, "learning_rate": 1.0319690042049983e-05, "loss": 1.2583, "step": 49270 }, { "epoch": 8.14377194794464, "grad_norm": 14.37644100189209, "learning_rate": 1.0310508823151362e-05, "loss": 1.2297, "step": 49280 }, { "epoch": 8.14542449907044, "grad_norm": 10.755502700805664, "learning_rate": 1.030132760425274e-05, "loss": 1.3726, "step": 49290 }, { "epoch": 8.147077050196241, "grad_norm": 156.68991088867188, "learning_rate": 1.0292146385354121e-05, "loss": 1.4638, "step": 49300 }, { "epoch": 8.14872960132204, "grad_norm": 16.2905330657959, "learning_rate": 1.02829651664555e-05, "loss": 1.3056, "step": 49310 }, { "epoch": 8.150382152447841, "grad_norm": 12.603002548217773, "learning_rate": 1.0273783947556878e-05, "loss": 1.4877, "step": 49320 }, { "epoch": 8.152034703573642, "grad_norm": 14.49899673461914, "learning_rate": 1.0264602728658257e-05, "loss": 1.3918, "step": 49330 }, { "epoch": 8.153687254699442, "grad_norm": 12.177115440368652, "learning_rate": 1.0255421509759636e-05, "loss": 1.2091, "step": 49340 }, { "epoch": 8.155339805825243, "grad_norm": 9.79865550994873, "learning_rate": 1.0246240290861015e-05, "loss": 1.2554, "step": 49350 }, { "epoch": 8.156992356951044, "grad_norm": 8.341238975524902, "learning_rate": 1.0237059071962395e-05, "loss": 1.2596, "step": 49360 }, { "epoch": 8.158644908076843, "grad_norm": 15.849066734313965, "learning_rate": 1.0227877853063774e-05, "loss": 1.2234, "step": 49370 }, { "epoch": 8.160297459202644, "grad_norm": 8.827754974365234, "learning_rate": 1.0218696634165151e-05, "loss": 1.3488, "step": 49380 }, { "epoch": 8.161950010328445, "grad_norm": 10.364538192749023, "learning_rate": 1.020951541526653e-05, "loss": 1.356, "step": 49390 }, { "epoch": 8.163602561454246, "grad_norm": 33.56261444091797, "learning_rate": 1.020033419636791e-05, "loss": 1.3118, "step": 49400 }, { "epoch": 8.165255112580045, "grad_norm": 21.711645126342773, "learning_rate": 1.019115297746929e-05, "loss": 1.2635, "step": 49410 }, { "epoch": 8.166907663705846, "grad_norm": 11.296979904174805, "learning_rate": 1.0181971758570668e-05, "loss": 1.3455, "step": 49420 }, { "epoch": 8.168560214831647, "grad_norm": 22.215770721435547, "learning_rate": 1.0172790539672048e-05, "loss": 1.29, "step": 49430 }, { "epoch": 8.170212765957446, "grad_norm": 16.198862075805664, "learning_rate": 1.0163609320773427e-05, "loss": 1.4448, "step": 49440 }, { "epoch": 8.171865317083247, "grad_norm": 15.688820838928223, "learning_rate": 1.0154428101874805e-05, "loss": 1.4095, "step": 49450 }, { "epoch": 8.173517868209048, "grad_norm": 18.601253509521484, "learning_rate": 1.0145246882976184e-05, "loss": 1.3167, "step": 49460 }, { "epoch": 8.175170419334847, "grad_norm": 14.552800178527832, "learning_rate": 1.0136065664077565e-05, "loss": 1.3402, "step": 49470 }, { "epoch": 8.176822970460648, "grad_norm": 25.716659545898438, "learning_rate": 1.0126884445178942e-05, "loss": 1.2686, "step": 49480 }, { "epoch": 8.17847552158645, "grad_norm": 14.601028442382812, "learning_rate": 1.0117703226280322e-05, "loss": 1.3386, "step": 49490 }, { "epoch": 8.18012807271225, "grad_norm": 19.567943572998047, "learning_rate": 1.01085220073817e-05, "loss": 1.3595, "step": 49500 }, { "epoch": 8.18178062383805, "grad_norm": 16.065189361572266, "learning_rate": 1.0099340788483078e-05, "loss": 1.4657, "step": 49510 }, { "epoch": 8.18343317496385, "grad_norm": 21.156858444213867, "learning_rate": 1.009015956958446e-05, "loss": 1.3717, "step": 49520 }, { "epoch": 8.185085726089651, "grad_norm": 9.819899559020996, "learning_rate": 1.0080978350685838e-05, "loss": 1.3775, "step": 49530 }, { "epoch": 8.18673827721545, "grad_norm": 20.710819244384766, "learning_rate": 1.0071797131787216e-05, "loss": 1.4204, "step": 49540 }, { "epoch": 8.188390828341252, "grad_norm": 8.26885986328125, "learning_rate": 1.0062615912888595e-05, "loss": 1.3977, "step": 49550 }, { "epoch": 8.190043379467053, "grad_norm": 23.709930419921875, "learning_rate": 1.0053434693989975e-05, "loss": 1.3835, "step": 49560 }, { "epoch": 8.191695930592854, "grad_norm": 13.372426986694336, "learning_rate": 1.0044253475091354e-05, "loss": 1.3282, "step": 49570 }, { "epoch": 8.193348481718653, "grad_norm": 14.474409103393555, "learning_rate": 1.0035072256192733e-05, "loss": 1.4683, "step": 49580 }, { "epoch": 8.195001032844454, "grad_norm": 19.034423828125, "learning_rate": 1.0025891037294112e-05, "loss": 1.5369, "step": 49590 }, { "epoch": 8.196653583970255, "grad_norm": 13.448150634765625, "learning_rate": 1.0016709818395492e-05, "loss": 1.3128, "step": 49600 }, { "epoch": 8.198306135096054, "grad_norm": 16.181913375854492, "learning_rate": 1.0007528599496869e-05, "loss": 1.2877, "step": 49610 }, { "epoch": 8.199958686221855, "grad_norm": 10.915853500366211, "learning_rate": 9.998347380598248e-06, "loss": 1.237, "step": 49620 }, { "epoch": 8.201611237347656, "grad_norm": 11.362007141113281, "learning_rate": 9.989166161699628e-06, "loss": 1.5008, "step": 49630 }, { "epoch": 8.203263788473455, "grad_norm": 11.269021034240723, "learning_rate": 9.979984942801007e-06, "loss": 1.2711, "step": 49640 }, { "epoch": 8.204916339599256, "grad_norm": 18.16535758972168, "learning_rate": 9.970803723902386e-06, "loss": 1.3525, "step": 49650 }, { "epoch": 8.206568890725057, "grad_norm": 16.32088279724121, "learning_rate": 9.961622505003765e-06, "loss": 1.2825, "step": 49660 }, { "epoch": 8.208221441850858, "grad_norm": 10.657417297363281, "learning_rate": 9.952441286105143e-06, "loss": 1.4107, "step": 49670 }, { "epoch": 8.209873992976657, "grad_norm": 8.144217491149902, "learning_rate": 9.943260067206522e-06, "loss": 1.3751, "step": 49680 }, { "epoch": 8.211526544102458, "grad_norm": 11.399767875671387, "learning_rate": 9.934078848307901e-06, "loss": 1.3849, "step": 49690 }, { "epoch": 8.21317909522826, "grad_norm": 12.942078590393066, "learning_rate": 9.92489762940928e-06, "loss": 1.3903, "step": 49700 }, { "epoch": 8.214831646354058, "grad_norm": 14.669422149658203, "learning_rate": 9.91571641051066e-06, "loss": 1.2849, "step": 49710 }, { "epoch": 8.21648419747986, "grad_norm": 14.220985412597656, "learning_rate": 9.906535191612039e-06, "loss": 1.2295, "step": 49720 }, { "epoch": 8.21813674860566, "grad_norm": 15.08796215057373, "learning_rate": 9.897353972713418e-06, "loss": 1.2791, "step": 49730 }, { "epoch": 8.21978929973146, "grad_norm": 19.543704986572266, "learning_rate": 9.888172753814796e-06, "loss": 1.2695, "step": 49740 }, { "epoch": 8.22144185085726, "grad_norm": 12.316389083862305, "learning_rate": 9.878991534916177e-06, "loss": 1.4851, "step": 49750 }, { "epoch": 8.223094401983062, "grad_norm": 15.752791404724121, "learning_rate": 9.869810316017556e-06, "loss": 1.4293, "step": 49760 }, { "epoch": 8.224746953108863, "grad_norm": 12.644379615783691, "learning_rate": 9.860629097118934e-06, "loss": 1.2978, "step": 49770 }, { "epoch": 8.226399504234662, "grad_norm": 22.362035751342773, "learning_rate": 9.851447878220313e-06, "loss": 1.3376, "step": 49780 }, { "epoch": 8.228052055360463, "grad_norm": 12.713665008544922, "learning_rate": 9.842266659321692e-06, "loss": 1.4388, "step": 49790 }, { "epoch": 8.229704606486264, "grad_norm": 12.972372055053711, "learning_rate": 9.83308544042307e-06, "loss": 1.5245, "step": 49800 }, { "epoch": 8.231357157612063, "grad_norm": 18.19927215576172, "learning_rate": 9.82390422152445e-06, "loss": 1.3927, "step": 49810 }, { "epoch": 8.233009708737864, "grad_norm": 36.67835998535156, "learning_rate": 9.81472300262583e-06, "loss": 1.3945, "step": 49820 }, { "epoch": 8.234662259863665, "grad_norm": 13.975602149963379, "learning_rate": 9.805541783727207e-06, "loss": 1.5244, "step": 49830 }, { "epoch": 8.236314810989464, "grad_norm": 11.509796142578125, "learning_rate": 9.796360564828587e-06, "loss": 1.3487, "step": 49840 }, { "epoch": 8.237967362115265, "grad_norm": 13.768054008483887, "learning_rate": 9.787179345929966e-06, "loss": 1.3423, "step": 49850 }, { "epoch": 8.239619913241066, "grad_norm": 24.76049041748047, "learning_rate": 9.777998127031345e-06, "loss": 1.3772, "step": 49860 }, { "epoch": 8.241272464366867, "grad_norm": 19.85248374938965, "learning_rate": 9.768816908132724e-06, "loss": 1.364, "step": 49870 }, { "epoch": 8.242925015492666, "grad_norm": 10.143357276916504, "learning_rate": 9.759635689234104e-06, "loss": 1.3958, "step": 49880 }, { "epoch": 8.244577566618467, "grad_norm": 11.001799583435059, "learning_rate": 9.750454470335483e-06, "loss": 1.3621, "step": 49890 }, { "epoch": 8.246230117744268, "grad_norm": 9.355253219604492, "learning_rate": 9.74127325143686e-06, "loss": 1.2735, "step": 49900 }, { "epoch": 8.247882668870067, "grad_norm": 31.860811233520508, "learning_rate": 9.73209203253824e-06, "loss": 1.3465, "step": 49910 }, { "epoch": 8.249535219995868, "grad_norm": 9.52219295501709, "learning_rate": 9.72291081363962e-06, "loss": 1.2591, "step": 49920 }, { "epoch": 8.25118777112167, "grad_norm": 16.92643165588379, "learning_rate": 9.713729594740998e-06, "loss": 1.5207, "step": 49930 }, { "epoch": 8.252840322247469, "grad_norm": 24.846473693847656, "learning_rate": 9.704548375842377e-06, "loss": 1.1995, "step": 49940 }, { "epoch": 8.25449287337327, "grad_norm": 18.698997497558594, "learning_rate": 9.695367156943757e-06, "loss": 1.409, "step": 49950 }, { "epoch": 8.25614542449907, "grad_norm": 9.128297805786133, "learning_rate": 9.686185938045134e-06, "loss": 1.4773, "step": 49960 }, { "epoch": 8.257797975624872, "grad_norm": 8.733552932739258, "learning_rate": 9.677004719146514e-06, "loss": 1.3389, "step": 49970 }, { "epoch": 8.25945052675067, "grad_norm": 14.439172744750977, "learning_rate": 9.667823500247894e-06, "loss": 1.4054, "step": 49980 }, { "epoch": 8.261103077876472, "grad_norm": 25.46586036682129, "learning_rate": 9.658642281349272e-06, "loss": 1.385, "step": 49990 }, { "epoch": 8.262755629002273, "grad_norm": 15.000371932983398, "learning_rate": 9.649461062450651e-06, "loss": 1.3654, "step": 50000 }, { "epoch": 8.264408180128072, "grad_norm": 15.230158805847168, "learning_rate": 9.64027984355203e-06, "loss": 1.2959, "step": 50010 }, { "epoch": 8.266060731253873, "grad_norm": 20.123403549194336, "learning_rate": 9.63109862465341e-06, "loss": 1.4118, "step": 50020 }, { "epoch": 8.267713282379674, "grad_norm": 9.338974952697754, "learning_rate": 9.621917405754787e-06, "loss": 1.2685, "step": 50030 }, { "epoch": 8.269365833505475, "grad_norm": 18.35788917541504, "learning_rate": 9.612736186856168e-06, "loss": 1.3901, "step": 50040 }, { "epoch": 8.271018384631274, "grad_norm": 22.194643020629883, "learning_rate": 9.603554967957548e-06, "loss": 1.278, "step": 50050 }, { "epoch": 8.272670935757075, "grad_norm": 10.933758735656738, "learning_rate": 9.594373749058925e-06, "loss": 1.4638, "step": 50060 }, { "epoch": 8.274323486882876, "grad_norm": 13.151747703552246, "learning_rate": 9.585192530160304e-06, "loss": 1.4557, "step": 50070 }, { "epoch": 8.275976038008675, "grad_norm": 16.250370025634766, "learning_rate": 9.576011311261684e-06, "loss": 1.3638, "step": 50080 }, { "epoch": 8.277628589134476, "grad_norm": 14.143457412719727, "learning_rate": 9.566830092363063e-06, "loss": 1.2704, "step": 50090 }, { "epoch": 8.279281140260277, "grad_norm": 15.831094741821289, "learning_rate": 9.557648873464442e-06, "loss": 1.4091, "step": 50100 }, { "epoch": 8.280933691386076, "grad_norm": 11.630001068115234, "learning_rate": 9.548467654565821e-06, "loss": 1.3446, "step": 50110 }, { "epoch": 8.282586242511877, "grad_norm": 9.2128324508667, "learning_rate": 9.539286435667199e-06, "loss": 1.2162, "step": 50120 }, { "epoch": 8.284238793637678, "grad_norm": 16.4133243560791, "learning_rate": 9.530105216768578e-06, "loss": 1.3735, "step": 50130 }, { "epoch": 8.28589134476348, "grad_norm": 8.486006736755371, "learning_rate": 9.520923997869957e-06, "loss": 1.2581, "step": 50140 }, { "epoch": 8.287543895889279, "grad_norm": 25.598995208740234, "learning_rate": 9.511742778971337e-06, "loss": 1.2991, "step": 50150 }, { "epoch": 8.28919644701508, "grad_norm": 13.834522247314453, "learning_rate": 9.502561560072716e-06, "loss": 1.4145, "step": 50160 }, { "epoch": 8.29084899814088, "grad_norm": 13.25239086151123, "learning_rate": 9.493380341174095e-06, "loss": 1.4416, "step": 50170 }, { "epoch": 8.29250154926668, "grad_norm": 15.901601791381836, "learning_rate": 9.484199122275474e-06, "loss": 1.4722, "step": 50180 }, { "epoch": 8.29415410039248, "grad_norm": 12.437292098999023, "learning_rate": 9.475017903376852e-06, "loss": 1.3519, "step": 50190 }, { "epoch": 8.295806651518282, "grad_norm": 13.174797058105469, "learning_rate": 9.465836684478231e-06, "loss": 1.3125, "step": 50200 }, { "epoch": 8.297459202644081, "grad_norm": 18.76152229309082, "learning_rate": 9.456655465579612e-06, "loss": 1.3622, "step": 50210 }, { "epoch": 8.299111753769882, "grad_norm": 13.118831634521484, "learning_rate": 9.44747424668099e-06, "loss": 1.3004, "step": 50220 }, { "epoch": 8.300764304895683, "grad_norm": 18.371763229370117, "learning_rate": 9.438293027782369e-06, "loss": 1.3731, "step": 50230 }, { "epoch": 8.302416856021484, "grad_norm": 12.924381256103516, "learning_rate": 9.429111808883748e-06, "loss": 1.3149, "step": 50240 }, { "epoch": 8.304069407147283, "grad_norm": 10.467041015625, "learning_rate": 9.419930589985126e-06, "loss": 1.3262, "step": 50250 }, { "epoch": 8.305721958273084, "grad_norm": 18.903898239135742, "learning_rate": 9.410749371086505e-06, "loss": 1.2989, "step": 50260 }, { "epoch": 8.307374509398885, "grad_norm": 13.446491241455078, "learning_rate": 9.401568152187886e-06, "loss": 1.4296, "step": 50270 }, { "epoch": 8.309027060524684, "grad_norm": 14.688339233398438, "learning_rate": 9.392386933289263e-06, "loss": 1.2989, "step": 50280 }, { "epoch": 8.310679611650485, "grad_norm": 17.203166961669922, "learning_rate": 9.383205714390643e-06, "loss": 1.3036, "step": 50290 }, { "epoch": 8.312332162776286, "grad_norm": 83.26315307617188, "learning_rate": 9.374024495492022e-06, "loss": 1.3526, "step": 50300 }, { "epoch": 8.313984713902085, "grad_norm": 23.8687686920166, "learning_rate": 9.364843276593401e-06, "loss": 1.2074, "step": 50310 }, { "epoch": 8.315637265027886, "grad_norm": 13.990741729736328, "learning_rate": 9.35566205769478e-06, "loss": 1.3218, "step": 50320 }, { "epoch": 8.317289816153687, "grad_norm": 34.39853286743164, "learning_rate": 9.34648083879616e-06, "loss": 1.338, "step": 50330 }, { "epoch": 8.318942367279488, "grad_norm": 22.453739166259766, "learning_rate": 9.337299619897539e-06, "loss": 1.4889, "step": 50340 }, { "epoch": 8.320594918405288, "grad_norm": 10.095982551574707, "learning_rate": 9.328118400998916e-06, "loss": 1.336, "step": 50350 }, { "epoch": 8.322247469531089, "grad_norm": 74.14069366455078, "learning_rate": 9.318937182100296e-06, "loss": 1.4469, "step": 50360 }, { "epoch": 8.32390002065689, "grad_norm": 23.241098403930664, "learning_rate": 9.309755963201675e-06, "loss": 1.4088, "step": 50370 }, { "epoch": 8.325552571782689, "grad_norm": 9.130888938903809, "learning_rate": 9.300574744303054e-06, "loss": 1.3539, "step": 50380 }, { "epoch": 8.32720512290849, "grad_norm": 28.220802307128906, "learning_rate": 9.291393525404433e-06, "loss": 1.4253, "step": 50390 }, { "epoch": 8.32885767403429, "grad_norm": 14.703250885009766, "learning_rate": 9.282212306505813e-06, "loss": 1.2751, "step": 50400 }, { "epoch": 8.33051022516009, "grad_norm": 17.532804489135742, "learning_rate": 9.27303108760719e-06, "loss": 1.3413, "step": 50410 }, { "epoch": 8.332162776285891, "grad_norm": 11.652085304260254, "learning_rate": 9.26384986870857e-06, "loss": 1.2881, "step": 50420 }, { "epoch": 8.333815327411692, "grad_norm": 16.633052825927734, "learning_rate": 9.254668649809949e-06, "loss": 1.3163, "step": 50430 }, { "epoch": 8.335467878537493, "grad_norm": 17.8756046295166, "learning_rate": 9.245487430911328e-06, "loss": 1.2875, "step": 50440 }, { "epoch": 8.337120429663292, "grad_norm": 14.60150146484375, "learning_rate": 9.236306212012707e-06, "loss": 1.3227, "step": 50450 }, { "epoch": 8.338772980789093, "grad_norm": 10.519417762756348, "learning_rate": 9.227124993114087e-06, "loss": 1.1588, "step": 50460 }, { "epoch": 8.340425531914894, "grad_norm": 12.479300498962402, "learning_rate": 9.217943774215466e-06, "loss": 1.2749, "step": 50470 }, { "epoch": 8.342078083040693, "grad_norm": 15.454779624938965, "learning_rate": 9.208762555316843e-06, "loss": 1.4351, "step": 50480 }, { "epoch": 8.343730634166494, "grad_norm": 22.32270050048828, "learning_rate": 9.199581336418224e-06, "loss": 1.242, "step": 50490 }, { "epoch": 8.345383185292295, "grad_norm": 12.450278282165527, "learning_rate": 9.190400117519603e-06, "loss": 1.3128, "step": 50500 }, { "epoch": 8.347035736418096, "grad_norm": 12.18801498413086, "learning_rate": 9.181218898620981e-06, "loss": 1.3458, "step": 50510 }, { "epoch": 8.348688287543895, "grad_norm": 11.80329704284668, "learning_rate": 9.17203767972236e-06, "loss": 1.3676, "step": 50520 }, { "epoch": 8.350340838669696, "grad_norm": 16.792072296142578, "learning_rate": 9.16285646082374e-06, "loss": 1.3725, "step": 50530 }, { "epoch": 8.351993389795497, "grad_norm": 20.232120513916016, "learning_rate": 9.153675241925117e-06, "loss": 1.2889, "step": 50540 }, { "epoch": 8.353645940921297, "grad_norm": 10.46639347076416, "learning_rate": 9.144494023026498e-06, "loss": 1.3758, "step": 50550 }, { "epoch": 8.355298492047098, "grad_norm": 11.493830680847168, "learning_rate": 9.135312804127877e-06, "loss": 1.3593, "step": 50560 }, { "epoch": 8.356951043172899, "grad_norm": 9.20073413848877, "learning_rate": 9.126131585229255e-06, "loss": 1.3475, "step": 50570 }, { "epoch": 8.358603594298698, "grad_norm": 19.450620651245117, "learning_rate": 9.116950366330634e-06, "loss": 1.4109, "step": 50580 }, { "epoch": 8.360256145424499, "grad_norm": 13.948092460632324, "learning_rate": 9.107769147432013e-06, "loss": 1.3133, "step": 50590 }, { "epoch": 8.3619086965503, "grad_norm": 26.458175659179688, "learning_rate": 9.098587928533393e-06, "loss": 1.5351, "step": 50600 }, { "epoch": 8.3635612476761, "grad_norm": 11.807082176208496, "learning_rate": 9.089406709634772e-06, "loss": 1.3635, "step": 50610 }, { "epoch": 8.3652137988019, "grad_norm": 13.108859062194824, "learning_rate": 9.080225490736151e-06, "loss": 1.411, "step": 50620 }, { "epoch": 8.366866349927701, "grad_norm": 14.530641555786133, "learning_rate": 9.07104427183753e-06, "loss": 1.4175, "step": 50630 }, { "epoch": 8.368518901053502, "grad_norm": 21.881153106689453, "learning_rate": 9.061863052938908e-06, "loss": 1.4125, "step": 50640 }, { "epoch": 8.370171452179301, "grad_norm": 15.157549858093262, "learning_rate": 9.052681834040287e-06, "loss": 1.2924, "step": 50650 }, { "epoch": 8.371824003305102, "grad_norm": 15.397344589233398, "learning_rate": 9.043500615141666e-06, "loss": 1.2743, "step": 50660 }, { "epoch": 8.373476554430903, "grad_norm": 13.956780433654785, "learning_rate": 9.034319396243046e-06, "loss": 1.2906, "step": 50670 }, { "epoch": 8.375129105556702, "grad_norm": 15.764337539672852, "learning_rate": 9.025138177344425e-06, "loss": 1.362, "step": 50680 }, { "epoch": 8.376781656682503, "grad_norm": 18.606061935424805, "learning_rate": 9.015956958445804e-06, "loss": 1.3092, "step": 50690 }, { "epoch": 8.378434207808304, "grad_norm": 17.503150939941406, "learning_rate": 9.006775739547182e-06, "loss": 1.3202, "step": 50700 }, { "epoch": 8.380086758934105, "grad_norm": 7.5358967781066895, "learning_rate": 8.997594520648561e-06, "loss": 1.3524, "step": 50710 }, { "epoch": 8.381739310059904, "grad_norm": 19.432466506958008, "learning_rate": 8.988413301749942e-06, "loss": 1.344, "step": 50720 }, { "epoch": 8.383391861185705, "grad_norm": 10.042672157287598, "learning_rate": 8.97923208285132e-06, "loss": 1.3504, "step": 50730 }, { "epoch": 8.385044412311506, "grad_norm": 14.225571632385254, "learning_rate": 8.970050863952699e-06, "loss": 1.2284, "step": 50740 }, { "epoch": 8.386696963437306, "grad_norm": 11.462421417236328, "learning_rate": 8.960869645054078e-06, "loss": 1.371, "step": 50750 }, { "epoch": 8.388349514563107, "grad_norm": 9.020613670349121, "learning_rate": 8.951688426155457e-06, "loss": 1.262, "step": 50760 }, { "epoch": 8.390002065688908, "grad_norm": 17.618812561035156, "learning_rate": 8.942507207256835e-06, "loss": 1.393, "step": 50770 }, { "epoch": 8.391654616814709, "grad_norm": 19.001405715942383, "learning_rate": 8.933325988358216e-06, "loss": 1.3542, "step": 50780 }, { "epoch": 8.393307167940508, "grad_norm": 19.39535140991211, "learning_rate": 8.924144769459595e-06, "loss": 1.4777, "step": 50790 }, { "epoch": 8.394959719066309, "grad_norm": 11.611642837524414, "learning_rate": 8.914963550560972e-06, "loss": 1.2758, "step": 50800 }, { "epoch": 8.39661227019211, "grad_norm": 25.300411224365234, "learning_rate": 8.905782331662352e-06, "loss": 1.4082, "step": 50810 }, { "epoch": 8.398264821317909, "grad_norm": 11.443560600280762, "learning_rate": 8.896601112763731e-06, "loss": 1.2994, "step": 50820 }, { "epoch": 8.39991737244371, "grad_norm": 11.911940574645996, "learning_rate": 8.887419893865109e-06, "loss": 1.4266, "step": 50830 }, { "epoch": 8.401569923569511, "grad_norm": 10.640558242797852, "learning_rate": 8.87823867496649e-06, "loss": 1.3858, "step": 50840 }, { "epoch": 8.40322247469531, "grad_norm": 14.38443660736084, "learning_rate": 8.869057456067869e-06, "loss": 1.3228, "step": 50850 }, { "epoch": 8.404875025821111, "grad_norm": 11.794403076171875, "learning_rate": 8.859876237169246e-06, "loss": 1.3956, "step": 50860 }, { "epoch": 8.406527576946912, "grad_norm": 10.348832130432129, "learning_rate": 8.850695018270625e-06, "loss": 1.4676, "step": 50870 }, { "epoch": 8.408180128072713, "grad_norm": 10.430915832519531, "learning_rate": 8.841513799372005e-06, "loss": 1.3324, "step": 50880 }, { "epoch": 8.409832679198512, "grad_norm": 11.54885482788086, "learning_rate": 8.832332580473384e-06, "loss": 1.4735, "step": 50890 }, { "epoch": 8.411485230324313, "grad_norm": 19.26331901550293, "learning_rate": 8.823151361574763e-06, "loss": 1.2549, "step": 50900 }, { "epoch": 8.413137781450114, "grad_norm": 12.017107963562012, "learning_rate": 8.813970142676142e-06, "loss": 1.3315, "step": 50910 }, { "epoch": 8.414790332575913, "grad_norm": 13.017470359802246, "learning_rate": 8.804788923777522e-06, "loss": 1.2567, "step": 50920 }, { "epoch": 8.416442883701714, "grad_norm": 14.95610523223877, "learning_rate": 8.7956077048789e-06, "loss": 1.3519, "step": 50930 }, { "epoch": 8.418095434827515, "grad_norm": 18.328262329101562, "learning_rate": 8.786426485980279e-06, "loss": 1.3535, "step": 50940 }, { "epoch": 8.419747985953315, "grad_norm": 11.61627197265625, "learning_rate": 8.77724526708166e-06, "loss": 1.4133, "step": 50950 }, { "epoch": 8.421400537079116, "grad_norm": 17.8658390045166, "learning_rate": 8.768064048183037e-06, "loss": 1.3322, "step": 50960 }, { "epoch": 8.423053088204917, "grad_norm": 12.533629417419434, "learning_rate": 8.758882829284416e-06, "loss": 1.2034, "step": 50970 }, { "epoch": 8.424705639330718, "grad_norm": 10.913763046264648, "learning_rate": 8.749701610385796e-06, "loss": 1.3928, "step": 50980 }, { "epoch": 8.426358190456517, "grad_norm": 12.7138090133667, "learning_rate": 8.740520391487173e-06, "loss": 1.2596, "step": 50990 }, { "epoch": 8.428010741582318, "grad_norm": 69.10160827636719, "learning_rate": 8.731339172588552e-06, "loss": 1.3902, "step": 51000 }, { "epoch": 8.429663292708119, "grad_norm": 18.809534072875977, "learning_rate": 8.722157953689933e-06, "loss": 1.343, "step": 51010 }, { "epoch": 8.431315843833918, "grad_norm": 17.767322540283203, "learning_rate": 8.71297673479131e-06, "loss": 1.241, "step": 51020 }, { "epoch": 8.432968394959719, "grad_norm": 18.070133209228516, "learning_rate": 8.70379551589269e-06, "loss": 1.2567, "step": 51030 }, { "epoch": 8.43462094608552, "grad_norm": 17.62630271911621, "learning_rate": 8.69461429699407e-06, "loss": 1.3031, "step": 51040 }, { "epoch": 8.43627349721132, "grad_norm": 19.49014663696289, "learning_rate": 8.685433078095449e-06, "loss": 1.4, "step": 51050 }, { "epoch": 8.43792604833712, "grad_norm": 22.65168571472168, "learning_rate": 8.676251859196826e-06, "loss": 1.3422, "step": 51060 }, { "epoch": 8.439578599462921, "grad_norm": 15.873695373535156, "learning_rate": 8.667070640298207e-06, "loss": 1.5412, "step": 51070 }, { "epoch": 8.441231150588722, "grad_norm": 20.69378089904785, "learning_rate": 8.657889421399586e-06, "loss": 1.2421, "step": 51080 }, { "epoch": 8.442883701714521, "grad_norm": 14.681087493896484, "learning_rate": 8.648708202500964e-06, "loss": 1.2936, "step": 51090 }, { "epoch": 8.444536252840322, "grad_norm": 23.839170455932617, "learning_rate": 8.639526983602343e-06, "loss": 1.3616, "step": 51100 }, { "epoch": 8.446188803966123, "grad_norm": 12.453177452087402, "learning_rate": 8.630345764703722e-06, "loss": 1.4633, "step": 51110 }, { "epoch": 8.447841355091922, "grad_norm": 10.4721040725708, "learning_rate": 8.621164545805102e-06, "loss": 1.3924, "step": 51120 }, { "epoch": 8.449493906217723, "grad_norm": 11.576587677001953, "learning_rate": 8.611983326906481e-06, "loss": 1.2731, "step": 51130 }, { "epoch": 8.451146457343524, "grad_norm": 16.914947509765625, "learning_rate": 8.60280210800786e-06, "loss": 1.4043, "step": 51140 }, { "epoch": 8.452799008469324, "grad_norm": 14.741705894470215, "learning_rate": 8.593620889109238e-06, "loss": 1.3298, "step": 51150 }, { "epoch": 8.454451559595125, "grad_norm": 10.166900634765625, "learning_rate": 8.584439670210617e-06, "loss": 1.3858, "step": 51160 }, { "epoch": 8.456104110720926, "grad_norm": 16.264799118041992, "learning_rate": 8.575258451311996e-06, "loss": 1.3903, "step": 51170 }, { "epoch": 8.457756661846727, "grad_norm": 11.90115737915039, "learning_rate": 8.566077232413375e-06, "loss": 1.3022, "step": 51180 }, { "epoch": 8.459409212972526, "grad_norm": 14.521602630615234, "learning_rate": 8.556896013514755e-06, "loss": 1.2753, "step": 51190 }, { "epoch": 8.461061764098327, "grad_norm": 29.15642547607422, "learning_rate": 8.547714794616134e-06, "loss": 1.3347, "step": 51200 }, { "epoch": 8.462714315224128, "grad_norm": 20.147611618041992, "learning_rate": 8.538533575717513e-06, "loss": 1.4416, "step": 51210 }, { "epoch": 8.464366866349927, "grad_norm": 30.82805633544922, "learning_rate": 8.52935235681889e-06, "loss": 1.3531, "step": 51220 }, { "epoch": 8.466019417475728, "grad_norm": 21.31821632385254, "learning_rate": 8.52017113792027e-06, "loss": 1.2286, "step": 51230 }, { "epoch": 8.467671968601529, "grad_norm": 19.59657096862793, "learning_rate": 8.510989919021651e-06, "loss": 1.3915, "step": 51240 }, { "epoch": 8.46932451972733, "grad_norm": 23.90947151184082, "learning_rate": 8.501808700123028e-06, "loss": 1.3517, "step": 51250 }, { "epoch": 8.47097707085313, "grad_norm": 22.817235946655273, "learning_rate": 8.492627481224408e-06, "loss": 1.3708, "step": 51260 }, { "epoch": 8.47262962197893, "grad_norm": 11.758684158325195, "learning_rate": 8.483446262325787e-06, "loss": 1.2679, "step": 51270 }, { "epoch": 8.474282173104731, "grad_norm": 9.888134002685547, "learning_rate": 8.474265043427164e-06, "loss": 1.3361, "step": 51280 }, { "epoch": 8.47593472423053, "grad_norm": 11.831820487976074, "learning_rate": 8.465083824528545e-06, "loss": 1.274, "step": 51290 }, { "epoch": 8.477587275356331, "grad_norm": 42.835636138916016, "learning_rate": 8.455902605629925e-06, "loss": 1.3636, "step": 51300 }, { "epoch": 8.479239826482132, "grad_norm": 10.630398750305176, "learning_rate": 8.446721386731302e-06, "loss": 1.1861, "step": 51310 }, { "epoch": 8.480892377607931, "grad_norm": 53.76140594482422, "learning_rate": 8.437540167832681e-06, "loss": 1.3159, "step": 51320 }, { "epoch": 8.482544928733732, "grad_norm": 25.955413818359375, "learning_rate": 8.42835894893406e-06, "loss": 1.2953, "step": 51330 }, { "epoch": 8.484197479859533, "grad_norm": 15.469341278076172, "learning_rate": 8.41917773003544e-06, "loss": 1.2313, "step": 51340 }, { "epoch": 8.485850030985334, "grad_norm": 15.408190727233887, "learning_rate": 8.40999651113682e-06, "loss": 1.4626, "step": 51350 }, { "epoch": 8.487502582111134, "grad_norm": 18.029531478881836, "learning_rate": 8.400815292238198e-06, "loss": 1.3921, "step": 51360 }, { "epoch": 8.489155133236935, "grad_norm": 12.669473648071289, "learning_rate": 8.391634073339578e-06, "loss": 1.203, "step": 51370 }, { "epoch": 8.490807684362736, "grad_norm": 20.233211517333984, "learning_rate": 8.382452854440955e-06, "loss": 1.5235, "step": 51380 }, { "epoch": 8.492460235488535, "grad_norm": 18.365198135375977, "learning_rate": 8.373271635542335e-06, "loss": 1.5077, "step": 51390 }, { "epoch": 8.494112786614336, "grad_norm": 9.256744384765625, "learning_rate": 8.364090416643714e-06, "loss": 1.328, "step": 51400 }, { "epoch": 8.495765337740137, "grad_norm": 13.671814918518066, "learning_rate": 8.354909197745093e-06, "loss": 1.4123, "step": 51410 }, { "epoch": 8.497417888865936, "grad_norm": 48.41630172729492, "learning_rate": 8.345727978846472e-06, "loss": 1.3484, "step": 51420 }, { "epoch": 8.499070439991737, "grad_norm": 19.150592803955078, "learning_rate": 8.336546759947852e-06, "loss": 1.3481, "step": 51430 }, { "epoch": 8.500722991117538, "grad_norm": 20.59627914428711, "learning_rate": 8.327365541049229e-06, "loss": 1.281, "step": 51440 }, { "epoch": 8.502375542243339, "grad_norm": 16.762386322021484, "learning_rate": 8.318184322150608e-06, "loss": 1.4107, "step": 51450 }, { "epoch": 8.504028093369138, "grad_norm": 15.31123161315918, "learning_rate": 8.309003103251988e-06, "loss": 1.3289, "step": 51460 }, { "epoch": 8.505680644494939, "grad_norm": 13.639659881591797, "learning_rate": 8.299821884353367e-06, "loss": 1.2842, "step": 51470 }, { "epoch": 8.50733319562074, "grad_norm": 15.621134757995605, "learning_rate": 8.290640665454746e-06, "loss": 1.3049, "step": 51480 }, { "epoch": 8.50898574674654, "grad_norm": 11.561347961425781, "learning_rate": 8.281459446556125e-06, "loss": 1.3786, "step": 51490 }, { "epoch": 8.51063829787234, "grad_norm": 6.681066989898682, "learning_rate": 8.272278227657505e-06, "loss": 1.1806, "step": 51500 }, { "epoch": 8.512290848998141, "grad_norm": 13.842896461486816, "learning_rate": 8.263097008758882e-06, "loss": 1.4607, "step": 51510 }, { "epoch": 8.513943400123942, "grad_norm": 16.961971282958984, "learning_rate": 8.253915789860263e-06, "loss": 1.2977, "step": 51520 }, { "epoch": 8.515595951249741, "grad_norm": 11.850485801696777, "learning_rate": 8.244734570961642e-06, "loss": 1.3939, "step": 51530 }, { "epoch": 8.517248502375542, "grad_norm": 20.252544403076172, "learning_rate": 8.23555335206302e-06, "loss": 1.4132, "step": 51540 }, { "epoch": 8.518901053501343, "grad_norm": 11.936491966247559, "learning_rate": 8.226372133164399e-06, "loss": 1.2569, "step": 51550 }, { "epoch": 8.520553604627143, "grad_norm": 15.113805770874023, "learning_rate": 8.217190914265778e-06, "loss": 1.3359, "step": 51560 }, { "epoch": 8.522206155752944, "grad_norm": 14.202859878540039, "learning_rate": 8.208009695367158e-06, "loss": 1.2987, "step": 51570 }, { "epoch": 8.523858706878745, "grad_norm": 22.607847213745117, "learning_rate": 8.198828476468537e-06, "loss": 1.3453, "step": 51580 }, { "epoch": 8.525511258004544, "grad_norm": 12.42171859741211, "learning_rate": 8.189647257569916e-06, "loss": 1.3669, "step": 51590 }, { "epoch": 8.527163809130345, "grad_norm": 19.58823585510254, "learning_rate": 8.180466038671294e-06, "loss": 1.2657, "step": 51600 }, { "epoch": 8.528816360256146, "grad_norm": 8.663179397583008, "learning_rate": 8.171284819772673e-06, "loss": 1.2291, "step": 51610 }, { "epoch": 8.530468911381945, "grad_norm": 52.90243148803711, "learning_rate": 8.162103600874052e-06, "loss": 1.2141, "step": 51620 }, { "epoch": 8.532121462507746, "grad_norm": 17.209096908569336, "learning_rate": 8.152922381975431e-06, "loss": 1.3594, "step": 51630 }, { "epoch": 8.533774013633547, "grad_norm": 14.850648880004883, "learning_rate": 8.14374116307681e-06, "loss": 1.3939, "step": 51640 }, { "epoch": 8.535426564759348, "grad_norm": 11.91596794128418, "learning_rate": 8.13455994417819e-06, "loss": 1.4166, "step": 51650 }, { "epoch": 8.537079115885147, "grad_norm": 19.062267303466797, "learning_rate": 8.125378725279569e-06, "loss": 1.2763, "step": 51660 }, { "epoch": 8.538731667010948, "grad_norm": 39.40817642211914, "learning_rate": 8.116197506380947e-06, "loss": 1.4173, "step": 51670 }, { "epoch": 8.540384218136749, "grad_norm": 9.898063659667969, "learning_rate": 8.107016287482326e-06, "loss": 1.2107, "step": 51680 }, { "epoch": 8.542036769262548, "grad_norm": 19.64806365966797, "learning_rate": 8.097835068583707e-06, "loss": 1.4224, "step": 51690 }, { "epoch": 8.54368932038835, "grad_norm": 12.104853630065918, "learning_rate": 8.088653849685084e-06, "loss": 1.3676, "step": 51700 }, { "epoch": 8.54534187151415, "grad_norm": 20.285003662109375, "learning_rate": 8.079472630786464e-06, "loss": 1.3047, "step": 51710 }, { "epoch": 8.546994422639951, "grad_norm": 18.16466522216797, "learning_rate": 8.070291411887843e-06, "loss": 1.494, "step": 51720 }, { "epoch": 8.54864697376575, "grad_norm": 14.75426197052002, "learning_rate": 8.061110192989222e-06, "loss": 1.3873, "step": 51730 }, { "epoch": 8.550299524891551, "grad_norm": 13.810311317443848, "learning_rate": 8.0519289740906e-06, "loss": 1.3675, "step": 51740 }, { "epoch": 8.551952076017352, "grad_norm": 10.323554992675781, "learning_rate": 8.04274775519198e-06, "loss": 1.3622, "step": 51750 }, { "epoch": 8.553604627143152, "grad_norm": 13.959274291992188, "learning_rate": 8.033566536293358e-06, "loss": 1.2956, "step": 51760 }, { "epoch": 8.555257178268953, "grad_norm": 12.601279258728027, "learning_rate": 8.024385317394737e-06, "loss": 1.2695, "step": 51770 }, { "epoch": 8.556909729394754, "grad_norm": 11.207502365112305, "learning_rate": 8.015204098496117e-06, "loss": 1.3684, "step": 51780 }, { "epoch": 8.558562280520553, "grad_norm": 28.540502548217773, "learning_rate": 8.006022879597496e-06, "loss": 1.319, "step": 51790 }, { "epoch": 8.560214831646354, "grad_norm": 15.684926986694336, "learning_rate": 7.996841660698874e-06, "loss": 1.3478, "step": 51800 }, { "epoch": 8.561867382772155, "grad_norm": 9.565122604370117, "learning_rate": 7.987660441800254e-06, "loss": 1.2809, "step": 51810 }, { "epoch": 8.563519933897956, "grad_norm": 12.698163986206055, "learning_rate": 7.978479222901634e-06, "loss": 1.2952, "step": 51820 }, { "epoch": 8.565172485023755, "grad_norm": 30.729351043701172, "learning_rate": 7.969298004003011e-06, "loss": 1.5078, "step": 51830 }, { "epoch": 8.566825036149556, "grad_norm": 14.974106788635254, "learning_rate": 7.96011678510439e-06, "loss": 1.3052, "step": 51840 }, { "epoch": 8.568477587275357, "grad_norm": 6.864882946014404, "learning_rate": 7.95093556620577e-06, "loss": 1.3145, "step": 51850 }, { "epoch": 8.570130138401156, "grad_norm": 17.064834594726562, "learning_rate": 7.941754347307149e-06, "loss": 1.3939, "step": 51860 }, { "epoch": 8.571782689526957, "grad_norm": 9.994977951049805, "learning_rate": 7.932573128408528e-06, "loss": 1.3526, "step": 51870 }, { "epoch": 8.573435240652758, "grad_norm": 14.07989501953125, "learning_rate": 7.923391909509907e-06, "loss": 1.3424, "step": 51880 }, { "epoch": 8.575087791778557, "grad_norm": 12.24575138092041, "learning_rate": 7.914210690611287e-06, "loss": 1.2845, "step": 51890 }, { "epoch": 8.576740342904358, "grad_norm": 13.51680850982666, "learning_rate": 7.905029471712664e-06, "loss": 1.3452, "step": 51900 }, { "epoch": 8.57839289403016, "grad_norm": 13.959500312805176, "learning_rate": 7.895848252814044e-06, "loss": 1.2149, "step": 51910 }, { "epoch": 8.58004544515596, "grad_norm": 28.394367218017578, "learning_rate": 7.886667033915423e-06, "loss": 1.463, "step": 51920 }, { "epoch": 8.58169799628176, "grad_norm": 14.02746868133545, "learning_rate": 7.877485815016802e-06, "loss": 1.4335, "step": 51930 }, { "epoch": 8.58335054740756, "grad_norm": 11.631068229675293, "learning_rate": 7.868304596118181e-06, "loss": 1.2435, "step": 51940 }, { "epoch": 8.585003098533361, "grad_norm": 13.741445541381836, "learning_rate": 7.85912337721956e-06, "loss": 1.2622, "step": 51950 }, { "epoch": 8.58665564965916, "grad_norm": 21.316083908081055, "learning_rate": 7.849942158320938e-06, "loss": 1.484, "step": 51960 }, { "epoch": 8.588308200784962, "grad_norm": 17.675111770629883, "learning_rate": 7.840760939422317e-06, "loss": 1.38, "step": 51970 }, { "epoch": 8.589960751910763, "grad_norm": 21.218908309936523, "learning_rate": 7.831579720523698e-06, "loss": 1.247, "step": 51980 }, { "epoch": 8.591613303036564, "grad_norm": 18.68570899963379, "learning_rate": 7.822398501625076e-06, "loss": 1.3275, "step": 51990 }, { "epoch": 8.593265854162363, "grad_norm": 12.028681755065918, "learning_rate": 7.813217282726455e-06, "loss": 1.2789, "step": 52000 }, { "epoch": 8.594918405288164, "grad_norm": 16.868160247802734, "learning_rate": 7.804036063827834e-06, "loss": 1.1928, "step": 52010 }, { "epoch": 8.596570956413965, "grad_norm": 14.895330429077148, "learning_rate": 7.794854844929214e-06, "loss": 1.3861, "step": 52020 }, { "epoch": 8.598223507539764, "grad_norm": 17.00758934020996, "learning_rate": 7.785673626030591e-06, "loss": 1.3761, "step": 52030 }, { "epoch": 8.599876058665565, "grad_norm": 21.712890625, "learning_rate": 7.776492407131972e-06, "loss": 1.2694, "step": 52040 }, { "epoch": 8.601528609791366, "grad_norm": 21.000192642211914, "learning_rate": 7.767311188233351e-06, "loss": 1.3491, "step": 52050 }, { "epoch": 8.603181160917165, "grad_norm": 19.828134536743164, "learning_rate": 7.758129969334729e-06, "loss": 1.2993, "step": 52060 }, { "epoch": 8.604833712042966, "grad_norm": 9.904111862182617, "learning_rate": 7.748948750436108e-06, "loss": 1.3062, "step": 52070 }, { "epoch": 8.606486263168767, "grad_norm": 19.431184768676758, "learning_rate": 7.739767531537487e-06, "loss": 1.3353, "step": 52080 }, { "epoch": 8.608138814294568, "grad_norm": 16.284452438354492, "learning_rate": 7.730586312638867e-06, "loss": 1.3189, "step": 52090 }, { "epoch": 8.609791365420367, "grad_norm": 20.377105712890625, "learning_rate": 7.721405093740246e-06, "loss": 1.2256, "step": 52100 }, { "epoch": 8.611443916546168, "grad_norm": 16.32294273376465, "learning_rate": 7.712223874841625e-06, "loss": 1.3875, "step": 52110 }, { "epoch": 8.61309646767197, "grad_norm": 17.102628707885742, "learning_rate": 7.703042655943003e-06, "loss": 1.3282, "step": 52120 }, { "epoch": 8.614749018797768, "grad_norm": 7.808797359466553, "learning_rate": 7.693861437044382e-06, "loss": 1.2726, "step": 52130 }, { "epoch": 8.61640156992357, "grad_norm": 44.669883728027344, "learning_rate": 7.684680218145761e-06, "loss": 1.3213, "step": 52140 }, { "epoch": 8.61805412104937, "grad_norm": 10.58360767364502, "learning_rate": 7.67549899924714e-06, "loss": 1.2849, "step": 52150 }, { "epoch": 8.61970667217517, "grad_norm": 11.978372573852539, "learning_rate": 7.66631778034852e-06, "loss": 1.392, "step": 52160 }, { "epoch": 8.62135922330097, "grad_norm": 19.33893585205078, "learning_rate": 7.657136561449899e-06, "loss": 1.3544, "step": 52170 }, { "epoch": 8.623011774426772, "grad_norm": 15.807573318481445, "learning_rate": 7.647955342551278e-06, "loss": 1.4272, "step": 52180 }, { "epoch": 8.624664325552573, "grad_norm": 22.08311653137207, "learning_rate": 7.638774123652656e-06, "loss": 1.3413, "step": 52190 }, { "epoch": 8.626316876678372, "grad_norm": 18.999937057495117, "learning_rate": 7.629592904754035e-06, "loss": 1.4418, "step": 52200 }, { "epoch": 8.627969427804173, "grad_norm": 18.321889877319336, "learning_rate": 7.620411685855415e-06, "loss": 1.4389, "step": 52210 }, { "epoch": 8.629621978929974, "grad_norm": 18.07795524597168, "learning_rate": 7.6112304669567934e-06, "loss": 1.2659, "step": 52220 }, { "epoch": 8.631274530055773, "grad_norm": 17.448719024658203, "learning_rate": 7.602049248058173e-06, "loss": 1.3933, "step": 52230 }, { "epoch": 8.632927081181574, "grad_norm": 10.875787734985352, "learning_rate": 7.592868029159551e-06, "loss": 1.313, "step": 52240 }, { "epoch": 8.634579632307375, "grad_norm": 17.80232810974121, "learning_rate": 7.58368681026093e-06, "loss": 1.3124, "step": 52250 }, { "epoch": 8.636232183433176, "grad_norm": 21.202545166015625, "learning_rate": 7.5745055913623104e-06, "loss": 1.4396, "step": 52260 }, { "epoch": 8.637884734558975, "grad_norm": 22.008880615234375, "learning_rate": 7.565324372463689e-06, "loss": 1.3628, "step": 52270 }, { "epoch": 8.639537285684776, "grad_norm": 31.674184799194336, "learning_rate": 7.556143153565068e-06, "loss": 1.3047, "step": 52280 }, { "epoch": 8.641189836810577, "grad_norm": 13.10922908782959, "learning_rate": 7.5469619346664465e-06, "loss": 1.3011, "step": 52290 }, { "epoch": 8.642842387936376, "grad_norm": 43.33464813232422, "learning_rate": 7.537780715767826e-06, "loss": 1.4427, "step": 52300 }, { "epoch": 8.644494939062177, "grad_norm": 11.043828964233398, "learning_rate": 7.528599496869204e-06, "loss": 1.3129, "step": 52310 }, { "epoch": 8.646147490187978, "grad_norm": 17.55091667175293, "learning_rate": 7.519418277970584e-06, "loss": 1.3026, "step": 52320 }, { "epoch": 8.647800041313777, "grad_norm": 11.531242370605469, "learning_rate": 7.5102370590719635e-06, "loss": 1.343, "step": 52330 }, { "epoch": 8.649452592439578, "grad_norm": 7.957118988037109, "learning_rate": 7.501055840173342e-06, "loss": 1.2908, "step": 52340 }, { "epoch": 8.65110514356538, "grad_norm": 15.16044807434082, "learning_rate": 7.49187462127472e-06, "loss": 1.4025, "step": 52350 }, { "epoch": 8.652757694691179, "grad_norm": 12.470380783081055, "learning_rate": 7.4826934023760995e-06, "loss": 1.2541, "step": 52360 }, { "epoch": 8.65441024581698, "grad_norm": 14.842360496520996, "learning_rate": 7.473512183477478e-06, "loss": 1.4249, "step": 52370 }, { "epoch": 8.65606279694278, "grad_norm": 10.60826587677002, "learning_rate": 7.464330964578858e-06, "loss": 1.2648, "step": 52380 }, { "epoch": 8.657715348068582, "grad_norm": 20.237024307250977, "learning_rate": 7.455149745680237e-06, "loss": 1.3097, "step": 52390 }, { "epoch": 8.65936789919438, "grad_norm": 15.583151817321777, "learning_rate": 7.445968526781616e-06, "loss": 1.3364, "step": 52400 }, { "epoch": 8.661020450320182, "grad_norm": 22.51787757873535, "learning_rate": 7.436787307882995e-06, "loss": 1.3017, "step": 52410 }, { "epoch": 8.662673001445983, "grad_norm": 8.777755737304688, "learning_rate": 7.427606088984373e-06, "loss": 1.2984, "step": 52420 }, { "epoch": 8.664325552571782, "grad_norm": 43.127593994140625, "learning_rate": 7.4184248700857526e-06, "loss": 1.3406, "step": 52430 }, { "epoch": 8.665978103697583, "grad_norm": 10.117231369018555, "learning_rate": 7.409243651187133e-06, "loss": 1.261, "step": 52440 }, { "epoch": 8.667630654823384, "grad_norm": 12.074843406677246, "learning_rate": 7.400062432288511e-06, "loss": 1.4873, "step": 52450 }, { "epoch": 8.669283205949185, "grad_norm": 13.140725135803223, "learning_rate": 7.39088121338989e-06, "loss": 1.3359, "step": 52460 }, { "epoch": 8.670935757074984, "grad_norm": 15.065147399902344, "learning_rate": 7.381699994491269e-06, "loss": 1.2755, "step": 52470 }, { "epoch": 8.672588308200785, "grad_norm": 10.053617477416992, "learning_rate": 7.372518775592648e-06, "loss": 1.3763, "step": 52480 }, { "epoch": 8.674240859326586, "grad_norm": 15.998075485229492, "learning_rate": 7.363337556694028e-06, "loss": 1.3625, "step": 52490 }, { "epoch": 8.675893410452385, "grad_norm": 11.436149597167969, "learning_rate": 7.3541563377954064e-06, "loss": 1.3587, "step": 52500 }, { "epoch": 8.677545961578186, "grad_norm": 12.246932983398438, "learning_rate": 7.344975118896785e-06, "loss": 1.3444, "step": 52510 }, { "epoch": 8.679198512703987, "grad_norm": 10.678253173828125, "learning_rate": 7.335793899998164e-06, "loss": 1.3673, "step": 52520 }, { "epoch": 8.680851063829786, "grad_norm": 11.65112018585205, "learning_rate": 7.3266126810995425e-06, "loss": 1.3642, "step": 52530 }, { "epoch": 8.682503614955587, "grad_norm": 17.31778335571289, "learning_rate": 7.317431462200922e-06, "loss": 1.3697, "step": 52540 }, { "epoch": 8.684156166081388, "grad_norm": 14.801467895507812, "learning_rate": 7.308250243302302e-06, "loss": 1.3962, "step": 52550 }, { "epoch": 8.68580871720719, "grad_norm": 11.221620559692383, "learning_rate": 7.29906902440368e-06, "loss": 1.2398, "step": 52560 }, { "epoch": 8.687461268332989, "grad_norm": 19.58458137512207, "learning_rate": 7.2898878055050595e-06, "loss": 1.3754, "step": 52570 }, { "epoch": 8.68911381945879, "grad_norm": 12.247591972351074, "learning_rate": 7.280706586606438e-06, "loss": 1.3815, "step": 52580 }, { "epoch": 8.69076637058459, "grad_norm": 9.798544883728027, "learning_rate": 7.271525367707817e-06, "loss": 1.3341, "step": 52590 }, { "epoch": 8.69241892171039, "grad_norm": 12.177785873413086, "learning_rate": 7.2623441488091955e-06, "loss": 1.2962, "step": 52600 }, { "epoch": 8.69407147283619, "grad_norm": 13.61369800567627, "learning_rate": 7.253162929910576e-06, "loss": 1.4009, "step": 52610 }, { "epoch": 8.695724023961992, "grad_norm": 11.679938316345215, "learning_rate": 7.243981711011955e-06, "loss": 1.2359, "step": 52620 }, { "epoch": 8.697376575087791, "grad_norm": 17.74927520751953, "learning_rate": 7.234800492113333e-06, "loss": 1.3443, "step": 52630 }, { "epoch": 8.699029126213592, "grad_norm": 20.92237091064453, "learning_rate": 7.2256192732147125e-06, "loss": 1.3608, "step": 52640 }, { "epoch": 8.700681677339393, "grad_norm": 14.271102905273438, "learning_rate": 7.216438054316091e-06, "loss": 1.2804, "step": 52650 }, { "epoch": 8.702334228465194, "grad_norm": 16.352445602416992, "learning_rate": 7.207256835417471e-06, "loss": 1.3786, "step": 52660 }, { "epoch": 8.703986779590993, "grad_norm": 11.873323440551758, "learning_rate": 7.198075616518849e-06, "loss": 1.3908, "step": 52670 }, { "epoch": 8.705639330716794, "grad_norm": 15.193153381347656, "learning_rate": 7.188894397620229e-06, "loss": 1.2402, "step": 52680 }, { "epoch": 8.707291881842595, "grad_norm": 22.531282424926758, "learning_rate": 7.179713178721607e-06, "loss": 1.2404, "step": 52690 }, { "epoch": 8.708944432968394, "grad_norm": 10.484509468078613, "learning_rate": 7.170531959822986e-06, "loss": 1.3803, "step": 52700 }, { "epoch": 8.710596984094195, "grad_norm": 21.465103149414062, "learning_rate": 7.161350740924365e-06, "loss": 1.458, "step": 52710 }, { "epoch": 8.712249535219996, "grad_norm": 16.147686004638672, "learning_rate": 7.152169522025745e-06, "loss": 1.3513, "step": 52720 }, { "epoch": 8.713902086345797, "grad_norm": 14.028088569641113, "learning_rate": 7.142988303127124e-06, "loss": 1.2286, "step": 52730 }, { "epoch": 8.715554637471596, "grad_norm": 15.7096586227417, "learning_rate": 7.1338070842285025e-06, "loss": 1.2122, "step": 52740 }, { "epoch": 8.717207188597397, "grad_norm": 21.39398956298828, "learning_rate": 7.124625865329882e-06, "loss": 1.274, "step": 52750 }, { "epoch": 8.718859739723198, "grad_norm": 15.905527114868164, "learning_rate": 7.11544464643126e-06, "loss": 1.323, "step": 52760 }, { "epoch": 8.720512290848998, "grad_norm": 13.975886344909668, "learning_rate": 7.106263427532639e-06, "loss": 1.5179, "step": 52770 }, { "epoch": 8.722164841974799, "grad_norm": 20.37264060974121, "learning_rate": 7.0970822086340194e-06, "loss": 1.2484, "step": 52780 }, { "epoch": 8.7238173931006, "grad_norm": 10.651167869567871, "learning_rate": 7.087900989735398e-06, "loss": 1.3944, "step": 52790 }, { "epoch": 8.725469944226399, "grad_norm": 12.802726745605469, "learning_rate": 7.078719770836777e-06, "loss": 1.4366, "step": 52800 }, { "epoch": 8.7271224953522, "grad_norm": 13.142681121826172, "learning_rate": 7.0695385519381555e-06, "loss": 1.3166, "step": 52810 }, { "epoch": 8.728775046478, "grad_norm": 14.891148567199707, "learning_rate": 7.060357333039534e-06, "loss": 1.3012, "step": 52820 }, { "epoch": 8.7304275976038, "grad_norm": 13.529729843139648, "learning_rate": 7.051176114140913e-06, "loss": 1.4384, "step": 52830 }, { "epoch": 8.732080148729601, "grad_norm": 14.417351722717285, "learning_rate": 7.041994895242293e-06, "loss": 1.4955, "step": 52840 }, { "epoch": 8.733732699855402, "grad_norm": 11.694726943969727, "learning_rate": 7.032813676343672e-06, "loss": 1.3628, "step": 52850 }, { "epoch": 8.735385250981203, "grad_norm": 8.318774223327637, "learning_rate": 7.023632457445051e-06, "loss": 1.2791, "step": 52860 }, { "epoch": 8.737037802107002, "grad_norm": 29.99515724182129, "learning_rate": 7.014451238546429e-06, "loss": 1.357, "step": 52870 }, { "epoch": 8.738690353232803, "grad_norm": 8.45457649230957, "learning_rate": 7.0052700196478085e-06, "loss": 1.2616, "step": 52880 }, { "epoch": 8.740342904358604, "grad_norm": 15.457688331604004, "learning_rate": 6.996088800749189e-06, "loss": 1.4261, "step": 52890 }, { "epoch": 8.741995455484403, "grad_norm": 23.088054656982422, "learning_rate": 6.986907581850567e-06, "loss": 1.3279, "step": 52900 }, { "epoch": 8.743648006610204, "grad_norm": 10.44580364227295, "learning_rate": 6.977726362951946e-06, "loss": 1.2399, "step": 52910 }, { "epoch": 8.745300557736005, "grad_norm": 15.399632453918457, "learning_rate": 6.968545144053325e-06, "loss": 1.3267, "step": 52920 }, { "epoch": 8.746953108861806, "grad_norm": 11.569892883300781, "learning_rate": 6.959363925154704e-06, "loss": 1.2217, "step": 52930 }, { "epoch": 8.748605659987605, "grad_norm": 13.182928085327148, "learning_rate": 6.950182706256082e-06, "loss": 1.3053, "step": 52940 }, { "epoch": 8.750258211113406, "grad_norm": 10.968775749206543, "learning_rate": 6.941001487357462e-06, "loss": 1.2109, "step": 52950 }, { "epoch": 8.751910762239207, "grad_norm": 28.66567611694336, "learning_rate": 6.931820268458842e-06, "loss": 1.2991, "step": 52960 }, { "epoch": 8.753563313365007, "grad_norm": 14.520941734313965, "learning_rate": 6.92263904956022e-06, "loss": 1.3108, "step": 52970 }, { "epoch": 8.755215864490808, "grad_norm": 18.797256469726562, "learning_rate": 6.9134578306615985e-06, "loss": 1.3105, "step": 52980 }, { "epoch": 8.756868415616609, "grad_norm": 21.74106788635254, "learning_rate": 6.904276611762978e-06, "loss": 1.4593, "step": 52990 }, { "epoch": 8.758520966742408, "grad_norm": 21.514694213867188, "learning_rate": 6.895095392864356e-06, "loss": 1.2157, "step": 53000 }, { "epoch": 8.760173517868209, "grad_norm": 15.12600040435791, "learning_rate": 6.885914173965736e-06, "loss": 1.4009, "step": 53010 }, { "epoch": 8.76182606899401, "grad_norm": 17.038434982299805, "learning_rate": 6.8767329550671155e-06, "loss": 1.3185, "step": 53020 }, { "epoch": 8.76347862011981, "grad_norm": 16.833236694335938, "learning_rate": 6.867551736168494e-06, "loss": 1.2934, "step": 53030 }, { "epoch": 8.76513117124561, "grad_norm": 13.726874351501465, "learning_rate": 6.858370517269873e-06, "loss": 1.4518, "step": 53040 }, { "epoch": 8.766783722371411, "grad_norm": 42.266380310058594, "learning_rate": 6.8491892983712515e-06, "loss": 1.3364, "step": 53050 }, { "epoch": 8.768436273497212, "grad_norm": 22.413843154907227, "learning_rate": 6.840008079472632e-06, "loss": 1.3972, "step": 53060 }, { "epoch": 8.770088824623011, "grad_norm": 19.03530502319336, "learning_rate": 6.830826860574011e-06, "loss": 1.322, "step": 53070 }, { "epoch": 8.771741375748812, "grad_norm": 22.070907592773438, "learning_rate": 6.821645641675389e-06, "loss": 1.314, "step": 53080 }, { "epoch": 8.773393926874613, "grad_norm": 11.455080032348633, "learning_rate": 6.8124644227767685e-06, "loss": 1.3339, "step": 53090 }, { "epoch": 8.775046478000412, "grad_norm": 9.250655174255371, "learning_rate": 6.803283203878147e-06, "loss": 1.3523, "step": 53100 }, { "epoch": 8.776699029126213, "grad_norm": 11.893872261047363, "learning_rate": 6.794101984979525e-06, "loss": 1.3746, "step": 53110 }, { "epoch": 8.778351580252014, "grad_norm": 16.446348190307617, "learning_rate": 6.784920766080906e-06, "loss": 1.3863, "step": 53120 }, { "epoch": 8.780004131377815, "grad_norm": 13.59485912322998, "learning_rate": 6.775739547182285e-06, "loss": 1.2766, "step": 53130 }, { "epoch": 8.781656682503614, "grad_norm": 11.817255020141602, "learning_rate": 6.766558328283663e-06, "loss": 1.3529, "step": 53140 }, { "epoch": 8.783309233629415, "grad_norm": 14.739175796508789, "learning_rate": 6.757377109385042e-06, "loss": 1.4092, "step": 53150 }, { "epoch": 8.784961784755216, "grad_norm": 16.03285789489746, "learning_rate": 6.748195890486421e-06, "loss": 1.4103, "step": 53160 }, { "epoch": 8.786614335881016, "grad_norm": 14.885622024536133, "learning_rate": 6.7390146715878e-06, "loss": 1.2313, "step": 53170 }, { "epoch": 8.788266887006817, "grad_norm": 17.540298461914062, "learning_rate": 6.72983345268918e-06, "loss": 1.2694, "step": 53180 }, { "epoch": 8.789919438132618, "grad_norm": 12.274869918823242, "learning_rate": 6.7206522337905584e-06, "loss": 1.3668, "step": 53190 }, { "epoch": 8.791571989258419, "grad_norm": 12.353447914123535, "learning_rate": 6.711471014891938e-06, "loss": 1.3208, "step": 53200 }, { "epoch": 8.793224540384218, "grad_norm": 10.738207817077637, "learning_rate": 6.702289795993316e-06, "loss": 1.1065, "step": 53210 }, { "epoch": 8.794877091510019, "grad_norm": 11.016992568969727, "learning_rate": 6.693108577094695e-06, "loss": 1.3443, "step": 53220 }, { "epoch": 8.79652964263582, "grad_norm": 15.448955535888672, "learning_rate": 6.683927358196074e-06, "loss": 1.3079, "step": 53230 }, { "epoch": 8.798182193761619, "grad_norm": 12.838058471679688, "learning_rate": 6.674746139297454e-06, "loss": 1.2739, "step": 53240 }, { "epoch": 8.79983474488742, "grad_norm": 6.943477630615234, "learning_rate": 6.665564920398833e-06, "loss": 1.2321, "step": 53250 }, { "epoch": 8.80148729601322, "grad_norm": 22.82369041442871, "learning_rate": 6.6563837015002115e-06, "loss": 1.4648, "step": 53260 }, { "epoch": 8.80313984713902, "grad_norm": 11.41329288482666, "learning_rate": 6.64720248260159e-06, "loss": 1.2926, "step": 53270 }, { "epoch": 8.804792398264821, "grad_norm": 16.04501724243164, "learning_rate": 6.638021263702969e-06, "loss": 1.3558, "step": 53280 }, { "epoch": 8.806444949390622, "grad_norm": 11.551048278808594, "learning_rate": 6.628840044804349e-06, "loss": 1.3032, "step": 53290 }, { "epoch": 8.808097500516423, "grad_norm": 16.122791290283203, "learning_rate": 6.619658825905728e-06, "loss": 1.4187, "step": 53300 }, { "epoch": 8.809750051642222, "grad_norm": 19.001909255981445, "learning_rate": 6.610477607007107e-06, "loss": 1.3634, "step": 53310 }, { "epoch": 8.811402602768023, "grad_norm": 16.42638397216797, "learning_rate": 6.601296388108485e-06, "loss": 1.2488, "step": 53320 }, { "epoch": 8.813055153893824, "grad_norm": 18.04165267944336, "learning_rate": 6.5921151692098645e-06, "loss": 1.3113, "step": 53330 }, { "epoch": 8.814707705019623, "grad_norm": 14.503774642944336, "learning_rate": 6.582933950311243e-06, "loss": 1.2873, "step": 53340 }, { "epoch": 8.816360256145424, "grad_norm": 12.262187957763672, "learning_rate": 6.573752731412623e-06, "loss": 1.3056, "step": 53350 }, { "epoch": 8.818012807271225, "grad_norm": 16.321773529052734, "learning_rate": 6.564571512514002e-06, "loss": 1.2377, "step": 53360 }, { "epoch": 8.819665358397025, "grad_norm": 8.0360107421875, "learning_rate": 6.555390293615381e-06, "loss": 1.3937, "step": 53370 }, { "epoch": 8.821317909522826, "grad_norm": 9.701162338256836, "learning_rate": 6.54620907471676e-06, "loss": 1.2723, "step": 53380 }, { "epoch": 8.822970460648627, "grad_norm": 63.11784362792969, "learning_rate": 6.537027855818138e-06, "loss": 1.4564, "step": 53390 }, { "epoch": 8.824623011774428, "grad_norm": 13.213398933410645, "learning_rate": 6.527846636919517e-06, "loss": 1.3917, "step": 53400 }, { "epoch": 8.826275562900227, "grad_norm": 29.3795166015625, "learning_rate": 6.518665418020898e-06, "loss": 1.1673, "step": 53410 }, { "epoch": 8.827928114026028, "grad_norm": 20.529861450195312, "learning_rate": 6.509484199122276e-06, "loss": 1.3965, "step": 53420 }, { "epoch": 8.829580665151829, "grad_norm": 12.665575981140137, "learning_rate": 6.5003029802236544e-06, "loss": 1.2904, "step": 53430 }, { "epoch": 8.831233216277628, "grad_norm": 23.68876838684082, "learning_rate": 6.491121761325034e-06, "loss": 1.4163, "step": 53440 }, { "epoch": 8.832885767403429, "grad_norm": 13.767816543579102, "learning_rate": 6.481940542426412e-06, "loss": 1.2772, "step": 53450 }, { "epoch": 8.83453831852923, "grad_norm": 38.02272415161133, "learning_rate": 6.472759323527792e-06, "loss": 1.3729, "step": 53460 }, { "epoch": 8.83619086965503, "grad_norm": 18.688528060913086, "learning_rate": 6.4635781046291714e-06, "loss": 1.4323, "step": 53470 }, { "epoch": 8.83784342078083, "grad_norm": 8.972434043884277, "learning_rate": 6.45439688573055e-06, "loss": 1.2365, "step": 53480 }, { "epoch": 8.839495971906631, "grad_norm": 12.999137878417969, "learning_rate": 6.445215666831929e-06, "loss": 1.2838, "step": 53490 }, { "epoch": 8.841148523032432, "grad_norm": 12.123946189880371, "learning_rate": 6.4360344479333075e-06, "loss": 1.3943, "step": 53500 }, { "epoch": 8.842801074158231, "grad_norm": 8.562018394470215, "learning_rate": 6.426853229034687e-06, "loss": 1.4237, "step": 53510 }, { "epoch": 8.844453625284032, "grad_norm": 19.84341049194336, "learning_rate": 6.417672010136067e-06, "loss": 1.3901, "step": 53520 }, { "epoch": 8.846106176409833, "grad_norm": 11.601472854614258, "learning_rate": 6.408490791237445e-06, "loss": 1.2849, "step": 53530 }, { "epoch": 8.847758727535632, "grad_norm": 16.48011589050293, "learning_rate": 6.3993095723388245e-06, "loss": 1.4206, "step": 53540 }, { "epoch": 8.849411278661433, "grad_norm": 10.797418594360352, "learning_rate": 6.390128353440203e-06, "loss": 1.516, "step": 53550 }, { "epoch": 8.851063829787234, "grad_norm": 13.68311595916748, "learning_rate": 6.380947134541581e-06, "loss": 1.3379, "step": 53560 }, { "epoch": 8.852716380913034, "grad_norm": 11.746673583984375, "learning_rate": 6.3717659156429605e-06, "loss": 1.3683, "step": 53570 }, { "epoch": 8.854368932038835, "grad_norm": 16.722267150878906, "learning_rate": 6.362584696744341e-06, "loss": 1.4114, "step": 53580 }, { "epoch": 8.856021483164636, "grad_norm": 13.57430362701416, "learning_rate": 6.353403477845719e-06, "loss": 1.3794, "step": 53590 }, { "epoch": 8.857674034290437, "grad_norm": 19.50122833251953, "learning_rate": 6.344222258947098e-06, "loss": 1.3773, "step": 53600 }, { "epoch": 8.859326585416236, "grad_norm": 13.851290702819824, "learning_rate": 6.335041040048477e-06, "loss": 1.2712, "step": 53610 }, { "epoch": 8.860979136542037, "grad_norm": 11.362812995910645, "learning_rate": 6.325859821149856e-06, "loss": 1.2669, "step": 53620 }, { "epoch": 8.862631687667838, "grad_norm": 13.138988494873047, "learning_rate": 6.316678602251236e-06, "loss": 1.3556, "step": 53630 }, { "epoch": 8.864284238793637, "grad_norm": 21.056747436523438, "learning_rate": 6.307497383352614e-06, "loss": 1.4155, "step": 53640 }, { "epoch": 8.865936789919438, "grad_norm": 16.05164909362793, "learning_rate": 6.298316164453994e-06, "loss": 1.4202, "step": 53650 }, { "epoch": 8.867589341045239, "grad_norm": 30.52538299560547, "learning_rate": 6.289134945555372e-06, "loss": 1.2314, "step": 53660 }, { "epoch": 8.86924189217104, "grad_norm": 27.325618743896484, "learning_rate": 6.279953726656751e-06, "loss": 1.3327, "step": 53670 }, { "epoch": 8.870894443296839, "grad_norm": 14.13979434967041, "learning_rate": 6.27077250775813e-06, "loss": 1.4423, "step": 53680 }, { "epoch": 8.87254699442264, "grad_norm": 9.841453552246094, "learning_rate": 6.26159128885951e-06, "loss": 1.4869, "step": 53690 }, { "epoch": 8.874199545548441, "grad_norm": 10.473819732666016, "learning_rate": 6.252410069960889e-06, "loss": 1.2138, "step": 53700 }, { "epoch": 8.87585209667424, "grad_norm": 20.32805824279785, "learning_rate": 6.2432288510622675e-06, "loss": 1.3251, "step": 53710 }, { "epoch": 8.877504647800041, "grad_norm": 26.516998291015625, "learning_rate": 6.234047632163646e-06, "loss": 1.3455, "step": 53720 }, { "epoch": 8.879157198925842, "grad_norm": 10.170158386230469, "learning_rate": 6.224866413265026e-06, "loss": 1.2815, "step": 53730 }, { "epoch": 8.880809750051641, "grad_norm": 13.213897705078125, "learning_rate": 6.215685194366404e-06, "loss": 1.3635, "step": 53740 }, { "epoch": 8.882462301177442, "grad_norm": 15.690320014953613, "learning_rate": 6.206503975467784e-06, "loss": 1.3606, "step": 53750 }, { "epoch": 8.884114852303243, "grad_norm": 8.988179206848145, "learning_rate": 6.197322756569163e-06, "loss": 1.3001, "step": 53760 }, { "epoch": 8.885767403429044, "grad_norm": 15.613768577575684, "learning_rate": 6.188141537670541e-06, "loss": 1.2319, "step": 53770 }, { "epoch": 8.887419954554844, "grad_norm": 22.44986343383789, "learning_rate": 6.1789603187719205e-06, "loss": 1.2765, "step": 53780 }, { "epoch": 8.889072505680645, "grad_norm": 19.037084579467773, "learning_rate": 6.1697790998733e-06, "loss": 1.3022, "step": 53790 }, { "epoch": 8.890725056806446, "grad_norm": 20.137554168701172, "learning_rate": 6.160597880974678e-06, "loss": 1.4232, "step": 53800 }, { "epoch": 8.892377607932245, "grad_norm": 10.645371437072754, "learning_rate": 6.151416662076057e-06, "loss": 1.2907, "step": 53810 }, { "epoch": 8.894030159058046, "grad_norm": 11.387445449829102, "learning_rate": 6.142235443177437e-06, "loss": 1.2386, "step": 53820 }, { "epoch": 8.895682710183847, "grad_norm": 13.088994979858398, "learning_rate": 6.133054224278816e-06, "loss": 1.379, "step": 53830 }, { "epoch": 8.897335261309646, "grad_norm": 17.280851364135742, "learning_rate": 6.123873005380194e-06, "loss": 1.2731, "step": 53840 }, { "epoch": 8.898987812435447, "grad_norm": 13.71379566192627, "learning_rate": 6.1146917864815735e-06, "loss": 1.3099, "step": 53850 }, { "epoch": 8.900640363561248, "grad_norm": 27.96197509765625, "learning_rate": 6.105510567582953e-06, "loss": 1.3606, "step": 53860 }, { "epoch": 8.902292914687049, "grad_norm": 14.15559196472168, "learning_rate": 6.096329348684331e-06, "loss": 1.3055, "step": 53870 }, { "epoch": 8.903945465812848, "grad_norm": 14.177240371704102, "learning_rate": 6.0871481297857104e-06, "loss": 1.3035, "step": 53880 }, { "epoch": 8.905598016938649, "grad_norm": 12.994119644165039, "learning_rate": 6.07796691088709e-06, "loss": 1.3279, "step": 53890 }, { "epoch": 8.90725056806445, "grad_norm": 15.457225799560547, "learning_rate": 6.068785691988468e-06, "loss": 1.3741, "step": 53900 }, { "epoch": 8.90890311919025, "grad_norm": 9.61684799194336, "learning_rate": 6.059604473089848e-06, "loss": 1.3281, "step": 53910 }, { "epoch": 8.91055567031605, "grad_norm": 10.564475059509277, "learning_rate": 6.0504232541912266e-06, "loss": 1.3453, "step": 53920 }, { "epoch": 8.912208221441851, "grad_norm": 12.618019104003906, "learning_rate": 6.041242035292606e-06, "loss": 1.2849, "step": 53930 }, { "epoch": 8.913860772567652, "grad_norm": 19.173622131347656, "learning_rate": 6.032060816393985e-06, "loss": 1.4845, "step": 53940 }, { "epoch": 8.915513323693451, "grad_norm": 11.999756813049316, "learning_rate": 6.0228795974953635e-06, "loss": 1.3252, "step": 53950 }, { "epoch": 8.917165874819252, "grad_norm": 25.721511840820312, "learning_rate": 6.013698378596743e-06, "loss": 1.3001, "step": 53960 }, { "epoch": 8.918818425945053, "grad_norm": 14.201374053955078, "learning_rate": 6.004517159698122e-06, "loss": 1.4462, "step": 53970 }, { "epoch": 8.920470977070853, "grad_norm": 14.782429695129395, "learning_rate": 5.9953359407995e-06, "loss": 1.5245, "step": 53980 }, { "epoch": 8.922123528196654, "grad_norm": 13.691256523132324, "learning_rate": 5.9861547219008805e-06, "loss": 1.3555, "step": 53990 }, { "epoch": 8.923776079322455, "grad_norm": 18.531856536865234, "learning_rate": 5.976973503002259e-06, "loss": 1.3047, "step": 54000 }, { "epoch": 8.925428630448254, "grad_norm": 11.382560729980469, "learning_rate": 5.967792284103637e-06, "loss": 1.2516, "step": 54010 }, { "epoch": 8.927081181574055, "grad_norm": 12.062386512756348, "learning_rate": 5.958611065205017e-06, "loss": 1.3224, "step": 54020 }, { "epoch": 8.928733732699856, "grad_norm": 26.916439056396484, "learning_rate": 5.949429846306396e-06, "loss": 1.3987, "step": 54030 }, { "epoch": 8.930386283825655, "grad_norm": 13.287517547607422, "learning_rate": 5.940248627407775e-06, "loss": 1.339, "step": 54040 }, { "epoch": 8.932038834951456, "grad_norm": 27.684982299804688, "learning_rate": 5.931067408509154e-06, "loss": 1.2144, "step": 54050 }, { "epoch": 8.933691386077257, "grad_norm": 12.099579811096191, "learning_rate": 5.921886189610533e-06, "loss": 1.2829, "step": 54060 }, { "epoch": 8.935343937203058, "grad_norm": 12.295833587646484, "learning_rate": 5.912704970711912e-06, "loss": 1.3906, "step": 54070 }, { "epoch": 8.936996488328857, "grad_norm": 15.061843872070312, "learning_rate": 5.903523751813291e-06, "loss": 1.4865, "step": 54080 }, { "epoch": 8.938649039454658, "grad_norm": 10.409856796264648, "learning_rate": 5.8943425329146695e-06, "loss": 1.4299, "step": 54090 }, { "epoch": 8.940301590580459, "grad_norm": 7.9486212730407715, "learning_rate": 5.885161314016049e-06, "loss": 1.2411, "step": 54100 }, { "epoch": 8.941954141706258, "grad_norm": 11.08385944366455, "learning_rate": 5.875980095117428e-06, "loss": 1.3165, "step": 54110 }, { "epoch": 8.94360669283206, "grad_norm": 15.459141731262207, "learning_rate": 5.866798876218807e-06, "loss": 1.431, "step": 54120 }, { "epoch": 8.94525924395786, "grad_norm": 12.070756912231445, "learning_rate": 5.8576176573201865e-06, "loss": 1.3148, "step": 54130 }, { "epoch": 8.946911795083661, "grad_norm": 15.25113582611084, "learning_rate": 5.848436438421565e-06, "loss": 1.4825, "step": 54140 }, { "epoch": 8.94856434620946, "grad_norm": 14.48801326751709, "learning_rate": 5.839255219522944e-06, "loss": 1.3299, "step": 54150 }, { "epoch": 8.950216897335261, "grad_norm": 40.4534797668457, "learning_rate": 5.8300740006243234e-06, "loss": 1.2875, "step": 54160 }, { "epoch": 8.951869448461062, "grad_norm": 10.24796199798584, "learning_rate": 5.820892781725702e-06, "loss": 1.296, "step": 54170 }, { "epoch": 8.953521999586862, "grad_norm": 12.344043731689453, "learning_rate": 5.811711562827081e-06, "loss": 1.4524, "step": 54180 }, { "epoch": 8.955174550712663, "grad_norm": 16.066959381103516, "learning_rate": 5.80253034392846e-06, "loss": 1.2011, "step": 54190 }, { "epoch": 8.956827101838464, "grad_norm": 16.067623138427734, "learning_rate": 5.7933491250298396e-06, "loss": 1.4874, "step": 54200 }, { "epoch": 8.958479652964265, "grad_norm": 15.692643165588379, "learning_rate": 5.784167906131218e-06, "loss": 1.2578, "step": 54210 }, { "epoch": 8.960132204090064, "grad_norm": 29.125408172607422, "learning_rate": 5.774986687232597e-06, "loss": 1.4732, "step": 54220 }, { "epoch": 8.961784755215865, "grad_norm": 14.437169075012207, "learning_rate": 5.7658054683339765e-06, "loss": 1.3453, "step": 54230 }, { "epoch": 8.963437306341666, "grad_norm": 14.426401138305664, "learning_rate": 5.756624249435355e-06, "loss": 1.3487, "step": 54240 }, { "epoch": 8.965089857467465, "grad_norm": 17.095333099365234, "learning_rate": 5.747443030536734e-06, "loss": 1.3297, "step": 54250 }, { "epoch": 8.966742408593266, "grad_norm": 15.15270709991455, "learning_rate": 5.738261811638113e-06, "loss": 1.2081, "step": 54260 }, { "epoch": 8.968394959719067, "grad_norm": 17.099987030029297, "learning_rate": 5.729080592739492e-06, "loss": 1.2349, "step": 54270 }, { "epoch": 8.970047510844866, "grad_norm": 19.881269454956055, "learning_rate": 5.719899373840872e-06, "loss": 1.3661, "step": 54280 }, { "epoch": 8.971700061970667, "grad_norm": 21.131669998168945, "learning_rate": 5.71071815494225e-06, "loss": 1.2879, "step": 54290 }, { "epoch": 8.973352613096468, "grad_norm": 19.318918228149414, "learning_rate": 5.7015369360436295e-06, "loss": 1.2724, "step": 54300 }, { "epoch": 8.975005164222267, "grad_norm": 22.20404624938965, "learning_rate": 5.692355717145009e-06, "loss": 1.3058, "step": 54310 }, { "epoch": 8.976657715348068, "grad_norm": 9.28625202178955, "learning_rate": 5.683174498246387e-06, "loss": 1.3284, "step": 54320 }, { "epoch": 8.97831026647387, "grad_norm": 18.054636001586914, "learning_rate": 5.673993279347766e-06, "loss": 1.2838, "step": 54330 }, { "epoch": 8.97996281759967, "grad_norm": 13.409037590026855, "learning_rate": 5.664812060449146e-06, "loss": 1.2346, "step": 54340 }, { "epoch": 8.98161536872547, "grad_norm": 15.619589805603027, "learning_rate": 5.655630841550524e-06, "loss": 1.3766, "step": 54350 }, { "epoch": 8.98326791985127, "grad_norm": 11.095884323120117, "learning_rate": 5.646449622651904e-06, "loss": 1.3703, "step": 54360 }, { "epoch": 8.984920470977071, "grad_norm": 14.479853630065918, "learning_rate": 5.6372684037532826e-06, "loss": 1.3841, "step": 54370 }, { "epoch": 8.98657302210287, "grad_norm": 12.613340377807617, "learning_rate": 5.628087184854662e-06, "loss": 1.2455, "step": 54380 }, { "epoch": 8.988225573228672, "grad_norm": 9.666672706604004, "learning_rate": 5.618905965956041e-06, "loss": 1.1269, "step": 54390 }, { "epoch": 8.989878124354473, "grad_norm": 32.712371826171875, "learning_rate": 5.6097247470574194e-06, "loss": 1.4111, "step": 54400 }, { "epoch": 8.991530675480274, "grad_norm": 13.545608520507812, "learning_rate": 5.600543528158799e-06, "loss": 1.3404, "step": 54410 }, { "epoch": 8.993183226606073, "grad_norm": 12.60755729675293, "learning_rate": 5.591362309260178e-06, "loss": 1.2749, "step": 54420 }, { "epoch": 8.994835777731874, "grad_norm": 17.72956657409668, "learning_rate": 5.582181090361556e-06, "loss": 1.3757, "step": 54430 }, { "epoch": 8.996488328857675, "grad_norm": 8.734169006347656, "learning_rate": 5.572999871462936e-06, "loss": 1.2355, "step": 54440 }, { "epoch": 8.998140879983474, "grad_norm": 10.11967945098877, "learning_rate": 5.563818652564315e-06, "loss": 1.1991, "step": 54450 }, { "epoch": 8.999793431109275, "grad_norm": 13.364051818847656, "learning_rate": 5.554637433665694e-06, "loss": 1.3775, "step": 54460 }, { "epoch": 8.999958686221856, "eval_accuracy": 0.33997730174492835, "eval_loss": 2.3859617710113525, "eval_runtime": 823.2799, "eval_samples_per_second": 34.248, "eval_steps_per_second": 8.562, "step": 54461 }, { "epoch": 9.001445982235076, "grad_norm": 18.1331844329834, "learning_rate": 5.5454562147670725e-06, "loss": 1.433, "step": 54470 }, { "epoch": 9.003098533360875, "grad_norm": 19.45754051208496, "learning_rate": 5.536274995868452e-06, "loss": 1.199, "step": 54480 }, { "epoch": 9.004751084486676, "grad_norm": 6.5627970695495605, "learning_rate": 5.527093776969831e-06, "loss": 1.2614, "step": 54490 }, { "epoch": 9.006403635612477, "grad_norm": 13.369138717651367, "learning_rate": 5.517912558071209e-06, "loss": 1.3375, "step": 54500 }, { "epoch": 9.008056186738278, "grad_norm": 21.747480392456055, "learning_rate": 5.508731339172589e-06, "loss": 1.2666, "step": 54510 }, { "epoch": 9.009708737864077, "grad_norm": 13.323081970214844, "learning_rate": 5.499550120273968e-06, "loss": 1.3027, "step": 54520 }, { "epoch": 9.011361288989878, "grad_norm": 8.494196891784668, "learning_rate": 5.490368901375347e-06, "loss": 1.1626, "step": 54530 }, { "epoch": 9.01301384011568, "grad_norm": 8.807066917419434, "learning_rate": 5.481187682476726e-06, "loss": 1.142, "step": 54540 }, { "epoch": 9.014666391241478, "grad_norm": 21.88263702392578, "learning_rate": 5.472006463578105e-06, "loss": 1.3858, "step": 54550 }, { "epoch": 9.01631894236728, "grad_norm": 16.97997283935547, "learning_rate": 5.462825244679484e-06, "loss": 1.4277, "step": 54560 }, { "epoch": 9.01797149349308, "grad_norm": 17.69019889831543, "learning_rate": 5.453644025780863e-06, "loss": 1.4564, "step": 54570 }, { "epoch": 9.01962404461888, "grad_norm": 21.331026077270508, "learning_rate": 5.444462806882242e-06, "loss": 1.2666, "step": 54580 }, { "epoch": 9.02127659574468, "grad_norm": 21.188798904418945, "learning_rate": 5.435281587983621e-06, "loss": 1.3853, "step": 54590 }, { "epoch": 9.022929146870482, "grad_norm": 17.578445434570312, "learning_rate": 5.426100369085e-06, "loss": 1.2737, "step": 54600 }, { "epoch": 9.024581697996283, "grad_norm": 21.36452293395996, "learning_rate": 5.4169191501863786e-06, "loss": 1.319, "step": 54610 }, { "epoch": 9.026234249122082, "grad_norm": 23.47857666015625, "learning_rate": 5.407737931287759e-06, "loss": 1.3926, "step": 54620 }, { "epoch": 9.027886800247883, "grad_norm": 11.379927635192871, "learning_rate": 5.398556712389137e-06, "loss": 1.2875, "step": 54630 }, { "epoch": 9.029539351373684, "grad_norm": 12.81651496887207, "learning_rate": 5.3893754934905155e-06, "loss": 1.1859, "step": 54640 }, { "epoch": 9.031191902499483, "grad_norm": 13.319893836975098, "learning_rate": 5.3801942745918956e-06, "loss": 1.3343, "step": 54650 }, { "epoch": 9.032844453625284, "grad_norm": 10.670450210571289, "learning_rate": 5.371013055693274e-06, "loss": 1.3357, "step": 54660 }, { "epoch": 9.034497004751085, "grad_norm": 12.764323234558105, "learning_rate": 5.361831836794653e-06, "loss": 1.3346, "step": 54670 }, { "epoch": 9.036149555876884, "grad_norm": 11.50831127166748, "learning_rate": 5.3526506178960325e-06, "loss": 1.2738, "step": 54680 }, { "epoch": 9.037802107002685, "grad_norm": 14.689154624938965, "learning_rate": 5.343469398997411e-06, "loss": 1.3225, "step": 54690 }, { "epoch": 9.039454658128486, "grad_norm": 12.108161926269531, "learning_rate": 5.33428818009879e-06, "loss": 1.2694, "step": 54700 }, { "epoch": 9.041107209254287, "grad_norm": 12.230198860168457, "learning_rate": 5.325106961200169e-06, "loss": 1.336, "step": 54710 }, { "epoch": 9.042759760380086, "grad_norm": 16.08524513244629, "learning_rate": 5.315925742301548e-06, "loss": 1.2942, "step": 54720 }, { "epoch": 9.044412311505887, "grad_norm": 36.26041030883789, "learning_rate": 5.306744523402928e-06, "loss": 1.2881, "step": 54730 }, { "epoch": 9.046064862631688, "grad_norm": 10.63325309753418, "learning_rate": 5.297563304504306e-06, "loss": 1.533, "step": 54740 }, { "epoch": 9.047717413757487, "grad_norm": 20.543127059936523, "learning_rate": 5.2883820856056855e-06, "loss": 1.429, "step": 54750 }, { "epoch": 9.049369964883288, "grad_norm": 16.287906646728516, "learning_rate": 5.279200866707065e-06, "loss": 1.3495, "step": 54760 }, { "epoch": 9.05102251600909, "grad_norm": 18.479795455932617, "learning_rate": 5.270019647808443e-06, "loss": 1.3766, "step": 54770 }, { "epoch": 9.05267506713489, "grad_norm": 12.131267547607422, "learning_rate": 5.260838428909822e-06, "loss": 1.388, "step": 54780 }, { "epoch": 9.05432761826069, "grad_norm": 17.166824340820312, "learning_rate": 5.251657210011202e-06, "loss": 1.3733, "step": 54790 }, { "epoch": 9.05598016938649, "grad_norm": 14.11315631866455, "learning_rate": 5.24247599111258e-06, "loss": 1.3204, "step": 54800 }, { "epoch": 9.057632720512292, "grad_norm": 11.092327117919922, "learning_rate": 5.233294772213959e-06, "loss": 1.2731, "step": 54810 }, { "epoch": 9.05928527163809, "grad_norm": 11.508016586303711, "learning_rate": 5.2241135533153385e-06, "loss": 1.3763, "step": 54820 }, { "epoch": 9.060937822763892, "grad_norm": 13.264908790588379, "learning_rate": 5.214932334416718e-06, "loss": 1.1948, "step": 54830 }, { "epoch": 9.062590373889693, "grad_norm": 16.212055206298828, "learning_rate": 5.205751115518096e-06, "loss": 1.318, "step": 54840 }, { "epoch": 9.064242925015492, "grad_norm": 33.09046173095703, "learning_rate": 5.1965698966194754e-06, "loss": 1.28, "step": 54850 }, { "epoch": 9.065895476141293, "grad_norm": 12.761731147766113, "learning_rate": 5.187388677720855e-06, "loss": 1.2515, "step": 54860 }, { "epoch": 9.067548027267094, "grad_norm": 11.305936813354492, "learning_rate": 5.178207458822233e-06, "loss": 1.1792, "step": 54870 }, { "epoch": 9.069200578392895, "grad_norm": 19.43776512145996, "learning_rate": 5.169026239923612e-06, "loss": 1.3766, "step": 54880 }, { "epoch": 9.070853129518694, "grad_norm": 11.29062557220459, "learning_rate": 5.1598450210249916e-06, "loss": 1.2883, "step": 54890 }, { "epoch": 9.072505680644495, "grad_norm": 12.61872673034668, "learning_rate": 5.15066380212637e-06, "loss": 1.245, "step": 54900 }, { "epoch": 9.074158231770296, "grad_norm": 10.980611801147461, "learning_rate": 5.14148258322775e-06, "loss": 1.42, "step": 54910 }, { "epoch": 9.075810782896095, "grad_norm": 13.058303833007812, "learning_rate": 5.1323013643291285e-06, "loss": 1.21, "step": 54920 }, { "epoch": 9.077463334021896, "grad_norm": 21.789306640625, "learning_rate": 5.123120145430508e-06, "loss": 1.3207, "step": 54930 }, { "epoch": 9.079115885147697, "grad_norm": 10.95693588256836, "learning_rate": 5.113938926531887e-06, "loss": 1.3505, "step": 54940 }, { "epoch": 9.080768436273496, "grad_norm": 21.274913787841797, "learning_rate": 5.104757707633265e-06, "loss": 1.2573, "step": 54950 }, { "epoch": 9.082420987399297, "grad_norm": 17.98501205444336, "learning_rate": 5.095576488734645e-06, "loss": 1.2962, "step": 54960 }, { "epoch": 9.084073538525098, "grad_norm": 21.35386848449707, "learning_rate": 5.086395269836024e-06, "loss": 1.3419, "step": 54970 }, { "epoch": 9.0857260896509, "grad_norm": 19.58464241027832, "learning_rate": 5.077214050937402e-06, "loss": 1.4037, "step": 54980 }, { "epoch": 9.087378640776699, "grad_norm": 19.767635345458984, "learning_rate": 5.068032832038782e-06, "loss": 1.2972, "step": 54990 }, { "epoch": 9.0890311919025, "grad_norm": 10.111676216125488, "learning_rate": 5.058851613140161e-06, "loss": 1.1995, "step": 55000 }, { "epoch": 9.0906837430283, "grad_norm": 11.100005149841309, "learning_rate": 5.049670394241539e-06, "loss": 1.2342, "step": 55010 }, { "epoch": 9.0923362941541, "grad_norm": 13.454795837402344, "learning_rate": 5.040489175342919e-06, "loss": 1.2795, "step": 55020 }, { "epoch": 9.0939888452799, "grad_norm": 16.521162033081055, "learning_rate": 5.031307956444298e-06, "loss": 1.3299, "step": 55030 }, { "epoch": 9.095641396405702, "grad_norm": 12.513435363769531, "learning_rate": 5.022126737545677e-06, "loss": 1.2241, "step": 55040 }, { "epoch": 9.097293947531501, "grad_norm": 33.393882751464844, "learning_rate": 5.012945518647056e-06, "loss": 1.3831, "step": 55050 }, { "epoch": 9.098946498657302, "grad_norm": 12.040635108947754, "learning_rate": 5.0037642997484345e-06, "loss": 1.2954, "step": 55060 }, { "epoch": 9.100599049783103, "grad_norm": 16.64748764038086, "learning_rate": 4.994583080849814e-06, "loss": 1.2867, "step": 55070 }, { "epoch": 9.102251600908904, "grad_norm": 20.921030044555664, "learning_rate": 4.985401861951193e-06, "loss": 1.3136, "step": 55080 }, { "epoch": 9.103904152034703, "grad_norm": 17.621395111083984, "learning_rate": 4.9762206430525714e-06, "loss": 1.2214, "step": 55090 }, { "epoch": 9.105556703160504, "grad_norm": 13.227643013000488, "learning_rate": 4.967039424153951e-06, "loss": 1.3465, "step": 55100 }, { "epoch": 9.107209254286305, "grad_norm": 12.755581855773926, "learning_rate": 4.95785820525533e-06, "loss": 1.3946, "step": 55110 }, { "epoch": 9.108861805412104, "grad_norm": 19.701791763305664, "learning_rate": 4.948676986356709e-06, "loss": 1.2807, "step": 55120 }, { "epoch": 9.110514356537905, "grad_norm": 16.122726440429688, "learning_rate": 4.9394957674580884e-06, "loss": 1.3864, "step": 55130 }, { "epoch": 9.112166907663706, "grad_norm": 14.194814682006836, "learning_rate": 4.930314548559467e-06, "loss": 1.3725, "step": 55140 }, { "epoch": 9.113819458789507, "grad_norm": 16.94427490234375, "learning_rate": 4.921133329660846e-06, "loss": 1.311, "step": 55150 }, { "epoch": 9.115472009915306, "grad_norm": 41.79347229003906, "learning_rate": 4.911952110762225e-06, "loss": 1.498, "step": 55160 }, { "epoch": 9.117124561041107, "grad_norm": 26.90255355834961, "learning_rate": 4.902770891863604e-06, "loss": 1.2616, "step": 55170 }, { "epoch": 9.118777112166908, "grad_norm": 15.550649642944336, "learning_rate": 4.893589672964983e-06, "loss": 1.2621, "step": 55180 }, { "epoch": 9.120429663292708, "grad_norm": 25.298446655273438, "learning_rate": 4.884408454066362e-06, "loss": 1.2517, "step": 55190 }, { "epoch": 9.122082214418509, "grad_norm": 19.14631462097168, "learning_rate": 4.8752272351677415e-06, "loss": 1.353, "step": 55200 }, { "epoch": 9.12373476554431, "grad_norm": 16.164554595947266, "learning_rate": 4.86604601626912e-06, "loss": 1.2734, "step": 55210 }, { "epoch": 9.125387316670109, "grad_norm": 16.599605560302734, "learning_rate": 4.856864797370499e-06, "loss": 1.2069, "step": 55220 }, { "epoch": 9.12703986779591, "grad_norm": 22.704082489013672, "learning_rate": 4.847683578471878e-06, "loss": 1.3562, "step": 55230 }, { "epoch": 9.12869241892171, "grad_norm": 9.706205368041992, "learning_rate": 4.838502359573257e-06, "loss": 1.239, "step": 55240 }, { "epoch": 9.130344970047512, "grad_norm": 19.193674087524414, "learning_rate": 4.829321140674636e-06, "loss": 1.3882, "step": 55250 }, { "epoch": 9.13199752117331, "grad_norm": 9.436002731323242, "learning_rate": 4.820139921776015e-06, "loss": 1.3104, "step": 55260 }, { "epoch": 9.133650072299112, "grad_norm": 12.764290809631348, "learning_rate": 4.810958702877394e-06, "loss": 1.238, "step": 55270 }, { "epoch": 9.135302623424913, "grad_norm": 13.602463722229004, "learning_rate": 4.801777483978774e-06, "loss": 1.4013, "step": 55280 }, { "epoch": 9.136955174550712, "grad_norm": 18.19413185119629, "learning_rate": 4.792596265080152e-06, "loss": 1.2348, "step": 55290 }, { "epoch": 9.138607725676513, "grad_norm": 11.962725639343262, "learning_rate": 4.783415046181531e-06, "loss": 1.1874, "step": 55300 }, { "epoch": 9.140260276802314, "grad_norm": 14.87602710723877, "learning_rate": 4.774233827282911e-06, "loss": 1.3184, "step": 55310 }, { "epoch": 9.141912827928113, "grad_norm": 17.802339553833008, "learning_rate": 4.765052608384289e-06, "loss": 1.2428, "step": 55320 }, { "epoch": 9.143565379053914, "grad_norm": 23.761425018310547, "learning_rate": 4.755871389485668e-06, "loss": 1.2894, "step": 55330 }, { "epoch": 9.145217930179715, "grad_norm": 10.590774536132812, "learning_rate": 4.7466901705870475e-06, "loss": 1.2307, "step": 55340 }, { "epoch": 9.146870481305516, "grad_norm": 11.037496566772461, "learning_rate": 4.737508951688426e-06, "loss": 1.2873, "step": 55350 }, { "epoch": 9.148523032431315, "grad_norm": 14.964154243469238, "learning_rate": 4.728327732789806e-06, "loss": 1.295, "step": 55360 }, { "epoch": 9.150175583557116, "grad_norm": 15.096177101135254, "learning_rate": 4.7191465138911844e-06, "loss": 1.2046, "step": 55370 }, { "epoch": 9.151828134682917, "grad_norm": 8.96634578704834, "learning_rate": 4.709965294992563e-06, "loss": 1.382, "step": 55380 }, { "epoch": 9.153480685808717, "grad_norm": 16.687162399291992, "learning_rate": 4.700784076093943e-06, "loss": 1.317, "step": 55390 }, { "epoch": 9.155133236934518, "grad_norm": 15.783407211303711, "learning_rate": 4.691602857195321e-06, "loss": 1.3151, "step": 55400 }, { "epoch": 9.156785788060319, "grad_norm": 18.55613136291504, "learning_rate": 4.682421638296701e-06, "loss": 1.402, "step": 55410 }, { "epoch": 9.158438339186118, "grad_norm": 11.569503784179688, "learning_rate": 4.67324041939808e-06, "loss": 1.3886, "step": 55420 }, { "epoch": 9.160090890311919, "grad_norm": 49.667762756347656, "learning_rate": 4.664059200499458e-06, "loss": 1.0859, "step": 55430 }, { "epoch": 9.16174344143772, "grad_norm": 11.705397605895996, "learning_rate": 4.6548779816008375e-06, "loss": 1.2273, "step": 55440 }, { "epoch": 9.16339599256352, "grad_norm": 15.881692886352539, "learning_rate": 4.645696762702217e-06, "loss": 1.388, "step": 55450 }, { "epoch": 9.16504854368932, "grad_norm": 18.31934928894043, "learning_rate": 4.636515543803595e-06, "loss": 1.3677, "step": 55460 }, { "epoch": 9.16670109481512, "grad_norm": 14.108166694641113, "learning_rate": 4.627334324904974e-06, "loss": 1.2068, "step": 55470 }, { "epoch": 9.168353645940922, "grad_norm": 10.275944709777832, "learning_rate": 4.618153106006354e-06, "loss": 1.3015, "step": 55480 }, { "epoch": 9.170006197066721, "grad_norm": 11.239326477050781, "learning_rate": 4.608971887107733e-06, "loss": 1.3207, "step": 55490 }, { "epoch": 9.171658748192522, "grad_norm": 16.125225067138672, "learning_rate": 4.599790668209112e-06, "loss": 1.2972, "step": 55500 }, { "epoch": 9.173311299318323, "grad_norm": 15.761225700378418, "learning_rate": 4.5906094493104905e-06, "loss": 1.2291, "step": 55510 }, { "epoch": 9.174963850444122, "grad_norm": 12.649540901184082, "learning_rate": 4.58142823041187e-06, "loss": 1.2749, "step": 55520 }, { "epoch": 9.176616401569923, "grad_norm": 17.501537322998047, "learning_rate": 4.572247011513249e-06, "loss": 1.3203, "step": 55530 }, { "epoch": 9.178268952695724, "grad_norm": 18.077924728393555, "learning_rate": 4.563065792614627e-06, "loss": 1.397, "step": 55540 }, { "epoch": 9.179921503821525, "grad_norm": 16.778873443603516, "learning_rate": 4.553884573716007e-06, "loss": 1.2976, "step": 55550 }, { "epoch": 9.181574054947324, "grad_norm": 13.572968482971191, "learning_rate": 4.544703354817386e-06, "loss": 1.2331, "step": 55560 }, { "epoch": 9.183226606073125, "grad_norm": 13.623981475830078, "learning_rate": 4.535522135918765e-06, "loss": 1.3081, "step": 55570 }, { "epoch": 9.184879157198926, "grad_norm": 11.336007118225098, "learning_rate": 4.5263409170201436e-06, "loss": 1.2609, "step": 55580 }, { "epoch": 9.186531708324726, "grad_norm": 14.808525085449219, "learning_rate": 4.517159698121523e-06, "loss": 1.2853, "step": 55590 }, { "epoch": 9.188184259450527, "grad_norm": 11.729154586791992, "learning_rate": 4.507978479222902e-06, "loss": 1.3402, "step": 55600 }, { "epoch": 9.189836810576328, "grad_norm": 25.362701416015625, "learning_rate": 4.4987972603242805e-06, "loss": 1.5593, "step": 55610 }, { "epoch": 9.191489361702128, "grad_norm": 10.813699722290039, "learning_rate": 4.48961604142566e-06, "loss": 1.233, "step": 55620 }, { "epoch": 9.193141912827928, "grad_norm": 11.491537094116211, "learning_rate": 4.480434822527039e-06, "loss": 1.3878, "step": 55630 }, { "epoch": 9.194794463953729, "grad_norm": 17.993120193481445, "learning_rate": 4.471253603628417e-06, "loss": 1.2426, "step": 55640 }, { "epoch": 9.19644701507953, "grad_norm": 14.781805038452148, "learning_rate": 4.4620723847297975e-06, "loss": 1.2122, "step": 55650 }, { "epoch": 9.198099566205329, "grad_norm": 12.504164695739746, "learning_rate": 4.452891165831176e-06, "loss": 1.3508, "step": 55660 }, { "epoch": 9.19975211733113, "grad_norm": 14.548686027526855, "learning_rate": 4.443709946932554e-06, "loss": 1.3797, "step": 55670 }, { "epoch": 9.20140466845693, "grad_norm": 14.047411918640137, "learning_rate": 4.434528728033934e-06, "loss": 1.1652, "step": 55680 }, { "epoch": 9.20305721958273, "grad_norm": 27.93585205078125, "learning_rate": 4.425347509135313e-06, "loss": 1.2276, "step": 55690 }, { "epoch": 9.204709770708531, "grad_norm": 14.484628677368164, "learning_rate": 4.416166290236692e-06, "loss": 1.2675, "step": 55700 }, { "epoch": 9.206362321834332, "grad_norm": 23.827726364135742, "learning_rate": 4.406985071338071e-06, "loss": 1.306, "step": 55710 }, { "epoch": 9.208014872960133, "grad_norm": 22.634979248046875, "learning_rate": 4.39780385243945e-06, "loss": 1.4343, "step": 55720 }, { "epoch": 9.209667424085932, "grad_norm": 17.955224990844727, "learning_rate": 4.38862263354083e-06, "loss": 1.3497, "step": 55730 }, { "epoch": 9.211319975211733, "grad_norm": 12.252691268920898, "learning_rate": 4.379441414642208e-06, "loss": 1.2658, "step": 55740 }, { "epoch": 9.212972526337534, "grad_norm": 29.16194725036621, "learning_rate": 4.3702601957435865e-06, "loss": 1.1545, "step": 55750 }, { "epoch": 9.214625077463333, "grad_norm": 19.66663932800293, "learning_rate": 4.361078976844967e-06, "loss": 1.3138, "step": 55760 }, { "epoch": 9.216277628589134, "grad_norm": 11.54080867767334, "learning_rate": 4.351897757946345e-06, "loss": 1.2657, "step": 55770 }, { "epoch": 9.217930179714935, "grad_norm": 17.858362197875977, "learning_rate": 4.342716539047724e-06, "loss": 1.3622, "step": 55780 }, { "epoch": 9.219582730840735, "grad_norm": 10.797994613647461, "learning_rate": 4.3335353201491035e-06, "loss": 1.4025, "step": 55790 }, { "epoch": 9.221235281966536, "grad_norm": 12.868209838867188, "learning_rate": 4.324354101250482e-06, "loss": 1.3193, "step": 55800 }, { "epoch": 9.222887833092337, "grad_norm": 12.669856071472168, "learning_rate": 4.315172882351861e-06, "loss": 1.3091, "step": 55810 }, { "epoch": 9.224540384218137, "grad_norm": 14.927458763122559, "learning_rate": 4.3059916634532404e-06, "loss": 1.3628, "step": 55820 }, { "epoch": 9.226192935343937, "grad_norm": 13.37768840789795, "learning_rate": 4.296810444554619e-06, "loss": 1.3404, "step": 55830 }, { "epoch": 9.227845486469738, "grad_norm": 21.847307205200195, "learning_rate": 4.287629225655998e-06, "loss": 1.306, "step": 55840 }, { "epoch": 9.229498037595539, "grad_norm": 40.5103759765625, "learning_rate": 4.278448006757377e-06, "loss": 1.3563, "step": 55850 }, { "epoch": 9.231150588721338, "grad_norm": 9.23901653289795, "learning_rate": 4.2692667878587566e-06, "loss": 1.1886, "step": 55860 }, { "epoch": 9.232803139847139, "grad_norm": 21.267831802368164, "learning_rate": 4.260085568960135e-06, "loss": 1.3202, "step": 55870 }, { "epoch": 9.23445569097294, "grad_norm": 32.90507888793945, "learning_rate": 4.250904350061514e-06, "loss": 1.3244, "step": 55880 }, { "epoch": 9.23610824209874, "grad_norm": 44.84469223022461, "learning_rate": 4.2417231311628935e-06, "loss": 1.3374, "step": 55890 }, { "epoch": 9.23776079322454, "grad_norm": 21.209402084350586, "learning_rate": 4.232541912264273e-06, "loss": 1.3268, "step": 55900 }, { "epoch": 9.239413344350341, "grad_norm": 10.94561767578125, "learning_rate": 4.223360693365651e-06, "loss": 1.3878, "step": 55910 }, { "epoch": 9.241065895476142, "grad_norm": 24.05000114440918, "learning_rate": 4.21417947446703e-06, "loss": 1.2205, "step": 55920 }, { "epoch": 9.242718446601941, "grad_norm": 17.849794387817383, "learning_rate": 4.20499825556841e-06, "loss": 1.2642, "step": 55930 }, { "epoch": 9.244370997727742, "grad_norm": 22.153186798095703, "learning_rate": 4.195817036669789e-06, "loss": 1.2733, "step": 55940 }, { "epoch": 9.246023548853543, "grad_norm": 15.071115493774414, "learning_rate": 4.186635817771167e-06, "loss": 1.2561, "step": 55950 }, { "epoch": 9.247676099979342, "grad_norm": 22.48716926574707, "learning_rate": 4.1774545988725465e-06, "loss": 1.3194, "step": 55960 }, { "epoch": 9.249328651105143, "grad_norm": 18.10262680053711, "learning_rate": 4.168273379973926e-06, "loss": 1.3629, "step": 55970 }, { "epoch": 9.250981202230944, "grad_norm": 14.13875961303711, "learning_rate": 4.159092161075304e-06, "loss": 1.286, "step": 55980 }, { "epoch": 9.252633753356745, "grad_norm": 16.74396324157715, "learning_rate": 4.149910942176683e-06, "loss": 1.2687, "step": 55990 }, { "epoch": 9.254286304482545, "grad_norm": 33.51347732543945, "learning_rate": 4.140729723278063e-06, "loss": 1.4942, "step": 56000 }, { "epoch": 9.255938855608346, "grad_norm": 15.115519523620605, "learning_rate": 4.131548504379441e-06, "loss": 1.36, "step": 56010 }, { "epoch": 9.257591406734146, "grad_norm": 25.56139373779297, "learning_rate": 4.122367285480821e-06, "loss": 1.4605, "step": 56020 }, { "epoch": 9.259243957859946, "grad_norm": 20.03956413269043, "learning_rate": 4.1131860665821995e-06, "loss": 1.3127, "step": 56030 }, { "epoch": 9.260896508985747, "grad_norm": 14.782129287719727, "learning_rate": 4.104004847683579e-06, "loss": 1.4149, "step": 56040 }, { "epoch": 9.262549060111548, "grad_norm": 16.4676570892334, "learning_rate": 4.094823628784958e-06, "loss": 1.3314, "step": 56050 }, { "epoch": 9.264201611237347, "grad_norm": 18.199886322021484, "learning_rate": 4.0856424098863364e-06, "loss": 1.3737, "step": 56060 }, { "epoch": 9.265854162363148, "grad_norm": 19.445627212524414, "learning_rate": 4.076461190987716e-06, "loss": 1.3664, "step": 56070 }, { "epoch": 9.267506713488949, "grad_norm": 14.629560470581055, "learning_rate": 4.067279972089095e-06, "loss": 1.3397, "step": 56080 }, { "epoch": 9.26915926461475, "grad_norm": 22.30562400817871, "learning_rate": 4.058098753190473e-06, "loss": 1.3311, "step": 56090 }, { "epoch": 9.270811815740549, "grad_norm": 9.496840476989746, "learning_rate": 4.0489175342918534e-06, "loss": 1.1589, "step": 56100 }, { "epoch": 9.27246436686635, "grad_norm": 13.570216178894043, "learning_rate": 4.039736315393232e-06, "loss": 1.3121, "step": 56110 }, { "epoch": 9.274116917992151, "grad_norm": 23.2502384185791, "learning_rate": 4.030555096494611e-06, "loss": 1.3267, "step": 56120 }, { "epoch": 9.27576946911795, "grad_norm": 15.193344116210938, "learning_rate": 4.02137387759599e-06, "loss": 1.308, "step": 56130 }, { "epoch": 9.277422020243751, "grad_norm": 18.408641815185547, "learning_rate": 4.012192658697369e-06, "loss": 1.3684, "step": 56140 }, { "epoch": 9.279074571369552, "grad_norm": 16.26186180114746, "learning_rate": 4.003011439798748e-06, "loss": 1.3184, "step": 56150 }, { "epoch": 9.280727122495351, "grad_norm": 15.509997367858887, "learning_rate": 3.993830220900127e-06, "loss": 1.3755, "step": 56160 }, { "epoch": 9.282379673621152, "grad_norm": 10.246735572814941, "learning_rate": 3.984649002001506e-06, "loss": 1.4041, "step": 56170 }, { "epoch": 9.284032224746953, "grad_norm": 14.112131118774414, "learning_rate": 3.975467783102885e-06, "loss": 1.2511, "step": 56180 }, { "epoch": 9.285684775872754, "grad_norm": 14.345402717590332, "learning_rate": 3.966286564204264e-06, "loss": 1.3432, "step": 56190 }, { "epoch": 9.287337326998554, "grad_norm": 13.019506454467773, "learning_rate": 3.957105345305643e-06, "loss": 1.4026, "step": 56200 }, { "epoch": 9.288989878124355, "grad_norm": 10.218276023864746, "learning_rate": 3.947924126407022e-06, "loss": 1.3788, "step": 56210 }, { "epoch": 9.290642429250155, "grad_norm": 10.781867980957031, "learning_rate": 3.938742907508401e-06, "loss": 1.2091, "step": 56220 }, { "epoch": 9.292294980375955, "grad_norm": 8.732402801513672, "learning_rate": 3.92956168860978e-06, "loss": 1.2941, "step": 56230 }, { "epoch": 9.293947531501756, "grad_norm": 19.886987686157227, "learning_rate": 3.920380469711159e-06, "loss": 1.2725, "step": 56240 }, { "epoch": 9.295600082627557, "grad_norm": 13.052376747131348, "learning_rate": 3.911199250812538e-06, "loss": 1.2563, "step": 56250 }, { "epoch": 9.297252633753356, "grad_norm": 9.23311710357666, "learning_rate": 3.902018031913917e-06, "loss": 1.2777, "step": 56260 }, { "epoch": 9.298905184879157, "grad_norm": 17.674257278442383, "learning_rate": 3.8928368130152956e-06, "loss": 1.1415, "step": 56270 }, { "epoch": 9.300557736004958, "grad_norm": 13.015941619873047, "learning_rate": 3.883655594116676e-06, "loss": 1.2718, "step": 56280 }, { "epoch": 9.302210287130759, "grad_norm": 13.032365798950195, "learning_rate": 3.874474375218054e-06, "loss": 1.3148, "step": 56290 }, { "epoch": 9.303862838256558, "grad_norm": 13.440291404724121, "learning_rate": 3.865293156319433e-06, "loss": 1.2976, "step": 56300 }, { "epoch": 9.305515389382359, "grad_norm": 20.601293563842773, "learning_rate": 3.8561119374208125e-06, "loss": 1.2242, "step": 56310 }, { "epoch": 9.30716794050816, "grad_norm": 14.70266342163086, "learning_rate": 3.846930718522191e-06, "loss": 1.3573, "step": 56320 }, { "epoch": 9.30882049163396, "grad_norm": 18.35739517211914, "learning_rate": 3.83774949962357e-06, "loss": 1.2484, "step": 56330 }, { "epoch": 9.31047304275976, "grad_norm": 14.594882011413574, "learning_rate": 3.8285682807249494e-06, "loss": 1.3821, "step": 56340 }, { "epoch": 9.312125593885561, "grad_norm": 17.77569007873535, "learning_rate": 3.819387061826328e-06, "loss": 1.2919, "step": 56350 }, { "epoch": 9.313778145011362, "grad_norm": 15.527255058288574, "learning_rate": 3.8102058429277075e-06, "loss": 1.29, "step": 56360 }, { "epoch": 9.315430696137161, "grad_norm": 15.60554313659668, "learning_rate": 3.8010246240290863e-06, "loss": 1.2916, "step": 56370 }, { "epoch": 9.317083247262962, "grad_norm": 14.84151554107666, "learning_rate": 3.791843405130465e-06, "loss": 1.2193, "step": 56380 }, { "epoch": 9.318735798388763, "grad_norm": 15.908535957336426, "learning_rate": 3.7826621862318444e-06, "loss": 1.3779, "step": 56390 }, { "epoch": 9.320388349514563, "grad_norm": 20.001300811767578, "learning_rate": 3.7734809673332232e-06, "loss": 1.3879, "step": 56400 }, { "epoch": 9.322040900640364, "grad_norm": 25.633216857910156, "learning_rate": 3.764299748434602e-06, "loss": 1.4137, "step": 56410 }, { "epoch": 9.323693451766164, "grad_norm": 14.804731369018555, "learning_rate": 3.7551185295359817e-06, "loss": 1.3807, "step": 56420 }, { "epoch": 9.325346002891964, "grad_norm": 17.46161651611328, "learning_rate": 3.74593731063736e-06, "loss": 1.2958, "step": 56430 }, { "epoch": 9.326998554017765, "grad_norm": 29.73974609375, "learning_rate": 3.736756091738739e-06, "loss": 1.3127, "step": 56440 }, { "epoch": 9.328651105143566, "grad_norm": 14.454090118408203, "learning_rate": 3.7275748728401186e-06, "loss": 1.3697, "step": 56450 }, { "epoch": 9.330303656269367, "grad_norm": 16.23573875427246, "learning_rate": 3.7183936539414975e-06, "loss": 1.3834, "step": 56460 }, { "epoch": 9.331956207395166, "grad_norm": 31.776811599731445, "learning_rate": 3.7092124350428763e-06, "loss": 1.3624, "step": 56470 }, { "epoch": 9.333608758520967, "grad_norm": 17.408411026000977, "learning_rate": 3.7000312161442555e-06, "loss": 1.13, "step": 56480 }, { "epoch": 9.335261309646768, "grad_norm": 14.618858337402344, "learning_rate": 3.6908499972456343e-06, "loss": 1.2647, "step": 56490 }, { "epoch": 9.336913860772567, "grad_norm": 15.05654239654541, "learning_rate": 3.681668778347014e-06, "loss": 1.2852, "step": 56500 }, { "epoch": 9.338566411898368, "grad_norm": 12.420156478881836, "learning_rate": 3.6724875594483924e-06, "loss": 1.43, "step": 56510 }, { "epoch": 9.340218963024169, "grad_norm": 15.04156494140625, "learning_rate": 3.6633063405497712e-06, "loss": 1.207, "step": 56520 }, { "epoch": 9.341871514149968, "grad_norm": 8.728355407714844, "learning_rate": 3.654125121651151e-06, "loss": 1.2025, "step": 56530 }, { "epoch": 9.34352406527577, "grad_norm": 21.201353073120117, "learning_rate": 3.6449439027525297e-06, "loss": 1.0984, "step": 56540 }, { "epoch": 9.34517661640157, "grad_norm": 11.79217529296875, "learning_rate": 3.6357626838539086e-06, "loss": 1.2759, "step": 56550 }, { "epoch": 9.346829167527371, "grad_norm": 13.807501792907715, "learning_rate": 3.626581464955288e-06, "loss": 1.2994, "step": 56560 }, { "epoch": 9.34848171865317, "grad_norm": 19.914011001586914, "learning_rate": 3.6174002460566666e-06, "loss": 1.3291, "step": 56570 }, { "epoch": 9.350134269778971, "grad_norm": 16.1053409576416, "learning_rate": 3.6082190271580455e-06, "loss": 1.2639, "step": 56580 }, { "epoch": 9.351786820904772, "grad_norm": 17.01410484313965, "learning_rate": 3.5990378082594247e-06, "loss": 1.4563, "step": 56590 }, { "epoch": 9.353439372030572, "grad_norm": 12.761663436889648, "learning_rate": 3.5898565893608035e-06, "loss": 1.4376, "step": 56600 }, { "epoch": 9.355091923156373, "grad_norm": 12.494538307189941, "learning_rate": 3.5806753704621824e-06, "loss": 1.4416, "step": 56610 }, { "epoch": 9.356744474282173, "grad_norm": 15.235856056213379, "learning_rate": 3.571494151563562e-06, "loss": 1.2971, "step": 56620 }, { "epoch": 9.358397025407974, "grad_norm": 22.249408721923828, "learning_rate": 3.562312932664941e-06, "loss": 1.3123, "step": 56630 }, { "epoch": 9.360049576533774, "grad_norm": 13.691697120666504, "learning_rate": 3.5531317137663197e-06, "loss": 1.4081, "step": 56640 }, { "epoch": 9.361702127659575, "grad_norm": 14.822437286376953, "learning_rate": 3.543950494867699e-06, "loss": 1.2504, "step": 56650 }, { "epoch": 9.363354678785376, "grad_norm": 13.317008018493652, "learning_rate": 3.5347692759690777e-06, "loss": 1.2689, "step": 56660 }, { "epoch": 9.365007229911175, "grad_norm": 19.28122901916504, "learning_rate": 3.5255880570704566e-06, "loss": 1.3562, "step": 56670 }, { "epoch": 9.366659781036976, "grad_norm": 14.012068748474121, "learning_rate": 3.516406838171836e-06, "loss": 1.3528, "step": 56680 }, { "epoch": 9.368312332162777, "grad_norm": 16.407472610473633, "learning_rate": 3.5072256192732146e-06, "loss": 1.1581, "step": 56690 }, { "epoch": 9.369964883288576, "grad_norm": 13.19459342956543, "learning_rate": 3.4980444003745943e-06, "loss": 1.2654, "step": 56700 }, { "epoch": 9.371617434414377, "grad_norm": 11.365413665771484, "learning_rate": 3.488863181475973e-06, "loss": 1.3031, "step": 56710 }, { "epoch": 9.373269985540178, "grad_norm": 11.131091117858887, "learning_rate": 3.479681962577352e-06, "loss": 1.3596, "step": 56720 }, { "epoch": 9.374922536665977, "grad_norm": 13.381197929382324, "learning_rate": 3.470500743678731e-06, "loss": 1.3861, "step": 56730 }, { "epoch": 9.376575087791778, "grad_norm": 9.68941879272461, "learning_rate": 3.46131952478011e-06, "loss": 1.3179, "step": 56740 }, { "epoch": 9.37822763891758, "grad_norm": 15.212055206298828, "learning_rate": 3.452138305881489e-06, "loss": 1.2758, "step": 56750 }, { "epoch": 9.37988019004338, "grad_norm": 17.098337173461914, "learning_rate": 3.442957086982868e-06, "loss": 1.1934, "step": 56760 }, { "epoch": 9.38153274116918, "grad_norm": 10.168761253356934, "learning_rate": 3.433775868084247e-06, "loss": 1.3831, "step": 56770 }, { "epoch": 9.38318529229498, "grad_norm": 50.241641998291016, "learning_rate": 3.4245946491856258e-06, "loss": 1.4488, "step": 56780 }, { "epoch": 9.384837843420781, "grad_norm": 18.76384162902832, "learning_rate": 3.4154134302870054e-06, "loss": 1.3276, "step": 56790 }, { "epoch": 9.38649039454658, "grad_norm": 16.12409782409668, "learning_rate": 3.4062322113883842e-06, "loss": 1.2093, "step": 56800 }, { "epoch": 9.388142945672382, "grad_norm": 16.562042236328125, "learning_rate": 3.3970509924897626e-06, "loss": 1.1709, "step": 56810 }, { "epoch": 9.389795496798182, "grad_norm": 26.49726104736328, "learning_rate": 3.3878697735911423e-06, "loss": 1.3279, "step": 56820 }, { "epoch": 9.391448047923983, "grad_norm": 13.345320701599121, "learning_rate": 3.378688554692521e-06, "loss": 1.2046, "step": 56830 }, { "epoch": 9.393100599049783, "grad_norm": 16.027231216430664, "learning_rate": 3.3695073357939e-06, "loss": 1.3354, "step": 56840 }, { "epoch": 9.394753150175584, "grad_norm": 10.626627922058105, "learning_rate": 3.3603261168952792e-06, "loss": 1.168, "step": 56850 }, { "epoch": 9.396405701301385, "grad_norm": 18.5012264251709, "learning_rate": 3.351144897996658e-06, "loss": 1.4278, "step": 56860 }, { "epoch": 9.398058252427184, "grad_norm": 21.47645378112793, "learning_rate": 3.341963679098037e-06, "loss": 1.3033, "step": 56870 }, { "epoch": 9.399710803552985, "grad_norm": 17.78287696838379, "learning_rate": 3.3327824601994165e-06, "loss": 1.2744, "step": 56880 }, { "epoch": 9.401363354678786, "grad_norm": 14.139810562133789, "learning_rate": 3.323601241300795e-06, "loss": 1.4479, "step": 56890 }, { "epoch": 9.403015905804585, "grad_norm": 28.748075485229492, "learning_rate": 3.3144200224021746e-06, "loss": 1.2902, "step": 56900 }, { "epoch": 9.404668456930386, "grad_norm": 46.95508575439453, "learning_rate": 3.3052388035035534e-06, "loss": 1.4943, "step": 56910 }, { "epoch": 9.406321008056187, "grad_norm": 15.258516311645508, "learning_rate": 3.2960575846049323e-06, "loss": 1.1904, "step": 56920 }, { "epoch": 9.407973559181988, "grad_norm": 23.977991104125977, "learning_rate": 3.2868763657063115e-06, "loss": 1.2751, "step": 56930 }, { "epoch": 9.409626110307787, "grad_norm": 12.30024242401123, "learning_rate": 3.2776951468076903e-06, "loss": 1.2913, "step": 56940 }, { "epoch": 9.411278661433588, "grad_norm": 11.82300853729248, "learning_rate": 3.268513927909069e-06, "loss": 1.4331, "step": 56950 }, { "epoch": 9.41293121255939, "grad_norm": 17.451065063476562, "learning_rate": 3.259332709010449e-06, "loss": 1.2351, "step": 56960 }, { "epoch": 9.414583763685188, "grad_norm": 13.529333114624023, "learning_rate": 3.2501514901118272e-06, "loss": 1.1954, "step": 56970 }, { "epoch": 9.41623631481099, "grad_norm": 14.515869140625, "learning_rate": 3.240970271213206e-06, "loss": 1.3682, "step": 56980 }, { "epoch": 9.41788886593679, "grad_norm": 11.694517135620117, "learning_rate": 3.2317890523145857e-06, "loss": 1.3659, "step": 56990 }, { "epoch": 9.41954141706259, "grad_norm": 27.488210678100586, "learning_rate": 3.2226078334159645e-06, "loss": 1.1186, "step": 57000 }, { "epoch": 9.42119396818839, "grad_norm": 15.681142807006836, "learning_rate": 3.2134266145173434e-06, "loss": 1.2438, "step": 57010 }, { "epoch": 9.422846519314191, "grad_norm": 19.548660278320312, "learning_rate": 3.2042453956187226e-06, "loss": 1.3773, "step": 57020 }, { "epoch": 9.424499070439992, "grad_norm": 23.549972534179688, "learning_rate": 3.1950641767201014e-06, "loss": 1.308, "step": 57030 }, { "epoch": 9.426151621565792, "grad_norm": 19.313648223876953, "learning_rate": 3.1858829578214803e-06, "loss": 1.2348, "step": 57040 }, { "epoch": 9.427804172691593, "grad_norm": 32.71291732788086, "learning_rate": 3.1767017389228595e-06, "loss": 1.1344, "step": 57050 }, { "epoch": 9.429456723817394, "grad_norm": 14.130952835083008, "learning_rate": 3.1675205200242383e-06, "loss": 1.3653, "step": 57060 }, { "epoch": 9.431109274943193, "grad_norm": 17.445035934448242, "learning_rate": 3.158339301125618e-06, "loss": 1.3794, "step": 57070 }, { "epoch": 9.432761826068994, "grad_norm": 14.413025856018066, "learning_rate": 3.149158082226997e-06, "loss": 1.3055, "step": 57080 }, { "epoch": 9.434414377194795, "grad_norm": 17.203859329223633, "learning_rate": 3.1399768633283757e-06, "loss": 1.3073, "step": 57090 }, { "epoch": 9.436066928320596, "grad_norm": 13.873775482177734, "learning_rate": 3.130795644429755e-06, "loss": 1.2085, "step": 57100 }, { "epoch": 9.437719479446395, "grad_norm": 15.375999450683594, "learning_rate": 3.1216144255311337e-06, "loss": 1.326, "step": 57110 }, { "epoch": 9.439372030572196, "grad_norm": 17.9966983795166, "learning_rate": 3.112433206632513e-06, "loss": 1.2816, "step": 57120 }, { "epoch": 9.441024581697997, "grad_norm": 19.590351104736328, "learning_rate": 3.103251987733892e-06, "loss": 1.2491, "step": 57130 }, { "epoch": 9.442677132823796, "grad_norm": 19.17728614807129, "learning_rate": 3.0940707688352706e-06, "loss": 1.3321, "step": 57140 }, { "epoch": 9.444329683949597, "grad_norm": 17.96892547607422, "learning_rate": 3.08488954993665e-06, "loss": 1.2419, "step": 57150 }, { "epoch": 9.445982235075398, "grad_norm": 19.752483367919922, "learning_rate": 3.0757083310380287e-06, "loss": 1.3689, "step": 57160 }, { "epoch": 9.447634786201197, "grad_norm": 16.997968673706055, "learning_rate": 3.066527112139408e-06, "loss": 1.3388, "step": 57170 }, { "epoch": 9.449287337326998, "grad_norm": 13.03810977935791, "learning_rate": 3.0573458932407868e-06, "loss": 1.2395, "step": 57180 }, { "epoch": 9.4509398884528, "grad_norm": 21.81022071838379, "learning_rate": 3.0481646743421656e-06, "loss": 1.2308, "step": 57190 }, { "epoch": 9.4525924395786, "grad_norm": 19.233882904052734, "learning_rate": 3.038983455443545e-06, "loss": 1.135, "step": 57200 }, { "epoch": 9.4542449907044, "grad_norm": 19.100370407104492, "learning_rate": 3.029802236544924e-06, "loss": 1.1726, "step": 57210 }, { "epoch": 9.4558975418302, "grad_norm": 10.6184720993042, "learning_rate": 3.020621017646303e-06, "loss": 1.2419, "step": 57220 }, { "epoch": 9.457550092956001, "grad_norm": 23.119884490966797, "learning_rate": 3.0114397987476817e-06, "loss": 1.253, "step": 57230 }, { "epoch": 9.4592026440818, "grad_norm": 15.129219055175781, "learning_rate": 3.002258579849061e-06, "loss": 1.3053, "step": 57240 }, { "epoch": 9.460855195207602, "grad_norm": 16.50132179260254, "learning_rate": 2.9930773609504402e-06, "loss": 1.2244, "step": 57250 }, { "epoch": 9.462507746333403, "grad_norm": 12.94861125946045, "learning_rate": 2.9838961420518186e-06, "loss": 1.3387, "step": 57260 }, { "epoch": 9.464160297459202, "grad_norm": 17.04841423034668, "learning_rate": 2.974714923153198e-06, "loss": 1.3652, "step": 57270 }, { "epoch": 9.465812848585003, "grad_norm": 10.876708984375, "learning_rate": 2.965533704254577e-06, "loss": 1.2098, "step": 57280 }, { "epoch": 9.467465399710804, "grad_norm": 11.755070686340332, "learning_rate": 2.956352485355956e-06, "loss": 1.299, "step": 57290 }, { "epoch": 9.469117950836605, "grad_norm": 10.822213172912598, "learning_rate": 2.9471712664573348e-06, "loss": 1.2336, "step": 57300 }, { "epoch": 9.470770501962404, "grad_norm": 19.71744155883789, "learning_rate": 2.937990047558714e-06, "loss": 1.3242, "step": 57310 }, { "epoch": 9.472423053088205, "grad_norm": 21.902231216430664, "learning_rate": 2.9288088286600933e-06, "loss": 1.4042, "step": 57320 }, { "epoch": 9.474075604214006, "grad_norm": 28.398181915283203, "learning_rate": 2.919627609761472e-06, "loss": 1.292, "step": 57330 }, { "epoch": 9.475728155339805, "grad_norm": 17.9105281829834, "learning_rate": 2.910446390862851e-06, "loss": 1.4072, "step": 57340 }, { "epoch": 9.477380706465606, "grad_norm": 24.03833770751953, "learning_rate": 2.90126517196423e-06, "loss": 1.3059, "step": 57350 }, { "epoch": 9.479033257591407, "grad_norm": 18.128108978271484, "learning_rate": 2.892083953065609e-06, "loss": 1.3698, "step": 57360 }, { "epoch": 9.480685808717206, "grad_norm": 15.403632164001465, "learning_rate": 2.8829027341669882e-06, "loss": 1.2558, "step": 57370 }, { "epoch": 9.482338359843007, "grad_norm": 12.787527084350586, "learning_rate": 2.873721515268367e-06, "loss": 1.3369, "step": 57380 }, { "epoch": 9.483990910968808, "grad_norm": 13.58300495147705, "learning_rate": 2.864540296369746e-06, "loss": 1.2305, "step": 57390 }, { "epoch": 9.48564346209461, "grad_norm": 8.516387939453125, "learning_rate": 2.855359077471125e-06, "loss": 1.184, "step": 57400 }, { "epoch": 9.487296013220408, "grad_norm": 12.774700164794922, "learning_rate": 2.8461778585725044e-06, "loss": 1.3685, "step": 57410 }, { "epoch": 9.48894856434621, "grad_norm": 18.586366653442383, "learning_rate": 2.836996639673883e-06, "loss": 1.3613, "step": 57420 }, { "epoch": 9.49060111547201, "grad_norm": 19.410852432250977, "learning_rate": 2.827815420775262e-06, "loss": 1.3254, "step": 57430 }, { "epoch": 9.49225366659781, "grad_norm": 37.648040771484375, "learning_rate": 2.8186342018766413e-06, "loss": 1.3388, "step": 57440 }, { "epoch": 9.49390621772361, "grad_norm": 11.228830337524414, "learning_rate": 2.8094529829780205e-06, "loss": 1.2954, "step": 57450 }, { "epoch": 9.495558768849412, "grad_norm": 17.186487197875977, "learning_rate": 2.8002717640793993e-06, "loss": 1.3855, "step": 57460 }, { "epoch": 9.49721131997521, "grad_norm": 14.050568580627441, "learning_rate": 2.791090545180778e-06, "loss": 1.3872, "step": 57470 }, { "epoch": 9.498863871101012, "grad_norm": 57.6312370300293, "learning_rate": 2.7819093262821574e-06, "loss": 1.3453, "step": 57480 }, { "epoch": 9.500516422226813, "grad_norm": 16.523967742919922, "learning_rate": 2.7727281073835362e-06, "loss": 1.3147, "step": 57490 }, { "epoch": 9.502168973352614, "grad_norm": 18.444759368896484, "learning_rate": 2.7635468884849155e-06, "loss": 1.3901, "step": 57500 }, { "epoch": 9.503821524478413, "grad_norm": 10.994654655456543, "learning_rate": 2.7543656695862943e-06, "loss": 1.3167, "step": 57510 }, { "epoch": 9.505474075604214, "grad_norm": 12.720823287963867, "learning_rate": 2.7451844506876736e-06, "loss": 1.2899, "step": 57520 }, { "epoch": 9.507126626730015, "grad_norm": 16.59524917602539, "learning_rate": 2.7360032317890524e-06, "loss": 1.25, "step": 57530 }, { "epoch": 9.508779177855814, "grad_norm": 14.175232887268066, "learning_rate": 2.7268220128904316e-06, "loss": 1.3248, "step": 57540 }, { "epoch": 9.510431728981615, "grad_norm": 16.468807220458984, "learning_rate": 2.7176407939918105e-06, "loss": 1.3374, "step": 57550 }, { "epoch": 9.512084280107416, "grad_norm": 26.731430053710938, "learning_rate": 2.7084595750931893e-06, "loss": 1.2489, "step": 57560 }, { "epoch": 9.513736831233217, "grad_norm": 9.305521011352539, "learning_rate": 2.6992783561945685e-06, "loss": 1.3049, "step": 57570 }, { "epoch": 9.515389382359016, "grad_norm": 12.240220069885254, "learning_rate": 2.6900971372959478e-06, "loss": 1.3189, "step": 57580 }, { "epoch": 9.517041933484817, "grad_norm": 76.72401428222656, "learning_rate": 2.6809159183973266e-06, "loss": 1.3185, "step": 57590 }, { "epoch": 9.518694484610618, "grad_norm": 13.349190711975098, "learning_rate": 2.6717346994987054e-06, "loss": 1.4297, "step": 57600 }, { "epoch": 9.520347035736417, "grad_norm": 21.3134765625, "learning_rate": 2.6625534806000847e-06, "loss": 1.1601, "step": 57610 }, { "epoch": 9.521999586862218, "grad_norm": 11.682439804077148, "learning_rate": 2.653372261701464e-06, "loss": 1.2994, "step": 57620 }, { "epoch": 9.52365213798802, "grad_norm": 17.29874038696289, "learning_rate": 2.6441910428028427e-06, "loss": 1.3704, "step": 57630 }, { "epoch": 9.525304689113819, "grad_norm": 9.633611679077148, "learning_rate": 2.6350098239042216e-06, "loss": 1.2424, "step": 57640 }, { "epoch": 9.52695724023962, "grad_norm": 73.63542175292969, "learning_rate": 2.625828605005601e-06, "loss": 1.3749, "step": 57650 }, { "epoch": 9.52860979136542, "grad_norm": 20.629478454589844, "learning_rate": 2.6166473861069796e-06, "loss": 1.2985, "step": 57660 }, { "epoch": 9.530262342491222, "grad_norm": 15.100749969482422, "learning_rate": 2.607466167208359e-06, "loss": 1.272, "step": 57670 }, { "epoch": 9.53191489361702, "grad_norm": 16.76715850830078, "learning_rate": 2.5982849483097377e-06, "loss": 1.283, "step": 57680 }, { "epoch": 9.533567444742822, "grad_norm": 23.314260482788086, "learning_rate": 2.5891037294111165e-06, "loss": 1.3745, "step": 57690 }, { "epoch": 9.535219995868623, "grad_norm": 25.0333194732666, "learning_rate": 2.5799225105124958e-06, "loss": 1.1921, "step": 57700 }, { "epoch": 9.536872546994422, "grad_norm": 14.260005950927734, "learning_rate": 2.570741291613875e-06, "loss": 1.2287, "step": 57710 }, { "epoch": 9.538525098120223, "grad_norm": 20.597248077392578, "learning_rate": 2.561560072715254e-06, "loss": 1.3879, "step": 57720 }, { "epoch": 9.540177649246024, "grad_norm": 15.62580680847168, "learning_rate": 2.5523788538166327e-06, "loss": 1.2893, "step": 57730 }, { "epoch": 9.541830200371823, "grad_norm": 11.68388557434082, "learning_rate": 2.543197634918012e-06, "loss": 1.2051, "step": 57740 }, { "epoch": 9.543482751497624, "grad_norm": 15.208742141723633, "learning_rate": 2.534016416019391e-06, "loss": 1.3074, "step": 57750 }, { "epoch": 9.545135302623425, "grad_norm": 15.40304946899414, "learning_rate": 2.5248351971207696e-06, "loss": 1.3454, "step": 57760 }, { "epoch": 9.546787853749226, "grad_norm": 21.844449996948242, "learning_rate": 2.515653978222149e-06, "loss": 1.1692, "step": 57770 }, { "epoch": 9.548440404875025, "grad_norm": 16.35841178894043, "learning_rate": 2.506472759323528e-06, "loss": 1.386, "step": 57780 }, { "epoch": 9.550092956000826, "grad_norm": 15.275668144226074, "learning_rate": 2.497291540424907e-06, "loss": 1.3673, "step": 57790 }, { "epoch": 9.551745507126627, "grad_norm": 13.76074504852295, "learning_rate": 2.4881103215262857e-06, "loss": 1.3039, "step": 57800 }, { "epoch": 9.553398058252426, "grad_norm": 14.345163345336914, "learning_rate": 2.478929102627665e-06, "loss": 1.4154, "step": 57810 }, { "epoch": 9.555050609378227, "grad_norm": 12.346527099609375, "learning_rate": 2.4697478837290442e-06, "loss": 1.2726, "step": 57820 }, { "epoch": 9.556703160504028, "grad_norm": 19.1199893951416, "learning_rate": 2.460566664830423e-06, "loss": 1.263, "step": 57830 }, { "epoch": 9.55835571162983, "grad_norm": 14.129487037658691, "learning_rate": 2.451385445931802e-06, "loss": 1.3791, "step": 57840 }, { "epoch": 9.560008262755629, "grad_norm": 15.209273338317871, "learning_rate": 2.442204227033181e-06, "loss": 1.2314, "step": 57850 }, { "epoch": 9.56166081388143, "grad_norm": 15.329292297363281, "learning_rate": 2.43302300813456e-06, "loss": 1.2872, "step": 57860 }, { "epoch": 9.56331336500723, "grad_norm": 16.666093826293945, "learning_rate": 2.423841789235939e-06, "loss": 1.3597, "step": 57870 }, { "epoch": 9.56496591613303, "grad_norm": 13.654024124145508, "learning_rate": 2.414660570337318e-06, "loss": 1.2514, "step": 57880 }, { "epoch": 9.56661846725883, "grad_norm": 12.923748016357422, "learning_rate": 2.405479351438697e-06, "loss": 1.1992, "step": 57890 }, { "epoch": 9.568271018384632, "grad_norm": 12.820647239685059, "learning_rate": 2.396298132540076e-06, "loss": 1.3571, "step": 57900 }, { "epoch": 9.569923569510431, "grad_norm": 16.327159881591797, "learning_rate": 2.3871169136414553e-06, "loss": 1.3175, "step": 57910 }, { "epoch": 9.571576120636232, "grad_norm": 24.59794807434082, "learning_rate": 2.377935694742834e-06, "loss": 1.3611, "step": 57920 }, { "epoch": 9.573228671762033, "grad_norm": 12.740336418151855, "learning_rate": 2.368754475844213e-06, "loss": 1.344, "step": 57930 }, { "epoch": 9.574881222887832, "grad_norm": 17.685501098632812, "learning_rate": 2.3595732569455922e-06, "loss": 1.394, "step": 57940 }, { "epoch": 9.576533774013633, "grad_norm": 19.866283416748047, "learning_rate": 2.3503920380469715e-06, "loss": 1.1501, "step": 57950 }, { "epoch": 9.578186325139434, "grad_norm": 14.622546195983887, "learning_rate": 2.3412108191483503e-06, "loss": 1.2882, "step": 57960 }, { "epoch": 9.579838876265235, "grad_norm": 16.59194564819336, "learning_rate": 2.332029600249729e-06, "loss": 1.3115, "step": 57970 }, { "epoch": 9.581491427391034, "grad_norm": 18.110830307006836, "learning_rate": 2.3228483813511084e-06, "loss": 1.2563, "step": 57980 }, { "epoch": 9.583143978516835, "grad_norm": 14.122636795043945, "learning_rate": 2.313667162452487e-06, "loss": 1.3451, "step": 57990 }, { "epoch": 9.584796529642636, "grad_norm": 14.215288162231445, "learning_rate": 2.3044859435538664e-06, "loss": 1.2917, "step": 58000 }, { "epoch": 9.586449080768435, "grad_norm": 100.20287322998047, "learning_rate": 2.2953047246552453e-06, "loss": 1.2025, "step": 58010 }, { "epoch": 9.588101631894236, "grad_norm": 12.98987102508545, "learning_rate": 2.2861235057566245e-06, "loss": 1.2286, "step": 58020 }, { "epoch": 9.589754183020037, "grad_norm": 17.084440231323242, "learning_rate": 2.2769422868580033e-06, "loss": 1.2378, "step": 58030 }, { "epoch": 9.591406734145838, "grad_norm": 17.176267623901367, "learning_rate": 2.2677610679593826e-06, "loss": 1.4078, "step": 58040 }, { "epoch": 9.593059285271638, "grad_norm": 38.53559112548828, "learning_rate": 2.2585798490607614e-06, "loss": 1.3422, "step": 58050 }, { "epoch": 9.594711836397439, "grad_norm": 11.070537567138672, "learning_rate": 2.2493986301621402e-06, "loss": 1.3479, "step": 58060 }, { "epoch": 9.59636438752324, "grad_norm": 19.474384307861328, "learning_rate": 2.2402174112635195e-06, "loss": 1.2747, "step": 58070 }, { "epoch": 9.598016938649039, "grad_norm": 15.456579208374023, "learning_rate": 2.2310361923648987e-06, "loss": 1.3125, "step": 58080 }, { "epoch": 9.59966948977484, "grad_norm": 15.075743675231934, "learning_rate": 2.221854973466277e-06, "loss": 1.3164, "step": 58090 }, { "epoch": 9.60132204090064, "grad_norm": 25.315460205078125, "learning_rate": 2.2126737545676564e-06, "loss": 1.347, "step": 58100 }, { "epoch": 9.60297459202644, "grad_norm": 21.648412704467773, "learning_rate": 2.2034925356690356e-06, "loss": 1.332, "step": 58110 }, { "epoch": 9.604627143152241, "grad_norm": 15.424687385559082, "learning_rate": 2.194311316770415e-06, "loss": 1.3377, "step": 58120 }, { "epoch": 9.606279694278042, "grad_norm": 36.263980865478516, "learning_rate": 2.1851300978717933e-06, "loss": 1.3506, "step": 58130 }, { "epoch": 9.607932245403843, "grad_norm": 28.31026840209961, "learning_rate": 2.1759488789731725e-06, "loss": 1.2666, "step": 58140 }, { "epoch": 9.609584796529642, "grad_norm": 17.763492584228516, "learning_rate": 2.1667676600745518e-06, "loss": 1.3172, "step": 58150 }, { "epoch": 9.611237347655443, "grad_norm": 16.161304473876953, "learning_rate": 2.1575864411759306e-06, "loss": 1.2002, "step": 58160 }, { "epoch": 9.612889898781244, "grad_norm": 15.862537384033203, "learning_rate": 2.1484052222773094e-06, "loss": 1.4035, "step": 58170 }, { "epoch": 9.614542449907043, "grad_norm": 13.134415626525879, "learning_rate": 2.1392240033786887e-06, "loss": 1.2955, "step": 58180 }, { "epoch": 9.616195001032844, "grad_norm": 14.605551719665527, "learning_rate": 2.1300427844800675e-06, "loss": 1.4105, "step": 58190 }, { "epoch": 9.617847552158645, "grad_norm": 18.962343215942383, "learning_rate": 2.1208615655814467e-06, "loss": 1.2919, "step": 58200 }, { "epoch": 9.619500103284444, "grad_norm": 13.857765197753906, "learning_rate": 2.1116803466828256e-06, "loss": 1.2513, "step": 58210 }, { "epoch": 9.621152654410245, "grad_norm": 23.394207000732422, "learning_rate": 2.102499127784205e-06, "loss": 1.3572, "step": 58220 }, { "epoch": 9.622805205536046, "grad_norm": 14.980634689331055, "learning_rate": 2.0933179088855836e-06, "loss": 1.2912, "step": 58230 }, { "epoch": 9.624457756661847, "grad_norm": 20.3941593170166, "learning_rate": 2.084136689986963e-06, "loss": 1.3388, "step": 58240 }, { "epoch": 9.626110307787647, "grad_norm": 14.584732055664062, "learning_rate": 2.0749554710883417e-06, "loss": 1.252, "step": 58250 }, { "epoch": 9.627762858913448, "grad_norm": 23.42914581298828, "learning_rate": 2.0657742521897205e-06, "loss": 1.2462, "step": 58260 }, { "epoch": 9.629415410039249, "grad_norm": 10.313939094543457, "learning_rate": 2.0565930332910998e-06, "loss": 1.3573, "step": 58270 }, { "epoch": 9.631067961165048, "grad_norm": 11.711010932922363, "learning_rate": 2.047411814392479e-06, "loss": 1.3872, "step": 58280 }, { "epoch": 9.632720512290849, "grad_norm": 16.497787475585938, "learning_rate": 2.038230595493858e-06, "loss": 1.3352, "step": 58290 }, { "epoch": 9.63437306341665, "grad_norm": 9.749482154846191, "learning_rate": 2.0290493765952367e-06, "loss": 1.2596, "step": 58300 }, { "epoch": 9.63602561454245, "grad_norm": 27.247074127197266, "learning_rate": 2.019868157696616e-06, "loss": 1.1729, "step": 58310 }, { "epoch": 9.63767816566825, "grad_norm": 28.22972869873047, "learning_rate": 2.010686938797995e-06, "loss": 1.2017, "step": 58320 }, { "epoch": 9.639330716794051, "grad_norm": 16.79508399963379, "learning_rate": 2.001505719899374e-06, "loss": 1.2413, "step": 58330 }, { "epoch": 9.640983267919852, "grad_norm": 15.924692153930664, "learning_rate": 1.992324501000753e-06, "loss": 1.2305, "step": 58340 }, { "epoch": 9.642635819045651, "grad_norm": 12.7982816696167, "learning_rate": 1.983143282102132e-06, "loss": 1.3867, "step": 58350 }, { "epoch": 9.644288370171452, "grad_norm": 15.45654296875, "learning_rate": 1.973962063203511e-06, "loss": 1.2799, "step": 58360 }, { "epoch": 9.645940921297253, "grad_norm": 16.585647583007812, "learning_rate": 1.96478084430489e-06, "loss": 1.3226, "step": 58370 }, { "epoch": 9.647593472423052, "grad_norm": 23.49603843688965, "learning_rate": 1.955599625406269e-06, "loss": 1.2294, "step": 58380 }, { "epoch": 9.649246023548853, "grad_norm": 15.694708824157715, "learning_rate": 1.9464184065076478e-06, "loss": 1.1692, "step": 58390 }, { "epoch": 9.650898574674654, "grad_norm": 13.695332527160645, "learning_rate": 1.937237187609027e-06, "loss": 1.2909, "step": 58400 }, { "epoch": 9.652551125800455, "grad_norm": 9.972579002380371, "learning_rate": 1.9280559687104063e-06, "loss": 1.2341, "step": 58410 }, { "epoch": 9.654203676926254, "grad_norm": 14.860621452331543, "learning_rate": 1.918874749811785e-06, "loss": 1.2629, "step": 58420 }, { "epoch": 9.655856228052055, "grad_norm": 11.948851585388184, "learning_rate": 1.909693530913164e-06, "loss": 1.2555, "step": 58430 }, { "epoch": 9.657508779177856, "grad_norm": 22.112714767456055, "learning_rate": 1.9005123120145432e-06, "loss": 1.3156, "step": 58440 }, { "epoch": 9.659161330303656, "grad_norm": 24.64064598083496, "learning_rate": 1.8913310931159222e-06, "loss": 1.4058, "step": 58450 }, { "epoch": 9.660813881429457, "grad_norm": 14.680100440979004, "learning_rate": 1.882149874217301e-06, "loss": 1.357, "step": 58460 }, { "epoch": 9.662466432555258, "grad_norm": 13.727468490600586, "learning_rate": 1.87296865531868e-06, "loss": 1.3138, "step": 58470 }, { "epoch": 9.664118983681057, "grad_norm": 17.917619705200195, "learning_rate": 1.8637874364200593e-06, "loss": 1.286, "step": 58480 }, { "epoch": 9.665771534806858, "grad_norm": 13.282672882080078, "learning_rate": 1.8546062175214381e-06, "loss": 1.253, "step": 58490 }, { "epoch": 9.667424085932659, "grad_norm": 13.979046821594238, "learning_rate": 1.8454249986228172e-06, "loss": 1.2243, "step": 58500 }, { "epoch": 9.66907663705846, "grad_norm": 16.821653366088867, "learning_rate": 1.8362437797241962e-06, "loss": 1.2361, "step": 58510 }, { "epoch": 9.670729188184259, "grad_norm": 14.560995101928711, "learning_rate": 1.8270625608255755e-06, "loss": 1.2431, "step": 58520 }, { "epoch": 9.67238173931006, "grad_norm": 15.146513938903809, "learning_rate": 1.8178813419269543e-06, "loss": 1.1976, "step": 58530 }, { "epoch": 9.674034290435861, "grad_norm": 15.486896514892578, "learning_rate": 1.8087001230283333e-06, "loss": 1.3143, "step": 58540 }, { "epoch": 9.67568684156166, "grad_norm": 18.165647506713867, "learning_rate": 1.7995189041297124e-06, "loss": 1.2991, "step": 58550 }, { "epoch": 9.677339392687461, "grad_norm": 17.75432586669922, "learning_rate": 1.7903376852310912e-06, "loss": 1.1735, "step": 58560 }, { "epoch": 9.678991943813262, "grad_norm": 17.992963790893555, "learning_rate": 1.7811564663324704e-06, "loss": 1.3001, "step": 58570 }, { "epoch": 9.680644494939063, "grad_norm": 65.19438171386719, "learning_rate": 1.7719752474338495e-06, "loss": 1.3303, "step": 58580 }, { "epoch": 9.682297046064862, "grad_norm": 18.94664192199707, "learning_rate": 1.7627940285352283e-06, "loss": 1.2772, "step": 58590 }, { "epoch": 9.683949597190663, "grad_norm": 22.916940689086914, "learning_rate": 1.7536128096366073e-06, "loss": 1.3774, "step": 58600 }, { "epoch": 9.685602148316464, "grad_norm": 17.095354080200195, "learning_rate": 1.7444315907379866e-06, "loss": 1.2863, "step": 58610 }, { "epoch": 9.687254699442263, "grad_norm": 12.96331787109375, "learning_rate": 1.7352503718393656e-06, "loss": 1.4177, "step": 58620 }, { "epoch": 9.688907250568064, "grad_norm": 20.66131019592285, "learning_rate": 1.7260691529407444e-06, "loss": 1.3123, "step": 58630 }, { "epoch": 9.690559801693865, "grad_norm": 15.276062965393066, "learning_rate": 1.7168879340421235e-06, "loss": 1.334, "step": 58640 }, { "epoch": 9.692212352819665, "grad_norm": 30.0185546875, "learning_rate": 1.7077067151435027e-06, "loss": 1.3818, "step": 58650 }, { "epoch": 9.693864903945466, "grad_norm": 14.805960655212402, "learning_rate": 1.6985254962448813e-06, "loss": 1.3392, "step": 58660 }, { "epoch": 9.695517455071267, "grad_norm": 17.313312530517578, "learning_rate": 1.6893442773462606e-06, "loss": 1.421, "step": 58670 }, { "epoch": 9.697170006197066, "grad_norm": 15.689269065856934, "learning_rate": 1.6801630584476396e-06, "loss": 1.375, "step": 58680 }, { "epoch": 9.698822557322867, "grad_norm": 27.808439254760742, "learning_rate": 1.6709818395490184e-06, "loss": 1.3845, "step": 58690 }, { "epoch": 9.700475108448668, "grad_norm": 16.34505271911621, "learning_rate": 1.6618006206503975e-06, "loss": 1.2094, "step": 58700 }, { "epoch": 9.702127659574469, "grad_norm": 30.347618103027344, "learning_rate": 1.6526194017517767e-06, "loss": 1.4331, "step": 58710 }, { "epoch": 9.703780210700268, "grad_norm": 32.67634201049805, "learning_rate": 1.6434381828531558e-06, "loss": 1.3411, "step": 58720 }, { "epoch": 9.705432761826069, "grad_norm": 13.503914833068848, "learning_rate": 1.6342569639545346e-06, "loss": 1.2915, "step": 58730 }, { "epoch": 9.70708531295187, "grad_norm": 30.984130859375, "learning_rate": 1.6250757450559136e-06, "loss": 1.2693, "step": 58740 }, { "epoch": 9.70873786407767, "grad_norm": 22.482847213745117, "learning_rate": 1.6158945261572929e-06, "loss": 1.2009, "step": 58750 }, { "epoch": 9.71039041520347, "grad_norm": 51.00187301635742, "learning_rate": 1.6067133072586717e-06, "loss": 1.3464, "step": 58760 }, { "epoch": 9.712042966329271, "grad_norm": 16.98764991760254, "learning_rate": 1.5975320883600507e-06, "loss": 1.2747, "step": 58770 }, { "epoch": 9.713695517455072, "grad_norm": 14.666571617126465, "learning_rate": 1.5883508694614298e-06, "loss": 1.2513, "step": 58780 }, { "epoch": 9.715348068580871, "grad_norm": 11.236737251281738, "learning_rate": 1.579169650562809e-06, "loss": 1.2613, "step": 58790 }, { "epoch": 9.717000619706672, "grad_norm": 22.248416900634766, "learning_rate": 1.5699884316641878e-06, "loss": 1.3245, "step": 58800 }, { "epoch": 9.718653170832473, "grad_norm": 10.243622779846191, "learning_rate": 1.5608072127655669e-06, "loss": 1.3724, "step": 58810 }, { "epoch": 9.720305721958272, "grad_norm": 16.06927490234375, "learning_rate": 1.551625993866946e-06, "loss": 1.3333, "step": 58820 }, { "epoch": 9.721958273084073, "grad_norm": 15.943928718566895, "learning_rate": 1.542444774968325e-06, "loss": 1.261, "step": 58830 }, { "epoch": 9.723610824209874, "grad_norm": 17.844417572021484, "learning_rate": 1.533263556069704e-06, "loss": 1.3135, "step": 58840 }, { "epoch": 9.725263375335674, "grad_norm": 16.729440689086914, "learning_rate": 1.5240823371710828e-06, "loss": 1.3187, "step": 58850 }, { "epoch": 9.726915926461475, "grad_norm": 15.814526557922363, "learning_rate": 1.514901118272462e-06, "loss": 1.3089, "step": 58860 }, { "epoch": 9.728568477587276, "grad_norm": 12.397698402404785, "learning_rate": 1.5057198993738409e-06, "loss": 1.3365, "step": 58870 }, { "epoch": 9.730221028713077, "grad_norm": 19.588912963867188, "learning_rate": 1.4965386804752201e-06, "loss": 1.3647, "step": 58880 }, { "epoch": 9.731873579838876, "grad_norm": 19.40388298034668, "learning_rate": 1.487357461576599e-06, "loss": 1.228, "step": 58890 }, { "epoch": 9.733526130964677, "grad_norm": 9.690013885498047, "learning_rate": 1.478176242677978e-06, "loss": 1.198, "step": 58900 }, { "epoch": 9.735178682090478, "grad_norm": 21.138465881347656, "learning_rate": 1.468995023779357e-06, "loss": 1.2491, "step": 58910 }, { "epoch": 9.736831233216277, "grad_norm": 13.240848541259766, "learning_rate": 1.459813804880736e-06, "loss": 1.0939, "step": 58920 }, { "epoch": 9.738483784342078, "grad_norm": 11.486268043518066, "learning_rate": 1.450632585982115e-06, "loss": 1.2259, "step": 58930 }, { "epoch": 9.740136335467879, "grad_norm": 18.410409927368164, "learning_rate": 1.4414513670834941e-06, "loss": 1.2794, "step": 58940 }, { "epoch": 9.741788886593678, "grad_norm": 13.786377906799316, "learning_rate": 1.432270148184873e-06, "loss": 1.2376, "step": 58950 }, { "epoch": 9.74344143771948, "grad_norm": 17.038511276245117, "learning_rate": 1.4230889292862522e-06, "loss": 1.2173, "step": 58960 }, { "epoch": 9.74509398884528, "grad_norm": 15.250676155090332, "learning_rate": 1.413907710387631e-06, "loss": 1.3068, "step": 58970 }, { "epoch": 9.746746539971081, "grad_norm": 28.942930221557617, "learning_rate": 1.4047264914890103e-06, "loss": 1.3075, "step": 58980 }, { "epoch": 9.74839909109688, "grad_norm": 12.493021965026855, "learning_rate": 1.395545272590389e-06, "loss": 1.3052, "step": 58990 }, { "epoch": 9.750051642222681, "grad_norm": 15.28077220916748, "learning_rate": 1.3863640536917681e-06, "loss": 1.3504, "step": 59000 }, { "epoch": 9.751704193348482, "grad_norm": 20.08050537109375, "learning_rate": 1.3771828347931472e-06, "loss": 1.2934, "step": 59010 }, { "epoch": 9.753356744474281, "grad_norm": 25.60179901123047, "learning_rate": 1.3680016158945262e-06, "loss": 1.2961, "step": 59020 }, { "epoch": 9.755009295600082, "grad_norm": 14.154158592224121, "learning_rate": 1.3588203969959052e-06, "loss": 1.4102, "step": 59030 }, { "epoch": 9.756661846725883, "grad_norm": 12.870734214782715, "learning_rate": 1.3496391780972843e-06, "loss": 1.134, "step": 59040 }, { "epoch": 9.758314397851684, "grad_norm": 9.939929962158203, "learning_rate": 1.3404579591986633e-06, "loss": 1.2146, "step": 59050 }, { "epoch": 9.759966948977484, "grad_norm": 14.322998046875, "learning_rate": 1.3312767403000423e-06, "loss": 1.1763, "step": 59060 }, { "epoch": 9.761619500103285, "grad_norm": 18.574724197387695, "learning_rate": 1.3220955214014214e-06, "loss": 1.3397, "step": 59070 }, { "epoch": 9.763272051229086, "grad_norm": 12.602153778076172, "learning_rate": 1.3129143025028004e-06, "loss": 1.245, "step": 59080 }, { "epoch": 9.764924602354885, "grad_norm": 21.25696563720703, "learning_rate": 1.3037330836041794e-06, "loss": 1.2831, "step": 59090 }, { "epoch": 9.766577153480686, "grad_norm": 24.119489669799805, "learning_rate": 1.2945518647055583e-06, "loss": 1.2615, "step": 59100 }, { "epoch": 9.768229704606487, "grad_norm": 20.59731101989746, "learning_rate": 1.2853706458069375e-06, "loss": 1.2008, "step": 59110 }, { "epoch": 9.769882255732286, "grad_norm": 17.279531478881836, "learning_rate": 1.2761894269083163e-06, "loss": 1.2945, "step": 59120 }, { "epoch": 9.771534806858087, "grad_norm": 15.187520027160645, "learning_rate": 1.2670082080096956e-06, "loss": 1.3367, "step": 59130 }, { "epoch": 9.773187357983888, "grad_norm": 24.268413543701172, "learning_rate": 1.2578269891110744e-06, "loss": 1.2033, "step": 59140 }, { "epoch": 9.774839909109687, "grad_norm": 10.84234619140625, "learning_rate": 1.2486457702124534e-06, "loss": 1.1307, "step": 59150 }, { "epoch": 9.776492460235488, "grad_norm": 19.090682983398438, "learning_rate": 1.2394645513138325e-06, "loss": 1.2864, "step": 59160 }, { "epoch": 9.778145011361289, "grad_norm": 16.51657485961914, "learning_rate": 1.2302833324152115e-06, "loss": 1.2448, "step": 59170 }, { "epoch": 9.77979756248709, "grad_norm": 10.925396919250488, "learning_rate": 1.2211021135165906e-06, "loss": 1.2297, "step": 59180 }, { "epoch": 9.78145011361289, "grad_norm": 19.890026092529297, "learning_rate": 1.2119208946179696e-06, "loss": 1.2474, "step": 59190 }, { "epoch": 9.78310266473869, "grad_norm": 21.863983154296875, "learning_rate": 1.2027396757193484e-06, "loss": 1.3076, "step": 59200 }, { "epoch": 9.784755215864491, "grad_norm": 20.163433074951172, "learning_rate": 1.1935584568207277e-06, "loss": 1.253, "step": 59210 }, { "epoch": 9.78640776699029, "grad_norm": 13.780261039733887, "learning_rate": 1.1843772379221065e-06, "loss": 1.2075, "step": 59220 }, { "epoch": 9.788060318116091, "grad_norm": 15.498116493225098, "learning_rate": 1.1751960190234857e-06, "loss": 1.3903, "step": 59230 }, { "epoch": 9.789712869241892, "grad_norm": 12.257755279541016, "learning_rate": 1.1660148001248646e-06, "loss": 1.379, "step": 59240 }, { "epoch": 9.791365420367693, "grad_norm": 15.42638111114502, "learning_rate": 1.1568335812262436e-06, "loss": 1.2928, "step": 59250 }, { "epoch": 9.793017971493493, "grad_norm": 17.679750442504883, "learning_rate": 1.1476523623276226e-06, "loss": 1.2155, "step": 59260 }, { "epoch": 9.794670522619294, "grad_norm": 13.531704902648926, "learning_rate": 1.1384711434290017e-06, "loss": 1.396, "step": 59270 }, { "epoch": 9.796323073745095, "grad_norm": 21.790496826171875, "learning_rate": 1.1292899245303807e-06, "loss": 1.3374, "step": 59280 }, { "epoch": 9.797975624870894, "grad_norm": 20.123689651489258, "learning_rate": 1.1201087056317597e-06, "loss": 1.3, "step": 59290 }, { "epoch": 9.799628175996695, "grad_norm": 16.618236541748047, "learning_rate": 1.1109274867331386e-06, "loss": 1.2193, "step": 59300 }, { "epoch": 9.801280727122496, "grad_norm": 87.83232879638672, "learning_rate": 1.1017462678345178e-06, "loss": 1.283, "step": 59310 }, { "epoch": 9.802933278248297, "grad_norm": 19.9451847076416, "learning_rate": 1.0925650489358966e-06, "loss": 1.2853, "step": 59320 }, { "epoch": 9.804585829374096, "grad_norm": 20.5788631439209, "learning_rate": 1.0833838300372759e-06, "loss": 1.3428, "step": 59330 }, { "epoch": 9.806238380499897, "grad_norm": 13.178668975830078, "learning_rate": 1.0742026111386547e-06, "loss": 1.3449, "step": 59340 }, { "epoch": 9.807890931625698, "grad_norm": 9.858376502990723, "learning_rate": 1.0650213922400337e-06, "loss": 1.2549, "step": 59350 }, { "epoch": 9.809543482751497, "grad_norm": 14.950618743896484, "learning_rate": 1.0558401733414128e-06, "loss": 1.1921, "step": 59360 }, { "epoch": 9.811196033877298, "grad_norm": 15.29289436340332, "learning_rate": 1.0466589544427918e-06, "loss": 1.2611, "step": 59370 }, { "epoch": 9.812848585003099, "grad_norm": 10.011371612548828, "learning_rate": 1.0374777355441709e-06, "loss": 1.3245, "step": 59380 }, { "epoch": 9.814501136128898, "grad_norm": 39.90605545043945, "learning_rate": 1.0282965166455499e-06, "loss": 1.1861, "step": 59390 }, { "epoch": 9.8161536872547, "grad_norm": 23.458520889282227, "learning_rate": 1.019115297746929e-06, "loss": 1.3771, "step": 59400 }, { "epoch": 9.8178062383805, "grad_norm": 24.068479537963867, "learning_rate": 1.009934078848308e-06, "loss": 1.1251, "step": 59410 }, { "epoch": 9.8194587895063, "grad_norm": 20.833627700805664, "learning_rate": 1.000752859949687e-06, "loss": 1.4204, "step": 59420 }, { "epoch": 9.8211113406321, "grad_norm": 13.489704132080078, "learning_rate": 9.91571641051066e-07, "loss": 1.3008, "step": 59430 }, { "epoch": 9.822763891757901, "grad_norm": 35.63694381713867, "learning_rate": 9.82390422152445e-07, "loss": 1.3852, "step": 59440 }, { "epoch": 9.824416442883702, "grad_norm": 17.55329132080078, "learning_rate": 9.732092032538239e-07, "loss": 1.3671, "step": 59450 }, { "epoch": 9.826068994009502, "grad_norm": 19.870128631591797, "learning_rate": 9.640279843552031e-07, "loss": 1.3378, "step": 59460 }, { "epoch": 9.827721545135303, "grad_norm": 31.158733367919922, "learning_rate": 9.54846765456582e-07, "loss": 1.2725, "step": 59470 }, { "epoch": 9.829374096261104, "grad_norm": 23.401962280273438, "learning_rate": 9.456655465579611e-07, "loss": 1.3456, "step": 59480 }, { "epoch": 9.831026647386903, "grad_norm": 11.159757614135742, "learning_rate": 9.3648432765934e-07, "loss": 1.2201, "step": 59490 }, { "epoch": 9.832679198512704, "grad_norm": 18.284528732299805, "learning_rate": 9.273031087607191e-07, "loss": 1.3702, "step": 59500 }, { "epoch": 9.834331749638505, "grad_norm": 22.221506118774414, "learning_rate": 9.181218898620981e-07, "loss": 1.1929, "step": 59510 }, { "epoch": 9.835984300764306, "grad_norm": 14.718652725219727, "learning_rate": 9.089406709634771e-07, "loss": 1.3242, "step": 59520 }, { "epoch": 9.837636851890105, "grad_norm": 30.458106994628906, "learning_rate": 8.997594520648562e-07, "loss": 1.2832, "step": 59530 }, { "epoch": 9.839289403015906, "grad_norm": 17.770442962646484, "learning_rate": 8.905782331662352e-07, "loss": 1.375, "step": 59540 }, { "epoch": 9.840941954141707, "grad_norm": 13.754855155944824, "learning_rate": 8.813970142676141e-07, "loss": 1.1981, "step": 59550 }, { "epoch": 9.842594505267506, "grad_norm": 28.639034271240234, "learning_rate": 8.722157953689933e-07, "loss": 1.3355, "step": 59560 }, { "epoch": 9.844247056393307, "grad_norm": 34.25663757324219, "learning_rate": 8.630345764703722e-07, "loss": 1.5129, "step": 59570 }, { "epoch": 9.845899607519108, "grad_norm": 8.183585166931152, "learning_rate": 8.538533575717514e-07, "loss": 1.262, "step": 59580 }, { "epoch": 9.847552158644907, "grad_norm": 27.922536849975586, "learning_rate": 8.446721386731303e-07, "loss": 1.2981, "step": 59590 }, { "epoch": 9.849204709770708, "grad_norm": 11.618725776672363, "learning_rate": 8.354909197745092e-07, "loss": 1.2361, "step": 59600 }, { "epoch": 9.85085726089651, "grad_norm": 21.355430603027344, "learning_rate": 8.263097008758884e-07, "loss": 1.3164, "step": 59610 }, { "epoch": 9.85250981202231, "grad_norm": 17.563817977905273, "learning_rate": 8.171284819772673e-07, "loss": 1.1633, "step": 59620 }, { "epoch": 9.85416236314811, "grad_norm": 15.469801902770996, "learning_rate": 8.079472630786464e-07, "loss": 1.3304, "step": 59630 }, { "epoch": 9.85581491427391, "grad_norm": 12.664761543273926, "learning_rate": 7.987660441800254e-07, "loss": 1.3019, "step": 59640 }, { "epoch": 9.857467465399711, "grad_norm": 18.20703125, "learning_rate": 7.895848252814045e-07, "loss": 1.3006, "step": 59650 }, { "epoch": 9.85912001652551, "grad_norm": 14.753276824951172, "learning_rate": 7.804036063827834e-07, "loss": 1.3308, "step": 59660 }, { "epoch": 9.860772567651312, "grad_norm": 12.871907234191895, "learning_rate": 7.712223874841625e-07, "loss": 1.3225, "step": 59670 }, { "epoch": 9.862425118777113, "grad_norm": 10.84438419342041, "learning_rate": 7.620411685855414e-07, "loss": 1.1678, "step": 59680 }, { "epoch": 9.864077669902912, "grad_norm": 15.910385131835938, "learning_rate": 7.528599496869204e-07, "loss": 1.3399, "step": 59690 }, { "epoch": 9.865730221028713, "grad_norm": 15.912435531616211, "learning_rate": 7.436787307882995e-07, "loss": 1.251, "step": 59700 }, { "epoch": 9.867382772154514, "grad_norm": 14.724613189697266, "learning_rate": 7.344975118896785e-07, "loss": 1.3633, "step": 59710 }, { "epoch": 9.869035323280315, "grad_norm": 19.542469024658203, "learning_rate": 7.253162929910575e-07, "loss": 1.4641, "step": 59720 }, { "epoch": 9.870687874406114, "grad_norm": 15.426322937011719, "learning_rate": 7.161350740924365e-07, "loss": 1.4261, "step": 59730 }, { "epoch": 9.872340425531915, "grad_norm": 12.275348663330078, "learning_rate": 7.069538551938155e-07, "loss": 1.3722, "step": 59740 }, { "epoch": 9.873992976657716, "grad_norm": 13.315044403076172, "learning_rate": 6.977726362951945e-07, "loss": 1.3351, "step": 59750 }, { "epoch": 9.875645527783515, "grad_norm": 24.9056453704834, "learning_rate": 6.885914173965736e-07, "loss": 1.3371, "step": 59760 }, { "epoch": 9.877298078909316, "grad_norm": 14.293803215026855, "learning_rate": 6.794101984979526e-07, "loss": 1.1908, "step": 59770 }, { "epoch": 9.878950630035117, "grad_norm": 20.57039451599121, "learning_rate": 6.702289795993317e-07, "loss": 1.2033, "step": 59780 }, { "epoch": 9.880603181160918, "grad_norm": 13.297779083251953, "learning_rate": 6.610477607007107e-07, "loss": 1.163, "step": 59790 }, { "epoch": 9.882255732286717, "grad_norm": 13.230541229248047, "learning_rate": 6.518665418020897e-07, "loss": 1.4123, "step": 59800 }, { "epoch": 9.883908283412518, "grad_norm": 24.931100845336914, "learning_rate": 6.426853229034688e-07, "loss": 1.3383, "step": 59810 }, { "epoch": 9.88556083453832, "grad_norm": 17.547151565551758, "learning_rate": 6.335041040048478e-07, "loss": 1.3894, "step": 59820 }, { "epoch": 9.887213385664118, "grad_norm": 16.77924346923828, "learning_rate": 6.243228851062267e-07, "loss": 1.32, "step": 59830 }, { "epoch": 9.88886593678992, "grad_norm": 16.8702449798584, "learning_rate": 6.151416662076058e-07, "loss": 1.3243, "step": 59840 }, { "epoch": 9.89051848791572, "grad_norm": 15.003113746643066, "learning_rate": 6.059604473089848e-07, "loss": 1.2222, "step": 59850 }, { "epoch": 9.89217103904152, "grad_norm": 15.460107803344727, "learning_rate": 5.967792284103638e-07, "loss": 1.2588, "step": 59860 }, { "epoch": 9.89382359016732, "grad_norm": 17.36907958984375, "learning_rate": 5.875980095117429e-07, "loss": 1.3467, "step": 59870 }, { "epoch": 9.895476141293122, "grad_norm": 13.089510917663574, "learning_rate": 5.784167906131218e-07, "loss": 1.3112, "step": 59880 }, { "epoch": 9.89712869241892, "grad_norm": 22.797977447509766, "learning_rate": 5.692355717145008e-07, "loss": 1.379, "step": 59890 }, { "epoch": 9.898781243544722, "grad_norm": 18.484411239624023, "learning_rate": 5.600543528158799e-07, "loss": 1.3037, "step": 59900 }, { "epoch": 9.900433794670523, "grad_norm": 10.942911148071289, "learning_rate": 5.508731339172589e-07, "loss": 1.3234, "step": 59910 }, { "epoch": 9.902086345796324, "grad_norm": 17.38446044921875, "learning_rate": 5.416919150186379e-07, "loss": 1.3274, "step": 59920 }, { "epoch": 9.903738896922123, "grad_norm": 140.1362762451172, "learning_rate": 5.325106961200169e-07, "loss": 1.3595, "step": 59930 }, { "epoch": 9.905391448047924, "grad_norm": 16.535696029663086, "learning_rate": 5.233294772213959e-07, "loss": 1.3435, "step": 59940 }, { "epoch": 9.907043999173725, "grad_norm": 17.555944442749023, "learning_rate": 5.141482583227749e-07, "loss": 1.3091, "step": 59950 }, { "epoch": 9.908696550299524, "grad_norm": 13.292182922363281, "learning_rate": 5.04967039424154e-07, "loss": 1.3891, "step": 59960 }, { "epoch": 9.910349101425325, "grad_norm": 15.89194393157959, "learning_rate": 4.95785820525533e-07, "loss": 1.2585, "step": 59970 }, { "epoch": 9.912001652551126, "grad_norm": 11.141946792602539, "learning_rate": 4.866046016269119e-07, "loss": 1.3311, "step": 59980 }, { "epoch": 9.913654203676927, "grad_norm": 18.01121711730957, "learning_rate": 4.77423382728291e-07, "loss": 1.2728, "step": 59990 }, { "epoch": 9.915306754802726, "grad_norm": 14.889471054077148, "learning_rate": 4.6824216382967e-07, "loss": 1.2427, "step": 60000 }, { "epoch": 9.916959305928527, "grad_norm": 24.612850189208984, "learning_rate": 4.5906094493104905e-07, "loss": 1.2339, "step": 60010 }, { "epoch": 9.918611857054328, "grad_norm": 11.734923362731934, "learning_rate": 4.498797260324281e-07, "loss": 1.3032, "step": 60020 }, { "epoch": 9.920264408180127, "grad_norm": 20.81572723388672, "learning_rate": 4.4069850713380707e-07, "loss": 1.2575, "step": 60030 }, { "epoch": 9.921916959305928, "grad_norm": 17.165307998657227, "learning_rate": 4.315172882351861e-07, "loss": 1.3268, "step": 60040 }, { "epoch": 9.92356951043173, "grad_norm": 10.70225715637207, "learning_rate": 4.2233606933656514e-07, "loss": 1.3622, "step": 60050 }, { "epoch": 9.925222061557529, "grad_norm": 36.28559494018555, "learning_rate": 4.131548504379442e-07, "loss": 1.2658, "step": 60060 }, { "epoch": 9.92687461268333, "grad_norm": 17.466758728027344, "learning_rate": 4.039736315393232e-07, "loss": 1.3854, "step": 60070 }, { "epoch": 9.92852716380913, "grad_norm": 19.143348693847656, "learning_rate": 3.9479241264070225e-07, "loss": 1.0911, "step": 60080 }, { "epoch": 9.930179714934932, "grad_norm": 18.350982666015625, "learning_rate": 3.8561119374208123e-07, "loss": 1.2973, "step": 60090 }, { "epoch": 9.93183226606073, "grad_norm": 12.689515113830566, "learning_rate": 3.764299748434602e-07, "loss": 1.3574, "step": 60100 }, { "epoch": 9.933484817186532, "grad_norm": 8.145671844482422, "learning_rate": 3.6724875594483925e-07, "loss": 1.4449, "step": 60110 }, { "epoch": 9.935137368312333, "grad_norm": 23.987871170043945, "learning_rate": 3.5806753704621824e-07, "loss": 1.298, "step": 60120 }, { "epoch": 9.936789919438132, "grad_norm": 18.037437438964844, "learning_rate": 3.4888631814759727e-07, "loss": 1.2382, "step": 60130 }, { "epoch": 9.938442470563933, "grad_norm": 16.41983413696289, "learning_rate": 3.397050992489763e-07, "loss": 1.3517, "step": 60140 }, { "epoch": 9.940095021689734, "grad_norm": 16.38405418395996, "learning_rate": 3.3052388035035534e-07, "loss": 1.2899, "step": 60150 }, { "epoch": 9.941747572815533, "grad_norm": 16.4738826751709, "learning_rate": 3.213426614517344e-07, "loss": 1.29, "step": 60160 }, { "epoch": 9.943400123941334, "grad_norm": 22.556604385375977, "learning_rate": 3.1216144255311336e-07, "loss": 1.2654, "step": 60170 }, { "epoch": 9.945052675067135, "grad_norm": 11.890191078186035, "learning_rate": 3.029802236544924e-07, "loss": 1.0929, "step": 60180 }, { "epoch": 9.946705226192936, "grad_norm": 72.20085906982422, "learning_rate": 2.9379900475587143e-07, "loss": 1.2483, "step": 60190 }, { "epoch": 9.948357777318735, "grad_norm": 17.972667694091797, "learning_rate": 2.846177858572504e-07, "loss": 1.2809, "step": 60200 }, { "epoch": 9.950010328444536, "grad_norm": 21.140539169311523, "learning_rate": 2.7543656695862945e-07, "loss": 1.4348, "step": 60210 }, { "epoch": 9.951662879570337, "grad_norm": 17.385190963745117, "learning_rate": 2.6625534806000844e-07, "loss": 1.3182, "step": 60220 }, { "epoch": 9.953315430696136, "grad_norm": 9.468734741210938, "learning_rate": 2.5707412916138747e-07, "loss": 1.1968, "step": 60230 }, { "epoch": 9.954967981821937, "grad_norm": 9.555684089660645, "learning_rate": 2.478929102627665e-07, "loss": 1.3367, "step": 60240 }, { "epoch": 9.956620532947738, "grad_norm": 11.08018684387207, "learning_rate": 2.387116913641455e-07, "loss": 1.392, "step": 60250 }, { "epoch": 9.95827308407354, "grad_norm": 16.899229049682617, "learning_rate": 2.2953047246552453e-07, "loss": 1.2918, "step": 60260 }, { "epoch": 9.959925635199339, "grad_norm": 15.151360511779785, "learning_rate": 2.2034925356690354e-07, "loss": 1.2946, "step": 60270 }, { "epoch": 9.96157818632514, "grad_norm": 8.768255233764648, "learning_rate": 2.1116803466828257e-07, "loss": 1.3185, "step": 60280 }, { "epoch": 9.96323073745094, "grad_norm": 11.71931266784668, "learning_rate": 2.019868157696616e-07, "loss": 1.3727, "step": 60290 }, { "epoch": 9.96488328857674, "grad_norm": 25.73476219177246, "learning_rate": 1.9280559687104062e-07, "loss": 1.2354, "step": 60300 }, { "epoch": 9.96653583970254, "grad_norm": 25.98105239868164, "learning_rate": 1.8362437797241963e-07, "loss": 1.2941, "step": 60310 }, { "epoch": 9.968188390828342, "grad_norm": 35.79945373535156, "learning_rate": 1.7444315907379864e-07, "loss": 1.2131, "step": 60320 }, { "epoch": 9.969840941954141, "grad_norm": 13.044464111328125, "learning_rate": 1.6526194017517767e-07, "loss": 1.2468, "step": 60330 }, { "epoch": 9.971493493079942, "grad_norm": 25.407047271728516, "learning_rate": 1.5608072127655668e-07, "loss": 1.379, "step": 60340 }, { "epoch": 9.973146044205743, "grad_norm": 13.202648162841797, "learning_rate": 1.4689950237793572e-07, "loss": 1.185, "step": 60350 }, { "epoch": 9.974798595331544, "grad_norm": 38.73662567138672, "learning_rate": 1.3771828347931473e-07, "loss": 1.2843, "step": 60360 }, { "epoch": 9.976451146457343, "grad_norm": 14.0643310546875, "learning_rate": 1.2853706458069374e-07, "loss": 1.294, "step": 60370 }, { "epoch": 9.978103697583144, "grad_norm": 12.296748161315918, "learning_rate": 1.1935584568207275e-07, "loss": 1.2442, "step": 60380 }, { "epoch": 9.979756248708945, "grad_norm": 19.61980628967285, "learning_rate": 1.1017462678345177e-07, "loss": 1.1628, "step": 60390 }, { "epoch": 9.981408799834744, "grad_norm": 15.85641860961914, "learning_rate": 1.009934078848308e-07, "loss": 1.2077, "step": 60400 }, { "epoch": 9.983061350960545, "grad_norm": 18.99192237854004, "learning_rate": 9.181218898620981e-08, "loss": 1.1939, "step": 60410 }, { "epoch": 9.984713902086346, "grad_norm": 34.235374450683594, "learning_rate": 8.263097008758884e-08, "loss": 1.3344, "step": 60420 }, { "epoch": 9.986366453212145, "grad_norm": 13.520135879516602, "learning_rate": 7.344975118896786e-08, "loss": 1.2839, "step": 60430 }, { "epoch": 9.988019004337946, "grad_norm": 18.85852813720703, "learning_rate": 6.426853229034687e-08, "loss": 1.2802, "step": 60440 }, { "epoch": 9.989671555463747, "grad_norm": 16.009300231933594, "learning_rate": 5.5087313391725884e-08, "loss": 1.2258, "step": 60450 }, { "epoch": 9.991324106589548, "grad_norm": 14.094069480895996, "learning_rate": 4.5906094493104907e-08, "loss": 1.3012, "step": 60460 }, { "epoch": 9.992976657715348, "grad_norm": 23.66718101501465, "learning_rate": 3.672487559448393e-08, "loss": 1.329, "step": 60470 }, { "epoch": 9.994629208841149, "grad_norm": 18.014339447021484, "learning_rate": 2.7543656695862942e-08, "loss": 1.3193, "step": 60480 }, { "epoch": 9.99628175996695, "grad_norm": 12.735496520996094, "learning_rate": 1.8362437797241965e-08, "loss": 1.2747, "step": 60490 }, { "epoch": 9.997934311092749, "grad_norm": 29.70014190673828, "learning_rate": 9.181218898620982e-09, "loss": 1.3249, "step": 60500 }, { "epoch": 9.99958686221855, "grad_norm": 17.789243698120117, "learning_rate": 0.0, "loss": 1.3595, "step": 60510 }, { "epoch": 9.99958686221855, "eval_accuracy": 0.33699815576677544, "eval_loss": 2.4853203296661377, "eval_runtime": 861.2243, "eval_samples_per_second": 32.739, "eval_steps_per_second": 8.185, "step": 60510 }, { "epoch": 9.99958686221855, "step": 60510, "total_flos": 9.925024630549355e+19, "train_loss": 1.502550286699811, "train_runtime": 81421.0241, "train_samples_per_second": 11.891, "train_steps_per_second": 0.743 } ], "logging_steps": 10, "max_steps": 60510, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 9.925024630549355e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }