diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,42477 @@ +{ + "best_metric": 0.33997730174492835, + "best_model_checkpoint": "Mrohit01/1_lakh_cards-swinv2-base-patch4-window12to16-192to256-22kto1k-ft-finetuned/checkpoint-54461", + "epoch": 9.99958686221855, + "eval_steps": 500, + "global_step": 60510, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016525511258004544, + "grad_norm": 34.320098876953125, + "learning_rate": 8.263097008758884e-08, + "loss": 2.4664, + "step": 10 + }, + { + "epoch": 0.003305102251600909, + "grad_norm": 319.06292724609375, + "learning_rate": 1.6526194017517767e-07, + "loss": 2.4352, + "step": 20 + }, + { + "epoch": 0.004957653377401363, + "grad_norm": 57.93561553955078, + "learning_rate": 2.478929102627665e-07, + "loss": 2.3859, + "step": 30 + }, + { + "epoch": 0.006610204503201818, + "grad_norm": 37.62372589111328, + "learning_rate": 3.3052388035035534e-07, + "loss": 2.3897, + "step": 40 + }, + { + "epoch": 0.008262755629002272, + "grad_norm": 44.00041580200195, + "learning_rate": 4.131548504379442e-07, + "loss": 2.4616, + "step": 50 + }, + { + "epoch": 0.009915306754802726, + "grad_norm": 25.988052368164062, + "learning_rate": 4.95785820525533e-07, + "loss": 2.427, + "step": 60 + }, + { + "epoch": 0.011567857880603181, + "grad_norm": 105.19337463378906, + "learning_rate": 5.784167906131218e-07, + "loss": 2.4586, + "step": 70 + }, + { + "epoch": 0.013220409006403635, + "grad_norm": 30.26386833190918, + "learning_rate": 6.610477607007107e-07, + "loss": 2.3847, + "step": 80 + }, + { + "epoch": 0.01487296013220409, + "grad_norm": 35.36358642578125, + "learning_rate": 7.436787307882995e-07, + "loss": 2.3328, + "step": 90 + }, + { + "epoch": 0.016525511258004544, + "grad_norm": 262.9168395996094, + "learning_rate": 8.263097008758884e-07, + "loss": 2.3209, + "step": 100 + }, + { + "epoch": 0.018178062383805, + "grad_norm": 31.34343910217285, + "learning_rate": 9.089406709634771e-07, + "loss": 2.3854, + "step": 110 + }, + { + "epoch": 0.019830613509605452, + "grad_norm": 21.935100555419922, + "learning_rate": 9.91571641051066e-07, + "loss": 2.3157, + "step": 120 + }, + { + "epoch": 0.021483164635405907, + "grad_norm": 41.69834899902344, + "learning_rate": 1.0742026111386547e-06, + "loss": 2.3495, + "step": 130 + }, + { + "epoch": 0.023135715761206363, + "grad_norm": 25.686676025390625, + "learning_rate": 1.1568335812262436e-06, + "loss": 2.2876, + "step": 140 + }, + { + "epoch": 0.02478826688700682, + "grad_norm": 20.90071678161621, + "learning_rate": 1.2394645513138325e-06, + "loss": 2.333, + "step": 150 + }, + { + "epoch": 0.02644081801280727, + "grad_norm": 30.158994674682617, + "learning_rate": 1.3220955214014214e-06, + "loss": 2.3064, + "step": 160 + }, + { + "epoch": 0.028093369138607726, + "grad_norm": 29.495481491088867, + "learning_rate": 1.4047264914890103e-06, + "loss": 2.2628, + "step": 170 + }, + { + "epoch": 0.02974592026440818, + "grad_norm": 20.178300857543945, + "learning_rate": 1.487357461576599e-06, + "loss": 2.2932, + "step": 180 + }, + { + "epoch": 0.031398471390208633, + "grad_norm": 36.64740753173828, + "learning_rate": 1.5699884316641878e-06, + "loss": 2.3664, + "step": 190 + }, + { + "epoch": 0.03305102251600909, + "grad_norm": 19.702604293823242, + "learning_rate": 1.6526194017517767e-06, + "loss": 2.3554, + "step": 200 + }, + { + "epoch": 0.034703573641809544, + "grad_norm": 
18.79967498779297, + "learning_rate": 1.7352503718393656e-06, + "loss": 2.32, + "step": 210 + }, + { + "epoch": 0.03635612476761, + "grad_norm": 24.093412399291992, + "learning_rate": 1.8178813419269543e-06, + "loss": 2.2258, + "step": 220 + }, + { + "epoch": 0.038008675893410455, + "grad_norm": 20.522418975830078, + "learning_rate": 1.9005123120145432e-06, + "loss": 2.2418, + "step": 230 + }, + { + "epoch": 0.039661227019210904, + "grad_norm": 21.670164108276367, + "learning_rate": 1.983143282102132e-06, + "loss": 2.2425, + "step": 240 + }, + { + "epoch": 0.04131377814501136, + "grad_norm": 24.221439361572266, + "learning_rate": 2.0657742521897205e-06, + "loss": 2.2311, + "step": 250 + }, + { + "epoch": 0.042966329270811815, + "grad_norm": 26.810293197631836, + "learning_rate": 2.1484052222773094e-06, + "loss": 2.3007, + "step": 260 + }, + { + "epoch": 0.04461888039661227, + "grad_norm": 33.146942138671875, + "learning_rate": 2.2310361923648987e-06, + "loss": 2.1901, + "step": 270 + }, + { + "epoch": 0.046271431522412726, + "grad_norm": 24.769393920898438, + "learning_rate": 2.313667162452487e-06, + "loss": 2.2548, + "step": 280 + }, + { + "epoch": 0.04792398264821318, + "grad_norm": 16.452192306518555, + "learning_rate": 2.396298132540076e-06, + "loss": 2.1442, + "step": 290 + }, + { + "epoch": 0.04957653377401364, + "grad_norm": 17.729570388793945, + "learning_rate": 2.478929102627665e-06, + "loss": 2.1889, + "step": 300 + }, + { + "epoch": 0.051229084899814085, + "grad_norm": 19.994186401367188, + "learning_rate": 2.561560072715254e-06, + "loss": 2.2463, + "step": 310 + }, + { + "epoch": 0.05288163602561454, + "grad_norm": 27.13626480102539, + "learning_rate": 2.6441910428028427e-06, + "loss": 2.2024, + "step": 320 + }, + { + "epoch": 0.054534187151414996, + "grad_norm": 20.60004997253418, + "learning_rate": 2.7268220128904316e-06, + "loss": 2.1578, + "step": 330 + }, + { + "epoch": 0.05618673827721545, + "grad_norm": 20.759340286254883, + "learning_rate": 2.8094529829780205e-06, + "loss": 2.0856, + "step": 340 + }, + { + "epoch": 0.05783928940301591, + "grad_norm": 34.48766326904297, + "learning_rate": 2.892083953065609e-06, + "loss": 2.1879, + "step": 350 + }, + { + "epoch": 0.05949184052881636, + "grad_norm": 21.343008041381836, + "learning_rate": 2.974714923153198e-06, + "loss": 2.1215, + "step": 360 + }, + { + "epoch": 0.06114439165461681, + "grad_norm": 20.634981155395508, + "learning_rate": 3.0573458932407868e-06, + "loss": 2.1139, + "step": 370 + }, + { + "epoch": 0.06279694278041727, + "grad_norm": 23.69481086730957, + "learning_rate": 3.1399768633283757e-06, + "loss": 2.1182, + "step": 380 + }, + { + "epoch": 0.06444949390621772, + "grad_norm": 21.073101043701172, + "learning_rate": 3.2226078334159645e-06, + "loss": 2.1002, + "step": 390 + }, + { + "epoch": 0.06610204503201818, + "grad_norm": 74.09575653076172, + "learning_rate": 3.3052388035035534e-06, + "loss": 2.0813, + "step": 400 + }, + { + "epoch": 0.06775459615781863, + "grad_norm": 18.449506759643555, + "learning_rate": 3.3878697735911423e-06, + "loss": 2.0566, + "step": 410 + }, + { + "epoch": 0.06940714728361909, + "grad_norm": 28.852758407592773, + "learning_rate": 3.470500743678731e-06, + "loss": 1.9957, + "step": 420 + }, + { + "epoch": 0.07105969840941954, + "grad_norm": 26.230737686157227, + "learning_rate": 3.5531317137663197e-06, + "loss": 1.9894, + "step": 430 + }, + { + "epoch": 0.07271224953522, + "grad_norm": 24.988384246826172, + "learning_rate": 3.6357626838539086e-06, + "loss": 2.0197, + "step": 440 + 
}, + { + "epoch": 0.07436480066102046, + "grad_norm": 101.62005615234375, + "learning_rate": 3.7183936539414975e-06, + "loss": 2.074, + "step": 450 + }, + { + "epoch": 0.07601735178682091, + "grad_norm": 24.157245635986328, + "learning_rate": 3.8010246240290863e-06, + "loss": 2.1024, + "step": 460 + }, + { + "epoch": 0.07766990291262135, + "grad_norm": 26.56095314025879, + "learning_rate": 3.883655594116676e-06, + "loss": 2.0546, + "step": 470 + }, + { + "epoch": 0.07932245403842181, + "grad_norm": 59.286495208740234, + "learning_rate": 3.966286564204264e-06, + "loss": 2.0861, + "step": 480 + }, + { + "epoch": 0.08097500516422226, + "grad_norm": 27.197782516479492, + "learning_rate": 4.0489175342918534e-06, + "loss": 2.0757, + "step": 490 + }, + { + "epoch": 0.08262755629002272, + "grad_norm": 28.272613525390625, + "learning_rate": 4.131548504379441e-06, + "loss": 1.8954, + "step": 500 + }, + { + "epoch": 0.08428010741582317, + "grad_norm": 23.627155303955078, + "learning_rate": 4.21417947446703e-06, + "loss": 1.9886, + "step": 510 + }, + { + "epoch": 0.08593265854162363, + "grad_norm": 28.818801879882812, + "learning_rate": 4.296810444554619e-06, + "loss": 2.0049, + "step": 520 + }, + { + "epoch": 0.08758520966742409, + "grad_norm": 31.500320434570312, + "learning_rate": 4.379441414642208e-06, + "loss": 1.9709, + "step": 530 + }, + { + "epoch": 0.08923776079322454, + "grad_norm": 32.4036865234375, + "learning_rate": 4.4620723847297975e-06, + "loss": 1.957, + "step": 540 + }, + { + "epoch": 0.090890311919025, + "grad_norm": 18.269474029541016, + "learning_rate": 4.544703354817386e-06, + "loss": 2.0274, + "step": 550 + }, + { + "epoch": 0.09254286304482545, + "grad_norm": 25.879230499267578, + "learning_rate": 4.627334324904974e-06, + "loss": 1.982, + "step": 560 + }, + { + "epoch": 0.09419541417062591, + "grad_norm": 78.83243560791016, + "learning_rate": 4.709965294992563e-06, + "loss": 1.9635, + "step": 570 + }, + { + "epoch": 0.09584796529642636, + "grad_norm": 24.732942581176758, + "learning_rate": 4.792596265080152e-06, + "loss": 1.9254, + "step": 580 + }, + { + "epoch": 0.09750051642222682, + "grad_norm": 23.506771087646484, + "learning_rate": 4.8752272351677415e-06, + "loss": 1.951, + "step": 590 + }, + { + "epoch": 0.09915306754802727, + "grad_norm": 24.17493438720703, + "learning_rate": 4.95785820525533e-06, + "loss": 1.9573, + "step": 600 + }, + { + "epoch": 0.10080561867382772, + "grad_norm": 25.89427375793457, + "learning_rate": 5.040489175342919e-06, + "loss": 2.0148, + "step": 610 + }, + { + "epoch": 0.10245816979962817, + "grad_norm": 22.84876251220703, + "learning_rate": 5.123120145430508e-06, + "loss": 1.8556, + "step": 620 + }, + { + "epoch": 0.10411072092542863, + "grad_norm": 229.06407165527344, + "learning_rate": 5.205751115518096e-06, + "loss": 1.9412, + "step": 630 + }, + { + "epoch": 0.10576327205122908, + "grad_norm": 22.079832077026367, + "learning_rate": 5.2883820856056855e-06, + "loss": 1.8359, + "step": 640 + }, + { + "epoch": 0.10741582317702954, + "grad_norm": 23.32947540283203, + "learning_rate": 5.371013055693274e-06, + "loss": 1.7962, + "step": 650 + }, + { + "epoch": 0.10906837430282999, + "grad_norm": 25.554899215698242, + "learning_rate": 5.453644025780863e-06, + "loss": 1.8114, + "step": 660 + }, + { + "epoch": 0.11072092542863045, + "grad_norm": 23.066320419311523, + "learning_rate": 5.536274995868452e-06, + "loss": 1.9601, + "step": 670 + }, + { + "epoch": 0.1123734765544309, + "grad_norm": 29.641036987304688, + "learning_rate": 5.618905965956041e-06, 
+ "loss": 1.8361, + "step": 680 + }, + { + "epoch": 0.11402602768023136, + "grad_norm": 40.079246520996094, + "learning_rate": 5.7015369360436295e-06, + "loss": 1.7822, + "step": 690 + }, + { + "epoch": 0.11567857880603181, + "grad_norm": 46.70073699951172, + "learning_rate": 5.784167906131218e-06, + "loss": 1.7975, + "step": 700 + }, + { + "epoch": 0.11733112993183227, + "grad_norm": 27.957921981811523, + "learning_rate": 5.866798876218807e-06, + "loss": 1.9022, + "step": 710 + }, + { + "epoch": 0.11898368105763273, + "grad_norm": 87.9314956665039, + "learning_rate": 5.949429846306396e-06, + "loss": 1.7631, + "step": 720 + }, + { + "epoch": 0.12063623218343318, + "grad_norm": 28.518705368041992, + "learning_rate": 6.032060816393985e-06, + "loss": 1.8346, + "step": 730 + }, + { + "epoch": 0.12228878330923362, + "grad_norm": 35.01418685913086, + "learning_rate": 6.1146917864815735e-06, + "loss": 1.7283, + "step": 740 + }, + { + "epoch": 0.12394133443503408, + "grad_norm": 25.934650421142578, + "learning_rate": 6.197322756569163e-06, + "loss": 1.7489, + "step": 750 + }, + { + "epoch": 0.12559388556083453, + "grad_norm": 49.37101364135742, + "learning_rate": 6.279953726656751e-06, + "loss": 1.7778, + "step": 760 + }, + { + "epoch": 0.127246436686635, + "grad_norm": 40.169677734375, + "learning_rate": 6.362584696744341e-06, + "loss": 1.9885, + "step": 770 + }, + { + "epoch": 0.12889898781243544, + "grad_norm": 39.495182037353516, + "learning_rate": 6.445215666831929e-06, + "loss": 1.8897, + "step": 780 + }, + { + "epoch": 0.1305515389382359, + "grad_norm": 33.42755889892578, + "learning_rate": 6.527846636919517e-06, + "loss": 1.8193, + "step": 790 + }, + { + "epoch": 0.13220409006403636, + "grad_norm": 53.90789794921875, + "learning_rate": 6.610477607007107e-06, + "loss": 1.7312, + "step": 800 + }, + { + "epoch": 0.1338566411898368, + "grad_norm": 42.573890686035156, + "learning_rate": 6.693108577094695e-06, + "loss": 1.8778, + "step": 810 + }, + { + "epoch": 0.13550919231563727, + "grad_norm": 20.9980525970459, + "learning_rate": 6.775739547182285e-06, + "loss": 1.7827, + "step": 820 + }, + { + "epoch": 0.13716174344143772, + "grad_norm": 27.766277313232422, + "learning_rate": 6.858370517269873e-06, + "loss": 1.8191, + "step": 830 + }, + { + "epoch": 0.13881429456723818, + "grad_norm": 26.448427200317383, + "learning_rate": 6.941001487357462e-06, + "loss": 1.8897, + "step": 840 + }, + { + "epoch": 0.14046684569303863, + "grad_norm": 52.19208908081055, + "learning_rate": 7.023632457445051e-06, + "loss": 1.932, + "step": 850 + }, + { + "epoch": 0.1421193968188391, + "grad_norm": 21.94788360595703, + "learning_rate": 7.106263427532639e-06, + "loss": 1.8672, + "step": 860 + }, + { + "epoch": 0.14377194794463954, + "grad_norm": 40.3758430480957, + "learning_rate": 7.188894397620229e-06, + "loss": 1.8301, + "step": 870 + }, + { + "epoch": 0.14542449907044, + "grad_norm": 22.807653427124023, + "learning_rate": 7.271525367707817e-06, + "loss": 1.8321, + "step": 880 + }, + { + "epoch": 0.14707705019624046, + "grad_norm": 22.939292907714844, + "learning_rate": 7.3541563377954064e-06, + "loss": 1.7721, + "step": 890 + }, + { + "epoch": 0.1487296013220409, + "grad_norm": 34.43024444580078, + "learning_rate": 7.436787307882995e-06, + "loss": 1.7898, + "step": 900 + }, + { + "epoch": 0.15038215244784137, + "grad_norm": 19.193397521972656, + "learning_rate": 7.519418277970584e-06, + "loss": 1.8499, + "step": 910 + }, + { + "epoch": 0.15203470357364182, + "grad_norm": 26.9038143157959, + "learning_rate": 
7.602049248058173e-06, + "loss": 1.8749, + "step": 920 + }, + { + "epoch": 0.15368725469944228, + "grad_norm": 30.20845603942871, + "learning_rate": 7.684680218145761e-06, + "loss": 1.7995, + "step": 930 + }, + { + "epoch": 0.1553398058252427, + "grad_norm": 30.089397430419922, + "learning_rate": 7.767311188233351e-06, + "loss": 1.8802, + "step": 940 + }, + { + "epoch": 0.15699235695104316, + "grad_norm": 18.006601333618164, + "learning_rate": 7.849942158320938e-06, + "loss": 1.8049, + "step": 950 + }, + { + "epoch": 0.15864490807684362, + "grad_norm": 22.729703903198242, + "learning_rate": 7.932573128408528e-06, + "loss": 1.8031, + "step": 960 + }, + { + "epoch": 0.16029745920264407, + "grad_norm": 22.711090087890625, + "learning_rate": 8.015204098496117e-06, + "loss": 1.7107, + "step": 970 + }, + { + "epoch": 0.16195001032844453, + "grad_norm": 24.80642318725586, + "learning_rate": 8.097835068583707e-06, + "loss": 1.8531, + "step": 980 + }, + { + "epoch": 0.16360256145424498, + "grad_norm": 28.32562255859375, + "learning_rate": 8.180466038671294e-06, + "loss": 1.927, + "step": 990 + }, + { + "epoch": 0.16525511258004544, + "grad_norm": 20.05076789855957, + "learning_rate": 8.263097008758882e-06, + "loss": 1.7359, + "step": 1000 + }, + { + "epoch": 0.1669076637058459, + "grad_norm": 32.90656661987305, + "learning_rate": 8.345727978846472e-06, + "loss": 1.7466, + "step": 1010 + }, + { + "epoch": 0.16856021483164635, + "grad_norm": 44.9754524230957, + "learning_rate": 8.42835894893406e-06, + "loss": 1.8069, + "step": 1020 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 201.73477172851562, + "learning_rate": 8.510989919021651e-06, + "loss": 1.7286, + "step": 1030 + }, + { + "epoch": 0.17186531708324726, + "grad_norm": 18.8424015045166, + "learning_rate": 8.593620889109238e-06, + "loss": 1.8063, + "step": 1040 + }, + { + "epoch": 0.17351786820904772, + "grad_norm": 26.58818244934082, + "learning_rate": 8.676251859196826e-06, + "loss": 1.9267, + "step": 1050 + }, + { + "epoch": 0.17517041933484817, + "grad_norm": 37.0977897644043, + "learning_rate": 8.758882829284416e-06, + "loss": 1.7116, + "step": 1060 + }, + { + "epoch": 0.17682297046064863, + "grad_norm": 19.845773696899414, + "learning_rate": 8.841513799372005e-06, + "loss": 1.7994, + "step": 1070 + }, + { + "epoch": 0.17847552158644908, + "grad_norm": 93.75279235839844, + "learning_rate": 8.924144769459595e-06, + "loss": 1.8503, + "step": 1080 + }, + { + "epoch": 0.18012807271224954, + "grad_norm": 50.5566291809082, + "learning_rate": 9.006775739547182e-06, + "loss": 1.7882, + "step": 1090 + }, + { + "epoch": 0.18178062383805, + "grad_norm": 19.830806732177734, + "learning_rate": 9.089406709634772e-06, + "loss": 1.7492, + "step": 1100 + }, + { + "epoch": 0.18343317496385045, + "grad_norm": 88.4405746459961, + "learning_rate": 9.17203767972236e-06, + "loss": 1.8902, + "step": 1110 + }, + { + "epoch": 0.1850857260896509, + "grad_norm": 38.67186737060547, + "learning_rate": 9.254668649809949e-06, + "loss": 1.7064, + "step": 1120 + }, + { + "epoch": 0.18673827721545136, + "grad_norm": 17.757261276245117, + "learning_rate": 9.337299619897539e-06, + "loss": 1.7849, + "step": 1130 + }, + { + "epoch": 0.18839082834125181, + "grad_norm": 20.545978546142578, + "learning_rate": 9.419930589985126e-06, + "loss": 1.7574, + "step": 1140 + }, + { + "epoch": 0.19004337946705227, + "grad_norm": 15.944082260131836, + "learning_rate": 9.502561560072716e-06, + "loss": 1.7188, + "step": 1150 + }, + { + "epoch": 0.19169593059285273, + "grad_norm": 
32.81608963012695, + "learning_rate": 9.585192530160304e-06, + "loss": 1.9046, + "step": 1160 + }, + { + "epoch": 0.19334848171865318, + "grad_norm": 18.563947677612305, + "learning_rate": 9.667823500247894e-06, + "loss": 1.7162, + "step": 1170 + }, + { + "epoch": 0.19500103284445364, + "grad_norm": 25.642000198364258, + "learning_rate": 9.750454470335483e-06, + "loss": 1.7451, + "step": 1180 + }, + { + "epoch": 0.1966535839702541, + "grad_norm": 32.40825271606445, + "learning_rate": 9.83308544042307e-06, + "loss": 1.7892, + "step": 1190 + }, + { + "epoch": 0.19830613509605455, + "grad_norm": 19.13682746887207, + "learning_rate": 9.91571641051066e-06, + "loss": 1.8015, + "step": 1200 + }, + { + "epoch": 0.19995868622185498, + "grad_norm": 28.85736083984375, + "learning_rate": 9.998347380598248e-06, + "loss": 1.7506, + "step": 1210 + }, + { + "epoch": 0.20161123734765543, + "grad_norm": 15.917028427124023, + "learning_rate": 1.0080978350685838e-05, + "loss": 1.7728, + "step": 1220 + }, + { + "epoch": 0.2032637884734559, + "grad_norm": 29.365341186523438, + "learning_rate": 1.0163609320773427e-05, + "loss": 1.8862, + "step": 1230 + }, + { + "epoch": 0.20491633959925634, + "grad_norm": 30.120737075805664, + "learning_rate": 1.0246240290861015e-05, + "loss": 1.7482, + "step": 1240 + }, + { + "epoch": 0.2065688907250568, + "grad_norm": 19.36783790588379, + "learning_rate": 1.0328871260948604e-05, + "loss": 1.6657, + "step": 1250 + }, + { + "epoch": 0.20822144185085725, + "grad_norm": 21.701364517211914, + "learning_rate": 1.0411502231036192e-05, + "loss": 1.8962, + "step": 1260 + }, + { + "epoch": 0.2098739929766577, + "grad_norm": 23.340579986572266, + "learning_rate": 1.0494133201123783e-05, + "loss": 1.828, + "step": 1270 + }, + { + "epoch": 0.21152654410245816, + "grad_norm": 22.94394874572754, + "learning_rate": 1.0576764171211371e-05, + "loss": 1.7493, + "step": 1280 + }, + { + "epoch": 0.21317909522825862, + "grad_norm": 24.351810455322266, + "learning_rate": 1.065939514129896e-05, + "loss": 1.7058, + "step": 1290 + }, + { + "epoch": 0.21483164635405907, + "grad_norm": 35.676246643066406, + "learning_rate": 1.0742026111386548e-05, + "loss": 1.6535, + "step": 1300 + }, + { + "epoch": 0.21648419747985953, + "grad_norm": 19.702678680419922, + "learning_rate": 1.0824657081474138e-05, + "loss": 1.8294, + "step": 1310 + }, + { + "epoch": 0.21813674860565999, + "grad_norm": 29.777185440063477, + "learning_rate": 1.0907288051561727e-05, + "loss": 1.7602, + "step": 1320 + }, + { + "epoch": 0.21978929973146044, + "grad_norm": 41.81754684448242, + "learning_rate": 1.0989919021649315e-05, + "loss": 1.6949, + "step": 1330 + }, + { + "epoch": 0.2214418508572609, + "grad_norm": 24.266855239868164, + "learning_rate": 1.1072549991736903e-05, + "loss": 1.7718, + "step": 1340 + }, + { + "epoch": 0.22309440198306135, + "grad_norm": 36.88713073730469, + "learning_rate": 1.1155180961824492e-05, + "loss": 1.7773, + "step": 1350 + }, + { + "epoch": 0.2247469531088618, + "grad_norm": 20.02907943725586, + "learning_rate": 1.1237811931912082e-05, + "loss": 1.8017, + "step": 1360 + }, + { + "epoch": 0.22639950423466226, + "grad_norm": 32.76356887817383, + "learning_rate": 1.132044290199967e-05, + "loss": 1.6808, + "step": 1370 + }, + { + "epoch": 0.22805205536046272, + "grad_norm": 28.06949234008789, + "learning_rate": 1.1403073872087259e-05, + "loss": 1.7669, + "step": 1380 + }, + { + "epoch": 0.22970460648626317, + "grad_norm": 27.715717315673828, + "learning_rate": 1.1485704842174847e-05, + "loss": 1.8211, + 
"step": 1390 + }, + { + "epoch": 0.23135715761206363, + "grad_norm": 14.42801570892334, + "learning_rate": 1.1568335812262436e-05, + "loss": 1.7236, + "step": 1400 + }, + { + "epoch": 0.23300970873786409, + "grad_norm": 24.549047470092773, + "learning_rate": 1.1650966782350026e-05, + "loss": 1.7445, + "step": 1410 + }, + { + "epoch": 0.23466225986366454, + "grad_norm": 19.45368003845215, + "learning_rate": 1.1733597752437615e-05, + "loss": 1.7746, + "step": 1420 + }, + { + "epoch": 0.236314810989465, + "grad_norm": 25.40043830871582, + "learning_rate": 1.1816228722525203e-05, + "loss": 1.88, + "step": 1430 + }, + { + "epoch": 0.23796736211526545, + "grad_norm": 80.87067413330078, + "learning_rate": 1.1898859692612792e-05, + "loss": 1.7234, + "step": 1440 + }, + { + "epoch": 0.2396199132410659, + "grad_norm": 12.040018081665039, + "learning_rate": 1.198149066270038e-05, + "loss": 1.7625, + "step": 1450 + }, + { + "epoch": 0.24127246436686636, + "grad_norm": 16.50065803527832, + "learning_rate": 1.206412163278797e-05, + "loss": 1.8548, + "step": 1460 + }, + { + "epoch": 0.24292501549266682, + "grad_norm": 21.08123207092285, + "learning_rate": 1.2146752602875559e-05, + "loss": 1.7773, + "step": 1470 + }, + { + "epoch": 0.24457756661846725, + "grad_norm": 13.436989784240723, + "learning_rate": 1.2229383572963147e-05, + "loss": 1.6914, + "step": 1480 + }, + { + "epoch": 0.2462301177442677, + "grad_norm": 157.91610717773438, + "learning_rate": 1.2312014543050736e-05, + "loss": 1.7556, + "step": 1490 + }, + { + "epoch": 0.24788266887006816, + "grad_norm": 37.71729278564453, + "learning_rate": 1.2394645513138326e-05, + "loss": 1.691, + "step": 1500 + }, + { + "epoch": 0.2495352199958686, + "grad_norm": 73.44625854492188, + "learning_rate": 1.2477276483225914e-05, + "loss": 1.7093, + "step": 1510 + }, + { + "epoch": 0.25118777112166907, + "grad_norm": 24.017974853515625, + "learning_rate": 1.2559907453313503e-05, + "loss": 1.691, + "step": 1520 + }, + { + "epoch": 0.25284032224746955, + "grad_norm": 34.528289794921875, + "learning_rate": 1.264253842340109e-05, + "loss": 1.9174, + "step": 1530 + }, + { + "epoch": 0.25449287337327, + "grad_norm": 16.658061981201172, + "learning_rate": 1.2725169393488681e-05, + "loss": 1.6134, + "step": 1540 + }, + { + "epoch": 0.25614542449907046, + "grad_norm": 49.6403694152832, + "learning_rate": 1.280780036357627e-05, + "loss": 1.7812, + "step": 1550 + }, + { + "epoch": 0.2577979756248709, + "grad_norm": 13.805188179016113, + "learning_rate": 1.2890431333663858e-05, + "loss": 1.7681, + "step": 1560 + }, + { + "epoch": 0.2594505267506714, + "grad_norm": 51.3933219909668, + "learning_rate": 1.2973062303751447e-05, + "loss": 1.7339, + "step": 1570 + }, + { + "epoch": 0.2611030778764718, + "grad_norm": 12.193310737609863, + "learning_rate": 1.3055693273839033e-05, + "loss": 1.8874, + "step": 1580 + }, + { + "epoch": 0.2627556290022723, + "grad_norm": 19.450271606445312, + "learning_rate": 1.3138324243926625e-05, + "loss": 1.8667, + "step": 1590 + }, + { + "epoch": 0.2644081801280727, + "grad_norm": 22.680065155029297, + "learning_rate": 1.3220955214014214e-05, + "loss": 1.8435, + "step": 1600 + }, + { + "epoch": 0.26606073125387314, + "grad_norm": 43.32194900512695, + "learning_rate": 1.3303586184101802e-05, + "loss": 1.685, + "step": 1610 + }, + { + "epoch": 0.2677132823796736, + "grad_norm": 14.092884063720703, + "learning_rate": 1.338621715418939e-05, + "loss": 1.7688, + "step": 1620 + }, + { + "epoch": 0.26936583350547405, + "grad_norm": 20.905393600463867, + 
"learning_rate": 1.346884812427698e-05, + "loss": 1.8398, + "step": 1630 + }, + { + "epoch": 0.27101838463127453, + "grad_norm": 12.114250183105469, + "learning_rate": 1.355147909436457e-05, + "loss": 1.7383, + "step": 1640 + }, + { + "epoch": 0.27267093575707496, + "grad_norm": 16.64354133605957, + "learning_rate": 1.3634110064452158e-05, + "loss": 1.6308, + "step": 1650 + }, + { + "epoch": 0.27432348688287544, + "grad_norm": 18.8261661529541, + "learning_rate": 1.3716741034539746e-05, + "loss": 1.7076, + "step": 1660 + }, + { + "epoch": 0.2759760380086759, + "grad_norm": 11.279623031616211, + "learning_rate": 1.3799372004627335e-05, + "loss": 1.774, + "step": 1670 + }, + { + "epoch": 0.27762858913447636, + "grad_norm": 12.221219062805176, + "learning_rate": 1.3882002974714925e-05, + "loss": 1.5898, + "step": 1680 + }, + { + "epoch": 0.2792811402602768, + "grad_norm": 14.965773582458496, + "learning_rate": 1.3964633944802513e-05, + "loss": 1.6842, + "step": 1690 + }, + { + "epoch": 0.28093369138607727, + "grad_norm": 28.31591796875, + "learning_rate": 1.4047264914890102e-05, + "loss": 1.6575, + "step": 1700 + }, + { + "epoch": 0.2825862425118777, + "grad_norm": 12.584307670593262, + "learning_rate": 1.412989588497769e-05, + "loss": 1.6283, + "step": 1710 + }, + { + "epoch": 0.2842387936376782, + "grad_norm": 30.381515502929688, + "learning_rate": 1.4212526855065279e-05, + "loss": 1.7115, + "step": 1720 + }, + { + "epoch": 0.2858913447634786, + "grad_norm": 20.99606704711914, + "learning_rate": 1.4295157825152869e-05, + "loss": 1.7297, + "step": 1730 + }, + { + "epoch": 0.2875438958892791, + "grad_norm": 91.88641357421875, + "learning_rate": 1.4377788795240457e-05, + "loss": 1.7907, + "step": 1740 + }, + { + "epoch": 0.2891964470150795, + "grad_norm": 14.486876487731934, + "learning_rate": 1.4460419765328046e-05, + "loss": 1.763, + "step": 1750 + }, + { + "epoch": 0.29084899814088, + "grad_norm": 32.83678436279297, + "learning_rate": 1.4543050735415634e-05, + "loss": 1.8511, + "step": 1760 + }, + { + "epoch": 0.2925015492666804, + "grad_norm": 15.216378211975098, + "learning_rate": 1.4625681705503224e-05, + "loss": 1.8471, + "step": 1770 + }, + { + "epoch": 0.2941541003924809, + "grad_norm": 34.238189697265625, + "learning_rate": 1.4708312675590813e-05, + "loss": 1.7476, + "step": 1780 + }, + { + "epoch": 0.29580665151828134, + "grad_norm": 26.102210998535156, + "learning_rate": 1.4790943645678401e-05, + "loss": 1.5983, + "step": 1790 + }, + { + "epoch": 0.2974592026440818, + "grad_norm": 23.120174407958984, + "learning_rate": 1.487357461576599e-05, + "loss": 1.7779, + "step": 1800 + }, + { + "epoch": 0.29911175376988225, + "grad_norm": 14.618244171142578, + "learning_rate": 1.4956205585853578e-05, + "loss": 1.6681, + "step": 1810 + }, + { + "epoch": 0.30076430489568273, + "grad_norm": 33.785186767578125, + "learning_rate": 1.5038836555941168e-05, + "loss": 1.7867, + "step": 1820 + }, + { + "epoch": 0.30241685602148316, + "grad_norm": 11.777277946472168, + "learning_rate": 1.5121467526028757e-05, + "loss": 1.6726, + "step": 1830 + }, + { + "epoch": 0.30406940714728364, + "grad_norm": 15.27955150604248, + "learning_rate": 1.5204098496116345e-05, + "loss": 1.7069, + "step": 1840 + }, + { + "epoch": 0.30572195827308407, + "grad_norm": 25.062849044799805, + "learning_rate": 1.5286729466203932e-05, + "loss": 1.6681, + "step": 1850 + }, + { + "epoch": 0.30737450939888455, + "grad_norm": 19.023670196533203, + "learning_rate": 1.5369360436291522e-05, + "loss": 1.7546, + "step": 1860 + }, + { + 
"epoch": 0.309027060524685, + "grad_norm": 14.75047492980957, + "learning_rate": 1.5451991406379112e-05, + "loss": 1.7295, + "step": 1870 + }, + { + "epoch": 0.3106796116504854, + "grad_norm": 12.37612247467041, + "learning_rate": 1.5534622376466703e-05, + "loss": 1.8116, + "step": 1880 + }, + { + "epoch": 0.3123321627762859, + "grad_norm": 26.212312698364258, + "learning_rate": 1.561725334655429e-05, + "loss": 1.7477, + "step": 1890 + }, + { + "epoch": 0.3139847139020863, + "grad_norm": 52.385616302490234, + "learning_rate": 1.5699884316641876e-05, + "loss": 1.6689, + "step": 1900 + }, + { + "epoch": 0.3156372650278868, + "grad_norm": 15.535853385925293, + "learning_rate": 1.5782515286729466e-05, + "loss": 1.7874, + "step": 1910 + }, + { + "epoch": 0.31728981615368723, + "grad_norm": 15.0889892578125, + "learning_rate": 1.5865146256817056e-05, + "loss": 1.812, + "step": 1920 + }, + { + "epoch": 0.3189423672794877, + "grad_norm": 14.585269927978516, + "learning_rate": 1.5947777226904643e-05, + "loss": 1.7859, + "step": 1930 + }, + { + "epoch": 0.32059491840528814, + "grad_norm": 26.54705810546875, + "learning_rate": 1.6030408196992233e-05, + "loss": 1.6718, + "step": 1940 + }, + { + "epoch": 0.3222474695310886, + "grad_norm": 23.145902633666992, + "learning_rate": 1.611303916707982e-05, + "loss": 1.7847, + "step": 1950 + }, + { + "epoch": 0.32390002065688905, + "grad_norm": 19.175052642822266, + "learning_rate": 1.6195670137167414e-05, + "loss": 1.895, + "step": 1960 + }, + { + "epoch": 0.32555257178268954, + "grad_norm": 11.821075439453125, + "learning_rate": 1.6278301107255e-05, + "loss": 1.7336, + "step": 1970 + }, + { + "epoch": 0.32720512290848996, + "grad_norm": 13.310040473937988, + "learning_rate": 1.6360932077342587e-05, + "loss": 1.6935, + "step": 1980 + }, + { + "epoch": 0.32885767403429045, + "grad_norm": 20.371007919311523, + "learning_rate": 1.6443563047430177e-05, + "loss": 1.6693, + "step": 1990 + }, + { + "epoch": 0.3305102251600909, + "grad_norm": 9.38290786743164, + "learning_rate": 1.6526194017517764e-05, + "loss": 1.8035, + "step": 2000 + }, + { + "epoch": 0.33216277628589136, + "grad_norm": 11.759653091430664, + "learning_rate": 1.6608824987605358e-05, + "loss": 1.6504, + "step": 2010 + }, + { + "epoch": 0.3338153274116918, + "grad_norm": 9.93994140625, + "learning_rate": 1.6691455957692945e-05, + "loss": 1.6363, + "step": 2020 + }, + { + "epoch": 0.33546787853749227, + "grad_norm": 39.654541015625, + "learning_rate": 1.677408692778053e-05, + "loss": 1.6831, + "step": 2030 + }, + { + "epoch": 0.3371204296632927, + "grad_norm": 15.763498306274414, + "learning_rate": 1.685671789786812e-05, + "loss": 1.7134, + "step": 2040 + }, + { + "epoch": 0.3387729807890932, + "grad_norm": 13.894293785095215, + "learning_rate": 1.6939348867955708e-05, + "loss": 1.6449, + "step": 2050 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 12.600088119506836, + "learning_rate": 1.7021979838043302e-05, + "loss": 1.8048, + "step": 2060 + }, + { + "epoch": 0.3420780830406941, + "grad_norm": 49.38233947753906, + "learning_rate": 1.710461080813089e-05, + "loss": 1.669, + "step": 2070 + }, + { + "epoch": 0.3437306341664945, + "grad_norm": 123.16156768798828, + "learning_rate": 1.7187241778218475e-05, + "loss": 1.6796, + "step": 2080 + }, + { + "epoch": 0.345383185292295, + "grad_norm": 12.5492525100708, + "learning_rate": 1.7269872748306065e-05, + "loss": 1.5977, + "step": 2090 + }, + { + "epoch": 0.34703573641809543, + "grad_norm": 27.242778778076172, + "learning_rate": 
1.7352503718393652e-05, + "loss": 1.8755, + "step": 2100 + }, + { + "epoch": 0.3486882875438959, + "grad_norm": 12.04410457611084, + "learning_rate": 1.7435134688481246e-05, + "loss": 1.7807, + "step": 2110 + }, + { + "epoch": 0.35034083866969634, + "grad_norm": 37.36049270629883, + "learning_rate": 1.7517765658568833e-05, + "loss": 1.7438, + "step": 2120 + }, + { + "epoch": 0.3519933897954968, + "grad_norm": 19.858129501342773, + "learning_rate": 1.760039662865642e-05, + "loss": 1.9511, + "step": 2130 + }, + { + "epoch": 0.35364594092129725, + "grad_norm": 11.481547355651855, + "learning_rate": 1.768302759874401e-05, + "loss": 1.7014, + "step": 2140 + }, + { + "epoch": 0.3552984920470977, + "grad_norm": 81.52596282958984, + "learning_rate": 1.77656585688316e-05, + "loss": 1.6589, + "step": 2150 + }, + { + "epoch": 0.35695104317289816, + "grad_norm": 17.031217575073242, + "learning_rate": 1.784828953891919e-05, + "loss": 1.6914, + "step": 2160 + }, + { + "epoch": 0.3586035942986986, + "grad_norm": 16.88812255859375, + "learning_rate": 1.7930920509006777e-05, + "loss": 1.8675, + "step": 2170 + }, + { + "epoch": 0.3602561454244991, + "grad_norm": 10.55776309967041, + "learning_rate": 1.8013551479094363e-05, + "loss": 1.7589, + "step": 2180 + }, + { + "epoch": 0.3619086965502995, + "grad_norm": 84.68812561035156, + "learning_rate": 1.8096182449181954e-05, + "loss": 1.6879, + "step": 2190 + }, + { + "epoch": 0.3635612476761, + "grad_norm": 18.757328033447266, + "learning_rate": 1.8178813419269544e-05, + "loss": 1.7931, + "step": 2200 + }, + { + "epoch": 0.3652137988019004, + "grad_norm": 23.67148780822754, + "learning_rate": 1.8261444389357134e-05, + "loss": 1.7531, + "step": 2210 + }, + { + "epoch": 0.3668663499277009, + "grad_norm": 38.11347198486328, + "learning_rate": 1.834407535944472e-05, + "loss": 1.6541, + "step": 2220 + }, + { + "epoch": 0.3685189010535013, + "grad_norm": 15.99669075012207, + "learning_rate": 1.8426706329532307e-05, + "loss": 1.7447, + "step": 2230 + }, + { + "epoch": 0.3701714521793018, + "grad_norm": 16.732406616210938, + "learning_rate": 1.8509337299619898e-05, + "loss": 1.7383, + "step": 2240 + }, + { + "epoch": 0.37182400330510224, + "grad_norm": 36.895389556884766, + "learning_rate": 1.8591968269707488e-05, + "loss": 1.7366, + "step": 2250 + }, + { + "epoch": 0.3734765544309027, + "grad_norm": 14.426408767700195, + "learning_rate": 1.8674599239795078e-05, + "loss": 1.7979, + "step": 2260 + }, + { + "epoch": 0.37512910555670315, + "grad_norm": 19.50206184387207, + "learning_rate": 1.8757230209882665e-05, + "loss": 1.7249, + "step": 2270 + }, + { + "epoch": 0.37678165668250363, + "grad_norm": 33.446937561035156, + "learning_rate": 1.883986117997025e-05, + "loss": 1.8677, + "step": 2280 + }, + { + "epoch": 0.37843420780830406, + "grad_norm": 16.420921325683594, + "learning_rate": 1.8922492150057845e-05, + "loss": 1.608, + "step": 2290 + }, + { + "epoch": 0.38008675893410454, + "grad_norm": 33.9530029296875, + "learning_rate": 1.9005123120145432e-05, + "loss": 1.7731, + "step": 2300 + }, + { + "epoch": 0.38173931005990497, + "grad_norm": 14.426531791687012, + "learning_rate": 1.9087754090233022e-05, + "loss": 1.7938, + "step": 2310 + }, + { + "epoch": 0.38339186118570545, + "grad_norm": 18.52570152282715, + "learning_rate": 1.917038506032061e-05, + "loss": 1.7116, + "step": 2320 + }, + { + "epoch": 0.3850444123115059, + "grad_norm": 11.881404876708984, + "learning_rate": 1.9253016030408195e-05, + "loss": 1.719, + "step": 2330 + }, + { + "epoch": 0.38669696343730636, 
+ "grad_norm": 13.673921585083008, + "learning_rate": 1.933564700049579e-05, + "loss": 1.6652, + "step": 2340 + }, + { + "epoch": 0.3883495145631068, + "grad_norm": 12.703744888305664, + "learning_rate": 1.9418277970583376e-05, + "loss": 1.5334, + "step": 2350 + }, + { + "epoch": 0.3900020656889073, + "grad_norm": 25.498388290405273, + "learning_rate": 1.9500908940670966e-05, + "loss": 1.7857, + "step": 2360 + }, + { + "epoch": 0.3916546168147077, + "grad_norm": 10.993797302246094, + "learning_rate": 1.9583539910758553e-05, + "loss": 1.6038, + "step": 2370 + }, + { + "epoch": 0.3933071679405082, + "grad_norm": 17.91847038269043, + "learning_rate": 1.966617088084614e-05, + "loss": 1.6546, + "step": 2380 + }, + { + "epoch": 0.3949597190663086, + "grad_norm": 22.401233673095703, + "learning_rate": 1.9748801850933733e-05, + "loss": 1.7847, + "step": 2390 + }, + { + "epoch": 0.3966122701921091, + "grad_norm": 20.50080108642578, + "learning_rate": 1.983143282102132e-05, + "loss": 1.6773, + "step": 2400 + }, + { + "epoch": 0.3982648213179095, + "grad_norm": 22.017650604248047, + "learning_rate": 1.991406379110891e-05, + "loss": 1.7695, + "step": 2410 + }, + { + "epoch": 0.39991737244370995, + "grad_norm": 10.337092399597168, + "learning_rate": 1.9996694761196497e-05, + "loss": 1.7029, + "step": 2420 + }, + { + "epoch": 0.40156992356951043, + "grad_norm": 10.79664134979248, + "learning_rate": 2.0079325731284083e-05, + "loss": 1.6576, + "step": 2430 + }, + { + "epoch": 0.40322247469531086, + "grad_norm": 33.853580474853516, + "learning_rate": 2.0161956701371677e-05, + "loss": 1.7565, + "step": 2440 + }, + { + "epoch": 0.40487502582111135, + "grad_norm": 15.0186767578125, + "learning_rate": 2.0244587671459264e-05, + "loss": 1.7022, + "step": 2450 + }, + { + "epoch": 0.4065275769469118, + "grad_norm": 21.504180908203125, + "learning_rate": 2.0327218641546854e-05, + "loss": 1.5559, + "step": 2460 + }, + { + "epoch": 0.40818012807271226, + "grad_norm": 15.873148918151855, + "learning_rate": 2.040984961163444e-05, + "loss": 1.7129, + "step": 2470 + }, + { + "epoch": 0.4098326791985127, + "grad_norm": 19.291330337524414, + "learning_rate": 2.049248058172203e-05, + "loss": 1.7669, + "step": 2480 + }, + { + "epoch": 0.41148523032431317, + "grad_norm": 10.179187774658203, + "learning_rate": 2.057511155180962e-05, + "loss": 1.7552, + "step": 2490 + }, + { + "epoch": 0.4131377814501136, + "grad_norm": 46.47639083862305, + "learning_rate": 2.0657742521897208e-05, + "loss": 1.757, + "step": 2500 + }, + { + "epoch": 0.4147903325759141, + "grad_norm": 13.199604988098145, + "learning_rate": 2.0740373491984798e-05, + "loss": 1.6626, + "step": 2510 + }, + { + "epoch": 0.4164428837017145, + "grad_norm": 9.901959419250488, + "learning_rate": 2.0823004462072385e-05, + "loss": 1.6169, + "step": 2520 + }, + { + "epoch": 0.418095434827515, + "grad_norm": 21.13959312438965, + "learning_rate": 2.0905635432159975e-05, + "loss": 1.6024, + "step": 2530 + }, + { + "epoch": 0.4197479859533154, + "grad_norm": 7.372568130493164, + "learning_rate": 2.0988266402247565e-05, + "loss": 1.7535, + "step": 2540 + }, + { + "epoch": 0.4214005370791159, + "grad_norm": 61.1468620300293, + "learning_rate": 2.1070897372335152e-05, + "loss": 1.6967, + "step": 2550 + }, + { + "epoch": 0.4230530882049163, + "grad_norm": 16.36248779296875, + "learning_rate": 2.1153528342422742e-05, + "loss": 1.6661, + "step": 2560 + }, + { + "epoch": 0.4247056393307168, + "grad_norm": 13.132911682128906, + "learning_rate": 2.123615931251033e-05, + "loss": 1.7467, + 
"step": 2570 + }, + { + "epoch": 0.42635819045651724, + "grad_norm": 17.8082218170166, + "learning_rate": 2.131879028259792e-05, + "loss": 1.7569, + "step": 2580 + }, + { + "epoch": 0.4280107415823177, + "grad_norm": 48.110652923583984, + "learning_rate": 2.140142125268551e-05, + "loss": 1.6307, + "step": 2590 + }, + { + "epoch": 0.42966329270811815, + "grad_norm": 32.73516082763672, + "learning_rate": 2.1484052222773096e-05, + "loss": 1.7427, + "step": 2600 + }, + { + "epoch": 0.43131584383391863, + "grad_norm": 29.785804748535156, + "learning_rate": 2.1566683192860686e-05, + "loss": 1.8084, + "step": 2610 + }, + { + "epoch": 0.43296839495971906, + "grad_norm": 21.294347763061523, + "learning_rate": 2.1649314162948276e-05, + "loss": 1.8063, + "step": 2620 + }, + { + "epoch": 0.43462094608551954, + "grad_norm": 10.334474563598633, + "learning_rate": 2.1731945133035863e-05, + "loss": 1.7292, + "step": 2630 + }, + { + "epoch": 0.43627349721131997, + "grad_norm": 37.316349029541016, + "learning_rate": 2.1814576103123453e-05, + "loss": 1.7178, + "step": 2640 + }, + { + "epoch": 0.43792604833712045, + "grad_norm": 15.207347869873047, + "learning_rate": 2.189720707321104e-05, + "loss": 1.6157, + "step": 2650 + }, + { + "epoch": 0.4395785994629209, + "grad_norm": 44.354366302490234, + "learning_rate": 2.197983804329863e-05, + "loss": 1.6401, + "step": 2660 + }, + { + "epoch": 0.44123115058872137, + "grad_norm": 13.655917167663574, + "learning_rate": 2.206246901338622e-05, + "loss": 1.6388, + "step": 2670 + }, + { + "epoch": 0.4428837017145218, + "grad_norm": 25.89215660095215, + "learning_rate": 2.2145099983473807e-05, + "loss": 1.7077, + "step": 2680 + }, + { + "epoch": 0.4445362528403222, + "grad_norm": 9.225777626037598, + "learning_rate": 2.2227730953561397e-05, + "loss": 1.7651, + "step": 2690 + }, + { + "epoch": 0.4461888039661227, + "grad_norm": 21.08796501159668, + "learning_rate": 2.2310361923648984e-05, + "loss": 1.6447, + "step": 2700 + }, + { + "epoch": 0.44784135509192313, + "grad_norm": 16.045692443847656, + "learning_rate": 2.2392992893736574e-05, + "loss": 1.6521, + "step": 2710 + }, + { + "epoch": 0.4494939062177236, + "grad_norm": 16.895357131958008, + "learning_rate": 2.2475623863824164e-05, + "loss": 1.5923, + "step": 2720 + }, + { + "epoch": 0.45114645734352404, + "grad_norm": 19.075815200805664, + "learning_rate": 2.255825483391175e-05, + "loss": 1.6618, + "step": 2730 + }, + { + "epoch": 0.4527990084693245, + "grad_norm": 12.131684303283691, + "learning_rate": 2.264088580399934e-05, + "loss": 1.7285, + "step": 2740 + }, + { + "epoch": 0.45445155959512495, + "grad_norm": 60.209259033203125, + "learning_rate": 2.2723516774086928e-05, + "loss": 1.6896, + "step": 2750 + }, + { + "epoch": 0.45610411072092544, + "grad_norm": 22.111814498901367, + "learning_rate": 2.2806147744174518e-05, + "loss": 1.7719, + "step": 2760 + }, + { + "epoch": 0.45775666184672587, + "grad_norm": 9.202242851257324, + "learning_rate": 2.2888778714262108e-05, + "loss": 1.7777, + "step": 2770 + }, + { + "epoch": 0.45940921297252635, + "grad_norm": 63.52800750732422, + "learning_rate": 2.2971409684349695e-05, + "loss": 1.6549, + "step": 2780 + }, + { + "epoch": 0.4610617640983268, + "grad_norm": 12.614968299865723, + "learning_rate": 2.3054040654437285e-05, + "loss": 1.8158, + "step": 2790 + }, + { + "epoch": 0.46271431522412726, + "grad_norm": 10.92754077911377, + "learning_rate": 2.3136671624524872e-05, + "loss": 1.6745, + "step": 2800 + }, + { + "epoch": 0.4643668663499277, + "grad_norm": 
57.359619140625, + "learning_rate": 2.3219302594612462e-05, + "loss": 1.7044, + "step": 2810 + }, + { + "epoch": 0.46601941747572817, + "grad_norm": 19.017126083374023, + "learning_rate": 2.3301933564700052e-05, + "loss": 1.6981, + "step": 2820 + }, + { + "epoch": 0.4676719686015286, + "grad_norm": 26.398160934448242, + "learning_rate": 2.338456453478764e-05, + "loss": 1.7188, + "step": 2830 + }, + { + "epoch": 0.4693245197273291, + "grad_norm": 13.000535011291504, + "learning_rate": 2.346719550487523e-05, + "loss": 1.6572, + "step": 2840 + }, + { + "epoch": 0.4709770708531295, + "grad_norm": 18.722341537475586, + "learning_rate": 2.3549826474962816e-05, + "loss": 1.8101, + "step": 2850 + }, + { + "epoch": 0.47262962197893, + "grad_norm": 94.03982543945312, + "learning_rate": 2.3632457445050406e-05, + "loss": 1.5997, + "step": 2860 + }, + { + "epoch": 0.4742821731047304, + "grad_norm": 45.00182342529297, + "learning_rate": 2.3715088415137996e-05, + "loss": 1.6486, + "step": 2870 + }, + { + "epoch": 0.4759347242305309, + "grad_norm": 26.30855941772461, + "learning_rate": 2.3797719385225583e-05, + "loss": 1.7285, + "step": 2880 + }, + { + "epoch": 0.47758727535633133, + "grad_norm": 127.1923599243164, + "learning_rate": 2.3880350355313173e-05, + "loss": 1.7588, + "step": 2890 + }, + { + "epoch": 0.4792398264821318, + "grad_norm": 22.71497917175293, + "learning_rate": 2.396298132540076e-05, + "loss": 1.6556, + "step": 2900 + }, + { + "epoch": 0.48089237760793224, + "grad_norm": 17.4209041595459, + "learning_rate": 2.404561229548835e-05, + "loss": 1.6399, + "step": 2910 + }, + { + "epoch": 0.4825449287337327, + "grad_norm": 38.692264556884766, + "learning_rate": 2.412824326557594e-05, + "loss": 1.6147, + "step": 2920 + }, + { + "epoch": 0.48419747985953315, + "grad_norm": 26.006572723388672, + "learning_rate": 2.4210874235663527e-05, + "loss": 1.6381, + "step": 2930 + }, + { + "epoch": 0.48585003098533364, + "grad_norm": 13.967598915100098, + "learning_rate": 2.4293505205751117e-05, + "loss": 1.6556, + "step": 2940 + }, + { + "epoch": 0.48750258211113406, + "grad_norm": 15.196572303771973, + "learning_rate": 2.4376136175838707e-05, + "loss": 1.6714, + "step": 2950 + }, + { + "epoch": 0.4891551332369345, + "grad_norm": 23.98309326171875, + "learning_rate": 2.4458767145926294e-05, + "loss": 1.7578, + "step": 2960 + }, + { + "epoch": 0.490807684362735, + "grad_norm": 19.997920989990234, + "learning_rate": 2.4541398116013884e-05, + "loss": 1.7737, + "step": 2970 + }, + { + "epoch": 0.4924602354885354, + "grad_norm": 26.095102310180664, + "learning_rate": 2.462402908610147e-05, + "loss": 1.709, + "step": 2980 + }, + { + "epoch": 0.4941127866143359, + "grad_norm": 60.75287628173828, + "learning_rate": 2.470666005618906e-05, + "loss": 1.7374, + "step": 2990 + }, + { + "epoch": 0.4957653377401363, + "grad_norm": 12.193611145019531, + "learning_rate": 2.478929102627665e-05, + "loss": 1.7192, + "step": 3000 + }, + { + "epoch": 0.4974178888659368, + "grad_norm": 21.721214294433594, + "learning_rate": 2.4871921996364238e-05, + "loss": 1.6808, + "step": 3010 + }, + { + "epoch": 0.4990704399917372, + "grad_norm": 9.847155570983887, + "learning_rate": 2.4954552966451828e-05, + "loss": 1.6721, + "step": 3020 + }, + { + "epoch": 0.5007229911175377, + "grad_norm": 12.325818061828613, + "learning_rate": 2.5037183936539415e-05, + "loss": 1.8163, + "step": 3030 + }, + { + "epoch": 0.5023755422433381, + "grad_norm": 33.089630126953125, + "learning_rate": 2.5119814906627005e-05, + "loss": 1.8137, + "step": 3040 + }, 
+ { + "epoch": 0.5040280933691386, + "grad_norm": 47.662208557128906, + "learning_rate": 2.5202445876714592e-05, + "loss": 1.7631, + "step": 3050 + }, + { + "epoch": 0.5056806444949391, + "grad_norm": 11.936186790466309, + "learning_rate": 2.528507684680218e-05, + "loss": 1.6799, + "step": 3060 + }, + { + "epoch": 0.5073331956207395, + "grad_norm": 24.59088706970215, + "learning_rate": 2.5367707816889776e-05, + "loss": 1.7288, + "step": 3070 + }, + { + "epoch": 0.50898574674654, + "grad_norm": 12.187572479248047, + "learning_rate": 2.5450338786977362e-05, + "loss": 1.7636, + "step": 3080 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 12.487037658691406, + "learning_rate": 2.553296975706495e-05, + "loss": 1.6933, + "step": 3090 + }, + { + "epoch": 0.5122908489981409, + "grad_norm": 16.490032196044922, + "learning_rate": 2.561560072715254e-05, + "loss": 1.8374, + "step": 3100 + }, + { + "epoch": 0.5139434001239414, + "grad_norm": 17.2027530670166, + "learning_rate": 2.5698231697240126e-05, + "loss": 1.6313, + "step": 3110 + }, + { + "epoch": 0.5155959512497418, + "grad_norm": 67.94872283935547, + "learning_rate": 2.5780862667327716e-05, + "loss": 1.6479, + "step": 3120 + }, + { + "epoch": 0.5172485023755422, + "grad_norm": 17.46609878540039, + "learning_rate": 2.5863493637415303e-05, + "loss": 1.7751, + "step": 3130 + }, + { + "epoch": 0.5189010535013427, + "grad_norm": 13.204253196716309, + "learning_rate": 2.5946124607502893e-05, + "loss": 1.6845, + "step": 3140 + }, + { + "epoch": 0.5205536046271432, + "grad_norm": 10.950384140014648, + "learning_rate": 2.602875557759048e-05, + "loss": 1.6612, + "step": 3150 + }, + { + "epoch": 0.5222061557529436, + "grad_norm": 25.03174591064453, + "learning_rate": 2.6111386547678067e-05, + "loss": 1.7298, + "step": 3160 + }, + { + "epoch": 0.523858706878744, + "grad_norm": 41.82197952270508, + "learning_rate": 2.6194017517765664e-05, + "loss": 1.7301, + "step": 3170 + }, + { + "epoch": 0.5255112580045446, + "grad_norm": 26.862686157226562, + "learning_rate": 2.627664848785325e-05, + "loss": 1.6881, + "step": 3180 + }, + { + "epoch": 0.527163809130345, + "grad_norm": 13.66534423828125, + "learning_rate": 2.6359279457940837e-05, + "loss": 1.6658, + "step": 3190 + }, + { + "epoch": 0.5288163602561454, + "grad_norm": 17.450504302978516, + "learning_rate": 2.6441910428028427e-05, + "loss": 1.6795, + "step": 3200 + }, + { + "epoch": 0.5304689113819459, + "grad_norm": 12.588276863098145, + "learning_rate": 2.6524541398116014e-05, + "loss": 1.6239, + "step": 3210 + }, + { + "epoch": 0.5321214625077463, + "grad_norm": 34.62938690185547, + "learning_rate": 2.6607172368203604e-05, + "loss": 1.5944, + "step": 3220 + }, + { + "epoch": 0.5337740136335468, + "grad_norm": 22.4811954498291, + "learning_rate": 2.668980333829119e-05, + "loss": 1.6605, + "step": 3230 + }, + { + "epoch": 0.5354265647593472, + "grad_norm": 13.54268741607666, + "learning_rate": 2.677243430837878e-05, + "loss": 1.6363, + "step": 3240 + }, + { + "epoch": 0.5370791158851477, + "grad_norm": 13.956992149353027, + "learning_rate": 2.6855065278466368e-05, + "loss": 1.6435, + "step": 3250 + }, + { + "epoch": 0.5387316670109481, + "grad_norm": 23.956907272338867, + "learning_rate": 2.693769624855396e-05, + "loss": 1.6956, + "step": 3260 + }, + { + "epoch": 0.5403842181367486, + "grad_norm": 16.56192398071289, + "learning_rate": 2.7020327218641552e-05, + "loss": 1.6613, + "step": 3270 + }, + { + "epoch": 0.5420367692625491, + "grad_norm": 21.902639389038086, + "learning_rate": 
2.710295818872914e-05, + "loss": 1.7387, + "step": 3280 + }, + { + "epoch": 0.5436893203883495, + "grad_norm": 24.28936004638672, + "learning_rate": 2.7185589158816725e-05, + "loss": 1.8334, + "step": 3290 + }, + { + "epoch": 0.5453418715141499, + "grad_norm": 6.949965476989746, + "learning_rate": 2.7268220128904315e-05, + "loss": 1.6161, + "step": 3300 + }, + { + "epoch": 0.5469944226399505, + "grad_norm": 10.36733627319336, + "learning_rate": 2.7350851098991902e-05, + "loss": 1.6539, + "step": 3310 + }, + { + "epoch": 0.5486469737657509, + "grad_norm": 15.764904975891113, + "learning_rate": 2.7433482069079492e-05, + "loss": 1.7367, + "step": 3320 + }, + { + "epoch": 0.5502995248915513, + "grad_norm": 30.997188568115234, + "learning_rate": 2.751611303916708e-05, + "loss": 1.7039, + "step": 3330 + }, + { + "epoch": 0.5519520760173517, + "grad_norm": 63.2742919921875, + "learning_rate": 2.759874400925467e-05, + "loss": 1.6414, + "step": 3340 + }, + { + "epoch": 0.5536046271431523, + "grad_norm": 13.54851245880127, + "learning_rate": 2.7681374979342256e-05, + "loss": 1.741, + "step": 3350 + }, + { + "epoch": 0.5552571782689527, + "grad_norm": 31.921226501464844, + "learning_rate": 2.776400594942985e-05, + "loss": 1.65, + "step": 3360 + }, + { + "epoch": 0.5569097293947531, + "grad_norm": 20.03881072998047, + "learning_rate": 2.7846636919517436e-05, + "loss": 1.6868, + "step": 3370 + }, + { + "epoch": 0.5585622805205536, + "grad_norm": 11.299986839294434, + "learning_rate": 2.7929267889605027e-05, + "loss": 1.7552, + "step": 3380 + }, + { + "epoch": 0.5602148316463541, + "grad_norm": 11.070747375488281, + "learning_rate": 2.8011898859692613e-05, + "loss": 1.6897, + "step": 3390 + }, + { + "epoch": 0.5618673827721545, + "grad_norm": 8.979963302612305, + "learning_rate": 2.8094529829780204e-05, + "loss": 1.6199, + "step": 3400 + }, + { + "epoch": 0.563519933897955, + "grad_norm": 11.309082984924316, + "learning_rate": 2.817716079986779e-05, + "loss": 1.8134, + "step": 3410 + }, + { + "epoch": 0.5651724850237554, + "grad_norm": 17.12977409362793, + "learning_rate": 2.825979176995538e-05, + "loss": 1.6499, + "step": 3420 + }, + { + "epoch": 0.5668250361495559, + "grad_norm": 6.682351589202881, + "learning_rate": 2.8342422740042967e-05, + "loss": 1.6524, + "step": 3430 + }, + { + "epoch": 0.5684775872753564, + "grad_norm": 20.999141693115234, + "learning_rate": 2.8425053710130557e-05, + "loss": 1.6885, + "step": 3440 + }, + { + "epoch": 0.5701301384011568, + "grad_norm": 10.989826202392578, + "learning_rate": 2.850768468021815e-05, + "loss": 1.5895, + "step": 3450 + }, + { + "epoch": 0.5717826895269572, + "grad_norm": 11.188637733459473, + "learning_rate": 2.8590315650305738e-05, + "loss": 1.7236, + "step": 3460 + }, + { + "epoch": 0.5734352406527577, + "grad_norm": 11.868768692016602, + "learning_rate": 2.8672946620393324e-05, + "loss": 1.7752, + "step": 3470 + }, + { + "epoch": 0.5750877917785582, + "grad_norm": 13.934696197509766, + "learning_rate": 2.8755577590480915e-05, + "loss": 1.6321, + "step": 3480 + }, + { + "epoch": 0.5767403429043586, + "grad_norm": 10.623023986816406, + "learning_rate": 2.88382085605685e-05, + "loss": 1.7567, + "step": 3490 + }, + { + "epoch": 0.578392894030159, + "grad_norm": 21.444114685058594, + "learning_rate": 2.892083953065609e-05, + "loss": 1.7353, + "step": 3500 + }, + { + "epoch": 0.5800454451559595, + "grad_norm": 22.53701400756836, + "learning_rate": 2.900347050074368e-05, + "loss": 1.6643, + "step": 3510 + }, + { + "epoch": 0.58169799628176, + 
"grad_norm": 8.35075855255127, + "learning_rate": 2.908610147083127e-05, + "loss": 1.6099, + "step": 3520 + }, + { + "epoch": 0.5833505474075604, + "grad_norm": 10.763632774353027, + "learning_rate": 2.9168732440918855e-05, + "loss": 1.7619, + "step": 3530 + }, + { + "epoch": 0.5850030985333609, + "grad_norm": 10.737512588500977, + "learning_rate": 2.925136341100645e-05, + "loss": 1.6345, + "step": 3540 + }, + { + "epoch": 0.5866556496591613, + "grad_norm": 15.459436416625977, + "learning_rate": 2.933399438109404e-05, + "loss": 1.6186, + "step": 3550 + }, + { + "epoch": 0.5883082007849618, + "grad_norm": 16.63528823852539, + "learning_rate": 2.9416625351181626e-05, + "loss": 1.7349, + "step": 3560 + }, + { + "epoch": 0.5899607519107622, + "grad_norm": 12.49666690826416, + "learning_rate": 2.9499256321269213e-05, + "loss": 1.7992, + "step": 3570 + }, + { + "epoch": 0.5916133030365627, + "grad_norm": 30.694421768188477, + "learning_rate": 2.9581887291356803e-05, + "loss": 1.7004, + "step": 3580 + }, + { + "epoch": 0.5932658541623631, + "grad_norm": 12.530024528503418, + "learning_rate": 2.966451826144439e-05, + "loss": 1.5653, + "step": 3590 + }, + { + "epoch": 0.5949184052881636, + "grad_norm": 10.325366020202637, + "learning_rate": 2.974714923153198e-05, + "loss": 1.6872, + "step": 3600 + }, + { + "epoch": 0.5965709564139641, + "grad_norm": 17.523683547973633, + "learning_rate": 2.9829780201619566e-05, + "loss": 1.5351, + "step": 3610 + }, + { + "epoch": 0.5982235075397645, + "grad_norm": 12.359028816223145, + "learning_rate": 2.9912411171707157e-05, + "loss": 1.6894, + "step": 3620 + }, + { + "epoch": 0.5998760586655649, + "grad_norm": 9.718979835510254, + "learning_rate": 2.9995042141794743e-05, + "loss": 1.4912, + "step": 3630 + }, + { + "epoch": 0.6015286097913655, + "grad_norm": 12.050354957580566, + "learning_rate": 3.0077673111882337e-05, + "loss": 1.6147, + "step": 3640 + }, + { + "epoch": 0.6031811609171659, + "grad_norm": 73.88978576660156, + "learning_rate": 3.0160304081969927e-05, + "loss": 1.639, + "step": 3650 + }, + { + "epoch": 0.6048337120429663, + "grad_norm": 12.085061073303223, + "learning_rate": 3.0242935052057514e-05, + "loss": 1.7605, + "step": 3660 + }, + { + "epoch": 0.6064862631687667, + "grad_norm": 12.490886688232422, + "learning_rate": 3.03255660221451e-05, + "loss": 1.6698, + "step": 3670 + }, + { + "epoch": 0.6081388142945673, + "grad_norm": 11.048460006713867, + "learning_rate": 3.040819699223269e-05, + "loss": 1.7394, + "step": 3680 + }, + { + "epoch": 0.6097913654203677, + "grad_norm": 9.499938011169434, + "learning_rate": 3.0490827962320277e-05, + "loss": 1.8048, + "step": 3690 + }, + { + "epoch": 0.6114439165461681, + "grad_norm": 17.809703826904297, + "learning_rate": 3.0573458932407864e-05, + "loss": 1.768, + "step": 3700 + }, + { + "epoch": 0.6130964676719686, + "grad_norm": 45.0641975402832, + "learning_rate": 3.0656089902495454e-05, + "loss": 1.7266, + "step": 3710 + }, + { + "epoch": 0.6147490187977691, + "grad_norm": 37.341739654541016, + "learning_rate": 3.0738720872583045e-05, + "loss": 1.7567, + "step": 3720 + }, + { + "epoch": 0.6164015699235695, + "grad_norm": 8.673922538757324, + "learning_rate": 3.0821351842670635e-05, + "loss": 1.8379, + "step": 3730 + }, + { + "epoch": 0.61805412104937, + "grad_norm": 8.250535011291504, + "learning_rate": 3.0903982812758225e-05, + "loss": 1.7141, + "step": 3740 + }, + { + "epoch": 0.6197066721751704, + "grad_norm": 22.169246673583984, + "learning_rate": 3.0986613782845815e-05, + "loss": 1.6403, + "step": 
3750 + }, + { + "epoch": 0.6213592233009708, + "grad_norm": 8.162503242492676, + "learning_rate": 3.1069244752933405e-05, + "loss": 1.7723, + "step": 3760 + }, + { + "epoch": 0.6230117744267714, + "grad_norm": 20.341426849365234, + "learning_rate": 3.115187572302099e-05, + "loss": 1.7345, + "step": 3770 + }, + { + "epoch": 0.6246643255525718, + "grad_norm": 10.553240776062012, + "learning_rate": 3.123450669310858e-05, + "loss": 1.8238, + "step": 3780 + }, + { + "epoch": 0.6263168766783722, + "grad_norm": 15.004631042480469, + "learning_rate": 3.131713766319617e-05, + "loss": 1.5615, + "step": 3790 + }, + { + "epoch": 0.6279694278041726, + "grad_norm": 9.84469223022461, + "learning_rate": 3.139976863328375e-05, + "loss": 1.724, + "step": 3800 + }, + { + "epoch": 0.6296219789299732, + "grad_norm": 10.025922775268555, + "learning_rate": 3.148239960337134e-05, + "loss": 1.729, + "step": 3810 + }, + { + "epoch": 0.6312745300557736, + "grad_norm": 11.77392292022705, + "learning_rate": 3.156503057345893e-05, + "loss": 1.7107, + "step": 3820 + }, + { + "epoch": 0.632927081181574, + "grad_norm": 10.818305969238281, + "learning_rate": 3.164766154354652e-05, + "loss": 1.5897, + "step": 3830 + }, + { + "epoch": 0.6345796323073745, + "grad_norm": 10.746159553527832, + "learning_rate": 3.173029251363411e-05, + "loss": 1.7638, + "step": 3840 + }, + { + "epoch": 0.636232183433175, + "grad_norm": 13.713776588439941, + "learning_rate": 3.18129234837217e-05, + "loss": 1.7332, + "step": 3850 + }, + { + "epoch": 0.6378847345589754, + "grad_norm": 10.594388008117676, + "learning_rate": 3.1895554453809286e-05, + "loss": 1.8471, + "step": 3860 + }, + { + "epoch": 0.6395372856847759, + "grad_norm": 16.44990348815918, + "learning_rate": 3.197818542389688e-05, + "loss": 1.7023, + "step": 3870 + }, + { + "epoch": 0.6411898368105763, + "grad_norm": 9.56428050994873, + "learning_rate": 3.206081639398447e-05, + "loss": 1.7214, + "step": 3880 + }, + { + "epoch": 0.6428423879363768, + "grad_norm": 13.247419357299805, + "learning_rate": 3.214344736407206e-05, + "loss": 1.6734, + "step": 3890 + }, + { + "epoch": 0.6444949390621773, + "grad_norm": 13.604035377502441, + "learning_rate": 3.222607833415964e-05, + "loss": 1.6755, + "step": 3900 + }, + { + "epoch": 0.6461474901879777, + "grad_norm": 16.204240798950195, + "learning_rate": 3.230870930424723e-05, + "loss": 1.7442, + "step": 3910 + }, + { + "epoch": 0.6478000413137781, + "grad_norm": 22.446441650390625, + "learning_rate": 3.239134027433483e-05, + "loss": 1.7222, + "step": 3920 + }, + { + "epoch": 0.6494525924395786, + "grad_norm": 13.957686424255371, + "learning_rate": 3.247397124442241e-05, + "loss": 1.6194, + "step": 3930 + }, + { + "epoch": 0.6511051435653791, + "grad_norm": 17.994192123413086, + "learning_rate": 3.255660221451e-05, + "loss": 1.7454, + "step": 3940 + }, + { + "epoch": 0.6527576946911795, + "grad_norm": 10.868365287780762, + "learning_rate": 3.263923318459759e-05, + "loss": 1.707, + "step": 3950 + }, + { + "epoch": 0.6544102458169799, + "grad_norm": 8.294291496276855, + "learning_rate": 3.2721864154685175e-05, + "loss": 1.7903, + "step": 3960 + }, + { + "epoch": 0.6560627969427805, + "grad_norm": 10.930803298950195, + "learning_rate": 3.2804495124772765e-05, + "loss": 1.6172, + "step": 3970 + }, + { + "epoch": 0.6577153480685809, + "grad_norm": 74.83100128173828, + "learning_rate": 3.2887126094860355e-05, + "loss": 1.7242, + "step": 3980 + }, + { + "epoch": 0.6593678991943813, + "grad_norm": 10.721229553222656, + "learning_rate": 
3.2969757064947945e-05, + "loss": 1.678, + "step": 3990 + }, + { + "epoch": 0.6610204503201818, + "grad_norm": 22.046354293823242, + "learning_rate": 3.305238803503553e-05, + "loss": 1.5762, + "step": 4000 + }, + { + "epoch": 0.6626730014459823, + "grad_norm": 8.417013168334961, + "learning_rate": 3.313501900512312e-05, + "loss": 1.6341, + "step": 4010 + }, + { + "epoch": 0.6643255525717827, + "grad_norm": 9.018980026245117, + "learning_rate": 3.3217649975210715e-05, + "loss": 1.5761, + "step": 4020 + }, + { + "epoch": 0.6659781036975831, + "grad_norm": 19.58799171447754, + "learning_rate": 3.33002809452983e-05, + "loss": 1.6891, + "step": 4030 + }, + { + "epoch": 0.6676306548233836, + "grad_norm": 13.492208480834961, + "learning_rate": 3.338291191538589e-05, + "loss": 1.6399, + "step": 4040 + }, + { + "epoch": 0.669283205949184, + "grad_norm": 10.820428848266602, + "learning_rate": 3.346554288547348e-05, + "loss": 1.7323, + "step": 4050 + }, + { + "epoch": 0.6709357570749845, + "grad_norm": 24.102577209472656, + "learning_rate": 3.354817385556106e-05, + "loss": 1.7554, + "step": 4060 + }, + { + "epoch": 0.672588308200785, + "grad_norm": 10.357276916503906, + "learning_rate": 3.363080482564865e-05, + "loss": 1.7642, + "step": 4070 + }, + { + "epoch": 0.6742408593265854, + "grad_norm": 16.072227478027344, + "learning_rate": 3.371343579573624e-05, + "loss": 1.6417, + "step": 4080 + }, + { + "epoch": 0.6758934104523858, + "grad_norm": 12.857048034667969, + "learning_rate": 3.379606676582383e-05, + "loss": 1.7107, + "step": 4090 + }, + { + "epoch": 0.6775459615781864, + "grad_norm": 10.534919738769531, + "learning_rate": 3.3878697735911416e-05, + "loss": 1.7853, + "step": 4100 + }, + { + "epoch": 0.6791985127039868, + "grad_norm": 22.40790367126465, + "learning_rate": 3.396132870599901e-05, + "loss": 1.7863, + "step": 4110 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 19.036958694458008, + "learning_rate": 3.4043959676086604e-05, + "loss": 1.5541, + "step": 4120 + }, + { + "epoch": 0.6825036149555876, + "grad_norm": 12.263296127319336, + "learning_rate": 3.412659064617419e-05, + "loss": 1.7194, + "step": 4130 + }, + { + "epoch": 0.6841561660813882, + "grad_norm": 13.276509284973145, + "learning_rate": 3.420922161626178e-05, + "loss": 1.7551, + "step": 4140 + }, + { + "epoch": 0.6858087172071886, + "grad_norm": 21.00751304626465, + "learning_rate": 3.429185258634937e-05, + "loss": 1.7179, + "step": 4150 + }, + { + "epoch": 0.687461268332989, + "grad_norm": 27.047283172607422, + "learning_rate": 3.437448355643695e-05, + "loss": 1.7047, + "step": 4160 + }, + { + "epoch": 0.6891138194587895, + "grad_norm": 12.669699668884277, + "learning_rate": 3.445711452652454e-05, + "loss": 1.6317, + "step": 4170 + }, + { + "epoch": 0.69076637058459, + "grad_norm": 16.105510711669922, + "learning_rate": 3.453974549661213e-05, + "loss": 1.6754, + "step": 4180 + }, + { + "epoch": 0.6924189217103904, + "grad_norm": 9.046321868896484, + "learning_rate": 3.462237646669972e-05, + "loss": 1.5878, + "step": 4190 + }, + { + "epoch": 0.6940714728361909, + "grad_norm": 18.94280433654785, + "learning_rate": 3.4705007436787304e-05, + "loss": 1.6219, + "step": 4200 + }, + { + "epoch": 0.6957240239619913, + "grad_norm": 11.86888313293457, + "learning_rate": 3.47876384068749e-05, + "loss": 1.867, + "step": 4210 + }, + { + "epoch": 0.6973765750877918, + "grad_norm": 11.21365737915039, + "learning_rate": 3.487026937696249e-05, + "loss": 1.7011, + "step": 4220 + }, + { + "epoch": 0.6990291262135923, + "grad_norm": 
36.31560134887695, + "learning_rate": 3.4952900347050075e-05, + "loss": 1.6172, + "step": 4230 + }, + { + "epoch": 0.7006816773393927, + "grad_norm": 7.454902648925781, + "learning_rate": 3.5035531317137665e-05, + "loss": 1.682, + "step": 4240 + }, + { + "epoch": 0.7023342284651931, + "grad_norm": 8.024889945983887, + "learning_rate": 3.5118162287225255e-05, + "loss": 1.6474, + "step": 4250 + }, + { + "epoch": 0.7039867795909936, + "grad_norm": 11.65849494934082, + "learning_rate": 3.520079325731284e-05, + "loss": 1.6873, + "step": 4260 + }, + { + "epoch": 0.7056393307167941, + "grad_norm": 9.83031940460205, + "learning_rate": 3.528342422740043e-05, + "loss": 1.6696, + "step": 4270 + }, + { + "epoch": 0.7072918818425945, + "grad_norm": 27.40468978881836, + "learning_rate": 3.536605519748802e-05, + "loss": 1.6058, + "step": 4280 + }, + { + "epoch": 0.7089444329683949, + "grad_norm": 7.991322994232178, + "learning_rate": 3.544868616757561e-05, + "loss": 1.6647, + "step": 4290 + }, + { + "epoch": 0.7105969840941954, + "grad_norm": 17.9750919342041, + "learning_rate": 3.55313171376632e-05, + "loss": 1.7095, + "step": 4300 + }, + { + "epoch": 0.7122495352199959, + "grad_norm": 33.60334777832031, + "learning_rate": 3.561394810775079e-05, + "loss": 1.6818, + "step": 4310 + }, + { + "epoch": 0.7139020863457963, + "grad_norm": 12.004261016845703, + "learning_rate": 3.569657907783838e-05, + "loss": 1.6787, + "step": 4320 + }, + { + "epoch": 0.7155546374715968, + "grad_norm": 8.619003295898438, + "learning_rate": 3.577921004792596e-05, + "loss": 1.6197, + "step": 4330 + }, + { + "epoch": 0.7172071885973972, + "grad_norm": 11.942529678344727, + "learning_rate": 3.586184101801355e-05, + "loss": 1.6431, + "step": 4340 + }, + { + "epoch": 0.7188597397231977, + "grad_norm": 8.107340812683105, + "learning_rate": 3.594447198810114e-05, + "loss": 1.6827, + "step": 4350 + }, + { + "epoch": 0.7205122908489981, + "grad_norm": 11.563468933105469, + "learning_rate": 3.602710295818873e-05, + "loss": 1.5576, + "step": 4360 + }, + { + "epoch": 0.7221648419747986, + "grad_norm": 17.708297729492188, + "learning_rate": 3.610973392827632e-05, + "loss": 1.6501, + "step": 4370 + }, + { + "epoch": 0.723817393100599, + "grad_norm": 9.903080940246582, + "learning_rate": 3.619236489836391e-05, + "loss": 1.604, + "step": 4380 + }, + { + "epoch": 0.7254699442263995, + "grad_norm": 36.305904388427734, + "learning_rate": 3.62749958684515e-05, + "loss": 1.6622, + "step": 4390 + }, + { + "epoch": 0.7271224953522, + "grad_norm": 17.942659378051758, + "learning_rate": 3.635762683853909e-05, + "loss": 1.5846, + "step": 4400 + }, + { + "epoch": 0.7287750464780004, + "grad_norm": 15.10368537902832, + "learning_rate": 3.644025780862668e-05, + "loss": 1.6528, + "step": 4410 + }, + { + "epoch": 0.7304275976038008, + "grad_norm": 11.274897575378418, + "learning_rate": 3.652288877871427e-05, + "loss": 1.6126, + "step": 4420 + }, + { + "epoch": 0.7320801487296014, + "grad_norm": 9.983580589294434, + "learning_rate": 3.660551974880185e-05, + "loss": 1.7114, + "step": 4430 + }, + { + "epoch": 0.7337326998554018, + "grad_norm": 6.765944480895996, + "learning_rate": 3.668815071888944e-05, + "loss": 1.6243, + "step": 4440 + }, + { + "epoch": 0.7353852509812022, + "grad_norm": 6.97224760055542, + "learning_rate": 3.677078168897703e-05, + "loss": 1.6103, + "step": 4450 + }, + { + "epoch": 0.7370378021070026, + "grad_norm": 18.04844856262207, + "learning_rate": 3.6853412659064615e-05, + "loss": 1.5527, + "step": 4460 + }, + { + "epoch": 
0.7386903532328032, + "grad_norm": 16.644474029541016, + "learning_rate": 3.6936043629152205e-05, + "loss": 1.7021, + "step": 4470 + }, + { + "epoch": 0.7403429043586036, + "grad_norm": 12.994518280029297, + "learning_rate": 3.7018674599239795e-05, + "loss": 1.779, + "step": 4480 + }, + { + "epoch": 0.741995455484404, + "grad_norm": 8.77868938446045, + "learning_rate": 3.7101305569327385e-05, + "loss": 1.6658, + "step": 4490 + }, + { + "epoch": 0.7436480066102045, + "grad_norm": 16.25444793701172, + "learning_rate": 3.7183936539414975e-05, + "loss": 1.7969, + "step": 4500 + }, + { + "epoch": 0.745300557736005, + "grad_norm": 15.957270622253418, + "learning_rate": 3.7266567509502566e-05, + "loss": 1.6981, + "step": 4510 + }, + { + "epoch": 0.7469531088618054, + "grad_norm": 8.807640075683594, + "learning_rate": 3.7349198479590156e-05, + "loss": 1.8519, + "step": 4520 + }, + { + "epoch": 0.7486056599876059, + "grad_norm": 13.07224178314209, + "learning_rate": 3.743182944967774e-05, + "loss": 1.8313, + "step": 4530 + }, + { + "epoch": 0.7502582111134063, + "grad_norm": 14.359870910644531, + "learning_rate": 3.751446041976533e-05, + "loss": 1.7593, + "step": 4540 + }, + { + "epoch": 0.7519107622392067, + "grad_norm": 10.050626754760742, + "learning_rate": 3.759709138985292e-05, + "loss": 1.7225, + "step": 4550 + }, + { + "epoch": 0.7535633133650073, + "grad_norm": 20.963890075683594, + "learning_rate": 3.76797223599405e-05, + "loss": 1.7461, + "step": 4560 + }, + { + "epoch": 0.7552158644908077, + "grad_norm": 14.734147071838379, + "learning_rate": 3.776235333002809e-05, + "loss": 1.7066, + "step": 4570 + }, + { + "epoch": 0.7568684156166081, + "grad_norm": 12.787677764892578, + "learning_rate": 3.784498430011569e-05, + "loss": 1.7834, + "step": 4580 + }, + { + "epoch": 0.7585209667424085, + "grad_norm": 11.134603500366211, + "learning_rate": 3.792761527020327e-05, + "loss": 1.6588, + "step": 4590 + }, + { + "epoch": 0.7601735178682091, + "grad_norm": 7.746461391448975, + "learning_rate": 3.8010246240290863e-05, + "loss": 1.5581, + "step": 4600 + }, + { + "epoch": 0.7618260689940095, + "grad_norm": 8.252470016479492, + "learning_rate": 3.8092877210378454e-05, + "loss": 1.7961, + "step": 4610 + }, + { + "epoch": 0.7634786201198099, + "grad_norm": 18.933685302734375, + "learning_rate": 3.8175508180466044e-05, + "loss": 1.5873, + "step": 4620 + }, + { + "epoch": 0.7651311712456104, + "grad_norm": 8.816574096679688, + "learning_rate": 3.825813915055363e-05, + "loss": 1.6589, + "step": 4630 + }, + { + "epoch": 0.7667837223714109, + "grad_norm": 36.53535842895508, + "learning_rate": 3.834077012064122e-05, + "loss": 1.6683, + "step": 4640 + }, + { + "epoch": 0.7684362734972113, + "grad_norm": 18.802400588989258, + "learning_rate": 3.842340109072881e-05, + "loss": 1.5741, + "step": 4650 + }, + { + "epoch": 0.7700888246230118, + "grad_norm": 15.650837898254395, + "learning_rate": 3.850603206081639e-05, + "loss": 1.7178, + "step": 4660 + }, + { + "epoch": 0.7717413757488122, + "grad_norm": 9.750154495239258, + "learning_rate": 3.858866303090398e-05, + "loss": 1.5336, + "step": 4670 + }, + { + "epoch": 0.7733939268746127, + "grad_norm": 10.917224884033203, + "learning_rate": 3.867129400099158e-05, + "loss": 1.6023, + "step": 4680 + }, + { + "epoch": 0.7750464780004132, + "grad_norm": 15.657227516174316, + "learning_rate": 3.875392497107916e-05, + "loss": 1.7192, + "step": 4690 + }, + { + "epoch": 0.7766990291262136, + "grad_norm": 8.965598106384277, + "learning_rate": 3.883655594116675e-05, + "loss": 
1.7072, + "step": 4700 + }, + { + "epoch": 0.778351580252014, + "grad_norm": 5.406772613525391, + "learning_rate": 3.891918691125434e-05, + "loss": 1.617, + "step": 4710 + }, + { + "epoch": 0.7800041313778145, + "grad_norm": 40.23686599731445, + "learning_rate": 3.900181788134193e-05, + "loss": 1.5586, + "step": 4720 + }, + { + "epoch": 0.781656682503615, + "grad_norm": 6.773040294647217, + "learning_rate": 3.9084448851429515e-05, + "loss": 1.6325, + "step": 4730 + }, + { + "epoch": 0.7833092336294154, + "grad_norm": 15.4874906539917, + "learning_rate": 3.9167079821517105e-05, + "loss": 1.8084, + "step": 4740 + }, + { + "epoch": 0.7849617847552158, + "grad_norm": 9.330727577209473, + "learning_rate": 3.9249710791604695e-05, + "loss": 1.6506, + "step": 4750 + }, + { + "epoch": 0.7866143358810164, + "grad_norm": 8.181884765625, + "learning_rate": 3.933234176169228e-05, + "loss": 1.74, + "step": 4760 + }, + { + "epoch": 0.7882668870068168, + "grad_norm": 18.493993759155273, + "learning_rate": 3.9414972731779876e-05, + "loss": 1.5708, + "step": 4770 + }, + { + "epoch": 0.7899194381326172, + "grad_norm": 8.724126815795898, + "learning_rate": 3.9497603701867466e-05, + "loss": 1.6966, + "step": 4780 + }, + { + "epoch": 0.7915719892584177, + "grad_norm": 11.860855102539062, + "learning_rate": 3.958023467195505e-05, + "loss": 1.6671, + "step": 4790 + }, + { + "epoch": 0.7932245403842182, + "grad_norm": 15.39936351776123, + "learning_rate": 3.966286564204264e-05, + "loss": 1.6334, + "step": 4800 + }, + { + "epoch": 0.7948770915100186, + "grad_norm": 13.233650207519531, + "learning_rate": 3.974549661213023e-05, + "loss": 1.7003, + "step": 4810 + }, + { + "epoch": 0.796529642635819, + "grad_norm": 13.286338806152344, + "learning_rate": 3.982812758221782e-05, + "loss": 1.6586, + "step": 4820 + }, + { + "epoch": 0.7981821937616195, + "grad_norm": 70.44236755371094, + "learning_rate": 3.99107585523054e-05, + "loss": 1.7162, + "step": 4830 + }, + { + "epoch": 0.7998347448874199, + "grad_norm": 38.03678894042969, + "learning_rate": 3.999338952239299e-05, + "loss": 1.6638, + "step": 4840 + }, + { + "epoch": 0.8014872960132204, + "grad_norm": 9.280917167663574, + "learning_rate": 4.0076020492480584e-05, + "loss": 1.5643, + "step": 4850 + }, + { + "epoch": 0.8031398471390209, + "grad_norm": 46.079708099365234, + "learning_rate": 4.015865146256817e-05, + "loss": 1.7017, + "step": 4860 + }, + { + "epoch": 0.8047923982648213, + "grad_norm": 7.869788646697998, + "learning_rate": 4.0241282432655764e-05, + "loss": 1.6566, + "step": 4870 + }, + { + "epoch": 0.8064449493906217, + "grad_norm": 13.424615859985352, + "learning_rate": 4.0323913402743354e-05, + "loss": 1.7328, + "step": 4880 + }, + { + "epoch": 0.8080975005164223, + "grad_norm": 16.283950805664062, + "learning_rate": 4.040654437283094e-05, + "loss": 1.6783, + "step": 4890 + }, + { + "epoch": 0.8097500516422227, + "grad_norm": 53.05949020385742, + "learning_rate": 4.048917534291853e-05, + "loss": 1.6176, + "step": 4900 + }, + { + "epoch": 0.8114026027680231, + "grad_norm": 11.25322151184082, + "learning_rate": 4.057180631300612e-05, + "loss": 1.6248, + "step": 4910 + }, + { + "epoch": 0.8130551538938235, + "grad_norm": 30.585168838500977, + "learning_rate": 4.065443728309371e-05, + "loss": 1.6711, + "step": 4920 + }, + { + "epoch": 0.8147077050196241, + "grad_norm": 8.419675827026367, + "learning_rate": 4.073706825318129e-05, + "loss": 1.6889, + "step": 4930 + }, + { + "epoch": 0.8163602561454245, + "grad_norm": 32.17693328857422, + "learning_rate": 
4.081969922326888e-05, + "loss": 1.7163, + "step": 4940 + }, + { + "epoch": 0.8180128072712249, + "grad_norm": 11.359280586242676, + "learning_rate": 4.090233019335647e-05, + "loss": 1.6454, + "step": 4950 + }, + { + "epoch": 0.8196653583970254, + "grad_norm": 15.660289764404297, + "learning_rate": 4.098496116344406e-05, + "loss": 1.7507, + "step": 4960 + }, + { + "epoch": 0.8213179095228259, + "grad_norm": 10.861181259155273, + "learning_rate": 4.106759213353165e-05, + "loss": 1.648, + "step": 4970 + }, + { + "epoch": 0.8229704606486263, + "grad_norm": 8.45966625213623, + "learning_rate": 4.115022310361924e-05, + "loss": 1.6779, + "step": 4980 + }, + { + "epoch": 0.8246230117744268, + "grad_norm": 8.247032165527344, + "learning_rate": 4.1232854073706825e-05, + "loss": 1.7897, + "step": 4990 + }, + { + "epoch": 0.8262755629002272, + "grad_norm": 10.746397018432617, + "learning_rate": 4.1315485043794416e-05, + "loss": 1.6776, + "step": 5000 + }, + { + "epoch": 0.8279281140260277, + "grad_norm": 11.24117660522461, + "learning_rate": 4.1398116013882006e-05, + "loss": 1.6349, + "step": 5010 + }, + { + "epoch": 0.8295806651518282, + "grad_norm": 13.31236457824707, + "learning_rate": 4.1480746983969596e-05, + "loss": 1.5985, + "step": 5020 + }, + { + "epoch": 0.8312332162776286, + "grad_norm": 32.01264572143555, + "learning_rate": 4.156337795405718e-05, + "loss": 1.7555, + "step": 5030 + }, + { + "epoch": 0.832885767403429, + "grad_norm": 9.02853775024414, + "learning_rate": 4.164600892414477e-05, + "loss": 1.5571, + "step": 5040 + }, + { + "epoch": 0.8345383185292296, + "grad_norm": 20.635509490966797, + "learning_rate": 4.172863989423236e-05, + "loss": 1.7462, + "step": 5050 + }, + { + "epoch": 0.83619086965503, + "grad_norm": 11.947609901428223, + "learning_rate": 4.181127086431995e-05, + "loss": 1.6536, + "step": 5060 + }, + { + "epoch": 0.8378434207808304, + "grad_norm": 7.472456932067871, + "learning_rate": 4.189390183440754e-05, + "loss": 1.8139, + "step": 5070 + }, + { + "epoch": 0.8394959719066308, + "grad_norm": 11.594853401184082, + "learning_rate": 4.197653280449513e-05, + "loss": 1.5398, + "step": 5080 + }, + { + "epoch": 0.8411485230324313, + "grad_norm": 22.656166076660156, + "learning_rate": 4.2059163774582713e-05, + "loss": 1.7391, + "step": 5090 + }, + { + "epoch": 0.8428010741582318, + "grad_norm": 15.495370864868164, + "learning_rate": 4.2141794744670304e-05, + "loss": 1.8129, + "step": 5100 + }, + { + "epoch": 0.8444536252840322, + "grad_norm": 6.661768436431885, + "learning_rate": 4.2224425714757894e-05, + "loss": 1.6834, + "step": 5110 + }, + { + "epoch": 0.8461061764098327, + "grad_norm": 7.477358818054199, + "learning_rate": 4.2307056684845484e-05, + "loss": 1.5815, + "step": 5120 + }, + { + "epoch": 0.8477587275356331, + "grad_norm": 11.148330688476562, + "learning_rate": 4.238968765493307e-05, + "loss": 1.6634, + "step": 5130 + }, + { + "epoch": 0.8494112786614336, + "grad_norm": 9.076679229736328, + "learning_rate": 4.247231862502066e-05, + "loss": 1.6155, + "step": 5140 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 10.360162734985352, + "learning_rate": 4.2554949595108254e-05, + "loss": 1.5972, + "step": 5150 + }, + { + "epoch": 0.8527163809130345, + "grad_norm": 10.570571899414062, + "learning_rate": 4.263758056519584e-05, + "loss": 1.7933, + "step": 5160 + }, + { + "epoch": 0.8543689320388349, + "grad_norm": 7.64302396774292, + "learning_rate": 4.272021153528343e-05, + "loss": 1.7115, + "step": 5170 + }, + { + "epoch": 0.8560214831646354, + "grad_norm": 
11.577961921691895, + "learning_rate": 4.280284250537102e-05, + "loss": 1.7791, + "step": 5180 + }, + { + "epoch": 0.8576740342904359, + "grad_norm": 9.828375816345215, + "learning_rate": 4.28854734754586e-05, + "loss": 1.7543, + "step": 5190 + }, + { + "epoch": 0.8593265854162363, + "grad_norm": 10.680025100708008, + "learning_rate": 4.296810444554619e-05, + "loss": 1.569, + "step": 5200 + }, + { + "epoch": 0.8609791365420367, + "grad_norm": 10.61168384552002, + "learning_rate": 4.305073541563378e-05, + "loss": 1.6397, + "step": 5210 + }, + { + "epoch": 0.8626316876678373, + "grad_norm": 13.09295654296875, + "learning_rate": 4.313336638572137e-05, + "loss": 1.663, + "step": 5220 + }, + { + "epoch": 0.8642842387936377, + "grad_norm": 10.244070053100586, + "learning_rate": 4.3215997355808955e-05, + "loss": 1.8205, + "step": 5230 + }, + { + "epoch": 0.8659367899194381, + "grad_norm": 14.051331520080566, + "learning_rate": 4.329862832589655e-05, + "loss": 1.7455, + "step": 5240 + }, + { + "epoch": 0.8675893410452385, + "grad_norm": 14.486235618591309, + "learning_rate": 4.338125929598414e-05, + "loss": 1.6745, + "step": 5250 + }, + { + "epoch": 0.8692418921710391, + "grad_norm": 10.930896759033203, + "learning_rate": 4.3463890266071726e-05, + "loss": 1.6661, + "step": 5260 + }, + { + "epoch": 0.8708944432968395, + "grad_norm": 23.3206729888916, + "learning_rate": 4.3546521236159316e-05, + "loss": 1.6653, + "step": 5270 + }, + { + "epoch": 0.8725469944226399, + "grad_norm": 22.273244857788086, + "learning_rate": 4.3629152206246906e-05, + "loss": 1.7524, + "step": 5280 + }, + { + "epoch": 0.8741995455484404, + "grad_norm": 11.886351585388184, + "learning_rate": 4.371178317633449e-05, + "loss": 1.7256, + "step": 5290 + }, + { + "epoch": 0.8758520966742409, + "grad_norm": 8.98186206817627, + "learning_rate": 4.379441414642208e-05, + "loss": 1.7292, + "step": 5300 + }, + { + "epoch": 0.8775046478000413, + "grad_norm": 10.950017929077148, + "learning_rate": 4.387704511650967e-05, + "loss": 1.7558, + "step": 5310 + }, + { + "epoch": 0.8791571989258418, + "grad_norm": 10.48299789428711, + "learning_rate": 4.395967608659726e-05, + "loss": 1.7535, + "step": 5320 + }, + { + "epoch": 0.8808097500516422, + "grad_norm": 22.365188598632812, + "learning_rate": 4.4042307056684843e-05, + "loss": 1.7458, + "step": 5330 + }, + { + "epoch": 0.8824623011774427, + "grad_norm": 9.840352058410645, + "learning_rate": 4.412493802677244e-05, + "loss": 1.668, + "step": 5340 + }, + { + "epoch": 0.8841148523032432, + "grad_norm": 7.298482894897461, + "learning_rate": 4.4207568996860024e-05, + "loss": 1.672, + "step": 5350 + }, + { + "epoch": 0.8857674034290436, + "grad_norm": 26.250947952270508, + "learning_rate": 4.4290199966947614e-05, + "loss": 1.7415, + "step": 5360 + }, + { + "epoch": 0.887419954554844, + "grad_norm": 15.844700813293457, + "learning_rate": 4.4372830937035204e-05, + "loss": 1.6139, + "step": 5370 + }, + { + "epoch": 0.8890725056806444, + "grad_norm": 7.872781276702881, + "learning_rate": 4.4455461907122794e-05, + "loss": 1.6652, + "step": 5380 + }, + { + "epoch": 0.890725056806445, + "grad_norm": 11.659095764160156, + "learning_rate": 4.453809287721038e-05, + "loss": 1.6276, + "step": 5390 + }, + { + "epoch": 0.8923776079322454, + "grad_norm": 25.442895889282227, + "learning_rate": 4.462072384729797e-05, + "loss": 1.7076, + "step": 5400 + }, + { + "epoch": 0.8940301590580458, + "grad_norm": 40.754371643066406, + "learning_rate": 4.470335481738556e-05, + "loss": 1.7066, + "step": 5410 + }, + { + 
"epoch": 0.8956827101838463, + "grad_norm": 23.09174156188965, + "learning_rate": 4.478598578747315e-05, + "loss": 1.7866, + "step": 5420 + }, + { + "epoch": 0.8973352613096468, + "grad_norm": 24.503602981567383, + "learning_rate": 4.486861675756074e-05, + "loss": 1.5919, + "step": 5430 + }, + { + "epoch": 0.8989878124354472, + "grad_norm": 23.102312088012695, + "learning_rate": 4.495124772764833e-05, + "loss": 1.7015, + "step": 5440 + }, + { + "epoch": 0.9006403635612477, + "grad_norm": 8.59858226776123, + "learning_rate": 4.503387869773591e-05, + "loss": 1.7018, + "step": 5450 + }, + { + "epoch": 0.9022929146870481, + "grad_norm": 23.99196434020996, + "learning_rate": 4.51165096678235e-05, + "loss": 1.627, + "step": 5460 + }, + { + "epoch": 0.9039454658128486, + "grad_norm": 14.380967140197754, + "learning_rate": 4.519914063791109e-05, + "loss": 1.5606, + "step": 5470 + }, + { + "epoch": 0.905598016938649, + "grad_norm": 18.584856033325195, + "learning_rate": 4.528177160799868e-05, + "loss": 1.8168, + "step": 5480 + }, + { + "epoch": 0.9072505680644495, + "grad_norm": 11.22840690612793, + "learning_rate": 4.5364402578086266e-05, + "loss": 1.612, + "step": 5490 + }, + { + "epoch": 0.9089031191902499, + "grad_norm": 20.63224983215332, + "learning_rate": 4.5447033548173856e-05, + "loss": 1.6359, + "step": 5500 + }, + { + "epoch": 0.9105556703160504, + "grad_norm": 9.386175155639648, + "learning_rate": 4.5529664518261446e-05, + "loss": 1.7047, + "step": 5510 + }, + { + "epoch": 0.9122082214418509, + "grad_norm": 12.939002990722656, + "learning_rate": 4.5612295488349036e-05, + "loss": 1.6124, + "step": 5520 + }, + { + "epoch": 0.9138607725676513, + "grad_norm": 10.644198417663574, + "learning_rate": 4.5694926458436626e-05, + "loss": 1.6824, + "step": 5530 + }, + { + "epoch": 0.9155133236934517, + "grad_norm": 15.673925399780273, + "learning_rate": 4.5777557428524216e-05, + "loss": 1.7603, + "step": 5540 + }, + { + "epoch": 0.9171658748192523, + "grad_norm": 10.065011024475098, + "learning_rate": 4.58601883986118e-05, + "loss": 1.804, + "step": 5550 + }, + { + "epoch": 0.9188184259450527, + "grad_norm": 9.452924728393555, + "learning_rate": 4.594281936869939e-05, + "loss": 1.7851, + "step": 5560 + }, + { + "epoch": 0.9204709770708531, + "grad_norm": 10.779854774475098, + "learning_rate": 4.602545033878698e-05, + "loss": 1.7237, + "step": 5570 + }, + { + "epoch": 0.9221235281966536, + "grad_norm": 10.967293739318848, + "learning_rate": 4.610808130887457e-05, + "loss": 1.6173, + "step": 5580 + }, + { + "epoch": 0.9237760793224541, + "grad_norm": 91.86517333984375, + "learning_rate": 4.6190712278962154e-05, + "loss": 1.6469, + "step": 5590 + }, + { + "epoch": 0.9254286304482545, + "grad_norm": 4.923385143280029, + "learning_rate": 4.6273343249049744e-05, + "loss": 1.5892, + "step": 5600 + }, + { + "epoch": 0.927081181574055, + "grad_norm": 19.811214447021484, + "learning_rate": 4.6355974219137334e-05, + "loss": 1.5998, + "step": 5610 + }, + { + "epoch": 0.9287337326998554, + "grad_norm": 9.930047035217285, + "learning_rate": 4.6438605189224924e-05, + "loss": 1.6312, + "step": 5620 + }, + { + "epoch": 0.9303862838256558, + "grad_norm": 20.98347282409668, + "learning_rate": 4.6521236159312514e-05, + "loss": 1.7513, + "step": 5630 + }, + { + "epoch": 0.9320388349514563, + "grad_norm": 11.083711624145508, + "learning_rate": 4.6603867129400104e-05, + "loss": 1.7647, + "step": 5640 + }, + { + "epoch": 0.9336913860772568, + "grad_norm": 15.540210723876953, + "learning_rate": 4.668649809948769e-05, + 
"loss": 1.606, + "step": 5650 + }, + { + "epoch": 0.9353439372030572, + "grad_norm": 40.91162872314453, + "learning_rate": 4.676912906957528e-05, + "loss": 1.5821, + "step": 5660 + }, + { + "epoch": 0.9369964883288576, + "grad_norm": 10.069815635681152, + "learning_rate": 4.685176003966287e-05, + "loss": 1.5963, + "step": 5670 + }, + { + "epoch": 0.9386490394546582, + "grad_norm": 7.199189186096191, + "learning_rate": 4.693439100975046e-05, + "loss": 1.6151, + "step": 5680 + }, + { + "epoch": 0.9403015905804586, + "grad_norm": 14.112994194030762, + "learning_rate": 4.701702197983804e-05, + "loss": 1.8498, + "step": 5690 + }, + { + "epoch": 0.941954141706259, + "grad_norm": 17.025121688842773, + "learning_rate": 4.709965294992563e-05, + "loss": 1.5699, + "step": 5700 + }, + { + "epoch": 0.9436066928320594, + "grad_norm": 22.465831756591797, + "learning_rate": 4.718228392001322e-05, + "loss": 1.7636, + "step": 5710 + }, + { + "epoch": 0.94525924395786, + "grad_norm": 16.813859939575195, + "learning_rate": 4.726491489010081e-05, + "loss": 1.6191, + "step": 5720 + }, + { + "epoch": 0.9469117950836604, + "grad_norm": 12.327750205993652, + "learning_rate": 4.73475458601884e-05, + "loss": 1.7167, + "step": 5730 + }, + { + "epoch": 0.9485643462094608, + "grad_norm": 12.070796966552734, + "learning_rate": 4.743017683027599e-05, + "loss": 1.7759, + "step": 5740 + }, + { + "epoch": 0.9502168973352613, + "grad_norm": 23.6120548248291, + "learning_rate": 4.7512807800363576e-05, + "loss": 1.781, + "step": 5750 + }, + { + "epoch": 0.9518694484610618, + "grad_norm": 12.13965892791748, + "learning_rate": 4.7595438770451166e-05, + "loss": 1.4901, + "step": 5760 + }, + { + "epoch": 0.9535219995868622, + "grad_norm": 10.353153228759766, + "learning_rate": 4.7678069740538756e-05, + "loss": 1.7373, + "step": 5770 + }, + { + "epoch": 0.9551745507126627, + "grad_norm": 13.95853328704834, + "learning_rate": 4.7760700710626346e-05, + "loss": 1.771, + "step": 5780 + }, + { + "epoch": 0.9568271018384631, + "grad_norm": 9.67155647277832, + "learning_rate": 4.784333168071393e-05, + "loss": 1.6026, + "step": 5790 + }, + { + "epoch": 0.9584796529642636, + "grad_norm": 20.621294021606445, + "learning_rate": 4.792596265080152e-05, + "loss": 1.6695, + "step": 5800 + }, + { + "epoch": 0.9601322040900641, + "grad_norm": 24.754667282104492, + "learning_rate": 4.800859362088912e-05, + "loss": 1.8208, + "step": 5810 + }, + { + "epoch": 0.9617847552158645, + "grad_norm": 8.666321754455566, + "learning_rate": 4.80912245909767e-05, + "loss": 1.7137, + "step": 5820 + }, + { + "epoch": 0.9634373063416649, + "grad_norm": 43.4861946105957, + "learning_rate": 4.817385556106429e-05, + "loss": 1.6497, + "step": 5830 + }, + { + "epoch": 0.9650898574674655, + "grad_norm": 30.73326873779297, + "learning_rate": 4.825648653115188e-05, + "loss": 1.7121, + "step": 5840 + }, + { + "epoch": 0.9667424085932659, + "grad_norm": 33.42872619628906, + "learning_rate": 4.8339117501239464e-05, + "loss": 1.8122, + "step": 5850 + }, + { + "epoch": 0.9683949597190663, + "grad_norm": 27.492687225341797, + "learning_rate": 4.8421748471327054e-05, + "loss": 1.6811, + "step": 5860 + }, + { + "epoch": 0.9700475108448667, + "grad_norm": 6.809346675872803, + "learning_rate": 4.8504379441414644e-05, + "loss": 1.6784, + "step": 5870 + }, + { + "epoch": 0.9717000619706673, + "grad_norm": 6.992137432098389, + "learning_rate": 4.8587010411502234e-05, + "loss": 1.5981, + "step": 5880 + }, + { + "epoch": 0.9733526130964677, + "grad_norm": 9.034411430358887, + 
"learning_rate": 4.866964138158982e-05, + "loss": 1.7404, + "step": 5890 + }, + { + "epoch": 0.9750051642222681, + "grad_norm": 7.147427082061768, + "learning_rate": 4.8752272351677415e-05, + "loss": 1.719, + "step": 5900 + }, + { + "epoch": 0.9766577153480686, + "grad_norm": 6.848790168762207, + "learning_rate": 4.8834903321765005e-05, + "loss": 1.6843, + "step": 5910 + }, + { + "epoch": 0.978310266473869, + "grad_norm": 13.439620971679688, + "learning_rate": 4.891753429185259e-05, + "loss": 1.7116, + "step": 5920 + }, + { + "epoch": 0.9799628175996695, + "grad_norm": 6.594333648681641, + "learning_rate": 4.900016526194018e-05, + "loss": 1.8479, + "step": 5930 + }, + { + "epoch": 0.98161536872547, + "grad_norm": 7.724569320678711, + "learning_rate": 4.908279623202777e-05, + "loss": 1.6023, + "step": 5940 + }, + { + "epoch": 0.9832679198512704, + "grad_norm": 41.57175064086914, + "learning_rate": 4.916542720211535e-05, + "loss": 1.7183, + "step": 5950 + }, + { + "epoch": 0.9849204709770708, + "grad_norm": 16.64209747314453, + "learning_rate": 4.924805817220294e-05, + "loss": 1.6207, + "step": 5960 + }, + { + "epoch": 0.9865730221028713, + "grad_norm": 13.659209251403809, + "learning_rate": 4.933068914229053e-05, + "loss": 1.6747, + "step": 5970 + }, + { + "epoch": 0.9882255732286718, + "grad_norm": 7.966761589050293, + "learning_rate": 4.941332011237812e-05, + "loss": 1.6441, + "step": 5980 + }, + { + "epoch": 0.9898781243544722, + "grad_norm": 5.628538131713867, + "learning_rate": 4.9495951082465706e-05, + "loss": 1.7499, + "step": 5990 + }, + { + "epoch": 0.9915306754802726, + "grad_norm": 8.069196701049805, + "learning_rate": 4.95785820525533e-05, + "loss": 1.88, + "step": 6000 + }, + { + "epoch": 0.9931832266060732, + "grad_norm": 51.40681457519531, + "learning_rate": 4.966121302264089e-05, + "loss": 1.6778, + "step": 6010 + }, + { + "epoch": 0.9948357777318736, + "grad_norm": 6.667893886566162, + "learning_rate": 4.9743843992728476e-05, + "loss": 1.6331, + "step": 6020 + }, + { + "epoch": 0.996488328857674, + "grad_norm": 8.195232391357422, + "learning_rate": 4.9826474962816066e-05, + "loss": 1.6024, + "step": 6030 + }, + { + "epoch": 0.9981408799834744, + "grad_norm": 14.904173851013184, + "learning_rate": 4.9909105932903657e-05, + "loss": 1.7938, + "step": 6040 + }, + { + "epoch": 0.999793431109275, + "grad_norm": 9.155719757080078, + "learning_rate": 4.999173690299124e-05, + "loss": 1.5715, + "step": 6050 + }, + { + "epoch": 0.999958686221855, + "eval_accuracy": 0.2757837991204426, + "eval_loss": 2.0155398845672607, + "eval_runtime": 854.7614, + "eval_samples_per_second": 32.987, + "eval_steps_per_second": 8.247, + "step": 6051 + }, + { + "epoch": 1.0014459822350754, + "grad_norm": 9.607635498046875, + "learning_rate": 4.999173690299124e-05, + "loss": 1.6654, + "step": 6060 + }, + { + "epoch": 1.003098533360876, + "grad_norm": 5.138330459594727, + "learning_rate": 4.998255568409262e-05, + "loss": 1.5768, + "step": 6070 + }, + { + "epoch": 1.0047510844866763, + "grad_norm": 28.794178009033203, + "learning_rate": 4.9973374465194e-05, + "loss": 1.6652, + "step": 6080 + }, + { + "epoch": 1.0064036356124768, + "grad_norm": 14.260908126831055, + "learning_rate": 4.996419324629538e-05, + "loss": 1.5967, + "step": 6090 + }, + { + "epoch": 1.0080561867382771, + "grad_norm": 11.221604347229004, + "learning_rate": 4.995501202739676e-05, + "loss": 1.6488, + "step": 6100 + }, + { + "epoch": 1.0097087378640777, + "grad_norm": 8.829093933105469, + "learning_rate": 4.994583080849814e-05, + "loss": 
1.6887, + "step": 6110 + }, + { + "epoch": 1.0113612889898782, + "grad_norm": 10.191542625427246, + "learning_rate": 4.9936649589599515e-05, + "loss": 1.7274, + "step": 6120 + }, + { + "epoch": 1.0130138401156785, + "grad_norm": 9.99532413482666, + "learning_rate": 4.99274683707009e-05, + "loss": 1.6879, + "step": 6130 + }, + { + "epoch": 1.014666391241479, + "grad_norm": 26.136659622192383, + "learning_rate": 4.9918287151802274e-05, + "loss": 1.8429, + "step": 6140 + }, + { + "epoch": 1.0163189423672794, + "grad_norm": 7.408060073852539, + "learning_rate": 4.9909105932903657e-05, + "loss": 1.6909, + "step": 6150 + }, + { + "epoch": 1.01797149349308, + "grad_norm": 15.327078819274902, + "learning_rate": 4.989992471400504e-05, + "loss": 1.7461, + "step": 6160 + }, + { + "epoch": 1.0196240446188805, + "grad_norm": 22.485273361206055, + "learning_rate": 4.9890743495106415e-05, + "loss": 1.767, + "step": 6170 + }, + { + "epoch": 1.0212765957446808, + "grad_norm": 21.41472816467285, + "learning_rate": 4.988156227620779e-05, + "loss": 1.6902, + "step": 6180 + }, + { + "epoch": 1.0229291468704813, + "grad_norm": 8.6235933303833, + "learning_rate": 4.987238105730917e-05, + "loss": 1.6762, + "step": 6190 + }, + { + "epoch": 1.0245816979962818, + "grad_norm": 13.099306106567383, + "learning_rate": 4.986319983841055e-05, + "loss": 1.7881, + "step": 6200 + }, + { + "epoch": 1.0262342491220822, + "grad_norm": 6.052161693572998, + "learning_rate": 4.9854018619511925e-05, + "loss": 1.5323, + "step": 6210 + }, + { + "epoch": 1.0278868002478827, + "grad_norm": 9.501975059509277, + "learning_rate": 4.984483740061331e-05, + "loss": 1.6617, + "step": 6220 + }, + { + "epoch": 1.029539351373683, + "grad_norm": 68.59492492675781, + "learning_rate": 4.9835656181714684e-05, + "loss": 1.655, + "step": 6230 + }, + { + "epoch": 1.0311919024994836, + "grad_norm": 33.55989456176758, + "learning_rate": 4.9826474962816066e-05, + "loss": 1.7361, + "step": 6240 + }, + { + "epoch": 1.032844453625284, + "grad_norm": 33.48898696899414, + "learning_rate": 4.981729374391744e-05, + "loss": 1.667, + "step": 6250 + }, + { + "epoch": 1.0344970047510844, + "grad_norm": 21.978069305419922, + "learning_rate": 4.9808112525018825e-05, + "loss": 1.6726, + "step": 6260 + }, + { + "epoch": 1.036149555876885, + "grad_norm": 27.184450149536133, + "learning_rate": 4.979893130612021e-05, + "loss": 1.6208, + "step": 6270 + }, + { + "epoch": 1.0378021070026855, + "grad_norm": 12.480914115905762, + "learning_rate": 4.9789750087221583e-05, + "loss": 1.707, + "step": 6280 + }, + { + "epoch": 1.0394546581284858, + "grad_norm": 9.19139575958252, + "learning_rate": 4.9780568868322966e-05, + "loss": 1.6405, + "step": 6290 + }, + { + "epoch": 1.0411072092542863, + "grad_norm": 92.05270385742188, + "learning_rate": 4.977138764942434e-05, + "loss": 1.5893, + "step": 6300 + }, + { + "epoch": 1.0427597603800867, + "grad_norm": 17.508121490478516, + "learning_rate": 4.976220643052572e-05, + "loss": 1.6037, + "step": 6310 + }, + { + "epoch": 1.0444123115058872, + "grad_norm": 46.28389358520508, + "learning_rate": 4.9753025211627094e-05, + "loss": 1.5507, + "step": 6320 + }, + { + "epoch": 1.0460648626316877, + "grad_norm": 26.94911003112793, + "learning_rate": 4.9743843992728476e-05, + "loss": 1.8003, + "step": 6330 + }, + { + "epoch": 1.047717413757488, + "grad_norm": 7.172421932220459, + "learning_rate": 4.973466277382985e-05, + "loss": 1.7725, + "step": 6340 + }, + { + "epoch": 1.0493699648832886, + "grad_norm": 13.676774978637695, + "learning_rate": 
4.9725481554931235e-05, + "loss": 1.5975, + "step": 6350 + }, + { + "epoch": 1.0510225160090891, + "grad_norm": 9.980374336242676, + "learning_rate": 4.971630033603261e-05, + "loss": 1.5381, + "step": 6360 + }, + { + "epoch": 1.0526750671348895, + "grad_norm": 11.59304428100586, + "learning_rate": 4.970711911713399e-05, + "loss": 1.7605, + "step": 6370 + }, + { + "epoch": 1.05432761826069, + "grad_norm": 5.923514366149902, + "learning_rate": 4.969793789823537e-05, + "loss": 1.6649, + "step": 6380 + }, + { + "epoch": 1.0559801693864903, + "grad_norm": 10.02888298034668, + "learning_rate": 4.968875667933675e-05, + "loss": 1.7624, + "step": 6390 + }, + { + "epoch": 1.0576327205122908, + "grad_norm": 24.110645294189453, + "learning_rate": 4.9679575460438134e-05, + "loss": 1.7263, + "step": 6400 + }, + { + "epoch": 1.0592852716380914, + "grad_norm": 29.78762435913086, + "learning_rate": 4.967039424153951e-05, + "loss": 1.5858, + "step": 6410 + }, + { + "epoch": 1.0609378227638917, + "grad_norm": 10.339710235595703, + "learning_rate": 4.966121302264089e-05, + "loss": 1.5688, + "step": 6420 + }, + { + "epoch": 1.0625903738896922, + "grad_norm": 12.489291191101074, + "learning_rate": 4.965203180374227e-05, + "loss": 1.7094, + "step": 6430 + }, + { + "epoch": 1.0642429250154928, + "grad_norm": 7.957202434539795, + "learning_rate": 4.9642850584843645e-05, + "loss": 1.6154, + "step": 6440 + }, + { + "epoch": 1.065895476141293, + "grad_norm": 8.301098823547363, + "learning_rate": 4.963366936594502e-05, + "loss": 1.7572, + "step": 6450 + }, + { + "epoch": 1.0675480272670936, + "grad_norm": 16.680442810058594, + "learning_rate": 4.96244881470464e-05, + "loss": 1.7111, + "step": 6460 + }, + { + "epoch": 1.069200578392894, + "grad_norm": 8.970675468444824, + "learning_rate": 4.961530692814778e-05, + "loss": 1.7194, + "step": 6470 + }, + { + "epoch": 1.0708531295186945, + "grad_norm": 16.222583770751953, + "learning_rate": 4.960612570924916e-05, + "loss": 1.6507, + "step": 6480 + }, + { + "epoch": 1.072505680644495, + "grad_norm": 7.895381450653076, + "learning_rate": 4.959694449035054e-05, + "loss": 1.6932, + "step": 6490 + }, + { + "epoch": 1.0741582317702953, + "grad_norm": 19.77764129638672, + "learning_rate": 4.958776327145192e-05, + "loss": 1.8433, + "step": 6500 + }, + { + "epoch": 1.0758107828960959, + "grad_norm": 14.908355712890625, + "learning_rate": 4.95785820525533e-05, + "loss": 1.5765, + "step": 6510 + }, + { + "epoch": 1.0774633340218962, + "grad_norm": 10.35196304321289, + "learning_rate": 4.956940083365468e-05, + "loss": 1.8107, + "step": 6520 + }, + { + "epoch": 1.0791158851476967, + "grad_norm": 9.052248001098633, + "learning_rate": 4.956021961475606e-05, + "loss": 1.7856, + "step": 6530 + }, + { + "epoch": 1.0807684362734973, + "grad_norm": 7.395838737487793, + "learning_rate": 4.955103839585744e-05, + "loss": 1.691, + "step": 6540 + }, + { + "epoch": 1.0824209873992976, + "grad_norm": 36.31234359741211, + "learning_rate": 4.954185717695882e-05, + "loss": 1.6575, + "step": 6550 + }, + { + "epoch": 1.0840735385250981, + "grad_norm": 19.741281509399414, + "learning_rate": 4.9532675958060196e-05, + "loss": 1.7273, + "step": 6560 + }, + { + "epoch": 1.0857260896508987, + "grad_norm": 44.102500915527344, + "learning_rate": 4.952349473916157e-05, + "loss": 1.7415, + "step": 6570 + }, + { + "epoch": 1.087378640776699, + "grad_norm": 5.051490783691406, + "learning_rate": 4.951431352026295e-05, + "loss": 1.6828, + "step": 6580 + }, + { + "epoch": 1.0890311919024995, + "grad_norm": 
6.60805606842041, + "learning_rate": 4.950513230136433e-05, + "loss": 1.6364, + "step": 6590 + }, + { + "epoch": 1.0906837430282998, + "grad_norm": 18.499725341796875, + "learning_rate": 4.9495951082465706e-05, + "loss": 1.6844, + "step": 6600 + }, + { + "epoch": 1.0923362941541004, + "grad_norm": 18.149259567260742, + "learning_rate": 4.948676986356709e-05, + "loss": 1.5962, + "step": 6610 + }, + { + "epoch": 1.093988845279901, + "grad_norm": 17.02660369873047, + "learning_rate": 4.947758864466847e-05, + "loss": 1.6441, + "step": 6620 + }, + { + "epoch": 1.0956413964057012, + "grad_norm": 13.559640884399414, + "learning_rate": 4.946840742576985e-05, + "loss": 1.5714, + "step": 6630 + }, + { + "epoch": 1.0972939475315018, + "grad_norm": 12.437646865844727, + "learning_rate": 4.945922620687123e-05, + "loss": 1.7886, + "step": 6640 + }, + { + "epoch": 1.0989464986573023, + "grad_norm": 10.138182640075684, + "learning_rate": 4.9450044987972605e-05, + "loss": 1.6831, + "step": 6650 + }, + { + "epoch": 1.1005990497831026, + "grad_norm": 18.947307586669922, + "learning_rate": 4.944086376907399e-05, + "loss": 1.634, + "step": 6660 + }, + { + "epoch": 1.1022516009089032, + "grad_norm": 7.246707439422607, + "learning_rate": 4.9431682550175364e-05, + "loss": 1.6895, + "step": 6670 + }, + { + "epoch": 1.1039041520347035, + "grad_norm": 8.21375846862793, + "learning_rate": 4.9422501331276747e-05, + "loss": 1.6634, + "step": 6680 + }, + { + "epoch": 1.105556703160504, + "grad_norm": 14.554459571838379, + "learning_rate": 4.941332011237812e-05, + "loss": 1.7784, + "step": 6690 + }, + { + "epoch": 1.1072092542863046, + "grad_norm": 16.16337776184082, + "learning_rate": 4.94041388934795e-05, + "loss": 1.6209, + "step": 6700 + }, + { + "epoch": 1.1088618054121049, + "grad_norm": 25.227270126342773, + "learning_rate": 4.9394957674580874e-05, + "loss": 1.5893, + "step": 6710 + }, + { + "epoch": 1.1105143565379054, + "grad_norm": 11.184518814086914, + "learning_rate": 4.938577645568226e-05, + "loss": 1.7018, + "step": 6720 + }, + { + "epoch": 1.1121669076637057, + "grad_norm": 7.978889465332031, + "learning_rate": 4.937659523678364e-05, + "loss": 1.7783, + "step": 6730 + }, + { + "epoch": 1.1138194587895063, + "grad_norm": 11.834269523620605, + "learning_rate": 4.9367414017885015e-05, + "loss": 1.6588, + "step": 6740 + }, + { + "epoch": 1.1154720099153068, + "grad_norm": 7.372729778289795, + "learning_rate": 4.93582327989864e-05, + "loss": 1.7051, + "step": 6750 + }, + { + "epoch": 1.1171245610411071, + "grad_norm": 15.451234817504883, + "learning_rate": 4.9349051580087774e-05, + "loss": 1.5759, + "step": 6760 + }, + { + "epoch": 1.1187771121669077, + "grad_norm": 7.165411949157715, + "learning_rate": 4.9339870361189156e-05, + "loss": 1.4484, + "step": 6770 + }, + { + "epoch": 1.1204296632927082, + "grad_norm": 9.450922012329102, + "learning_rate": 4.933068914229053e-05, + "loss": 1.6673, + "step": 6780 + }, + { + "epoch": 1.1220822144185085, + "grad_norm": 7.094033718109131, + "learning_rate": 4.9321507923391915e-05, + "loss": 1.7646, + "step": 6790 + }, + { + "epoch": 1.123734765544309, + "grad_norm": 19.683202743530273, + "learning_rate": 4.931232670449329e-05, + "loss": 1.576, + "step": 6800 + }, + { + "epoch": 1.1253873166701096, + "grad_norm": 7.054886341094971, + "learning_rate": 4.9303145485594673e-05, + "loss": 1.6599, + "step": 6810 + }, + { + "epoch": 1.12703986779591, + "grad_norm": 11.178540229797363, + "learning_rate": 4.929396426669605e-05, + "loss": 1.7556, + "step": 6820 + }, + { + "epoch": 
1.1286924189217105, + "grad_norm": 7.649514198303223, + "learning_rate": 4.9284783047797425e-05, + "loss": 1.6763, + "step": 6830 + }, + { + "epoch": 1.1303449700475108, + "grad_norm": 27.767852783203125, + "learning_rate": 4.927560182889881e-05, + "loss": 1.6954, + "step": 6840 + }, + { + "epoch": 1.1319975211733113, + "grad_norm": 46.87465286254883, + "learning_rate": 4.9266420610000184e-05, + "loss": 1.6347, + "step": 6850 + }, + { + "epoch": 1.1336500722991119, + "grad_norm": 11.488672256469727, + "learning_rate": 4.9257239391101566e-05, + "loss": 1.6059, + "step": 6860 + }, + { + "epoch": 1.1353026234249122, + "grad_norm": 21.95306396484375, + "learning_rate": 4.924805817220294e-05, + "loss": 1.7236, + "step": 6870 + }, + { + "epoch": 1.1369551745507127, + "grad_norm": 7.683815956115723, + "learning_rate": 4.9238876953304325e-05, + "loss": 1.7474, + "step": 6880 + }, + { + "epoch": 1.138607725676513, + "grad_norm": 41.58155822753906, + "learning_rate": 4.92296957344057e-05, + "loss": 1.6803, + "step": 6890 + }, + { + "epoch": 1.1402602768023136, + "grad_norm": 6.919406890869141, + "learning_rate": 4.922051451550708e-05, + "loss": 1.5835, + "step": 6900 + }, + { + "epoch": 1.141912827928114, + "grad_norm": 11.925729751586914, + "learning_rate": 4.921133329660846e-05, + "loss": 1.52, + "step": 6910 + }, + { + "epoch": 1.1435653790539144, + "grad_norm": 7.959477424621582, + "learning_rate": 4.920215207770984e-05, + "loss": 1.5885, + "step": 6920 + }, + { + "epoch": 1.145217930179715, + "grad_norm": 8.630328178405762, + "learning_rate": 4.919297085881122e-05, + "loss": 1.616, + "step": 6930 + }, + { + "epoch": 1.1468704813055153, + "grad_norm": 15.663494110107422, + "learning_rate": 4.91837896399126e-05, + "loss": 1.6099, + "step": 6940 + }, + { + "epoch": 1.1485230324313158, + "grad_norm": 9.951146125793457, + "learning_rate": 4.9174608421013976e-05, + "loss": 1.6716, + "step": 6950 + }, + { + "epoch": 1.1501755835571164, + "grad_norm": 7.788767337799072, + "learning_rate": 4.916542720211535e-05, + "loss": 1.7533, + "step": 6960 + }, + { + "epoch": 1.1518281346829167, + "grad_norm": 18.3308048248291, + "learning_rate": 4.9156245983216735e-05, + "loss": 1.8288, + "step": 6970 + }, + { + "epoch": 1.1534806858087172, + "grad_norm": 26.051170349121094, + "learning_rate": 4.914706476431811e-05, + "loss": 1.6644, + "step": 6980 + }, + { + "epoch": 1.1551332369345177, + "grad_norm": 12.27104663848877, + "learning_rate": 4.913788354541949e-05, + "loss": 1.6927, + "step": 6990 + }, + { + "epoch": 1.156785788060318, + "grad_norm": 18.62868881225586, + "learning_rate": 4.912870232652087e-05, + "loss": 1.6167, + "step": 7000 + }, + { + "epoch": 1.1584383391861186, + "grad_norm": 5.9332122802734375, + "learning_rate": 4.911952110762225e-05, + "loss": 1.7339, + "step": 7010 + }, + { + "epoch": 1.1600908903119191, + "grad_norm": 12.154903411865234, + "learning_rate": 4.911033988872363e-05, + "loss": 1.7497, + "step": 7020 + }, + { + "epoch": 1.1617434414377195, + "grad_norm": 19.24437713623047, + "learning_rate": 4.910115866982501e-05, + "loss": 1.7414, + "step": 7030 + }, + { + "epoch": 1.16339599256352, + "grad_norm": 26.495933532714844, + "learning_rate": 4.9091977450926386e-05, + "loss": 1.6141, + "step": 7040 + }, + { + "epoch": 1.1650485436893203, + "grad_norm": 6.528138160705566, + "learning_rate": 4.908279623202777e-05, + "loss": 1.5709, + "step": 7050 + }, + { + "epoch": 1.1667010948151209, + "grad_norm": 9.650068283081055, + "learning_rate": 4.9073615013129144e-05, + "loss": 1.5795, + "step": 
7060 + }, + { + "epoch": 1.1683536459409214, + "grad_norm": 8.132938385009766, + "learning_rate": 4.906443379423053e-05, + "loss": 1.6091, + "step": 7070 + }, + { + "epoch": 1.1700061970667217, + "grad_norm": 6.303673267364502, + "learning_rate": 4.90552525753319e-05, + "loss": 1.5958, + "step": 7080 + }, + { + "epoch": 1.1716587481925222, + "grad_norm": 8.630463600158691, + "learning_rate": 4.904607135643328e-05, + "loss": 1.5698, + "step": 7090 + }, + { + "epoch": 1.1733112993183226, + "grad_norm": 12.576354026794434, + "learning_rate": 4.903689013753466e-05, + "loss": 1.7056, + "step": 7100 + }, + { + "epoch": 1.174963850444123, + "grad_norm": 56.43805694580078, + "learning_rate": 4.902770891863604e-05, + "loss": 1.7336, + "step": 7110 + }, + { + "epoch": 1.1766164015699236, + "grad_norm": 9.944549560546875, + "learning_rate": 4.901852769973742e-05, + "loss": 1.6921, + "step": 7120 + }, + { + "epoch": 1.178268952695724, + "grad_norm": 7.986307621002197, + "learning_rate": 4.9009346480838796e-05, + "loss": 1.544, + "step": 7130 + }, + { + "epoch": 1.1799215038215245, + "grad_norm": 6.309409141540527, + "learning_rate": 4.900016526194018e-05, + "loss": 1.6083, + "step": 7140 + }, + { + "epoch": 1.1815740549473248, + "grad_norm": 36.086181640625, + "learning_rate": 4.8990984043041554e-05, + "loss": 1.6664, + "step": 7150 + }, + { + "epoch": 1.1832266060731254, + "grad_norm": 8.67955493927002, + "learning_rate": 4.898180282414294e-05, + "loss": 1.574, + "step": 7160 + }, + { + "epoch": 1.184879157198926, + "grad_norm": 7.240811347961426, + "learning_rate": 4.897262160524431e-05, + "loss": 1.6852, + "step": 7170 + }, + { + "epoch": 1.1865317083247262, + "grad_norm": 15.414640426635742, + "learning_rate": 4.8963440386345695e-05, + "loss": 1.5493, + "step": 7180 + }, + { + "epoch": 1.1881842594505267, + "grad_norm": 9.224071502685547, + "learning_rate": 4.895425916744708e-05, + "loss": 1.6202, + "step": 7190 + }, + { + "epoch": 1.1898368105763273, + "grad_norm": 14.074639320373535, + "learning_rate": 4.8945077948548454e-05, + "loss": 1.5506, + "step": 7200 + }, + { + "epoch": 1.1914893617021276, + "grad_norm": 11.120677947998047, + "learning_rate": 4.893589672964983e-05, + "loss": 1.6851, + "step": 7210 + }, + { + "epoch": 1.1931419128279281, + "grad_norm": 10.850142478942871, + "learning_rate": 4.8926715510751206e-05, + "loss": 1.4931, + "step": 7220 + }, + { + "epoch": 1.1947944639537287, + "grad_norm": 18.69784164428711, + "learning_rate": 4.891753429185259e-05, + "loss": 1.7118, + "step": 7230 + }, + { + "epoch": 1.196447015079529, + "grad_norm": 8.451671600341797, + "learning_rate": 4.8908353072953964e-05, + "loss": 1.6661, + "step": 7240 + }, + { + "epoch": 1.1980995662053295, + "grad_norm": 28.36526107788086, + "learning_rate": 4.889917185405535e-05, + "loss": 1.6517, + "step": 7250 + }, + { + "epoch": 1.1997521173311299, + "grad_norm": 8.399593353271484, + "learning_rate": 4.888999063515672e-05, + "loss": 1.5306, + "step": 7260 + }, + { + "epoch": 1.2014046684569304, + "grad_norm": 15.712769508361816, + "learning_rate": 4.8880809416258105e-05, + "loss": 1.6245, + "step": 7270 + }, + { + "epoch": 1.203057219582731, + "grad_norm": 8.373371124267578, + "learning_rate": 4.887162819735948e-05, + "loss": 1.6816, + "step": 7280 + }, + { + "epoch": 1.2047097707085312, + "grad_norm": 11.655881881713867, + "learning_rate": 4.8862446978460864e-05, + "loss": 1.76, + "step": 7290 + }, + { + "epoch": 1.2063623218343318, + "grad_norm": 8.07646369934082, + "learning_rate": 4.8853265759562246e-05, + 
"loss": 1.7914, + "step": 7300 + }, + { + "epoch": 1.208014872960132, + "grad_norm": 24.092042922973633, + "learning_rate": 4.884408454066362e-05, + "loss": 1.6793, + "step": 7310 + }, + { + "epoch": 1.2096674240859326, + "grad_norm": 10.025934219360352, + "learning_rate": 4.8834903321765005e-05, + "loss": 1.7057, + "step": 7320 + }, + { + "epoch": 1.2113199752117332, + "grad_norm": 9.85206413269043, + "learning_rate": 4.882572210286638e-05, + "loss": 1.5728, + "step": 7330 + }, + { + "epoch": 1.2129725263375335, + "grad_norm": 13.544818878173828, + "learning_rate": 4.8816540883967757e-05, + "loss": 1.783, + "step": 7340 + }, + { + "epoch": 1.214625077463334, + "grad_norm": 17.217933654785156, + "learning_rate": 4.880735966506913e-05, + "loss": 1.8045, + "step": 7350 + }, + { + "epoch": 1.2162776285891344, + "grad_norm": 24.209354400634766, + "learning_rate": 4.8798178446170515e-05, + "loss": 1.6106, + "step": 7360 + }, + { + "epoch": 1.217930179714935, + "grad_norm": 24.636571884155273, + "learning_rate": 4.878899722727189e-05, + "loss": 1.6975, + "step": 7370 + }, + { + "epoch": 1.2195827308407354, + "grad_norm": 16.614473342895508, + "learning_rate": 4.8779816008373274e-05, + "loss": 1.7127, + "step": 7380 + }, + { + "epoch": 1.2212352819665357, + "grad_norm": 57.88620376586914, + "learning_rate": 4.877063478947465e-05, + "loss": 1.7516, + "step": 7390 + }, + { + "epoch": 1.2228878330923363, + "grad_norm": 28.026735305786133, + "learning_rate": 4.876145357057603e-05, + "loss": 1.7203, + "step": 7400 + }, + { + "epoch": 1.2245403842181368, + "grad_norm": 8.524139404296875, + "learning_rate": 4.8752272351677415e-05, + "loss": 1.5154, + "step": 7410 + }, + { + "epoch": 1.2261929353439371, + "grad_norm": 11.738126754760742, + "learning_rate": 4.874309113277879e-05, + "loss": 1.6059, + "step": 7420 + }, + { + "epoch": 1.2278454864697377, + "grad_norm": 35.693504333496094, + "learning_rate": 4.873390991388017e-05, + "loss": 1.716, + "step": 7430 + }, + { + "epoch": 1.2294980375955382, + "grad_norm": 16.629175186157227, + "learning_rate": 4.872472869498155e-05, + "loss": 1.7924, + "step": 7440 + }, + { + "epoch": 1.2311505887213385, + "grad_norm": 9.903701782226562, + "learning_rate": 4.871554747608293e-05, + "loss": 1.8271, + "step": 7450 + }, + { + "epoch": 1.232803139847139, + "grad_norm": 10.66246509552002, + "learning_rate": 4.870636625718431e-05, + "loss": 1.7185, + "step": 7460 + }, + { + "epoch": 1.2344556909729394, + "grad_norm": 8.404884338378906, + "learning_rate": 4.8697185038285683e-05, + "loss": 1.7562, + "step": 7470 + }, + { + "epoch": 1.23610824209874, + "grad_norm": 14.62071418762207, + "learning_rate": 4.868800381938706e-05, + "loss": 1.7265, + "step": 7480 + }, + { + "epoch": 1.2377607932245405, + "grad_norm": 21.035802841186523, + "learning_rate": 4.867882260048844e-05, + "loss": 1.6935, + "step": 7490 + }, + { + "epoch": 1.2394133443503408, + "grad_norm": 13.605401039123535, + "learning_rate": 4.866964138158982e-05, + "loss": 1.6896, + "step": 7500 + }, + { + "epoch": 1.2410658954761413, + "grad_norm": 31.078174591064453, + "learning_rate": 4.86604601626912e-05, + "loss": 1.6084, + "step": 7510 + }, + { + "epoch": 1.2427184466019416, + "grad_norm": 11.396821975708008, + "learning_rate": 4.8651278943792576e-05, + "loss": 1.6015, + "step": 7520 + }, + { + "epoch": 1.2443709977277422, + "grad_norm": 10.65982723236084, + "learning_rate": 4.864209772489396e-05, + "loss": 1.6715, + "step": 7530 + }, + { + "epoch": 1.2460235488535427, + "grad_norm": 34.76524353027344, + 
"learning_rate": 4.863291650599534e-05, + "loss": 1.7029, + "step": 7540 + }, + { + "epoch": 1.247676099979343, + "grad_norm": 21.453048706054688, + "learning_rate": 4.862373528709672e-05, + "loss": 1.6779, + "step": 7550 + }, + { + "epoch": 1.2493286511051436, + "grad_norm": 9.208992958068848, + "learning_rate": 4.86145540681981e-05, + "loss": 1.7251, + "step": 7560 + }, + { + "epoch": 1.250981202230944, + "grad_norm": 9.565735816955566, + "learning_rate": 4.8605372849299476e-05, + "loss": 1.6829, + "step": 7570 + }, + { + "epoch": 1.2526337533567444, + "grad_norm": 8.265256881713867, + "learning_rate": 4.859619163040086e-05, + "loss": 1.6577, + "step": 7580 + }, + { + "epoch": 1.254286304482545, + "grad_norm": 14.750138282775879, + "learning_rate": 4.8587010411502234e-05, + "loss": 1.6659, + "step": 7590 + }, + { + "epoch": 1.2559388556083455, + "grad_norm": 30.3494930267334, + "learning_rate": 4.857782919260361e-05, + "loss": 1.5829, + "step": 7600 + }, + { + "epoch": 1.2575914067341458, + "grad_norm": 13.287981033325195, + "learning_rate": 4.8568647973704986e-05, + "loss": 1.6724, + "step": 7610 + }, + { + "epoch": 1.2592439578599464, + "grad_norm": 29.33921241760254, + "learning_rate": 4.855946675480637e-05, + "loss": 1.5848, + "step": 7620 + }, + { + "epoch": 1.2608965089857467, + "grad_norm": 7.407712459564209, + "learning_rate": 4.8550285535907745e-05, + "loss": 1.6882, + "step": 7630 + }, + { + "epoch": 1.2625490601115472, + "grad_norm": 6.948355674743652, + "learning_rate": 4.854110431700913e-05, + "loss": 1.6093, + "step": 7640 + }, + { + "epoch": 1.2642016112373478, + "grad_norm": 13.627409934997559, + "learning_rate": 4.853192309811051e-05, + "loss": 1.6143, + "step": 7650 + }, + { + "epoch": 1.265854162363148, + "grad_norm": 10.397058486938477, + "learning_rate": 4.8522741879211886e-05, + "loss": 1.6205, + "step": 7660 + }, + { + "epoch": 1.2675067134889486, + "grad_norm": 12.780577659606934, + "learning_rate": 4.851356066031327e-05, + "loss": 1.6489, + "step": 7670 + }, + { + "epoch": 1.269159264614749, + "grad_norm": 36.843563079833984, + "learning_rate": 4.8504379441414644e-05, + "loss": 1.5179, + "step": 7680 + }, + { + "epoch": 1.2708118157405495, + "grad_norm": 8.480043411254883, + "learning_rate": 4.849519822251603e-05, + "loss": 1.9982, + "step": 7690 + }, + { + "epoch": 1.27246436686635, + "grad_norm": 8.302135467529297, + "learning_rate": 4.84860170036174e-05, + "loss": 1.7339, + "step": 7700 + }, + { + "epoch": 1.2741169179921503, + "grad_norm": 7.312380790710449, + "learning_rate": 4.8476835784718785e-05, + "loss": 1.7085, + "step": 7710 + }, + { + "epoch": 1.2757694691179509, + "grad_norm": 9.16114330291748, + "learning_rate": 4.846765456582016e-05, + "loss": 1.6013, + "step": 7720 + }, + { + "epoch": 1.2774220202437512, + "grad_norm": 10.258267402648926, + "learning_rate": 4.845847334692154e-05, + "loss": 1.5444, + "step": 7730 + }, + { + "epoch": 1.2790745713695517, + "grad_norm": 41.98063278198242, + "learning_rate": 4.844929212802291e-05, + "loss": 1.7147, + "step": 7740 + }, + { + "epoch": 1.2807271224953523, + "grad_norm": 5.9125237464904785, + "learning_rate": 4.8440110909124296e-05, + "loss": 1.5708, + "step": 7750 + }, + { + "epoch": 1.2823796736211526, + "grad_norm": 10.1311616897583, + "learning_rate": 4.843092969022568e-05, + "loss": 1.6326, + "step": 7760 + }, + { + "epoch": 1.284032224746953, + "grad_norm": 9.376750946044922, + "learning_rate": 4.8421748471327054e-05, + "loss": 1.7641, + "step": 7770 + }, + { + "epoch": 1.2856847758727534, + 
"grad_norm": 8.097874641418457, + "learning_rate": 4.841256725242844e-05, + "loss": 1.5963, + "step": 7780 + }, + { + "epoch": 1.287337326998554, + "grad_norm": 8.448856353759766, + "learning_rate": 4.840338603352981e-05, + "loss": 1.5056, + "step": 7790 + }, + { + "epoch": 1.2889898781243545, + "grad_norm": 10.571368217468262, + "learning_rate": 4.8394204814631195e-05, + "loss": 1.7242, + "step": 7800 + }, + { + "epoch": 1.290642429250155, + "grad_norm": 28.546253204345703, + "learning_rate": 4.838502359573257e-05, + "loss": 1.7191, + "step": 7810 + }, + { + "epoch": 1.2922949803759554, + "grad_norm": 20.1251220703125, + "learning_rate": 4.8375842376833954e-05, + "loss": 1.6916, + "step": 7820 + }, + { + "epoch": 1.293947531501756, + "grad_norm": 10.026655197143555, + "learning_rate": 4.836666115793533e-05, + "loss": 1.6829, + "step": 7830 + }, + { + "epoch": 1.2956000826275562, + "grad_norm": 21.374874114990234, + "learning_rate": 4.835747993903671e-05, + "loss": 1.5028, + "step": 7840 + }, + { + "epoch": 1.2972526337533568, + "grad_norm": 42.24235916137695, + "learning_rate": 4.834829872013809e-05, + "loss": 1.631, + "step": 7850 + }, + { + "epoch": 1.2989051848791573, + "grad_norm": 6.917150974273682, + "learning_rate": 4.8339117501239464e-05, + "loss": 1.7458, + "step": 7860 + }, + { + "epoch": 1.3005577360049576, + "grad_norm": 26.261568069458008, + "learning_rate": 4.8329936282340847e-05, + "loss": 1.7711, + "step": 7870 + }, + { + "epoch": 1.3022102871307581, + "grad_norm": 5.60324239730835, + "learning_rate": 4.832075506344222e-05, + "loss": 1.726, + "step": 7880 + }, + { + "epoch": 1.3038628382565585, + "grad_norm": 12.937496185302734, + "learning_rate": 4.8311573844543605e-05, + "loss": 1.7153, + "step": 7890 + }, + { + "epoch": 1.305515389382359, + "grad_norm": 14.33709716796875, + "learning_rate": 4.830239262564498e-05, + "loss": 1.713, + "step": 7900 + }, + { + "epoch": 1.3071679405081595, + "grad_norm": 8.725417137145996, + "learning_rate": 4.8293211406746364e-05, + "loss": 1.7221, + "step": 7910 + }, + { + "epoch": 1.3088204916339599, + "grad_norm": 13.017943382263184, + "learning_rate": 4.828403018784774e-05, + "loss": 1.5677, + "step": 7920 + }, + { + "epoch": 1.3104730427597604, + "grad_norm": 10.267081260681152, + "learning_rate": 4.827484896894912e-05, + "loss": 1.6575, + "step": 7930 + }, + { + "epoch": 1.3121255938855607, + "grad_norm": 4.439043045043945, + "learning_rate": 4.82656677500505e-05, + "loss": 1.6943, + "step": 7940 + }, + { + "epoch": 1.3137781450113613, + "grad_norm": 9.938862800598145, + "learning_rate": 4.825648653115188e-05, + "loss": 1.6691, + "step": 7950 + }, + { + "epoch": 1.3154306961371618, + "grad_norm": 7.4380669593811035, + "learning_rate": 4.8247305312253256e-05, + "loss": 1.6049, + "step": 7960 + }, + { + "epoch": 1.3170832472629623, + "grad_norm": 9.574049949645996, + "learning_rate": 4.823812409335464e-05, + "loss": 1.5485, + "step": 7970 + }, + { + "epoch": 1.3187357983887626, + "grad_norm": 6.69705057144165, + "learning_rate": 4.8228942874456015e-05, + "loss": 1.7664, + "step": 7980 + }, + { + "epoch": 1.3203883495145632, + "grad_norm": 51.60087203979492, + "learning_rate": 4.821976165555739e-05, + "loss": 1.7427, + "step": 7990 + }, + { + "epoch": 1.3220409006403635, + "grad_norm": 8.675333023071289, + "learning_rate": 4.821058043665877e-05, + "loss": 1.5976, + "step": 8000 + }, + { + "epoch": 1.323693451766164, + "grad_norm": 11.984296798706055, + "learning_rate": 4.820139921776015e-05, + "loss": 1.655, + "step": 8010 + }, + { + 
"epoch": 1.3253460028919646, + "grad_norm": 7.3102617263793945, + "learning_rate": 4.819221799886153e-05, + "loss": 1.6596, + "step": 8020 + }, + { + "epoch": 1.326998554017765, + "grad_norm": 5.259336948394775, + "learning_rate": 4.818303677996291e-05, + "loss": 1.6986, + "step": 8030 + }, + { + "epoch": 1.3286511051435654, + "grad_norm": 8.154091835021973, + "learning_rate": 4.817385556106429e-05, + "loss": 1.7009, + "step": 8040 + }, + { + "epoch": 1.3303036562693658, + "grad_norm": 6.396356582641602, + "learning_rate": 4.8164674342165666e-05, + "loss": 1.5735, + "step": 8050 + }, + { + "epoch": 1.3319562073951663, + "grad_norm": 7.4831647872924805, + "learning_rate": 4.815549312326705e-05, + "loss": 1.684, + "step": 8060 + }, + { + "epoch": 1.3336087585209668, + "grad_norm": 9.999634742736816, + "learning_rate": 4.8146311904368425e-05, + "loss": 1.7047, + "step": 8070 + }, + { + "epoch": 1.3352613096467671, + "grad_norm": 9.42754077911377, + "learning_rate": 4.813713068546981e-05, + "loss": 1.7498, + "step": 8080 + }, + { + "epoch": 1.3369138607725677, + "grad_norm": 14.416214942932129, + "learning_rate": 4.812794946657118e-05, + "loss": 1.6447, + "step": 8090 + }, + { + "epoch": 1.338566411898368, + "grad_norm": 6.572526454925537, + "learning_rate": 4.8118768247672566e-05, + "loss": 1.6883, + "step": 8100 + }, + { + "epoch": 1.3402189630241685, + "grad_norm": 27.258047103881836, + "learning_rate": 4.810958702877394e-05, + "loss": 1.606, + "step": 8110 + }, + { + "epoch": 1.341871514149969, + "grad_norm": 6.700786590576172, + "learning_rate": 4.810040580987532e-05, + "loss": 1.6751, + "step": 8120 + }, + { + "epoch": 1.3435240652757694, + "grad_norm": 8.797626495361328, + "learning_rate": 4.80912245909767e-05, + "loss": 1.6453, + "step": 8130 + }, + { + "epoch": 1.34517661640157, + "grad_norm": 14.59374713897705, + "learning_rate": 4.8082043372078076e-05, + "loss": 1.6201, + "step": 8140 + }, + { + "epoch": 1.3468291675273703, + "grad_norm": 12.410117149353027, + "learning_rate": 4.807286215317946e-05, + "loss": 1.6923, + "step": 8150 + }, + { + "epoch": 1.3484817186531708, + "grad_norm": 5.754236221313477, + "learning_rate": 4.8063680934280835e-05, + "loss": 1.5902, + "step": 8160 + }, + { + "epoch": 1.3501342697789713, + "grad_norm": 21.738666534423828, + "learning_rate": 4.805449971538222e-05, + "loss": 1.7867, + "step": 8170 + }, + { + "epoch": 1.3517868209047719, + "grad_norm": 11.151928901672363, + "learning_rate": 4.804531849648359e-05, + "loss": 1.6477, + "step": 8180 + }, + { + "epoch": 1.3534393720305722, + "grad_norm": 7.115407466888428, + "learning_rate": 4.8036137277584976e-05, + "loss": 1.6847, + "step": 8190 + }, + { + "epoch": 1.3550919231563727, + "grad_norm": 9.613741874694824, + "learning_rate": 4.802695605868635e-05, + "loss": 1.7105, + "step": 8200 + }, + { + "epoch": 1.356744474282173, + "grad_norm": 9.931926727294922, + "learning_rate": 4.8017774839787734e-05, + "loss": 1.6286, + "step": 8210 + }, + { + "epoch": 1.3583970254079736, + "grad_norm": 46.38539505004883, + "learning_rate": 4.800859362088912e-05, + "loss": 1.7295, + "step": 8220 + }, + { + "epoch": 1.3600495765337741, + "grad_norm": 11.234518051147461, + "learning_rate": 4.799941240199049e-05, + "loss": 1.6839, + "step": 8230 + }, + { + "epoch": 1.3617021276595744, + "grad_norm": 29.555355072021484, + "learning_rate": 4.799023118309187e-05, + "loss": 1.6353, + "step": 8240 + }, + { + "epoch": 1.363354678785375, + "grad_norm": 13.247050285339355, + "learning_rate": 4.7981049964193244e-05, + "loss": 
1.7446, + "step": 8250 + }, + { + "epoch": 1.3650072299111753, + "grad_norm": 31.702011108398438, + "learning_rate": 4.797186874529463e-05, + "loss": 1.6522, + "step": 8260 + }, + { + "epoch": 1.3666597810369758, + "grad_norm": 19.7655029296875, + "learning_rate": 4.7962687526396e-05, + "loss": 1.7344, + "step": 8270 + }, + { + "epoch": 1.3683123321627764, + "grad_norm": 31.14563751220703, + "learning_rate": 4.7953506307497386e-05, + "loss": 1.6795, + "step": 8280 + }, + { + "epoch": 1.3699648832885767, + "grad_norm": 10.664438247680664, + "learning_rate": 4.794432508859876e-05, + "loss": 1.7746, + "step": 8290 + }, + { + "epoch": 1.3716174344143772, + "grad_norm": 14.965887069702148, + "learning_rate": 4.7935143869700144e-05, + "loss": 1.6984, + "step": 8300 + }, + { + "epoch": 1.3732699855401775, + "grad_norm": 18.51651954650879, + "learning_rate": 4.792596265080152e-05, + "loss": 1.6421, + "step": 8310 + }, + { + "epoch": 1.374922536665978, + "grad_norm": 8.034571647644043, + "learning_rate": 4.79167814319029e-05, + "loss": 1.6664, + "step": 8320 + }, + { + "epoch": 1.3765750877917786, + "grad_norm": 11.700427055358887, + "learning_rate": 4.7907600213004285e-05, + "loss": 1.6122, + "step": 8330 + }, + { + "epoch": 1.378227638917579, + "grad_norm": 7.673017501831055, + "learning_rate": 4.789841899410566e-05, + "loss": 1.6424, + "step": 8340 + }, + { + "epoch": 1.3798801900433795, + "grad_norm": 4.827954292297363, + "learning_rate": 4.7889237775207044e-05, + "loss": 1.6598, + "step": 8350 + }, + { + "epoch": 1.3815327411691798, + "grad_norm": 11.37001895904541, + "learning_rate": 4.788005655630842e-05, + "loss": 1.6502, + "step": 8360 + }, + { + "epoch": 1.3831852922949803, + "grad_norm": 7.629409313201904, + "learning_rate": 4.7870875337409795e-05, + "loss": 1.7068, + "step": 8370 + }, + { + "epoch": 1.3848378434207809, + "grad_norm": 21.615652084350586, + "learning_rate": 4.786169411851117e-05, + "loss": 1.5153, + "step": 8380 + }, + { + "epoch": 1.3864903945465814, + "grad_norm": 9.277376174926758, + "learning_rate": 4.7852512899612554e-05, + "loss": 1.6594, + "step": 8390 + }, + { + "epoch": 1.3881429456723817, + "grad_norm": 11.108504295349121, + "learning_rate": 4.784333168071393e-05, + "loss": 1.6291, + "step": 8400 + }, + { + "epoch": 1.3897954967981823, + "grad_norm": 9.173980712890625, + "learning_rate": 4.783415046181531e-05, + "loss": 1.6998, + "step": 8410 + }, + { + "epoch": 1.3914480479239826, + "grad_norm": 8.537729263305664, + "learning_rate": 4.782496924291669e-05, + "loss": 1.5601, + "step": 8420 + }, + { + "epoch": 1.3931005990497831, + "grad_norm": 10.740826606750488, + "learning_rate": 4.781578802401807e-05, + "loss": 1.4999, + "step": 8430 + }, + { + "epoch": 1.3947531501755837, + "grad_norm": 9.543370246887207, + "learning_rate": 4.7806606805119453e-05, + "loss": 1.6375, + "step": 8440 + }, + { + "epoch": 1.396405701301384, + "grad_norm": 9.984679222106934, + "learning_rate": 4.779742558622083e-05, + "loss": 1.5013, + "step": 8450 + }, + { + "epoch": 1.3980582524271845, + "grad_norm": 9.201363563537598, + "learning_rate": 4.778824436732221e-05, + "loss": 1.5612, + "step": 8460 + }, + { + "epoch": 1.3997108035529848, + "grad_norm": 17.85106086730957, + "learning_rate": 4.777906314842359e-05, + "loss": 1.7215, + "step": 8470 + }, + { + "epoch": 1.4013633546787854, + "grad_norm": 8.731996536254883, + "learning_rate": 4.776988192952497e-05, + "loss": 1.4592, + "step": 8480 + }, + { + "epoch": 1.403015905804586, + "grad_norm": 12.443288803100586, + "learning_rate": 
4.7760700710626346e-05, + "loss": 1.6517, + "step": 8490 + }, + { + "epoch": 1.4046684569303862, + "grad_norm": 4.665842056274414, + "learning_rate": 4.775151949172772e-05, + "loss": 1.5992, + "step": 8500 + }, + { + "epoch": 1.4063210080561868, + "grad_norm": 8.88791275024414, + "learning_rate": 4.77423382728291e-05, + "loss": 1.7198, + "step": 8510 + }, + { + "epoch": 1.407973559181987, + "grad_norm": 7.840359687805176, + "learning_rate": 4.773315705393048e-05, + "loss": 1.633, + "step": 8520 + }, + { + "epoch": 1.4096261103077876, + "grad_norm": 42.78237533569336, + "learning_rate": 4.7723975835031857e-05, + "loss": 1.7015, + "step": 8530 + }, + { + "epoch": 1.4112786614335882, + "grad_norm": 31.31909942626953, + "learning_rate": 4.771479461613324e-05, + "loss": 1.8256, + "step": 8540 + }, + { + "epoch": 1.4129312125593887, + "grad_norm": 9.952142715454102, + "learning_rate": 4.7705613397234615e-05, + "loss": 1.7117, + "step": 8550 + }, + { + "epoch": 1.414583763685189, + "grad_norm": 10.509651184082031, + "learning_rate": 4.7696432178336e-05, + "loss": 1.6899, + "step": 8560 + }, + { + "epoch": 1.4162363148109893, + "grad_norm": 13.65649700164795, + "learning_rate": 4.768725095943738e-05, + "loss": 1.6753, + "step": 8570 + }, + { + "epoch": 1.4178888659367899, + "grad_norm": 9.891709327697754, + "learning_rate": 4.7678069740538756e-05, + "loss": 1.6893, + "step": 8580 + }, + { + "epoch": 1.4195414170625904, + "grad_norm": 6.675931453704834, + "learning_rate": 4.766888852164014e-05, + "loss": 1.6134, + "step": 8590 + }, + { + "epoch": 1.421193968188391, + "grad_norm": 8.769001007080078, + "learning_rate": 4.7659707302741515e-05, + "loss": 1.6775, + "step": 8600 + }, + { + "epoch": 1.4228465193141913, + "grad_norm": 9.261868476867676, + "learning_rate": 4.76505260838429e-05, + "loss": 1.7286, + "step": 8610 + }, + { + "epoch": 1.4244990704399918, + "grad_norm": 9.514866828918457, + "learning_rate": 4.764134486494427e-05, + "loss": 1.5825, + "step": 8620 + }, + { + "epoch": 1.4261516215657921, + "grad_norm": 11.191073417663574, + "learning_rate": 4.763216364604565e-05, + "loss": 1.699, + "step": 8630 + }, + { + "epoch": 1.4278041726915927, + "grad_norm": 7.584383487701416, + "learning_rate": 4.7622982427147025e-05, + "loss": 1.7905, + "step": 8640 + }, + { + "epoch": 1.4294567238173932, + "grad_norm": 7.346413612365723, + "learning_rate": 4.761380120824841e-05, + "loss": 1.7111, + "step": 8650 + }, + { + "epoch": 1.4311092749431935, + "grad_norm": 12.299200057983398, + "learning_rate": 4.7604619989349783e-05, + "loss": 1.6624, + "step": 8660 + }, + { + "epoch": 1.432761826068994, + "grad_norm": 28.52010154724121, + "learning_rate": 4.7595438770451166e-05, + "loss": 1.7085, + "step": 8670 + }, + { + "epoch": 1.4344143771947944, + "grad_norm": 20.789058685302734, + "learning_rate": 4.758625755155255e-05, + "loss": 1.7056, + "step": 8680 + }, + { + "epoch": 1.436066928320595, + "grad_norm": 13.599698066711426, + "learning_rate": 4.7577076332653925e-05, + "loss": 1.5663, + "step": 8690 + }, + { + "epoch": 1.4377194794463954, + "grad_norm": 15.90054702758789, + "learning_rate": 4.756789511375531e-05, + "loss": 1.7743, + "step": 8700 + }, + { + "epoch": 1.4393720305721958, + "grad_norm": 21.3345890045166, + "learning_rate": 4.755871389485668e-05, + "loss": 1.7051, + "step": 8710 + }, + { + "epoch": 1.4410245816979963, + "grad_norm": 10.17492389678955, + "learning_rate": 4.7549532675958066e-05, + "loss": 1.6872, + "step": 8720 + }, + { + "epoch": 1.4426771328237966, + "grad_norm": 
5.552209377288818, + "learning_rate": 4.754035145705944e-05, + "loss": 1.6925, + "step": 8730 + }, + { + "epoch": 1.4443296839495972, + "grad_norm": 17.34671401977539, + "learning_rate": 4.7531170238160824e-05, + "loss": 1.5588, + "step": 8740 + }, + { + "epoch": 1.4459822350753977, + "grad_norm": 7.345114231109619, + "learning_rate": 4.75219890192622e-05, + "loss": 1.6705, + "step": 8750 + }, + { + "epoch": 1.4476347862011982, + "grad_norm": 7.885573387145996, + "learning_rate": 4.7512807800363576e-05, + "loss": 1.5831, + "step": 8760 + }, + { + "epoch": 1.4492873373269985, + "grad_norm": 26.88873291015625, + "learning_rate": 4.750362658146495e-05, + "loss": 1.6182, + "step": 8770 + }, + { + "epoch": 1.450939888452799, + "grad_norm": 6.879226207733154, + "learning_rate": 4.7494445362566334e-05, + "loss": 1.8231, + "step": 8780 + }, + { + "epoch": 1.4525924395785994, + "grad_norm": 15.858061790466309, + "learning_rate": 4.748526414366772e-05, + "loss": 1.5887, + "step": 8790 + }, + { + "epoch": 1.4542449907044, + "grad_norm": 6.855731010437012, + "learning_rate": 4.747608292476909e-05, + "loss": 1.7003, + "step": 8800 + }, + { + "epoch": 1.4558975418302005, + "grad_norm": 32.11435317993164, + "learning_rate": 4.7466901705870475e-05, + "loss": 1.682, + "step": 8810 + }, + { + "epoch": 1.4575500929560008, + "grad_norm": 10.053679466247559, + "learning_rate": 4.745772048697185e-05, + "loss": 1.6652, + "step": 8820 + }, + { + "epoch": 1.4592026440818013, + "grad_norm": 9.131372451782227, + "learning_rate": 4.7448539268073234e-05, + "loss": 1.592, + "step": 8830 + }, + { + "epoch": 1.4608551952076017, + "grad_norm": 7.688348293304443, + "learning_rate": 4.743935804917461e-05, + "loss": 1.6068, + "step": 8840 + }, + { + "epoch": 1.4625077463334022, + "grad_norm": 6.706598281860352, + "learning_rate": 4.743017683027599e-05, + "loss": 1.5156, + "step": 8850 + }, + { + "epoch": 1.4641602974592027, + "grad_norm": 10.917819023132324, + "learning_rate": 4.742099561137737e-05, + "loss": 1.5903, + "step": 8860 + }, + { + "epoch": 1.465812848585003, + "grad_norm": 21.588254928588867, + "learning_rate": 4.741181439247875e-05, + "loss": 1.6745, + "step": 8870 + }, + { + "epoch": 1.4674653997108036, + "grad_norm": 20.283750534057617, + "learning_rate": 4.740263317358013e-05, + "loss": 1.5973, + "step": 8880 + }, + { + "epoch": 1.469117950836604, + "grad_norm": 21.922521591186523, + "learning_rate": 4.73934519546815e-05, + "loss": 1.5737, + "step": 8890 + }, + { + "epoch": 1.4707705019624044, + "grad_norm": 8.129682540893555, + "learning_rate": 4.7384270735782885e-05, + "loss": 1.7585, + "step": 8900 + }, + { + "epoch": 1.472423053088205, + "grad_norm": 24.920692443847656, + "learning_rate": 4.737508951688426e-05, + "loss": 1.6209, + "step": 8910 + }, + { + "epoch": 1.4740756042140053, + "grad_norm": 10.06619930267334, + "learning_rate": 4.7365908297985644e-05, + "loss": 1.686, + "step": 8920 + }, + { + "epoch": 1.4757281553398058, + "grad_norm": 13.576436042785645, + "learning_rate": 4.735672707908702e-05, + "loss": 1.6421, + "step": 8930 + }, + { + "epoch": 1.4773807064656062, + "grad_norm": 13.374571800231934, + "learning_rate": 4.73475458601884e-05, + "loss": 1.7228, + "step": 8940 + }, + { + "epoch": 1.4790332575914067, + "grad_norm": 7.154562473297119, + "learning_rate": 4.733836464128978e-05, + "loss": 1.6549, + "step": 8950 + }, + { + "epoch": 1.4806858087172072, + "grad_norm": 15.508869171142578, + "learning_rate": 4.732918342239116e-05, + "loss": 1.675, + "step": 8960 + }, + { + "epoch": 
1.4823383598430078, + "grad_norm": 10.355504989624023, + "learning_rate": 4.732000220349254e-05, + "loss": 1.6165, + "step": 8970 + }, + { + "epoch": 1.483990910968808, + "grad_norm": 9.313375473022461, + "learning_rate": 4.731082098459392e-05, + "loss": 1.6555, + "step": 8980 + }, + { + "epoch": 1.4856434620946086, + "grad_norm": 11.76880168914795, + "learning_rate": 4.7301639765695295e-05, + "loss": 1.7011, + "step": 8990 + }, + { + "epoch": 1.487296013220409, + "grad_norm": 9.017674446105957, + "learning_rate": 4.729245854679668e-05, + "loss": 1.696, + "step": 9000 + }, + { + "epoch": 1.4889485643462095, + "grad_norm": 30.168582916259766, + "learning_rate": 4.7283277327898054e-05, + "loss": 1.6945, + "step": 9010 + }, + { + "epoch": 1.49060111547201, + "grad_norm": 28.239177703857422, + "learning_rate": 4.727409610899943e-05, + "loss": 1.5782, + "step": 9020 + }, + { + "epoch": 1.4922536665978103, + "grad_norm": 7.943751335144043, + "learning_rate": 4.726491489010081e-05, + "loss": 1.6587, + "step": 9030 + }, + { + "epoch": 1.4939062177236109, + "grad_norm": 19.905715942382812, + "learning_rate": 4.725573367120219e-05, + "loss": 1.5575, + "step": 9040 + }, + { + "epoch": 1.4955587688494112, + "grad_norm": 10.705276489257812, + "learning_rate": 4.724655245230357e-05, + "loss": 1.59, + "step": 9050 + }, + { + "epoch": 1.4972113199752117, + "grad_norm": 5.611824035644531, + "learning_rate": 4.7237371233404947e-05, + "loss": 1.63, + "step": 9060 + }, + { + "epoch": 1.4988638711010123, + "grad_norm": 14.214753150939941, + "learning_rate": 4.722819001450633e-05, + "loss": 1.6358, + "step": 9070 + }, + { + "epoch": 1.5005164222268128, + "grad_norm": 12.516484260559082, + "learning_rate": 4.7219008795607705e-05, + "loss": 1.7702, + "step": 9080 + }, + { + "epoch": 1.5021689733526131, + "grad_norm": 7.916662693023682, + "learning_rate": 4.720982757670909e-05, + "loss": 1.677, + "step": 9090 + }, + { + "epoch": 1.5038215244784134, + "grad_norm": 8.450326919555664, + "learning_rate": 4.7200646357810464e-05, + "loss": 1.6053, + "step": 9100 + }, + { + "epoch": 1.505474075604214, + "grad_norm": 15.917101860046387, + "learning_rate": 4.7191465138911846e-05, + "loss": 1.6568, + "step": 9110 + }, + { + "epoch": 1.5071266267300145, + "grad_norm": 10.047342300415039, + "learning_rate": 4.718228392001322e-05, + "loss": 1.7409, + "step": 9120 + }, + { + "epoch": 1.508779177855815, + "grad_norm": 8.033047676086426, + "learning_rate": 4.7173102701114605e-05, + "loss": 1.6499, + "step": 9130 + }, + { + "epoch": 1.5104317289816154, + "grad_norm": 8.630041122436523, + "learning_rate": 4.716392148221598e-05, + "loss": 1.5798, + "step": 9140 + }, + { + "epoch": 1.5120842801074157, + "grad_norm": 18.576356887817383, + "learning_rate": 4.7154740263317356e-05, + "loss": 1.7651, + "step": 9150 + }, + { + "epoch": 1.5137368312332162, + "grad_norm": 11.440652847290039, + "learning_rate": 4.714555904441874e-05, + "loss": 1.6616, + "step": 9160 + }, + { + "epoch": 1.5153893823590168, + "grad_norm": 11.976500511169434, + "learning_rate": 4.7136377825520115e-05, + "loss": 1.673, + "step": 9170 + }, + { + "epoch": 1.5170419334848173, + "grad_norm": 7.650939464569092, + "learning_rate": 4.71271966066215e-05, + "loss": 1.5774, + "step": 9180 + }, + { + "epoch": 1.5186944846106176, + "grad_norm": 8.787154197692871, + "learning_rate": 4.711801538772287e-05, + "loss": 1.581, + "step": 9190 + }, + { + "epoch": 1.520347035736418, + "grad_norm": 11.962265014648438, + "learning_rate": 4.7108834168824256e-05, + "loss": 1.5368, + 
"step": 9200 + }, + { + "epoch": 1.5219995868622185, + "grad_norm": 8.907034873962402, + "learning_rate": 4.709965294992563e-05, + "loss": 1.5875, + "step": 9210 + }, + { + "epoch": 1.523652137988019, + "grad_norm": 13.364984512329102, + "learning_rate": 4.7090471731027014e-05, + "loss": 1.6646, + "step": 9220 + }, + { + "epoch": 1.5253046891138196, + "grad_norm": 7.773966312408447, + "learning_rate": 4.708129051212839e-05, + "loss": 1.5193, + "step": 9230 + }, + { + "epoch": 1.5269572402396199, + "grad_norm": 6.378410339355469, + "learning_rate": 4.707210929322977e-05, + "loss": 1.7244, + "step": 9240 + }, + { + "epoch": 1.5286097913654204, + "grad_norm": 15.127907752990723, + "learning_rate": 4.7062928074331156e-05, + "loss": 1.6726, + "step": 9250 + }, + { + "epoch": 1.5302623424912207, + "grad_norm": 16.456928253173828, + "learning_rate": 4.705374685543253e-05, + "loss": 1.7078, + "step": 9260 + }, + { + "epoch": 1.5319148936170213, + "grad_norm": 11.239667892456055, + "learning_rate": 4.704456563653391e-05, + "loss": 1.6224, + "step": 9270 + }, + { + "epoch": 1.5335674447428218, + "grad_norm": 26.4625186920166, + "learning_rate": 4.703538441763528e-05, + "loss": 1.6131, + "step": 9280 + }, + { + "epoch": 1.5352199958686223, + "grad_norm": 7.001266956329346, + "learning_rate": 4.7026203198736666e-05, + "loss": 1.6851, + "step": 9290 + }, + { + "epoch": 1.5368725469944227, + "grad_norm": 4.010128974914551, + "learning_rate": 4.701702197983804e-05, + "loss": 1.5748, + "step": 9300 + }, + { + "epoch": 1.538525098120223, + "grad_norm": 5.630429267883301, + "learning_rate": 4.7007840760939424e-05, + "loss": 1.7095, + "step": 9310 + }, + { + "epoch": 1.5401776492460235, + "grad_norm": 10.235610961914062, + "learning_rate": 4.69986595420408e-05, + "loss": 1.7108, + "step": 9320 + }, + { + "epoch": 1.541830200371824, + "grad_norm": 27.188196182250977, + "learning_rate": 4.698947832314218e-05, + "loss": 1.6722, + "step": 9330 + }, + { + "epoch": 1.5434827514976246, + "grad_norm": 9.516741752624512, + "learning_rate": 4.698029710424356e-05, + "loss": 1.5869, + "step": 9340 + }, + { + "epoch": 1.545135302623425, + "grad_norm": 16.91329574584961, + "learning_rate": 4.697111588534494e-05, + "loss": 1.8084, + "step": 9350 + }, + { + "epoch": 1.5467878537492252, + "grad_norm": 26.68416976928711, + "learning_rate": 4.6961934666446324e-05, + "loss": 1.5328, + "step": 9360 + }, + { + "epoch": 1.5484404048750258, + "grad_norm": 7.9282073974609375, + "learning_rate": 4.69527534475477e-05, + "loss": 1.6512, + "step": 9370 + }, + { + "epoch": 1.5500929560008263, + "grad_norm": 10.000015258789062, + "learning_rate": 4.694357222864908e-05, + "loss": 1.6211, + "step": 9380 + }, + { + "epoch": 1.5517455071266268, + "grad_norm": 51.1877555847168, + "learning_rate": 4.693439100975046e-05, + "loss": 1.5976, + "step": 9390 + }, + { + "epoch": 1.5533980582524272, + "grad_norm": 14.719481468200684, + "learning_rate": 4.6925209790851834e-05, + "loss": 1.5779, + "step": 9400 + }, + { + "epoch": 1.5550506093782275, + "grad_norm": 9.398426055908203, + "learning_rate": 4.691602857195321e-05, + "loss": 1.5547, + "step": 9410 + }, + { + "epoch": 1.556703160504028, + "grad_norm": 13.369629859924316, + "learning_rate": 4.690684735305459e-05, + "loss": 1.625, + "step": 9420 + }, + { + "epoch": 1.5583557116298286, + "grad_norm": 7.2386674880981445, + "learning_rate": 4.689766613415597e-05, + "loss": 1.6962, + "step": 9430 + }, + { + "epoch": 1.560008262755629, + "grad_norm": 6.752237319946289, + "learning_rate": 
4.688848491525735e-05, + "loss": 1.6528, + "step": 9440 + }, + { + "epoch": 1.5616608138814294, + "grad_norm": 16.95864486694336, + "learning_rate": 4.687930369635873e-05, + "loss": 1.6891, + "step": 9450 + }, + { + "epoch": 1.56331336500723, + "grad_norm": 29.162368774414062, + "learning_rate": 4.687012247746011e-05, + "loss": 1.6036, + "step": 9460 + }, + { + "epoch": 1.5649659161330303, + "grad_norm": 8.65707778930664, + "learning_rate": 4.686094125856149e-05, + "loss": 1.749, + "step": 9470 + }, + { + "epoch": 1.5666184672588308, + "grad_norm": 8.36571979522705, + "learning_rate": 4.685176003966287e-05, + "loss": 1.597, + "step": 9480 + }, + { + "epoch": 1.5682710183846313, + "grad_norm": 10.10719108581543, + "learning_rate": 4.684257882076425e-05, + "loss": 1.7746, + "step": 9490 + }, + { + "epoch": 1.5699235695104319, + "grad_norm": 7.152252197265625, + "learning_rate": 4.683339760186563e-05, + "loss": 1.604, + "step": 9500 + }, + { + "epoch": 1.5715761206362322, + "grad_norm": 7.251582145690918, + "learning_rate": 4.682421638296701e-05, + "loss": 1.599, + "step": 9510 + }, + { + "epoch": 1.5732286717620325, + "grad_norm": 14.877774238586426, + "learning_rate": 4.6815035164068385e-05, + "loss": 1.5955, + "step": 9520 + }, + { + "epoch": 1.574881222887833, + "grad_norm": 7.264001846313477, + "learning_rate": 4.680585394516976e-05, + "loss": 1.5446, + "step": 9530 + }, + { + "epoch": 1.5765337740136336, + "grad_norm": 31.467344284057617, + "learning_rate": 4.679667272627114e-05, + "loss": 1.6975, + "step": 9540 + }, + { + "epoch": 1.5781863251394341, + "grad_norm": 6.771696090698242, + "learning_rate": 4.678749150737252e-05, + "loss": 1.5352, + "step": 9550 + }, + { + "epoch": 1.5798388762652344, + "grad_norm": 16.742000579833984, + "learning_rate": 4.6778310288473895e-05, + "loss": 1.7136, + "step": 9560 + }, + { + "epoch": 1.5814914273910348, + "grad_norm": 29.50790023803711, + "learning_rate": 4.676912906957528e-05, + "loss": 1.6819, + "step": 9570 + }, + { + "epoch": 1.5831439785168353, + "grad_norm": 15.069239616394043, + "learning_rate": 4.675994785067666e-05, + "loss": 1.5652, + "step": 9580 + }, + { + "epoch": 1.5847965296426358, + "grad_norm": 10.359492301940918, + "learning_rate": 4.6750766631778036e-05, + "loss": 1.7212, + "step": 9590 + }, + { + "epoch": 1.5864490807684364, + "grad_norm": 14.912342071533203, + "learning_rate": 4.674158541287942e-05, + "loss": 1.7613, + "step": 9600 + }, + { + "epoch": 1.5881016318942367, + "grad_norm": 15.676534652709961, + "learning_rate": 4.6732404193980795e-05, + "loss": 1.5133, + "step": 9610 + }, + { + "epoch": 1.5897541830200372, + "grad_norm": 6.48195743560791, + "learning_rate": 4.672322297508218e-05, + "loss": 1.6921, + "step": 9620 + }, + { + "epoch": 1.5914067341458376, + "grad_norm": 10.377370834350586, + "learning_rate": 4.6714041756183553e-05, + "loss": 1.6365, + "step": 9630 + }, + { + "epoch": 1.593059285271638, + "grad_norm": 6.733672142028809, + "learning_rate": 4.6704860537284936e-05, + "loss": 1.6149, + "step": 9640 + }, + { + "epoch": 1.5947118363974386, + "grad_norm": 13.145439147949219, + "learning_rate": 4.669567931838631e-05, + "loss": 1.7273, + "step": 9650 + }, + { + "epoch": 1.596364387523239, + "grad_norm": 23.76000213623047, + "learning_rate": 4.668649809948769e-05, + "loss": 1.5651, + "step": 9660 + }, + { + "epoch": 1.5980169386490395, + "grad_norm": 10.361004829406738, + "learning_rate": 4.6677316880589064e-05, + "loss": 1.679, + "step": 9670 + }, + { + "epoch": 1.5996694897748398, + "grad_norm": 
7.273348808288574, + "learning_rate": 4.6668135661690446e-05, + "loss": 1.6333, + "step": 9680 + }, + { + "epoch": 1.6013220409006403, + "grad_norm": 5.444534778594971, + "learning_rate": 4.665895444279182e-05, + "loss": 1.6189, + "step": 9690 + }, + { + "epoch": 1.6029745920264409, + "grad_norm": 11.040206909179688, + "learning_rate": 4.6649773223893205e-05, + "loss": 1.5579, + "step": 9700 + }, + { + "epoch": 1.6046271431522414, + "grad_norm": 8.984979629516602, + "learning_rate": 4.664059200499459e-05, + "loss": 1.5681, + "step": 9710 + }, + { + "epoch": 1.6062796942780417, + "grad_norm": 8.338875770568848, + "learning_rate": 4.663141078609596e-05, + "loss": 1.6869, + "step": 9720 + }, + { + "epoch": 1.607932245403842, + "grad_norm": 8.329351425170898, + "learning_rate": 4.6622229567197346e-05, + "loss": 1.4787, + "step": 9730 + }, + { + "epoch": 1.6095847965296426, + "grad_norm": 9.657584190368652, + "learning_rate": 4.661304834829872e-05, + "loss": 1.7737, + "step": 9740 + }, + { + "epoch": 1.6112373476554431, + "grad_norm": 9.302536964416504, + "learning_rate": 4.6603867129400104e-05, + "loss": 1.6792, + "step": 9750 + }, + { + "epoch": 1.6128898987812437, + "grad_norm": 7.729322910308838, + "learning_rate": 4.659468591050148e-05, + "loss": 1.6573, + "step": 9760 + }, + { + "epoch": 1.614542449907044, + "grad_norm": 8.147910118103027, + "learning_rate": 4.658550469160286e-05, + "loss": 1.5995, + "step": 9770 + }, + { + "epoch": 1.6161950010328443, + "grad_norm": 7.201321601867676, + "learning_rate": 4.657632347270424e-05, + "loss": 1.6919, + "step": 9780 + }, + { + "epoch": 1.6178475521586448, + "grad_norm": 8.503691673278809, + "learning_rate": 4.6567142253805615e-05, + "loss": 1.6173, + "step": 9790 + }, + { + "epoch": 1.6195001032844454, + "grad_norm": 21.700334548950195, + "learning_rate": 4.655796103490699e-05, + "loss": 1.5879, + "step": 9800 + }, + { + "epoch": 1.621152654410246, + "grad_norm": 18.57465171813965, + "learning_rate": 4.654877981600837e-05, + "loss": 1.6012, + "step": 9810 + }, + { + "epoch": 1.6228052055360462, + "grad_norm": 6.389706611633301, + "learning_rate": 4.6539598597109756e-05, + "loss": 1.6895, + "step": 9820 + }, + { + "epoch": 1.6244577566618468, + "grad_norm": 6.799575328826904, + "learning_rate": 4.653041737821113e-05, + "loss": 1.6661, + "step": 9830 + }, + { + "epoch": 1.626110307787647, + "grad_norm": 6.362144947052002, + "learning_rate": 4.6521236159312514e-05, + "loss": 1.6193, + "step": 9840 + }, + { + "epoch": 1.6277628589134476, + "grad_norm": 18.13498306274414, + "learning_rate": 4.651205494041389e-05, + "loss": 1.6063, + "step": 9850 + }, + { + "epoch": 1.6294154100392482, + "grad_norm": 15.989859580993652, + "learning_rate": 4.650287372151527e-05, + "loss": 1.6043, + "step": 9860 + }, + { + "epoch": 1.6310679611650487, + "grad_norm": 30.983396530151367, + "learning_rate": 4.649369250261665e-05, + "loss": 1.8636, + "step": 9870 + }, + { + "epoch": 1.632720512290849, + "grad_norm": 9.010120391845703, + "learning_rate": 4.648451128371803e-05, + "loss": 1.6487, + "step": 9880 + }, + { + "epoch": 1.6343730634166493, + "grad_norm": 6.68414831161499, + "learning_rate": 4.647533006481941e-05, + "loss": 1.6081, + "step": 9890 + }, + { + "epoch": 1.6360256145424499, + "grad_norm": 9.62691593170166, + "learning_rate": 4.646614884592079e-05, + "loss": 1.5234, + "step": 9900 + }, + { + "epoch": 1.6376781656682504, + "grad_norm": 7.007033348083496, + "learning_rate": 4.6456967627022166e-05, + "loss": 1.584, + "step": 9910 + }, + { + "epoch": 
1.639330716794051, + "grad_norm": 17.63177490234375, + "learning_rate": 4.644778640812354e-05, + "loss": 1.4388, + "step": 9920 + }, + { + "epoch": 1.6409832679198513, + "grad_norm": 19.87029266357422, + "learning_rate": 4.6438605189224924e-05, + "loss": 1.5444, + "step": 9930 + }, + { + "epoch": 1.6426358190456516, + "grad_norm": 28.15524673461914, + "learning_rate": 4.64294239703263e-05, + "loss": 1.6218, + "step": 9940 + }, + { + "epoch": 1.6442883701714521, + "grad_norm": 22.747636795043945, + "learning_rate": 4.642024275142768e-05, + "loss": 1.5842, + "step": 9950 + }, + { + "epoch": 1.6459409212972527, + "grad_norm": 12.167838096618652, + "learning_rate": 4.641106153252906e-05, + "loss": 1.5029, + "step": 9960 + }, + { + "epoch": 1.6475934724230532, + "grad_norm": 9.144806861877441, + "learning_rate": 4.640188031363044e-05, + "loss": 1.6125, + "step": 9970 + }, + { + "epoch": 1.6492460235488535, + "grad_norm": 11.655991554260254, + "learning_rate": 4.639269909473182e-05, + "loss": 1.5915, + "step": 9980 + }, + { + "epoch": 1.6508985746746538, + "grad_norm": 29.67504119873047, + "learning_rate": 4.63835178758332e-05, + "loss": 1.6178, + "step": 9990 + }, + { + "epoch": 1.6525511258004544, + "grad_norm": 18.424442291259766, + "learning_rate": 4.6374336656934575e-05, + "loss": 1.46, + "step": 10000 + }, + { + "epoch": 1.654203676926255, + "grad_norm": 13.235391616821289, + "learning_rate": 4.636515543803596e-05, + "loss": 1.5958, + "step": 10010 + }, + { + "epoch": 1.6558562280520555, + "grad_norm": 10.103015899658203, + "learning_rate": 4.6355974219137334e-05, + "loss": 1.7109, + "step": 10020 + }, + { + "epoch": 1.6575087791778558, + "grad_norm": 10.416471481323242, + "learning_rate": 4.6346793000238717e-05, + "loss": 1.7097, + "step": 10030 + }, + { + "epoch": 1.6591613303036563, + "grad_norm": 12.568252563476562, + "learning_rate": 4.633761178134009e-05, + "loss": 1.6269, + "step": 10040 + }, + { + "epoch": 1.6608138814294566, + "grad_norm": 65.68162536621094, + "learning_rate": 4.632843056244147e-05, + "loss": 1.5952, + "step": 10050 + }, + { + "epoch": 1.6624664325552572, + "grad_norm": 29.50044822692871, + "learning_rate": 4.631924934354285e-05, + "loss": 1.529, + "step": 10060 + }, + { + "epoch": 1.6641189836810577, + "grad_norm": 33.595272064208984, + "learning_rate": 4.631006812464423e-05, + "loss": 1.5998, + "step": 10070 + }, + { + "epoch": 1.6657715348068582, + "grad_norm": 8.986908912658691, + "learning_rate": 4.630088690574561e-05, + "loss": 1.7082, + "step": 10080 + }, + { + "epoch": 1.6674240859326586, + "grad_norm": 12.171062469482422, + "learning_rate": 4.6291705686846985e-05, + "loss": 1.6, + "step": 10090 + }, + { + "epoch": 1.6690766370584589, + "grad_norm": 28.181230545043945, + "learning_rate": 4.628252446794837e-05, + "loss": 1.6835, + "step": 10100 + }, + { + "epoch": 1.6707291881842594, + "grad_norm": 13.5730562210083, + "learning_rate": 4.6273343249049744e-05, + "loss": 1.7877, + "step": 10110 + }, + { + "epoch": 1.67238173931006, + "grad_norm": 13.940990447998047, + "learning_rate": 4.6264162030151126e-05, + "loss": 1.7061, + "step": 10120 + }, + { + "epoch": 1.6740342904358605, + "grad_norm": 10.991562843322754, + "learning_rate": 4.62549808112525e-05, + "loss": 1.6114, + "step": 10130 + }, + { + "epoch": 1.6756868415616608, + "grad_norm": 24.29729461669922, + "learning_rate": 4.6245799592353885e-05, + "loss": 1.6884, + "step": 10140 + }, + { + "epoch": 1.6773393926874611, + "grad_norm": 11.747648239135742, + "learning_rate": 4.623661837345526e-05, + 
"loss": 1.6264, + "step": 10150 + }, + { + "epoch": 1.6789919438132617, + "grad_norm": 10.77138614654541, + "learning_rate": 4.6227437154556643e-05, + "loss": 1.6901, + "step": 10160 + }, + { + "epoch": 1.6806444949390622, + "grad_norm": 13.280838012695312, + "learning_rate": 4.621825593565802e-05, + "loss": 1.7122, + "step": 10170 + }, + { + "epoch": 1.6822970460648627, + "grad_norm": 12.801493644714355, + "learning_rate": 4.6209074716759395e-05, + "loss": 1.5463, + "step": 10180 + }, + { + "epoch": 1.683949597190663, + "grad_norm": 11.150125503540039, + "learning_rate": 4.619989349786078e-05, + "loss": 1.6348, + "step": 10190 + }, + { + "epoch": 1.6856021483164634, + "grad_norm": 8.492911338806152, + "learning_rate": 4.6190712278962154e-05, + "loss": 1.545, + "step": 10200 + }, + { + "epoch": 1.687254699442264, + "grad_norm": 7.204408168792725, + "learning_rate": 4.6181531060063536e-05, + "loss": 1.6983, + "step": 10210 + }, + { + "epoch": 1.6889072505680645, + "grad_norm": 53.46291732788086, + "learning_rate": 4.617234984116491e-05, + "loss": 1.6098, + "step": 10220 + }, + { + "epoch": 1.690559801693865, + "grad_norm": 5.803036212921143, + "learning_rate": 4.6163168622266295e-05, + "loss": 1.5576, + "step": 10230 + }, + { + "epoch": 1.6922123528196653, + "grad_norm": 20.07267951965332, + "learning_rate": 4.615398740336767e-05, + "loss": 1.7001, + "step": 10240 + }, + { + "epoch": 1.6938649039454658, + "grad_norm": 9.970727920532227, + "learning_rate": 4.614480618446905e-05, + "loss": 1.6336, + "step": 10250 + }, + { + "epoch": 1.6955174550712662, + "grad_norm": 9.943961143493652, + "learning_rate": 4.613562496557043e-05, + "loss": 1.7672, + "step": 10260 + }, + { + "epoch": 1.6971700061970667, + "grad_norm": 13.809309959411621, + "learning_rate": 4.612644374667181e-05, + "loss": 1.6802, + "step": 10270 + }, + { + "epoch": 1.6988225573228672, + "grad_norm": 5.0867462158203125, + "learning_rate": 4.6117262527773194e-05, + "loss": 1.6411, + "step": 10280 + }, + { + "epoch": 1.7004751084486678, + "grad_norm": 15.1895751953125, + "learning_rate": 4.610808130887457e-05, + "loss": 1.6637, + "step": 10290 + }, + { + "epoch": 1.702127659574468, + "grad_norm": 5.628765106201172, + "learning_rate": 4.6098900089975946e-05, + "loss": 1.6516, + "step": 10300 + }, + { + "epoch": 1.7037802107002684, + "grad_norm": 9.596723556518555, + "learning_rate": 4.608971887107732e-05, + "loss": 1.5923, + "step": 10310 + }, + { + "epoch": 1.705432761826069, + "grad_norm": 6.9686455726623535, + "learning_rate": 4.6080537652178705e-05, + "loss": 1.5587, + "step": 10320 + }, + { + "epoch": 1.7070853129518695, + "grad_norm": 12.421316146850586, + "learning_rate": 4.607135643328008e-05, + "loss": 1.6681, + "step": 10330 + }, + { + "epoch": 1.70873786407767, + "grad_norm": 21.520915985107422, + "learning_rate": 4.606217521438146e-05, + "loss": 1.6138, + "step": 10340 + }, + { + "epoch": 1.7103904152034703, + "grad_norm": 8.50918960571289, + "learning_rate": 4.605299399548284e-05, + "loss": 1.7301, + "step": 10350 + }, + { + "epoch": 1.7120429663292707, + "grad_norm": 10.39163875579834, + "learning_rate": 4.604381277658422e-05, + "loss": 1.6752, + "step": 10360 + }, + { + "epoch": 1.7136955174550712, + "grad_norm": 6.190021514892578, + "learning_rate": 4.60346315576856e-05, + "loss": 1.5676, + "step": 10370 + }, + { + "epoch": 1.7153480685808717, + "grad_norm": 9.422684669494629, + "learning_rate": 4.602545033878698e-05, + "loss": 1.5699, + "step": 10380 + }, + { + "epoch": 1.7170006197066723, + "grad_norm": 
7.96755838394165, + "learning_rate": 4.601626911988836e-05, + "loss": 1.6369, + "step": 10390 + }, + { + "epoch": 1.7186531708324726, + "grad_norm": 6.596837997436523, + "learning_rate": 4.600708790098974e-05, + "loss": 1.6232, + "step": 10400 + }, + { + "epoch": 1.7203057219582731, + "grad_norm": 18.394948959350586, + "learning_rate": 4.599790668209112e-05, + "loss": 1.6457, + "step": 10410 + }, + { + "epoch": 1.7219582730840735, + "grad_norm": 9.514167785644531, + "learning_rate": 4.59887254631925e-05, + "loss": 1.5002, + "step": 10420 + }, + { + "epoch": 1.723610824209874, + "grad_norm": 15.90142822265625, + "learning_rate": 4.597954424429387e-05, + "loss": 1.6668, + "step": 10430 + }, + { + "epoch": 1.7252633753356745, + "grad_norm": 14.313610076904297, + "learning_rate": 4.597036302539525e-05, + "loss": 1.6253, + "step": 10440 + }, + { + "epoch": 1.7269159264614748, + "grad_norm": 5.3916144371032715, + "learning_rate": 4.596118180649663e-05, + "loss": 1.5947, + "step": 10450 + }, + { + "epoch": 1.7285684775872754, + "grad_norm": 9.352823257446289, + "learning_rate": 4.595200058759801e-05, + "loss": 1.502, + "step": 10460 + }, + { + "epoch": 1.7302210287130757, + "grad_norm": 27.035520553588867, + "learning_rate": 4.594281936869939e-05, + "loss": 1.7585, + "step": 10470 + }, + { + "epoch": 1.7318735798388762, + "grad_norm": 5.941956520080566, + "learning_rate": 4.5933638149800766e-05, + "loss": 1.8005, + "step": 10480 + }, + { + "epoch": 1.7335261309646768, + "grad_norm": 7.316133499145508, + "learning_rate": 4.592445693090215e-05, + "loss": 1.5243, + "step": 10490 + }, + { + "epoch": 1.7351786820904773, + "grad_norm": 7.501877784729004, + "learning_rate": 4.591527571200353e-05, + "loss": 1.5457, + "step": 10500 + }, + { + "epoch": 1.7368312332162776, + "grad_norm": 8.191024780273438, + "learning_rate": 4.590609449310491e-05, + "loss": 1.6192, + "step": 10510 + }, + { + "epoch": 1.738483784342078, + "grad_norm": 7.398270130157471, + "learning_rate": 4.589691327420629e-05, + "loss": 1.713, + "step": 10520 + }, + { + "epoch": 1.7401363354678785, + "grad_norm": 7.048802375793457, + "learning_rate": 4.5887732055307665e-05, + "loss": 1.7066, + "step": 10530 + }, + { + "epoch": 1.741788886593679, + "grad_norm": 6.611818313598633, + "learning_rate": 4.587855083640905e-05, + "loss": 1.5422, + "step": 10540 + }, + { + "epoch": 1.7434414377194796, + "grad_norm": 11.106283187866211, + "learning_rate": 4.5869369617510424e-05, + "loss": 1.6228, + "step": 10550 + }, + { + "epoch": 1.7450939888452799, + "grad_norm": 17.441743850708008, + "learning_rate": 4.58601883986118e-05, + "loss": 1.6286, + "step": 10560 + }, + { + "epoch": 1.7467465399710802, + "grad_norm": 17.710969924926758, + "learning_rate": 4.585100717971318e-05, + "loss": 1.5696, + "step": 10570 + }, + { + "epoch": 1.7483990910968807, + "grad_norm": 12.676824569702148, + "learning_rate": 4.584182596081456e-05, + "loss": 1.6077, + "step": 10580 + }, + { + "epoch": 1.7500516422226813, + "grad_norm": 8.125213623046875, + "learning_rate": 4.5832644741915934e-05, + "loss": 1.7088, + "step": 10590 + }, + { + "epoch": 1.7517041933484818, + "grad_norm": 7.755391597747803, + "learning_rate": 4.582346352301732e-05, + "loss": 1.638, + "step": 10600 + }, + { + "epoch": 1.7533567444742821, + "grad_norm": 19.214263916015625, + "learning_rate": 4.58142823041187e-05, + "loss": 1.5969, + "step": 10610 + }, + { + "epoch": 1.7550092956000827, + "grad_norm": 32.08985900878906, + "learning_rate": 4.5805101085220075e-05, + "loss": 1.7992, + "step": 10620 + }, 
+ { + "epoch": 1.756661846725883, + "grad_norm": 6.143647193908691, + "learning_rate": 4.579591986632146e-05, + "loss": 1.6176, + "step": 10630 + }, + { + "epoch": 1.7583143978516835, + "grad_norm": 12.872699737548828, + "learning_rate": 4.5786738647422834e-05, + "loss": 1.5866, + "step": 10640 + }, + { + "epoch": 1.759966948977484, + "grad_norm": 7.934300899505615, + "learning_rate": 4.5777557428524216e-05, + "loss": 1.5976, + "step": 10650 + }, + { + "epoch": 1.7616195001032846, + "grad_norm": 6.824174404144287, + "learning_rate": 4.576837620962559e-05, + "loss": 1.5911, + "step": 10660 + }, + { + "epoch": 1.763272051229085, + "grad_norm": 7.65330696105957, + "learning_rate": 4.5759194990726975e-05, + "loss": 1.5306, + "step": 10670 + }, + { + "epoch": 1.7649246023548852, + "grad_norm": 4.150641918182373, + "learning_rate": 4.575001377182835e-05, + "loss": 1.6023, + "step": 10680 + }, + { + "epoch": 1.7665771534806858, + "grad_norm": 15.337532997131348, + "learning_rate": 4.574083255292973e-05, + "loss": 1.5932, + "step": 10690 + }, + { + "epoch": 1.7682297046064863, + "grad_norm": 10.681439399719238, + "learning_rate": 4.573165133403111e-05, + "loss": 1.7345, + "step": 10700 + }, + { + "epoch": 1.7698822557322869, + "grad_norm": 21.317352294921875, + "learning_rate": 4.5722470115132485e-05, + "loss": 1.6743, + "step": 10710 + }, + { + "epoch": 1.7715348068580872, + "grad_norm": 25.704092025756836, + "learning_rate": 4.571328889623386e-05, + "loss": 1.3808, + "step": 10720 + }, + { + "epoch": 1.7731873579838875, + "grad_norm": 7.380794525146484, + "learning_rate": 4.5704107677335244e-05, + "loss": 1.6079, + "step": 10730 + }, + { + "epoch": 1.774839909109688, + "grad_norm": 12.925481796264648, + "learning_rate": 4.5694926458436626e-05, + "loss": 1.656, + "step": 10740 + }, + { + "epoch": 1.7764924602354886, + "grad_norm": 62.927032470703125, + "learning_rate": 4.5685745239538e-05, + "loss": 1.7919, + "step": 10750 + }, + { + "epoch": 1.778145011361289, + "grad_norm": 13.639461517333984, + "learning_rate": 4.5676564020639385e-05, + "loss": 1.6534, + "step": 10760 + }, + { + "epoch": 1.7797975624870894, + "grad_norm": 7.046122074127197, + "learning_rate": 4.566738280174076e-05, + "loss": 1.549, + "step": 10770 + }, + { + "epoch": 1.7814501136128897, + "grad_norm": 12.599806785583496, + "learning_rate": 4.565820158284214e-05, + "loss": 1.6526, + "step": 10780 + }, + { + "epoch": 1.7831026647386903, + "grad_norm": 8.485870361328125, + "learning_rate": 4.564902036394352e-05, + "loss": 1.7176, + "step": 10790 + }, + { + "epoch": 1.7847552158644908, + "grad_norm": 16.592004776000977, + "learning_rate": 4.56398391450449e-05, + "loss": 1.5627, + "step": 10800 + }, + { + "epoch": 1.7864077669902914, + "grad_norm": 11.336044311523438, + "learning_rate": 4.563065792614628e-05, + "loss": 1.6818, + "step": 10810 + }, + { + "epoch": 1.7880603181160917, + "grad_norm": 8.369160652160645, + "learning_rate": 4.5621476707247653e-05, + "loss": 1.6238, + "step": 10820 + }, + { + "epoch": 1.7897128692418922, + "grad_norm": 9.106146812438965, + "learning_rate": 4.5612295488349036e-05, + "loss": 1.4657, + "step": 10830 + }, + { + "epoch": 1.7913654203676925, + "grad_norm": 18.96662712097168, + "learning_rate": 4.560311426945041e-05, + "loss": 1.6315, + "step": 10840 + }, + { + "epoch": 1.793017971493493, + "grad_norm": 9.854429244995117, + "learning_rate": 4.5593933050551795e-05, + "loss": 1.685, + "step": 10850 + }, + { + "epoch": 1.7946705226192936, + "grad_norm": 20.518041610717773, + "learning_rate": 
4.558475183165317e-05, + "loss": 1.6613, + "step": 10860 + }, + { + "epoch": 1.7963230737450941, + "grad_norm": 16.10544204711914, + "learning_rate": 4.557557061275455e-05, + "loss": 1.6003, + "step": 10870 + }, + { + "epoch": 1.7979756248708945, + "grad_norm": 28.675254821777344, + "learning_rate": 4.556638939385593e-05, + "loss": 1.5901, + "step": 10880 + }, + { + "epoch": 1.7996281759966948, + "grad_norm": 27.472984313964844, + "learning_rate": 4.555720817495731e-05, + "loss": 1.6684, + "step": 10890 + }, + { + "epoch": 1.8012807271224953, + "grad_norm": 7.602771759033203, + "learning_rate": 4.554802695605869e-05, + "loss": 1.8076, + "step": 10900 + }, + { + "epoch": 1.8029332782482959, + "grad_norm": 8.444791793823242, + "learning_rate": 4.553884573716007e-05, + "loss": 1.7259, + "step": 10910 + }, + { + "epoch": 1.8045858293740964, + "grad_norm": 8.157547950744629, + "learning_rate": 4.5529664518261446e-05, + "loss": 1.7186, + "step": 10920 + }, + { + "epoch": 1.8062383804998967, + "grad_norm": 7.917896270751953, + "learning_rate": 4.552048329936283e-05, + "loss": 1.6898, + "step": 10930 + }, + { + "epoch": 1.807890931625697, + "grad_norm": 12.761302947998047, + "learning_rate": 4.5511302080464204e-05, + "loss": 1.6634, + "step": 10940 + }, + { + "epoch": 1.8095434827514976, + "grad_norm": 5.908732891082764, + "learning_rate": 4.550212086156558e-05, + "loss": 1.6357, + "step": 10950 + }, + { + "epoch": 1.811196033877298, + "grad_norm": 6.7102766036987305, + "learning_rate": 4.549293964266696e-05, + "loss": 1.6942, + "step": 10960 + }, + { + "epoch": 1.8128485850030986, + "grad_norm": 117.56930541992188, + "learning_rate": 4.548375842376834e-05, + "loss": 1.6321, + "step": 10970 + }, + { + "epoch": 1.814501136128899, + "grad_norm": 9.186972618103027, + "learning_rate": 4.547457720486972e-05, + "loss": 1.6335, + "step": 10980 + }, + { + "epoch": 1.8161536872546993, + "grad_norm": 12.602619171142578, + "learning_rate": 4.54653959859711e-05, + "loss": 1.6513, + "step": 10990 + }, + { + "epoch": 1.8178062383804998, + "grad_norm": 6.2811279296875, + "learning_rate": 4.545621476707248e-05, + "loss": 1.5188, + "step": 11000 + }, + { + "epoch": 1.8194587895063004, + "grad_norm": 15.940876960754395, + "learning_rate": 4.5447033548173856e-05, + "loss": 1.7214, + "step": 11010 + }, + { + "epoch": 1.821111340632101, + "grad_norm": 8.485485076904297, + "learning_rate": 4.543785232927524e-05, + "loss": 1.7004, + "step": 11020 + }, + { + "epoch": 1.8227638917579012, + "grad_norm": 37.532020568847656, + "learning_rate": 4.5428671110376614e-05, + "loss": 1.5909, + "step": 11030 + }, + { + "epoch": 1.8244164428837017, + "grad_norm": 10.41656494140625, + "learning_rate": 4.5419489891478e-05, + "loss": 1.5895, + "step": 11040 + }, + { + "epoch": 1.826068994009502, + "grad_norm": 5.756435394287109, + "learning_rate": 4.541030867257937e-05, + "loss": 1.6121, + "step": 11050 + }, + { + "epoch": 1.8277215451353026, + "grad_norm": 16.178802490234375, + "learning_rate": 4.5401127453680755e-05, + "loss": 1.5903, + "step": 11060 + }, + { + "epoch": 1.8293740962611031, + "grad_norm": 10.916646957397461, + "learning_rate": 4.539194623478213e-05, + "loss": 1.7007, + "step": 11070 + }, + { + "epoch": 1.8310266473869037, + "grad_norm": 6.1422224044799805, + "learning_rate": 4.538276501588351e-05, + "loss": 1.6766, + "step": 11080 + }, + { + "epoch": 1.832679198512704, + "grad_norm": 15.099671363830566, + "learning_rate": 4.537358379698489e-05, + "loss": 1.6593, + "step": 11090 + }, + { + "epoch": 1.8343317496385043, + 
"grad_norm": 19.930587768554688, + "learning_rate": 4.5364402578086266e-05, + "loss": 1.6625, + "step": 11100 + }, + { + "epoch": 1.8359843007643049, + "grad_norm": 9.389049530029297, + "learning_rate": 4.535522135918765e-05, + "loss": 1.6434, + "step": 11110 + }, + { + "epoch": 1.8376368518901054, + "grad_norm": 10.457796096801758, + "learning_rate": 4.5346040140289024e-05, + "loss": 1.6743, + "step": 11120 + }, + { + "epoch": 1.839289403015906, + "grad_norm": 9.54992961883545, + "learning_rate": 4.533685892139041e-05, + "loss": 1.6617, + "step": 11130 + }, + { + "epoch": 1.8409419541417062, + "grad_norm": 19.115575790405273, + "learning_rate": 4.532767770249178e-05, + "loss": 1.6583, + "step": 11140 + }, + { + "epoch": 1.8425945052675066, + "grad_norm": 9.527848243713379, + "learning_rate": 4.5318496483593165e-05, + "loss": 1.5821, + "step": 11150 + }, + { + "epoch": 1.844247056393307, + "grad_norm": 9.40245246887207, + "learning_rate": 4.530931526469454e-05, + "loss": 1.6554, + "step": 11160 + }, + { + "epoch": 1.8458996075191076, + "grad_norm": 18.67144012451172, + "learning_rate": 4.5300134045795924e-05, + "loss": 1.7357, + "step": 11170 + }, + { + "epoch": 1.8475521586449082, + "grad_norm": 25.492530822753906, + "learning_rate": 4.5290952826897306e-05, + "loss": 1.6508, + "step": 11180 + }, + { + "epoch": 1.8492047097707085, + "grad_norm": 13.621424674987793, + "learning_rate": 4.528177160799868e-05, + "loss": 1.7588, + "step": 11190 + }, + { + "epoch": 1.850857260896509, + "grad_norm": 12.195111274719238, + "learning_rate": 4.527259038910006e-05, + "loss": 1.6934, + "step": 11200 + }, + { + "epoch": 1.8525098120223094, + "grad_norm": 9.302632331848145, + "learning_rate": 4.526340917020144e-05, + "loss": 1.5907, + "step": 11210 + }, + { + "epoch": 1.85416236314811, + "grad_norm": 6.693589687347412, + "learning_rate": 4.5254227951302817e-05, + "loss": 1.7577, + "step": 11220 + }, + { + "epoch": 1.8558149142739104, + "grad_norm": 7.7038116455078125, + "learning_rate": 4.524504673240419e-05, + "loss": 1.574, + "step": 11230 + }, + { + "epoch": 1.8574674653997107, + "grad_norm": 10.348893165588379, + "learning_rate": 4.5235865513505575e-05, + "loss": 1.5739, + "step": 11240 + }, + { + "epoch": 1.8591200165255113, + "grad_norm": 12.050825119018555, + "learning_rate": 4.522668429460695e-05, + "loss": 1.658, + "step": 11250 + }, + { + "epoch": 1.8607725676513116, + "grad_norm": 47.17829132080078, + "learning_rate": 4.5217503075708334e-05, + "loss": 1.4564, + "step": 11260 + }, + { + "epoch": 1.8624251187771121, + "grad_norm": 8.023752212524414, + "learning_rate": 4.520832185680971e-05, + "loss": 1.5252, + "step": 11270 + }, + { + "epoch": 1.8640776699029127, + "grad_norm": 11.899310111999512, + "learning_rate": 4.519914063791109e-05, + "loss": 1.7046, + "step": 11280 + }, + { + "epoch": 1.8657302210287132, + "grad_norm": 7.167046546936035, + "learning_rate": 4.518995941901247e-05, + "loss": 1.6141, + "step": 11290 + }, + { + "epoch": 1.8673827721545135, + "grad_norm": 42.9312858581543, + "learning_rate": 4.518077820011385e-05, + "loss": 1.6363, + "step": 11300 + }, + { + "epoch": 1.8690353232803139, + "grad_norm": 12.356457710266113, + "learning_rate": 4.517159698121523e-05, + "loss": 1.7502, + "step": 11310 + }, + { + "epoch": 1.8706878744061144, + "grad_norm": 5.740696430206299, + "learning_rate": 4.516241576231661e-05, + "loss": 1.6006, + "step": 11320 + }, + { + "epoch": 1.872340425531915, + "grad_norm": 5.727828502655029, + "learning_rate": 4.5153234543417985e-05, + "loss": 1.6271, + 
"step": 11330 + }, + { + "epoch": 1.8739929766577155, + "grad_norm": 10.743067741394043, + "learning_rate": 4.514405332451937e-05, + "loss": 1.5755, + "step": 11340 + }, + { + "epoch": 1.8756455277835158, + "grad_norm": 14.965692520141602, + "learning_rate": 4.5134872105620743e-05, + "loss": 1.53, + "step": 11350 + }, + { + "epoch": 1.877298078909316, + "grad_norm": 11.617084503173828, + "learning_rate": 4.512569088672212e-05, + "loss": 1.6813, + "step": 11360 + }, + { + "epoch": 1.8789506300351166, + "grad_norm": 253.05593872070312, + "learning_rate": 4.51165096678235e-05, + "loss": 1.5946, + "step": 11370 + }, + { + "epoch": 1.8806031811609172, + "grad_norm": 9.766687393188477, + "learning_rate": 4.510732844892488e-05, + "loss": 1.5899, + "step": 11380 + }, + { + "epoch": 1.8822557322867177, + "grad_norm": 22.282573699951172, + "learning_rate": 4.509814723002626e-05, + "loss": 1.6214, + "step": 11390 + }, + { + "epoch": 1.883908283412518, + "grad_norm": 7.993402481079102, + "learning_rate": 4.5088966011127636e-05, + "loss": 1.4566, + "step": 11400 + }, + { + "epoch": 1.8855608345383186, + "grad_norm": 8.523614883422852, + "learning_rate": 4.507978479222902e-05, + "loss": 1.6375, + "step": 11410 + }, + { + "epoch": 1.887213385664119, + "grad_norm": 32.4261474609375, + "learning_rate": 4.50706035733304e-05, + "loss": 1.6616, + "step": 11420 + }, + { + "epoch": 1.8888659367899194, + "grad_norm": 9.417035102844238, + "learning_rate": 4.506142235443178e-05, + "loss": 1.5917, + "step": 11430 + }, + { + "epoch": 1.89051848791572, + "grad_norm": 46.04423904418945, + "learning_rate": 4.505224113553316e-05, + "loss": 1.6917, + "step": 11440 + }, + { + "epoch": 1.8921710390415205, + "grad_norm": 14.668797492980957, + "learning_rate": 4.5043059916634536e-05, + "loss": 1.5452, + "step": 11450 + }, + { + "epoch": 1.8938235901673208, + "grad_norm": 10.886049270629883, + "learning_rate": 4.503387869773591e-05, + "loss": 1.5244, + "step": 11460 + }, + { + "epoch": 1.8954761412931211, + "grad_norm": 10.956747055053711, + "learning_rate": 4.5024697478837294e-05, + "loss": 1.5967, + "step": 11470 + }, + { + "epoch": 1.8971286924189217, + "grad_norm": 15.368989944458008, + "learning_rate": 4.501551625993867e-05, + "loss": 1.5721, + "step": 11480 + }, + { + "epoch": 1.8987812435447222, + "grad_norm": 8.719017028808594, + "learning_rate": 4.5006335041040046e-05, + "loss": 1.6545, + "step": 11490 + }, + { + "epoch": 1.9004337946705228, + "grad_norm": 10.52745246887207, + "learning_rate": 4.499715382214143e-05, + "loss": 1.6317, + "step": 11500 + }, + { + "epoch": 1.902086345796323, + "grad_norm": 27.014604568481445, + "learning_rate": 4.4987972603242805e-05, + "loss": 1.4449, + "step": 11510 + }, + { + "epoch": 1.9037388969221234, + "grad_norm": 14.527423858642578, + "learning_rate": 4.497879138434419e-05, + "loss": 1.6464, + "step": 11520 + }, + { + "epoch": 1.905391448047924, + "grad_norm": 28.174190521240234, + "learning_rate": 4.496961016544557e-05, + "loss": 1.5219, + "step": 11530 + }, + { + "epoch": 1.9070439991737245, + "grad_norm": 10.394495964050293, + "learning_rate": 4.4960428946546946e-05, + "loss": 1.6479, + "step": 11540 + }, + { + "epoch": 1.908696550299525, + "grad_norm": 29.0770263671875, + "learning_rate": 4.495124772764833e-05, + "loss": 1.5932, + "step": 11550 + }, + { + "epoch": 1.9103491014253253, + "grad_norm": 6.238829612731934, + "learning_rate": 4.4942066508749704e-05, + "loss": 1.5789, + "step": 11560 + }, + { + "epoch": 1.9120016525511256, + "grad_norm": 7.41868257522583, + 
"learning_rate": 4.493288528985109e-05, + "loss": 1.5565, + "step": 11570 + }, + { + "epoch": 1.9136542036769262, + "grad_norm": 18.375661849975586, + "learning_rate": 4.492370407095246e-05, + "loss": 1.6516, + "step": 11580 + }, + { + "epoch": 1.9153067548027267, + "grad_norm": 11.252774238586426, + "learning_rate": 4.491452285205384e-05, + "loss": 1.4842, + "step": 11590 + }, + { + "epoch": 1.9169593059285273, + "grad_norm": 13.860923767089844, + "learning_rate": 4.490534163315522e-05, + "loss": 1.6212, + "step": 11600 + }, + { + "epoch": 1.9186118570543276, + "grad_norm": 6.298261642456055, + "learning_rate": 4.48961604142566e-05, + "loss": 1.5285, + "step": 11610 + }, + { + "epoch": 1.9202644081801281, + "grad_norm": 19.169706344604492, + "learning_rate": 4.488697919535797e-05, + "loss": 1.6163, + "step": 11620 + }, + { + "epoch": 1.9219169593059284, + "grad_norm": 15.898575782775879, + "learning_rate": 4.4877797976459356e-05, + "loss": 1.609, + "step": 11630 + }, + { + "epoch": 1.923569510431729, + "grad_norm": 16.889341354370117, + "learning_rate": 4.486861675756074e-05, + "loss": 1.5827, + "step": 11640 + }, + { + "epoch": 1.9252220615575295, + "grad_norm": 8.887215614318848, + "learning_rate": 4.4859435538662114e-05, + "loss": 1.581, + "step": 11650 + }, + { + "epoch": 1.92687461268333, + "grad_norm": 27.01335906982422, + "learning_rate": 4.48502543197635e-05, + "loss": 1.7247, + "step": 11660 + }, + { + "epoch": 1.9285271638091304, + "grad_norm": 7.634554862976074, + "learning_rate": 4.484107310086487e-05, + "loss": 1.5232, + "step": 11670 + }, + { + "epoch": 1.9301797149349307, + "grad_norm": 17.882055282592773, + "learning_rate": 4.4831891881966255e-05, + "loss": 1.5038, + "step": 11680 + }, + { + "epoch": 1.9318322660607312, + "grad_norm": 24.722803115844727, + "learning_rate": 4.482271066306763e-05, + "loss": 1.4617, + "step": 11690 + }, + { + "epoch": 1.9334848171865318, + "grad_norm": 62.85787582397461, + "learning_rate": 4.4813529444169014e-05, + "loss": 1.6762, + "step": 11700 + }, + { + "epoch": 1.9351373683123323, + "grad_norm": 20.34491729736328, + "learning_rate": 4.480434822527039e-05, + "loss": 1.7011, + "step": 11710 + }, + { + "epoch": 1.9367899194381326, + "grad_norm": 8.389205932617188, + "learning_rate": 4.4795167006371765e-05, + "loss": 1.6886, + "step": 11720 + }, + { + "epoch": 1.938442470563933, + "grad_norm": 7.912249565124512, + "learning_rate": 4.478598578747315e-05, + "loss": 1.6878, + "step": 11730 + }, + { + "epoch": 1.9400950216897335, + "grad_norm": 26.233190536499023, + "learning_rate": 4.4776804568574524e-05, + "loss": 1.7822, + "step": 11740 + }, + { + "epoch": 1.941747572815534, + "grad_norm": 24.81481170654297, + "learning_rate": 4.4767623349675907e-05, + "loss": 1.7393, + "step": 11750 + }, + { + "epoch": 1.9434001239413345, + "grad_norm": 19.601308822631836, + "learning_rate": 4.475844213077728e-05, + "loss": 1.5535, + "step": 11760 + }, + { + "epoch": 1.9450526750671349, + "grad_norm": 9.827634811401367, + "learning_rate": 4.4749260911878665e-05, + "loss": 1.5759, + "step": 11770 + }, + { + "epoch": 1.9467052261929352, + "grad_norm": 10.674678802490234, + "learning_rate": 4.474007969298004e-05, + "loss": 1.6419, + "step": 11780 + }, + { + "epoch": 1.9483577773187357, + "grad_norm": 13.497589111328125, + "learning_rate": 4.4730898474081424e-05, + "loss": 1.6282, + "step": 11790 + }, + { + "epoch": 1.9500103284445363, + "grad_norm": 9.30100154876709, + "learning_rate": 4.47217172551828e-05, + "loss": 1.6183, + "step": 11800 + }, + { + "epoch": 
1.9516628795703368, + "grad_norm": 9.254987716674805, + "learning_rate": 4.471253603628418e-05, + "loss": 1.6552, + "step": 11810 + }, + { + "epoch": 1.9533154306961371, + "grad_norm": 14.543704986572266, + "learning_rate": 4.470335481738556e-05, + "loss": 1.5962, + "step": 11820 + }, + { + "epoch": 1.9549679818219377, + "grad_norm": 17.067127227783203, + "learning_rate": 4.469417359848694e-05, + "loss": 1.6292, + "step": 11830 + }, + { + "epoch": 1.956620532947738, + "grad_norm": 49.91341018676758, + "learning_rate": 4.4684992379588316e-05, + "loss": 1.6171, + "step": 11840 + }, + { + "epoch": 1.9582730840735385, + "grad_norm": 11.141582489013672, + "learning_rate": 4.467581116068969e-05, + "loss": 1.6286, + "step": 11850 + }, + { + "epoch": 1.959925635199339, + "grad_norm": 148.42442321777344, + "learning_rate": 4.4666629941791075e-05, + "loss": 1.598, + "step": 11860 + }, + { + "epoch": 1.9615781863251396, + "grad_norm": 7.553905010223389, + "learning_rate": 4.465744872289245e-05, + "loss": 1.7413, + "step": 11870 + }, + { + "epoch": 1.96323073745094, + "grad_norm": 8.26811408996582, + "learning_rate": 4.464826750399383e-05, + "loss": 1.7399, + "step": 11880 + }, + { + "epoch": 1.9648832885767402, + "grad_norm": 12.610078811645508, + "learning_rate": 4.463908628509521e-05, + "loss": 1.6173, + "step": 11890 + }, + { + "epoch": 1.9665358397025408, + "grad_norm": 12.24221134185791, + "learning_rate": 4.462990506619659e-05, + "loss": 1.5597, + "step": 11900 + }, + { + "epoch": 1.9681883908283413, + "grad_norm": 32.20631790161133, + "learning_rate": 4.462072384729797e-05, + "loss": 1.6357, + "step": 11910 + }, + { + "epoch": 1.9698409419541418, + "grad_norm": 13.021040916442871, + "learning_rate": 4.461154262839935e-05, + "loss": 1.5987, + "step": 11920 + }, + { + "epoch": 1.9714934930799421, + "grad_norm": 9.593809127807617, + "learning_rate": 4.4602361409500726e-05, + "loss": 1.5853, + "step": 11930 + }, + { + "epoch": 1.9731460442057425, + "grad_norm": 6.821667194366455, + "learning_rate": 4.459318019060211e-05, + "loss": 1.5064, + "step": 11940 + }, + { + "epoch": 1.974798595331543, + "grad_norm": 15.331339836120605, + "learning_rate": 4.4583998971703485e-05, + "loss": 1.6008, + "step": 11950 + }, + { + "epoch": 1.9764511464573435, + "grad_norm": 8.000182151794434, + "learning_rate": 4.457481775280487e-05, + "loss": 1.655, + "step": 11960 + }, + { + "epoch": 1.978103697583144, + "grad_norm": 8.434917449951172, + "learning_rate": 4.456563653390624e-05, + "loss": 1.753, + "step": 11970 + }, + { + "epoch": 1.9797562487089444, + "grad_norm": 48.595760345458984, + "learning_rate": 4.4556455315007626e-05, + "loss": 1.685, + "step": 11980 + }, + { + "epoch": 1.981408799834745, + "grad_norm": 9.696410179138184, + "learning_rate": 4.4547274096109e-05, + "loss": 1.6055, + "step": 11990 + }, + { + "epoch": 1.9830613509605453, + "grad_norm": 9.626373291015625, + "learning_rate": 4.453809287721038e-05, + "loss": 1.7424, + "step": 12000 + }, + { + "epoch": 1.9847139020863458, + "grad_norm": 62.46161651611328, + "learning_rate": 4.452891165831176e-05, + "loss": 1.596, + "step": 12010 + }, + { + "epoch": 1.9863664532121463, + "grad_norm": 25.460296630859375, + "learning_rate": 4.4519730439413136e-05, + "loss": 1.7073, + "step": 12020 + }, + { + "epoch": 1.9880190043379469, + "grad_norm": 6.691403865814209, + "learning_rate": 4.451054922051452e-05, + "loss": 1.61, + "step": 12030 + }, + { + "epoch": 1.9896715554637472, + "grad_norm": 9.661092758178711, + "learning_rate": 4.4501368001615895e-05, + "loss": 
1.6125, + "step": 12040 + }, + { + "epoch": 1.9913241065895475, + "grad_norm": 26.178430557250977, + "learning_rate": 4.449218678271728e-05, + "loss": 1.5553, + "step": 12050 + }, + { + "epoch": 1.992976657715348, + "grad_norm": 96.2996826171875, + "learning_rate": 4.448300556381865e-05, + "loss": 1.7229, + "step": 12060 + }, + { + "epoch": 1.9946292088411486, + "grad_norm": 18.224470138549805, + "learning_rate": 4.4473824344920036e-05, + "loss": 1.6561, + "step": 12070 + }, + { + "epoch": 1.9962817599669491, + "grad_norm": 12.857390403747559, + "learning_rate": 4.446464312602141e-05, + "loss": 1.5519, + "step": 12080 + }, + { + "epoch": 1.9979343110927494, + "grad_norm": 65.5601577758789, + "learning_rate": 4.4455461907122794e-05, + "loss": 1.5376, + "step": 12090 + }, + { + "epoch": 1.9995868622185498, + "grad_norm": 31.238407135009766, + "learning_rate": 4.444628068822417e-05, + "loss": 1.6433, + "step": 12100 + }, + { + "epoch": 1.99991737244371, + "eval_accuracy": 0.2733721095190807, + "eval_loss": 2.2641355991363525, + "eval_runtime": 819.3221, + "eval_samples_per_second": 34.414, + "eval_steps_per_second": 8.603, + "step": 12102 + }, + { + "epoch": 2.0012394133443503, + "grad_norm": 44.95764923095703, + "learning_rate": 4.443709946932555e-05, + "loss": 1.6572, + "step": 12110 + }, + { + "epoch": 2.002891964470151, + "grad_norm": 28.433679580688477, + "learning_rate": 4.442791825042693e-05, + "loss": 1.5288, + "step": 12120 + }, + { + "epoch": 2.0045445155959514, + "grad_norm": 20.113248825073242, + "learning_rate": 4.4418737031528304e-05, + "loss": 1.5625, + "step": 12130 + }, + { + "epoch": 2.006197066721752, + "grad_norm": 18.262046813964844, + "learning_rate": 4.440955581262969e-05, + "loss": 1.6292, + "step": 12140 + }, + { + "epoch": 2.007849617847552, + "grad_norm": 9.9489164352417, + "learning_rate": 4.440037459373106e-05, + "loss": 1.7362, + "step": 12150 + }, + { + "epoch": 2.0095021689733525, + "grad_norm": 71.15792846679688, + "learning_rate": 4.4391193374832446e-05, + "loss": 1.7299, + "step": 12160 + }, + { + "epoch": 2.011154720099153, + "grad_norm": 12.936304092407227, + "learning_rate": 4.438201215593382e-05, + "loss": 1.5459, + "step": 12170 + }, + { + "epoch": 2.0128072712249536, + "grad_norm": 17.437440872192383, + "learning_rate": 4.4372830937035204e-05, + "loss": 1.604, + "step": 12180 + }, + { + "epoch": 2.014459822350754, + "grad_norm": 12.949777603149414, + "learning_rate": 4.436364971813658e-05, + "loss": 1.5845, + "step": 12190 + }, + { + "epoch": 2.0161123734765543, + "grad_norm": 34.75144958496094, + "learning_rate": 4.435446849923796e-05, + "loss": 1.7017, + "step": 12200 + }, + { + "epoch": 2.017764924602355, + "grad_norm": 25.475833892822266, + "learning_rate": 4.4345287280339345e-05, + "loss": 1.6947, + "step": 12210 + }, + { + "epoch": 2.0194174757281553, + "grad_norm": 10.156431198120117, + "learning_rate": 4.433610606144072e-05, + "loss": 1.6284, + "step": 12220 + }, + { + "epoch": 2.021070026853956, + "grad_norm": 19.941617965698242, + "learning_rate": 4.43269248425421e-05, + "loss": 1.6042, + "step": 12230 + }, + { + "epoch": 2.0227225779797564, + "grad_norm": 9.263725280761719, + "learning_rate": 4.431774362364348e-05, + "loss": 1.6048, + "step": 12240 + }, + { + "epoch": 2.0243751291055565, + "grad_norm": 7.461373805999756, + "learning_rate": 4.4308562404744855e-05, + "loss": 1.6171, + "step": 12250 + }, + { + "epoch": 2.026027680231357, + "grad_norm": 28.221065521240234, + "learning_rate": 4.429938118584623e-05, + "loss": 1.6176, + "step": 
12260 + }, + { + "epoch": 2.0276802313571576, + "grad_norm": 6.061464309692383, + "learning_rate": 4.4290199966947614e-05, + "loss": 1.5904, + "step": 12270 + }, + { + "epoch": 2.029332782482958, + "grad_norm": 7.372241973876953, + "learning_rate": 4.428101874804899e-05, + "loss": 1.5756, + "step": 12280 + }, + { + "epoch": 2.0309853336087587, + "grad_norm": 5.46327543258667, + "learning_rate": 4.427183752915037e-05, + "loss": 1.6985, + "step": 12290 + }, + { + "epoch": 2.0326378847345588, + "grad_norm": 8.150008201599121, + "learning_rate": 4.426265631025175e-05, + "loss": 1.6157, + "step": 12300 + }, + { + "epoch": 2.0342904358603593, + "grad_norm": 7.626075267791748, + "learning_rate": 4.425347509135313e-05, + "loss": 1.5679, + "step": 12310 + }, + { + "epoch": 2.03594298698616, + "grad_norm": 14.908061981201172, + "learning_rate": 4.424429387245451e-05, + "loss": 1.548, + "step": 12320 + }, + { + "epoch": 2.0375955381119604, + "grad_norm": 7.012806415557861, + "learning_rate": 4.423511265355589e-05, + "loss": 1.7013, + "step": 12330 + }, + { + "epoch": 2.039248089237761, + "grad_norm": 5.985696792602539, + "learning_rate": 4.422593143465727e-05, + "loss": 1.543, + "step": 12340 + }, + { + "epoch": 2.0409006403635614, + "grad_norm": 20.78647232055664, + "learning_rate": 4.421675021575865e-05, + "loss": 1.6405, + "step": 12350 + }, + { + "epoch": 2.0425531914893615, + "grad_norm": 11.975393295288086, + "learning_rate": 4.4207568996860024e-05, + "loss": 1.6962, + "step": 12360 + }, + { + "epoch": 2.044205742615162, + "grad_norm": 8.53466796875, + "learning_rate": 4.4198387777961406e-05, + "loss": 1.6913, + "step": 12370 + }, + { + "epoch": 2.0458582937409626, + "grad_norm": 9.030325889587402, + "learning_rate": 4.418920655906278e-05, + "loss": 1.578, + "step": 12380 + }, + { + "epoch": 2.047510844866763, + "grad_norm": 21.684886932373047, + "learning_rate": 4.418002534016416e-05, + "loss": 1.5179, + "step": 12390 + }, + { + "epoch": 2.0491633959925637, + "grad_norm": 13.426318168640137, + "learning_rate": 4.417084412126554e-05, + "loss": 1.637, + "step": 12400 + }, + { + "epoch": 2.050815947118364, + "grad_norm": 6.491002082824707, + "learning_rate": 4.4161662902366917e-05, + "loss": 1.5211, + "step": 12410 + }, + { + "epoch": 2.0524684982441643, + "grad_norm": 8.91051197052002, + "learning_rate": 4.41524816834683e-05, + "loss": 1.6806, + "step": 12420 + }, + { + "epoch": 2.054121049369965, + "grad_norm": 26.088380813598633, + "learning_rate": 4.4143300464569675e-05, + "loss": 1.6835, + "step": 12430 + }, + { + "epoch": 2.0557736004957654, + "grad_norm": 9.795445442199707, + "learning_rate": 4.413411924567106e-05, + "loss": 1.7466, + "step": 12440 + }, + { + "epoch": 2.057426151621566, + "grad_norm": 16.947595596313477, + "learning_rate": 4.412493802677244e-05, + "loss": 1.6179, + "step": 12450 + }, + { + "epoch": 2.059078702747366, + "grad_norm": 14.341743469238281, + "learning_rate": 4.4115756807873816e-05, + "loss": 1.6692, + "step": 12460 + }, + { + "epoch": 2.0607312538731666, + "grad_norm": 8.210740089416504, + "learning_rate": 4.41065755889752e-05, + "loss": 1.5425, + "step": 12470 + }, + { + "epoch": 2.062383804998967, + "grad_norm": 16.021188735961914, + "learning_rate": 4.4097394370076575e-05, + "loss": 1.466, + "step": 12480 + }, + { + "epoch": 2.0640363561247677, + "grad_norm": 25.511781692504883, + "learning_rate": 4.408821315117795e-05, + "loss": 1.6227, + "step": 12490 + }, + { + "epoch": 2.065688907250568, + "grad_norm": 10.479546546936035, + "learning_rate": 
4.407903193227933e-05, + "loss": 1.6323, + "step": 12500 + }, + { + "epoch": 2.0673414583763687, + "grad_norm": 29.294937133789062, + "learning_rate": 4.406985071338071e-05, + "loss": 1.7135, + "step": 12510 + }, + { + "epoch": 2.068994009502169, + "grad_norm": 9.334646224975586, + "learning_rate": 4.4060669494482085e-05, + "loss": 1.4985, + "step": 12520 + }, + { + "epoch": 2.0706465606279694, + "grad_norm": 6.601807117462158, + "learning_rate": 4.405148827558347e-05, + "loss": 1.6179, + "step": 12530 + }, + { + "epoch": 2.07229911175377, + "grad_norm": 39.724403381347656, + "learning_rate": 4.4042307056684843e-05, + "loss": 1.5593, + "step": 12540 + }, + { + "epoch": 2.0739516628795704, + "grad_norm": 21.564411163330078, + "learning_rate": 4.4033125837786226e-05, + "loss": 1.6602, + "step": 12550 + }, + { + "epoch": 2.075604214005371, + "grad_norm": 9.211846351623535, + "learning_rate": 4.402394461888761e-05, + "loss": 1.5664, + "step": 12560 + }, + { + "epoch": 2.077256765131171, + "grad_norm": 86.83989715576172, + "learning_rate": 4.4014763399988985e-05, + "loss": 1.6355, + "step": 12570 + }, + { + "epoch": 2.0789093162569716, + "grad_norm": 15.011252403259277, + "learning_rate": 4.400558218109037e-05, + "loss": 1.7233, + "step": 12580 + }, + { + "epoch": 2.080561867382772, + "grad_norm": 22.9763240814209, + "learning_rate": 4.399640096219174e-05, + "loss": 1.6327, + "step": 12590 + }, + { + "epoch": 2.0822144185085727, + "grad_norm": 9.363661766052246, + "learning_rate": 4.3987219743293126e-05, + "loss": 1.6782, + "step": 12600 + }, + { + "epoch": 2.0838669696343732, + "grad_norm": 10.958879470825195, + "learning_rate": 4.39780385243945e-05, + "loss": 1.7072, + "step": 12610 + }, + { + "epoch": 2.0855195207601733, + "grad_norm": 6.636941432952881, + "learning_rate": 4.3968857305495884e-05, + "loss": 1.5247, + "step": 12620 + }, + { + "epoch": 2.087172071885974, + "grad_norm": 74.40653228759766, + "learning_rate": 4.395967608659726e-05, + "loss": 1.5546, + "step": 12630 + }, + { + "epoch": 2.0888246230117744, + "grad_norm": 9.923933029174805, + "learning_rate": 4.3950494867698636e-05, + "loss": 1.5428, + "step": 12640 + }, + { + "epoch": 2.090477174137575, + "grad_norm": 8.562093734741211, + "learning_rate": 4.394131364880001e-05, + "loss": 1.5351, + "step": 12650 + }, + { + "epoch": 2.0921297252633755, + "grad_norm": 46.868709564208984, + "learning_rate": 4.3932132429901394e-05, + "loss": 1.6106, + "step": 12660 + }, + { + "epoch": 2.0937822763891756, + "grad_norm": 5.111778259277344, + "learning_rate": 4.392295121100278e-05, + "loss": 1.5635, + "step": 12670 + }, + { + "epoch": 2.095434827514976, + "grad_norm": 9.51055908203125, + "learning_rate": 4.391376999210415e-05, + "loss": 1.5957, + "step": 12680 + }, + { + "epoch": 2.0970873786407767, + "grad_norm": 14.892036437988281, + "learning_rate": 4.3904588773205535e-05, + "loss": 1.6939, + "step": 12690 + }, + { + "epoch": 2.098739929766577, + "grad_norm": 6.7437567710876465, + "learning_rate": 4.389540755430691e-05, + "loss": 1.5671, + "step": 12700 + }, + { + "epoch": 2.1003924808923777, + "grad_norm": 34.27804946899414, + "learning_rate": 4.3886226335408294e-05, + "loss": 1.6071, + "step": 12710 + }, + { + "epoch": 2.1020450320181783, + "grad_norm": 13.502313613891602, + "learning_rate": 4.387704511650967e-05, + "loss": 1.5568, + "step": 12720 + }, + { + "epoch": 2.1036975831439784, + "grad_norm": 12.005770683288574, + "learning_rate": 4.386786389761105e-05, + "loss": 1.563, + "step": 12730 + }, + { + "epoch": 2.105350134269779, + 
"grad_norm": 8.112605094909668, + "learning_rate": 4.385868267871243e-05, + "loss": 1.7326, + "step": 12740 + }, + { + "epoch": 2.1070026853955794, + "grad_norm": 6.765664100646973, + "learning_rate": 4.384950145981381e-05, + "loss": 1.6562, + "step": 12750 + }, + { + "epoch": 2.10865523652138, + "grad_norm": 12.570945739746094, + "learning_rate": 4.384032024091519e-05, + "loss": 1.6129, + "step": 12760 + }, + { + "epoch": 2.1103077876471805, + "grad_norm": 10.410295486450195, + "learning_rate": 4.383113902201656e-05, + "loss": 1.6615, + "step": 12770 + }, + { + "epoch": 2.1119603387729806, + "grad_norm": 25.10518455505371, + "learning_rate": 4.3821957803117945e-05, + "loss": 1.7272, + "step": 12780 + }, + { + "epoch": 2.113612889898781, + "grad_norm": 19.222471237182617, + "learning_rate": 4.381277658421932e-05, + "loss": 1.5842, + "step": 12790 + }, + { + "epoch": 2.1152654410245817, + "grad_norm": 9.53968334197998, + "learning_rate": 4.3803595365320704e-05, + "loss": 1.5341, + "step": 12800 + }, + { + "epoch": 2.1169179921503822, + "grad_norm": 11.091026306152344, + "learning_rate": 4.379441414642208e-05, + "loss": 1.7523, + "step": 12810 + }, + { + "epoch": 2.1185705432761828, + "grad_norm": 5.745179653167725, + "learning_rate": 4.378523292752346e-05, + "loss": 1.6201, + "step": 12820 + }, + { + "epoch": 2.120223094401983, + "grad_norm": 21.147724151611328, + "learning_rate": 4.377605170862484e-05, + "loss": 1.5084, + "step": 12830 + }, + { + "epoch": 2.1218756455277834, + "grad_norm": 5.363860130310059, + "learning_rate": 4.376687048972622e-05, + "loss": 1.6167, + "step": 12840 + }, + { + "epoch": 2.123528196653584, + "grad_norm": 5.5015459060668945, + "learning_rate": 4.37576892708276e-05, + "loss": 1.5093, + "step": 12850 + }, + { + "epoch": 2.1251807477793845, + "grad_norm": 6.91679573059082, + "learning_rate": 4.374850805192898e-05, + "loss": 1.5668, + "step": 12860 + }, + { + "epoch": 2.126833298905185, + "grad_norm": 15.651091575622559, + "learning_rate": 4.3739326833030355e-05, + "loss": 1.6675, + "step": 12870 + }, + { + "epoch": 2.1284858500309856, + "grad_norm": 9.818643569946289, + "learning_rate": 4.373014561413174e-05, + "loss": 1.5916, + "step": 12880 + }, + { + "epoch": 2.1301384011567857, + "grad_norm": 11.20659065246582, + "learning_rate": 4.3720964395233114e-05, + "loss": 1.6256, + "step": 12890 + }, + { + "epoch": 2.131790952282586, + "grad_norm": 9.242450714111328, + "learning_rate": 4.371178317633449e-05, + "loss": 1.5514, + "step": 12900 + }, + { + "epoch": 2.1334435034083867, + "grad_norm": 11.752909660339355, + "learning_rate": 4.370260195743587e-05, + "loss": 1.6745, + "step": 12910 + }, + { + "epoch": 2.1350960545341873, + "grad_norm": 19.809019088745117, + "learning_rate": 4.369342073853725e-05, + "loss": 1.537, + "step": 12920 + }, + { + "epoch": 2.136748605659988, + "grad_norm": 13.41193962097168, + "learning_rate": 4.368423951963863e-05, + "loss": 1.5797, + "step": 12930 + }, + { + "epoch": 2.138401156785788, + "grad_norm": 17.19065284729004, + "learning_rate": 4.3675058300740007e-05, + "loss": 1.6646, + "step": 12940 + }, + { + "epoch": 2.1400537079115884, + "grad_norm": 32.37812423706055, + "learning_rate": 4.366587708184139e-05, + "loss": 1.6777, + "step": 12950 + }, + { + "epoch": 2.141706259037389, + "grad_norm": 14.590213775634766, + "learning_rate": 4.3656695862942765e-05, + "loss": 1.582, + "step": 12960 + }, + { + "epoch": 2.1433588101631895, + "grad_norm": 11.580479621887207, + "learning_rate": 4.364751464404415e-05, + "loss": 1.5257, + "step": 
12970 + }, + { + "epoch": 2.14501136128899, + "grad_norm": 13.083703994750977, + "learning_rate": 4.3638333425145524e-05, + "loss": 1.6449, + "step": 12980 + }, + { + "epoch": 2.14666391241479, + "grad_norm": 44.948280334472656, + "learning_rate": 4.3629152206246906e-05, + "loss": 1.6575, + "step": 12990 + }, + { + "epoch": 2.1483164635405907, + "grad_norm": 12.724278450012207, + "learning_rate": 4.361997098734828e-05, + "loss": 1.576, + "step": 13000 + }, + { + "epoch": 2.1499690146663912, + "grad_norm": 14.588871002197266, + "learning_rate": 4.3610789768449665e-05, + "loss": 1.5038, + "step": 13010 + }, + { + "epoch": 2.1516215657921918, + "grad_norm": 47.780120849609375, + "learning_rate": 4.360160854955104e-05, + "loss": 1.6169, + "step": 13020 + }, + { + "epoch": 2.1532741169179923, + "grad_norm": 303.17950439453125, + "learning_rate": 4.3592427330652416e-05, + "loss": 1.4563, + "step": 13030 + }, + { + "epoch": 2.1549266680437924, + "grad_norm": 8.716339111328125, + "learning_rate": 4.35832461117538e-05, + "loss": 1.5758, + "step": 13040 + }, + { + "epoch": 2.156579219169593, + "grad_norm": 11.586296081542969, + "learning_rate": 4.3574064892855175e-05, + "loss": 1.6525, + "step": 13050 + }, + { + "epoch": 2.1582317702953935, + "grad_norm": 20.37527847290039, + "learning_rate": 4.356488367395656e-05, + "loss": 1.6296, + "step": 13060 + }, + { + "epoch": 2.159884321421194, + "grad_norm": 258.34210205078125, + "learning_rate": 4.355570245505793e-05, + "loss": 1.6202, + "step": 13070 + }, + { + "epoch": 2.1615368725469946, + "grad_norm": 8.469282150268555, + "learning_rate": 4.3546521236159316e-05, + "loss": 1.6968, + "step": 13080 + }, + { + "epoch": 2.1631894236727947, + "grad_norm": 16.505489349365234, + "learning_rate": 4.353734001726069e-05, + "loss": 1.6207, + "step": 13090 + }, + { + "epoch": 2.164841974798595, + "grad_norm": 12.483043670654297, + "learning_rate": 4.3528158798362074e-05, + "loss": 1.674, + "step": 13100 + }, + { + "epoch": 2.1664945259243957, + "grad_norm": 11.628996849060059, + "learning_rate": 4.351897757946345e-05, + "loss": 1.6864, + "step": 13110 + }, + { + "epoch": 2.1681470770501963, + "grad_norm": 9.3779878616333, + "learning_rate": 4.350979636056483e-05, + "loss": 1.5968, + "step": 13120 + }, + { + "epoch": 2.169799628175997, + "grad_norm": 7.350371360778809, + "learning_rate": 4.350061514166621e-05, + "loss": 1.5171, + "step": 13130 + }, + { + "epoch": 2.1714521793017973, + "grad_norm": 17.027692794799805, + "learning_rate": 4.349143392276759e-05, + "loss": 1.7044, + "step": 13140 + }, + { + "epoch": 2.1731047304275974, + "grad_norm": 6.943367958068848, + "learning_rate": 4.348225270386897e-05, + "loss": 1.585, + "step": 13150 + }, + { + "epoch": 2.174757281553398, + "grad_norm": 14.061978340148926, + "learning_rate": 4.347307148497034e-05, + "loss": 1.5373, + "step": 13160 + }, + { + "epoch": 2.1764098326791985, + "grad_norm": 10.38961410522461, + "learning_rate": 4.3463890266071726e-05, + "loss": 1.7171, + "step": 13170 + }, + { + "epoch": 2.178062383804999, + "grad_norm": 33.14574432373047, + "learning_rate": 4.34547090471731e-05, + "loss": 1.5202, + "step": 13180 + }, + { + "epoch": 2.1797149349307996, + "grad_norm": 34.48433303833008, + "learning_rate": 4.3445527828274484e-05, + "loss": 1.671, + "step": 13190 + }, + { + "epoch": 2.1813674860565997, + "grad_norm": 8.59406566619873, + "learning_rate": 4.343634660937586e-05, + "loss": 1.6391, + "step": 13200 + }, + { + "epoch": 2.1830200371824002, + "grad_norm": 9.573761940002441, + "learning_rate": 
4.342716539047724e-05, + "loss": 1.6025, + "step": 13210 + }, + { + "epoch": 2.1846725883082008, + "grad_norm": 11.121373176574707, + "learning_rate": 4.341798417157862e-05, + "loss": 1.5423, + "step": 13220 + }, + { + "epoch": 2.1863251394340013, + "grad_norm": 8.483185768127441, + "learning_rate": 4.340880295268e-05, + "loss": 1.6423, + "step": 13230 + }, + { + "epoch": 2.187977690559802, + "grad_norm": 11.853544235229492, + "learning_rate": 4.3399621733781384e-05, + "loss": 1.6377, + "step": 13240 + }, + { + "epoch": 2.189630241685602, + "grad_norm": 6.522881031036377, + "learning_rate": 4.339044051488276e-05, + "loss": 1.5402, + "step": 13250 + }, + { + "epoch": 2.1912827928114025, + "grad_norm": 14.069580078125, + "learning_rate": 4.338125929598414e-05, + "loss": 1.5593, + "step": 13260 + }, + { + "epoch": 2.192935343937203, + "grad_norm": 7.207234859466553, + "learning_rate": 4.337207807708552e-05, + "loss": 1.6057, + "step": 13270 + }, + { + "epoch": 2.1945878950630036, + "grad_norm": 11.689896583557129, + "learning_rate": 4.3362896858186894e-05, + "loss": 1.5226, + "step": 13280 + }, + { + "epoch": 2.196240446188804, + "grad_norm": 57.798797607421875, + "learning_rate": 4.335371563928827e-05, + "loss": 1.5292, + "step": 13290 + }, + { + "epoch": 2.1978929973146046, + "grad_norm": 15.47495174407959, + "learning_rate": 4.334453442038965e-05, + "loss": 1.5943, + "step": 13300 + }, + { + "epoch": 2.1995455484404047, + "grad_norm": 6.988327980041504, + "learning_rate": 4.333535320149103e-05, + "loss": 1.5755, + "step": 13310 + }, + { + "epoch": 2.2011980995662053, + "grad_norm": 10.670233726501465, + "learning_rate": 4.332617198259241e-05, + "loss": 1.6494, + "step": 13320 + }, + { + "epoch": 2.202850650692006, + "grad_norm": 9.843840599060059, + "learning_rate": 4.331699076369379e-05, + "loss": 1.6159, + "step": 13330 + }, + { + "epoch": 2.2045032018178063, + "grad_norm": 7.6408233642578125, + "learning_rate": 4.330780954479517e-05, + "loss": 1.6032, + "step": 13340 + }, + { + "epoch": 2.206155752943607, + "grad_norm": 7.837189197540283, + "learning_rate": 4.329862832589655e-05, + "loss": 1.5953, + "step": 13350 + }, + { + "epoch": 2.207808304069407, + "grad_norm": 14.017073631286621, + "learning_rate": 4.328944710699793e-05, + "loss": 1.5831, + "step": 13360 + }, + { + "epoch": 2.2094608551952075, + "grad_norm": 19.07843017578125, + "learning_rate": 4.328026588809931e-05, + "loss": 1.5628, + "step": 13370 + }, + { + "epoch": 2.211113406321008, + "grad_norm": 8.755454063415527, + "learning_rate": 4.327108466920069e-05, + "loss": 1.6262, + "step": 13380 + }, + { + "epoch": 2.2127659574468086, + "grad_norm": 41.75136184692383, + "learning_rate": 4.326190345030207e-05, + "loss": 1.6595, + "step": 13390 + }, + { + "epoch": 2.214418508572609, + "grad_norm": 10.803937911987305, + "learning_rate": 4.3252722231403445e-05, + "loss": 1.7103, + "step": 13400 + }, + { + "epoch": 2.2160710596984092, + "grad_norm": 64.1683349609375, + "learning_rate": 4.324354101250482e-05, + "loss": 1.6162, + "step": 13410 + }, + { + "epoch": 2.2177236108242098, + "grad_norm": 14.838408470153809, + "learning_rate": 4.32343597936062e-05, + "loss": 1.6336, + "step": 13420 + }, + { + "epoch": 2.2193761619500103, + "grad_norm": 15.170886039733887, + "learning_rate": 4.322517857470758e-05, + "loss": 1.4815, + "step": 13430 + }, + { + "epoch": 2.221028713075811, + "grad_norm": 13.091402053833008, + "learning_rate": 4.3215997355808955e-05, + "loss": 1.6045, + "step": 13440 + }, + { + "epoch": 2.2226812642016114, + 
"grad_norm": 8.757658958435059, + "learning_rate": 4.320681613691034e-05, + "loss": 1.7558, + "step": 13450 + }, + { + "epoch": 2.2243338153274115, + "grad_norm": 12.71592903137207, + "learning_rate": 4.3197634918011714e-05, + "loss": 1.581, + "step": 13460 + }, + { + "epoch": 2.225986366453212, + "grad_norm": 18.19761085510254, + "learning_rate": 4.3188453699113096e-05, + "loss": 1.5034, + "step": 13470 + }, + { + "epoch": 2.2276389175790126, + "grad_norm": 8.124272346496582, + "learning_rate": 4.317927248021448e-05, + "loss": 1.561, + "step": 13480 + }, + { + "epoch": 2.229291468704813, + "grad_norm": 6.5509796142578125, + "learning_rate": 4.3170091261315855e-05, + "loss": 1.709, + "step": 13490 + }, + { + "epoch": 2.2309440198306136, + "grad_norm": 34.16242599487305, + "learning_rate": 4.316091004241724e-05, + "loss": 1.5383, + "step": 13500 + }, + { + "epoch": 2.2325965709564137, + "grad_norm": 60.66366195678711, + "learning_rate": 4.3151728823518613e-05, + "loss": 1.6962, + "step": 13510 + }, + { + "epoch": 2.2342491220822143, + "grad_norm": 8.734251976013184, + "learning_rate": 4.3142547604619996e-05, + "loss": 1.5747, + "step": 13520 + }, + { + "epoch": 2.235901673208015, + "grad_norm": 7.926821231842041, + "learning_rate": 4.313336638572137e-05, + "loss": 1.5384, + "step": 13530 + }, + { + "epoch": 2.2375542243338153, + "grad_norm": 13.231529235839844, + "learning_rate": 4.312418516682275e-05, + "loss": 1.6822, + "step": 13540 + }, + { + "epoch": 2.239206775459616, + "grad_norm": 19.836008071899414, + "learning_rate": 4.3115003947924124e-05, + "loss": 1.5861, + "step": 13550 + }, + { + "epoch": 2.2408593265854164, + "grad_norm": 9.528962135314941, + "learning_rate": 4.3105822729025506e-05, + "loss": 1.7453, + "step": 13560 + }, + { + "epoch": 2.2425118777112165, + "grad_norm": 25.385387420654297, + "learning_rate": 4.309664151012688e-05, + "loss": 1.5686, + "step": 13570 + }, + { + "epoch": 2.244164428837017, + "grad_norm": 7.8074870109558105, + "learning_rate": 4.3087460291228265e-05, + "loss": 1.6753, + "step": 13580 + }, + { + "epoch": 2.2458169799628176, + "grad_norm": 142.185302734375, + "learning_rate": 4.307827907232965e-05, + "loss": 1.5779, + "step": 13590 + }, + { + "epoch": 2.247469531088618, + "grad_norm": 24.071073532104492, + "learning_rate": 4.306909785343102e-05, + "loss": 1.5333, + "step": 13600 + }, + { + "epoch": 2.2491220822144187, + "grad_norm": 16.71168327331543, + "learning_rate": 4.3059916634532406e-05, + "loss": 1.6225, + "step": 13610 + }, + { + "epoch": 2.250774633340219, + "grad_norm": 14.29904556274414, + "learning_rate": 4.305073541563378e-05, + "loss": 1.7723, + "step": 13620 + }, + { + "epoch": 2.2524271844660193, + "grad_norm": 9.134810447692871, + "learning_rate": 4.3041554196735164e-05, + "loss": 1.5871, + "step": 13630 + }, + { + "epoch": 2.25407973559182, + "grad_norm": 15.202738761901855, + "learning_rate": 4.303237297783654e-05, + "loss": 1.6463, + "step": 13640 + }, + { + "epoch": 2.2557322867176204, + "grad_norm": 61.93202209472656, + "learning_rate": 4.302319175893792e-05, + "loss": 1.7039, + "step": 13650 + }, + { + "epoch": 2.257384837843421, + "grad_norm": 48.4030647277832, + "learning_rate": 4.30140105400393e-05, + "loss": 1.6503, + "step": 13660 + }, + { + "epoch": 2.2590373889692215, + "grad_norm": 17.932994842529297, + "learning_rate": 4.3004829321140675e-05, + "loss": 1.5064, + "step": 13670 + }, + { + "epoch": 2.2606899400950216, + "grad_norm": 12.965964317321777, + "learning_rate": 4.299564810224205e-05, + "loss": 1.5113, + 
"step": 13680 + }, + { + "epoch": 2.262342491220822, + "grad_norm": 34.27570724487305, + "learning_rate": 4.298646688334343e-05, + "loss": 1.5163, + "step": 13690 + }, + { + "epoch": 2.2639950423466226, + "grad_norm": 31.845481872558594, + "learning_rate": 4.2977285664444816e-05, + "loss": 1.5038, + "step": 13700 + }, + { + "epoch": 2.265647593472423, + "grad_norm": 10.161199569702148, + "learning_rate": 4.296810444554619e-05, + "loss": 1.4943, + "step": 13710 + }, + { + "epoch": 2.2673001445982237, + "grad_norm": 14.812753677368164, + "learning_rate": 4.2958923226647574e-05, + "loss": 1.5093, + "step": 13720 + }, + { + "epoch": 2.268952695724024, + "grad_norm": 8.536107063293457, + "learning_rate": 4.294974200774895e-05, + "loss": 1.5917, + "step": 13730 + }, + { + "epoch": 2.2706052468498243, + "grad_norm": 12.14443302154541, + "learning_rate": 4.294056078885033e-05, + "loss": 1.6145, + "step": 13740 + }, + { + "epoch": 2.272257797975625, + "grad_norm": 9.584919929504395, + "learning_rate": 4.293137956995171e-05, + "loss": 1.5674, + "step": 13750 + }, + { + "epoch": 2.2739103491014254, + "grad_norm": 8.43072509765625, + "learning_rate": 4.292219835105309e-05, + "loss": 1.6024, + "step": 13760 + }, + { + "epoch": 2.275562900227226, + "grad_norm": 10.97778606414795, + "learning_rate": 4.291301713215447e-05, + "loss": 1.7366, + "step": 13770 + }, + { + "epoch": 2.277215451353026, + "grad_norm": 7.326660633087158, + "learning_rate": 4.290383591325585e-05, + "loss": 1.632, + "step": 13780 + }, + { + "epoch": 2.2788680024788266, + "grad_norm": 6.518946170806885, + "learning_rate": 4.2894654694357226e-05, + "loss": 1.5364, + "step": 13790 + }, + { + "epoch": 2.280520553604627, + "grad_norm": 28.530855178833008, + "learning_rate": 4.28854734754586e-05, + "loss": 1.7219, + "step": 13800 + }, + { + "epoch": 2.2821731047304277, + "grad_norm": 14.148917198181152, + "learning_rate": 4.2876292256559984e-05, + "loss": 1.6488, + "step": 13810 + }, + { + "epoch": 2.283825655856228, + "grad_norm": 40.1848030090332, + "learning_rate": 4.286711103766136e-05, + "loss": 1.6645, + "step": 13820 + }, + { + "epoch": 2.2854782069820283, + "grad_norm": 7.235057830810547, + "learning_rate": 4.285792981876274e-05, + "loss": 1.584, + "step": 13830 + }, + { + "epoch": 2.287130758107829, + "grad_norm": 10.927480697631836, + "learning_rate": 4.284874859986412e-05, + "loss": 1.612, + "step": 13840 + }, + { + "epoch": 2.2887833092336294, + "grad_norm": 9.236492156982422, + "learning_rate": 4.28395673809655e-05, + "loss": 1.6889, + "step": 13850 + }, + { + "epoch": 2.29043586035943, + "grad_norm": 13.05184555053711, + "learning_rate": 4.283038616206688e-05, + "loss": 1.564, + "step": 13860 + }, + { + "epoch": 2.2920884114852305, + "grad_norm": 17.070354461669922, + "learning_rate": 4.282120494316826e-05, + "loss": 1.5551, + "step": 13870 + }, + { + "epoch": 2.2937409626110306, + "grad_norm": 11.861235618591309, + "learning_rate": 4.2812023724269635e-05, + "loss": 1.763, + "step": 13880 + }, + { + "epoch": 2.295393513736831, + "grad_norm": 10.376340866088867, + "learning_rate": 4.280284250537102e-05, + "loss": 1.6106, + "step": 13890 + }, + { + "epoch": 2.2970460648626316, + "grad_norm": 77.84674835205078, + "learning_rate": 4.2793661286472394e-05, + "loss": 1.6489, + "step": 13900 + }, + { + "epoch": 2.298698615988432, + "grad_norm": 8.317307472229004, + "learning_rate": 4.2784480067573777e-05, + "loss": 1.6459, + "step": 13910 + }, + { + "epoch": 2.3003511671142327, + "grad_norm": 9.242462158203125, + "learning_rate": 
4.277529884867515e-05, + "loss": 1.5626, + "step": 13920 + }, + { + "epoch": 2.302003718240033, + "grad_norm": 9.533720016479492, + "learning_rate": 4.276611762977653e-05, + "loss": 1.6698, + "step": 13930 + }, + { + "epoch": 2.3036562693658333, + "grad_norm": 8.523353576660156, + "learning_rate": 4.275693641087791e-05, + "loss": 1.6613, + "step": 13940 + }, + { + "epoch": 2.305308820491634, + "grad_norm": 7.527460098266602, + "learning_rate": 4.274775519197929e-05, + "loss": 1.5093, + "step": 13950 + }, + { + "epoch": 2.3069613716174344, + "grad_norm": 7.213209629058838, + "learning_rate": 4.273857397308067e-05, + "loss": 1.6411, + "step": 13960 + }, + { + "epoch": 2.308613922743235, + "grad_norm": 8.763940811157227, + "learning_rate": 4.2729392754182045e-05, + "loss": 1.666, + "step": 13970 + }, + { + "epoch": 2.3102664738690355, + "grad_norm": 26.79812240600586, + "learning_rate": 4.272021153528343e-05, + "loss": 1.5755, + "step": 13980 + }, + { + "epoch": 2.3119190249948356, + "grad_norm": 20.0844669342041, + "learning_rate": 4.2711030316384804e-05, + "loss": 1.6604, + "step": 13990 + }, + { + "epoch": 2.313571576120636, + "grad_norm": 9.126409530639648, + "learning_rate": 4.2701849097486186e-05, + "loss": 1.5742, + "step": 14000 + }, + { + "epoch": 2.3152241272464367, + "grad_norm": 24.678070068359375, + "learning_rate": 4.269266787858756e-05, + "loss": 1.5018, + "step": 14010 + }, + { + "epoch": 2.316876678372237, + "grad_norm": 6.996845245361328, + "learning_rate": 4.2683486659688945e-05, + "loss": 1.5528, + "step": 14020 + }, + { + "epoch": 2.3185292294980377, + "grad_norm": 7.3137617111206055, + "learning_rate": 4.267430544079032e-05, + "loss": 1.54, + "step": 14030 + }, + { + "epoch": 2.3201817806238383, + "grad_norm": 10.51693344116211, + "learning_rate": 4.2665124221891703e-05, + "loss": 1.7331, + "step": 14040 + }, + { + "epoch": 2.3218343317496384, + "grad_norm": 9.240628242492676, + "learning_rate": 4.265594300299308e-05, + "loss": 1.5992, + "step": 14050 + }, + { + "epoch": 2.323486882875439, + "grad_norm": 34.22092056274414, + "learning_rate": 4.2646761784094455e-05, + "loss": 1.5359, + "step": 14060 + }, + { + "epoch": 2.3251394340012395, + "grad_norm": 8.278003692626953, + "learning_rate": 4.263758056519584e-05, + "loss": 1.587, + "step": 14070 + }, + { + "epoch": 2.32679198512704, + "grad_norm": 9.079841613769531, + "learning_rate": 4.2628399346297214e-05, + "loss": 1.615, + "step": 14080 + }, + { + "epoch": 2.3284445362528405, + "grad_norm": 130.9219512939453, + "learning_rate": 4.2619218127398596e-05, + "loss": 1.6647, + "step": 14090 + }, + { + "epoch": 2.3300970873786406, + "grad_norm": 8.039260864257812, + "learning_rate": 4.261003690849997e-05, + "loss": 1.5117, + "step": 14100 + }, + { + "epoch": 2.331749638504441, + "grad_norm": 20.554414749145508, + "learning_rate": 4.2600855689601355e-05, + "loss": 1.659, + "step": 14110 + }, + { + "epoch": 2.3334021896302417, + "grad_norm": 16.2318172454834, + "learning_rate": 4.259167447070273e-05, + "loss": 1.7476, + "step": 14120 + }, + { + "epoch": 2.3350547407560422, + "grad_norm": 14.719922065734863, + "learning_rate": 4.258249325180411e-05, + "loss": 1.6945, + "step": 14130 + }, + { + "epoch": 2.336707291881843, + "grad_norm": 19.263084411621094, + "learning_rate": 4.257331203290549e-05, + "loss": 1.6284, + "step": 14140 + }, + { + "epoch": 2.338359843007643, + "grad_norm": 8.643294334411621, + "learning_rate": 4.256413081400687e-05, + "loss": 1.568, + "step": 14150 + }, + { + "epoch": 2.3400123941334434, + 
"grad_norm": 7.2459797859191895, + "learning_rate": 4.2554949595108254e-05, + "loss": 1.6159, + "step": 14160 + }, + { + "epoch": 2.341664945259244, + "grad_norm": 9.25794792175293, + "learning_rate": 4.254576837620963e-05, + "loss": 1.548, + "step": 14170 + }, + { + "epoch": 2.3433174963850445, + "grad_norm": 39.79923629760742, + "learning_rate": 4.2536587157311006e-05, + "loss": 1.5182, + "step": 14180 + }, + { + "epoch": 2.344970047510845, + "grad_norm": 10.711408615112305, + "learning_rate": 4.252740593841238e-05, + "loss": 1.5869, + "step": 14190 + }, + { + "epoch": 2.346622598636645, + "grad_norm": 4.972466945648193, + "learning_rate": 4.2518224719513765e-05, + "loss": 1.6212, + "step": 14200 + }, + { + "epoch": 2.3482751497624457, + "grad_norm": 23.379873275756836, + "learning_rate": 4.250904350061514e-05, + "loss": 1.581, + "step": 14210 + }, + { + "epoch": 2.349927700888246, + "grad_norm": 9.372620582580566, + "learning_rate": 4.249986228171652e-05, + "loss": 1.6083, + "step": 14220 + }, + { + "epoch": 2.3515802520140467, + "grad_norm": 7.492948532104492, + "learning_rate": 4.24906810628179e-05, + "loss": 1.5183, + "step": 14230 + }, + { + "epoch": 2.3532328031398473, + "grad_norm": 11.669918060302734, + "learning_rate": 4.248149984391928e-05, + "loss": 1.6055, + "step": 14240 + }, + { + "epoch": 2.3548853542656474, + "grad_norm": 14.879387855529785, + "learning_rate": 4.247231862502066e-05, + "loss": 1.5403, + "step": 14250 + }, + { + "epoch": 2.356537905391448, + "grad_norm": 10.409502029418945, + "learning_rate": 4.246313740612204e-05, + "loss": 1.6121, + "step": 14260 + }, + { + "epoch": 2.3581904565172485, + "grad_norm": 12.416383743286133, + "learning_rate": 4.245395618722342e-05, + "loss": 1.7431, + "step": 14270 + }, + { + "epoch": 2.359843007643049, + "grad_norm": 13.863862991333008, + "learning_rate": 4.24447749683248e-05, + "loss": 1.5425, + "step": 14280 + }, + { + "epoch": 2.3614955587688495, + "grad_norm": 10.172608375549316, + "learning_rate": 4.243559374942618e-05, + "loss": 1.5315, + "step": 14290 + }, + { + "epoch": 2.3631481098946496, + "grad_norm": 21.186439514160156, + "learning_rate": 4.242641253052756e-05, + "loss": 1.7509, + "step": 14300 + }, + { + "epoch": 2.36480066102045, + "grad_norm": 14.802777290344238, + "learning_rate": 4.241723131162893e-05, + "loss": 1.6284, + "step": 14310 + }, + { + "epoch": 2.3664532121462507, + "grad_norm": 7.647261142730713, + "learning_rate": 4.240805009273031e-05, + "loss": 1.6912, + "step": 14320 + }, + { + "epoch": 2.3681057632720512, + "grad_norm": 12.083390235900879, + "learning_rate": 4.239886887383169e-05, + "loss": 1.5557, + "step": 14330 + }, + { + "epoch": 2.369758314397852, + "grad_norm": 7.496702194213867, + "learning_rate": 4.238968765493307e-05, + "loss": 1.7399, + "step": 14340 + }, + { + "epoch": 2.3714108655236523, + "grad_norm": 13.159797668457031, + "learning_rate": 4.238050643603445e-05, + "loss": 1.5162, + "step": 14350 + }, + { + "epoch": 2.3730634166494524, + "grad_norm": 6.241695404052734, + "learning_rate": 4.2371325217135826e-05, + "loss": 1.6829, + "step": 14360 + }, + { + "epoch": 2.374715967775253, + "grad_norm": 10.561747550964355, + "learning_rate": 4.236214399823721e-05, + "loss": 1.6487, + "step": 14370 + }, + { + "epoch": 2.3763685189010535, + "grad_norm": 24.415855407714844, + "learning_rate": 4.235296277933859e-05, + "loss": 1.7504, + "step": 14380 + }, + { + "epoch": 2.378021070026854, + "grad_norm": 6.549388885498047, + "learning_rate": 4.234378156043997e-05, + "loss": 1.4467, + "step": 
14390 + }, + { + "epoch": 2.3796736211526546, + "grad_norm": 8.12175464630127, + "learning_rate": 4.233460034154135e-05, + "loss": 1.6034, + "step": 14400 + }, + { + "epoch": 2.381326172278455, + "grad_norm": 10.860891342163086, + "learning_rate": 4.2325419122642725e-05, + "loss": 1.5926, + "step": 14410 + }, + { + "epoch": 2.382978723404255, + "grad_norm": 5.750729560852051, + "learning_rate": 4.231623790374411e-05, + "loss": 1.5773, + "step": 14420 + }, + { + "epoch": 2.3846312745300557, + "grad_norm": 4.9887871742248535, + "learning_rate": 4.2307056684845484e-05, + "loss": 1.5499, + "step": 14430 + }, + { + "epoch": 2.3862838256558563, + "grad_norm": 15.247535705566406, + "learning_rate": 4.229787546594686e-05, + "loss": 1.6275, + "step": 14440 + }, + { + "epoch": 2.387936376781657, + "grad_norm": 5.501838207244873, + "learning_rate": 4.2288694247048236e-05, + "loss": 1.507, + "step": 14450 + }, + { + "epoch": 2.3895889279074574, + "grad_norm": 6.640819549560547, + "learning_rate": 4.227951302814962e-05, + "loss": 1.5051, + "step": 14460 + }, + { + "epoch": 2.3912414790332575, + "grad_norm": 6.430408477783203, + "learning_rate": 4.2270331809250994e-05, + "loss": 1.7827, + "step": 14470 + }, + { + "epoch": 2.392894030159058, + "grad_norm": 10.69271183013916, + "learning_rate": 4.226115059035238e-05, + "loss": 1.6772, + "step": 14480 + }, + { + "epoch": 2.3945465812848585, + "grad_norm": 12.075413703918457, + "learning_rate": 4.225196937145376e-05, + "loss": 1.586, + "step": 14490 + }, + { + "epoch": 2.396199132410659, + "grad_norm": 12.302436828613281, + "learning_rate": 4.2242788152555135e-05, + "loss": 1.7124, + "step": 14500 + }, + { + "epoch": 2.3978516835364596, + "grad_norm": 19.029186248779297, + "learning_rate": 4.223360693365652e-05, + "loss": 1.5647, + "step": 14510 + }, + { + "epoch": 2.3995042346622597, + "grad_norm": 10.742748260498047, + "learning_rate": 4.2224425714757894e-05, + "loss": 1.6184, + "step": 14520 + }, + { + "epoch": 2.4011567857880602, + "grad_norm": 7.062446594238281, + "learning_rate": 4.2215244495859276e-05, + "loss": 1.5524, + "step": 14530 + }, + { + "epoch": 2.402809336913861, + "grad_norm": 9.456984519958496, + "learning_rate": 4.220606327696065e-05, + "loss": 1.4961, + "step": 14540 + }, + { + "epoch": 2.4044618880396613, + "grad_norm": 17.99965476989746, + "learning_rate": 4.2196882058062035e-05, + "loss": 1.5853, + "step": 14550 + }, + { + "epoch": 2.406114439165462, + "grad_norm": 56.12897491455078, + "learning_rate": 4.218770083916341e-05, + "loss": 1.6261, + "step": 14560 + }, + { + "epoch": 2.407766990291262, + "grad_norm": 7.298854827880859, + "learning_rate": 4.217851962026479e-05, + "loss": 1.8109, + "step": 14570 + }, + { + "epoch": 2.4094195414170625, + "grad_norm": 6.553171634674072, + "learning_rate": 4.216933840136616e-05, + "loss": 1.5705, + "step": 14580 + }, + { + "epoch": 2.411072092542863, + "grad_norm": 18.439796447753906, + "learning_rate": 4.2160157182467545e-05, + "loss": 1.4941, + "step": 14590 + }, + { + "epoch": 2.4127246436686636, + "grad_norm": 16.03473663330078, + "learning_rate": 4.215097596356892e-05, + "loss": 1.5356, + "step": 14600 + }, + { + "epoch": 2.414377194794464, + "grad_norm": 23.698720932006836, + "learning_rate": 4.2141794744670304e-05, + "loss": 1.6713, + "step": 14610 + }, + { + "epoch": 2.416029745920264, + "grad_norm": 12.044432640075684, + "learning_rate": 4.2132613525771686e-05, + "loss": 1.6328, + "step": 14620 + }, + { + "epoch": 2.4176822970460647, + "grad_norm": 44.36164093017578, + 
"learning_rate": 4.212343230687306e-05, + "loss": 1.5393, + "step": 14630 + }, + { + "epoch": 2.4193348481718653, + "grad_norm": 8.344030380249023, + "learning_rate": 4.2114251087974445e-05, + "loss": 1.7533, + "step": 14640 + }, + { + "epoch": 2.420987399297666, + "grad_norm": 17.99836540222168, + "learning_rate": 4.210506986907582e-05, + "loss": 1.7182, + "step": 14650 + }, + { + "epoch": 2.4226399504234664, + "grad_norm": 31.487668991088867, + "learning_rate": 4.20958886501772e-05, + "loss": 1.5901, + "step": 14660 + }, + { + "epoch": 2.4242925015492665, + "grad_norm": 10.33210277557373, + "learning_rate": 4.208670743127858e-05, + "loss": 1.5997, + "step": 14670 + }, + { + "epoch": 2.425945052675067, + "grad_norm": 7.618146896362305, + "learning_rate": 4.207752621237996e-05, + "loss": 1.5562, + "step": 14680 + }, + { + "epoch": 2.4275976038008675, + "grad_norm": 39.23401641845703, + "learning_rate": 4.206834499348134e-05, + "loss": 1.5395, + "step": 14690 + }, + { + "epoch": 2.429250154926668, + "grad_norm": 32.31549072265625, + "learning_rate": 4.2059163774582713e-05, + "loss": 1.5347, + "step": 14700 + }, + { + "epoch": 2.4309027060524686, + "grad_norm": 17.82357406616211, + "learning_rate": 4.204998255568409e-05, + "loss": 1.5538, + "step": 14710 + }, + { + "epoch": 2.4325552571782687, + "grad_norm": 9.20028305053711, + "learning_rate": 4.204080133678547e-05, + "loss": 1.6732, + "step": 14720 + }, + { + "epoch": 2.4342078083040692, + "grad_norm": 5.321588039398193, + "learning_rate": 4.2031620117886855e-05, + "loss": 1.5215, + "step": 14730 + }, + { + "epoch": 2.43586035942987, + "grad_norm": 21.302366256713867, + "learning_rate": 4.202243889898823e-05, + "loss": 1.4867, + "step": 14740 + }, + { + "epoch": 2.4375129105556703, + "grad_norm": 6.692672252655029, + "learning_rate": 4.201325768008961e-05, + "loss": 1.5802, + "step": 14750 + }, + { + "epoch": 2.439165461681471, + "grad_norm": 36.045379638671875, + "learning_rate": 4.200407646119099e-05, + "loss": 1.465, + "step": 14760 + }, + { + "epoch": 2.4408180128072714, + "grad_norm": 11.485674858093262, + "learning_rate": 4.199489524229237e-05, + "loss": 1.5289, + "step": 14770 + }, + { + "epoch": 2.4424705639330715, + "grad_norm": 11.762076377868652, + "learning_rate": 4.198571402339375e-05, + "loss": 1.5009, + "step": 14780 + }, + { + "epoch": 2.444123115058872, + "grad_norm": 9.65092658996582, + "learning_rate": 4.197653280449513e-05, + "loss": 1.7058, + "step": 14790 + }, + { + "epoch": 2.4457756661846726, + "grad_norm": 12.469741821289062, + "learning_rate": 4.1967351585596506e-05, + "loss": 1.5731, + "step": 14800 + }, + { + "epoch": 2.447428217310473, + "grad_norm": 48.536380767822266, + "learning_rate": 4.195817036669789e-05, + "loss": 1.509, + "step": 14810 + }, + { + "epoch": 2.4490807684362736, + "grad_norm": 9.138647079467773, + "learning_rate": 4.1948989147799264e-05, + "loss": 1.5453, + "step": 14820 + }, + { + "epoch": 2.450733319562074, + "grad_norm": 13.465476036071777, + "learning_rate": 4.193980792890064e-05, + "loss": 1.4734, + "step": 14830 + }, + { + "epoch": 2.4523858706878743, + "grad_norm": 9.56730842590332, + "learning_rate": 4.193062671000202e-05, + "loss": 1.6971, + "step": 14840 + }, + { + "epoch": 2.454038421813675, + "grad_norm": 30.218923568725586, + "learning_rate": 4.19214454911034e-05, + "loss": 1.6432, + "step": 14850 + }, + { + "epoch": 2.4556909729394754, + "grad_norm": 9.52951717376709, + "learning_rate": 4.191226427220478e-05, + "loss": 1.5145, + "step": 14860 + }, + { + "epoch": 
2.457343524065276, + "grad_norm": 15.97042179107666, + "learning_rate": 4.190308305330616e-05, + "loss": 1.5629, + "step": 14870 + }, + { + "epoch": 2.4589960751910764, + "grad_norm": 7.123246669769287, + "learning_rate": 4.189390183440754e-05, + "loss": 1.6142, + "step": 14880 + }, + { + "epoch": 2.4606486263168765, + "grad_norm": 10.944896697998047, + "learning_rate": 4.1884720615508916e-05, + "loss": 1.4761, + "step": 14890 + }, + { + "epoch": 2.462301177442677, + "grad_norm": 14.6271333694458, + "learning_rate": 4.18755393966103e-05, + "loss": 1.6334, + "step": 14900 + }, + { + "epoch": 2.4639537285684776, + "grad_norm": 13.755815505981445, + "learning_rate": 4.1866358177711674e-05, + "loss": 1.6396, + "step": 14910 + }, + { + "epoch": 2.465606279694278, + "grad_norm": 5.795435428619385, + "learning_rate": 4.185717695881306e-05, + "loss": 1.6156, + "step": 14920 + }, + { + "epoch": 2.4672588308200787, + "grad_norm": 8.652409553527832, + "learning_rate": 4.184799573991443e-05, + "loss": 1.5923, + "step": 14930 + }, + { + "epoch": 2.468911381945879, + "grad_norm": 34.65216064453125, + "learning_rate": 4.1838814521015815e-05, + "loss": 1.6352, + "step": 14940 + }, + { + "epoch": 2.4705639330716793, + "grad_norm": 8.559181213378906, + "learning_rate": 4.182963330211719e-05, + "loss": 1.567, + "step": 14950 + }, + { + "epoch": 2.47221648419748, + "grad_norm": 9.969573020935059, + "learning_rate": 4.182045208321857e-05, + "loss": 1.5711, + "step": 14960 + }, + { + "epoch": 2.4738690353232804, + "grad_norm": 18.179960250854492, + "learning_rate": 4.181127086431995e-05, + "loss": 1.6054, + "step": 14970 + }, + { + "epoch": 2.475521586449081, + "grad_norm": 14.064473152160645, + "learning_rate": 4.1802089645421326e-05, + "loss": 1.6849, + "step": 14980 + }, + { + "epoch": 2.477174137574881, + "grad_norm": 32.33445739746094, + "learning_rate": 4.179290842652271e-05, + "loss": 1.6486, + "step": 14990 + }, + { + "epoch": 2.4788266887006816, + "grad_norm": 10.423996925354004, + "learning_rate": 4.1783727207624084e-05, + "loss": 1.5323, + "step": 15000 + }, + { + "epoch": 2.480479239826482, + "grad_norm": 18.493953704833984, + "learning_rate": 4.177454598872547e-05, + "loss": 1.6395, + "step": 15010 + }, + { + "epoch": 2.4821317909522826, + "grad_norm": 12.233598709106445, + "learning_rate": 4.176536476982684e-05, + "loss": 1.6386, + "step": 15020 + }, + { + "epoch": 2.483784342078083, + "grad_norm": 6.207724571228027, + "learning_rate": 4.1756183550928225e-05, + "loss": 1.6277, + "step": 15030 + }, + { + "epoch": 2.4854368932038833, + "grad_norm": 29.125558853149414, + "learning_rate": 4.17470023320296e-05, + "loss": 1.7678, + "step": 15040 + }, + { + "epoch": 2.487089444329684, + "grad_norm": 155.11293029785156, + "learning_rate": 4.1737821113130984e-05, + "loss": 1.6569, + "step": 15050 + }, + { + "epoch": 2.4887419954554844, + "grad_norm": 11.938981056213379, + "learning_rate": 4.172863989423236e-05, + "loss": 1.4698, + "step": 15060 + }, + { + "epoch": 2.490394546581285, + "grad_norm": 17.687034606933594, + "learning_rate": 4.171945867533374e-05, + "loss": 1.4958, + "step": 15070 + }, + { + "epoch": 2.4920470977070854, + "grad_norm": 18.96551513671875, + "learning_rate": 4.171027745643512e-05, + "loss": 1.5655, + "step": 15080 + }, + { + "epoch": 2.4936996488328855, + "grad_norm": 151.6097869873047, + "learning_rate": 4.1701096237536494e-05, + "loss": 1.553, + "step": 15090 + }, + { + "epoch": 2.495352199958686, + "grad_norm": 18.68946075439453, + "learning_rate": 4.1691915018637877e-05, + 
"loss": 1.7066, + "step": 15100 + }, + { + "epoch": 2.4970047510844866, + "grad_norm": 6.90726375579834, + "learning_rate": 4.168273379973925e-05, + "loss": 1.5377, + "step": 15110 + }, + { + "epoch": 2.498657302210287, + "grad_norm": 5.961545467376709, + "learning_rate": 4.1673552580840635e-05, + "loss": 1.6982, + "step": 15120 + }, + { + "epoch": 2.5003098533360877, + "grad_norm": 14.853984832763672, + "learning_rate": 4.166437136194201e-05, + "loss": 1.4802, + "step": 15130 + }, + { + "epoch": 2.501962404461888, + "grad_norm": 22.466632843017578, + "learning_rate": 4.1655190143043394e-05, + "loss": 1.7366, + "step": 15140 + }, + { + "epoch": 2.5036149555876888, + "grad_norm": 17.365001678466797, + "learning_rate": 4.164600892414477e-05, + "loss": 1.574, + "step": 15150 + }, + { + "epoch": 2.505267506713489, + "grad_norm": 14.483482360839844, + "learning_rate": 4.163682770524615e-05, + "loss": 1.7487, + "step": 15160 + }, + { + "epoch": 2.5069200578392894, + "grad_norm": 16.953773498535156, + "learning_rate": 4.162764648634753e-05, + "loss": 1.6225, + "step": 15170 + }, + { + "epoch": 2.50857260896509, + "grad_norm": 10.457685470581055, + "learning_rate": 4.161846526744891e-05, + "loss": 1.7095, + "step": 15180 + }, + { + "epoch": 2.5102251600908905, + "grad_norm": 17.119020462036133, + "learning_rate": 4.160928404855029e-05, + "loss": 1.6546, + "step": 15190 + }, + { + "epoch": 2.511877711216691, + "grad_norm": 5.457040309906006, + "learning_rate": 4.160010282965167e-05, + "loss": 1.5472, + "step": 15200 + }, + { + "epoch": 2.513530262342491, + "grad_norm": 11.147836685180664, + "learning_rate": 4.1590921610753045e-05, + "loss": 1.6512, + "step": 15210 + }, + { + "epoch": 2.5151828134682916, + "grad_norm": 9.593242645263672, + "learning_rate": 4.158174039185442e-05, + "loss": 1.6628, + "step": 15220 + }, + { + "epoch": 2.516835364594092, + "grad_norm": 11.19896125793457, + "learning_rate": 4.1572559172955803e-05, + "loss": 1.6524, + "step": 15230 + }, + { + "epoch": 2.5184879157198927, + "grad_norm": 7.335904121398926, + "learning_rate": 4.156337795405718e-05, + "loss": 1.5542, + "step": 15240 + }, + { + "epoch": 2.5201404668456933, + "grad_norm": 15.49926471710205, + "learning_rate": 4.155419673515856e-05, + "loss": 1.5335, + "step": 15250 + }, + { + "epoch": 2.5217930179714934, + "grad_norm": 8.069356918334961, + "learning_rate": 4.154501551625994e-05, + "loss": 1.498, + "step": 15260 + }, + { + "epoch": 2.523445569097294, + "grad_norm": 12.631221771240234, + "learning_rate": 4.153583429736132e-05, + "loss": 1.5483, + "step": 15270 + }, + { + "epoch": 2.5250981202230944, + "grad_norm": 10.724843978881836, + "learning_rate": 4.1526653078462696e-05, + "loss": 1.459, + "step": 15280 + }, + { + "epoch": 2.526750671348895, + "grad_norm": 28.29610252380371, + "learning_rate": 4.151747185956408e-05, + "loss": 1.5287, + "step": 15290 + }, + { + "epoch": 2.5284032224746955, + "grad_norm": 11.114155769348145, + "learning_rate": 4.150829064066546e-05, + "loss": 1.5773, + "step": 15300 + }, + { + "epoch": 2.5300557736004956, + "grad_norm": 16.751001358032227, + "learning_rate": 4.149910942176684e-05, + "loss": 1.6019, + "step": 15310 + }, + { + "epoch": 2.531708324726296, + "grad_norm": 12.741584777832031, + "learning_rate": 4.148992820286822e-05, + "loss": 1.602, + "step": 15320 + }, + { + "epoch": 2.5333608758520967, + "grad_norm": 14.595974922180176, + "learning_rate": 4.1480746983969596e-05, + "loss": 1.5229, + "step": 15330 + }, + { + "epoch": 2.535013426977897, + "grad_norm": 
9.688117027282715, + "learning_rate": 4.147156576507097e-05, + "loss": 1.573, + "step": 15340 + }, + { + "epoch": 2.5366659781036978, + "grad_norm": 5.253176212310791, + "learning_rate": 4.146238454617235e-05, + "loss": 1.6018, + "step": 15350 + }, + { + "epoch": 2.538318529229498, + "grad_norm": 11.29525375366211, + "learning_rate": 4.145320332727373e-05, + "loss": 1.5689, + "step": 15360 + }, + { + "epoch": 2.5399710803552984, + "grad_norm": 10.628171920776367, + "learning_rate": 4.1444022108375106e-05, + "loss": 1.4875, + "step": 15370 + }, + { + "epoch": 2.541623631481099, + "grad_norm": 11.296927452087402, + "learning_rate": 4.143484088947649e-05, + "loss": 1.6718, + "step": 15380 + }, + { + "epoch": 2.5432761826068995, + "grad_norm": 9.183626174926758, + "learning_rate": 4.1425659670577865e-05, + "loss": 1.6317, + "step": 15390 + }, + { + "epoch": 2.5449287337327, + "grad_norm": 7.3979268074035645, + "learning_rate": 4.141647845167925e-05, + "loss": 1.6035, + "step": 15400 + }, + { + "epoch": 2.5465812848585, + "grad_norm": 9.273728370666504, + "learning_rate": 4.140729723278063e-05, + "loss": 1.596, + "step": 15410 + }, + { + "epoch": 2.5482338359843006, + "grad_norm": 16.016374588012695, + "learning_rate": 4.1398116013882006e-05, + "loss": 1.5189, + "step": 15420 + }, + { + "epoch": 2.549886387110101, + "grad_norm": 7.42404317855835, + "learning_rate": 4.138893479498339e-05, + "loss": 1.6049, + "step": 15430 + }, + { + "epoch": 2.5515389382359017, + "grad_norm": 14.383243560791016, + "learning_rate": 4.1379753576084764e-05, + "loss": 1.6131, + "step": 15440 + }, + { + "epoch": 2.5531914893617023, + "grad_norm": 7.443404197692871, + "learning_rate": 4.137057235718615e-05, + "loss": 1.7355, + "step": 15450 + }, + { + "epoch": 2.5548440404875024, + "grad_norm": 7.681070327758789, + "learning_rate": 4.136139113828752e-05, + "loss": 1.6039, + "step": 15460 + }, + { + "epoch": 2.556496591613303, + "grad_norm": 17.475801467895508, + "learning_rate": 4.13522099193889e-05, + "loss": 1.5287, + "step": 15470 + }, + { + "epoch": 2.5581491427391034, + "grad_norm": 11.472020149230957, + "learning_rate": 4.1343028700490274e-05, + "loss": 1.572, + "step": 15480 + }, + { + "epoch": 2.559801693864904, + "grad_norm": 8.961161613464355, + "learning_rate": 4.133384748159166e-05, + "loss": 1.6715, + "step": 15490 + }, + { + "epoch": 2.5614542449907045, + "grad_norm": 9.045692443847656, + "learning_rate": 4.132466626269303e-05, + "loss": 1.6285, + "step": 15500 + }, + { + "epoch": 2.5631067961165046, + "grad_norm": 9.706321716308594, + "learning_rate": 4.1315485043794416e-05, + "loss": 1.6204, + "step": 15510 + }, + { + "epoch": 2.564759347242305, + "grad_norm": 22.907732009887695, + "learning_rate": 4.13063038248958e-05, + "loss": 1.6672, + "step": 15520 + }, + { + "epoch": 2.5664118983681057, + "grad_norm": 13.848122596740723, + "learning_rate": 4.1297122605997174e-05, + "loss": 1.7706, + "step": 15530 + }, + { + "epoch": 2.568064449493906, + "grad_norm": 9.652771949768066, + "learning_rate": 4.128794138709856e-05, + "loss": 1.6259, + "step": 15540 + }, + { + "epoch": 2.5697170006197068, + "grad_norm": 12.885273933410645, + "learning_rate": 4.127876016819993e-05, + "loss": 1.7457, + "step": 15550 + }, + { + "epoch": 2.571369551745507, + "grad_norm": 6.977317810058594, + "learning_rate": 4.1269578949301315e-05, + "loss": 1.5125, + "step": 15560 + }, + { + "epoch": 2.573022102871308, + "grad_norm": 34.75843811035156, + "learning_rate": 4.126039773040269e-05, + "loss": 1.6154, + "step": 15570 + }, + { + 
"epoch": 2.574674653997108, + "grad_norm": 9.55295181274414, + "learning_rate": 4.1251216511504074e-05, + "loss": 1.5819, + "step": 15580 + }, + { + "epoch": 2.5763272051229085, + "grad_norm": 13.800176620483398, + "learning_rate": 4.124203529260545e-05, + "loss": 1.5153, + "step": 15590 + }, + { + "epoch": 2.577979756248709, + "grad_norm": 53.50123977661133, + "learning_rate": 4.1232854073706825e-05, + "loss": 1.6517, + "step": 15600 + }, + { + "epoch": 2.5796323073745095, + "grad_norm": 24.61274528503418, + "learning_rate": 4.12236728548082e-05, + "loss": 1.6843, + "step": 15610 + }, + { + "epoch": 2.58128485850031, + "grad_norm": 8.19063663482666, + "learning_rate": 4.1214491635909584e-05, + "loss": 1.5765, + "step": 15620 + }, + { + "epoch": 2.58293740962611, + "grad_norm": 8.86671257019043, + "learning_rate": 4.120531041701096e-05, + "loss": 1.5666, + "step": 15630 + }, + { + "epoch": 2.5845899607519107, + "grad_norm": 8.37794303894043, + "learning_rate": 4.119612919811234e-05, + "loss": 1.6589, + "step": 15640 + }, + { + "epoch": 2.5862425118777113, + "grad_norm": 12.301908493041992, + "learning_rate": 4.1186947979213725e-05, + "loss": 1.5103, + "step": 15650 + }, + { + "epoch": 2.587895063003512, + "grad_norm": 21.372512817382812, + "learning_rate": 4.11777667603151e-05, + "loss": 1.5983, + "step": 15660 + }, + { + "epoch": 2.5895476141293123, + "grad_norm": 10.715096473693848, + "learning_rate": 4.1168585541416484e-05, + "loss": 1.5617, + "step": 15670 + }, + { + "epoch": 2.5912001652551124, + "grad_norm": 24.612468719482422, + "learning_rate": 4.115940432251786e-05, + "loss": 1.6039, + "step": 15680 + }, + { + "epoch": 2.592852716380913, + "grad_norm": 26.50051498413086, + "learning_rate": 4.115022310361924e-05, + "loss": 1.5615, + "step": 15690 + }, + { + "epoch": 2.5945052675067135, + "grad_norm": 21.82634162902832, + "learning_rate": 4.114104188472062e-05, + "loss": 1.5786, + "step": 15700 + }, + { + "epoch": 2.596157818632514, + "grad_norm": 7.950484275817871, + "learning_rate": 4.1131860665822e-05, + "loss": 1.6964, + "step": 15710 + }, + { + "epoch": 2.5978103697583146, + "grad_norm": 30.824180603027344, + "learning_rate": 4.1122679446923376e-05, + "loss": 1.514, + "step": 15720 + }, + { + "epoch": 2.5994629208841147, + "grad_norm": 7.380947113037109, + "learning_rate": 4.111349822802475e-05, + "loss": 1.6425, + "step": 15730 + }, + { + "epoch": 2.601115472009915, + "grad_norm": 7.936986923217773, + "learning_rate": 4.110431700912613e-05, + "loss": 1.5766, + "step": 15740 + }, + { + "epoch": 2.6027680231357158, + "grad_norm": 9.09585189819336, + "learning_rate": 4.109513579022751e-05, + "loss": 1.6068, + "step": 15750 + }, + { + "epoch": 2.6044205742615163, + "grad_norm": 23.57267951965332, + "learning_rate": 4.108595457132889e-05, + "loss": 1.4544, + "step": 15760 + }, + { + "epoch": 2.606073125387317, + "grad_norm": 8.182823181152344, + "learning_rate": 4.107677335243027e-05, + "loss": 1.5769, + "step": 15770 + }, + { + "epoch": 2.607725676513117, + "grad_norm": 15.561851501464844, + "learning_rate": 4.106759213353165e-05, + "loss": 1.5978, + "step": 15780 + }, + { + "epoch": 2.6093782276389175, + "grad_norm": 11.034695625305176, + "learning_rate": 4.105841091463303e-05, + "loss": 1.608, + "step": 15790 + }, + { + "epoch": 2.611030778764718, + "grad_norm": 63.234703063964844, + "learning_rate": 4.104922969573441e-05, + "loss": 1.6947, + "step": 15800 + }, + { + "epoch": 2.6126833298905185, + "grad_norm": 8.965785026550293, + "learning_rate": 4.1040048476835786e-05, + 
"loss": 1.6128, + "step": 15810 + }, + { + "epoch": 2.614335881016319, + "grad_norm": 7.541203022003174, + "learning_rate": 4.103086725793717e-05, + "loss": 1.5494, + "step": 15820 + }, + { + "epoch": 2.615988432142119, + "grad_norm": 12.703873634338379, + "learning_rate": 4.1021686039038545e-05, + "loss": 1.6318, + "step": 15830 + }, + { + "epoch": 2.6176409832679197, + "grad_norm": 8.111274719238281, + "learning_rate": 4.101250482013993e-05, + "loss": 1.5468, + "step": 15840 + }, + { + "epoch": 2.6192935343937203, + "grad_norm": 23.11355209350586, + "learning_rate": 4.10033236012413e-05, + "loss": 1.6287, + "step": 15850 + }, + { + "epoch": 2.620946085519521, + "grad_norm": 12.819192886352539, + "learning_rate": 4.099414238234268e-05, + "loss": 1.6666, + "step": 15860 + }, + { + "epoch": 2.6225986366453213, + "grad_norm": 9.071503639221191, + "learning_rate": 4.098496116344406e-05, + "loss": 1.6456, + "step": 15870 + }, + { + "epoch": 2.6242511877711214, + "grad_norm": 14.59699821472168, + "learning_rate": 4.097577994454544e-05, + "loss": 1.5137, + "step": 15880 + }, + { + "epoch": 2.625903738896922, + "grad_norm": 11.511642456054688, + "learning_rate": 4.096659872564682e-05, + "loss": 1.5802, + "step": 15890 + }, + { + "epoch": 2.6275562900227225, + "grad_norm": 13.53958511352539, + "learning_rate": 4.0957417506748196e-05, + "loss": 1.704, + "step": 15900 + }, + { + "epoch": 2.629208841148523, + "grad_norm": 12.743249893188477, + "learning_rate": 4.094823628784958e-05, + "loss": 1.567, + "step": 15910 + }, + { + "epoch": 2.6308613922743236, + "grad_norm": 31.744903564453125, + "learning_rate": 4.0939055068950955e-05, + "loss": 1.4322, + "step": 15920 + }, + { + "epoch": 2.6325139434001237, + "grad_norm": 12.78609848022461, + "learning_rate": 4.092987385005234e-05, + "loss": 1.7043, + "step": 15930 + }, + { + "epoch": 2.6341664945259247, + "grad_norm": 19.535892486572266, + "learning_rate": 4.092069263115371e-05, + "loss": 1.569, + "step": 15940 + }, + { + "epoch": 2.6358190456517248, + "grad_norm": 13.720259666442871, + "learning_rate": 4.0911511412255096e-05, + "loss": 1.6444, + "step": 15950 + }, + { + "epoch": 2.6374715967775253, + "grad_norm": 7.525299549102783, + "learning_rate": 4.090233019335647e-05, + "loss": 1.511, + "step": 15960 + }, + { + "epoch": 2.639124147903326, + "grad_norm": 12.380800247192383, + "learning_rate": 4.0893148974457854e-05, + "loss": 1.6673, + "step": 15970 + }, + { + "epoch": 2.6407766990291264, + "grad_norm": 14.61955451965332, + "learning_rate": 4.088396775555923e-05, + "loss": 1.5086, + "step": 15980 + }, + { + "epoch": 2.642429250154927, + "grad_norm": 11.027737617492676, + "learning_rate": 4.0874786536660606e-05, + "loss": 1.7163, + "step": 15990 + }, + { + "epoch": 2.644081801280727, + "grad_norm": 19.45004653930664, + "learning_rate": 4.086560531776199e-05, + "loss": 1.6217, + "step": 16000 + }, + { + "epoch": 2.6457343524065275, + "grad_norm": 13.399431228637695, + "learning_rate": 4.0856424098863364e-05, + "loss": 1.5046, + "step": 16010 + }, + { + "epoch": 2.647386903532328, + "grad_norm": 11.20180892944336, + "learning_rate": 4.084724287996475e-05, + "loss": 1.5504, + "step": 16020 + }, + { + "epoch": 2.6490394546581286, + "grad_norm": 24.36643409729004, + "learning_rate": 4.083806166106612e-05, + "loss": 1.633, + "step": 16030 + }, + { + "epoch": 2.650692005783929, + "grad_norm": 9.366679191589355, + "learning_rate": 4.0828880442167506e-05, + "loss": 1.6683, + "step": 16040 + }, + { + "epoch": 2.6523445569097293, + "grad_norm": 
11.785042762756348, + "learning_rate": 4.081969922326888e-05, + "loss": 1.6522, + "step": 16050 + }, + { + "epoch": 2.65399710803553, + "grad_norm": 5.310166358947754, + "learning_rate": 4.0810518004370264e-05, + "loss": 1.621, + "step": 16060 + }, + { + "epoch": 2.6556496591613303, + "grad_norm": 39.18266677856445, + "learning_rate": 4.080133678547164e-05, + "loss": 1.6034, + "step": 16070 + }, + { + "epoch": 2.657302210287131, + "grad_norm": 12.86594009399414, + "learning_rate": 4.079215556657302e-05, + "loss": 1.438, + "step": 16080 + }, + { + "epoch": 2.6589547614129314, + "grad_norm": 13.897305488586426, + "learning_rate": 4.0782974347674405e-05, + "loss": 1.4932, + "step": 16090 + }, + { + "epoch": 2.6606073125387315, + "grad_norm": 25.27904510498047, + "learning_rate": 4.077379312877578e-05, + "loss": 1.7065, + "step": 16100 + }, + { + "epoch": 2.662259863664532, + "grad_norm": 10.040287017822266, + "learning_rate": 4.076461190987716e-05, + "loss": 1.571, + "step": 16110 + }, + { + "epoch": 2.6639124147903326, + "grad_norm": 11.403328895568848, + "learning_rate": 4.075543069097853e-05, + "loss": 1.6483, + "step": 16120 + }, + { + "epoch": 2.665564965916133, + "grad_norm": 10.510461807250977, + "learning_rate": 4.0746249472079915e-05, + "loss": 1.6841, + "step": 16130 + }, + { + "epoch": 2.6672175170419337, + "grad_norm": 6.346688270568848, + "learning_rate": 4.073706825318129e-05, + "loss": 1.6634, + "step": 16140 + }, + { + "epoch": 2.6688700681677338, + "grad_norm": 6.1756696701049805, + "learning_rate": 4.0727887034282674e-05, + "loss": 1.6516, + "step": 16150 + }, + { + "epoch": 2.6705226192935343, + "grad_norm": 10.464988708496094, + "learning_rate": 4.071870581538405e-05, + "loss": 1.6066, + "step": 16160 + }, + { + "epoch": 2.672175170419335, + "grad_norm": 18.168760299682617, + "learning_rate": 4.070952459648543e-05, + "loss": 1.6268, + "step": 16170 + }, + { + "epoch": 2.6738277215451354, + "grad_norm": 31.59029197692871, + "learning_rate": 4.070034337758681e-05, + "loss": 1.593, + "step": 16180 + }, + { + "epoch": 2.675480272670936, + "grad_norm": 13.246702194213867, + "learning_rate": 4.069116215868819e-05, + "loss": 1.6339, + "step": 16190 + }, + { + "epoch": 2.677132823796736, + "grad_norm": 13.714371681213379, + "learning_rate": 4.068198093978957e-05, + "loss": 1.6681, + "step": 16200 + }, + { + "epoch": 2.6787853749225365, + "grad_norm": 9.815982818603516, + "learning_rate": 4.067279972089095e-05, + "loss": 1.4579, + "step": 16210 + }, + { + "epoch": 2.680437926048337, + "grad_norm": 11.735651016235352, + "learning_rate": 4.066361850199233e-05, + "loss": 1.6187, + "step": 16220 + }, + { + "epoch": 2.6820904771741376, + "grad_norm": 10.747529983520508, + "learning_rate": 4.065443728309371e-05, + "loss": 1.5022, + "step": 16230 + }, + { + "epoch": 2.683743028299938, + "grad_norm": 19.077959060668945, + "learning_rate": 4.0645256064195084e-05, + "loss": 1.5181, + "step": 16240 + }, + { + "epoch": 2.6853955794257383, + "grad_norm": 23.054729461669922, + "learning_rate": 4.063607484529646e-05, + "loss": 1.6234, + "step": 16250 + }, + { + "epoch": 2.687048130551539, + "grad_norm": 6.9372992515563965, + "learning_rate": 4.062689362639784e-05, + "loss": 1.5558, + "step": 16260 + }, + { + "epoch": 2.6887006816773393, + "grad_norm": 9.9459867477417, + "learning_rate": 4.061771240749922e-05, + "loss": 1.4208, + "step": 16270 + }, + { + "epoch": 2.69035323280314, + "grad_norm": 8.664091110229492, + "learning_rate": 4.06085311886006e-05, + "loss": 1.6421, + "step": 16280 + }, + { 
+ "epoch": 2.6920057839289404, + "grad_norm": 11.299137115478516, + "learning_rate": 4.0599349969701977e-05, + "loss": 1.6188, + "step": 16290 + }, + { + "epoch": 2.6936583350547405, + "grad_norm": 8.935805320739746, + "learning_rate": 4.059016875080336e-05, + "loss": 1.6265, + "step": 16300 + }, + { + "epoch": 2.6953108861805415, + "grad_norm": 8.099499702453613, + "learning_rate": 4.0580987531904735e-05, + "loss": 1.4238, + "step": 16310 + }, + { + "epoch": 2.6969634373063416, + "grad_norm": 6.492707252502441, + "learning_rate": 4.057180631300612e-05, + "loss": 1.5873, + "step": 16320 + }, + { + "epoch": 2.698615988432142, + "grad_norm": 6.262495517730713, + "learning_rate": 4.05626250941075e-05, + "loss": 1.639, + "step": 16330 + }, + { + "epoch": 2.7002685395579427, + "grad_norm": 16.449399948120117, + "learning_rate": 4.0553443875208876e-05, + "loss": 1.5878, + "step": 16340 + }, + { + "epoch": 2.7019210906837428, + "grad_norm": 21.750577926635742, + "learning_rate": 4.054426265631026e-05, + "loss": 1.7173, + "step": 16350 + }, + { + "epoch": 2.7035736418095437, + "grad_norm": 10.185629844665527, + "learning_rate": 4.0535081437411635e-05, + "loss": 1.5977, + "step": 16360 + }, + { + "epoch": 2.705226192935344, + "grad_norm": 66.03086853027344, + "learning_rate": 4.052590021851301e-05, + "loss": 1.7398, + "step": 16370 + }, + { + "epoch": 2.7068787440611444, + "grad_norm": 6.185853481292725, + "learning_rate": 4.0516718999614386e-05, + "loss": 1.5928, + "step": 16380 + }, + { + "epoch": 2.708531295186945, + "grad_norm": 5.825438499450684, + "learning_rate": 4.050753778071577e-05, + "loss": 1.5435, + "step": 16390 + }, + { + "epoch": 2.7101838463127454, + "grad_norm": 19.0218505859375, + "learning_rate": 4.0498356561817145e-05, + "loss": 1.6741, + "step": 16400 + }, + { + "epoch": 2.711836397438546, + "grad_norm": 12.254176139831543, + "learning_rate": 4.048917534291853e-05, + "loss": 1.5611, + "step": 16410 + }, + { + "epoch": 2.713488948564346, + "grad_norm": 18.687061309814453, + "learning_rate": 4.04799941240199e-05, + "loss": 1.5091, + "step": 16420 + }, + { + "epoch": 2.7151414996901466, + "grad_norm": 11.243393898010254, + "learning_rate": 4.0470812905121286e-05, + "loss": 1.5715, + "step": 16430 + }, + { + "epoch": 2.716794050815947, + "grad_norm": 15.61385726928711, + "learning_rate": 4.046163168622267e-05, + "loss": 1.4761, + "step": 16440 + }, + { + "epoch": 2.7184466019417477, + "grad_norm": 6.631096839904785, + "learning_rate": 4.0452450467324045e-05, + "loss": 1.6744, + "step": 16450 + }, + { + "epoch": 2.7200991530675482, + "grad_norm": 31.41004753112793, + "learning_rate": 4.044326924842543e-05, + "loss": 1.4479, + "step": 16460 + }, + { + "epoch": 2.7217517041933483, + "grad_norm": 8.312516212463379, + "learning_rate": 4.04340880295268e-05, + "loss": 1.4798, + "step": 16470 + }, + { + "epoch": 2.723404255319149, + "grad_norm": 16.965839385986328, + "learning_rate": 4.0424906810628186e-05, + "loss": 1.5273, + "step": 16480 + }, + { + "epoch": 2.7250568064449494, + "grad_norm": 22.01923179626465, + "learning_rate": 4.041572559172956e-05, + "loss": 1.5548, + "step": 16490 + }, + { + "epoch": 2.72670935757075, + "grad_norm": 8.084673881530762, + "learning_rate": 4.040654437283094e-05, + "loss": 1.6711, + "step": 16500 + }, + { + "epoch": 2.7283619086965505, + "grad_norm": 8.843356132507324, + "learning_rate": 4.039736315393231e-05, + "loss": 1.5687, + "step": 16510 + }, + { + "epoch": 2.7300144598223506, + "grad_norm": 10.197770118713379, + "learning_rate": 
4.0388181935033696e-05, + "loss": 1.5118, + "step": 16520 + }, + { + "epoch": 2.731667010948151, + "grad_norm": 38.914649963378906, + "learning_rate": 4.037900071613507e-05, + "loss": 1.6277, + "step": 16530 + }, + { + "epoch": 2.7333195620739517, + "grad_norm": 5.684174060821533, + "learning_rate": 4.0369819497236454e-05, + "loss": 1.7301, + "step": 16540 + }, + { + "epoch": 2.734972113199752, + "grad_norm": 12.711173057556152, + "learning_rate": 4.036063827833784e-05, + "loss": 1.4475, + "step": 16550 + }, + { + "epoch": 2.7366246643255527, + "grad_norm": 48.89714813232422, + "learning_rate": 4.035145705943921e-05, + "loss": 1.584, + "step": 16560 + }, + { + "epoch": 2.738277215451353, + "grad_norm": 22.189855575561523, + "learning_rate": 4.0342275840540595e-05, + "loss": 1.6205, + "step": 16570 + }, + { + "epoch": 2.7399297665771534, + "grad_norm": 7.027297019958496, + "learning_rate": 4.033309462164197e-05, + "loss": 1.6204, + "step": 16580 + }, + { + "epoch": 2.741582317702954, + "grad_norm": 16.53078842163086, + "learning_rate": 4.0323913402743354e-05, + "loss": 1.5647, + "step": 16590 + }, + { + "epoch": 2.7432348688287544, + "grad_norm": 14.532068252563477, + "learning_rate": 4.031473218384473e-05, + "loss": 1.5833, + "step": 16600 + }, + { + "epoch": 2.744887419954555, + "grad_norm": 17.68378448486328, + "learning_rate": 4.030555096494611e-05, + "loss": 1.6564, + "step": 16610 + }, + { + "epoch": 2.746539971080355, + "grad_norm": 8.336248397827148, + "learning_rate": 4.029636974604749e-05, + "loss": 1.6376, + "step": 16620 + }, + { + "epoch": 2.7481925222061556, + "grad_norm": 8.331178665161133, + "learning_rate": 4.0287188527148864e-05, + "loss": 1.5466, + "step": 16630 + }, + { + "epoch": 2.749845073331956, + "grad_norm": 9.454358100891113, + "learning_rate": 4.027800730825024e-05, + "loss": 1.6387, + "step": 16640 + }, + { + "epoch": 2.7514976244577567, + "grad_norm": 19.310138702392578, + "learning_rate": 4.026882608935162e-05, + "loss": 1.6703, + "step": 16650 + }, + { + "epoch": 2.7531501755835572, + "grad_norm": 7.6864705085754395, + "learning_rate": 4.0259644870453005e-05, + "loss": 1.5447, + "step": 16660 + }, + { + "epoch": 2.7548027267093573, + "grad_norm": 5.934784412384033, + "learning_rate": 4.025046365155438e-05, + "loss": 1.4783, + "step": 16670 + }, + { + "epoch": 2.756455277835158, + "grad_norm": 53.78288269042969, + "learning_rate": 4.0241282432655764e-05, + "loss": 1.7438, + "step": 16680 + }, + { + "epoch": 2.7581078289609584, + "grad_norm": 9.907873153686523, + "learning_rate": 4.023210121375714e-05, + "loss": 1.5094, + "step": 16690 + }, + { + "epoch": 2.759760380086759, + "grad_norm": 17.700838088989258, + "learning_rate": 4.022291999485852e-05, + "loss": 1.5519, + "step": 16700 + }, + { + "epoch": 2.7614129312125595, + "grad_norm": 11.269119262695312, + "learning_rate": 4.02137387759599e-05, + "loss": 1.661, + "step": 16710 + }, + { + "epoch": 2.7630654823383596, + "grad_norm": 12.648497581481934, + "learning_rate": 4.020455755706128e-05, + "loss": 1.7044, + "step": 16720 + }, + { + "epoch": 2.7647180334641606, + "grad_norm": 7.257272720336914, + "learning_rate": 4.019537633816266e-05, + "loss": 1.5785, + "step": 16730 + }, + { + "epoch": 2.7663705845899607, + "grad_norm": 10.091726303100586, + "learning_rate": 4.018619511926404e-05, + "loss": 1.5245, + "step": 16740 + }, + { + "epoch": 2.768023135715761, + "grad_norm": 23.164775848388672, + "learning_rate": 4.0177013900365415e-05, + "loss": 1.5764, + "step": 16750 + }, + { + "epoch": 2.7696756868415617, + 
"grad_norm": 21.203231811523438, + "learning_rate": 4.016783268146679e-05, + "loss": 1.5322, + "step": 16760 + }, + { + "epoch": 2.7713282379673623, + "grad_norm": 8.188257217407227, + "learning_rate": 4.015865146256817e-05, + "loss": 1.5544, + "step": 16770 + }, + { + "epoch": 2.772980789093163, + "grad_norm": 5.449336051940918, + "learning_rate": 4.014947024366955e-05, + "loss": 1.5638, + "step": 16780 + }, + { + "epoch": 2.774633340218963, + "grad_norm": 30.01235008239746, + "learning_rate": 4.014028902477093e-05, + "loss": 1.6193, + "step": 16790 + }, + { + "epoch": 2.7762858913447634, + "grad_norm": 7.071667194366455, + "learning_rate": 4.013110780587231e-05, + "loss": 1.5834, + "step": 16800 + }, + { + "epoch": 2.777938442470564, + "grad_norm": 7.050570011138916, + "learning_rate": 4.012192658697369e-05, + "loss": 1.4917, + "step": 16810 + }, + { + "epoch": 2.7795909935963645, + "grad_norm": 6.538548469543457, + "learning_rate": 4.0112745368075067e-05, + "loss": 1.6527, + "step": 16820 + }, + { + "epoch": 2.781243544722165, + "grad_norm": 10.154224395751953, + "learning_rate": 4.010356414917645e-05, + "loss": 1.5722, + "step": 16830 + }, + { + "epoch": 2.782896095847965, + "grad_norm": 13.937148094177246, + "learning_rate": 4.0094382930277825e-05, + "loss": 1.6655, + "step": 16840 + }, + { + "epoch": 2.7845486469737657, + "grad_norm": 10.971857070922852, + "learning_rate": 4.008520171137921e-05, + "loss": 1.6278, + "step": 16850 + }, + { + "epoch": 2.7862011980995662, + "grad_norm": 19.598770141601562, + "learning_rate": 4.0076020492480584e-05, + "loss": 1.6375, + "step": 16860 + }, + { + "epoch": 2.7878537492253668, + "grad_norm": 13.447637557983398, + "learning_rate": 4.0066839273581966e-05, + "loss": 1.5709, + "step": 16870 + }, + { + "epoch": 2.7895063003511673, + "grad_norm": 12.843521118164062, + "learning_rate": 4.005765805468334e-05, + "loss": 1.6029, + "step": 16880 + }, + { + "epoch": 2.7911588514769674, + "grad_norm": 58.23180389404297, + "learning_rate": 4.004847683578472e-05, + "loss": 1.5457, + "step": 16890 + }, + { + "epoch": 2.792811402602768, + "grad_norm": 9.176880836486816, + "learning_rate": 4.00392956168861e-05, + "loss": 1.5729, + "step": 16900 + }, + { + "epoch": 2.7944639537285685, + "grad_norm": 5.380598545074463, + "learning_rate": 4.0030114397987476e-05, + "loss": 1.5439, + "step": 16910 + }, + { + "epoch": 2.796116504854369, + "grad_norm": 8.638605117797852, + "learning_rate": 4.002093317908886e-05, + "loss": 1.5537, + "step": 16920 + }, + { + "epoch": 2.7977690559801696, + "grad_norm": 5.76012659072876, + "learning_rate": 4.0011751960190235e-05, + "loss": 1.521, + "step": 16930 + }, + { + "epoch": 2.7994216071059697, + "grad_norm": 8.071260452270508, + "learning_rate": 4.000257074129162e-05, + "loss": 1.559, + "step": 16940 + }, + { + "epoch": 2.80107415823177, + "grad_norm": 13.014986991882324, + "learning_rate": 3.999338952239299e-05, + "loss": 1.6444, + "step": 16950 + }, + { + "epoch": 2.8027267093575707, + "grad_norm": 5.6536431312561035, + "learning_rate": 3.9984208303494376e-05, + "loss": 1.6925, + "step": 16960 + }, + { + "epoch": 2.8043792604833713, + "grad_norm": 6.50076961517334, + "learning_rate": 3.997502708459575e-05, + "loss": 1.5527, + "step": 16970 + }, + { + "epoch": 2.806031811609172, + "grad_norm": 11.952144622802734, + "learning_rate": 3.9965845865697134e-05, + "loss": 1.6111, + "step": 16980 + }, + { + "epoch": 2.807684362734972, + "grad_norm": 12.421814918518066, + "learning_rate": 3.995666464679851e-05, + "loss": 1.5704, + "step": 
16990 + }, + { + "epoch": 2.8093369138607724, + "grad_norm": 7.645977973937988, + "learning_rate": 3.994748342789989e-05, + "loss": 1.6001, + "step": 17000 + }, + { + "epoch": 2.810989464986573, + "grad_norm": 9.946296691894531, + "learning_rate": 3.993830220900127e-05, + "loss": 1.5803, + "step": 17010 + }, + { + "epoch": 2.8126420161123735, + "grad_norm": 61.43771743774414, + "learning_rate": 3.9929120990102645e-05, + "loss": 1.5299, + "step": 17020 + }, + { + "epoch": 2.814294567238174, + "grad_norm": 13.09609317779541, + "learning_rate": 3.991993977120403e-05, + "loss": 1.5403, + "step": 17030 + }, + { + "epoch": 2.815947118363974, + "grad_norm": 10.614474296569824, + "learning_rate": 3.99107585523054e-05, + "loss": 1.5166, + "step": 17040 + }, + { + "epoch": 2.8175996694897747, + "grad_norm": 64.39093780517578, + "learning_rate": 3.9901577333406786e-05, + "loss": 1.6604, + "step": 17050 + }, + { + "epoch": 2.8192522206155752, + "grad_norm": 27.681407928466797, + "learning_rate": 3.989239611450816e-05, + "loss": 1.6366, + "step": 17060 + }, + { + "epoch": 2.8209047717413758, + "grad_norm": 9.592436790466309, + "learning_rate": 3.9883214895609544e-05, + "loss": 1.5281, + "step": 17070 + }, + { + "epoch": 2.8225573228671763, + "grad_norm": 10.364959716796875, + "learning_rate": 3.987403367671092e-05, + "loss": 1.6947, + "step": 17080 + }, + { + "epoch": 2.8242098739929764, + "grad_norm": 8.182236671447754, + "learning_rate": 3.98648524578123e-05, + "loss": 1.6698, + "step": 17090 + }, + { + "epoch": 2.8258624251187774, + "grad_norm": 13.211899757385254, + "learning_rate": 3.985567123891368e-05, + "loss": 1.5847, + "step": 17100 + }, + { + "epoch": 2.8275149762445775, + "grad_norm": 12.27530288696289, + "learning_rate": 3.984649002001506e-05, + "loss": 1.6346, + "step": 17110 + }, + { + "epoch": 2.829167527370378, + "grad_norm": 27.445775985717773, + "learning_rate": 3.9837308801116444e-05, + "loss": 1.5896, + "step": 17120 + }, + { + "epoch": 2.8308200784961786, + "grad_norm": 107.77072143554688, + "learning_rate": 3.982812758221782e-05, + "loss": 1.5398, + "step": 17130 + }, + { + "epoch": 2.8324726296219787, + "grad_norm": 6.350532531738281, + "learning_rate": 3.9818946363319196e-05, + "loss": 1.6412, + "step": 17140 + }, + { + "epoch": 2.8341251807477796, + "grad_norm": 10.610336303710938, + "learning_rate": 3.980976514442057e-05, + "loss": 1.5777, + "step": 17150 + }, + { + "epoch": 2.8357777318735797, + "grad_norm": 6.003240585327148, + "learning_rate": 3.9800583925521954e-05, + "loss": 1.5696, + "step": 17160 + }, + { + "epoch": 2.8374302829993803, + "grad_norm": 7.450899124145508, + "learning_rate": 3.979140270662333e-05, + "loss": 1.6595, + "step": 17170 + }, + { + "epoch": 2.839082834125181, + "grad_norm": 10.081241607666016, + "learning_rate": 3.978222148772471e-05, + "loss": 1.5032, + "step": 17180 + }, + { + "epoch": 2.8407353852509813, + "grad_norm": 10.23855972290039, + "learning_rate": 3.977304026882609e-05, + "loss": 1.5916, + "step": 17190 + }, + { + "epoch": 2.842387936376782, + "grad_norm": 9.486077308654785, + "learning_rate": 3.976385904992747e-05, + "loss": 1.6483, + "step": 17200 + }, + { + "epoch": 2.844040487502582, + "grad_norm": 22.05427360534668, + "learning_rate": 3.975467783102885e-05, + "loss": 1.6892, + "step": 17210 + }, + { + "epoch": 2.8456930386283825, + "grad_norm": 16.26018524169922, + "learning_rate": 3.974549661213023e-05, + "loss": 1.5048, + "step": 17220 + }, + { + "epoch": 2.847345589754183, + "grad_norm": 14.785170555114746, + "learning_rate": 
3.9736315393231606e-05, + "loss": 1.7072, + "step": 17230 + }, + { + "epoch": 2.8489981408799836, + "grad_norm": 7.535512924194336, + "learning_rate": 3.972713417433299e-05, + "loss": 1.6278, + "step": 17240 + }, + { + "epoch": 2.850650692005784, + "grad_norm": 14.22430419921875, + "learning_rate": 3.971795295543437e-05, + "loss": 1.6181, + "step": 17250 + }, + { + "epoch": 2.8523032431315842, + "grad_norm": 9.323792457580566, + "learning_rate": 3.970877173653575e-05, + "loss": 1.72, + "step": 17260 + }, + { + "epoch": 2.8539557942573848, + "grad_norm": 9.750657081604004, + "learning_rate": 3.969959051763712e-05, + "loss": 1.4657, + "step": 17270 + }, + { + "epoch": 2.8556083453831853, + "grad_norm": 16.884572982788086, + "learning_rate": 3.96904092987385e-05, + "loss": 1.6819, + "step": 17280 + }, + { + "epoch": 2.857260896508986, + "grad_norm": 29.167482376098633, + "learning_rate": 3.968122807983988e-05, + "loss": 1.6451, + "step": 17290 + }, + { + "epoch": 2.8589134476347864, + "grad_norm": 36.17399215698242, + "learning_rate": 3.967204686094126e-05, + "loss": 1.6083, + "step": 17300 + }, + { + "epoch": 2.8605659987605865, + "grad_norm": 13.15809440612793, + "learning_rate": 3.966286564204264e-05, + "loss": 1.6859, + "step": 17310 + }, + { + "epoch": 2.862218549886387, + "grad_norm": 9.510421752929688, + "learning_rate": 3.9653684423144015e-05, + "loss": 1.6486, + "step": 17320 + }, + { + "epoch": 2.8638711010121876, + "grad_norm": 9.244708061218262, + "learning_rate": 3.96445032042454e-05, + "loss": 1.5877, + "step": 17330 + }, + { + "epoch": 2.865523652137988, + "grad_norm": 6.563562393188477, + "learning_rate": 3.9635321985346774e-05, + "loss": 1.575, + "step": 17340 + }, + { + "epoch": 2.8671762032637886, + "grad_norm": 15.244311332702637, + "learning_rate": 3.9626140766448156e-05, + "loss": 1.55, + "step": 17350 + }, + { + "epoch": 2.8688287543895887, + "grad_norm": 5.563783645629883, + "learning_rate": 3.961695954754954e-05, + "loss": 1.5702, + "step": 17360 + }, + { + "epoch": 2.8704813055153893, + "grad_norm": 11.255026817321777, + "learning_rate": 3.9607778328650915e-05, + "loss": 1.713, + "step": 17370 + }, + { + "epoch": 2.87213385664119, + "grad_norm": 13.83602237701416, + "learning_rate": 3.95985971097523e-05, + "loss": 1.4584, + "step": 17380 + }, + { + "epoch": 2.8737864077669903, + "grad_norm": 10.390022277832031, + "learning_rate": 3.9589415890853673e-05, + "loss": 1.5963, + "step": 17390 + }, + { + "epoch": 2.875438958892791, + "grad_norm": 6.194431781768799, + "learning_rate": 3.958023467195505e-05, + "loss": 1.5349, + "step": 17400 + }, + { + "epoch": 2.877091510018591, + "grad_norm": 6.946686744689941, + "learning_rate": 3.9571053453056425e-05, + "loss": 1.5731, + "step": 17410 + }, + { + "epoch": 2.8787440611443915, + "grad_norm": 11.890841484069824, + "learning_rate": 3.956187223415781e-05, + "loss": 1.5738, + "step": 17420 + }, + { + "epoch": 2.880396612270192, + "grad_norm": 4.596217632293701, + "learning_rate": 3.9552691015259184e-05, + "loss": 1.5325, + "step": 17430 + }, + { + "epoch": 2.8820491633959926, + "grad_norm": 9.431676864624023, + "learning_rate": 3.9543509796360566e-05, + "loss": 1.6274, + "step": 17440 + }, + { + "epoch": 2.883701714521793, + "grad_norm": 9.621706008911133, + "learning_rate": 3.953432857746194e-05, + "loss": 1.5976, + "step": 17450 + }, + { + "epoch": 2.8853542656475932, + "grad_norm": 6.648024559020996, + "learning_rate": 3.9525147358563325e-05, + "loss": 1.5725, + "step": 17460 + }, + { + "epoch": 2.8870068167733938, + 
"grad_norm": 33.377403259277344, + "learning_rate": 3.951596613966471e-05, + "loss": 1.5394, + "step": 17470 + }, + { + "epoch": 2.8886593678991943, + "grad_norm": 7.782329082489014, + "learning_rate": 3.950678492076608e-05, + "loss": 1.5998, + "step": 17480 + }, + { + "epoch": 2.890311919024995, + "grad_norm": 6.886441230773926, + "learning_rate": 3.9497603701867466e-05, + "loss": 1.6038, + "step": 17490 + }, + { + "epoch": 2.8919644701507954, + "grad_norm": 7.424654006958008, + "learning_rate": 3.948842248296884e-05, + "loss": 1.5715, + "step": 17500 + }, + { + "epoch": 2.8936170212765955, + "grad_norm": 35.0805549621582, + "learning_rate": 3.9479241264070224e-05, + "loss": 1.5405, + "step": 17510 + }, + { + "epoch": 2.8952695724023965, + "grad_norm": 8.772028923034668, + "learning_rate": 3.94700600451716e-05, + "loss": 1.6665, + "step": 17520 + }, + { + "epoch": 2.8969221235281966, + "grad_norm": 10.116393089294434, + "learning_rate": 3.9460878826272976e-05, + "loss": 1.4747, + "step": 17530 + }, + { + "epoch": 2.898574674653997, + "grad_norm": 12.616483688354492, + "learning_rate": 3.945169760737435e-05, + "loss": 1.6141, + "step": 17540 + }, + { + "epoch": 2.9002272257797976, + "grad_norm": 10.682483673095703, + "learning_rate": 3.9442516388475735e-05, + "loss": 1.526, + "step": 17550 + }, + { + "epoch": 2.901879776905598, + "grad_norm": 12.046555519104004, + "learning_rate": 3.943333516957711e-05, + "loss": 1.5962, + "step": 17560 + }, + { + "epoch": 2.9035323280313987, + "grad_norm": 9.173718452453613, + "learning_rate": 3.942415395067849e-05, + "loss": 1.5042, + "step": 17570 + }, + { + "epoch": 2.905184879157199, + "grad_norm": 7.157196998596191, + "learning_rate": 3.9414972731779876e-05, + "loss": 1.5015, + "step": 17580 + }, + { + "epoch": 2.9068374302829993, + "grad_norm": 9.088848114013672, + "learning_rate": 3.940579151288125e-05, + "loss": 1.5697, + "step": 17590 + }, + { + "epoch": 2.9084899814088, + "grad_norm": 7.118719577789307, + "learning_rate": 3.9396610293982634e-05, + "loss": 1.5875, + "step": 17600 + }, + { + "epoch": 2.9101425325346004, + "grad_norm": 4.978763103485107, + "learning_rate": 3.938742907508401e-05, + "loss": 1.4973, + "step": 17610 + }, + { + "epoch": 2.911795083660401, + "grad_norm": 11.87973403930664, + "learning_rate": 3.937824785618539e-05, + "loss": 1.6336, + "step": 17620 + }, + { + "epoch": 2.913447634786201, + "grad_norm": 23.558897018432617, + "learning_rate": 3.936906663728677e-05, + "loss": 1.4903, + "step": 17630 + }, + { + "epoch": 2.9151001859120016, + "grad_norm": 8.35072135925293, + "learning_rate": 3.935988541838815e-05, + "loss": 1.5911, + "step": 17640 + }, + { + "epoch": 2.916752737037802, + "grad_norm": 7.296316623687744, + "learning_rate": 3.935070419948953e-05, + "loss": 1.4861, + "step": 17650 + }, + { + "epoch": 2.9184052881636027, + "grad_norm": 6.886360168457031, + "learning_rate": 3.93415229805909e-05, + "loss": 1.4605, + "step": 17660 + }, + { + "epoch": 2.920057839289403, + "grad_norm": 9.66377067565918, + "learning_rate": 3.933234176169228e-05, + "loss": 1.4889, + "step": 17670 + }, + { + "epoch": 2.9217103904152033, + "grad_norm": 12.230368614196777, + "learning_rate": 3.932316054279366e-05, + "loss": 1.6329, + "step": 17680 + }, + { + "epoch": 2.923362941541004, + "grad_norm": 23.454057693481445, + "learning_rate": 3.9313979323895044e-05, + "loss": 1.5017, + "step": 17690 + }, + { + "epoch": 2.9250154926668044, + "grad_norm": 10.6051025390625, + "learning_rate": 3.930479810499642e-05, + "loss": 1.6555, + "step": 17700 
+ }, + { + "epoch": 2.926668043792605, + "grad_norm": 5.926885604858398, + "learning_rate": 3.92956168860978e-05, + "loss": 1.5747, + "step": 17710 + }, + { + "epoch": 2.9283205949184055, + "grad_norm": 9.678142547607422, + "learning_rate": 3.928643566719918e-05, + "loss": 1.7058, + "step": 17720 + }, + { + "epoch": 2.9299731460442056, + "grad_norm": 6.809134483337402, + "learning_rate": 3.927725444830056e-05, + "loss": 1.5812, + "step": 17730 + }, + { + "epoch": 2.931625697170006, + "grad_norm": 25.03977394104004, + "learning_rate": 3.926807322940194e-05, + "loss": 1.5576, + "step": 17740 + }, + { + "epoch": 2.9332782482958066, + "grad_norm": 6.897164344787598, + "learning_rate": 3.925889201050332e-05, + "loss": 1.5183, + "step": 17750 + }, + { + "epoch": 2.934930799421607, + "grad_norm": 6.26050329208374, + "learning_rate": 3.9249710791604695e-05, + "loss": 1.7254, + "step": 17760 + }, + { + "epoch": 2.9365833505474077, + "grad_norm": 7.9940385818481445, + "learning_rate": 3.924052957270608e-05, + "loss": 1.5969, + "step": 17770 + }, + { + "epoch": 2.938235901673208, + "grad_norm": 13.388635635375977, + "learning_rate": 3.9231348353807454e-05, + "loss": 1.5729, + "step": 17780 + }, + { + "epoch": 2.9398884527990083, + "grad_norm": 5.230599880218506, + "learning_rate": 3.922216713490883e-05, + "loss": 1.6222, + "step": 17790 + }, + { + "epoch": 2.941541003924809, + "grad_norm": 13.589559555053711, + "learning_rate": 3.9212985916010206e-05, + "loss": 1.6322, + "step": 17800 + }, + { + "epoch": 2.9431935550506094, + "grad_norm": 22.378572463989258, + "learning_rate": 3.920380469711159e-05, + "loss": 1.6475, + "step": 17810 + }, + { + "epoch": 2.94484610617641, + "grad_norm": 6.242623805999756, + "learning_rate": 3.919462347821297e-05, + "loss": 1.6062, + "step": 17820 + }, + { + "epoch": 2.94649865730221, + "grad_norm": 14.694461822509766, + "learning_rate": 3.918544225931435e-05, + "loss": 1.6167, + "step": 17830 + }, + { + "epoch": 2.9481512084280106, + "grad_norm": 6.985626697540283, + "learning_rate": 3.917626104041573e-05, + "loss": 1.7133, + "step": 17840 + }, + { + "epoch": 2.949803759553811, + "grad_norm": 30.49583625793457, + "learning_rate": 3.9167079821517105e-05, + "loss": 1.6603, + "step": 17850 + }, + { + "epoch": 2.9514563106796117, + "grad_norm": 16.23304557800293, + "learning_rate": 3.915789860261849e-05, + "loss": 1.5594, + "step": 17860 + }, + { + "epoch": 2.953108861805412, + "grad_norm": 8.873785972595215, + "learning_rate": 3.9148717383719864e-05, + "loss": 1.6136, + "step": 17870 + }, + { + "epoch": 2.9547614129312123, + "grad_norm": 10.205568313598633, + "learning_rate": 3.9139536164821246e-05, + "loss": 1.5299, + "step": 17880 + }, + { + "epoch": 2.9564139640570133, + "grad_norm": 7.908629894256592, + "learning_rate": 3.913035494592262e-05, + "loss": 1.5855, + "step": 17890 + }, + { + "epoch": 2.9580665151828134, + "grad_norm": 14.08654499053955, + "learning_rate": 3.9121173727024005e-05, + "loss": 1.6594, + "step": 17900 + }, + { + "epoch": 2.959719066308614, + "grad_norm": 6.650778293609619, + "learning_rate": 3.911199250812538e-05, + "loss": 1.5049, + "step": 17910 + }, + { + "epoch": 2.9613716174344145, + "grad_norm": 14.497530937194824, + "learning_rate": 3.910281128922676e-05, + "loss": 1.5831, + "step": 17920 + }, + { + "epoch": 2.9630241685602146, + "grad_norm": 9.797993659973145, + "learning_rate": 3.909363007032814e-05, + "loss": 1.6278, + "step": 17930 + }, + { + "epoch": 2.9646767196860155, + "grad_norm": 8.208698272705078, + "learning_rate": 
3.9084448851429515e-05, + "loss": 1.7192, + "step": 17940 + }, + { + "epoch": 2.9663292708118156, + "grad_norm": 27.290040969848633, + "learning_rate": 3.90752676325309e-05, + "loss": 1.5312, + "step": 17950 + }, + { + "epoch": 2.967981821937616, + "grad_norm": 7.827337741851807, + "learning_rate": 3.9066086413632274e-05, + "loss": 1.4619, + "step": 17960 + }, + { + "epoch": 2.9696343730634167, + "grad_norm": 9.8374605178833, + "learning_rate": 3.9056905194733656e-05, + "loss": 1.6841, + "step": 17970 + }, + { + "epoch": 2.9712869241892172, + "grad_norm": 6.857816219329834, + "learning_rate": 3.904772397583503e-05, + "loss": 1.6227, + "step": 17980 + }, + { + "epoch": 2.972939475315018, + "grad_norm": 12.391343116760254, + "learning_rate": 3.9038542756936415e-05, + "loss": 1.6735, + "step": 17990 + }, + { + "epoch": 2.974592026440818, + "grad_norm": 9.983665466308594, + "learning_rate": 3.902936153803779e-05, + "loss": 1.5625, + "step": 18000 + }, + { + "epoch": 2.9762445775666184, + "grad_norm": 6.569074630737305, + "learning_rate": 3.902018031913917e-05, + "loss": 1.408, + "step": 18010 + }, + { + "epoch": 2.977897128692419, + "grad_norm": 10.561470985412598, + "learning_rate": 3.901099910024055e-05, + "loss": 1.5342, + "step": 18020 + }, + { + "epoch": 2.9795496798182195, + "grad_norm": 7.086844444274902, + "learning_rate": 3.900181788134193e-05, + "loss": 1.7286, + "step": 18030 + }, + { + "epoch": 2.98120223094402, + "grad_norm": 15.396596908569336, + "learning_rate": 3.899263666244331e-05, + "loss": 1.4908, + "step": 18040 + }, + { + "epoch": 2.98285478206982, + "grad_norm": 16.880029678344727, + "learning_rate": 3.8983455443544683e-05, + "loss": 1.623, + "step": 18050 + }, + { + "epoch": 2.9845073331956207, + "grad_norm": 35.76949691772461, + "learning_rate": 3.8974274224646066e-05, + "loss": 1.6083, + "step": 18060 + }, + { + "epoch": 2.986159884321421, + "grad_norm": 11.887407302856445, + "learning_rate": 3.896509300574744e-05, + "loss": 1.5319, + "step": 18070 + }, + { + "epoch": 2.9878124354472217, + "grad_norm": 15.86429500579834, + "learning_rate": 3.8955911786848825e-05, + "loss": 1.6974, + "step": 18080 + }, + { + "epoch": 2.9894649865730223, + "grad_norm": 45.645751953125, + "learning_rate": 3.89467305679502e-05, + "loss": 1.5547, + "step": 18090 + }, + { + "epoch": 2.9911175376988224, + "grad_norm": 4.425729274749756, + "learning_rate": 3.893754934905158e-05, + "loss": 1.6069, + "step": 18100 + }, + { + "epoch": 2.992770088824623, + "grad_norm": 14.894420623779297, + "learning_rate": 3.892836813015296e-05, + "loss": 1.4857, + "step": 18110 + }, + { + "epoch": 2.9944226399504235, + "grad_norm": 12.32193660736084, + "learning_rate": 3.891918691125434e-05, + "loss": 1.7373, + "step": 18120 + }, + { + "epoch": 2.996075191076224, + "grad_norm": 8.957423210144043, + "learning_rate": 3.891000569235572e-05, + "loss": 1.5757, + "step": 18130 + }, + { + "epoch": 2.9977277422020245, + "grad_norm": 12.539956092834473, + "learning_rate": 3.89008244734571e-05, + "loss": 1.6558, + "step": 18140 + }, + { + "epoch": 2.9993802933278246, + "grad_norm": 18.08677864074707, + "learning_rate": 3.889164325455848e-05, + "loss": 1.6866, + "step": 18150 + }, + { + "epoch": 2.999876058665565, + "eval_accuracy": 0.2888352957866364, + "eval_loss": 2.114047050476074, + "eval_runtime": 817.3743, + "eval_samples_per_second": 34.496, + "eval_steps_per_second": 8.624, + "step": 18153 + }, + { + "epoch": 3.001032844453625, + "grad_norm": 7.226766586303711, + "learning_rate": 3.888246203565986e-05, + "loss": 
1.481, + "step": 18160 + }, + { + "epoch": 3.0026853955794257, + "grad_norm": 6.795170307159424, + "learning_rate": 3.8873280816761234e-05, + "loss": 1.5827, + "step": 18170 + }, + { + "epoch": 3.0043379467052262, + "grad_norm": 9.203761100769043, + "learning_rate": 3.886409959786261e-05, + "loss": 1.5858, + "step": 18180 + }, + { + "epoch": 3.005990497831027, + "grad_norm": 8.978983879089355, + "learning_rate": 3.885491837896399e-05, + "loss": 1.6911, + "step": 18190 + }, + { + "epoch": 3.007643048956827, + "grad_norm": 6.757550239562988, + "learning_rate": 3.884573716006537e-05, + "loss": 1.4933, + "step": 18200 + }, + { + "epoch": 3.0092956000826274, + "grad_norm": 6.979025363922119, + "learning_rate": 3.883655594116675e-05, + "loss": 1.4942, + "step": 18210 + }, + { + "epoch": 3.010948151208428, + "grad_norm": 8.403514862060547, + "learning_rate": 3.882737472226813e-05, + "loss": 1.5651, + "step": 18220 + }, + { + "epoch": 3.0126007023342285, + "grad_norm": 16.475433349609375, + "learning_rate": 3.881819350336951e-05, + "loss": 1.5098, + "step": 18230 + }, + { + "epoch": 3.014253253460029, + "grad_norm": 10.676935195922852, + "learning_rate": 3.8809012284470886e-05, + "loss": 1.5301, + "step": 18240 + }, + { + "epoch": 3.0159058045858296, + "grad_norm": 80.31954193115234, + "learning_rate": 3.879983106557227e-05, + "loss": 1.5747, + "step": 18250 + }, + { + "epoch": 3.0175583557116297, + "grad_norm": 9.030322074890137, + "learning_rate": 3.879064984667365e-05, + "loss": 1.6205, + "step": 18260 + }, + { + "epoch": 3.01921090683743, + "grad_norm": 9.721772193908691, + "learning_rate": 3.878146862777503e-05, + "loss": 1.4312, + "step": 18270 + }, + { + "epoch": 3.0208634579632307, + "grad_norm": 8.545890808105469, + "learning_rate": 3.877228740887641e-05, + "loss": 1.6293, + "step": 18280 + }, + { + "epoch": 3.0225160090890313, + "grad_norm": 15.735355377197266, + "learning_rate": 3.8763106189977785e-05, + "loss": 1.5873, + "step": 18290 + }, + { + "epoch": 3.024168560214832, + "grad_norm": 7.491411209106445, + "learning_rate": 3.875392497107916e-05, + "loss": 1.5171, + "step": 18300 + }, + { + "epoch": 3.025821111340632, + "grad_norm": 8.309893608093262, + "learning_rate": 3.874474375218054e-05, + "loss": 1.4795, + "step": 18310 + }, + { + "epoch": 3.0274736624664325, + "grad_norm": 19.71273422241211, + "learning_rate": 3.873556253328192e-05, + "loss": 1.5505, + "step": 18320 + }, + { + "epoch": 3.029126213592233, + "grad_norm": 9.179100036621094, + "learning_rate": 3.8726381314383296e-05, + "loss": 1.6547, + "step": 18330 + }, + { + "epoch": 3.0307787647180335, + "grad_norm": 6.885425090789795, + "learning_rate": 3.871720009548468e-05, + "loss": 1.5285, + "step": 18340 + }, + { + "epoch": 3.032431315843834, + "grad_norm": 7.15316104888916, + "learning_rate": 3.8708018876586054e-05, + "loss": 1.4998, + "step": 18350 + }, + { + "epoch": 3.034083866969634, + "grad_norm": 7.689057350158691, + "learning_rate": 3.869883765768744e-05, + "loss": 1.5134, + "step": 18360 + }, + { + "epoch": 3.0357364180954347, + "grad_norm": 6.1142449378967285, + "learning_rate": 3.868965643878881e-05, + "loss": 1.5328, + "step": 18370 + }, + { + "epoch": 3.0373889692212352, + "grad_norm": 19.263931274414062, + "learning_rate": 3.8680475219890195e-05, + "loss": 1.5652, + "step": 18380 + }, + { + "epoch": 3.039041520347036, + "grad_norm": 17.659536361694336, + "learning_rate": 3.867129400099158e-05, + "loss": 1.6269, + "step": 18390 + }, + { + "epoch": 3.0406940714728363, + "grad_norm": 8.499923706054688, + 
"learning_rate": 3.8662112782092954e-05, + "loss": 1.5685, + "step": 18400 + }, + { + "epoch": 3.042346622598637, + "grad_norm": 7.187222480773926, + "learning_rate": 3.8652931563194336e-05, + "loss": 1.6992, + "step": 18410 + }, + { + "epoch": 3.043999173724437, + "grad_norm": 5.891327381134033, + "learning_rate": 3.864375034429571e-05, + "loss": 1.4802, + "step": 18420 + }, + { + "epoch": 3.0456517248502375, + "grad_norm": 17.82583999633789, + "learning_rate": 3.863456912539709e-05, + "loss": 1.6052, + "step": 18430 + }, + { + "epoch": 3.047304275976038, + "grad_norm": 19.411766052246094, + "learning_rate": 3.8625387906498464e-05, + "loss": 1.4634, + "step": 18440 + }, + { + "epoch": 3.0489568271018386, + "grad_norm": 9.680952072143555, + "learning_rate": 3.861620668759985e-05, + "loss": 1.6466, + "step": 18450 + }, + { + "epoch": 3.050609378227639, + "grad_norm": 7.9444990158081055, + "learning_rate": 3.860702546870122e-05, + "loss": 1.5706, + "step": 18460 + }, + { + "epoch": 3.052261929353439, + "grad_norm": 7.4515814781188965, + "learning_rate": 3.8597844249802605e-05, + "loss": 1.6711, + "step": 18470 + }, + { + "epoch": 3.0539144804792397, + "grad_norm": 14.548356056213379, + "learning_rate": 3.858866303090398e-05, + "loss": 1.6145, + "step": 18480 + }, + { + "epoch": 3.0555670316050403, + "grad_norm": 24.74275016784668, + "learning_rate": 3.8579481812005364e-05, + "loss": 1.542, + "step": 18490 + }, + { + "epoch": 3.057219582730841, + "grad_norm": 5.011672496795654, + "learning_rate": 3.8570300593106746e-05, + "loss": 1.5655, + "step": 18500 + }, + { + "epoch": 3.0588721338566414, + "grad_norm": 9.48055648803711, + "learning_rate": 3.856111937420812e-05, + "loss": 1.5704, + "step": 18510 + }, + { + "epoch": 3.0605246849824415, + "grad_norm": 17.483966827392578, + "learning_rate": 3.8551938155309505e-05, + "loss": 1.5234, + "step": 18520 + }, + { + "epoch": 3.062177236108242, + "grad_norm": 6.578785419464111, + "learning_rate": 3.854275693641088e-05, + "loss": 1.4352, + "step": 18530 + }, + { + "epoch": 3.0638297872340425, + "grad_norm": 9.402719497680664, + "learning_rate": 3.853357571751226e-05, + "loss": 1.6183, + "step": 18540 + }, + { + "epoch": 3.065482338359843, + "grad_norm": 17.517202377319336, + "learning_rate": 3.852439449861364e-05, + "loss": 1.6068, + "step": 18550 + }, + { + "epoch": 3.0671348894856436, + "grad_norm": 8.648771286010742, + "learning_rate": 3.8515213279715015e-05, + "loss": 1.6782, + "step": 18560 + }, + { + "epoch": 3.0687874406114437, + "grad_norm": 7.555728912353516, + "learning_rate": 3.850603206081639e-05, + "loss": 1.5315, + "step": 18570 + }, + { + "epoch": 3.0704399917372442, + "grad_norm": 43.67970657348633, + "learning_rate": 3.8496850841917773e-05, + "loss": 1.5716, + "step": 18580 + }, + { + "epoch": 3.072092542863045, + "grad_norm": 11.917938232421875, + "learning_rate": 3.848766962301915e-05, + "loss": 1.4323, + "step": 18590 + }, + { + "epoch": 3.0737450939888453, + "grad_norm": 6.986249923706055, + "learning_rate": 3.847848840412053e-05, + "loss": 1.4702, + "step": 18600 + }, + { + "epoch": 3.075397645114646, + "grad_norm": 12.20637035369873, + "learning_rate": 3.8469307185221915e-05, + "loss": 1.568, + "step": 18610 + }, + { + "epoch": 3.0770501962404464, + "grad_norm": 29.53895378112793, + "learning_rate": 3.846012596632329e-05, + "loss": 1.5619, + "step": 18620 + }, + { + "epoch": 3.0787027473662465, + "grad_norm": 19.85051918029785, + "learning_rate": 3.845094474742467e-05, + "loss": 1.6694, + "step": 18630 + }, + { + "epoch": 
3.080355298492047, + "grad_norm": 6.126530170440674, + "learning_rate": 3.844176352852605e-05, + "loss": 1.5997, + "step": 18640 + }, + { + "epoch": 3.0820078496178476, + "grad_norm": 6.928379058837891, + "learning_rate": 3.843258230962743e-05, + "loss": 1.6335, + "step": 18650 + }, + { + "epoch": 3.083660400743648, + "grad_norm": 6.429804801940918, + "learning_rate": 3.842340109072881e-05, + "loss": 1.6117, + "step": 18660 + }, + { + "epoch": 3.0853129518694486, + "grad_norm": 6.85563850402832, + "learning_rate": 3.841421987183019e-05, + "loss": 1.5142, + "step": 18670 + }, + { + "epoch": 3.0869655029952487, + "grad_norm": 8.979350090026855, + "learning_rate": 3.8405038652931566e-05, + "loss": 1.5052, + "step": 18680 + }, + { + "epoch": 3.0886180541210493, + "grad_norm": 8.090424537658691, + "learning_rate": 3.839585743403294e-05, + "loss": 1.5765, + "step": 18690 + }, + { + "epoch": 3.09027060524685, + "grad_norm": 12.26007080078125, + "learning_rate": 3.838667621513432e-05, + "loss": 1.6073, + "step": 18700 + }, + { + "epoch": 3.0919231563726504, + "grad_norm": 13.63984489440918, + "learning_rate": 3.83774949962357e-05, + "loss": 1.6378, + "step": 18710 + }, + { + "epoch": 3.093575707498451, + "grad_norm": 35.192535400390625, + "learning_rate": 3.836831377733708e-05, + "loss": 1.7231, + "step": 18720 + }, + { + "epoch": 3.095228258624251, + "grad_norm": 10.192204475402832, + "learning_rate": 3.835913255843846e-05, + "loss": 1.5279, + "step": 18730 + }, + { + "epoch": 3.0968808097500515, + "grad_norm": 5.712940216064453, + "learning_rate": 3.834995133953984e-05, + "loss": 1.5624, + "step": 18740 + }, + { + "epoch": 3.098533360875852, + "grad_norm": 7.214580535888672, + "learning_rate": 3.834077012064122e-05, + "loss": 1.672, + "step": 18750 + }, + { + "epoch": 3.1001859120016526, + "grad_norm": 19.402013778686523, + "learning_rate": 3.83315889017426e-05, + "loss": 1.6678, + "step": 18760 + }, + { + "epoch": 3.101838463127453, + "grad_norm": 7.78207540512085, + "learning_rate": 3.8322407682843976e-05, + "loss": 1.6826, + "step": 18770 + }, + { + "epoch": 3.1034910142532537, + "grad_norm": 6.110933303833008, + "learning_rate": 3.831322646394536e-05, + "loss": 1.5121, + "step": 18780 + }, + { + "epoch": 3.105143565379054, + "grad_norm": 5.636849403381348, + "learning_rate": 3.8304045245046734e-05, + "loss": 1.4881, + "step": 18790 + }, + { + "epoch": 3.1067961165048543, + "grad_norm": 8.485923767089844, + "learning_rate": 3.829486402614812e-05, + "loss": 1.6185, + "step": 18800 + }, + { + "epoch": 3.108448667630655, + "grad_norm": 7.813500881195068, + "learning_rate": 3.828568280724949e-05, + "loss": 1.645, + "step": 18810 + }, + { + "epoch": 3.1101012187564554, + "grad_norm": 8.218082427978516, + "learning_rate": 3.827650158835087e-05, + "loss": 1.601, + "step": 18820 + }, + { + "epoch": 3.111753769882256, + "grad_norm": 11.229850769042969, + "learning_rate": 3.826732036945225e-05, + "loss": 1.5679, + "step": 18830 + }, + { + "epoch": 3.113406321008056, + "grad_norm": 8.275188446044922, + "learning_rate": 3.825813915055363e-05, + "loss": 1.5716, + "step": 18840 + }, + { + "epoch": 3.1150588721338566, + "grad_norm": 7.776087284088135, + "learning_rate": 3.824895793165501e-05, + "loss": 1.4384, + "step": 18850 + }, + { + "epoch": 3.116711423259657, + "grad_norm": 37.47963333129883, + "learning_rate": 3.8239776712756386e-05, + "loss": 1.6417, + "step": 18860 + }, + { + "epoch": 3.1183639743854576, + "grad_norm": 8.274860382080078, + "learning_rate": 3.823059549385777e-05, + "loss": 1.5703, + 
"step": 18870 + }, + { + "epoch": 3.120016525511258, + "grad_norm": 8.022534370422363, + "learning_rate": 3.8221414274959144e-05, + "loss": 1.5707, + "step": 18880 + }, + { + "epoch": 3.1216690766370583, + "grad_norm": 5.824304103851318, + "learning_rate": 3.821223305606053e-05, + "loss": 1.4785, + "step": 18890 + }, + { + "epoch": 3.123321627762859, + "grad_norm": 9.422294616699219, + "learning_rate": 3.82030518371619e-05, + "loss": 1.5626, + "step": 18900 + }, + { + "epoch": 3.1249741788886594, + "grad_norm": 6.564004898071289, + "learning_rate": 3.8193870618263285e-05, + "loss": 1.6056, + "step": 18910 + }, + { + "epoch": 3.12662673001446, + "grad_norm": 10.041267395019531, + "learning_rate": 3.818468939936466e-05, + "loss": 1.4272, + "step": 18920 + }, + { + "epoch": 3.1282792811402604, + "grad_norm": 36.88932800292969, + "learning_rate": 3.8175508180466044e-05, + "loss": 1.3849, + "step": 18930 + }, + { + "epoch": 3.1299318322660605, + "grad_norm": 13.796693801879883, + "learning_rate": 3.816632696156742e-05, + "loss": 1.5946, + "step": 18940 + }, + { + "epoch": 3.131584383391861, + "grad_norm": 14.519100189208984, + "learning_rate": 3.8157145742668795e-05, + "loss": 1.532, + "step": 18950 + }, + { + "epoch": 3.1332369345176616, + "grad_norm": 9.376446723937988, + "learning_rate": 3.814796452377018e-05, + "loss": 1.683, + "step": 18960 + }, + { + "epoch": 3.134889485643462, + "grad_norm": 9.751054763793945, + "learning_rate": 3.8138783304871554e-05, + "loss": 1.5191, + "step": 18970 + }, + { + "epoch": 3.1365420367692627, + "grad_norm": 11.590496063232422, + "learning_rate": 3.8129602085972937e-05, + "loss": 1.4696, + "step": 18980 + }, + { + "epoch": 3.138194587895063, + "grad_norm": 18.807069778442383, + "learning_rate": 3.812042086707431e-05, + "loss": 1.5602, + "step": 18990 + }, + { + "epoch": 3.1398471390208633, + "grad_norm": 6.233638286590576, + "learning_rate": 3.8111239648175695e-05, + "loss": 1.5646, + "step": 19000 + }, + { + "epoch": 3.141499690146664, + "grad_norm": 6.848485469818115, + "learning_rate": 3.810205842927707e-05, + "loss": 1.5728, + "step": 19010 + }, + { + "epoch": 3.1431522412724644, + "grad_norm": 18.452594757080078, + "learning_rate": 3.8092877210378454e-05, + "loss": 1.6127, + "step": 19020 + }, + { + "epoch": 3.144804792398265, + "grad_norm": 59.3503303527832, + "learning_rate": 3.808369599147983e-05, + "loss": 1.6197, + "step": 19030 + }, + { + "epoch": 3.1464573435240655, + "grad_norm": 11.30215835571289, + "learning_rate": 3.807451477258121e-05, + "loss": 1.5164, + "step": 19040 + }, + { + "epoch": 3.1481098946498656, + "grad_norm": 8.094250679016113, + "learning_rate": 3.806533355368259e-05, + "loss": 1.4852, + "step": 19050 + }, + { + "epoch": 3.149762445775666, + "grad_norm": 8.159111022949219, + "learning_rate": 3.805615233478397e-05, + "loss": 1.6521, + "step": 19060 + }, + { + "epoch": 3.1514149969014666, + "grad_norm": 27.37004280090332, + "learning_rate": 3.8046971115885346e-05, + "loss": 1.4812, + "step": 19070 + }, + { + "epoch": 3.153067548027267, + "grad_norm": 9.601912498474121, + "learning_rate": 3.803778989698672e-05, + "loss": 1.6315, + "step": 19080 + }, + { + "epoch": 3.1547200991530677, + "grad_norm": 10.574562072753906, + "learning_rate": 3.8028608678088105e-05, + "loss": 1.5586, + "step": 19090 + }, + { + "epoch": 3.156372650278868, + "grad_norm": 5.611733913421631, + "learning_rate": 3.801942745918948e-05, + "loss": 1.5879, + "step": 19100 + }, + { + "epoch": 3.1580252014046684, + "grad_norm": 24.429452896118164, + 
"learning_rate": 3.8010246240290863e-05, + "loss": 1.5259, + "step": 19110 + }, + { + "epoch": 3.159677752530469, + "grad_norm": 6.225391864776611, + "learning_rate": 3.800106502139224e-05, + "loss": 1.5108, + "step": 19120 + }, + { + "epoch": 3.1613303036562694, + "grad_norm": 8.502754211425781, + "learning_rate": 3.799188380249362e-05, + "loss": 1.5145, + "step": 19130 + }, + { + "epoch": 3.16298285478207, + "grad_norm": 17.88811683654785, + "learning_rate": 3.7982702583595e-05, + "loss": 1.6389, + "step": 19140 + }, + { + "epoch": 3.16463540590787, + "grad_norm": 13.118366241455078, + "learning_rate": 3.797352136469638e-05, + "loss": 1.5567, + "step": 19150 + }, + { + "epoch": 3.1662879570336706, + "grad_norm": 8.860665321350098, + "learning_rate": 3.7964340145797756e-05, + "loss": 1.6773, + "step": 19160 + }, + { + "epoch": 3.167940508159471, + "grad_norm": 6.436194896697998, + "learning_rate": 3.795515892689914e-05, + "loss": 1.4645, + "step": 19170 + }, + { + "epoch": 3.1695930592852717, + "grad_norm": 11.198339462280273, + "learning_rate": 3.794597770800052e-05, + "loss": 1.5443, + "step": 19180 + }, + { + "epoch": 3.1712456104110722, + "grad_norm": 15.714473724365234, + "learning_rate": 3.79367964891019e-05, + "loss": 1.5452, + "step": 19190 + }, + { + "epoch": 3.1728981615368728, + "grad_norm": 17.801097869873047, + "learning_rate": 3.792761527020327e-05, + "loss": 1.5366, + "step": 19200 + }, + { + "epoch": 3.174550712662673, + "grad_norm": 11.515276908874512, + "learning_rate": 3.791843405130465e-05, + "loss": 1.6625, + "step": 19210 + }, + { + "epoch": 3.1762032637884734, + "grad_norm": 6.000487804412842, + "learning_rate": 3.790925283240603e-05, + "loss": 1.4931, + "step": 19220 + }, + { + "epoch": 3.177855814914274, + "grad_norm": 8.129054069519043, + "learning_rate": 3.790007161350741e-05, + "loss": 1.7251, + "step": 19230 + }, + { + "epoch": 3.1795083660400745, + "grad_norm": 14.319493293762207, + "learning_rate": 3.789089039460879e-05, + "loss": 1.6495, + "step": 19240 + }, + { + "epoch": 3.181160917165875, + "grad_norm": 30.3303165435791, + "learning_rate": 3.7881709175710166e-05, + "loss": 1.4484, + "step": 19250 + }, + { + "epoch": 3.182813468291675, + "grad_norm": 10.565496444702148, + "learning_rate": 3.787252795681155e-05, + "loss": 1.6871, + "step": 19260 + }, + { + "epoch": 3.1844660194174756, + "grad_norm": 9.682084083557129, + "learning_rate": 3.7863346737912925e-05, + "loss": 1.6799, + "step": 19270 + }, + { + "epoch": 3.186118570543276, + "grad_norm": 6.963440418243408, + "learning_rate": 3.785416551901431e-05, + "loss": 1.6251, + "step": 19280 + }, + { + "epoch": 3.1877711216690767, + "grad_norm": 20.987451553344727, + "learning_rate": 3.784498430011569e-05, + "loss": 1.4718, + "step": 19290 + }, + { + "epoch": 3.1894236727948773, + "grad_norm": 12.118252754211426, + "learning_rate": 3.7835803081217066e-05, + "loss": 1.5291, + "step": 19300 + }, + { + "epoch": 3.1910762239206774, + "grad_norm": 8.367793083190918, + "learning_rate": 3.782662186231845e-05, + "loss": 1.5914, + "step": 19310 + }, + { + "epoch": 3.192728775046478, + "grad_norm": 7.3973164558410645, + "learning_rate": 3.7817440643419824e-05, + "loss": 1.4778, + "step": 19320 + }, + { + "epoch": 3.1943813261722784, + "grad_norm": 28.844614028930664, + "learning_rate": 3.78082594245212e-05, + "loss": 1.4982, + "step": 19330 + }, + { + "epoch": 3.196033877298079, + "grad_norm": 14.039192199707031, + "learning_rate": 3.7799078205622576e-05, + "loss": 1.5076, + "step": 19340 + }, + { + "epoch": 
3.1976864284238795, + "grad_norm": 6.318631172180176, + "learning_rate": 3.778989698672396e-05, + "loss": 1.5983, + "step": 19350 + }, + { + "epoch": 3.1993389795496796, + "grad_norm": 11.539376258850098, + "learning_rate": 3.7780715767825334e-05, + "loss": 1.6026, + "step": 19360 + }, + { + "epoch": 3.20099153067548, + "grad_norm": 15.837471961975098, + "learning_rate": 3.777153454892672e-05, + "loss": 1.6435, + "step": 19370 + }, + { + "epoch": 3.2026440818012807, + "grad_norm": 10.379616737365723, + "learning_rate": 3.776235333002809e-05, + "loss": 1.6636, + "step": 19380 + }, + { + "epoch": 3.2042966329270812, + "grad_norm": 18.38787841796875, + "learning_rate": 3.7753172111129476e-05, + "loss": 1.5559, + "step": 19390 + }, + { + "epoch": 3.2059491840528818, + "grad_norm": 22.003950119018555, + "learning_rate": 3.774399089223085e-05, + "loss": 1.5326, + "step": 19400 + }, + { + "epoch": 3.207601735178682, + "grad_norm": 6.592588424682617, + "learning_rate": 3.7734809673332234e-05, + "loss": 1.4978, + "step": 19410 + }, + { + "epoch": 3.2092542863044824, + "grad_norm": 23.952259063720703, + "learning_rate": 3.772562845443362e-05, + "loss": 1.5836, + "step": 19420 + }, + { + "epoch": 3.210906837430283, + "grad_norm": 16.6333065032959, + "learning_rate": 3.771644723553499e-05, + "loss": 1.61, + "step": 19430 + }, + { + "epoch": 3.2125593885560835, + "grad_norm": 11.940427780151367, + "learning_rate": 3.7707266016636375e-05, + "loss": 1.4965, + "step": 19440 + }, + { + "epoch": 3.214211939681884, + "grad_norm": 49.953617095947266, + "learning_rate": 3.769808479773775e-05, + "loss": 1.3932, + "step": 19450 + }, + { + "epoch": 3.2158644908076846, + "grad_norm": 11.84276008605957, + "learning_rate": 3.768890357883913e-05, + "loss": 1.5016, + "step": 19460 + }, + { + "epoch": 3.2175170419334846, + "grad_norm": 17.914264678955078, + "learning_rate": 3.76797223599405e-05, + "loss": 1.5665, + "step": 19470 + }, + { + "epoch": 3.219169593059285, + "grad_norm": 8.291590690612793, + "learning_rate": 3.7670541141041885e-05, + "loss": 1.5544, + "step": 19480 + }, + { + "epoch": 3.2208221441850857, + "grad_norm": 11.181132316589355, + "learning_rate": 3.766135992214326e-05, + "loss": 1.6092, + "step": 19490 + }, + { + "epoch": 3.2224746953108863, + "grad_norm": 7.685586452484131, + "learning_rate": 3.7652178703244644e-05, + "loss": 1.6567, + "step": 19500 + }, + { + "epoch": 3.224127246436687, + "grad_norm": 9.388480186462402, + "learning_rate": 3.764299748434602e-05, + "loss": 1.7372, + "step": 19510 + }, + { + "epoch": 3.225779797562487, + "grad_norm": 6.502718448638916, + "learning_rate": 3.76338162654474e-05, + "loss": 1.5202, + "step": 19520 + }, + { + "epoch": 3.2274323486882874, + "grad_norm": 19.254741668701172, + "learning_rate": 3.7624635046548785e-05, + "loss": 1.5534, + "step": 19530 + }, + { + "epoch": 3.229084899814088, + "grad_norm": 12.590047836303711, + "learning_rate": 3.761545382765016e-05, + "loss": 1.4716, + "step": 19540 + }, + { + "epoch": 3.2307374509398885, + "grad_norm": 58.66154479980469, + "learning_rate": 3.7606272608751544e-05, + "loss": 1.5091, + "step": 19550 + }, + { + "epoch": 3.232390002065689, + "grad_norm": 9.476922035217285, + "learning_rate": 3.759709138985292e-05, + "loss": 1.5869, + "step": 19560 + }, + { + "epoch": 3.2340425531914896, + "grad_norm": 62.604740142822266, + "learning_rate": 3.75879101709543e-05, + "loss": 1.5135, + "step": 19570 + }, + { + "epoch": 3.2356951043172897, + "grad_norm": 7.957396507263184, + "learning_rate": 3.757872895205568e-05, + 
"loss": 1.5217, + "step": 19580 + }, + { + "epoch": 3.23734765544309, + "grad_norm": 6.035177230834961, + "learning_rate": 3.7569547733157054e-05, + "loss": 1.4088, + "step": 19590 + }, + { + "epoch": 3.2390002065688908, + "grad_norm": 33.28099822998047, + "learning_rate": 3.756036651425843e-05, + "loss": 1.5084, + "step": 19600 + }, + { + "epoch": 3.2406527576946913, + "grad_norm": 10.489121437072754, + "learning_rate": 3.755118529535981e-05, + "loss": 1.5133, + "step": 19610 + }, + { + "epoch": 3.242305308820492, + "grad_norm": 9.935800552368164, + "learning_rate": 3.754200407646119e-05, + "loss": 1.5428, + "step": 19620 + }, + { + "epoch": 3.243957859946292, + "grad_norm": 8.586844444274902, + "learning_rate": 3.753282285756257e-05, + "loss": 1.4201, + "step": 19630 + }, + { + "epoch": 3.2456104110720925, + "grad_norm": 24.38202476501465, + "learning_rate": 3.752364163866395e-05, + "loss": 1.5493, + "step": 19640 + }, + { + "epoch": 3.247262962197893, + "grad_norm": 9.439826965332031, + "learning_rate": 3.751446041976533e-05, + "loss": 1.5399, + "step": 19650 + }, + { + "epoch": 3.2489155133236935, + "grad_norm": 12.375255584716797, + "learning_rate": 3.750527920086671e-05, + "loss": 1.511, + "step": 19660 + }, + { + "epoch": 3.250568064449494, + "grad_norm": 9.748537063598633, + "learning_rate": 3.749609798196809e-05, + "loss": 1.5526, + "step": 19670 + }, + { + "epoch": 3.252220615575294, + "grad_norm": 6.342364311218262, + "learning_rate": 3.748691676306947e-05, + "loss": 1.513, + "step": 19680 + }, + { + "epoch": 3.2538731667010947, + "grad_norm": 23.94266700744629, + "learning_rate": 3.7477735544170846e-05, + "loss": 1.5372, + "step": 19690 + }, + { + "epoch": 3.2555257178268953, + "grad_norm": 15.126330375671387, + "learning_rate": 3.746855432527223e-05, + "loss": 1.5006, + "step": 19700 + }, + { + "epoch": 3.257178268952696, + "grad_norm": 23.147714614868164, + "learning_rate": 3.7459373106373605e-05, + "loss": 1.6257, + "step": 19710 + }, + { + "epoch": 3.2588308200784963, + "grad_norm": 6.821142673492432, + "learning_rate": 3.745019188747498e-05, + "loss": 1.3936, + "step": 19720 + }, + { + "epoch": 3.2604833712042964, + "grad_norm": 7.183291912078857, + "learning_rate": 3.7441010668576356e-05, + "loss": 1.4941, + "step": 19730 + }, + { + "epoch": 3.262135922330097, + "grad_norm": 9.05080509185791, + "learning_rate": 3.743182944967774e-05, + "loss": 1.466, + "step": 19740 + }, + { + "epoch": 3.2637884734558975, + "grad_norm": 30.209848403930664, + "learning_rate": 3.742264823077912e-05, + "loss": 1.6091, + "step": 19750 + }, + { + "epoch": 3.265441024581698, + "grad_norm": 7.082402229309082, + "learning_rate": 3.74134670118805e-05, + "loss": 1.6003, + "step": 19760 + }, + { + "epoch": 3.2670935757074986, + "grad_norm": 17.827167510986328, + "learning_rate": 3.740428579298188e-05, + "loss": 1.4811, + "step": 19770 + }, + { + "epoch": 3.2687461268332987, + "grad_norm": 10.344867706298828, + "learning_rate": 3.7395104574083256e-05, + "loss": 1.5169, + "step": 19780 + }, + { + "epoch": 3.270398677959099, + "grad_norm": 12.192914009094238, + "learning_rate": 3.738592335518464e-05, + "loss": 1.4465, + "step": 19790 + }, + { + "epoch": 3.2720512290848998, + "grad_norm": 5.389448165893555, + "learning_rate": 3.7376742136286015e-05, + "loss": 1.4501, + "step": 19800 + }, + { + "epoch": 3.2737037802107003, + "grad_norm": 10.874370574951172, + "learning_rate": 3.73675609173874e-05, + "loss": 1.5779, + "step": 19810 + }, + { + "epoch": 3.275356331336501, + "grad_norm": 12.324655532836914, 
+ "learning_rate": 3.735837969848877e-05, + "loss": 1.459, + "step": 19820 + }, + { + "epoch": 3.277008882462301, + "grad_norm": 10.794198036193848, + "learning_rate": 3.7349198479590156e-05, + "loss": 1.4958, + "step": 19830 + }, + { + "epoch": 3.2786614335881015, + "grad_norm": 14.633699417114258, + "learning_rate": 3.734001726069153e-05, + "loss": 1.5338, + "step": 19840 + }, + { + "epoch": 3.280313984713902, + "grad_norm": 8.332018852233887, + "learning_rate": 3.733083604179291e-05, + "loss": 1.4962, + "step": 19850 + }, + { + "epoch": 3.2819665358397025, + "grad_norm": 9.715315818786621, + "learning_rate": 3.732165482289429e-05, + "loss": 1.5335, + "step": 19860 + }, + { + "epoch": 3.283619086965503, + "grad_norm": 13.441153526306152, + "learning_rate": 3.7312473603995666e-05, + "loss": 1.5538, + "step": 19870 + }, + { + "epoch": 3.2852716380913036, + "grad_norm": 16.49589729309082, + "learning_rate": 3.730329238509705e-05, + "loss": 1.5301, + "step": 19880 + }, + { + "epoch": 3.2869241892171037, + "grad_norm": 31.22637939453125, + "learning_rate": 3.7294111166198424e-05, + "loss": 1.6213, + "step": 19890 + }, + { + "epoch": 3.2885767403429043, + "grad_norm": 8.242454528808594, + "learning_rate": 3.728492994729981e-05, + "loss": 1.6577, + "step": 19900 + }, + { + "epoch": 3.290229291468705, + "grad_norm": 8.728832244873047, + "learning_rate": 3.727574872840118e-05, + "loss": 1.5487, + "step": 19910 + }, + { + "epoch": 3.2918818425945053, + "grad_norm": 7.110820770263672, + "learning_rate": 3.7266567509502566e-05, + "loss": 1.6357, + "step": 19920 + }, + { + "epoch": 3.293534393720306, + "grad_norm": 6.833790302276611, + "learning_rate": 3.725738629060394e-05, + "loss": 1.5007, + "step": 19930 + }, + { + "epoch": 3.2951869448461064, + "grad_norm": 10.365285873413086, + "learning_rate": 3.7248205071705324e-05, + "loss": 1.6454, + "step": 19940 + }, + { + "epoch": 3.2968394959719065, + "grad_norm": 10.187037467956543, + "learning_rate": 3.72390238528067e-05, + "loss": 1.4288, + "step": 19950 + }, + { + "epoch": 3.298492047097707, + "grad_norm": 33.366790771484375, + "learning_rate": 3.722984263390808e-05, + "loss": 1.6844, + "step": 19960 + }, + { + "epoch": 3.3001445982235076, + "grad_norm": 6.164004802703857, + "learning_rate": 3.722066141500946e-05, + "loss": 1.5483, + "step": 19970 + }, + { + "epoch": 3.301797149349308, + "grad_norm": 14.696124076843262, + "learning_rate": 3.7211480196110834e-05, + "loss": 1.5542, + "step": 19980 + }, + { + "epoch": 3.3034497004751087, + "grad_norm": 11.939413070678711, + "learning_rate": 3.720229897721222e-05, + "loss": 1.5707, + "step": 19990 + }, + { + "epoch": 3.3051022516009088, + "grad_norm": 6.623092174530029, + "learning_rate": 3.719311775831359e-05, + "loss": 1.5703, + "step": 20000 + }, + { + "epoch": 3.3067548027267093, + "grad_norm": 7.930966854095459, + "learning_rate": 3.7183936539414975e-05, + "loss": 1.4843, + "step": 20010 + }, + { + "epoch": 3.30840735385251, + "grad_norm": 5.481695652008057, + "learning_rate": 3.717475532051635e-05, + "loss": 1.5541, + "step": 20020 + }, + { + "epoch": 3.3100599049783104, + "grad_norm": 6.865151405334473, + "learning_rate": 3.7165574101617734e-05, + "loss": 1.5562, + "step": 20030 + }, + { + "epoch": 3.311712456104111, + "grad_norm": 9.590388298034668, + "learning_rate": 3.715639288271911e-05, + "loss": 1.6177, + "step": 20040 + }, + { + "epoch": 3.313365007229911, + "grad_norm": 7.312343120574951, + "learning_rate": 3.714721166382049e-05, + "loss": 1.7381, + "step": 20050 + }, + { + "epoch": 
3.3150175583557115, + "grad_norm": 9.11748218536377, + "learning_rate": 3.713803044492187e-05, + "loss": 1.5066, + "step": 20060 + }, + { + "epoch": 3.316670109481512, + "grad_norm": 7.51953649520874, + "learning_rate": 3.712884922602325e-05, + "loss": 1.7568, + "step": 20070 + }, + { + "epoch": 3.3183226606073126, + "grad_norm": 7.698461532592773, + "learning_rate": 3.711966800712463e-05, + "loss": 1.4384, + "step": 20080 + }, + { + "epoch": 3.319975211733113, + "grad_norm": 4.527771949768066, + "learning_rate": 3.711048678822601e-05, + "loss": 1.5915, + "step": 20090 + }, + { + "epoch": 3.3216277628589133, + "grad_norm": 7.3180131912231445, + "learning_rate": 3.7101305569327385e-05, + "loss": 1.5506, + "step": 20100 + }, + { + "epoch": 3.323280313984714, + "grad_norm": 5.010993480682373, + "learning_rate": 3.709212435042876e-05, + "loss": 1.4742, + "step": 20110 + }, + { + "epoch": 3.3249328651105143, + "grad_norm": 15.535231590270996, + "learning_rate": 3.7082943131530144e-05, + "loss": 1.5647, + "step": 20120 + }, + { + "epoch": 3.326585416236315, + "grad_norm": 7.405104637145996, + "learning_rate": 3.707376191263152e-05, + "loss": 1.3815, + "step": 20130 + }, + { + "epoch": 3.3282379673621154, + "grad_norm": 6.594423770904541, + "learning_rate": 3.70645806937329e-05, + "loss": 1.5013, + "step": 20140 + }, + { + "epoch": 3.3298905184879155, + "grad_norm": 16.589862823486328, + "learning_rate": 3.705539947483428e-05, + "loss": 1.5586, + "step": 20150 + }, + { + "epoch": 3.331543069613716, + "grad_norm": 21.43290138244629, + "learning_rate": 3.704621825593566e-05, + "loss": 1.4964, + "step": 20160 + }, + { + "epoch": 3.3331956207395166, + "grad_norm": 13.260900497436523, + "learning_rate": 3.7037037037037037e-05, + "loss": 1.5953, + "step": 20170 + }, + { + "epoch": 3.334848171865317, + "grad_norm": 7.0593037605285645, + "learning_rate": 3.702785581813842e-05, + "loss": 1.6032, + "step": 20180 + }, + { + "epoch": 3.3365007229911177, + "grad_norm": 5.94556188583374, + "learning_rate": 3.7018674599239795e-05, + "loss": 1.677, + "step": 20190 + }, + { + "epoch": 3.3381532741169178, + "grad_norm": 6.961794376373291, + "learning_rate": 3.700949338034118e-05, + "loss": 1.3566, + "step": 20200 + }, + { + "epoch": 3.3398058252427183, + "grad_norm": 12.444746017456055, + "learning_rate": 3.700031216144256e-05, + "loss": 1.6006, + "step": 20210 + }, + { + "epoch": 3.341458376368519, + "grad_norm": 7.811690330505371, + "learning_rate": 3.6991130942543936e-05, + "loss": 1.557, + "step": 20220 + }, + { + "epoch": 3.3431109274943194, + "grad_norm": 58.86392593383789, + "learning_rate": 3.698194972364531e-05, + "loss": 1.6786, + "step": 20230 + }, + { + "epoch": 3.34476347862012, + "grad_norm": 13.110153198242188, + "learning_rate": 3.697276850474669e-05, + "loss": 1.5049, + "step": 20240 + }, + { + "epoch": 3.3464160297459205, + "grad_norm": 26.155275344848633, + "learning_rate": 3.696358728584807e-05, + "loss": 1.6196, + "step": 20250 + }, + { + "epoch": 3.3480685808717205, + "grad_norm": 9.868633270263672, + "learning_rate": 3.6954406066949446e-05, + "loss": 1.52, + "step": 20260 + }, + { + "epoch": 3.349721131997521, + "grad_norm": 6.902059078216553, + "learning_rate": 3.694522484805083e-05, + "loss": 1.6092, + "step": 20270 + }, + { + "epoch": 3.3513736831233216, + "grad_norm": 13.972870826721191, + "learning_rate": 3.6936043629152205e-05, + "loss": 1.4675, + "step": 20280 + }, + { + "epoch": 3.353026234249122, + "grad_norm": 4.600803852081299, + "learning_rate": 3.692686241025359e-05, + "loss": 
1.5782, + "step": 20290 + }, + { + "epoch": 3.3546787853749227, + "grad_norm": 29.91995620727539, + "learning_rate": 3.691768119135496e-05, + "loss": 1.6141, + "step": 20300 + }, + { + "epoch": 3.3563313365007232, + "grad_norm": 13.344480514526367, + "learning_rate": 3.6908499972456346e-05, + "loss": 1.5646, + "step": 20310 + }, + { + "epoch": 3.3579838876265233, + "grad_norm": 36.023399353027344, + "learning_rate": 3.689931875355773e-05, + "loss": 1.5179, + "step": 20320 + }, + { + "epoch": 3.359636438752324, + "grad_norm": 14.964554786682129, + "learning_rate": 3.6890137534659105e-05, + "loss": 1.523, + "step": 20330 + }, + { + "epoch": 3.3612889898781244, + "grad_norm": 10.848349571228027, + "learning_rate": 3.688095631576049e-05, + "loss": 1.8119, + "step": 20340 + }, + { + "epoch": 3.362941541003925, + "grad_norm": 13.974653244018555, + "learning_rate": 3.687177509686186e-05, + "loss": 1.6156, + "step": 20350 + }, + { + "epoch": 3.3645940921297255, + "grad_norm": 6.851309776306152, + "learning_rate": 3.686259387796324e-05, + "loss": 1.5522, + "step": 20360 + }, + { + "epoch": 3.3662466432555256, + "grad_norm": 7.36599063873291, + "learning_rate": 3.6853412659064615e-05, + "loss": 1.6563, + "step": 20370 + }, + { + "epoch": 3.367899194381326, + "grad_norm": 8.092686653137207, + "learning_rate": 3.6844231440166e-05, + "loss": 1.4674, + "step": 20380 + }, + { + "epoch": 3.3695517455071267, + "grad_norm": 13.80720329284668, + "learning_rate": 3.683505022126737e-05, + "loss": 1.6301, + "step": 20390 + }, + { + "epoch": 3.371204296632927, + "grad_norm": 15.420303344726562, + "learning_rate": 3.6825869002368756e-05, + "loss": 1.5869, + "step": 20400 + }, + { + "epoch": 3.3728568477587277, + "grad_norm": 8.8609037399292, + "learning_rate": 3.681668778347013e-05, + "loss": 1.3465, + "step": 20410 + }, + { + "epoch": 3.374509398884528, + "grad_norm": 12.477286338806152, + "learning_rate": 3.6807506564571514e-05, + "loss": 1.6163, + "step": 20420 + }, + { + "epoch": 3.3761619500103284, + "grad_norm": 18.01897430419922, + "learning_rate": 3.67983253456729e-05, + "loss": 1.5364, + "step": 20430 + }, + { + "epoch": 3.377814501136129, + "grad_norm": 13.501622200012207, + "learning_rate": 3.678914412677427e-05, + "loss": 1.4659, + "step": 20440 + }, + { + "epoch": 3.3794670522619294, + "grad_norm": 16.007644653320312, + "learning_rate": 3.6779962907875655e-05, + "loss": 1.6269, + "step": 20450 + }, + { + "epoch": 3.38111960338773, + "grad_norm": 6.28301477432251, + "learning_rate": 3.677078168897703e-05, + "loss": 1.4513, + "step": 20460 + }, + { + "epoch": 3.38277215451353, + "grad_norm": 8.616754531860352, + "learning_rate": 3.6761600470078414e-05, + "loss": 1.4903, + "step": 20470 + }, + { + "epoch": 3.3844247056393306, + "grad_norm": 9.9006986618042, + "learning_rate": 3.675241925117979e-05, + "loss": 1.5364, + "step": 20480 + }, + { + "epoch": 3.386077256765131, + "grad_norm": 11.879949569702148, + "learning_rate": 3.6743238032281166e-05, + "loss": 1.6025, + "step": 20490 + }, + { + "epoch": 3.3877298078909317, + "grad_norm": 28.69721221923828, + "learning_rate": 3.673405681338254e-05, + "loss": 1.5899, + "step": 20500 + }, + { + "epoch": 3.3893823590167322, + "grad_norm": 10.507150650024414, + "learning_rate": 3.6724875594483924e-05, + "loss": 1.6434, + "step": 20510 + }, + { + "epoch": 3.3910349101425323, + "grad_norm": 10.478775978088379, + "learning_rate": 3.67156943755853e-05, + "loss": 1.5265, + "step": 20520 + }, + { + "epoch": 3.392687461268333, + "grad_norm": 24.88822364807129, + 
"learning_rate": 3.670651315668668e-05, + "loss": 1.6751, + "step": 20530 + }, + { + "epoch": 3.3943400123941334, + "grad_norm": 9.983197212219238, + "learning_rate": 3.669733193778806e-05, + "loss": 1.5644, + "step": 20540 + }, + { + "epoch": 3.395992563519934, + "grad_norm": 7.989924430847168, + "learning_rate": 3.668815071888944e-05, + "loss": 1.6401, + "step": 20550 + }, + { + "epoch": 3.3976451146457345, + "grad_norm": 5.701867580413818, + "learning_rate": 3.6678969499990824e-05, + "loss": 1.5425, + "step": 20560 + }, + { + "epoch": 3.3992976657715346, + "grad_norm": 14.750271797180176, + "learning_rate": 3.66697882810922e-05, + "loss": 1.585, + "step": 20570 + }, + { + "epoch": 3.400950216897335, + "grad_norm": 21.3060245513916, + "learning_rate": 3.666060706219358e-05, + "loss": 1.5328, + "step": 20580 + }, + { + "epoch": 3.4026027680231357, + "grad_norm": 14.30643367767334, + "learning_rate": 3.665142584329496e-05, + "loss": 1.5749, + "step": 20590 + }, + { + "epoch": 3.404255319148936, + "grad_norm": 10.9791898727417, + "learning_rate": 3.664224462439634e-05, + "loss": 1.6841, + "step": 20600 + }, + { + "epoch": 3.4059078702747367, + "grad_norm": 10.282464981079102, + "learning_rate": 3.663306340549772e-05, + "loss": 1.586, + "step": 20610 + }, + { + "epoch": 3.407560421400537, + "grad_norm": 10.054344177246094, + "learning_rate": 3.662388218659909e-05, + "loss": 1.5445, + "step": 20620 + }, + { + "epoch": 3.4092129725263374, + "grad_norm": 6.583102703094482, + "learning_rate": 3.661470096770047e-05, + "loss": 1.507, + "step": 20630 + }, + { + "epoch": 3.410865523652138, + "grad_norm": 7.482510566711426, + "learning_rate": 3.660551974880185e-05, + "loss": 1.5865, + "step": 20640 + }, + { + "epoch": 3.4125180747779384, + "grad_norm": 23.863901138305664, + "learning_rate": 3.659633852990323e-05, + "loss": 1.7097, + "step": 20650 + }, + { + "epoch": 3.414170625903739, + "grad_norm": 5.8396830558776855, + "learning_rate": 3.658715731100461e-05, + "loss": 1.4904, + "step": 20660 + }, + { + "epoch": 3.4158231770295395, + "grad_norm": 14.669455528259277, + "learning_rate": 3.657797609210599e-05, + "loss": 1.5476, + "step": 20670 + }, + { + "epoch": 3.4174757281553396, + "grad_norm": 9.562002182006836, + "learning_rate": 3.656879487320737e-05, + "loss": 1.474, + "step": 20680 + }, + { + "epoch": 3.41912827928114, + "grad_norm": 13.14786434173584, + "learning_rate": 3.655961365430875e-05, + "loss": 1.6046, + "step": 20690 + }, + { + "epoch": 3.4207808304069407, + "grad_norm": 27.712749481201172, + "learning_rate": 3.6550432435410127e-05, + "loss": 1.6887, + "step": 20700 + }, + { + "epoch": 3.4224333815327412, + "grad_norm": 8.249465942382812, + "learning_rate": 3.654125121651151e-05, + "loss": 1.5875, + "step": 20710 + }, + { + "epoch": 3.4240859326585418, + "grad_norm": 34.13796615600586, + "learning_rate": 3.6532069997612885e-05, + "loss": 1.5839, + "step": 20720 + }, + { + "epoch": 3.4257384837843423, + "grad_norm": 9.606948852539062, + "learning_rate": 3.652288877871427e-05, + "loss": 1.6466, + "step": 20730 + }, + { + "epoch": 3.4273910349101424, + "grad_norm": 11.323090553283691, + "learning_rate": 3.6513707559815644e-05, + "loss": 1.4478, + "step": 20740 + }, + { + "epoch": 3.429043586035943, + "grad_norm": 6.942873477935791, + "learning_rate": 3.650452634091702e-05, + "loss": 1.5668, + "step": 20750 + }, + { + "epoch": 3.4306961371617435, + "grad_norm": 6.9645867347717285, + "learning_rate": 3.6495345122018395e-05, + "loss": 1.3887, + "step": 20760 + }, + { + "epoch": 
3.432348688287544, + "grad_norm": 9.70297622680664, + "learning_rate": 3.648616390311978e-05, + "loss": 1.5917, + "step": 20770 + }, + { + "epoch": 3.4340012394133446, + "grad_norm": 19.935123443603516, + "learning_rate": 3.647698268422116e-05, + "loss": 1.6505, + "step": 20780 + }, + { + "epoch": 3.4356537905391447, + "grad_norm": 8.549393653869629, + "learning_rate": 3.6467801465322536e-05, + "loss": 1.7168, + "step": 20790 + }, + { + "epoch": 3.437306341664945, + "grad_norm": 7.79632568359375, + "learning_rate": 3.645862024642392e-05, + "loss": 1.5747, + "step": 20800 + }, + { + "epoch": 3.4389588927907457, + "grad_norm": 15.404546737670898, + "learning_rate": 3.6449439027525295e-05, + "loss": 1.54, + "step": 20810 + }, + { + "epoch": 3.4406114439165463, + "grad_norm": 7.910075664520264, + "learning_rate": 3.644025780862668e-05, + "loss": 1.4787, + "step": 20820 + }, + { + "epoch": 3.442263995042347, + "grad_norm": 38.832923889160156, + "learning_rate": 3.643107658972805e-05, + "loss": 1.3986, + "step": 20830 + }, + { + "epoch": 3.443916546168147, + "grad_norm": 15.27536392211914, + "learning_rate": 3.6421895370829436e-05, + "loss": 1.5966, + "step": 20840 + }, + { + "epoch": 3.4455690972939474, + "grad_norm": 5.51830530166626, + "learning_rate": 3.641271415193081e-05, + "loss": 1.4576, + "step": 20850 + }, + { + "epoch": 3.447221648419748, + "grad_norm": 9.22146987915039, + "learning_rate": 3.6403532933032194e-05, + "loss": 1.5793, + "step": 20860 + }, + { + "epoch": 3.4488741995455485, + "grad_norm": 18.014942169189453, + "learning_rate": 3.639435171413357e-05, + "loss": 1.4621, + "step": 20870 + }, + { + "epoch": 3.450526750671349, + "grad_norm": 11.17994499206543, + "learning_rate": 3.6385170495234946e-05, + "loss": 1.5456, + "step": 20880 + }, + { + "epoch": 3.452179301797149, + "grad_norm": 7.204450607299805, + "learning_rate": 3.637598927633633e-05, + "loss": 1.5712, + "step": 20890 + }, + { + "epoch": 3.4538318529229497, + "grad_norm": 9.417290687561035, + "learning_rate": 3.6366808057437705e-05, + "loss": 1.4765, + "step": 20900 + }, + { + "epoch": 3.4554844040487502, + "grad_norm": 8.76371955871582, + "learning_rate": 3.635762683853909e-05, + "loss": 1.6961, + "step": 20910 + }, + { + "epoch": 3.4571369551745508, + "grad_norm": 9.267240524291992, + "learning_rate": 3.634844561964046e-05, + "loss": 1.5875, + "step": 20920 + }, + { + "epoch": 3.4587895063003513, + "grad_norm": 11.603541374206543, + "learning_rate": 3.6339264400741846e-05, + "loss": 1.6531, + "step": 20930 + }, + { + "epoch": 3.4604420574261514, + "grad_norm": 8.296456336975098, + "learning_rate": 3.633008318184322e-05, + "loss": 1.6704, + "step": 20940 + }, + { + "epoch": 3.462094608551952, + "grad_norm": 8.473135948181152, + "learning_rate": 3.6320901962944604e-05, + "loss": 1.4977, + "step": 20950 + }, + { + "epoch": 3.4637471596777525, + "grad_norm": 18.611042022705078, + "learning_rate": 3.631172074404598e-05, + "loss": 1.475, + "step": 20960 + }, + { + "epoch": 3.465399710803553, + "grad_norm": 7.0088090896606445, + "learning_rate": 3.630253952514736e-05, + "loss": 1.4484, + "step": 20970 + }, + { + "epoch": 3.4670522619293536, + "grad_norm": 9.421944618225098, + "learning_rate": 3.629335830624874e-05, + "loss": 1.4781, + "step": 20980 + }, + { + "epoch": 3.4687048130551537, + "grad_norm": 35.80681610107422, + "learning_rate": 3.628417708735012e-05, + "loss": 1.5584, + "step": 20990 + }, + { + "epoch": 3.470357364180954, + "grad_norm": 7.813302040100098, + "learning_rate": 3.62749958684515e-05, + "loss": 
1.5762, + "step": 21000 + }, + { + "epoch": 3.4720099153067547, + "grad_norm": 21.3023681640625, + "learning_rate": 3.626581464955287e-05, + "loss": 1.6249, + "step": 21010 + }, + { + "epoch": 3.4736624664325553, + "grad_norm": 62.12176513671875, + "learning_rate": 3.6256633430654256e-05, + "loss": 1.5029, + "step": 21020 + }, + { + "epoch": 3.475315017558356, + "grad_norm": 8.91295337677002, + "learning_rate": 3.624745221175563e-05, + "loss": 1.5845, + "step": 21030 + }, + { + "epoch": 3.4769675686841564, + "grad_norm": 10.726168632507324, + "learning_rate": 3.6238270992857014e-05, + "loss": 1.6008, + "step": 21040 + }, + { + "epoch": 3.4786201198099564, + "grad_norm": 7.324353218078613, + "learning_rate": 3.622908977395839e-05, + "loss": 1.5174, + "step": 21050 + }, + { + "epoch": 3.480272670935757, + "grad_norm": 14.042418479919434, + "learning_rate": 3.621990855505977e-05, + "loss": 1.6201, + "step": 21060 + }, + { + "epoch": 3.4819252220615575, + "grad_norm": 22.140363693237305, + "learning_rate": 3.621072733616115e-05, + "loss": 1.6755, + "step": 21070 + }, + { + "epoch": 3.483577773187358, + "grad_norm": 10.497157096862793, + "learning_rate": 3.620154611726253e-05, + "loss": 1.4485, + "step": 21080 + }, + { + "epoch": 3.4852303243131586, + "grad_norm": 7.269160270690918, + "learning_rate": 3.619236489836391e-05, + "loss": 1.5914, + "step": 21090 + }, + { + "epoch": 3.486882875438959, + "grad_norm": 11.066585540771484, + "learning_rate": 3.618318367946529e-05, + "loss": 1.4845, + "step": 21100 + }, + { + "epoch": 3.4885354265647592, + "grad_norm": 16.51125144958496, + "learning_rate": 3.6174002460566666e-05, + "loss": 1.5517, + "step": 21110 + }, + { + "epoch": 3.4901879776905598, + "grad_norm": 35.069183349609375, + "learning_rate": 3.616482124166805e-05, + "loss": 1.5281, + "step": 21120 + }, + { + "epoch": 3.4918405288163603, + "grad_norm": 16.97264289855957, + "learning_rate": 3.6155640022769424e-05, + "loss": 1.469, + "step": 21130 + }, + { + "epoch": 3.493493079942161, + "grad_norm": 21.5349178314209, + "learning_rate": 3.61464588038708e-05, + "loss": 1.6392, + "step": 21140 + }, + { + "epoch": 3.4951456310679614, + "grad_norm": 8.533900260925293, + "learning_rate": 3.613727758497218e-05, + "loss": 1.5645, + "step": 21150 + }, + { + "epoch": 3.4967981821937615, + "grad_norm": 59.57920455932617, + "learning_rate": 3.612809636607356e-05, + "loss": 1.598, + "step": 21160 + }, + { + "epoch": 3.498450733319562, + "grad_norm": 34.87241744995117, + "learning_rate": 3.611891514717494e-05, + "loss": 1.6019, + "step": 21170 + }, + { + "epoch": 3.5001032844453626, + "grad_norm": 12.19272232055664, + "learning_rate": 3.610973392827632e-05, + "loss": 1.5174, + "step": 21180 + }, + { + "epoch": 3.501755835571163, + "grad_norm": 23.954174041748047, + "learning_rate": 3.61005527093777e-05, + "loss": 1.6589, + "step": 21190 + }, + { + "epoch": 3.5034083866969636, + "grad_norm": 11.947124481201172, + "learning_rate": 3.6091371490479075e-05, + "loss": 1.6736, + "step": 21200 + }, + { + "epoch": 3.5050609378227637, + "grad_norm": 30.422679901123047, + "learning_rate": 3.608219027158046e-05, + "loss": 1.5366, + "step": 21210 + }, + { + "epoch": 3.5067134889485643, + "grad_norm": 5.95852518081665, + "learning_rate": 3.6073009052681834e-05, + "loss": 1.6103, + "step": 21220 + }, + { + "epoch": 3.508366040074365, + "grad_norm": 7.431918144226074, + "learning_rate": 3.6063827833783216e-05, + "loss": 1.4234, + "step": 21230 + }, + { + "epoch": 3.5100185912001653, + "grad_norm": 19.810720443725586, + 
"learning_rate": 3.60546466148846e-05, + "loss": 1.5673, + "step": 21240 + }, + { + "epoch": 3.511671142325966, + "grad_norm": 9.25375747680664, + "learning_rate": 3.6045465395985975e-05, + "loss": 1.5157, + "step": 21250 + }, + { + "epoch": 3.513323693451766, + "grad_norm": 15.374961853027344, + "learning_rate": 3.603628417708735e-05, + "loss": 1.5093, + "step": 21260 + }, + { + "epoch": 3.5149762445775665, + "grad_norm": 15.200456619262695, + "learning_rate": 3.602710295818873e-05, + "loss": 1.6222, + "step": 21270 + }, + { + "epoch": 3.516628795703367, + "grad_norm": 10.129278182983398, + "learning_rate": 3.601792173929011e-05, + "loss": 1.5672, + "step": 21280 + }, + { + "epoch": 3.5182813468291676, + "grad_norm": 15.771454811096191, + "learning_rate": 3.6008740520391485e-05, + "loss": 1.6731, + "step": 21290 + }, + { + "epoch": 3.519933897954968, + "grad_norm": 10.804917335510254, + "learning_rate": 3.599955930149287e-05, + "loss": 1.6508, + "step": 21300 + }, + { + "epoch": 3.5215864490807682, + "grad_norm": 10.056406021118164, + "learning_rate": 3.5990378082594244e-05, + "loss": 1.527, + "step": 21310 + }, + { + "epoch": 3.5232390002065688, + "grad_norm": 14.683306694030762, + "learning_rate": 3.5981196863695626e-05, + "loss": 1.6713, + "step": 21320 + }, + { + "epoch": 3.5248915513323693, + "grad_norm": 12.720149040222168, + "learning_rate": 3.5972015644797e-05, + "loss": 1.5369, + "step": 21330 + }, + { + "epoch": 3.52654410245817, + "grad_norm": 20.362594604492188, + "learning_rate": 3.5962834425898385e-05, + "loss": 1.4948, + "step": 21340 + }, + { + "epoch": 3.5281966535839704, + "grad_norm": 9.18614387512207, + "learning_rate": 3.595365320699977e-05, + "loss": 1.5522, + "step": 21350 + }, + { + "epoch": 3.5298492047097705, + "grad_norm": 15.578404426574707, + "learning_rate": 3.594447198810114e-05, + "loss": 1.5016, + "step": 21360 + }, + { + "epoch": 3.531501755835571, + "grad_norm": 28.72294807434082, + "learning_rate": 3.5935290769202526e-05, + "loss": 1.5429, + "step": 21370 + }, + { + "epoch": 3.5331543069613716, + "grad_norm": 8.129132270812988, + "learning_rate": 3.59261095503039e-05, + "loss": 1.6308, + "step": 21380 + }, + { + "epoch": 3.534806858087172, + "grad_norm": 7.975843906402588, + "learning_rate": 3.591692833140528e-05, + "loss": 1.5086, + "step": 21390 + }, + { + "epoch": 3.5364594092129726, + "grad_norm": 69.10977935791016, + "learning_rate": 3.5907747112506654e-05, + "loss": 1.6113, + "step": 21400 + }, + { + "epoch": 3.5381119603387727, + "grad_norm": 28.328746795654297, + "learning_rate": 3.5898565893608036e-05, + "loss": 1.5441, + "step": 21410 + }, + { + "epoch": 3.5397645114645733, + "grad_norm": 15.218031883239746, + "learning_rate": 3.588938467470941e-05, + "loss": 1.6617, + "step": 21420 + }, + { + "epoch": 3.541417062590374, + "grad_norm": 7.890848636627197, + "learning_rate": 3.5880203455810795e-05, + "loss": 1.4507, + "step": 21430 + }, + { + "epoch": 3.5430696137161743, + "grad_norm": 23.32213592529297, + "learning_rate": 3.587102223691217e-05, + "loss": 1.4829, + "step": 21440 + }, + { + "epoch": 3.544722164841975, + "grad_norm": 9.921598434448242, + "learning_rate": 3.586184101801355e-05, + "loss": 1.5639, + "step": 21450 + }, + { + "epoch": 3.546374715967775, + "grad_norm": 10.007643699645996, + "learning_rate": 3.5852659799114936e-05, + "loss": 1.6078, + "step": 21460 + }, + { + "epoch": 3.548027267093576, + "grad_norm": 6.750604629516602, + "learning_rate": 3.584347858021631e-05, + "loss": 1.5142, + "step": 21470 + }, + { + "epoch": 
3.549679818219376, + "grad_norm": 12.836925506591797, + "learning_rate": 3.5834297361317694e-05, + "loss": 1.5415, + "step": 21480 + }, + { + "epoch": 3.5513323693451766, + "grad_norm": 9.204744338989258, + "learning_rate": 3.582511614241907e-05, + "loss": 1.5137, + "step": 21490 + }, + { + "epoch": 3.552984920470977, + "grad_norm": 22.862333297729492, + "learning_rate": 3.581593492352045e-05, + "loss": 1.5273, + "step": 21500 + }, + { + "epoch": 3.5546374715967777, + "grad_norm": 8.714118003845215, + "learning_rate": 3.580675370462183e-05, + "loss": 1.6176, + "step": 21510 + }, + { + "epoch": 3.556290022722578, + "grad_norm": 32.12099838256836, + "learning_rate": 3.5797572485723204e-05, + "loss": 1.5262, + "step": 21520 + }, + { + "epoch": 3.5579425738483783, + "grad_norm": 33.427494049072266, + "learning_rate": 3.578839126682458e-05, + "loss": 1.6296, + "step": 21530 + }, + { + "epoch": 3.559595124974179, + "grad_norm": 7.42344856262207, + "learning_rate": 3.577921004792596e-05, + "loss": 1.6682, + "step": 21540 + }, + { + "epoch": 3.5612476760999794, + "grad_norm": 7.865045547485352, + "learning_rate": 3.577002882902734e-05, + "loss": 1.6207, + "step": 21550 + }, + { + "epoch": 3.56290022722578, + "grad_norm": 8.04334831237793, + "learning_rate": 3.576084761012872e-05, + "loss": 1.5475, + "step": 21560 + }, + { + "epoch": 3.5645527783515805, + "grad_norm": 9.874261856079102, + "learning_rate": 3.5751666391230104e-05, + "loss": 1.5455, + "step": 21570 + }, + { + "epoch": 3.5662053294773806, + "grad_norm": 18.90970802307129, + "learning_rate": 3.574248517233148e-05, + "loss": 1.6134, + "step": 21580 + }, + { + "epoch": 3.567857880603181, + "grad_norm": 7.690478324890137, + "learning_rate": 3.573330395343286e-05, + "loss": 1.6368, + "step": 21590 + }, + { + "epoch": 3.5695104317289816, + "grad_norm": 12.998335838317871, + "learning_rate": 3.572412273453424e-05, + "loss": 1.6065, + "step": 21600 + }, + { + "epoch": 3.571162982854782, + "grad_norm": 6.6522979736328125, + "learning_rate": 3.571494151563562e-05, + "loss": 1.3663, + "step": 21610 + }, + { + "epoch": 3.5728155339805827, + "grad_norm": 9.580485343933105, + "learning_rate": 3.5705760296737e-05, + "loss": 1.5932, + "step": 21620 + }, + { + "epoch": 3.574468085106383, + "grad_norm": 14.493224143981934, + "learning_rate": 3.569657907783838e-05, + "loss": 1.548, + "step": 21630 + }, + { + "epoch": 3.5761206362321833, + "grad_norm": 9.22100830078125, + "learning_rate": 3.5687397858939755e-05, + "loss": 1.4881, + "step": 21640 + }, + { + "epoch": 3.577773187357984, + "grad_norm": 32.90215301513672, + "learning_rate": 3.567821664004113e-05, + "loss": 1.5178, + "step": 21650 + }, + { + "epoch": 3.5794257384837844, + "grad_norm": 10.091255187988281, + "learning_rate": 3.566903542114251e-05, + "loss": 1.5195, + "step": 21660 + }, + { + "epoch": 3.581078289609585, + "grad_norm": 6.877391815185547, + "learning_rate": 3.565985420224389e-05, + "loss": 1.5511, + "step": 21670 + }, + { + "epoch": 3.582730840735385, + "grad_norm": 15.701227188110352, + "learning_rate": 3.5650672983345266e-05, + "loss": 1.6004, + "step": 21680 + }, + { + "epoch": 3.5843833918611856, + "grad_norm": 5.971979141235352, + "learning_rate": 3.564149176444665e-05, + "loss": 1.4233, + "step": 21690 + }, + { + "epoch": 3.586035942986986, + "grad_norm": 28.64961051940918, + "learning_rate": 3.563231054554803e-05, + "loss": 1.5402, + "step": 21700 + }, + { + "epoch": 3.5876884941127867, + "grad_norm": 18.97611427307129, + "learning_rate": 3.562312932664941e-05, + "loss": 
1.6188, + "step": 21710 + }, + { + "epoch": 3.589341045238587, + "grad_norm": 11.403149604797363, + "learning_rate": 3.561394810775079e-05, + "loss": 1.6067, + "step": 21720 + }, + { + "epoch": 3.5909935963643873, + "grad_norm": 10.633176803588867, + "learning_rate": 3.5604766888852165e-05, + "loss": 1.5884, + "step": 21730 + }, + { + "epoch": 3.592646147490188, + "grad_norm": 6.8395490646362305, + "learning_rate": 3.559558566995355e-05, + "loss": 1.5274, + "step": 21740 + }, + { + "epoch": 3.5942986986159884, + "grad_norm": 7.640970706939697, + "learning_rate": 3.5586404451054924e-05, + "loss": 1.4888, + "step": 21750 + }, + { + "epoch": 3.595951249741789, + "grad_norm": 27.678407669067383, + "learning_rate": 3.5577223232156306e-05, + "loss": 1.4142, + "step": 21760 + }, + { + "epoch": 3.5976038008675895, + "grad_norm": 15.568962097167969, + "learning_rate": 3.556804201325768e-05, + "loss": 1.5022, + "step": 21770 + }, + { + "epoch": 3.5992563519933896, + "grad_norm": 7.94047737121582, + "learning_rate": 3.555886079435906e-05, + "loss": 1.4938, + "step": 21780 + }, + { + "epoch": 3.60090890311919, + "grad_norm": 8.355306625366211, + "learning_rate": 3.5549679575460434e-05, + "loss": 1.6414, + "step": 21790 + }, + { + "epoch": 3.6025614542449906, + "grad_norm": 9.412763595581055, + "learning_rate": 3.554049835656182e-05, + "loss": 1.5618, + "step": 21800 + }, + { + "epoch": 3.604214005370791, + "grad_norm": 8.5521240234375, + "learning_rate": 3.55313171376632e-05, + "loss": 1.5158, + "step": 21810 + }, + { + "epoch": 3.6058665564965917, + "grad_norm": 14.980252265930176, + "learning_rate": 3.5522135918764575e-05, + "loss": 1.5169, + "step": 21820 + }, + { + "epoch": 3.607519107622392, + "grad_norm": 5.556758403778076, + "learning_rate": 3.551295469986596e-05, + "loss": 1.5842, + "step": 21830 + }, + { + "epoch": 3.609171658748193, + "grad_norm": 18.579702377319336, + "learning_rate": 3.5503773480967334e-05, + "loss": 1.5249, + "step": 21840 + }, + { + "epoch": 3.610824209873993, + "grad_norm": 26.00187873840332, + "learning_rate": 3.5494592262068716e-05, + "loss": 1.5037, + "step": 21850 + }, + { + "epoch": 3.6124767609997934, + "grad_norm": 34.09498977661133, + "learning_rate": 3.548541104317009e-05, + "loss": 1.4003, + "step": 21860 + }, + { + "epoch": 3.614129312125594, + "grad_norm": 15.18822956085205, + "learning_rate": 3.5476229824271475e-05, + "loss": 1.6049, + "step": 21870 + }, + { + "epoch": 3.6157818632513945, + "grad_norm": 8.645750999450684, + "learning_rate": 3.546704860537285e-05, + "loss": 1.5309, + "step": 21880 + }, + { + "epoch": 3.617434414377195, + "grad_norm": 9.381811141967773, + "learning_rate": 3.545786738647423e-05, + "loss": 1.3985, + "step": 21890 + }, + { + "epoch": 3.619086965502995, + "grad_norm": 10.318458557128906, + "learning_rate": 3.544868616757561e-05, + "loss": 1.5741, + "step": 21900 + }, + { + "epoch": 3.6207395166287957, + "grad_norm": 7.140669345855713, + "learning_rate": 3.5439504948676985e-05, + "loss": 1.5998, + "step": 21910 + }, + { + "epoch": 3.622392067754596, + "grad_norm": 11.092816352844238, + "learning_rate": 3.543032372977837e-05, + "loss": 1.6585, + "step": 21920 + }, + { + "epoch": 3.6240446188803968, + "grad_norm": 14.348875045776367, + "learning_rate": 3.5421142510879743e-05, + "loss": 1.6889, + "step": 21930 + }, + { + "epoch": 3.6256971700061973, + "grad_norm": 20.467607498168945, + "learning_rate": 3.5411961291981126e-05, + "loss": 1.5698, + "step": 21940 + }, + { + "epoch": 3.6273497211319974, + "grad_norm": 6.864068508148193, + 
"learning_rate": 3.54027800730825e-05, + "loss": 1.3678, + "step": 21950 + }, + { + "epoch": 3.629002272257798, + "grad_norm": 10.878697395324707, + "learning_rate": 3.5393598854183885e-05, + "loss": 1.5885, + "step": 21960 + }, + { + "epoch": 3.6306548233835985, + "grad_norm": 11.186368942260742, + "learning_rate": 3.538441763528526e-05, + "loss": 1.4722, + "step": 21970 + }, + { + "epoch": 3.632307374509399, + "grad_norm": 11.424324989318848, + "learning_rate": 3.537523641638664e-05, + "loss": 1.6041, + "step": 21980 + }, + { + "epoch": 3.6339599256351995, + "grad_norm": 11.108640670776367, + "learning_rate": 3.536605519748802e-05, + "loss": 1.5555, + "step": 21990 + }, + { + "epoch": 3.6356124767609996, + "grad_norm": 6.946378707885742, + "learning_rate": 3.53568739785894e-05, + "loss": 1.5627, + "step": 22000 + }, + { + "epoch": 3.6372650278868, + "grad_norm": 17.414968490600586, + "learning_rate": 3.534769275969078e-05, + "loss": 1.5827, + "step": 22010 + }, + { + "epoch": 3.6389175790126007, + "grad_norm": 8.051285743713379, + "learning_rate": 3.533851154079216e-05, + "loss": 1.561, + "step": 22020 + }, + { + "epoch": 3.6405701301384013, + "grad_norm": 6.667621612548828, + "learning_rate": 3.5329330321893536e-05, + "loss": 1.4156, + "step": 22030 + }, + { + "epoch": 3.642222681264202, + "grad_norm": 19.320453643798828, + "learning_rate": 3.532014910299491e-05, + "loss": 1.5852, + "step": 22040 + }, + { + "epoch": 3.643875232390002, + "grad_norm": 8.498446464538574, + "learning_rate": 3.5310967884096294e-05, + "loss": 1.5934, + "step": 22050 + }, + { + "epoch": 3.6455277835158024, + "grad_norm": 8.024924278259277, + "learning_rate": 3.530178666519767e-05, + "loss": 1.5443, + "step": 22060 + }, + { + "epoch": 3.647180334641603, + "grad_norm": 14.608724594116211, + "learning_rate": 3.529260544629905e-05, + "loss": 1.6092, + "step": 22070 + }, + { + "epoch": 3.6488328857674035, + "grad_norm": 8.343809127807617, + "learning_rate": 3.528342422740043e-05, + "loss": 1.6338, + "step": 22080 + }, + { + "epoch": 3.650485436893204, + "grad_norm": 8.434662818908691, + "learning_rate": 3.527424300850181e-05, + "loss": 1.5697, + "step": 22090 + }, + { + "epoch": 3.652137988019004, + "grad_norm": 7.201797008514404, + "learning_rate": 3.526506178960319e-05, + "loss": 1.4681, + "step": 22100 + }, + { + "epoch": 3.6537905391448047, + "grad_norm": 7.6197896003723145, + "learning_rate": 3.525588057070457e-05, + "loss": 1.6327, + "step": 22110 + }, + { + "epoch": 3.655443090270605, + "grad_norm": 6.695487022399902, + "learning_rate": 3.5246699351805946e-05, + "loss": 1.5547, + "step": 22120 + }, + { + "epoch": 3.6570956413964057, + "grad_norm": 7.320518493652344, + "learning_rate": 3.523751813290733e-05, + "loss": 1.376, + "step": 22130 + }, + { + "epoch": 3.6587481925222063, + "grad_norm": 10.218902587890625, + "learning_rate": 3.5228336914008704e-05, + "loss": 1.6282, + "step": 22140 + }, + { + "epoch": 3.6604007436480064, + "grad_norm": 26.678739547729492, + "learning_rate": 3.521915569511009e-05, + "loss": 1.5529, + "step": 22150 + }, + { + "epoch": 3.662053294773807, + "grad_norm": 7.635467529296875, + "learning_rate": 3.520997447621146e-05, + "loss": 1.5517, + "step": 22160 + }, + { + "epoch": 3.6637058458996075, + "grad_norm": 21.92401123046875, + "learning_rate": 3.520079325731284e-05, + "loss": 1.5591, + "step": 22170 + }, + { + "epoch": 3.665358397025408, + "grad_norm": 5.554649829864502, + "learning_rate": 3.519161203841422e-05, + "loss": 1.5443, + "step": 22180 + }, + { + "epoch": 
3.6670109481512085, + "grad_norm": 9.22087287902832, + "learning_rate": 3.51824308195156e-05, + "loss": 1.3653, + "step": 22190 + }, + { + "epoch": 3.6686634992770086, + "grad_norm": 9.278251647949219, + "learning_rate": 3.517324960061698e-05, + "loss": 1.5557, + "step": 22200 + }, + { + "epoch": 3.670316050402809, + "grad_norm": 9.113847732543945, + "learning_rate": 3.5164068381718356e-05, + "loss": 1.5877, + "step": 22210 + }, + { + "epoch": 3.6719686015286097, + "grad_norm": 7.552914619445801, + "learning_rate": 3.515488716281974e-05, + "loss": 1.5065, + "step": 22220 + }, + { + "epoch": 3.6736211526544102, + "grad_norm": 10.888140678405762, + "learning_rate": 3.5145705943921114e-05, + "loss": 1.4641, + "step": 22230 + }, + { + "epoch": 3.675273703780211, + "grad_norm": 11.267471313476562, + "learning_rate": 3.51365247250225e-05, + "loss": 1.6272, + "step": 22240 + }, + { + "epoch": 3.676926254906011, + "grad_norm": 7.349489688873291, + "learning_rate": 3.512734350612387e-05, + "loss": 1.5629, + "step": 22250 + }, + { + "epoch": 3.678578806031812, + "grad_norm": 9.634526252746582, + "learning_rate": 3.5118162287225255e-05, + "loss": 1.5289, + "step": 22260 + }, + { + "epoch": 3.680231357157612, + "grad_norm": 9.006583213806152, + "learning_rate": 3.510898106832664e-05, + "loss": 1.5232, + "step": 22270 + }, + { + "epoch": 3.6818839082834125, + "grad_norm": 9.722323417663574, + "learning_rate": 3.5099799849428014e-05, + "loss": 1.4823, + "step": 22280 + }, + { + "epoch": 3.683536459409213, + "grad_norm": 7.501804828643799, + "learning_rate": 3.509061863052939e-05, + "loss": 1.609, + "step": 22290 + }, + { + "epoch": 3.6851890105350136, + "grad_norm": 62.3031120300293, + "learning_rate": 3.5081437411630765e-05, + "loss": 1.554, + "step": 22300 + }, + { + "epoch": 3.686841561660814, + "grad_norm": 7.480673313140869, + "learning_rate": 3.507225619273215e-05, + "loss": 1.4209, + "step": 22310 + }, + { + "epoch": 3.688494112786614, + "grad_norm": 8.595160484313965, + "learning_rate": 3.5063074973833524e-05, + "loss": 1.7512, + "step": 22320 + }, + { + "epoch": 3.6901466639124147, + "grad_norm": 5.803942680358887, + "learning_rate": 3.5053893754934907e-05, + "loss": 1.5432, + "step": 22330 + }, + { + "epoch": 3.6917992150382153, + "grad_norm": 13.61528491973877, + "learning_rate": 3.504471253603628e-05, + "loss": 1.617, + "step": 22340 + }, + { + "epoch": 3.693451766164016, + "grad_norm": 60.33266067504883, + "learning_rate": 3.5035531317137665e-05, + "loss": 1.5357, + "step": 22350 + }, + { + "epoch": 3.6951043172898164, + "grad_norm": 11.57695484161377, + "learning_rate": 3.502635009823904e-05, + "loss": 1.3428, + "step": 22360 + }, + { + "epoch": 3.6967568684156165, + "grad_norm": 9.341504096984863, + "learning_rate": 3.5017168879340424e-05, + "loss": 1.5852, + "step": 22370 + }, + { + "epoch": 3.698409419541417, + "grad_norm": 9.287238121032715, + "learning_rate": 3.5007987660441806e-05, + "loss": 1.4823, + "step": 22380 + }, + { + "epoch": 3.7000619706672175, + "grad_norm": 11.68283462524414, + "learning_rate": 3.499880644154318e-05, + "loss": 1.5553, + "step": 22390 + }, + { + "epoch": 3.701714521793018, + "grad_norm": 9.870018005371094, + "learning_rate": 3.4989625222644565e-05, + "loss": 1.5829, + "step": 22400 + }, + { + "epoch": 3.7033670729188186, + "grad_norm": 41.26398849487305, + "learning_rate": 3.498044400374594e-05, + "loss": 1.4629, + "step": 22410 + }, + { + "epoch": 3.7050196240446187, + "grad_norm": 13.389437675476074, + "learning_rate": 3.4971262784847316e-05, + "loss": 
1.6009, + "step": 22420 + }, + { + "epoch": 3.7066721751704192, + "grad_norm": 8.6530179977417, + "learning_rate": 3.496208156594869e-05, + "loss": 1.6486, + "step": 22430 + }, + { + "epoch": 3.70832472629622, + "grad_norm": 8.932863235473633, + "learning_rate": 3.4952900347050075e-05, + "loss": 1.6876, + "step": 22440 + }, + { + "epoch": 3.7099772774220203, + "grad_norm": 14.32821273803711, + "learning_rate": 3.494371912815145e-05, + "loss": 1.4702, + "step": 22450 + }, + { + "epoch": 3.711629828547821, + "grad_norm": 10.528609275817871, + "learning_rate": 3.4934537909252833e-05, + "loss": 1.4973, + "step": 22460 + }, + { + "epoch": 3.713282379673621, + "grad_norm": 13.962540626525879, + "learning_rate": 3.492535669035421e-05, + "loss": 1.6201, + "step": 22470 + }, + { + "epoch": 3.7149349307994215, + "grad_norm": 15.784793853759766, + "learning_rate": 3.491617547145559e-05, + "loss": 1.6017, + "step": 22480 + }, + { + "epoch": 3.716587481925222, + "grad_norm": 9.065518379211426, + "learning_rate": 3.4906994252556975e-05, + "loss": 1.5692, + "step": 22490 + }, + { + "epoch": 3.7182400330510226, + "grad_norm": 10.009034156799316, + "learning_rate": 3.489781303365835e-05, + "loss": 1.5791, + "step": 22500 + }, + { + "epoch": 3.719892584176823, + "grad_norm": 8.641139030456543, + "learning_rate": 3.488863181475973e-05, + "loss": 1.5955, + "step": 22510 + }, + { + "epoch": 3.721545135302623, + "grad_norm": 10.301948547363281, + "learning_rate": 3.487945059586111e-05, + "loss": 1.5108, + "step": 22520 + }, + { + "epoch": 3.7231976864284237, + "grad_norm": 28.2911319732666, + "learning_rate": 3.487026937696249e-05, + "loss": 1.5386, + "step": 22530 + }, + { + "epoch": 3.7248502375542243, + "grad_norm": 129.94375610351562, + "learning_rate": 3.486108815806387e-05, + "loss": 1.7079, + "step": 22540 + }, + { + "epoch": 3.726502788680025, + "grad_norm": 23.256444931030273, + "learning_rate": 3.485190693916524e-05, + "loss": 1.5878, + "step": 22550 + }, + { + "epoch": 3.7281553398058254, + "grad_norm": 7.868270397186279, + "learning_rate": 3.484272572026662e-05, + "loss": 1.536, + "step": 22560 + }, + { + "epoch": 3.7298078909316255, + "grad_norm": 14.722084045410156, + "learning_rate": 3.4833544501368e-05, + "loss": 1.4831, + "step": 22570 + }, + { + "epoch": 3.731460442057426, + "grad_norm": 19.196247100830078, + "learning_rate": 3.482436328246938e-05, + "loss": 1.4927, + "step": 22580 + }, + { + "epoch": 3.7331129931832265, + "grad_norm": 7.337265968322754, + "learning_rate": 3.481518206357076e-05, + "loss": 1.6621, + "step": 22590 + }, + { + "epoch": 3.734765544309027, + "grad_norm": 21.33555793762207, + "learning_rate": 3.480600084467214e-05, + "loss": 1.4837, + "step": 22600 + }, + { + "epoch": 3.7364180954348276, + "grad_norm": 33.45065689086914, + "learning_rate": 3.479681962577352e-05, + "loss": 1.5462, + "step": 22610 + }, + { + "epoch": 3.7380706465606277, + "grad_norm": 13.384827613830566, + "learning_rate": 3.47876384068749e-05, + "loss": 1.5441, + "step": 22620 + }, + { + "epoch": 3.7397231976864287, + "grad_norm": 75.69512939453125, + "learning_rate": 3.477845718797628e-05, + "loss": 1.4274, + "step": 22630 + }, + { + "epoch": 3.741375748812229, + "grad_norm": 14.372223854064941, + "learning_rate": 3.476927596907766e-05, + "loss": 1.6275, + "step": 22640 + }, + { + "epoch": 3.7430282999380293, + "grad_norm": 8.416259765625, + "learning_rate": 3.4760094750179036e-05, + "loss": 1.5561, + "step": 22650 + }, + { + "epoch": 3.74468085106383, + "grad_norm": 11.804091453552246, + 
"learning_rate": 3.475091353128042e-05, + "loss": 1.6549, + "step": 22660 + }, + { + "epoch": 3.7463334021896304, + "grad_norm": 11.802560806274414, + "learning_rate": 3.4741732312381794e-05, + "loss": 1.5372, + "step": 22670 + }, + { + "epoch": 3.747985953315431, + "grad_norm": 8.755041122436523, + "learning_rate": 3.473255109348317e-05, + "loss": 1.6324, + "step": 22680 + }, + { + "epoch": 3.749638504441231, + "grad_norm": 6.660466194152832, + "learning_rate": 3.4723369874584546e-05, + "loss": 1.537, + "step": 22690 + }, + { + "epoch": 3.7512910555670316, + "grad_norm": 19.879173278808594, + "learning_rate": 3.471418865568593e-05, + "loss": 1.474, + "step": 22700 + }, + { + "epoch": 3.752943606692832, + "grad_norm": 4.748521327972412, + "learning_rate": 3.4705007436787304e-05, + "loss": 1.5009, + "step": 22710 + }, + { + "epoch": 3.7545961578186327, + "grad_norm": 30.82050895690918, + "learning_rate": 3.469582621788869e-05, + "loss": 1.5368, + "step": 22720 + }, + { + "epoch": 3.756248708944433, + "grad_norm": 28.105743408203125, + "learning_rate": 3.468664499899007e-05, + "loss": 1.359, + "step": 22730 + }, + { + "epoch": 3.7579012600702333, + "grad_norm": 20.436500549316406, + "learning_rate": 3.4677463780091446e-05, + "loss": 1.4738, + "step": 22740 + }, + { + "epoch": 3.759553811196034, + "grad_norm": 18.063720703125, + "learning_rate": 3.466828256119283e-05, + "loss": 1.6101, + "step": 22750 + }, + { + "epoch": 3.7612063623218344, + "grad_norm": 11.609806060791016, + "learning_rate": 3.4659101342294204e-05, + "loss": 1.4377, + "step": 22760 + }, + { + "epoch": 3.762858913447635, + "grad_norm": 7.862545490264893, + "learning_rate": 3.464992012339559e-05, + "loss": 1.5208, + "step": 22770 + }, + { + "epoch": 3.7645114645734354, + "grad_norm": 13.573543548583984, + "learning_rate": 3.464073890449696e-05, + "loss": 1.6314, + "step": 22780 + }, + { + "epoch": 3.7661640156992355, + "grad_norm": 6.686908721923828, + "learning_rate": 3.4631557685598345e-05, + "loss": 1.5347, + "step": 22790 + }, + { + "epoch": 3.767816566825036, + "grad_norm": 16.876447677612305, + "learning_rate": 3.462237646669972e-05, + "loss": 1.3998, + "step": 22800 + }, + { + "epoch": 3.7694691179508366, + "grad_norm": 5.551175117492676, + "learning_rate": 3.46131952478011e-05, + "loss": 1.6144, + "step": 22810 + }, + { + "epoch": 3.771121669076637, + "grad_norm": 12.1668701171875, + "learning_rate": 3.460401402890247e-05, + "loss": 1.5202, + "step": 22820 + }, + { + "epoch": 3.7727742202024377, + "grad_norm": 7.639707565307617, + "learning_rate": 3.4594832810003855e-05, + "loss": 1.5545, + "step": 22830 + }, + { + "epoch": 3.774426771328238, + "grad_norm": 27.100339889526367, + "learning_rate": 3.458565159110524e-05, + "loss": 1.5098, + "step": 22840 + }, + { + "epoch": 3.7760793224540383, + "grad_norm": 6.334820747375488, + "learning_rate": 3.4576470372206614e-05, + "loss": 1.4167, + "step": 22850 + }, + { + "epoch": 3.777731873579839, + "grad_norm": 56.109954833984375, + "learning_rate": 3.4567289153307997e-05, + "loss": 1.5223, + "step": 22860 + }, + { + "epoch": 3.7793844247056394, + "grad_norm": 9.93230152130127, + "learning_rate": 3.455810793440937e-05, + "loss": 1.5619, + "step": 22870 + }, + { + "epoch": 3.78103697583144, + "grad_norm": 9.03853702545166, + "learning_rate": 3.4548926715510755e-05, + "loss": 1.468, + "step": 22880 + }, + { + "epoch": 3.78268952695724, + "grad_norm": 11.272610664367676, + "learning_rate": 3.453974549661213e-05, + "loss": 1.4569, + "step": 22890 + }, + { + "epoch": 
3.7843420780830406, + "grad_norm": 11.486401557922363, + "learning_rate": 3.4530564277713514e-05, + "loss": 1.5401, + "step": 22900 + }, + { + "epoch": 3.785994629208841, + "grad_norm": 12.329487800598145, + "learning_rate": 3.452138305881489e-05, + "loss": 1.5879, + "step": 22910 + }, + { + "epoch": 3.7876471803346417, + "grad_norm": 13.06454849243164, + "learning_rate": 3.451220183991627e-05, + "loss": 1.5854, + "step": 22920 + }, + { + "epoch": 3.789299731460442, + "grad_norm": 17.574142456054688, + "learning_rate": 3.450302062101765e-05, + "loss": 1.5392, + "step": 22930 + }, + { + "epoch": 3.7909522825862423, + "grad_norm": 17.874441146850586, + "learning_rate": 3.4493839402119024e-05, + "loss": 1.542, + "step": 22940 + }, + { + "epoch": 3.792604833712043, + "grad_norm": 8.613436698913574, + "learning_rate": 3.4484658183220406e-05, + "loss": 1.5781, + "step": 22950 + }, + { + "epoch": 3.7942573848378434, + "grad_norm": 9.75782585144043, + "learning_rate": 3.447547696432178e-05, + "loss": 1.5628, + "step": 22960 + }, + { + "epoch": 3.795909935963644, + "grad_norm": 13.988187789916992, + "learning_rate": 3.4466295745423165e-05, + "loss": 1.5528, + "step": 22970 + }, + { + "epoch": 3.7975624870894444, + "grad_norm": 15.687790870666504, + "learning_rate": 3.445711452652454e-05, + "loss": 1.555, + "step": 22980 + }, + { + "epoch": 3.7992150382152445, + "grad_norm": 10.992048263549805, + "learning_rate": 3.4447933307625923e-05, + "loss": 1.7061, + "step": 22990 + }, + { + "epoch": 3.8008675893410455, + "grad_norm": 10.867881774902344, + "learning_rate": 3.44387520887273e-05, + "loss": 1.6758, + "step": 23000 + }, + { + "epoch": 3.8025201404668456, + "grad_norm": 10.184456825256348, + "learning_rate": 3.442957086982868e-05, + "loss": 1.5848, + "step": 23010 + }, + { + "epoch": 3.804172691592646, + "grad_norm": 17.802675247192383, + "learning_rate": 3.442038965093006e-05, + "loss": 1.592, + "step": 23020 + }, + { + "epoch": 3.8058252427184467, + "grad_norm": 12.011911392211914, + "learning_rate": 3.441120843203144e-05, + "loss": 1.4794, + "step": 23030 + }, + { + "epoch": 3.807477793844247, + "grad_norm": 8.725467681884766, + "learning_rate": 3.4402027213132816e-05, + "loss": 1.5482, + "step": 23040 + }, + { + "epoch": 3.8091303449700478, + "grad_norm": 10.62525749206543, + "learning_rate": 3.43928459942342e-05, + "loss": 1.4878, + "step": 23050 + }, + { + "epoch": 3.810782896095848, + "grad_norm": 10.504483222961426, + "learning_rate": 3.4383664775335575e-05, + "loss": 1.4817, + "step": 23060 + }, + { + "epoch": 3.8124354472216484, + "grad_norm": 12.407063484191895, + "learning_rate": 3.437448355643695e-05, + "loss": 1.5458, + "step": 23070 + }, + { + "epoch": 3.814087998347449, + "grad_norm": 17.53013038635254, + "learning_rate": 3.436530233753833e-05, + "loss": 1.5761, + "step": 23080 + }, + { + "epoch": 3.8157405494732495, + "grad_norm": 16.574785232543945, + "learning_rate": 3.435612111863971e-05, + "loss": 1.5797, + "step": 23090 + }, + { + "epoch": 3.81739310059905, + "grad_norm": 11.763534545898438, + "learning_rate": 3.434693989974109e-05, + "loss": 1.5059, + "step": 23100 + }, + { + "epoch": 3.81904565172485, + "grad_norm": 11.488256454467773, + "learning_rate": 3.433775868084247e-05, + "loss": 1.6877, + "step": 23110 + }, + { + "epoch": 3.8206982028506506, + "grad_norm": 8.004593849182129, + "learning_rate": 3.432857746194385e-05, + "loss": 1.491, + "step": 23120 + }, + { + "epoch": 3.822350753976451, + "grad_norm": 8.863856315612793, + "learning_rate": 3.4319396243045226e-05, + 
"loss": 1.5296, + "step": 23130 + }, + { + "epoch": 3.8240033051022517, + "grad_norm": 9.865023612976074, + "learning_rate": 3.431021502414661e-05, + "loss": 1.5924, + "step": 23140 + }, + { + "epoch": 3.8256558562280523, + "grad_norm": 5.761257171630859, + "learning_rate": 3.4301033805247985e-05, + "loss": 1.4003, + "step": 23150 + }, + { + "epoch": 3.8273084073538524, + "grad_norm": 14.028019905090332, + "learning_rate": 3.429185258634937e-05, + "loss": 1.5787, + "step": 23160 + }, + { + "epoch": 3.828960958479653, + "grad_norm": 11.172537803649902, + "learning_rate": 3.428267136745075e-05, + "loss": 1.5284, + "step": 23170 + }, + { + "epoch": 3.8306135096054534, + "grad_norm": 8.914949417114258, + "learning_rate": 3.4273490148552126e-05, + "loss": 1.5841, + "step": 23180 + }, + { + "epoch": 3.832266060731254, + "grad_norm": 11.068294525146484, + "learning_rate": 3.42643089296535e-05, + "loss": 1.5591, + "step": 23190 + }, + { + "epoch": 3.8339186118570545, + "grad_norm": 7.558146953582764, + "learning_rate": 3.425512771075488e-05, + "loss": 1.6917, + "step": 23200 + }, + { + "epoch": 3.8355711629828546, + "grad_norm": 11.731191635131836, + "learning_rate": 3.424594649185626e-05, + "loss": 1.5366, + "step": 23210 + }, + { + "epoch": 3.837223714108655, + "grad_norm": 19.394287109375, + "learning_rate": 3.4236765272957636e-05, + "loss": 1.5022, + "step": 23220 + }, + { + "epoch": 3.8388762652344557, + "grad_norm": 27.054391860961914, + "learning_rate": 3.422758405405902e-05, + "loss": 1.538, + "step": 23230 + }, + { + "epoch": 3.8405288163602562, + "grad_norm": 15.95034408569336, + "learning_rate": 3.4218402835160394e-05, + "loss": 1.6006, + "step": 23240 + }, + { + "epoch": 3.8421813674860568, + "grad_norm": 8.388917922973633, + "learning_rate": 3.420922161626178e-05, + "loss": 1.4591, + "step": 23250 + }, + { + "epoch": 3.843833918611857, + "grad_norm": 11.338773727416992, + "learning_rate": 3.420004039736315e-05, + "loss": 1.6336, + "step": 23260 + }, + { + "epoch": 3.8454864697376574, + "grad_norm": 12.91083812713623, + "learning_rate": 3.4190859178464536e-05, + "loss": 1.6015, + "step": 23270 + }, + { + "epoch": 3.847139020863458, + "grad_norm": 10.893755912780762, + "learning_rate": 3.418167795956591e-05, + "loss": 1.5369, + "step": 23280 + }, + { + "epoch": 3.8487915719892585, + "grad_norm": 10.60979175567627, + "learning_rate": 3.4172496740667294e-05, + "loss": 1.591, + "step": 23290 + }, + { + "epoch": 3.850444123115059, + "grad_norm": 9.582144737243652, + "learning_rate": 3.416331552176868e-05, + "loss": 1.6382, + "step": 23300 + }, + { + "epoch": 3.852096674240859, + "grad_norm": 14.1846923828125, + "learning_rate": 3.415413430287005e-05, + "loss": 1.5981, + "step": 23310 + }, + { + "epoch": 3.8537492253666596, + "grad_norm": 10.367266654968262, + "learning_rate": 3.414495308397143e-05, + "loss": 1.5864, + "step": 23320 + }, + { + "epoch": 3.85540177649246, + "grad_norm": 9.663958549499512, + "learning_rate": 3.4135771865072804e-05, + "loss": 1.544, + "step": 23330 + }, + { + "epoch": 3.8570543276182607, + "grad_norm": 13.666308403015137, + "learning_rate": 3.412659064617419e-05, + "loss": 1.4994, + "step": 23340 + }, + { + "epoch": 3.8587068787440613, + "grad_norm": 9.828655242919922, + "learning_rate": 3.411740942727556e-05, + "loss": 1.5709, + "step": 23350 + }, + { + "epoch": 3.8603594298698614, + "grad_norm": 9.607194900512695, + "learning_rate": 3.4108228208376945e-05, + "loss": 1.54, + "step": 23360 + }, + { + "epoch": 3.862011980995662, + "grad_norm": 12.873283386230469, 
+ "learning_rate": 3.409904698947832e-05, + "loss": 1.3461, + "step": 23370 + }, + { + "epoch": 3.8636645321214624, + "grad_norm": 8.448955535888672, + "learning_rate": 3.4089865770579704e-05, + "loss": 1.5732, + "step": 23380 + }, + { + "epoch": 3.865317083247263, + "grad_norm": 36.90695571899414, + "learning_rate": 3.408068455168108e-05, + "loss": 1.539, + "step": 23390 + }, + { + "epoch": 3.8669696343730635, + "grad_norm": 14.371119499206543, + "learning_rate": 3.407150333278246e-05, + "loss": 1.542, + "step": 23400 + }, + { + "epoch": 3.8686221854988636, + "grad_norm": 6.606020450592041, + "learning_rate": 3.4062322113883845e-05, + "loss": 1.412, + "step": 23410 + }, + { + "epoch": 3.8702747366246646, + "grad_norm": 7.5736083984375, + "learning_rate": 3.405314089498522e-05, + "loss": 1.6222, + "step": 23420 + }, + { + "epoch": 3.8719272877504647, + "grad_norm": 184.7552490234375, + "learning_rate": 3.4043959676086604e-05, + "loss": 1.5178, + "step": 23430 + }, + { + "epoch": 3.8735798388762652, + "grad_norm": 7.478529930114746, + "learning_rate": 3.403477845718798e-05, + "loss": 1.6738, + "step": 23440 + }, + { + "epoch": 3.8752323900020658, + "grad_norm": 14.52708911895752, + "learning_rate": 3.4025597238289355e-05, + "loss": 1.6449, + "step": 23450 + }, + { + "epoch": 3.8768849411278663, + "grad_norm": 8.332096099853516, + "learning_rate": 3.401641601939073e-05, + "loss": 1.5679, + "step": 23460 + }, + { + "epoch": 3.878537492253667, + "grad_norm": 9.562748908996582, + "learning_rate": 3.4007234800492114e-05, + "loss": 1.5742, + "step": 23470 + }, + { + "epoch": 3.880190043379467, + "grad_norm": 7.013890743255615, + "learning_rate": 3.399805358159349e-05, + "loss": 1.6466, + "step": 23480 + }, + { + "epoch": 3.8818425945052675, + "grad_norm": 16.500028610229492, + "learning_rate": 3.398887236269487e-05, + "loss": 1.6899, + "step": 23490 + }, + { + "epoch": 3.883495145631068, + "grad_norm": 8.28381633758545, + "learning_rate": 3.397969114379625e-05, + "loss": 1.5692, + "step": 23500 + }, + { + "epoch": 3.8851476967568686, + "grad_norm": 35.62749099731445, + "learning_rate": 3.397050992489763e-05, + "loss": 1.4717, + "step": 23510 + }, + { + "epoch": 3.886800247882669, + "grad_norm": 8.142434120178223, + "learning_rate": 3.396132870599901e-05, + "loss": 1.4372, + "step": 23520 + }, + { + "epoch": 3.888452799008469, + "grad_norm": 13.488250732421875, + "learning_rate": 3.395214748710039e-05, + "loss": 1.6781, + "step": 23530 + }, + { + "epoch": 3.8901053501342697, + "grad_norm": 8.421558380126953, + "learning_rate": 3.394296626820177e-05, + "loss": 1.4748, + "step": 23540 + }, + { + "epoch": 3.8917579012600703, + "grad_norm": 14.573169708251953, + "learning_rate": 3.393378504930315e-05, + "loss": 1.5179, + "step": 23550 + }, + { + "epoch": 3.893410452385871, + "grad_norm": 8.99917984008789, + "learning_rate": 3.392460383040453e-05, + "loss": 1.5438, + "step": 23560 + }, + { + "epoch": 3.8950630035116713, + "grad_norm": 16.3277645111084, + "learning_rate": 3.3915422611505906e-05, + "loss": 1.5428, + "step": 23570 + }, + { + "epoch": 3.8967155546374714, + "grad_norm": 7.9492878913879395, + "learning_rate": 3.390624139260728e-05, + "loss": 1.6256, + "step": 23580 + }, + { + "epoch": 3.898368105763272, + "grad_norm": 7.273673057556152, + "learning_rate": 3.389706017370866e-05, + "loss": 1.6642, + "step": 23590 + }, + { + "epoch": 3.9000206568890725, + "grad_norm": 8.6654634475708, + "learning_rate": 3.388787895481004e-05, + "loss": 1.5932, + "step": 23600 + }, + { + "epoch": 
3.901673208014873, + "grad_norm": 10.21167278289795, + "learning_rate": 3.3878697735911416e-05, + "loss": 1.5356, + "step": 23610 + }, + { + "epoch": 3.9033257591406736, + "grad_norm": 16.551963806152344, + "learning_rate": 3.38695165170128e-05, + "loss": 1.6243, + "step": 23620 + }, + { + "epoch": 3.9049783102664737, + "grad_norm": 28.581851959228516, + "learning_rate": 3.386033529811418e-05, + "loss": 1.6437, + "step": 23630 + }, + { + "epoch": 3.9066308613922742, + "grad_norm": 22.335962295532227, + "learning_rate": 3.385115407921556e-05, + "loss": 1.4263, + "step": 23640 + }, + { + "epoch": 3.9082834125180748, + "grad_norm": 7.5155158042907715, + "learning_rate": 3.384197286031694e-05, + "loss": 1.4715, + "step": 23650 + }, + { + "epoch": 3.9099359636438753, + "grad_norm": 12.375370979309082, + "learning_rate": 3.3832791641418316e-05, + "loss": 1.5336, + "step": 23660 + }, + { + "epoch": 3.911588514769676, + "grad_norm": 26.960378646850586, + "learning_rate": 3.38236104225197e-05, + "loss": 1.5445, + "step": 23670 + }, + { + "epoch": 3.913241065895476, + "grad_norm": 17.1274471282959, + "learning_rate": 3.3814429203621075e-05, + "loss": 1.6372, + "step": 23680 + }, + { + "epoch": 3.9148936170212765, + "grad_norm": 8.658958435058594, + "learning_rate": 3.380524798472246e-05, + "loss": 1.5278, + "step": 23690 + }, + { + "epoch": 3.916546168147077, + "grad_norm": 8.599283218383789, + "learning_rate": 3.379606676582383e-05, + "loss": 1.5516, + "step": 23700 + }, + { + "epoch": 3.9181987192728776, + "grad_norm": 17.065500259399414, + "learning_rate": 3.378688554692521e-05, + "loss": 1.5585, + "step": 23710 + }, + { + "epoch": 3.919851270398678, + "grad_norm": 6.110587120056152, + "learning_rate": 3.3777704328026585e-05, + "loss": 1.4451, + "step": 23720 + }, + { + "epoch": 3.921503821524478, + "grad_norm": 11.20459270477295, + "learning_rate": 3.376852310912797e-05, + "loss": 1.5366, + "step": 23730 + }, + { + "epoch": 3.9231563726502787, + "grad_norm": 7.95195198059082, + "learning_rate": 3.375934189022935e-05, + "loss": 1.3379, + "step": 23740 + }, + { + "epoch": 3.9248089237760793, + "grad_norm": 17.207660675048828, + "learning_rate": 3.3750160671330726e-05, + "loss": 1.5098, + "step": 23750 + }, + { + "epoch": 3.92646147490188, + "grad_norm": 18.350038528442383, + "learning_rate": 3.374097945243211e-05, + "loss": 1.421, + "step": 23760 + }, + { + "epoch": 3.9281140260276803, + "grad_norm": 21.70857048034668, + "learning_rate": 3.3731798233533484e-05, + "loss": 1.5718, + "step": 23770 + }, + { + "epoch": 3.9297665771534804, + "grad_norm": 12.964973449707031, + "learning_rate": 3.372261701463487e-05, + "loss": 1.5394, + "step": 23780 + }, + { + "epoch": 3.9314191282792814, + "grad_norm": 7.696381092071533, + "learning_rate": 3.371343579573624e-05, + "loss": 1.392, + "step": 23790 + }, + { + "epoch": 3.9330716794050815, + "grad_norm": 8.877033233642578, + "learning_rate": 3.3704254576837626e-05, + "loss": 1.6341, + "step": 23800 + }, + { + "epoch": 3.934724230530882, + "grad_norm": 6.651622772216797, + "learning_rate": 3.3695073357939e-05, + "loss": 1.3865, + "step": 23810 + }, + { + "epoch": 3.9363767816566826, + "grad_norm": 13.928619384765625, + "learning_rate": 3.3685892139040384e-05, + "loss": 1.5814, + "step": 23820 + }, + { + "epoch": 3.9380293327824827, + "grad_norm": 14.732012748718262, + "learning_rate": 3.367671092014176e-05, + "loss": 1.4712, + "step": 23830 + }, + { + "epoch": 3.9396818839082837, + "grad_norm": 14.067058563232422, + "learning_rate": 3.3667529701243136e-05, + 
"loss": 1.5988, + "step": 23840 + }, + { + "epoch": 3.9413344350340838, + "grad_norm": 10.48046875, + "learning_rate": 3.365834848234452e-05, + "loss": 1.6534, + "step": 23850 + }, + { + "epoch": 3.9429869861598843, + "grad_norm": 25.77431297302246, + "learning_rate": 3.3649167263445894e-05, + "loss": 1.4357, + "step": 23860 + }, + { + "epoch": 3.944639537285685, + "grad_norm": 39.13468551635742, + "learning_rate": 3.363998604454728e-05, + "loss": 1.5746, + "step": 23870 + }, + { + "epoch": 3.9462920884114854, + "grad_norm": 12.897835731506348, + "learning_rate": 3.363080482564865e-05, + "loss": 1.435, + "step": 23880 + }, + { + "epoch": 3.947944639537286, + "grad_norm": 18.15728187561035, + "learning_rate": 3.3621623606750035e-05, + "loss": 1.5303, + "step": 23890 + }, + { + "epoch": 3.949597190663086, + "grad_norm": 20.04810905456543, + "learning_rate": 3.361244238785141e-05, + "loss": 1.5076, + "step": 23900 + }, + { + "epoch": 3.9512497417888865, + "grad_norm": 6.630679607391357, + "learning_rate": 3.3603261168952794e-05, + "loss": 1.4879, + "step": 23910 + }, + { + "epoch": 3.952902292914687, + "grad_norm": 30.30918312072754, + "learning_rate": 3.359407995005417e-05, + "loss": 1.5553, + "step": 23920 + }, + { + "epoch": 3.9545548440404876, + "grad_norm": 7.379817008972168, + "learning_rate": 3.358489873115555e-05, + "loss": 1.5888, + "step": 23930 + }, + { + "epoch": 3.956207395166288, + "grad_norm": 8.264845848083496, + "learning_rate": 3.357571751225693e-05, + "loss": 1.637, + "step": 23940 + }, + { + "epoch": 3.9578599462920883, + "grad_norm": 14.769917488098145, + "learning_rate": 3.356653629335831e-05, + "loss": 1.5499, + "step": 23950 + }, + { + "epoch": 3.959512497417889, + "grad_norm": 6.285022735595703, + "learning_rate": 3.355735507445969e-05, + "loss": 1.6016, + "step": 23960 + }, + { + "epoch": 3.9611650485436893, + "grad_norm": 8.565293312072754, + "learning_rate": 3.354817385556106e-05, + "loss": 1.4791, + "step": 23970 + }, + { + "epoch": 3.96281759966949, + "grad_norm": 13.735636711120605, + "learning_rate": 3.3538992636662445e-05, + "loss": 1.4269, + "step": 23980 + }, + { + "epoch": 3.9644701507952904, + "grad_norm": 6.193691253662109, + "learning_rate": 3.352981141776382e-05, + "loss": 1.4687, + "step": 23990 + }, + { + "epoch": 3.9661227019210905, + "grad_norm": 9.597599029541016, + "learning_rate": 3.3520630198865204e-05, + "loss": 1.4732, + "step": 24000 + }, + { + "epoch": 3.967775253046891, + "grad_norm": 12.805542945861816, + "learning_rate": 3.351144897996658e-05, + "loss": 1.7671, + "step": 24010 + }, + { + "epoch": 3.9694278041726916, + "grad_norm": 11.650763511657715, + "learning_rate": 3.350226776106796e-05, + "loss": 1.4774, + "step": 24020 + }, + { + "epoch": 3.971080355298492, + "grad_norm": 32.57954406738281, + "learning_rate": 3.349308654216934e-05, + "loss": 1.6312, + "step": 24030 + }, + { + "epoch": 3.9727329064242927, + "grad_norm": 10.600648880004883, + "learning_rate": 3.348390532327072e-05, + "loss": 1.4607, + "step": 24040 + }, + { + "epoch": 3.9743854575500928, + "grad_norm": 7.16843843460083, + "learning_rate": 3.3474724104372097e-05, + "loss": 1.5808, + "step": 24050 + }, + { + "epoch": 3.9760380086758933, + "grad_norm": 60.35005187988281, + "learning_rate": 3.346554288547348e-05, + "loss": 1.4766, + "step": 24060 + }, + { + "epoch": 3.977690559801694, + "grad_norm": 8.780489921569824, + "learning_rate": 3.3456361666574855e-05, + "loss": 1.5446, + "step": 24070 + }, + { + "epoch": 3.9793431109274944, + "grad_norm": 17.84938621520996, + 
"learning_rate": 3.344718044767624e-05, + "loss": 1.569, + "step": 24080 + }, + { + "epoch": 3.980995662053295, + "grad_norm": 10.445038795471191, + "learning_rate": 3.3437999228777614e-05, + "loss": 1.6016, + "step": 24090 + }, + { + "epoch": 3.982648213179095, + "grad_norm": 6.451947212219238, + "learning_rate": 3.342881800987899e-05, + "loss": 1.5652, + "step": 24100 + }, + { + "epoch": 3.9843007643048955, + "grad_norm": 13.274555206298828, + "learning_rate": 3.341963679098037e-05, + "loss": 1.3444, + "step": 24110 + }, + { + "epoch": 3.985953315430696, + "grad_norm": 92.55748748779297, + "learning_rate": 3.341045557208175e-05, + "loss": 1.5804, + "step": 24120 + }, + { + "epoch": 3.9876058665564966, + "grad_norm": 6.433228015899658, + "learning_rate": 3.340127435318313e-05, + "loss": 1.5241, + "step": 24130 + }, + { + "epoch": 3.989258417682297, + "grad_norm": 7.102395534515381, + "learning_rate": 3.3392093134284506e-05, + "loss": 1.4866, + "step": 24140 + }, + { + "epoch": 3.9909109688080973, + "grad_norm": 7.430931568145752, + "learning_rate": 3.338291191538589e-05, + "loss": 1.5228, + "step": 24150 + }, + { + "epoch": 3.992563519933898, + "grad_norm": 7.019854545593262, + "learning_rate": 3.3373730696487265e-05, + "loss": 1.6071, + "step": 24160 + }, + { + "epoch": 3.9942160710596983, + "grad_norm": 5.7070512771606445, + "learning_rate": 3.336454947758865e-05, + "loss": 1.4218, + "step": 24170 + }, + { + "epoch": 3.995868622185499, + "grad_norm": 15.087320327758789, + "learning_rate": 3.335536825869002e-05, + "loss": 1.4703, + "step": 24180 + }, + { + "epoch": 3.9975211733112994, + "grad_norm": 10.09599781036377, + "learning_rate": 3.3346187039791406e-05, + "loss": 1.5294, + "step": 24190 + }, + { + "epoch": 3.9991737244370995, + "grad_norm": 9.25766658782959, + "learning_rate": 3.333700582089279e-05, + "loss": 1.5693, + "step": 24200 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.3066392396084551, + "eval_loss": 2.2003278732299805, + "eval_runtime": 814.2753, + "eval_samples_per_second": 34.627, + "eval_steps_per_second": 8.657, + "step": 24205 + }, + { + "epoch": 4.0008262755629005, + "grad_norm": 6.969991207122803, + "learning_rate": 3.3327824601994165e-05, + "loss": 1.5264, + "step": 24210 + }, + { + "epoch": 4.002478826688701, + "grad_norm": 6.718509674072266, + "learning_rate": 3.331864338309554e-05, + "loss": 1.5123, + "step": 24220 + }, + { + "epoch": 4.004131377814502, + "grad_norm": 13.605385780334473, + "learning_rate": 3.3309462164196916e-05, + "loss": 1.5449, + "step": 24230 + }, + { + "epoch": 4.005783928940302, + "grad_norm": 8.160221099853516, + "learning_rate": 3.33002809452983e-05, + "loss": 1.6421, + "step": 24240 + }, + { + "epoch": 4.007436480066102, + "grad_norm": 6.459905624389648, + "learning_rate": 3.3291099726399675e-05, + "loss": 1.6331, + "step": 24250 + }, + { + "epoch": 4.009089031191903, + "grad_norm": 11.185013771057129, + "learning_rate": 3.328191850750106e-05, + "loss": 1.4834, + "step": 24260 + }, + { + "epoch": 4.010741582317703, + "grad_norm": 11.707094192504883, + "learning_rate": 3.327273728860243e-05, + "loss": 1.5363, + "step": 24270 + }, + { + "epoch": 4.012394133443504, + "grad_norm": 44.96635055541992, + "learning_rate": 3.3263556069703816e-05, + "loss": 1.4972, + "step": 24280 + }, + { + "epoch": 4.014046684569304, + "grad_norm": 15.029733657836914, + "learning_rate": 3.325437485080519e-05, + "loss": 1.575, + "step": 24290 + }, + { + "epoch": 4.015699235695104, + "grad_norm": 8.984262466430664, + "learning_rate": 3.3245193631906574e-05, + 
"loss": 1.6195, + "step": 24300 + }, + { + "epoch": 4.017351786820905, + "grad_norm": 6.634871482849121, + "learning_rate": 3.323601241300795e-05, + "loss": 1.5722, + "step": 24310 + }, + { + "epoch": 4.019004337946705, + "grad_norm": 13.543932914733887, + "learning_rate": 3.322683119410933e-05, + "loss": 1.6134, + "step": 24320 + }, + { + "epoch": 4.020656889072506, + "grad_norm": 7.46361780166626, + "learning_rate": 3.3217649975210715e-05, + "loss": 1.41, + "step": 24330 + }, + { + "epoch": 4.022309440198306, + "grad_norm": 16.312719345092773, + "learning_rate": 3.320846875631209e-05, + "loss": 1.4198, + "step": 24340 + }, + { + "epoch": 4.023961991324106, + "grad_norm": 16.578506469726562, + "learning_rate": 3.319928753741347e-05, + "loss": 1.4846, + "step": 24350 + }, + { + "epoch": 4.025614542449907, + "grad_norm": 25.313661575317383, + "learning_rate": 3.319010631851484e-05, + "loss": 1.4667, + "step": 24360 + }, + { + "epoch": 4.027267093575707, + "grad_norm": 71.65775299072266, + "learning_rate": 3.3180925099616226e-05, + "loss": 1.6426, + "step": 24370 + }, + { + "epoch": 4.028919644701508, + "grad_norm": 17.943498611450195, + "learning_rate": 3.31717438807176e-05, + "loss": 1.5965, + "step": 24380 + }, + { + "epoch": 4.030572195827308, + "grad_norm": 12.518502235412598, + "learning_rate": 3.3162562661818984e-05, + "loss": 1.57, + "step": 24390 + }, + { + "epoch": 4.0322247469531085, + "grad_norm": 64.42125701904297, + "learning_rate": 3.315338144292036e-05, + "loss": 1.5285, + "step": 24400 + }, + { + "epoch": 4.0338772980789095, + "grad_norm": 12.844326972961426, + "learning_rate": 3.314420022402174e-05, + "loss": 1.5768, + "step": 24410 + }, + { + "epoch": 4.03552984920471, + "grad_norm": 14.874149322509766, + "learning_rate": 3.313501900512312e-05, + "loss": 1.424, + "step": 24420 + }, + { + "epoch": 4.037182400330511, + "grad_norm": 18.798112869262695, + "learning_rate": 3.31258377862245e-05, + "loss": 1.6189, + "step": 24430 + }, + { + "epoch": 4.038834951456311, + "grad_norm": 8.762454986572266, + "learning_rate": 3.3116656567325884e-05, + "loss": 1.6398, + "step": 24440 + }, + { + "epoch": 4.040487502582111, + "grad_norm": 47.06496047973633, + "learning_rate": 3.310747534842726e-05, + "loss": 1.4831, + "step": 24450 + }, + { + "epoch": 4.042140053707912, + "grad_norm": 7.9549384117126465, + "learning_rate": 3.309829412952864e-05, + "loss": 1.5666, + "step": 24460 + }, + { + "epoch": 4.043792604833712, + "grad_norm": 8.519120216369629, + "learning_rate": 3.308911291063002e-05, + "loss": 1.5574, + "step": 24470 + }, + { + "epoch": 4.045445155959513, + "grad_norm": 16.42377471923828, + "learning_rate": 3.3079931691731394e-05, + "loss": 1.5932, + "step": 24480 + }, + { + "epoch": 4.047097707085313, + "grad_norm": 6.637816429138184, + "learning_rate": 3.307075047283277e-05, + "loss": 1.5165, + "step": 24490 + }, + { + "epoch": 4.048750258211113, + "grad_norm": 91.19857025146484, + "learning_rate": 3.306156925393415e-05, + "loss": 1.4989, + "step": 24500 + }, + { + "epoch": 4.050402809336914, + "grad_norm": 8.961830139160156, + "learning_rate": 3.305238803503553e-05, + "loss": 1.5585, + "step": 24510 + }, + { + "epoch": 4.052055360462714, + "grad_norm": 11.918331146240234, + "learning_rate": 3.304320681613691e-05, + "loss": 1.3883, + "step": 24520 + }, + { + "epoch": 4.053707911588515, + "grad_norm": 15.587654113769531, + "learning_rate": 3.303402559723829e-05, + "loss": 1.5789, + "step": 24530 + }, + { + "epoch": 4.055360462714315, + "grad_norm": 15.174756050109863, + 
"learning_rate": 3.302484437833967e-05, + "loss": 1.5297, + "step": 24540 + }, + { + "epoch": 4.057013013840115, + "grad_norm": 8.750446319580078, + "learning_rate": 3.301566315944105e-05, + "loss": 1.4318, + "step": 24550 + }, + { + "epoch": 4.058665564965916, + "grad_norm": 10.243573188781738, + "learning_rate": 3.300648194054243e-05, + "loss": 1.5251, + "step": 24560 + }, + { + "epoch": 4.060318116091716, + "grad_norm": 21.074369430541992, + "learning_rate": 3.299730072164381e-05, + "loss": 1.7139, + "step": 24570 + }, + { + "epoch": 4.061970667217517, + "grad_norm": 8.997513771057129, + "learning_rate": 3.2988119502745187e-05, + "loss": 1.7064, + "step": 24580 + }, + { + "epoch": 4.063623218343317, + "grad_norm": 29.157140731811523, + "learning_rate": 3.297893828384657e-05, + "loss": 1.464, + "step": 24590 + }, + { + "epoch": 4.0652757694691175, + "grad_norm": 11.638101577758789, + "learning_rate": 3.2969757064947945e-05, + "loss": 1.5183, + "step": 24600 + }, + { + "epoch": 4.0669283205949185, + "grad_norm": 7.134934425354004, + "learning_rate": 3.296057584604932e-05, + "loss": 1.4353, + "step": 24610 + }, + { + "epoch": 4.068580871720719, + "grad_norm": 4.920504093170166, + "learning_rate": 3.2951394627150704e-05, + "loss": 1.5342, + "step": 24620 + }, + { + "epoch": 4.07023342284652, + "grad_norm": 8.68477725982666, + "learning_rate": 3.294221340825208e-05, + "loss": 1.49, + "step": 24630 + }, + { + "epoch": 4.07188597397232, + "grad_norm": 9.163007736206055, + "learning_rate": 3.2933032189353455e-05, + "loss": 1.3947, + "step": 24640 + }, + { + "epoch": 4.073538525098121, + "grad_norm": 32.37632751464844, + "learning_rate": 3.292385097045484e-05, + "loss": 1.4837, + "step": 24650 + }, + { + "epoch": 4.075191076223921, + "grad_norm": 9.217835426330566, + "learning_rate": 3.291466975155622e-05, + "loss": 1.579, + "step": 24660 + }, + { + "epoch": 4.076843627349721, + "grad_norm": 11.495637893676758, + "learning_rate": 3.2905488532657596e-05, + "loss": 1.6953, + "step": 24670 + }, + { + "epoch": 4.078496178475522, + "grad_norm": 7.819090843200684, + "learning_rate": 3.289630731375898e-05, + "loss": 1.5833, + "step": 24680 + }, + { + "epoch": 4.080148729601322, + "grad_norm": 17.969079971313477, + "learning_rate": 3.2887126094860355e-05, + "loss": 1.3082, + "step": 24690 + }, + { + "epoch": 4.081801280727123, + "grad_norm": 11.410839080810547, + "learning_rate": 3.287794487596174e-05, + "loss": 1.5623, + "step": 24700 + }, + { + "epoch": 4.083453831852923, + "grad_norm": 37.196895599365234, + "learning_rate": 3.286876365706311e-05, + "loss": 1.5952, + "step": 24710 + }, + { + "epoch": 4.085106382978723, + "grad_norm": 7.64441442489624, + "learning_rate": 3.2859582438164496e-05, + "loss": 1.4717, + "step": 24720 + }, + { + "epoch": 4.086758934104524, + "grad_norm": 8.817895889282227, + "learning_rate": 3.285040121926587e-05, + "loss": 1.4053, + "step": 24730 + }, + { + "epoch": 4.088411485230324, + "grad_norm": 19.185001373291016, + "learning_rate": 3.284122000036725e-05, + "loss": 1.5218, + "step": 24740 + }, + { + "epoch": 4.090064036356125, + "grad_norm": 15.323920249938965, + "learning_rate": 3.283203878146863e-05, + "loss": 1.5812, + "step": 24750 + }, + { + "epoch": 4.091716587481925, + "grad_norm": 17.28553009033203, + "learning_rate": 3.2822857562570006e-05, + "loss": 1.4426, + "step": 24760 + }, + { + "epoch": 4.093369138607725, + "grad_norm": 10.657970428466797, + "learning_rate": 3.281367634367139e-05, + "loss": 1.5622, + "step": 24770 + }, + { + "epoch": 4.095021689733526, + 
"grad_norm": 10.3989896774292, + "learning_rate": 3.2804495124772765e-05, + "loss": 1.5272, + "step": 24780 + }, + { + "epoch": 4.096674240859326, + "grad_norm": 8.306746482849121, + "learning_rate": 3.279531390587415e-05, + "loss": 1.6179, + "step": 24790 + }, + { + "epoch": 4.098326791985127, + "grad_norm": 7.192846298217773, + "learning_rate": 3.278613268697552e-05, + "loss": 1.5878, + "step": 24800 + }, + { + "epoch": 4.0999793431109275, + "grad_norm": 9.504925727844238, + "learning_rate": 3.2776951468076906e-05, + "loss": 1.4805, + "step": 24810 + }, + { + "epoch": 4.101631894236728, + "grad_norm": 7.679511070251465, + "learning_rate": 3.276777024917828e-05, + "loss": 1.6694, + "step": 24820 + }, + { + "epoch": 4.103284445362529, + "grad_norm": 8.45281982421875, + "learning_rate": 3.2758589030279664e-05, + "loss": 1.4295, + "step": 24830 + }, + { + "epoch": 4.104936996488329, + "grad_norm": 33.410911560058594, + "learning_rate": 3.274940781138104e-05, + "loss": 1.6718, + "step": 24840 + }, + { + "epoch": 4.10658954761413, + "grad_norm": 10.545844078063965, + "learning_rate": 3.274022659248242e-05, + "loss": 1.4676, + "step": 24850 + }, + { + "epoch": 4.10824209873993, + "grad_norm": 9.887289047241211, + "learning_rate": 3.27310453735838e-05, + "loss": 1.4952, + "step": 24860 + }, + { + "epoch": 4.10989464986573, + "grad_norm": 13.104098320007324, + "learning_rate": 3.2721864154685175e-05, + "loss": 1.6299, + "step": 24870 + }, + { + "epoch": 4.111547200991531, + "grad_norm": 8.035774230957031, + "learning_rate": 3.271268293578656e-05, + "loss": 1.5514, + "step": 24880 + }, + { + "epoch": 4.113199752117331, + "grad_norm": 7.870905876159668, + "learning_rate": 3.270350171688793e-05, + "loss": 1.4943, + "step": 24890 + }, + { + "epoch": 4.114852303243132, + "grad_norm": 8.745378494262695, + "learning_rate": 3.2694320497989316e-05, + "loss": 1.4858, + "step": 24900 + }, + { + "epoch": 4.116504854368932, + "grad_norm": 10.829198837280273, + "learning_rate": 3.268513927909069e-05, + "loss": 1.5582, + "step": 24910 + }, + { + "epoch": 4.118157405494732, + "grad_norm": 19.55958366394043, + "learning_rate": 3.2675958060192074e-05, + "loss": 1.5017, + "step": 24920 + }, + { + "epoch": 4.119809956620533, + "grad_norm": 12.78548812866211, + "learning_rate": 3.266677684129345e-05, + "loss": 1.4994, + "step": 24930 + }, + { + "epoch": 4.121462507746333, + "grad_norm": 13.971933364868164, + "learning_rate": 3.265759562239483e-05, + "loss": 1.41, + "step": 24940 + }, + { + "epoch": 4.123115058872134, + "grad_norm": 7.70231294631958, + "learning_rate": 3.264841440349621e-05, + "loss": 1.5179, + "step": 24950 + }, + { + "epoch": 4.124767609997934, + "grad_norm": 11.750558853149414, + "learning_rate": 3.263923318459759e-05, + "loss": 1.6527, + "step": 24960 + }, + { + "epoch": 4.126420161123734, + "grad_norm": 12.228842735290527, + "learning_rate": 3.263005196569897e-05, + "loss": 1.6643, + "step": 24970 + }, + { + "epoch": 4.128072712249535, + "grad_norm": 8.996533393859863, + "learning_rate": 3.262087074680035e-05, + "loss": 1.4033, + "step": 24980 + }, + { + "epoch": 4.129725263375335, + "grad_norm": 5.231132984161377, + "learning_rate": 3.2611689527901726e-05, + "loss": 1.4871, + "step": 24990 + }, + { + "epoch": 4.131377814501136, + "grad_norm": 8.555381774902344, + "learning_rate": 3.26025083090031e-05, + "loss": 1.5646, + "step": 25000 + }, + { + "epoch": 4.1330303656269365, + "grad_norm": 6.629762649536133, + "learning_rate": 3.2593327090104484e-05, + "loss": 1.5089, + "step": 25010 + }, + { + 
"epoch": 4.1346829167527375, + "grad_norm": 12.015291213989258, + "learning_rate": 3.258414587120586e-05, + "loss": 1.5633, + "step": 25020 + }, + { + "epoch": 4.136335467878538, + "grad_norm": 7.017959117889404, + "learning_rate": 3.257496465230724e-05, + "loss": 1.5515, + "step": 25030 + }, + { + "epoch": 4.137988019004338, + "grad_norm": 9.60529899597168, + "learning_rate": 3.256578343340862e-05, + "loss": 1.4825, + "step": 25040 + }, + { + "epoch": 4.139640570130139, + "grad_norm": 10.894176483154297, + "learning_rate": 3.255660221451e-05, + "loss": 1.4634, + "step": 25050 + }, + { + "epoch": 4.141293121255939, + "grad_norm": 11.371975898742676, + "learning_rate": 3.254742099561138e-05, + "loss": 1.5382, + "step": 25060 + }, + { + "epoch": 4.14294567238174, + "grad_norm": 6.190928936004639, + "learning_rate": 3.253823977671276e-05, + "loss": 1.3551, + "step": 25070 + }, + { + "epoch": 4.14459822350754, + "grad_norm": 36.7093620300293, + "learning_rate": 3.2529058557814135e-05, + "loss": 1.4276, + "step": 25080 + }, + { + "epoch": 4.14625077463334, + "grad_norm": 9.693130493164062, + "learning_rate": 3.251987733891552e-05, + "loss": 1.6435, + "step": 25090 + }, + { + "epoch": 4.147903325759141, + "grad_norm": 25.358810424804688, + "learning_rate": 3.2510696120016894e-05, + "loss": 1.4884, + "step": 25100 + }, + { + "epoch": 4.149555876884941, + "grad_norm": 16.18667984008789, + "learning_rate": 3.2501514901118276e-05, + "loss": 1.5553, + "step": 25110 + }, + { + "epoch": 4.151208428010742, + "grad_norm": 7.252718925476074, + "learning_rate": 3.249233368221965e-05, + "loss": 1.4565, + "step": 25120 + }, + { + "epoch": 4.152860979136542, + "grad_norm": 14.278046607971191, + "learning_rate": 3.248315246332103e-05, + "loss": 1.5002, + "step": 25130 + }, + { + "epoch": 4.154513530262342, + "grad_norm": 33.27809524536133, + "learning_rate": 3.247397124442241e-05, + "loss": 1.5514, + "step": 25140 + }, + { + "epoch": 4.156166081388143, + "grad_norm": 11.636425971984863, + "learning_rate": 3.246479002552379e-05, + "loss": 1.4199, + "step": 25150 + }, + { + "epoch": 4.157818632513943, + "grad_norm": 13.851326942443848, + "learning_rate": 3.245560880662517e-05, + "loss": 1.4482, + "step": 25160 + }, + { + "epoch": 4.159471183639744, + "grad_norm": 11.560588836669922, + "learning_rate": 3.2446427587726545e-05, + "loss": 1.3819, + "step": 25170 + }, + { + "epoch": 4.161123734765544, + "grad_norm": 12.765569686889648, + "learning_rate": 3.243724636882793e-05, + "loss": 1.5236, + "step": 25180 + }, + { + "epoch": 4.162776285891344, + "grad_norm": 30.970333099365234, + "learning_rate": 3.2428065149929304e-05, + "loss": 1.5637, + "step": 25190 + }, + { + "epoch": 4.164428837017145, + "grad_norm": 13.522059440612793, + "learning_rate": 3.2418883931030686e-05, + "loss": 1.4642, + "step": 25200 + }, + { + "epoch": 4.1660813881429455, + "grad_norm": 9.605454444885254, + "learning_rate": 3.240970271213206e-05, + "loss": 1.4922, + "step": 25210 + }, + { + "epoch": 4.1677339392687465, + "grad_norm": 15.900501251220703, + "learning_rate": 3.2400521493233445e-05, + "loss": 1.5052, + "step": 25220 + }, + { + "epoch": 4.169386490394547, + "grad_norm": 21.861635208129883, + "learning_rate": 3.239134027433483e-05, + "loss": 1.5523, + "step": 25230 + }, + { + "epoch": 4.171039041520347, + "grad_norm": 10.886978149414062, + "learning_rate": 3.23821590554362e-05, + "loss": 1.5286, + "step": 25240 + }, + { + "epoch": 4.172691592646148, + "grad_norm": 13.990205764770508, + "learning_rate": 3.237297783653758e-05, + 
"loss": 1.5499, + "step": 25250 + }, + { + "epoch": 4.174344143771948, + "grad_norm": 17.176815032958984, + "learning_rate": 3.236379661763896e-05, + "loss": 1.5911, + "step": 25260 + }, + { + "epoch": 4.175996694897749, + "grad_norm": 7.191830635070801, + "learning_rate": 3.235461539874034e-05, + "loss": 1.4213, + "step": 25270 + }, + { + "epoch": 4.177649246023549, + "grad_norm": 8.074373245239258, + "learning_rate": 3.2345434179841714e-05, + "loss": 1.4857, + "step": 25280 + }, + { + "epoch": 4.179301797149349, + "grad_norm": 26.866851806640625, + "learning_rate": 3.2336252960943096e-05, + "loss": 1.629, + "step": 25290 + }, + { + "epoch": 4.18095434827515, + "grad_norm": 9.89456558227539, + "learning_rate": 3.232707174204447e-05, + "loss": 1.5325, + "step": 25300 + }, + { + "epoch": 4.18260689940095, + "grad_norm": 9.872194290161133, + "learning_rate": 3.2317890523145855e-05, + "loss": 1.5802, + "step": 25310 + }, + { + "epoch": 4.184259450526751, + "grad_norm": 8.017595291137695, + "learning_rate": 3.230870930424723e-05, + "loss": 1.4805, + "step": 25320 + }, + { + "epoch": 4.185912001652551, + "grad_norm": 6.862139701843262, + "learning_rate": 3.229952808534861e-05, + "loss": 1.4683, + "step": 25330 + }, + { + "epoch": 4.187564552778351, + "grad_norm": 10.27568244934082, + "learning_rate": 3.2290346866449996e-05, + "loss": 1.5286, + "step": 25340 + }, + { + "epoch": 4.189217103904152, + "grad_norm": 26.036792755126953, + "learning_rate": 3.228116564755137e-05, + "loss": 1.5395, + "step": 25350 + }, + { + "epoch": 4.190869655029952, + "grad_norm": 6.976101398468018, + "learning_rate": 3.2271984428652754e-05, + "loss": 1.5876, + "step": 25360 + }, + { + "epoch": 4.192522206155753, + "grad_norm": 13.752799987792969, + "learning_rate": 3.226280320975413e-05, + "loss": 1.4922, + "step": 25370 + }, + { + "epoch": 4.194174757281553, + "grad_norm": 12.30428409576416, + "learning_rate": 3.2253621990855506e-05, + "loss": 1.5245, + "step": 25380 + }, + { + "epoch": 4.195827308407354, + "grad_norm": 8.185288429260254, + "learning_rate": 3.224444077195689e-05, + "loss": 1.4679, + "step": 25390 + }, + { + "epoch": 4.197479859533154, + "grad_norm": 11.520121574401855, + "learning_rate": 3.2235259553058264e-05, + "loss": 1.5309, + "step": 25400 + }, + { + "epoch": 4.1991324106589545, + "grad_norm": 23.830415725708008, + "learning_rate": 3.222607833415964e-05, + "loss": 1.521, + "step": 25410 + }, + { + "epoch": 4.2007849617847555, + "grad_norm": 13.380767822265625, + "learning_rate": 3.221689711526102e-05, + "loss": 1.6148, + "step": 25420 + }, + { + "epoch": 4.202437512910556, + "grad_norm": 11.789327621459961, + "learning_rate": 3.22077158963624e-05, + "loss": 1.5611, + "step": 25430 + }, + { + "epoch": 4.2040900640363565, + "grad_norm": 8.8134765625, + "learning_rate": 3.219853467746378e-05, + "loss": 1.4858, + "step": 25440 + }, + { + "epoch": 4.205742615162157, + "grad_norm": 9.195273399353027, + "learning_rate": 3.218935345856516e-05, + "loss": 1.457, + "step": 25450 + }, + { + "epoch": 4.207395166287957, + "grad_norm": 12.645697593688965, + "learning_rate": 3.218017223966654e-05, + "loss": 1.5891, + "step": 25460 + }, + { + "epoch": 4.209047717413758, + "grad_norm": 14.299896240234375, + "learning_rate": 3.217099102076792e-05, + "loss": 1.6082, + "step": 25470 + }, + { + "epoch": 4.210700268539558, + "grad_norm": 16.324865341186523, + "learning_rate": 3.21618098018693e-05, + "loss": 1.5836, + "step": 25480 + }, + { + "epoch": 4.212352819665359, + "grad_norm": 16.070384979248047, + 
"learning_rate": 3.215262858297068e-05, + "loss": 1.5849, + "step": 25490 + }, + { + "epoch": 4.214005370791159, + "grad_norm": 20.05198097229004, + "learning_rate": 3.214344736407206e-05, + "loss": 1.6418, + "step": 25500 + }, + { + "epoch": 4.215657921916959, + "grad_norm": 15.854426383972168, + "learning_rate": 3.213426614517343e-05, + "loss": 1.5086, + "step": 25510 + }, + { + "epoch": 4.21731047304276, + "grad_norm": 13.941832542419434, + "learning_rate": 3.2125084926274815e-05, + "loss": 1.6784, + "step": 25520 + }, + { + "epoch": 4.21896302416856, + "grad_norm": 11.559526443481445, + "learning_rate": 3.211590370737619e-05, + "loss": 1.5038, + "step": 25530 + }, + { + "epoch": 4.220615575294361, + "grad_norm": 8.316058158874512, + "learning_rate": 3.210672248847757e-05, + "loss": 1.6032, + "step": 25540 + }, + { + "epoch": 4.222268126420161, + "grad_norm": 9.870329856872559, + "learning_rate": 3.209754126957895e-05, + "loss": 1.4683, + "step": 25550 + }, + { + "epoch": 4.223920677545961, + "grad_norm": 7.676739692687988, + "learning_rate": 3.2088360050680326e-05, + "loss": 1.499, + "step": 25560 + }, + { + "epoch": 4.225573228671762, + "grad_norm": 10.399463653564453, + "learning_rate": 3.207917883178171e-05, + "loss": 1.4412, + "step": 25570 + }, + { + "epoch": 4.227225779797562, + "grad_norm": 15.944662094116211, + "learning_rate": 3.206999761288309e-05, + "loss": 1.5239, + "step": 25580 + }, + { + "epoch": 4.228878330923363, + "grad_norm": 10.55685806274414, + "learning_rate": 3.206081639398447e-05, + "loss": 1.4544, + "step": 25590 + }, + { + "epoch": 4.230530882049163, + "grad_norm": 16.161338806152344, + "learning_rate": 3.205163517508585e-05, + "loss": 1.4372, + "step": 25600 + }, + { + "epoch": 4.2321834331749635, + "grad_norm": 11.07420539855957, + "learning_rate": 3.2042453956187225e-05, + "loss": 1.5521, + "step": 25610 + }, + { + "epoch": 4.2338359843007645, + "grad_norm": 19.852649688720703, + "learning_rate": 3.203327273728861e-05, + "loss": 1.5212, + "step": 25620 + }, + { + "epoch": 4.235488535426565, + "grad_norm": 12.890336036682129, + "learning_rate": 3.2024091518389984e-05, + "loss": 1.6077, + "step": 25630 + }, + { + "epoch": 4.2371410865523655, + "grad_norm": 17.776052474975586, + "learning_rate": 3.201491029949136e-05, + "loss": 1.6323, + "step": 25640 + }, + { + "epoch": 4.238793637678166, + "grad_norm": 14.739234924316406, + "learning_rate": 3.200572908059274e-05, + "loss": 1.5555, + "step": 25650 + }, + { + "epoch": 4.240446188803966, + "grad_norm": 9.28357982635498, + "learning_rate": 3.199654786169412e-05, + "loss": 1.5533, + "step": 25660 + }, + { + "epoch": 4.242098739929767, + "grad_norm": 9.304981231689453, + "learning_rate": 3.1987366642795494e-05, + "loss": 1.5796, + "step": 25670 + }, + { + "epoch": 4.243751291055567, + "grad_norm": 17.264741897583008, + "learning_rate": 3.197818542389688e-05, + "loss": 1.3938, + "step": 25680 + }, + { + "epoch": 4.245403842181368, + "grad_norm": 6.289753437042236, + "learning_rate": 3.196900420499826e-05, + "loss": 1.3865, + "step": 25690 + }, + { + "epoch": 4.247056393307168, + "grad_norm": 7.785802364349365, + "learning_rate": 3.1959822986099635e-05, + "loss": 1.4776, + "step": 25700 + }, + { + "epoch": 4.248708944432968, + "grad_norm": 39.08346939086914, + "learning_rate": 3.195064176720102e-05, + "loss": 1.5611, + "step": 25710 + }, + { + "epoch": 4.250361495558769, + "grad_norm": 29.96815299987793, + "learning_rate": 3.1941460548302394e-05, + "loss": 1.5694, + "step": 25720 + }, + { + "epoch": 
4.252014046684569, + "grad_norm": 17.154983520507812, + "learning_rate": 3.1932279329403776e-05, + "loss": 1.5357, + "step": 25730 + }, + { + "epoch": 4.25366659781037, + "grad_norm": 11.302206993103027, + "learning_rate": 3.192309811050515e-05, + "loss": 1.6063, + "step": 25740 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 9.660160064697266, + "learning_rate": 3.1913916891606535e-05, + "loss": 1.5413, + "step": 25750 + }, + { + "epoch": 4.256971700061971, + "grad_norm": 7.870529651641846, + "learning_rate": 3.190473567270791e-05, + "loss": 1.5043, + "step": 25760 + }, + { + "epoch": 4.258624251187771, + "grad_norm": 16.945133209228516, + "learning_rate": 3.1895554453809286e-05, + "loss": 1.5214, + "step": 25770 + }, + { + "epoch": 4.260276802313571, + "grad_norm": 11.31995677947998, + "learning_rate": 3.188637323491067e-05, + "loss": 1.4101, + "step": 25780 + }, + { + "epoch": 4.261929353439372, + "grad_norm": 16.450275421142578, + "learning_rate": 3.1877192016012045e-05, + "loss": 1.4787, + "step": 25790 + }, + { + "epoch": 4.263581904565172, + "grad_norm": 10.560013771057129, + "learning_rate": 3.186801079711343e-05, + "loss": 1.49, + "step": 25800 + }, + { + "epoch": 4.2652344556909725, + "grad_norm": 27.106367111206055, + "learning_rate": 3.1858829578214803e-05, + "loss": 1.5307, + "step": 25810 + }, + { + "epoch": 4.2668870068167735, + "grad_norm": 9.301347732543945, + "learning_rate": 3.1849648359316186e-05, + "loss": 1.4364, + "step": 25820 + }, + { + "epoch": 4.268539557942574, + "grad_norm": 13.334724426269531, + "learning_rate": 3.184046714041756e-05, + "loss": 1.6001, + "step": 25830 + }, + { + "epoch": 4.2701921090683745, + "grad_norm": 9.709102630615234, + "learning_rate": 3.1831285921518945e-05, + "loss": 1.4029, + "step": 25840 + }, + { + "epoch": 4.271844660194175, + "grad_norm": 28.314720153808594, + "learning_rate": 3.182210470262032e-05, + "loss": 1.4576, + "step": 25850 + }, + { + "epoch": 4.273497211319976, + "grad_norm": 9.064886093139648, + "learning_rate": 3.18129234837217e-05, + "loss": 1.5087, + "step": 25860 + }, + { + "epoch": 4.275149762445776, + "grad_norm": 8.588004112243652, + "learning_rate": 3.180374226482308e-05, + "loss": 1.4572, + "step": 25870 + }, + { + "epoch": 4.276802313571576, + "grad_norm": 9.804033279418945, + "learning_rate": 3.179456104592446e-05, + "loss": 1.4764, + "step": 25880 + }, + { + "epoch": 4.278454864697377, + "grad_norm": 65.4351806640625, + "learning_rate": 3.178537982702584e-05, + "loss": 1.5435, + "step": 25890 + }, + { + "epoch": 4.280107415823177, + "grad_norm": 6.215262413024902, + "learning_rate": 3.177619860812722e-05, + "loss": 1.6816, + "step": 25900 + }, + { + "epoch": 4.281759966948978, + "grad_norm": 105.8828353881836, + "learning_rate": 3.1767017389228596e-05, + "loss": 1.5335, + "step": 25910 + }, + { + "epoch": 4.283412518074778, + "grad_norm": 13.918194770812988, + "learning_rate": 3.175783617032997e-05, + "loss": 1.5267, + "step": 25920 + }, + { + "epoch": 4.285065069200578, + "grad_norm": 8.10260009765625, + "learning_rate": 3.1748654951431354e-05, + "loss": 1.4447, + "step": 25930 + }, + { + "epoch": 4.286717620326379, + "grad_norm": 16.183929443359375, + "learning_rate": 3.173947373253273e-05, + "loss": 1.4827, + "step": 25940 + }, + { + "epoch": 4.288370171452179, + "grad_norm": 9.055156707763672, + "learning_rate": 3.173029251363411e-05, + "loss": 1.506, + "step": 25950 + }, + { + "epoch": 4.29002272257798, + "grad_norm": 7.047090530395508, + "learning_rate": 3.172111129473549e-05, + "loss": 1.5904, + 
"step": 25960 + }, + { + "epoch": 4.29167527370378, + "grad_norm": 6.232375144958496, + "learning_rate": 3.171193007583687e-05, + "loss": 1.5418, + "step": 25970 + }, + { + "epoch": 4.29332782482958, + "grad_norm": 12.954999923706055, + "learning_rate": 3.170274885693825e-05, + "loss": 1.5424, + "step": 25980 + }, + { + "epoch": 4.294980375955381, + "grad_norm": 15.10276985168457, + "learning_rate": 3.169356763803963e-05, + "loss": 1.5881, + "step": 25990 + }, + { + "epoch": 4.296632927081181, + "grad_norm": 14.072202682495117, + "learning_rate": 3.1684386419141006e-05, + "loss": 1.6264, + "step": 26000 + }, + { + "epoch": 4.298285478206982, + "grad_norm": 11.788676261901855, + "learning_rate": 3.167520520024239e-05, + "loss": 1.4439, + "step": 26010 + }, + { + "epoch": 4.2999380293327825, + "grad_norm": 9.451334953308105, + "learning_rate": 3.1666023981343764e-05, + "loss": 1.5252, + "step": 26020 + }, + { + "epoch": 4.301590580458583, + "grad_norm": 22.73940086364746, + "learning_rate": 3.165684276244515e-05, + "loss": 1.4381, + "step": 26030 + }, + { + "epoch": 4.3032431315843835, + "grad_norm": 10.139789581298828, + "learning_rate": 3.164766154354652e-05, + "loss": 1.4498, + "step": 26040 + }, + { + "epoch": 4.304895682710184, + "grad_norm": 8.749555587768555, + "learning_rate": 3.16384803246479e-05, + "loss": 1.4908, + "step": 26050 + }, + { + "epoch": 4.306548233835985, + "grad_norm": 4.8961405754089355, + "learning_rate": 3.162929910574928e-05, + "loss": 1.5442, + "step": 26060 + }, + { + "epoch": 4.308200784961785, + "grad_norm": 25.065303802490234, + "learning_rate": 3.162011788685066e-05, + "loss": 1.5435, + "step": 26070 + }, + { + "epoch": 4.309853336087585, + "grad_norm": 8.996482849121094, + "learning_rate": 3.161093666795204e-05, + "loss": 1.5444, + "step": 26080 + }, + { + "epoch": 4.311505887213386, + "grad_norm": 7.40595817565918, + "learning_rate": 3.1601755449053416e-05, + "loss": 1.6196, + "step": 26090 + }, + { + "epoch": 4.313158438339186, + "grad_norm": 10.102790832519531, + "learning_rate": 3.15925742301548e-05, + "loss": 1.6037, + "step": 26100 + }, + { + "epoch": 4.314810989464987, + "grad_norm": 7.849573612213135, + "learning_rate": 3.1583393011256174e-05, + "loss": 1.5106, + "step": 26110 + }, + { + "epoch": 4.316463540590787, + "grad_norm": 8.720571517944336, + "learning_rate": 3.157421179235756e-05, + "loss": 1.5115, + "step": 26120 + }, + { + "epoch": 4.318116091716588, + "grad_norm": 25.104869842529297, + "learning_rate": 3.156503057345893e-05, + "loss": 1.5522, + "step": 26130 + }, + { + "epoch": 4.319768642842388, + "grad_norm": 8.44864559173584, + "learning_rate": 3.1555849354560315e-05, + "loss": 1.5257, + "step": 26140 + }, + { + "epoch": 4.321421193968188, + "grad_norm": 9.844928741455078, + "learning_rate": 3.154666813566169e-05, + "loss": 1.5298, + "step": 26150 + }, + { + "epoch": 4.323073745093989, + "grad_norm": 7.365574836730957, + "learning_rate": 3.1537486916763074e-05, + "loss": 1.4086, + "step": 26160 + }, + { + "epoch": 4.324726296219789, + "grad_norm": 5.956984043121338, + "learning_rate": 3.152830569786445e-05, + "loss": 1.5345, + "step": 26170 + }, + { + "epoch": 4.326378847345589, + "grad_norm": 18.497196197509766, + "learning_rate": 3.1519124478965825e-05, + "loss": 1.6195, + "step": 26180 + }, + { + "epoch": 4.32803139847139, + "grad_norm": 9.667485237121582, + "learning_rate": 3.150994326006721e-05, + "loss": 1.5314, + "step": 26190 + }, + { + "epoch": 4.32968394959719, + "grad_norm": 10.36966609954834, + "learning_rate": 
3.1500762041168584e-05, + "loss": 1.5571, + "step": 26200 + }, + { + "epoch": 4.331336500722991, + "grad_norm": 10.898240089416504, + "learning_rate": 3.1491580822269967e-05, + "loss": 1.4782, + "step": 26210 + }, + { + "epoch": 4.3329890518487915, + "grad_norm": 19.933618545532227, + "learning_rate": 3.148239960337134e-05, + "loss": 1.4672, + "step": 26220 + }, + { + "epoch": 4.3346416029745924, + "grad_norm": 16.246917724609375, + "learning_rate": 3.1473218384472725e-05, + "loss": 1.4771, + "step": 26230 + }, + { + "epoch": 4.3362941541003925, + "grad_norm": 25.84589385986328, + "learning_rate": 3.14640371655741e-05, + "loss": 1.4655, + "step": 26240 + }, + { + "epoch": 4.337946705226193, + "grad_norm": 34.56238555908203, + "learning_rate": 3.1454855946675484e-05, + "loss": 1.616, + "step": 26250 + }, + { + "epoch": 4.339599256351994, + "grad_norm": 15.874077796936035, + "learning_rate": 3.1445674727776866e-05, + "loss": 1.5548, + "step": 26260 + }, + { + "epoch": 4.341251807477794, + "grad_norm": 20.784616470336914, + "learning_rate": 3.143649350887824e-05, + "loss": 1.5053, + "step": 26270 + }, + { + "epoch": 4.342904358603595, + "grad_norm": 8.26366138458252, + "learning_rate": 3.142731228997962e-05, + "loss": 1.5606, + "step": 26280 + }, + { + "epoch": 4.344556909729395, + "grad_norm": 13.238356590270996, + "learning_rate": 3.1418131071081e-05, + "loss": 1.5958, + "step": 26290 + }, + { + "epoch": 4.346209460855195, + "grad_norm": 14.99146556854248, + "learning_rate": 3.1408949852182376e-05, + "loss": 1.5942, + "step": 26300 + }, + { + "epoch": 4.347862011980996, + "grad_norm": 20.563129425048828, + "learning_rate": 3.139976863328375e-05, + "loss": 1.5517, + "step": 26310 + }, + { + "epoch": 4.349514563106796, + "grad_norm": 8.888270378112793, + "learning_rate": 3.1390587414385135e-05, + "loss": 1.48, + "step": 26320 + }, + { + "epoch": 4.351167114232597, + "grad_norm": 12.94379997253418, + "learning_rate": 3.138140619548651e-05, + "loss": 1.473, + "step": 26330 + }, + { + "epoch": 4.352819665358397, + "grad_norm": 17.277467727661133, + "learning_rate": 3.1372224976587893e-05, + "loss": 1.4717, + "step": 26340 + }, + { + "epoch": 4.354472216484197, + "grad_norm": 5.993367671966553, + "learning_rate": 3.136304375768927e-05, + "loss": 1.4952, + "step": 26350 + }, + { + "epoch": 4.356124767609998, + "grad_norm": 30.13902473449707, + "learning_rate": 3.135386253879065e-05, + "loss": 1.5322, + "step": 26360 + }, + { + "epoch": 4.357777318735798, + "grad_norm": 18.45423126220703, + "learning_rate": 3.1344681319892035e-05, + "loss": 1.4632, + "step": 26370 + }, + { + "epoch": 4.359429869861599, + "grad_norm": 13.011897087097168, + "learning_rate": 3.133550010099341e-05, + "loss": 1.5353, + "step": 26380 + }, + { + "epoch": 4.361082420987399, + "grad_norm": 31.791933059692383, + "learning_rate": 3.132631888209479e-05, + "loss": 1.3787, + "step": 26390 + }, + { + "epoch": 4.362734972113199, + "grad_norm": 10.626348495483398, + "learning_rate": 3.131713766319617e-05, + "loss": 1.4245, + "step": 26400 + }, + { + "epoch": 4.364387523239, + "grad_norm": 8.767387390136719, + "learning_rate": 3.1307956444297545e-05, + "loss": 1.5493, + "step": 26410 + }, + { + "epoch": 4.3660400743648005, + "grad_norm": 9.301674842834473, + "learning_rate": 3.129877522539893e-05, + "loss": 1.5448, + "step": 26420 + }, + { + "epoch": 4.367692625490601, + "grad_norm": 7.154281139373779, + "learning_rate": 3.12895940065003e-05, + "loss": 1.6527, + "step": 26430 + }, + { + "epoch": 4.3693451766164015, + "grad_norm": 
10.634243965148926, + "learning_rate": 3.128041278760168e-05, + "loss": 1.4756, + "step": 26440 + }, + { + "epoch": 4.370997727742202, + "grad_norm": 23.75998878479004, + "learning_rate": 3.127123156870306e-05, + "loss": 1.4276, + "step": 26450 + }, + { + "epoch": 4.372650278868003, + "grad_norm": 11.77112102508545, + "learning_rate": 3.126205034980444e-05, + "loss": 1.3796, + "step": 26460 + }, + { + "epoch": 4.374302829993803, + "grad_norm": 15.972389221191406, + "learning_rate": 3.125286913090582e-05, + "loss": 1.4398, + "step": 26470 + }, + { + "epoch": 4.375955381119604, + "grad_norm": 16.23674964904785, + "learning_rate": 3.1243687912007196e-05, + "loss": 1.5868, + "step": 26480 + }, + { + "epoch": 4.377607932245404, + "grad_norm": 9.995187759399414, + "learning_rate": 3.123450669310858e-05, + "loss": 1.5567, + "step": 26490 + }, + { + "epoch": 4.379260483371204, + "grad_norm": 14.570067405700684, + "learning_rate": 3.122532547420996e-05, + "loss": 1.5392, + "step": 26500 + }, + { + "epoch": 4.380913034497005, + "grad_norm": 15.810830116271973, + "learning_rate": 3.121614425531134e-05, + "loss": 1.5493, + "step": 26510 + }, + { + "epoch": 4.382565585622805, + "grad_norm": 11.888526916503906, + "learning_rate": 3.120696303641272e-05, + "loss": 1.4854, + "step": 26520 + }, + { + "epoch": 4.384218136748606, + "grad_norm": 6.8027663230896, + "learning_rate": 3.1197781817514096e-05, + "loss": 1.5692, + "step": 26530 + }, + { + "epoch": 4.385870687874406, + "grad_norm": 12.093624114990234, + "learning_rate": 3.118860059861548e-05, + "loss": 1.409, + "step": 26540 + }, + { + "epoch": 4.387523239000206, + "grad_norm": 28.978029251098633, + "learning_rate": 3.1179419379716854e-05, + "loss": 1.5128, + "step": 26550 + }, + { + "epoch": 4.389175790126007, + "grad_norm": 7.327620506286621, + "learning_rate": 3.117023816081823e-05, + "loss": 1.5797, + "step": 26560 + }, + { + "epoch": 4.390828341251807, + "grad_norm": 10.677287101745605, + "learning_rate": 3.1161056941919606e-05, + "loss": 1.5038, + "step": 26570 + }, + { + "epoch": 4.392480892377608, + "grad_norm": 9.653717994689941, + "learning_rate": 3.115187572302099e-05, + "loss": 1.6638, + "step": 26580 + }, + { + "epoch": 4.394133443503408, + "grad_norm": 6.950878620147705, + "learning_rate": 3.1142694504122364e-05, + "loss": 1.4129, + "step": 26590 + }, + { + "epoch": 4.395785994629209, + "grad_norm": 11.13272476196289, + "learning_rate": 3.113351328522375e-05, + "loss": 1.5374, + "step": 26600 + }, + { + "epoch": 4.397438545755009, + "grad_norm": 9.883722305297852, + "learning_rate": 3.112433206632513e-05, + "loss": 1.6651, + "step": 26610 + }, + { + "epoch": 4.3990910968808095, + "grad_norm": 9.824180603027344, + "learning_rate": 3.1115150847426506e-05, + "loss": 1.5053, + "step": 26620 + }, + { + "epoch": 4.40074364800661, + "grad_norm": 11.171745300292969, + "learning_rate": 3.110596962852789e-05, + "loss": 1.4877, + "step": 26630 + }, + { + "epoch": 4.4023961991324105, + "grad_norm": 12.247041702270508, + "learning_rate": 3.1096788409629264e-05, + "loss": 1.6003, + "step": 26640 + }, + { + "epoch": 4.4040487502582115, + "grad_norm": 7.303963661193848, + "learning_rate": 3.108760719073065e-05, + "loss": 1.442, + "step": 26650 + }, + { + "epoch": 4.405701301384012, + "grad_norm": 20.831327438354492, + "learning_rate": 3.107842597183202e-05, + "loss": 1.5416, + "step": 26660 + }, + { + "epoch": 4.407353852509812, + "grad_norm": 7.863111972808838, + "learning_rate": 3.1069244752933405e-05, + "loss": 1.6196, + "step": 26670 + }, + { + 
"epoch": 4.409006403635613, + "grad_norm": 9.105256080627441, + "learning_rate": 3.106006353403478e-05, + "loss": 1.4094, + "step": 26680 + }, + { + "epoch": 4.410658954761413, + "grad_norm": 54.83008575439453, + "learning_rate": 3.105088231513616e-05, + "loss": 1.528, + "step": 26690 + }, + { + "epoch": 4.412311505887214, + "grad_norm": 10.44033432006836, + "learning_rate": 3.104170109623753e-05, + "loss": 1.5623, + "step": 26700 + }, + { + "epoch": 4.413964057013014, + "grad_norm": 8.483716011047363, + "learning_rate": 3.1032519877338915e-05, + "loss": 1.4732, + "step": 26710 + }, + { + "epoch": 4.415616608138814, + "grad_norm": 10.583048820495605, + "learning_rate": 3.10233386584403e-05, + "loss": 1.5956, + "step": 26720 + }, + { + "epoch": 4.417269159264615, + "grad_norm": 12.964887619018555, + "learning_rate": 3.1014157439541674e-05, + "loss": 1.3888, + "step": 26730 + }, + { + "epoch": 4.418921710390415, + "grad_norm": 7.349795818328857, + "learning_rate": 3.1004976220643057e-05, + "loss": 1.4753, + "step": 26740 + }, + { + "epoch": 4.420574261516216, + "grad_norm": 17.131244659423828, + "learning_rate": 3.099579500174443e-05, + "loss": 1.4025, + "step": 26750 + }, + { + "epoch": 4.422226812642016, + "grad_norm": 7.006505012512207, + "learning_rate": 3.0986613782845815e-05, + "loss": 1.5483, + "step": 26760 + }, + { + "epoch": 4.423879363767816, + "grad_norm": 7.9716620445251465, + "learning_rate": 3.097743256394719e-05, + "loss": 1.5293, + "step": 26770 + }, + { + "epoch": 4.425531914893617, + "grad_norm": 10.431143760681152, + "learning_rate": 3.0968251345048574e-05, + "loss": 1.4546, + "step": 26780 + }, + { + "epoch": 4.427184466019417, + "grad_norm": 15.854337692260742, + "learning_rate": 3.095907012614995e-05, + "loss": 1.3846, + "step": 26790 + }, + { + "epoch": 4.428837017145218, + "grad_norm": 8.748208999633789, + "learning_rate": 3.094988890725133e-05, + "loss": 1.4832, + "step": 26800 + }, + { + "epoch": 4.430489568271018, + "grad_norm": 10.268428802490234, + "learning_rate": 3.094070768835271e-05, + "loss": 1.4754, + "step": 26810 + }, + { + "epoch": 4.4321421193968185, + "grad_norm": 8.496047973632812, + "learning_rate": 3.0931526469454084e-05, + "loss": 1.5358, + "step": 26820 + }, + { + "epoch": 4.433794670522619, + "grad_norm": 6.902714729309082, + "learning_rate": 3.0922345250555466e-05, + "loss": 1.5926, + "step": 26830 + }, + { + "epoch": 4.4354472216484195, + "grad_norm": 8.306710243225098, + "learning_rate": 3.091316403165684e-05, + "loss": 1.5422, + "step": 26840 + }, + { + "epoch": 4.4370997727742205, + "grad_norm": 8.802634239196777, + "learning_rate": 3.0903982812758225e-05, + "loss": 1.4948, + "step": 26850 + }, + { + "epoch": 4.438752323900021, + "grad_norm": 13.463654518127441, + "learning_rate": 3.08948015938596e-05, + "loss": 1.598, + "step": 26860 + }, + { + "epoch": 4.440404875025821, + "grad_norm": 16.30317497253418, + "learning_rate": 3.0885620374960983e-05, + "loss": 1.4191, + "step": 26870 + }, + { + "epoch": 4.442057426151622, + "grad_norm": 19.12843894958496, + "learning_rate": 3.087643915606236e-05, + "loss": 1.5168, + "step": 26880 + }, + { + "epoch": 4.443709977277422, + "grad_norm": 8.68060302734375, + "learning_rate": 3.086725793716374e-05, + "loss": 1.4416, + "step": 26890 + }, + { + "epoch": 4.445362528403223, + "grad_norm": 26.926401138305664, + "learning_rate": 3.085807671826512e-05, + "loss": 1.5008, + "step": 26900 + }, + { + "epoch": 4.447015079529023, + "grad_norm": 12.683781623840332, + "learning_rate": 3.08488954993665e-05, + 
"loss": 1.6921, + "step": 26910 + }, + { + "epoch": 4.448667630654823, + "grad_norm": 6.749821186065674, + "learning_rate": 3.0839714280467876e-05, + "loss": 1.5364, + "step": 26920 + }, + { + "epoch": 4.450320181780624, + "grad_norm": 7.689281463623047, + "learning_rate": 3.083053306156926e-05, + "loss": 1.5276, + "step": 26930 + }, + { + "epoch": 4.451972732906424, + "grad_norm": 10.439373016357422, + "learning_rate": 3.0821351842670635e-05, + "loss": 1.4987, + "step": 26940 + }, + { + "epoch": 4.453625284032225, + "grad_norm": 15.080227851867676, + "learning_rate": 3.081217062377201e-05, + "loss": 1.5277, + "step": 26950 + }, + { + "epoch": 4.455277835158025, + "grad_norm": 7.284642219543457, + "learning_rate": 3.080298940487339e-05, + "loss": 1.6219, + "step": 26960 + }, + { + "epoch": 4.456930386283826, + "grad_norm": 26.771888732910156, + "learning_rate": 3.079380818597477e-05, + "loss": 1.3984, + "step": 26970 + }, + { + "epoch": 4.458582937409626, + "grad_norm": 11.829761505126953, + "learning_rate": 3.078462696707615e-05, + "loss": 1.4057, + "step": 26980 + }, + { + "epoch": 4.460235488535426, + "grad_norm": 7.455525875091553, + "learning_rate": 3.077544574817753e-05, + "loss": 1.5112, + "step": 26990 + }, + { + "epoch": 4.461888039661227, + "grad_norm": 18.92814826965332, + "learning_rate": 3.076626452927891e-05, + "loss": 1.5195, + "step": 27000 + }, + { + "epoch": 4.463540590787027, + "grad_norm": 10.309431076049805, + "learning_rate": 3.0757083310380286e-05, + "loss": 1.5205, + "step": 27010 + }, + { + "epoch": 4.4651931419128275, + "grad_norm": 6.530293941497803, + "learning_rate": 3.074790209148167e-05, + "loss": 1.5346, + "step": 27020 + }, + { + "epoch": 4.466845693038628, + "grad_norm": 13.004956245422363, + "learning_rate": 3.0738720872583045e-05, + "loss": 1.5465, + "step": 27030 + }, + { + "epoch": 4.4684982441644285, + "grad_norm": 9.708096504211426, + "learning_rate": 3.072953965368443e-05, + "loss": 1.5709, + "step": 27040 + }, + { + "epoch": 4.4701507952902295, + "grad_norm": 32.74457931518555, + "learning_rate": 3.07203584347858e-05, + "loss": 1.6366, + "step": 27050 + }, + { + "epoch": 4.47180334641603, + "grad_norm": 16.446645736694336, + "learning_rate": 3.0711177215887186e-05, + "loss": 1.456, + "step": 27060 + }, + { + "epoch": 4.473455897541831, + "grad_norm": 11.239441871643066, + "learning_rate": 3.070199599698856e-05, + "loss": 1.4364, + "step": 27070 + }, + { + "epoch": 4.475108448667631, + "grad_norm": 12.089319229125977, + "learning_rate": 3.069281477808994e-05, + "loss": 1.4886, + "step": 27080 + }, + { + "epoch": 4.476760999793431, + "grad_norm": 8.01862621307373, + "learning_rate": 3.068363355919132e-05, + "loss": 1.5668, + "step": 27090 + }, + { + "epoch": 4.478413550919232, + "grad_norm": 10.913430213928223, + "learning_rate": 3.0674452340292696e-05, + "loss": 1.5688, + "step": 27100 + }, + { + "epoch": 4.480066102045032, + "grad_norm": 50.19123458862305, + "learning_rate": 3.066527112139408e-05, + "loss": 1.5296, + "step": 27110 + }, + { + "epoch": 4.481718653170833, + "grad_norm": 7.208930015563965, + "learning_rate": 3.0656089902495454e-05, + "loss": 1.5731, + "step": 27120 + }, + { + "epoch": 4.483371204296633, + "grad_norm": 8.685304641723633, + "learning_rate": 3.064690868359684e-05, + "loss": 1.4869, + "step": 27130 + }, + { + "epoch": 4.485023755422433, + "grad_norm": 10.306434631347656, + "learning_rate": 3.063772746469821e-05, + "loss": 1.5608, + "step": 27140 + }, + { + "epoch": 4.486676306548234, + "grad_norm": 15.503512382507324, + 
"learning_rate": 3.0628546245799596e-05, + "loss": 1.5302, + "step": 27150 + }, + { + "epoch": 4.488328857674034, + "grad_norm": 11.12451171875, + "learning_rate": 3.061936502690097e-05, + "loss": 1.3706, + "step": 27160 + }, + { + "epoch": 4.489981408799835, + "grad_norm": 8.692049026489258, + "learning_rate": 3.0610183808002354e-05, + "loss": 1.4919, + "step": 27170 + }, + { + "epoch": 4.491633959925635, + "grad_norm": 12.720965385437012, + "learning_rate": 3.060100258910373e-05, + "loss": 1.63, + "step": 27180 + }, + { + "epoch": 4.493286511051435, + "grad_norm": 7.457226753234863, + "learning_rate": 3.059182137020511e-05, + "loss": 1.7286, + "step": 27190 + }, + { + "epoch": 4.494939062177236, + "grad_norm": 10.036781311035156, + "learning_rate": 3.058264015130649e-05, + "loss": 1.4945, + "step": 27200 + }, + { + "epoch": 4.496591613303036, + "grad_norm": 9.686598777770996, + "learning_rate": 3.0573458932407864e-05, + "loss": 1.4577, + "step": 27210 + }, + { + "epoch": 4.498244164428837, + "grad_norm": 10.819511413574219, + "learning_rate": 3.056427771350925e-05, + "loss": 1.4983, + "step": 27220 + }, + { + "epoch": 4.499896715554637, + "grad_norm": 6.3481879234313965, + "learning_rate": 3.055509649461062e-05, + "loss": 1.5714, + "step": 27230 + }, + { + "epoch": 4.501549266680438, + "grad_norm": 6.407865524291992, + "learning_rate": 3.0545915275712005e-05, + "loss": 1.4642, + "step": 27240 + }, + { + "epoch": 4.5032018178062385, + "grad_norm": 22.46175193786621, + "learning_rate": 3.053673405681338e-05, + "loss": 1.5947, + "step": 27250 + }, + { + "epoch": 4.504854368932039, + "grad_norm": 12.746009826660156, + "learning_rate": 3.0527552837914764e-05, + "loss": 1.4251, + "step": 27260 + }, + { + "epoch": 4.50650692005784, + "grad_norm": 19.7728328704834, + "learning_rate": 3.051837161901614e-05, + "loss": 1.5405, + "step": 27270 + }, + { + "epoch": 4.50815947118364, + "grad_norm": 15.723058700561523, + "learning_rate": 3.0509190400117522e-05, + "loss": 1.6367, + "step": 27280 + }, + { + "epoch": 4.50981202230944, + "grad_norm": 8.419683456420898, + "learning_rate": 3.05000091812189e-05, + "loss": 1.5474, + "step": 27290 + }, + { + "epoch": 4.511464573435241, + "grad_norm": 13.057724952697754, + "learning_rate": 3.0490827962320277e-05, + "loss": 1.4442, + "step": 27300 + }, + { + "epoch": 4.513117124561041, + "grad_norm": 12.913500785827637, + "learning_rate": 3.048164674342166e-05, + "loss": 1.5384, + "step": 27310 + }, + { + "epoch": 4.514769675686842, + "grad_norm": 7.9755659103393555, + "learning_rate": 3.0472465524523036e-05, + "loss": 1.5771, + "step": 27320 + }, + { + "epoch": 4.516422226812642, + "grad_norm": 8.68862247467041, + "learning_rate": 3.046328430562442e-05, + "loss": 1.3936, + "step": 27330 + }, + { + "epoch": 4.518074777938443, + "grad_norm": 10.90861701965332, + "learning_rate": 3.0454103086725794e-05, + "loss": 1.4325, + "step": 27340 + }, + { + "epoch": 4.519727329064243, + "grad_norm": 10.610472679138184, + "learning_rate": 3.0444921867827174e-05, + "loss": 1.4947, + "step": 27350 + }, + { + "epoch": 4.521379880190043, + "grad_norm": 14.162581443786621, + "learning_rate": 3.043574064892855e-05, + "loss": 1.4768, + "step": 27360 + }, + { + "epoch": 4.523032431315844, + "grad_norm": 7.975509166717529, + "learning_rate": 3.0426559430029932e-05, + "loss": 1.3613, + "step": 27370 + }, + { + "epoch": 4.524684982441644, + "grad_norm": 11.789381980895996, + "learning_rate": 3.0417378211131308e-05, + "loss": 1.5751, + "step": 27380 + }, + { + "epoch": 4.526337533567444, 
+ "grad_norm": 8.49382495880127, + "learning_rate": 3.040819699223269e-05, + "loss": 1.565, + "step": 27390 + }, + { + "epoch": 4.527990084693245, + "grad_norm": 11.56302547454834, + "learning_rate": 3.0399015773334073e-05, + "loss": 1.6405, + "step": 27400 + }, + { + "epoch": 4.529642635819045, + "grad_norm": 7.2713494300842285, + "learning_rate": 3.038983455443545e-05, + "loss": 1.5445, + "step": 27410 + }, + { + "epoch": 4.531295186944846, + "grad_norm": 40.96128463745117, + "learning_rate": 3.038065333553683e-05, + "loss": 1.5608, + "step": 27420 + }, + { + "epoch": 4.532947738070646, + "grad_norm": 6.933916091918945, + "learning_rate": 3.0371472116638204e-05, + "loss": 1.7051, + "step": 27430 + }, + { + "epoch": 4.534600289196447, + "grad_norm": 9.174333572387695, + "learning_rate": 3.0362290897739587e-05, + "loss": 1.6537, + "step": 27440 + }, + { + "epoch": 4.5362528403222475, + "grad_norm": 11.921561241149902, + "learning_rate": 3.0353109678840963e-05, + "loss": 1.458, + "step": 27450 + }, + { + "epoch": 4.537905391448048, + "grad_norm": 11.688679695129395, + "learning_rate": 3.0343928459942345e-05, + "loss": 1.4882, + "step": 27460 + }, + { + "epoch": 4.539557942573849, + "grad_norm": 9.94924545288086, + "learning_rate": 3.033474724104372e-05, + "loss": 1.3484, + "step": 27470 + }, + { + "epoch": 4.541210493699649, + "grad_norm": 14.807235717773438, + "learning_rate": 3.03255660221451e-05, + "loss": 1.5368, + "step": 27480 + }, + { + "epoch": 4.54286304482545, + "grad_norm": 7.9106597900390625, + "learning_rate": 3.0316384803246476e-05, + "loss": 1.5756, + "step": 27490 + }, + { + "epoch": 4.54451559595125, + "grad_norm": 17.72800636291504, + "learning_rate": 3.030720358434786e-05, + "loss": 1.6303, + "step": 27500 + }, + { + "epoch": 4.54616814707705, + "grad_norm": 14.3507080078125, + "learning_rate": 3.0298022365449242e-05, + "loss": 1.488, + "step": 27510 + }, + { + "epoch": 4.547820698202851, + "grad_norm": 11.83436107635498, + "learning_rate": 3.0288841146550618e-05, + "loss": 1.4159, + "step": 27520 + }, + { + "epoch": 4.549473249328651, + "grad_norm": 32.23595428466797, + "learning_rate": 3.0279659927652e-05, + "loss": 1.4765, + "step": 27530 + }, + { + "epoch": 4.551125800454452, + "grad_norm": 7.518164157867432, + "learning_rate": 3.0270478708753376e-05, + "loss": 1.5396, + "step": 27540 + }, + { + "epoch": 4.552778351580252, + "grad_norm": 12.867490768432617, + "learning_rate": 3.0261297489854755e-05, + "loss": 1.6184, + "step": 27550 + }, + { + "epoch": 4.554430902706052, + "grad_norm": 59.21015548706055, + "learning_rate": 3.025211627095613e-05, + "loss": 1.3638, + "step": 27560 + }, + { + "epoch": 4.556083453831853, + "grad_norm": 8.144857406616211, + "learning_rate": 3.0242935052057514e-05, + "loss": 1.6169, + "step": 27570 + }, + { + "epoch": 4.557736004957653, + "grad_norm": 13.78732681274414, + "learning_rate": 3.023375383315889e-05, + "loss": 1.5019, + "step": 27580 + }, + { + "epoch": 4.559388556083454, + "grad_norm": 12.740043640136719, + "learning_rate": 3.0224572614260272e-05, + "loss": 1.5669, + "step": 27590 + }, + { + "epoch": 4.561041107209254, + "grad_norm": 7.079008102416992, + "learning_rate": 3.0215391395361648e-05, + "loss": 1.5509, + "step": 27600 + }, + { + "epoch": 4.562693658335054, + "grad_norm": 13.054255485534668, + "learning_rate": 3.0206210176463027e-05, + "loss": 1.443, + "step": 27610 + }, + { + "epoch": 4.564346209460855, + "grad_norm": 8.606029510498047, + "learning_rate": 3.0197028957564407e-05, + "loss": 1.5148, + "step": 27620 + }, + 
{ + "epoch": 4.565998760586655, + "grad_norm": 6.800489902496338, + "learning_rate": 3.0187847738665786e-05, + "loss": 1.5745, + "step": 27630 + }, + { + "epoch": 4.567651311712456, + "grad_norm": 10.607884407043457, + "learning_rate": 3.017866651976717e-05, + "loss": 1.5926, + "step": 27640 + }, + { + "epoch": 4.5693038628382565, + "grad_norm": 9.12733268737793, + "learning_rate": 3.0169485300868544e-05, + "loss": 1.5111, + "step": 27650 + }, + { + "epoch": 4.570956413964057, + "grad_norm": 7.448328971862793, + "learning_rate": 3.0160304081969927e-05, + "loss": 1.6386, + "step": 27660 + }, + { + "epoch": 4.572608965089858, + "grad_norm": 21.30801010131836, + "learning_rate": 3.0151122863071303e-05, + "loss": 1.4581, + "step": 27670 + }, + { + "epoch": 4.574261516215658, + "grad_norm": 19.67100715637207, + "learning_rate": 3.0141941644172682e-05, + "loss": 1.4829, + "step": 27680 + }, + { + "epoch": 4.575914067341459, + "grad_norm": 14.657584190368652, + "learning_rate": 3.0132760425274058e-05, + "loss": 1.5703, + "step": 27690 + }, + { + "epoch": 4.577566618467259, + "grad_norm": 7.990481853485107, + "learning_rate": 3.012357920637544e-05, + "loss": 1.4242, + "step": 27700 + }, + { + "epoch": 4.57921916959306, + "grad_norm": 8.869722366333008, + "learning_rate": 3.0114397987476816e-05, + "loss": 1.5055, + "step": 27710 + }, + { + "epoch": 4.58087172071886, + "grad_norm": 11.212498664855957, + "learning_rate": 3.01052167685782e-05, + "loss": 1.4827, + "step": 27720 + }, + { + "epoch": 4.58252427184466, + "grad_norm": 10.676173210144043, + "learning_rate": 3.0096035549679575e-05, + "loss": 1.5627, + "step": 27730 + }, + { + "epoch": 4.584176822970461, + "grad_norm": 25.556760787963867, + "learning_rate": 3.0086854330780954e-05, + "loss": 1.5832, + "step": 27740 + }, + { + "epoch": 4.585829374096261, + "grad_norm": 8.269768714904785, + "learning_rate": 3.0077673111882337e-05, + "loss": 1.5174, + "step": 27750 + }, + { + "epoch": 4.587481925222061, + "grad_norm": 11.795088768005371, + "learning_rate": 3.0068491892983713e-05, + "loss": 1.5345, + "step": 27760 + }, + { + "epoch": 4.589134476347862, + "grad_norm": 12.628785133361816, + "learning_rate": 3.0059310674085095e-05, + "loss": 1.4388, + "step": 27770 + }, + { + "epoch": 4.590787027473662, + "grad_norm": 8.711159706115723, + "learning_rate": 3.005012945518647e-05, + "loss": 1.4389, + "step": 27780 + }, + { + "epoch": 4.592439578599463, + "grad_norm": 5.210586071014404, + "learning_rate": 3.0040948236287854e-05, + "loss": 1.4444, + "step": 27790 + }, + { + "epoch": 4.594092129725263, + "grad_norm": 11.018863677978516, + "learning_rate": 3.003176701738923e-05, + "loss": 1.4392, + "step": 27800 + }, + { + "epoch": 4.595744680851064, + "grad_norm": 8.213711738586426, + "learning_rate": 3.002258579849061e-05, + "loss": 1.4233, + "step": 27810 + }, + { + "epoch": 4.597397231976864, + "grad_norm": 22.795164108276367, + "learning_rate": 3.0013404579591985e-05, + "loss": 1.5016, + "step": 27820 + }, + { + "epoch": 4.599049783102664, + "grad_norm": 10.903739929199219, + "learning_rate": 3.0004223360693367e-05, + "loss": 1.456, + "step": 27830 + }, + { + "epoch": 4.600702334228465, + "grad_norm": 13.617247581481934, + "learning_rate": 2.9995042141794743e-05, + "loss": 1.6139, + "step": 27840 + }, + { + "epoch": 4.6023548853542655, + "grad_norm": 11.749178886413574, + "learning_rate": 2.9985860922896126e-05, + "loss": 1.4379, + "step": 27850 + }, + { + "epoch": 4.604007436480066, + "grad_norm": 11.892571449279785, + "learning_rate": 
2.9976679703997505e-05, + "loss": 1.5134, + "step": 27860 + }, + { + "epoch": 4.605659987605867, + "grad_norm": 16.06785011291504, + "learning_rate": 2.996749848509888e-05, + "loss": 1.5021, + "step": 27870 + }, + { + "epoch": 4.607312538731667, + "grad_norm": 11.480622291564941, + "learning_rate": 2.9958317266200264e-05, + "loss": 1.5267, + "step": 27880 + }, + { + "epoch": 4.608965089857468, + "grad_norm": 11.960569381713867, + "learning_rate": 2.994913604730164e-05, + "loss": 1.5191, + "step": 27890 + }, + { + "epoch": 4.610617640983268, + "grad_norm": 10.92312240600586, + "learning_rate": 2.9939954828403022e-05, + "loss": 1.6442, + "step": 27900 + }, + { + "epoch": 4.612270192109069, + "grad_norm": 8.785314559936523, + "learning_rate": 2.9930773609504398e-05, + "loss": 1.6221, + "step": 27910 + }, + { + "epoch": 4.613922743234869, + "grad_norm": 10.37857723236084, + "learning_rate": 2.992159239060578e-05, + "loss": 1.4723, + "step": 27920 + }, + { + "epoch": 4.615575294360669, + "grad_norm": 47.27583694458008, + "learning_rate": 2.9912411171707157e-05, + "loss": 1.5619, + "step": 27930 + }, + { + "epoch": 4.61722784548647, + "grad_norm": 28.969919204711914, + "learning_rate": 2.9903229952808536e-05, + "loss": 1.3872, + "step": 27940 + }, + { + "epoch": 4.61888039661227, + "grad_norm": 16.729145050048828, + "learning_rate": 2.989404873390991e-05, + "loss": 1.5894, + "step": 27950 + }, + { + "epoch": 4.620532947738071, + "grad_norm": 6.659755229949951, + "learning_rate": 2.9884867515011294e-05, + "loss": 1.6566, + "step": 27960 + }, + { + "epoch": 4.622185498863871, + "grad_norm": 15.295912742614746, + "learning_rate": 2.9875686296112677e-05, + "loss": 1.4874, + "step": 27970 + }, + { + "epoch": 4.623838049989671, + "grad_norm": 9.182865142822266, + "learning_rate": 2.9866505077214053e-05, + "loss": 1.6185, + "step": 27980 + }, + { + "epoch": 4.625490601115472, + "grad_norm": 12.22716999053955, + "learning_rate": 2.9857323858315432e-05, + "loss": 1.4013, + "step": 27990 + }, + { + "epoch": 4.627143152241272, + "grad_norm": 14.52023983001709, + "learning_rate": 2.9848142639416808e-05, + "loss": 1.6018, + "step": 28000 + }, + { + "epoch": 4.628795703367073, + "grad_norm": 9.691109657287598, + "learning_rate": 2.983896142051819e-05, + "loss": 1.4829, + "step": 28010 + }, + { + "epoch": 4.630448254492873, + "grad_norm": 7.735055446624756, + "learning_rate": 2.9829780201619566e-05, + "loss": 1.5496, + "step": 28020 + }, + { + "epoch": 4.632100805618673, + "grad_norm": 9.349820137023926, + "learning_rate": 2.982059898272095e-05, + "loss": 1.5716, + "step": 28030 + }, + { + "epoch": 4.633753356744474, + "grad_norm": 47.04668426513672, + "learning_rate": 2.9811417763822325e-05, + "loss": 1.5508, + "step": 28040 + }, + { + "epoch": 4.6354059078702745, + "grad_norm": 11.281471252441406, + "learning_rate": 2.9802236544923708e-05, + "loss": 1.5374, + "step": 28050 + }, + { + "epoch": 4.6370584589960755, + "grad_norm": 10.420440673828125, + "learning_rate": 2.9793055326025083e-05, + "loss": 1.4952, + "step": 28060 + }, + { + "epoch": 4.638711010121876, + "grad_norm": 7.602105140686035, + "learning_rate": 2.9783874107126463e-05, + "loss": 1.6077, + "step": 28070 + }, + { + "epoch": 4.640363561247677, + "grad_norm": 11.72873306274414, + "learning_rate": 2.9774692888227845e-05, + "loss": 1.517, + "step": 28080 + }, + { + "epoch": 4.642016112373477, + "grad_norm": 5.891235828399658, + "learning_rate": 2.976551166932922e-05, + "loss": 1.5004, + "step": 28090 + }, + { + "epoch": 4.643668663499277, + 
"grad_norm": 8.504074096679688, + "learning_rate": 2.9756330450430604e-05, + "loss": 1.4433, + "step": 28100 + }, + { + "epoch": 4.645321214625078, + "grad_norm": 11.097142219543457, + "learning_rate": 2.974714923153198e-05, + "loss": 1.517, + "step": 28110 + }, + { + "epoch": 4.646973765750878, + "grad_norm": 25.318063735961914, + "learning_rate": 2.973796801263336e-05, + "loss": 1.475, + "step": 28120 + }, + { + "epoch": 4.648626316876678, + "grad_norm": 12.631922721862793, + "learning_rate": 2.9728786793734735e-05, + "loss": 1.6466, + "step": 28130 + }, + { + "epoch": 4.650278868002479, + "grad_norm": 18.910295486450195, + "learning_rate": 2.9719605574836117e-05, + "loss": 1.4223, + "step": 28140 + }, + { + "epoch": 4.651931419128279, + "grad_norm": 10.944915771484375, + "learning_rate": 2.9710424355937493e-05, + "loss": 1.4081, + "step": 28150 + }, + { + "epoch": 4.65358397025408, + "grad_norm": 36.19316482543945, + "learning_rate": 2.9701243137038876e-05, + "loss": 1.5777, + "step": 28160 + }, + { + "epoch": 4.65523652137988, + "grad_norm": 11.189594268798828, + "learning_rate": 2.9692061918140252e-05, + "loss": 1.6023, + "step": 28170 + }, + { + "epoch": 4.656889072505681, + "grad_norm": 8.958883285522461, + "learning_rate": 2.9682880699241634e-05, + "loss": 1.4499, + "step": 28180 + }, + { + "epoch": 4.658541623631481, + "grad_norm": 21.078588485717773, + "learning_rate": 2.967369948034301e-05, + "loss": 1.5326, + "step": 28190 + }, + { + "epoch": 4.660194174757281, + "grad_norm": 9.72829818725586, + "learning_rate": 2.966451826144439e-05, + "loss": 1.665, + "step": 28200 + }, + { + "epoch": 4.661846725883082, + "grad_norm": 6.814817905426025, + "learning_rate": 2.9655337042545772e-05, + "loss": 1.4978, + "step": 28210 + }, + { + "epoch": 4.663499277008882, + "grad_norm": 12.29881763458252, + "learning_rate": 2.9646155823647148e-05, + "loss": 1.5255, + "step": 28220 + }, + { + "epoch": 4.665151828134682, + "grad_norm": 9.907049179077148, + "learning_rate": 2.963697460474853e-05, + "loss": 1.5061, + "step": 28230 + }, + { + "epoch": 4.666804379260483, + "grad_norm": 8.112325668334961, + "learning_rate": 2.9627793385849906e-05, + "loss": 1.5927, + "step": 28240 + }, + { + "epoch": 4.6684569303862835, + "grad_norm": 31.478939056396484, + "learning_rate": 2.9618612166951286e-05, + "loss": 1.4132, + "step": 28250 + }, + { + "epoch": 4.6701094815120845, + "grad_norm": 8.31009292602539, + "learning_rate": 2.9609430948052665e-05, + "loss": 1.409, + "step": 28260 + }, + { + "epoch": 4.671762032637885, + "grad_norm": 69.70064544677734, + "learning_rate": 2.9600249729154044e-05, + "loss": 1.646, + "step": 28270 + }, + { + "epoch": 4.673414583763686, + "grad_norm": 8.266491889953613, + "learning_rate": 2.959106851025542e-05, + "loss": 1.5419, + "step": 28280 + }, + { + "epoch": 4.675067134889486, + "grad_norm": 7.468424320220947, + "learning_rate": 2.9581887291356803e-05, + "loss": 1.4903, + "step": 28290 + }, + { + "epoch": 4.676719686015286, + "grad_norm": 8.034427642822266, + "learning_rate": 2.957270607245818e-05, + "loss": 1.5049, + "step": 28300 + }, + { + "epoch": 4.678372237141087, + "grad_norm": 6.364953517913818, + "learning_rate": 2.956352485355956e-05, + "loss": 1.4946, + "step": 28310 + }, + { + "epoch": 4.680024788266887, + "grad_norm": 6.952188014984131, + "learning_rate": 2.955434363466094e-05, + "loss": 1.585, + "step": 28320 + }, + { + "epoch": 4.681677339392688, + "grad_norm": 6.363594055175781, + "learning_rate": 2.9545162415762316e-05, + "loss": 1.4206, + "step": 28330 + }, 
+ { + "epoch": 4.683329890518488, + "grad_norm": 12.087472915649414, + "learning_rate": 2.95359811968637e-05, + "loss": 1.5401, + "step": 28340 + }, + { + "epoch": 4.684982441644288, + "grad_norm": 7.647066593170166, + "learning_rate": 2.9526799977965075e-05, + "loss": 1.626, + "step": 28350 + }, + { + "epoch": 4.686634992770089, + "grad_norm": 10.387514114379883, + "learning_rate": 2.9517618759066457e-05, + "loss": 1.5102, + "step": 28360 + }, + { + "epoch": 4.688287543895889, + "grad_norm": 10.926636695861816, + "learning_rate": 2.9508437540167833e-05, + "loss": 1.511, + "step": 28370 + }, + { + "epoch": 4.68994009502169, + "grad_norm": 10.67368221282959, + "learning_rate": 2.9499256321269213e-05, + "loss": 1.4165, + "step": 28380 + }, + { + "epoch": 4.69159264614749, + "grad_norm": 9.141593933105469, + "learning_rate": 2.9490075102370592e-05, + "loss": 1.5694, + "step": 28390 + }, + { + "epoch": 4.69324519727329, + "grad_norm": 6.583410739898682, + "learning_rate": 2.948089388347197e-05, + "loss": 1.5357, + "step": 28400 + }, + { + "epoch": 4.694897748399091, + "grad_norm": 10.10991382598877, + "learning_rate": 2.9471712664573347e-05, + "loss": 1.5769, + "step": 28410 + }, + { + "epoch": 4.696550299524891, + "grad_norm": 7.276322841644287, + "learning_rate": 2.946253144567473e-05, + "loss": 1.6132, + "step": 28420 + }, + { + "epoch": 4.698202850650692, + "grad_norm": 6.623241901397705, + "learning_rate": 2.9453350226776112e-05, + "loss": 1.449, + "step": 28430 + }, + { + "epoch": 4.699855401776492, + "grad_norm": 8.261223793029785, + "learning_rate": 2.9444169007877488e-05, + "loss": 1.4652, + "step": 28440 + }, + { + "epoch": 4.701507952902293, + "grad_norm": 38.858604431152344, + "learning_rate": 2.9434987788978867e-05, + "loss": 1.5068, + "step": 28450 + }, + { + "epoch": 4.7031605040280935, + "grad_norm": 15.038886070251465, + "learning_rate": 2.9425806570080243e-05, + "loss": 1.636, + "step": 28460 + }, + { + "epoch": 4.704813055153894, + "grad_norm": 11.770708084106445, + "learning_rate": 2.9416625351181626e-05, + "loss": 1.5393, + "step": 28470 + }, + { + "epoch": 4.706465606279695, + "grad_norm": 24.59950065612793, + "learning_rate": 2.9407444132283e-05, + "loss": 1.508, + "step": 28480 + }, + { + "epoch": 4.708118157405495, + "grad_norm": 8.745185852050781, + "learning_rate": 2.9398262913384384e-05, + "loss": 1.6451, + "step": 28490 + }, + { + "epoch": 4.709770708531295, + "grad_norm": 12.335895538330078, + "learning_rate": 2.938908169448576e-05, + "loss": 1.487, + "step": 28500 + }, + { + "epoch": 4.711423259657096, + "grad_norm": 38.088829040527344, + "learning_rate": 2.937990047558714e-05, + "loss": 1.4954, + "step": 28510 + }, + { + "epoch": 4.713075810782896, + "grad_norm": 12.632776260375977, + "learning_rate": 2.937071925668852e-05, + "loss": 1.5633, + "step": 28520 + }, + { + "epoch": 4.714728361908697, + "grad_norm": 11.09786319732666, + "learning_rate": 2.9361538037789898e-05, + "loss": 1.4451, + "step": 28530 + }, + { + "epoch": 4.716380913034497, + "grad_norm": 11.804417610168457, + "learning_rate": 2.935235681889128e-05, + "loss": 1.596, + "step": 28540 + }, + { + "epoch": 4.718033464160298, + "grad_norm": 10.606738090515137, + "learning_rate": 2.9343175599992656e-05, + "loss": 1.4794, + "step": 28550 + }, + { + "epoch": 4.719686015286098, + "grad_norm": 8.68057632446289, + "learning_rate": 2.933399438109404e-05, + "loss": 1.5829, + "step": 28560 + }, + { + "epoch": 4.721338566411898, + "grad_norm": 9.055428504943848, + "learning_rate": 2.9324813162195415e-05, + 
"loss": 1.5983, + "step": 28570 + }, + { + "epoch": 4.722991117537699, + "grad_norm": 18.714929580688477, + "learning_rate": 2.9315631943296794e-05, + "loss": 1.5333, + "step": 28580 + }, + { + "epoch": 4.724643668663499, + "grad_norm": 11.138300895690918, + "learning_rate": 2.930645072439817e-05, + "loss": 1.5573, + "step": 28590 + }, + { + "epoch": 4.726296219789299, + "grad_norm": 11.27102279663086, + "learning_rate": 2.9297269505499553e-05, + "loss": 1.4813, + "step": 28600 + }, + { + "epoch": 4.7279487709151, + "grad_norm": 9.76452350616455, + "learning_rate": 2.928808828660093e-05, + "loss": 1.6399, + "step": 28610 + }, + { + "epoch": 4.7296013220409, + "grad_norm": 9.003868103027344, + "learning_rate": 2.927890706770231e-05, + "loss": 1.5414, + "step": 28620 + }, + { + "epoch": 4.731253873166701, + "grad_norm": 8.384162902832031, + "learning_rate": 2.9269725848803687e-05, + "loss": 1.4369, + "step": 28630 + }, + { + "epoch": 4.732906424292501, + "grad_norm": 13.919119834899902, + "learning_rate": 2.9260544629905066e-05, + "loss": 1.4788, + "step": 28640 + }, + { + "epoch": 4.734558975418302, + "grad_norm": 9.051968574523926, + "learning_rate": 2.925136341100645e-05, + "loss": 1.6114, + "step": 28650 + }, + { + "epoch": 4.7362115265441025, + "grad_norm": 12.999975204467773, + "learning_rate": 2.9242182192107825e-05, + "loss": 1.4965, + "step": 28660 + }, + { + "epoch": 4.737864077669903, + "grad_norm": 9.09343147277832, + "learning_rate": 2.9233000973209207e-05, + "loss": 1.5995, + "step": 28670 + }, + { + "epoch": 4.739516628795704, + "grad_norm": 9.132031440734863, + "learning_rate": 2.9223819754310583e-05, + "loss": 1.4807, + "step": 28680 + }, + { + "epoch": 4.741169179921504, + "grad_norm": 7.520742416381836, + "learning_rate": 2.9214638535411966e-05, + "loss": 1.5562, + "step": 28690 + }, + { + "epoch": 4.742821731047305, + "grad_norm": 8.723919868469238, + "learning_rate": 2.920545731651334e-05, + "loss": 1.4716, + "step": 28700 + }, + { + "epoch": 4.744474282173105, + "grad_norm": 22.840511322021484, + "learning_rate": 2.919627609761472e-05, + "loss": 1.5385, + "step": 28710 + }, + { + "epoch": 4.746126833298905, + "grad_norm": 8.98953628540039, + "learning_rate": 2.9187094878716097e-05, + "loss": 1.4941, + "step": 28720 + }, + { + "epoch": 4.747779384424706, + "grad_norm": 8.429163932800293, + "learning_rate": 2.917791365981748e-05, + "loss": 1.5198, + "step": 28730 + }, + { + "epoch": 4.749431935550506, + "grad_norm": 12.46285629272461, + "learning_rate": 2.9168732440918855e-05, + "loss": 1.4711, + "step": 28740 + }, + { + "epoch": 4.751084486676307, + "grad_norm": 10.32840633392334, + "learning_rate": 2.9159551222020238e-05, + "loss": 1.4423, + "step": 28750 + }, + { + "epoch": 4.752737037802107, + "grad_norm": 5.583861827850342, + "learning_rate": 2.9150370003121614e-05, + "loss": 1.6284, + "step": 28760 + }, + { + "epoch": 4.754389588927907, + "grad_norm": 9.908193588256836, + "learning_rate": 2.9141188784222993e-05, + "loss": 1.5041, + "step": 28770 + }, + { + "epoch": 4.756042140053708, + "grad_norm": 11.036250114440918, + "learning_rate": 2.9132007565324376e-05, + "loss": 1.529, + "step": 28780 + }, + { + "epoch": 4.757694691179508, + "grad_norm": 13.065105438232422, + "learning_rate": 2.912282634642575e-05, + "loss": 1.5542, + "step": 28790 + }, + { + "epoch": 4.759347242305309, + "grad_norm": 11.913582801818848, + "learning_rate": 2.9113645127527134e-05, + "loss": 1.6069, + "step": 28800 + }, + { + "epoch": 4.760999793431109, + "grad_norm": 9.790139198303223, + 
"learning_rate": 2.910446390862851e-05, + "loss": 1.5313, + "step": 28810 + }, + { + "epoch": 4.76265234455691, + "grad_norm": 10.083866119384766, + "learning_rate": 2.9095282689729893e-05, + "loss": 1.5612, + "step": 28820 + }, + { + "epoch": 4.76430489568271, + "grad_norm": 17.150331497192383, + "learning_rate": 2.908610147083127e-05, + "loss": 1.5995, + "step": 28830 + }, + { + "epoch": 4.76595744680851, + "grad_norm": 13.593814849853516, + "learning_rate": 2.9076920251932648e-05, + "loss": 1.4406, + "step": 28840 + }, + { + "epoch": 4.767609997934311, + "grad_norm": 8.73280143737793, + "learning_rate": 2.9067739033034024e-05, + "loss": 1.6393, + "step": 28850 + }, + { + "epoch": 4.7692625490601115, + "grad_norm": 11.49311351776123, + "learning_rate": 2.9058557814135406e-05, + "loss": 1.5007, + "step": 28860 + }, + { + "epoch": 4.770915100185912, + "grad_norm": 18.270648956298828, + "learning_rate": 2.9049376595236782e-05, + "loss": 1.4804, + "step": 28870 + }, + { + "epoch": 4.772567651311713, + "grad_norm": 7.664954662322998, + "learning_rate": 2.9040195376338165e-05, + "loss": 1.5895, + "step": 28880 + }, + { + "epoch": 4.774220202437513, + "grad_norm": 7.60715389251709, + "learning_rate": 2.9031014157439544e-05, + "loss": 1.5162, + "step": 28890 + }, + { + "epoch": 4.775872753563314, + "grad_norm": 9.755659103393555, + "learning_rate": 2.9021832938540923e-05, + "loss": 1.4321, + "step": 28900 + }, + { + "epoch": 4.777525304689114, + "grad_norm": 15.464385032653809, + "learning_rate": 2.9012651719642302e-05, + "loss": 1.4708, + "step": 28910 + }, + { + "epoch": 4.779177855814915, + "grad_norm": 24.310163497924805, + "learning_rate": 2.900347050074368e-05, + "loss": 1.4532, + "step": 28920 + }, + { + "epoch": 4.780830406940715, + "grad_norm": 15.663667678833008, + "learning_rate": 2.899428928184506e-05, + "loss": 1.4912, + "step": 28930 + }, + { + "epoch": 4.782482958066515, + "grad_norm": 12.316214561462402, + "learning_rate": 2.8985108062946437e-05, + "loss": 1.5429, + "step": 28940 + }, + { + "epoch": 4.784135509192316, + "grad_norm": 14.147255897521973, + "learning_rate": 2.897592684404782e-05, + "loss": 1.4476, + "step": 28950 + }, + { + "epoch": 4.785788060318116, + "grad_norm": 9.104997634887695, + "learning_rate": 2.8966745625149195e-05, + "loss": 1.4273, + "step": 28960 + }, + { + "epoch": 4.787440611443916, + "grad_norm": 11.243693351745605, + "learning_rate": 2.8957564406250575e-05, + "loss": 1.5976, + "step": 28970 + }, + { + "epoch": 4.789093162569717, + "grad_norm": 15.91396427154541, + "learning_rate": 2.894838318735195e-05, + "loss": 1.5532, + "step": 28980 + }, + { + "epoch": 4.790745713695517, + "grad_norm": 23.990699768066406, + "learning_rate": 2.8939201968453333e-05, + "loss": 1.5387, + "step": 28990 + }, + { + "epoch": 4.792398264821318, + "grad_norm": 13.71623706817627, + "learning_rate": 2.8930020749554716e-05, + "loss": 1.5653, + "step": 29000 + }, + { + "epoch": 4.794050815947118, + "grad_norm": 7.524801731109619, + "learning_rate": 2.892083953065609e-05, + "loss": 1.5804, + "step": 29010 + }, + { + "epoch": 4.795703367072919, + "grad_norm": 31.6929874420166, + "learning_rate": 2.891165831175747e-05, + "loss": 1.5599, + "step": 29020 + }, + { + "epoch": 4.797355918198719, + "grad_norm": 10.225500106811523, + "learning_rate": 2.890247709285885e-05, + "loss": 1.3953, + "step": 29030 + }, + { + "epoch": 4.799008469324519, + "grad_norm": 11.53248405456543, + "learning_rate": 2.889329587396023e-05, + "loss": 1.5633, + "step": 29040 + }, + { + "epoch": 
4.80066102045032, + "grad_norm": 9.548233985900879, + "learning_rate": 2.8884114655061605e-05, + "loss": 1.4989, + "step": 29050 + }, + { + "epoch": 4.8023135715761205, + "grad_norm": 5.22447395324707, + "learning_rate": 2.8874933436162988e-05, + "loss": 1.4038, + "step": 29060 + }, + { + "epoch": 4.8039661227019215, + "grad_norm": 15.642428398132324, + "learning_rate": 2.8865752217264364e-05, + "loss": 1.4773, + "step": 29070 + }, + { + "epoch": 4.805618673827722, + "grad_norm": 7.29582405090332, + "learning_rate": 2.8856570998365746e-05, + "loss": 1.5097, + "step": 29080 + }, + { + "epoch": 4.807271224953522, + "grad_norm": 14.306315422058105, + "learning_rate": 2.8847389779467122e-05, + "loss": 1.4798, + "step": 29090 + }, + { + "epoch": 4.808923776079323, + "grad_norm": 9.41856861114502, + "learning_rate": 2.88382085605685e-05, + "loss": 1.5179, + "step": 29100 + }, + { + "epoch": 4.810576327205123, + "grad_norm": 14.0534086227417, + "learning_rate": 2.8829027341669884e-05, + "loss": 1.4123, + "step": 29110 + }, + { + "epoch": 4.812228878330924, + "grad_norm": 73.24070739746094, + "learning_rate": 2.881984612277126e-05, + "loss": 1.5063, + "step": 29120 + }, + { + "epoch": 4.813881429456724, + "grad_norm": 11.320660591125488, + "learning_rate": 2.8810664903872643e-05, + "loss": 1.5333, + "step": 29130 + }, + { + "epoch": 4.815533980582524, + "grad_norm": 10.174736976623535, + "learning_rate": 2.880148368497402e-05, + "loss": 1.4865, + "step": 29140 + }, + { + "epoch": 4.817186531708325, + "grad_norm": 13.212072372436523, + "learning_rate": 2.8792302466075398e-05, + "loss": 1.486, + "step": 29150 + }, + { + "epoch": 4.818839082834125, + "grad_norm": 14.246745109558105, + "learning_rate": 2.8783121247176777e-05, + "loss": 1.5052, + "step": 29160 + }, + { + "epoch": 4.820491633959926, + "grad_norm": 10.12642765045166, + "learning_rate": 2.8773940028278156e-05, + "loss": 1.3828, + "step": 29170 + }, + { + "epoch": 4.822144185085726, + "grad_norm": 8.442862510681152, + "learning_rate": 2.8764758809379532e-05, + "loss": 1.5126, + "step": 29180 + }, + { + "epoch": 4.823796736211526, + "grad_norm": 18.026945114135742, + "learning_rate": 2.8755577590480915e-05, + "loss": 1.5676, + "step": 29190 + }, + { + "epoch": 4.825449287337327, + "grad_norm": 7.779977321624756, + "learning_rate": 2.874639637158229e-05, + "loss": 1.4621, + "step": 29200 + }, + { + "epoch": 4.827101838463127, + "grad_norm": 21.727739334106445, + "learning_rate": 2.8737215152683673e-05, + "loss": 1.551, + "step": 29210 + }, + { + "epoch": 4.828754389588928, + "grad_norm": 9.406710624694824, + "learning_rate": 2.872803393378505e-05, + "loss": 1.4938, + "step": 29220 + }, + { + "epoch": 4.830406940714728, + "grad_norm": 9.602388381958008, + "learning_rate": 2.8718852714886428e-05, + "loss": 1.5477, + "step": 29230 + }, + { + "epoch": 4.832059491840528, + "grad_norm": 13.313919067382812, + "learning_rate": 2.870967149598781e-05, + "loss": 1.5438, + "step": 29240 + }, + { + "epoch": 4.833712042966329, + "grad_norm": 6.431525707244873, + "learning_rate": 2.8700490277089187e-05, + "loss": 1.5184, + "step": 29250 + }, + { + "epoch": 4.8353645940921295, + "grad_norm": 6.1765313148498535, + "learning_rate": 2.869130905819057e-05, + "loss": 1.4919, + "step": 29260 + }, + { + "epoch": 4.8370171452179305, + "grad_norm": 9.743697166442871, + "learning_rate": 2.8682127839291945e-05, + "loss": 1.4928, + "step": 29270 + }, + { + "epoch": 4.838669696343731, + "grad_norm": 9.40673542022705, + "learning_rate": 2.8672946620393324e-05, + "loss": 
1.4567, + "step": 29280 + }, + { + "epoch": 4.8403222474695315, + "grad_norm": 9.90953254699707, + "learning_rate": 2.8663765401494704e-05, + "loss": 1.4985, + "step": 29290 + }, + { + "epoch": 4.841974798595332, + "grad_norm": 15.230709075927734, + "learning_rate": 2.8654584182596083e-05, + "loss": 1.5292, + "step": 29300 + }, + { + "epoch": 4.843627349721132, + "grad_norm": 8.818098068237305, + "learning_rate": 2.864540296369746e-05, + "loss": 1.4527, + "step": 29310 + }, + { + "epoch": 4.845279900846933, + "grad_norm": 7.097944736480713, + "learning_rate": 2.863622174479884e-05, + "loss": 1.627, + "step": 29320 + }, + { + "epoch": 4.846932451972733, + "grad_norm": 9.903128623962402, + "learning_rate": 2.8627040525900217e-05, + "loss": 1.6714, + "step": 29330 + }, + { + "epoch": 4.848585003098533, + "grad_norm": 11.924121856689453, + "learning_rate": 2.86178593070016e-05, + "loss": 1.2808, + "step": 29340 + }, + { + "epoch": 4.850237554224334, + "grad_norm": 15.464925765991211, + "learning_rate": 2.860867808810298e-05, + "loss": 1.5392, + "step": 29350 + }, + { + "epoch": 4.851890105350134, + "grad_norm": 9.023946762084961, + "learning_rate": 2.8599496869204355e-05, + "loss": 1.5101, + "step": 29360 + }, + { + "epoch": 4.853542656475935, + "grad_norm": 13.63484001159668, + "learning_rate": 2.8590315650305738e-05, + "loss": 1.4716, + "step": 29370 + }, + { + "epoch": 4.855195207601735, + "grad_norm": 8.59793472290039, + "learning_rate": 2.8581134431407114e-05, + "loss": 1.4911, + "step": 29380 + }, + { + "epoch": 4.856847758727536, + "grad_norm": 10.655295372009277, + "learning_rate": 2.8571953212508496e-05, + "loss": 1.4926, + "step": 29390 + }, + { + "epoch": 4.858500309853336, + "grad_norm": 44.5798225402832, + "learning_rate": 2.8562771993609872e-05, + "loss": 1.4292, + "step": 29400 + }, + { + "epoch": 4.860152860979136, + "grad_norm": 10.469300270080566, + "learning_rate": 2.855359077471125e-05, + "loss": 1.3958, + "step": 29410 + }, + { + "epoch": 4.861805412104937, + "grad_norm": 125.58094024658203, + "learning_rate": 2.854440955581263e-05, + "loss": 1.5653, + "step": 29420 + }, + { + "epoch": 4.863457963230737, + "grad_norm": 9.53484058380127, + "learning_rate": 2.853522833691401e-05, + "loss": 1.5401, + "step": 29430 + }, + { + "epoch": 4.865110514356537, + "grad_norm": 8.112886428833008, + "learning_rate": 2.8526047118015386e-05, + "loss": 1.4984, + "step": 29440 + }, + { + "epoch": 4.866763065482338, + "grad_norm": 19.068845748901367, + "learning_rate": 2.851686589911677e-05, + "loss": 1.4704, + "step": 29450 + }, + { + "epoch": 4.8684156166081385, + "grad_norm": 14.235884666442871, + "learning_rate": 2.850768468021815e-05, + "loss": 1.4131, + "step": 29460 + }, + { + "epoch": 4.8700681677339395, + "grad_norm": 8.707880973815918, + "learning_rate": 2.8498503461319527e-05, + "loss": 1.3683, + "step": 29470 + }, + { + "epoch": 4.87172071885974, + "grad_norm": 8.955572128295898, + "learning_rate": 2.8489322242420906e-05, + "loss": 1.4594, + "step": 29480 + }, + { + "epoch": 4.8733732699855405, + "grad_norm": 14.345805168151855, + "learning_rate": 2.8480141023522282e-05, + "loss": 1.5525, + "step": 29490 + }, + { + "epoch": 4.875025821111341, + "grad_norm": 20.3609619140625, + "learning_rate": 2.8470959804623665e-05, + "loss": 1.5474, + "step": 29500 + }, + { + "epoch": 4.876678372237141, + "grad_norm": 9.365708351135254, + "learning_rate": 2.846177858572504e-05, + "loss": 1.5406, + "step": 29510 + }, + { + "epoch": 4.878330923362942, + "grad_norm": 8.881852149963379, + 
"learning_rate": 2.8452597366826423e-05, + "loss": 1.3521, + "step": 29520 + }, + { + "epoch": 4.879983474488742, + "grad_norm": 46.270851135253906, + "learning_rate": 2.84434161479278e-05, + "loss": 1.6585, + "step": 29530 + }, + { + "epoch": 4.881636025614543, + "grad_norm": 6.4464240074157715, + "learning_rate": 2.843423492902918e-05, + "loss": 1.4865, + "step": 29540 + }, + { + "epoch": 4.883288576740343, + "grad_norm": 12.958675384521484, + "learning_rate": 2.8425053710130557e-05, + "loss": 1.5643, + "step": 29550 + }, + { + "epoch": 4.884941127866143, + "grad_norm": 9.830168724060059, + "learning_rate": 2.8415872491231937e-05, + "loss": 1.3667, + "step": 29560 + }, + { + "epoch": 4.886593678991944, + "grad_norm": 15.14609146118164, + "learning_rate": 2.840669127233332e-05, + "loss": 1.4713, + "step": 29570 + }, + { + "epoch": 4.888246230117744, + "grad_norm": 10.329089164733887, + "learning_rate": 2.8397510053434695e-05, + "loss": 1.5543, + "step": 29580 + }, + { + "epoch": 4.889898781243545, + "grad_norm": 6.754423141479492, + "learning_rate": 2.8388328834536078e-05, + "loss": 1.3762, + "step": 29590 + }, + { + "epoch": 4.891551332369345, + "grad_norm": 23.90659523010254, + "learning_rate": 2.8379147615637454e-05, + "loss": 1.5471, + "step": 29600 + }, + { + "epoch": 4.893203883495145, + "grad_norm": 14.84753131866455, + "learning_rate": 2.8369966396738833e-05, + "loss": 1.6663, + "step": 29610 + }, + { + "epoch": 4.894856434620946, + "grad_norm": 8.822469711303711, + "learning_rate": 2.836078517784021e-05, + "loss": 1.3787, + "step": 29620 + }, + { + "epoch": 4.896508985746746, + "grad_norm": 7.7165913581848145, + "learning_rate": 2.835160395894159e-05, + "loss": 1.4144, + "step": 29630 + }, + { + "epoch": 4.898161536872547, + "grad_norm": 15.842850685119629, + "learning_rate": 2.8342422740042967e-05, + "loss": 1.4095, + "step": 29640 + }, + { + "epoch": 4.899814087998347, + "grad_norm": 6.203104019165039, + "learning_rate": 2.833324152114435e-05, + "loss": 1.4753, + "step": 29650 + }, + { + "epoch": 4.901466639124148, + "grad_norm": 9.8826904296875, + "learning_rate": 2.8324060302245726e-05, + "loss": 1.4384, + "step": 29660 + }, + { + "epoch": 4.9031191902499485, + "grad_norm": 12.302288055419922, + "learning_rate": 2.831487908334711e-05, + "loss": 1.5356, + "step": 29670 + }, + { + "epoch": 4.904771741375749, + "grad_norm": 11.829875946044922, + "learning_rate": 2.8305697864448488e-05, + "loss": 1.4053, + "step": 29680 + }, + { + "epoch": 4.9064242925015495, + "grad_norm": 91.15592956542969, + "learning_rate": 2.8296516645549863e-05, + "loss": 1.5348, + "step": 29690 + }, + { + "epoch": 4.90807684362735, + "grad_norm": 8.566816329956055, + "learning_rate": 2.8287335426651246e-05, + "loss": 1.4056, + "step": 29700 + }, + { + "epoch": 4.90972939475315, + "grad_norm": 14.642806053161621, + "learning_rate": 2.8278154207752622e-05, + "loss": 1.5218, + "step": 29710 + }, + { + "epoch": 4.911381945878951, + "grad_norm": 11.299127578735352, + "learning_rate": 2.8268972988854005e-05, + "loss": 1.5021, + "step": 29720 + }, + { + "epoch": 4.913034497004751, + "grad_norm": 7.661716938018799, + "learning_rate": 2.825979176995538e-05, + "loss": 1.4599, + "step": 29730 + }, + { + "epoch": 4.914687048130552, + "grad_norm": 16.636594772338867, + "learning_rate": 2.825061055105676e-05, + "loss": 1.4983, + "step": 29740 + }, + { + "epoch": 4.916339599256352, + "grad_norm": 11.943872451782227, + "learning_rate": 2.8241429332158136e-05, + "loss": 1.6077, + "step": 29750 + }, + { + "epoch": 
4.917992150382153, + "grad_norm": 8.194153785705566, + "learning_rate": 2.8232248113259518e-05, + "loss": 1.4847, + "step": 29760 + }, + { + "epoch": 4.919644701507953, + "grad_norm": 9.320284843444824, + "learning_rate": 2.8223066894360894e-05, + "loss": 1.5322, + "step": 29770 + }, + { + "epoch": 4.921297252633753, + "grad_norm": 6.6243391036987305, + "learning_rate": 2.8213885675462277e-05, + "loss": 1.5005, + "step": 29780 + }, + { + "epoch": 4.922949803759554, + "grad_norm": 6.813876628875732, + "learning_rate": 2.8204704456563653e-05, + "loss": 1.6577, + "step": 29790 + }, + { + "epoch": 4.924602354885354, + "grad_norm": 7.4178876876831055, + "learning_rate": 2.8195523237665035e-05, + "loss": 1.4614, + "step": 29800 + }, + { + "epoch": 4.926254906011154, + "grad_norm": 5.743941783905029, + "learning_rate": 2.8186342018766414e-05, + "loss": 1.448, + "step": 29810 + }, + { + "epoch": 4.927907457136955, + "grad_norm": 17.722078323364258, + "learning_rate": 2.817716079986779e-05, + "loss": 1.5352, + "step": 29820 + }, + { + "epoch": 4.929560008262755, + "grad_norm": 8.234187126159668, + "learning_rate": 2.8167979580969173e-05, + "loss": 1.4319, + "step": 29830 + }, + { + "epoch": 4.931212559388556, + "grad_norm": 14.171329498291016, + "learning_rate": 2.815879836207055e-05, + "loss": 1.6987, + "step": 29840 + }, + { + "epoch": 4.932865110514356, + "grad_norm": 18.609638214111328, + "learning_rate": 2.814961714317193e-05, + "loss": 1.4605, + "step": 29850 + }, + { + "epoch": 4.934517661640157, + "grad_norm": 10.389782905578613, + "learning_rate": 2.8140435924273307e-05, + "loss": 1.571, + "step": 29860 + }, + { + "epoch": 4.9361702127659575, + "grad_norm": 206.19345092773438, + "learning_rate": 2.8131254705374687e-05, + "loss": 1.4592, + "step": 29870 + }, + { + "epoch": 4.937822763891758, + "grad_norm": 8.415513038635254, + "learning_rate": 2.8122073486476062e-05, + "loss": 1.4644, + "step": 29880 + }, + { + "epoch": 4.9394753150175585, + "grad_norm": 13.903251647949219, + "learning_rate": 2.8112892267577445e-05, + "loss": 1.4461, + "step": 29890 + }, + { + "epoch": 4.941127866143359, + "grad_norm": 7.16209602355957, + "learning_rate": 2.810371104867882e-05, + "loss": 1.5743, + "step": 29900 + }, + { + "epoch": 4.94278041726916, + "grad_norm": 11.736125946044922, + "learning_rate": 2.8094529829780204e-05, + "loss": 1.6183, + "step": 29910 + }, + { + "epoch": 4.94443296839496, + "grad_norm": 12.971633911132812, + "learning_rate": 2.8085348610881583e-05, + "loss": 1.4226, + "step": 29920 + }, + { + "epoch": 4.94608551952076, + "grad_norm": 6.504984378814697, + "learning_rate": 2.8076167391982962e-05, + "loss": 1.4956, + "step": 29930 + }, + { + "epoch": 4.947738070646561, + "grad_norm": 7.893176555633545, + "learning_rate": 2.806698617308434e-05, + "loss": 1.4615, + "step": 29940 + }, + { + "epoch": 4.949390621772361, + "grad_norm": 8.680290222167969, + "learning_rate": 2.8057804954185717e-05, + "loss": 1.4396, + "step": 29950 + }, + { + "epoch": 4.951043172898162, + "grad_norm": 11.371504783630371, + "learning_rate": 2.80486237352871e-05, + "loss": 1.4316, + "step": 29960 + }, + { + "epoch": 4.952695724023962, + "grad_norm": 11.4708251953125, + "learning_rate": 2.8039442516388476e-05, + "loss": 1.5216, + "step": 29970 + }, + { + "epoch": 4.954348275149762, + "grad_norm": 7.596912860870361, + "learning_rate": 2.8030261297489858e-05, + "loss": 1.538, + "step": 29980 + }, + { + "epoch": 4.956000826275563, + "grad_norm": 12.889708518981934, + "learning_rate": 2.8021080078591234e-05, + "loss": 
1.4701, + "step": 29990 + }, + { + "epoch": 4.957653377401363, + "grad_norm": 23.94351577758789, + "learning_rate": 2.8011898859692613e-05, + "loss": 1.4761, + "step": 30000 + }, + { + "epoch": 4.959305928527164, + "grad_norm": 10.514241218566895, + "learning_rate": 2.800271764079399e-05, + "loss": 1.5601, + "step": 30010 + }, + { + "epoch": 4.960958479652964, + "grad_norm": 134.59942626953125, + "learning_rate": 2.7993536421895372e-05, + "loss": 1.5578, + "step": 30020 + }, + { + "epoch": 4.962611030778765, + "grad_norm": 8.633646965026855, + "learning_rate": 2.7984355202996755e-05, + "loss": 1.3932, + "step": 30030 + }, + { + "epoch": 4.964263581904565, + "grad_norm": 8.6370849609375, + "learning_rate": 2.797517398409813e-05, + "loss": 1.4556, + "step": 30040 + }, + { + "epoch": 4.965916133030365, + "grad_norm": 7.444171905517578, + "learning_rate": 2.796599276519951e-05, + "loss": 1.4341, + "step": 30050 + }, + { + "epoch": 4.967568684156166, + "grad_norm": 14.582514762878418, + "learning_rate": 2.795681154630089e-05, + "loss": 1.5428, + "step": 30060 + }, + { + "epoch": 4.9692212352819665, + "grad_norm": 10.481239318847656, + "learning_rate": 2.7947630327402268e-05, + "loss": 1.4816, + "step": 30070 + }, + { + "epoch": 4.970873786407767, + "grad_norm": 9.104494094848633, + "learning_rate": 2.7938449108503644e-05, + "loss": 1.4667, + "step": 30080 + }, + { + "epoch": 4.9725263375335675, + "grad_norm": 14.128423690795898, + "learning_rate": 2.7929267889605027e-05, + "loss": 1.4932, + "step": 30090 + }, + { + "epoch": 4.974178888659368, + "grad_norm": 8.712443351745605, + "learning_rate": 2.7920086670706402e-05, + "loss": 1.57, + "step": 30100 + }, + { + "epoch": 4.975831439785169, + "grad_norm": 8.335115432739258, + "learning_rate": 2.7910905451807785e-05, + "loss": 1.4954, + "step": 30110 + }, + { + "epoch": 4.977483990910969, + "grad_norm": 47.504981994628906, + "learning_rate": 2.790172423290916e-05, + "loss": 1.5175, + "step": 30120 + }, + { + "epoch": 4.97913654203677, + "grad_norm": 8.322532653808594, + "learning_rate": 2.789254301401054e-05, + "loss": 1.5074, + "step": 30130 + }, + { + "epoch": 4.98078909316257, + "grad_norm": 7.302855014801025, + "learning_rate": 2.7883361795111923e-05, + "loss": 1.6079, + "step": 30140 + }, + { + "epoch": 4.98244164428837, + "grad_norm": 17.811246871948242, + "learning_rate": 2.78741805762133e-05, + "loss": 1.4465, + "step": 30150 + }, + { + "epoch": 4.984094195414171, + "grad_norm": 16.895263671875, + "learning_rate": 2.786499935731468e-05, + "loss": 1.5104, + "step": 30160 + }, + { + "epoch": 4.985746746539971, + "grad_norm": 31.71302604675293, + "learning_rate": 2.7855818138416057e-05, + "loss": 1.5411, + "step": 30170 + }, + { + "epoch": 4.987399297665771, + "grad_norm": 19.637605667114258, + "learning_rate": 2.7846636919517436e-05, + "loss": 1.4842, + "step": 30180 + }, + { + "epoch": 4.989051848791572, + "grad_norm": 134.47610473632812, + "learning_rate": 2.7837455700618816e-05, + "loss": 1.6898, + "step": 30190 + }, + { + "epoch": 4.990704399917372, + "grad_norm": 18.40015983581543, + "learning_rate": 2.7828274481720195e-05, + "loss": 1.4774, + "step": 30200 + }, + { + "epoch": 4.992356951043173, + "grad_norm": 8.392579078674316, + "learning_rate": 2.781909326282157e-05, + "loss": 1.3739, + "step": 30210 + }, + { + "epoch": 4.994009502168973, + "grad_norm": 12.9806547164917, + "learning_rate": 2.7809912043922953e-05, + "loss": 1.4431, + "step": 30220 + }, + { + "epoch": 4.995662053294774, + "grad_norm": 9.284590721130371, + 
"learning_rate": 2.780073082502433e-05, + "loss": 1.5775, + "step": 30230 + }, + { + "epoch": 4.997314604420574, + "grad_norm": 13.390445709228516, + "learning_rate": 2.7791549606125712e-05, + "loss": 1.5908, + "step": 30240 + }, + { + "epoch": 4.998967155546374, + "grad_norm": 9.49555778503418, + "learning_rate": 2.778236838722709e-05, + "loss": 1.5371, + "step": 30250 + }, + { + "epoch": 4.999958686221855, + "eval_accuracy": 0.2968151510852603, + "eval_loss": 2.206881046295166, + "eval_runtime": 816.4999, + "eval_samples_per_second": 34.533, + "eval_steps_per_second": 8.633, + "step": 30256 + }, + { + "epoch": 5.000619706672175, + "grad_norm": 15.309430122375488, + "learning_rate": 2.7773187168328467e-05, + "loss": 1.5734, + "step": 30260 + }, + { + "epoch": 5.0022722577979755, + "grad_norm": 11.606006622314453, + "learning_rate": 2.776400594942985e-05, + "loss": 1.4502, + "step": 30270 + }, + { + "epoch": 5.0039248089237764, + "grad_norm": 13.329957962036133, + "learning_rate": 2.7754824730531226e-05, + "loss": 1.3861, + "step": 30280 + }, + { + "epoch": 5.0055773600495765, + "grad_norm": 35.278804779052734, + "learning_rate": 2.7745643511632608e-05, + "loss": 1.468, + "step": 30290 + }, + { + "epoch": 5.007229911175377, + "grad_norm": 27.012672424316406, + "learning_rate": 2.7736462292733984e-05, + "loss": 1.6023, + "step": 30300 + }, + { + "epoch": 5.008882462301178, + "grad_norm": 7.429107189178467, + "learning_rate": 2.7727281073835367e-05, + "loss": 1.4019, + "step": 30310 + }, + { + "epoch": 5.010535013426978, + "grad_norm": 8.644866943359375, + "learning_rate": 2.7718099854936743e-05, + "loss": 1.3994, + "step": 30320 + }, + { + "epoch": 5.012187564552779, + "grad_norm": 8.626925468444824, + "learning_rate": 2.7708918636038122e-05, + "loss": 1.6237, + "step": 30330 + }, + { + "epoch": 5.013840115678579, + "grad_norm": 23.41725730895996, + "learning_rate": 2.7699737417139498e-05, + "loss": 1.3434, + "step": 30340 + }, + { + "epoch": 5.015492666804379, + "grad_norm": 12.992168426513672, + "learning_rate": 2.769055619824088e-05, + "loss": 1.4989, + "step": 30350 + }, + { + "epoch": 5.01714521793018, + "grad_norm": 19.700824737548828, + "learning_rate": 2.7681374979342256e-05, + "loss": 1.6157, + "step": 30360 + }, + { + "epoch": 5.01879776905598, + "grad_norm": 35.20869827270508, + "learning_rate": 2.767219376044364e-05, + "loss": 1.3213, + "step": 30370 + }, + { + "epoch": 5.020450320181781, + "grad_norm": 10.241279602050781, + "learning_rate": 2.7663012541545018e-05, + "loss": 1.5054, + "step": 30380 + }, + { + "epoch": 5.022102871307581, + "grad_norm": 7.736096382141113, + "learning_rate": 2.7653831322646394e-05, + "loss": 1.5098, + "step": 30390 + }, + { + "epoch": 5.023755422433381, + "grad_norm": 16.00613784790039, + "learning_rate": 2.7644650103747777e-05, + "loss": 1.4232, + "step": 30400 + }, + { + "epoch": 5.025407973559182, + "grad_norm": 9.938050270080566, + "learning_rate": 2.7635468884849152e-05, + "loss": 1.472, + "step": 30410 + }, + { + "epoch": 5.027060524684982, + "grad_norm": 7.0826640129089355, + "learning_rate": 2.7626287665950535e-05, + "loss": 1.4146, + "step": 30420 + }, + { + "epoch": 5.028713075810783, + "grad_norm": 8.384482383728027, + "learning_rate": 2.761710644705191e-05, + "loss": 1.503, + "step": 30430 + }, + { + "epoch": 5.030365626936583, + "grad_norm": 8.713443756103516, + "learning_rate": 2.7607925228153293e-05, + "loss": 1.2782, + "step": 30440 + }, + { + "epoch": 5.032018178062383, + "grad_norm": 11.568880081176758, + "learning_rate": 
2.759874400925467e-05, + "loss": 1.4832, + "step": 30450 + }, + { + "epoch": 5.033670729188184, + "grad_norm": 11.942070960998535, + "learning_rate": 2.758956279035605e-05, + "loss": 1.4826, + "step": 30460 + }, + { + "epoch": 5.0353232803139845, + "grad_norm": 8.392718315124512, + "learning_rate": 2.7580381571457424e-05, + "loss": 1.5613, + "step": 30470 + }, + { + "epoch": 5.0369758314397854, + "grad_norm": 8.625317573547363, + "learning_rate": 2.7571200352558807e-05, + "loss": 1.5039, + "step": 30480 + }, + { + "epoch": 5.0386283825655855, + "grad_norm": 34.13085174560547, + "learning_rate": 2.756201913366019e-05, + "loss": 1.5013, + "step": 30490 + }, + { + "epoch": 5.0402809336913865, + "grad_norm": 9.937477111816406, + "learning_rate": 2.7552837914761566e-05, + "loss": 1.4878, + "step": 30500 + }, + { + "epoch": 5.041933484817187, + "grad_norm": 10.615894317626953, + "learning_rate": 2.7543656695862945e-05, + "loss": 1.5141, + "step": 30510 + }, + { + "epoch": 5.043586035942987, + "grad_norm": 9.882899284362793, + "learning_rate": 2.753447547696432e-05, + "loss": 1.4341, + "step": 30520 + }, + { + "epoch": 5.045238587068788, + "grad_norm": 13.298185348510742, + "learning_rate": 2.7525294258065703e-05, + "loss": 1.3448, + "step": 30530 + }, + { + "epoch": 5.046891138194588, + "grad_norm": 10.530019760131836, + "learning_rate": 2.751611303916708e-05, + "loss": 1.4876, + "step": 30540 + }, + { + "epoch": 5.048543689320389, + "grad_norm": 7.8333892822265625, + "learning_rate": 2.7506931820268462e-05, + "loss": 1.3568, + "step": 30550 + }, + { + "epoch": 5.050196240446189, + "grad_norm": 8.093653678894043, + "learning_rate": 2.7497750601369838e-05, + "loss": 1.4622, + "step": 30560 + }, + { + "epoch": 5.051848791571989, + "grad_norm": 15.520524978637695, + "learning_rate": 2.748856938247122e-05, + "loss": 1.5338, + "step": 30570 + }, + { + "epoch": 5.05350134269779, + "grad_norm": 7.4860382080078125, + "learning_rate": 2.7479388163572596e-05, + "loss": 1.3255, + "step": 30580 + }, + { + "epoch": 5.05515389382359, + "grad_norm": 24.090126037597656, + "learning_rate": 2.7470206944673975e-05, + "loss": 1.4614, + "step": 30590 + }, + { + "epoch": 5.056806444949391, + "grad_norm": 6.862220287322998, + "learning_rate": 2.7461025725775358e-05, + "loss": 1.4854, + "step": 30600 + }, + { + "epoch": 5.058458996075191, + "grad_norm": 28.299789428710938, + "learning_rate": 2.7451844506876734e-05, + "loss": 1.4952, + "step": 30610 + }, + { + "epoch": 5.060111547200991, + "grad_norm": 9.547993659973145, + "learning_rate": 2.7442663287978117e-05, + "loss": 1.4909, + "step": 30620 + }, + { + "epoch": 5.061764098326792, + "grad_norm": 14.898249626159668, + "learning_rate": 2.7433482069079492e-05, + "loss": 1.4672, + "step": 30630 + }, + { + "epoch": 5.063416649452592, + "grad_norm": 8.26840591430664, + "learning_rate": 2.742430085018087e-05, + "loss": 1.5186, + "step": 30640 + }, + { + "epoch": 5.065069200578393, + "grad_norm": 7.622718811035156, + "learning_rate": 2.7415119631282248e-05, + "loss": 1.3791, + "step": 30650 + }, + { + "epoch": 5.066721751704193, + "grad_norm": 9.911038398742676, + "learning_rate": 2.740593841238363e-05, + "loss": 1.3867, + "step": 30660 + }, + { + "epoch": 5.0683743028299935, + "grad_norm": 21.617273330688477, + "learning_rate": 2.7396757193485006e-05, + "loss": 1.4964, + "step": 30670 + }, + { + "epoch": 5.070026853955794, + "grad_norm": 10.754417419433594, + "learning_rate": 2.738757597458639e-05, + "loss": 1.4902, + "step": 30680 + }, + { + "epoch": 5.0716794050815945, + 
"grad_norm": 15.153556823730469, + "learning_rate": 2.7378394755687765e-05, + "loss": 1.4118, + "step": 30690 + }, + { + "epoch": 5.0733319562073955, + "grad_norm": 10.339966773986816, + "learning_rate": 2.7369213536789147e-05, + "loss": 1.5785, + "step": 30700 + }, + { + "epoch": 5.074984507333196, + "grad_norm": 16.77724838256836, + "learning_rate": 2.7360032317890526e-05, + "loss": 1.3934, + "step": 30710 + }, + { + "epoch": 5.076637058458996, + "grad_norm": 11.809764862060547, + "learning_rate": 2.7350851098991902e-05, + "loss": 1.4605, + "step": 30720 + }, + { + "epoch": 5.078289609584797, + "grad_norm": 16.061250686645508, + "learning_rate": 2.7341669880093285e-05, + "loss": 1.5512, + "step": 30730 + }, + { + "epoch": 5.079942160710597, + "grad_norm": 8.934623718261719, + "learning_rate": 2.733248866119466e-05, + "loss": 1.4989, + "step": 30740 + }, + { + "epoch": 5.081594711836398, + "grad_norm": 8.88762378692627, + "learning_rate": 2.7323307442296043e-05, + "loss": 1.3776, + "step": 30750 + }, + { + "epoch": 5.083247262962198, + "grad_norm": 14.036128044128418, + "learning_rate": 2.731412622339742e-05, + "loss": 1.5089, + "step": 30760 + }, + { + "epoch": 5.084899814087998, + "grad_norm": 12.190908432006836, + "learning_rate": 2.73049450044988e-05, + "loss": 1.4895, + "step": 30770 + }, + { + "epoch": 5.086552365213799, + "grad_norm": 10.279311180114746, + "learning_rate": 2.7295763785600174e-05, + "loss": 1.5135, + "step": 30780 + }, + { + "epoch": 5.088204916339599, + "grad_norm": 7.8421549797058105, + "learning_rate": 2.7286582566701557e-05, + "loss": 1.4824, + "step": 30790 + }, + { + "epoch": 5.0898574674654, + "grad_norm": 33.02604293823242, + "learning_rate": 2.7277401347802933e-05, + "loss": 1.4672, + "step": 30800 + }, + { + "epoch": 5.0915100185912, + "grad_norm": 9.192763328552246, + "learning_rate": 2.7268220128904315e-05, + "loss": 1.5224, + "step": 30810 + }, + { + "epoch": 5.093162569717, + "grad_norm": 14.467486381530762, + "learning_rate": 2.7259038910005695e-05, + "loss": 1.5533, + "step": 30820 + }, + { + "epoch": 5.094815120842801, + "grad_norm": 12.854667663574219, + "learning_rate": 2.7249857691107074e-05, + "loss": 1.5342, + "step": 30830 + }, + { + "epoch": 5.096467671968601, + "grad_norm": 8.334192276000977, + "learning_rate": 2.7240676472208453e-05, + "loss": 1.5695, + "step": 30840 + }, + { + "epoch": 5.098120223094402, + "grad_norm": 7.920324325561523, + "learning_rate": 2.723149525330983e-05, + "loss": 1.5243, + "step": 30850 + }, + { + "epoch": 5.099772774220202, + "grad_norm": 15.681721687316895, + "learning_rate": 2.7222314034411212e-05, + "loss": 1.5742, + "step": 30860 + }, + { + "epoch": 5.1014253253460025, + "grad_norm": 10.732522010803223, + "learning_rate": 2.7213132815512588e-05, + "loss": 1.451, + "step": 30870 + }, + { + "epoch": 5.103077876471803, + "grad_norm": 9.83503532409668, + "learning_rate": 2.720395159661397e-05, + "loss": 1.5081, + "step": 30880 + }, + { + "epoch": 5.1047304275976035, + "grad_norm": 15.027039527893066, + "learning_rate": 2.7194770377715346e-05, + "loss": 1.5076, + "step": 30890 + }, + { + "epoch": 5.1063829787234045, + "grad_norm": 6.365040302276611, + "learning_rate": 2.7185589158816725e-05, + "loss": 1.4941, + "step": 30900 + }, + { + "epoch": 5.108035529849205, + "grad_norm": 14.915818214416504, + "learning_rate": 2.71764079399181e-05, + "loss": 1.5407, + "step": 30910 + }, + { + "epoch": 5.109688080975006, + "grad_norm": 6.7997236251831055, + "learning_rate": 2.7167226721019484e-05, + "loss": 1.5268, + "step": 
30920 + }, + { + "epoch": 5.111340632100806, + "grad_norm": 9.963415145874023, + "learning_rate": 2.715804550212086e-05, + "loss": 1.3845, + "step": 30930 + }, + { + "epoch": 5.112993183226606, + "grad_norm": 10.191669464111328, + "learning_rate": 2.7148864283222242e-05, + "loss": 1.4634, + "step": 30940 + }, + { + "epoch": 5.114645734352407, + "grad_norm": 12.32084846496582, + "learning_rate": 2.7139683064323625e-05, + "loss": 1.4858, + "step": 30950 + }, + { + "epoch": 5.116298285478207, + "grad_norm": 5.869612216949463, + "learning_rate": 2.7130501845425e-05, + "loss": 1.4629, + "step": 30960 + }, + { + "epoch": 5.117950836604008, + "grad_norm": 17.75486183166504, + "learning_rate": 2.712132062652638e-05, + "loss": 1.4306, + "step": 30970 + }, + { + "epoch": 5.119603387729808, + "grad_norm": 15.648809432983398, + "learning_rate": 2.7112139407627756e-05, + "loss": 1.4946, + "step": 30980 + }, + { + "epoch": 5.121255938855608, + "grad_norm": 10.89876651763916, + "learning_rate": 2.710295818872914e-05, + "loss": 1.6039, + "step": 30990 + }, + { + "epoch": 5.122908489981409, + "grad_norm": 13.772411346435547, + "learning_rate": 2.7093776969830514e-05, + "loss": 1.5348, + "step": 31000 + }, + { + "epoch": 5.124561041107209, + "grad_norm": 11.758411407470703, + "learning_rate": 2.7084595750931897e-05, + "loss": 1.4812, + "step": 31010 + }, + { + "epoch": 5.12621359223301, + "grad_norm": 27.372583389282227, + "learning_rate": 2.7075414532033273e-05, + "loss": 1.4999, + "step": 31020 + }, + { + "epoch": 5.12786614335881, + "grad_norm": 9.395577430725098, + "learning_rate": 2.7066233313134652e-05, + "loss": 1.4854, + "step": 31030 + }, + { + "epoch": 5.12951869448461, + "grad_norm": 24.79270362854004, + "learning_rate": 2.7057052094236028e-05, + "loss": 1.4465, + "step": 31040 + }, + { + "epoch": 5.131171245610411, + "grad_norm": 10.64253044128418, + "learning_rate": 2.704787087533741e-05, + "loss": 1.5273, + "step": 31050 + }, + { + "epoch": 5.132823796736211, + "grad_norm": 15.544374465942383, + "learning_rate": 2.7038689656438793e-05, + "loss": 1.4328, + "step": 31060 + }, + { + "epoch": 5.134476347862012, + "grad_norm": 13.274250030517578, + "learning_rate": 2.702950843754017e-05, + "loss": 1.3941, + "step": 31070 + }, + { + "epoch": 5.136128898987812, + "grad_norm": 7.694459438323975, + "learning_rate": 2.7020327218641552e-05, + "loss": 1.2814, + "step": 31080 + }, + { + "epoch": 5.1377814501136125, + "grad_norm": 10.96912956237793, + "learning_rate": 2.7011145999742928e-05, + "loss": 1.5482, + "step": 31090 + }, + { + "epoch": 5.1394340012394135, + "grad_norm": 107.31693267822266, + "learning_rate": 2.7001964780844307e-05, + "loss": 1.6004, + "step": 31100 + }, + { + "epoch": 5.141086552365214, + "grad_norm": 10.470490455627441, + "learning_rate": 2.6992783561945683e-05, + "loss": 1.4453, + "step": 31110 + }, + { + "epoch": 5.142739103491015, + "grad_norm": 8.99507999420166, + "learning_rate": 2.6983602343047065e-05, + "loss": 1.6031, + "step": 31120 + }, + { + "epoch": 5.144391654616815, + "grad_norm": 7.556997776031494, + "learning_rate": 2.697442112414844e-05, + "loss": 1.6775, + "step": 31130 + }, + { + "epoch": 5.146044205742615, + "grad_norm": 6.91307258605957, + "learning_rate": 2.6965239905249824e-05, + "loss": 1.4369, + "step": 31140 + }, + { + "epoch": 5.147696756868416, + "grad_norm": 11.001294136047363, + "learning_rate": 2.69560586863512e-05, + "loss": 1.5253, + "step": 31150 + }, + { + "epoch": 5.149349307994216, + "grad_norm": 11.226962089538574, + "learning_rate": 
2.694687746745258e-05, + "loss": 1.4318, + "step": 31160 + }, + { + "epoch": 5.151001859120017, + "grad_norm": 9.484665870666504, + "learning_rate": 2.693769624855396e-05, + "loss": 1.6958, + "step": 31170 + }, + { + "epoch": 5.152654410245817, + "grad_norm": 11.47225570678711, + "learning_rate": 2.6928515029655337e-05, + "loss": 1.4713, + "step": 31180 + }, + { + "epoch": 5.154306961371617, + "grad_norm": 13.216545104980469, + "learning_rate": 2.691933381075672e-05, + "loss": 1.462, + "step": 31190 + }, + { + "epoch": 5.155959512497418, + "grad_norm": 14.571917533874512, + "learning_rate": 2.6910152591858096e-05, + "loss": 1.5572, + "step": 31200 + }, + { + "epoch": 5.157612063623218, + "grad_norm": 6.982619285583496, + "learning_rate": 2.690097137295948e-05, + "loss": 1.5357, + "step": 31210 + }, + { + "epoch": 5.159264614749019, + "grad_norm": 7.8256988525390625, + "learning_rate": 2.6891790154060854e-05, + "loss": 1.4597, + "step": 31220 + }, + { + "epoch": 5.160917165874819, + "grad_norm": 11.033082962036133, + "learning_rate": 2.6882608935162234e-05, + "loss": 1.3216, + "step": 31230 + }, + { + "epoch": 5.162569717000619, + "grad_norm": 9.309578895568848, + "learning_rate": 2.687342771626361e-05, + "loss": 1.3888, + "step": 31240 + }, + { + "epoch": 5.16422226812642, + "grad_norm": 36.22593307495117, + "learning_rate": 2.6864246497364992e-05, + "loss": 1.3437, + "step": 31250 + }, + { + "epoch": 5.16587481925222, + "grad_norm": 24.026330947875977, + "learning_rate": 2.6855065278466368e-05, + "loss": 1.4197, + "step": 31260 + }, + { + "epoch": 5.167527370378021, + "grad_norm": 11.367629051208496, + "learning_rate": 2.684588405956775e-05, + "loss": 1.4089, + "step": 31270 + }, + { + "epoch": 5.169179921503821, + "grad_norm": 8.361258506774902, + "learning_rate": 2.683670284066913e-05, + "loss": 1.4635, + "step": 31280 + }, + { + "epoch": 5.1708324726296215, + "grad_norm": 25.029769897460938, + "learning_rate": 2.6827521621770506e-05, + "loss": 1.446, + "step": 31290 + }, + { + "epoch": 5.1724850237554225, + "grad_norm": 11.334177017211914, + "learning_rate": 2.681834040287189e-05, + "loss": 1.4217, + "step": 31300 + }, + { + "epoch": 5.174137574881223, + "grad_norm": 8.15962028503418, + "learning_rate": 2.6809159183973264e-05, + "loss": 1.4519, + "step": 31310 + }, + { + "epoch": 5.175790126007024, + "grad_norm": 10.570094108581543, + "learning_rate": 2.6799977965074647e-05, + "loss": 1.3329, + "step": 31320 + }, + { + "epoch": 5.177442677132824, + "grad_norm": 9.071272850036621, + "learning_rate": 2.6790796746176023e-05, + "loss": 1.558, + "step": 31330 + }, + { + "epoch": 5.179095228258625, + "grad_norm": 9.73098373413086, + "learning_rate": 2.6781615527277405e-05, + "loss": 1.4698, + "step": 31340 + }, + { + "epoch": 5.180747779384425, + "grad_norm": 13.9347505569458, + "learning_rate": 2.677243430837878e-05, + "loss": 1.5003, + "step": 31350 + }, + { + "epoch": 5.182400330510225, + "grad_norm": 10.36252498626709, + "learning_rate": 2.676325308948016e-05, + "loss": 1.4587, + "step": 31360 + }, + { + "epoch": 5.184052881636026, + "grad_norm": 16.580341339111328, + "learning_rate": 2.6754071870581536e-05, + "loss": 1.4521, + "step": 31370 + }, + { + "epoch": 5.185705432761826, + "grad_norm": 14.264723777770996, + "learning_rate": 2.674489065168292e-05, + "loss": 1.3813, + "step": 31380 + }, + { + "epoch": 5.187357983887627, + "grad_norm": 13.60984992980957, + "learning_rate": 2.6735709432784295e-05, + "loss": 1.6065, + "step": 31390 + }, + { + "epoch": 5.189010535013427, + 
"grad_norm": 8.337096214294434, + "learning_rate": 2.6726528213885678e-05, + "loss": 1.4814, + "step": 31400 + }, + { + "epoch": 5.190663086139227, + "grad_norm": 8.234798431396484, + "learning_rate": 2.6717346994987057e-05, + "loss": 1.5305, + "step": 31410 + }, + { + "epoch": 5.192315637265028, + "grad_norm": 12.562371253967285, + "learning_rate": 2.6708165776088433e-05, + "loss": 1.5045, + "step": 31420 + }, + { + "epoch": 5.193968188390828, + "grad_norm": 11.537344932556152, + "learning_rate": 2.6698984557189815e-05, + "loss": 1.5556, + "step": 31430 + }, + { + "epoch": 5.195620739516629, + "grad_norm": 34.379127502441406, + "learning_rate": 2.668980333829119e-05, + "loss": 1.4086, + "step": 31440 + }, + { + "epoch": 5.197273290642429, + "grad_norm": 10.825907707214355, + "learning_rate": 2.6680622119392574e-05, + "loss": 1.6994, + "step": 31450 + }, + { + "epoch": 5.198925841768229, + "grad_norm": 9.560426712036133, + "learning_rate": 2.667144090049395e-05, + "loss": 1.606, + "step": 31460 + }, + { + "epoch": 5.20057839289403, + "grad_norm": 7.058825492858887, + "learning_rate": 2.6662259681595332e-05, + "loss": 1.5243, + "step": 31470 + }, + { + "epoch": 5.20223094401983, + "grad_norm": 8.447346687316895, + "learning_rate": 2.6653078462696708e-05, + "loss": 1.4428, + "step": 31480 + }, + { + "epoch": 5.203883495145631, + "grad_norm": 8.007600784301758, + "learning_rate": 2.6643897243798087e-05, + "loss": 1.419, + "step": 31490 + }, + { + "epoch": 5.2055360462714315, + "grad_norm": 10.7009916305542, + "learning_rate": 2.6634716024899463e-05, + "loss": 1.5057, + "step": 31500 + }, + { + "epoch": 5.207188597397232, + "grad_norm": 10.528794288635254, + "learning_rate": 2.6625534806000846e-05, + "loss": 1.3964, + "step": 31510 + }, + { + "epoch": 5.208841148523033, + "grad_norm": 11.727227210998535, + "learning_rate": 2.661635358710223e-05, + "loss": 1.5043, + "step": 31520 + }, + { + "epoch": 5.210493699648833, + "grad_norm": 7.113905429840088, + "learning_rate": 2.6607172368203604e-05, + "loss": 1.4591, + "step": 31530 + }, + { + "epoch": 5.212146250774634, + "grad_norm": 8.97098159790039, + "learning_rate": 2.6597991149304984e-05, + "loss": 1.5621, + "step": 31540 + }, + { + "epoch": 5.213798801900434, + "grad_norm": 10.305506706237793, + "learning_rate": 2.658880993040636e-05, + "loss": 1.4602, + "step": 31550 + }, + { + "epoch": 5.215451353026234, + "grad_norm": 13.641674995422363, + "learning_rate": 2.6579628711507742e-05, + "loss": 1.4489, + "step": 31560 + }, + { + "epoch": 5.217103904152035, + "grad_norm": 9.43690299987793, + "learning_rate": 2.6570447492609118e-05, + "loss": 1.5889, + "step": 31570 + }, + { + "epoch": 5.218756455277835, + "grad_norm": 6.873445510864258, + "learning_rate": 2.65612662737105e-05, + "loss": 1.5353, + "step": 31580 + }, + { + "epoch": 5.220409006403636, + "grad_norm": 28.370594024658203, + "learning_rate": 2.6552085054811876e-05, + "loss": 1.5797, + "step": 31590 + }, + { + "epoch": 5.222061557529436, + "grad_norm": 11.743301391601562, + "learning_rate": 2.654290383591326e-05, + "loss": 1.379, + "step": 31600 + }, + { + "epoch": 5.223714108655236, + "grad_norm": 8.087701797485352, + "learning_rate": 2.6533722617014635e-05, + "loss": 1.5028, + "step": 31610 + }, + { + "epoch": 5.225366659781037, + "grad_norm": 49.10771179199219, + "learning_rate": 2.6524541398116014e-05, + "loss": 1.4961, + "step": 31620 + }, + { + "epoch": 5.227019210906837, + "grad_norm": 12.940389633178711, + "learning_rate": 2.6515360179217397e-05, + "loss": 1.5188, + "step": 
31630 + }, + { + "epoch": 5.228671762032638, + "grad_norm": 9.253148078918457, + "learning_rate": 2.6506178960318773e-05, + "loss": 1.5103, + "step": 31640 + }, + { + "epoch": 5.230324313158438, + "grad_norm": 7.253983497619629, + "learning_rate": 2.6496997741420155e-05, + "loss": 1.5957, + "step": 31650 + }, + { + "epoch": 5.231976864284238, + "grad_norm": 7.679978370666504, + "learning_rate": 2.648781652252153e-05, + "loss": 1.4185, + "step": 31660 + }, + { + "epoch": 5.233629415410039, + "grad_norm": 13.257478713989258, + "learning_rate": 2.647863530362291e-05, + "loss": 1.5459, + "step": 31670 + }, + { + "epoch": 5.235281966535839, + "grad_norm": 11.092429161071777, + "learning_rate": 2.6469454084724286e-05, + "loss": 1.4213, + "step": 31680 + }, + { + "epoch": 5.23693451766164, + "grad_norm": 13.886821746826172, + "learning_rate": 2.646027286582567e-05, + "loss": 1.4144, + "step": 31690 + }, + { + "epoch": 5.2385870687874405, + "grad_norm": 9.925250053405762, + "learning_rate": 2.6451091646927045e-05, + "loss": 1.3833, + "step": 31700 + }, + { + "epoch": 5.2402396199132415, + "grad_norm": 8.703375816345215, + "learning_rate": 2.6441910428028427e-05, + "loss": 1.4315, + "step": 31710 + }, + { + "epoch": 5.241892171039042, + "grad_norm": 91.34703826904297, + "learning_rate": 2.6432729209129803e-05, + "loss": 1.3998, + "step": 31720 + }, + { + "epoch": 5.243544722164842, + "grad_norm": 15.27595043182373, + "learning_rate": 2.6423547990231186e-05, + "loss": 1.5865, + "step": 31730 + }, + { + "epoch": 5.245197273290643, + "grad_norm": 11.300504684448242, + "learning_rate": 2.6414366771332565e-05, + "loss": 1.4581, + "step": 31740 + }, + { + "epoch": 5.246849824416443, + "grad_norm": 10.756152153015137, + "learning_rate": 2.640518555243394e-05, + "loss": 1.5089, + "step": 31750 + }, + { + "epoch": 5.248502375542244, + "grad_norm": 32.1532096862793, + "learning_rate": 2.6396004333535324e-05, + "loss": 1.4962, + "step": 31760 + }, + { + "epoch": 5.250154926668044, + "grad_norm": 9.208897590637207, + "learning_rate": 2.63868231146367e-05, + "loss": 1.4471, + "step": 31770 + }, + { + "epoch": 5.251807477793844, + "grad_norm": 17.185142517089844, + "learning_rate": 2.6377641895738082e-05, + "loss": 1.5346, + "step": 31780 + }, + { + "epoch": 5.253460028919645, + "grad_norm": 15.642415046691895, + "learning_rate": 2.6368460676839458e-05, + "loss": 1.4897, + "step": 31790 + }, + { + "epoch": 5.255112580045445, + "grad_norm": 11.783285140991211, + "learning_rate": 2.6359279457940837e-05, + "loss": 1.4354, + "step": 31800 + }, + { + "epoch": 5.256765131171246, + "grad_norm": 10.766020774841309, + "learning_rate": 2.6350098239042213e-05, + "loss": 1.4611, + "step": 31810 + }, + { + "epoch": 5.258417682297046, + "grad_norm": 9.75991439819336, + "learning_rate": 2.6340917020143596e-05, + "loss": 1.397, + "step": 31820 + }, + { + "epoch": 5.260070233422846, + "grad_norm": 9.640849113464355, + "learning_rate": 2.633173580124497e-05, + "loss": 1.4211, + "step": 31830 + }, + { + "epoch": 5.261722784548647, + "grad_norm": 49.9780387878418, + "learning_rate": 2.6322554582346354e-05, + "loss": 1.4722, + "step": 31840 + }, + { + "epoch": 5.263375335674447, + "grad_norm": 11.358599662780762, + "learning_rate": 2.6313373363447737e-05, + "loss": 1.4281, + "step": 31850 + }, + { + "epoch": 5.265027886800248, + "grad_norm": 6.540308475494385, + "learning_rate": 2.6304192144549113e-05, + "loss": 1.4257, + "step": 31860 + }, + { + "epoch": 5.266680437926048, + "grad_norm": 11.856473922729492, + "learning_rate": 
2.6295010925650492e-05, + "loss": 1.4528, + "step": 31870 + }, + { + "epoch": 5.268332989051848, + "grad_norm": 6.613503456115723, + "learning_rate": 2.6285829706751868e-05, + "loss": 1.2896, + "step": 31880 + }, + { + "epoch": 5.269985540177649, + "grad_norm": 10.662419319152832, + "learning_rate": 2.627664848785325e-05, + "loss": 1.4951, + "step": 31890 + }, + { + "epoch": 5.2716380913034495, + "grad_norm": 13.057232856750488, + "learning_rate": 2.6267467268954626e-05, + "loss": 1.5273, + "step": 31900 + }, + { + "epoch": 5.2732906424292505, + "grad_norm": 11.477456092834473, + "learning_rate": 2.625828605005601e-05, + "loss": 1.6712, + "step": 31910 + }, + { + "epoch": 5.274943193555051, + "grad_norm": 7.808887481689453, + "learning_rate": 2.6249104831157385e-05, + "loss": 1.4828, + "step": 31920 + }, + { + "epoch": 5.276595744680851, + "grad_norm": 8.068138122558594, + "learning_rate": 2.6239923612258764e-05, + "loss": 1.5149, + "step": 31930 + }, + { + "epoch": 5.278248295806652, + "grad_norm": 11.689132690429688, + "learning_rate": 2.623074239336014e-05, + "loss": 1.4828, + "step": 31940 + }, + { + "epoch": 5.279900846932452, + "grad_norm": 11.739019393920898, + "learning_rate": 2.6221561174461523e-05, + "loss": 1.5513, + "step": 31950 + }, + { + "epoch": 5.281553398058253, + "grad_norm": 9.972562789916992, + "learning_rate": 2.62123799555629e-05, + "loss": 1.4665, + "step": 31960 + }, + { + "epoch": 5.283205949184053, + "grad_norm": 9.730892181396484, + "learning_rate": 2.620319873666428e-05, + "loss": 1.6372, + "step": 31970 + }, + { + "epoch": 5.284858500309853, + "grad_norm": 7.058382034301758, + "learning_rate": 2.6194017517765664e-05, + "loss": 1.3208, + "step": 31980 + }, + { + "epoch": 5.286511051435654, + "grad_norm": 21.382532119750977, + "learning_rate": 2.618483629886704e-05, + "loss": 1.6161, + "step": 31990 + }, + { + "epoch": 5.288163602561454, + "grad_norm": 12.467560768127441, + "learning_rate": 2.617565507996842e-05, + "loss": 1.5576, + "step": 32000 + }, + { + "epoch": 5.289816153687255, + "grad_norm": 8.383124351501465, + "learning_rate": 2.6166473861069795e-05, + "loss": 1.5683, + "step": 32010 + }, + { + "epoch": 5.291468704813055, + "grad_norm": 11.486825942993164, + "learning_rate": 2.6157292642171177e-05, + "loss": 1.5452, + "step": 32020 + }, + { + "epoch": 5.293121255938855, + "grad_norm": 10.539969444274902, + "learning_rate": 2.6148111423272553e-05, + "loss": 1.5899, + "step": 32030 + }, + { + "epoch": 5.294773807064656, + "grad_norm": 6.630085468292236, + "learning_rate": 2.6138930204373936e-05, + "loss": 1.6407, + "step": 32040 + }, + { + "epoch": 5.296426358190456, + "grad_norm": 18.89915657043457, + "learning_rate": 2.6129748985475312e-05, + "loss": 1.4234, + "step": 32050 + }, + { + "epoch": 5.298078909316257, + "grad_norm": 6.79443883895874, + "learning_rate": 2.612056776657669e-05, + "loss": 1.428, + "step": 32060 + }, + { + "epoch": 5.299731460442057, + "grad_norm": 6.143259048461914, + "learning_rate": 2.6111386547678067e-05, + "loss": 1.4715, + "step": 32070 + }, + { + "epoch": 5.301384011567858, + "grad_norm": 11.97968864440918, + "learning_rate": 2.610220532877945e-05, + "loss": 1.3861, + "step": 32080 + }, + { + "epoch": 5.303036562693658, + "grad_norm": 6.659870624542236, + "learning_rate": 2.6093024109880832e-05, + "loss": 1.3774, + "step": 32090 + }, + { + "epoch": 5.3046891138194585, + "grad_norm": 19.500179290771484, + "learning_rate": 2.6083842890982208e-05, + "loss": 1.4016, + "step": 32100 + }, + { + "epoch": 5.3063416649452595, + 
"grad_norm": 10.958991050720215, + "learning_rate": 2.607466167208359e-05, + "loss": 1.2945, + "step": 32110 + }, + { + "epoch": 5.30799421607106, + "grad_norm": 12.997007369995117, + "learning_rate": 2.6065480453184966e-05, + "loss": 1.4438, + "step": 32120 + }, + { + "epoch": 5.30964676719686, + "grad_norm": 6.068414688110352, + "learning_rate": 2.6056299234286346e-05, + "loss": 1.4711, + "step": 32130 + }, + { + "epoch": 5.311299318322661, + "grad_norm": 11.195246696472168, + "learning_rate": 2.604711801538772e-05, + "loss": 1.4093, + "step": 32140 + }, + { + "epoch": 5.312951869448461, + "grad_norm": 40.626102447509766, + "learning_rate": 2.6037936796489104e-05, + "loss": 1.575, + "step": 32150 + }, + { + "epoch": 5.314604420574262, + "grad_norm": 9.044241905212402, + "learning_rate": 2.602875557759048e-05, + "loss": 1.3712, + "step": 32160 + }, + { + "epoch": 5.316256971700062, + "grad_norm": 13.25019359588623, + "learning_rate": 2.6019574358691863e-05, + "loss": 1.4445, + "step": 32170 + }, + { + "epoch": 5.317909522825863, + "grad_norm": 20.42643165588379, + "learning_rate": 2.601039313979324e-05, + "loss": 1.3569, + "step": 32180 + }, + { + "epoch": 5.319562073951663, + "grad_norm": 9.683372497558594, + "learning_rate": 2.6001211920894618e-05, + "loss": 1.5151, + "step": 32190 + }, + { + "epoch": 5.321214625077463, + "grad_norm": 11.241543769836426, + "learning_rate": 2.5992030701996e-05, + "loss": 1.4425, + "step": 32200 + }, + { + "epoch": 5.322867176203264, + "grad_norm": 12.246685981750488, + "learning_rate": 2.5982849483097376e-05, + "loss": 1.6109, + "step": 32210 + }, + { + "epoch": 5.324519727329064, + "grad_norm": 30.807636260986328, + "learning_rate": 2.597366826419876e-05, + "loss": 1.4588, + "step": 32220 + }, + { + "epoch": 5.326172278454865, + "grad_norm": 19.119007110595703, + "learning_rate": 2.5964487045300135e-05, + "loss": 1.4311, + "step": 32230 + }, + { + "epoch": 5.327824829580665, + "grad_norm": 12.0513277053833, + "learning_rate": 2.5955305826401517e-05, + "loss": 1.5163, + "step": 32240 + }, + { + "epoch": 5.329477380706465, + "grad_norm": 11.274496078491211, + "learning_rate": 2.5946124607502893e-05, + "loss": 1.3246, + "step": 32250 + }, + { + "epoch": 5.331129931832266, + "grad_norm": 56.508384704589844, + "learning_rate": 2.5936943388604273e-05, + "loss": 1.5643, + "step": 32260 + }, + { + "epoch": 5.332782482958066, + "grad_norm": 14.02528190612793, + "learning_rate": 2.592776216970565e-05, + "loss": 1.5164, + "step": 32270 + }, + { + "epoch": 5.334435034083867, + "grad_norm": 16.86151123046875, + "learning_rate": 2.591858095080703e-05, + "loss": 1.4939, + "step": 32280 + }, + { + "epoch": 5.336087585209667, + "grad_norm": 8.078414916992188, + "learning_rate": 2.5909399731908407e-05, + "loss": 1.4786, + "step": 32290 + }, + { + "epoch": 5.3377401363354675, + "grad_norm": 11.507722854614258, + "learning_rate": 2.590021851300979e-05, + "loss": 1.4795, + "step": 32300 + }, + { + "epoch": 5.3393926874612685, + "grad_norm": 20.504240036010742, + "learning_rate": 2.589103729411117e-05, + "loss": 1.5318, + "step": 32310 + }, + { + "epoch": 5.341045238587069, + "grad_norm": 8.86858081817627, + "learning_rate": 2.5881856075212545e-05, + "loss": 1.5606, + "step": 32320 + }, + { + "epoch": 5.34269778971287, + "grad_norm": 6.756838321685791, + "learning_rate": 2.5872674856313927e-05, + "loss": 1.4607, + "step": 32330 + }, + { + "epoch": 5.34435034083867, + "grad_norm": 17.09296989440918, + "learning_rate": 2.5863493637415303e-05, + "loss": 1.5001, + "step": 32340 
+ }, + { + "epoch": 5.34600289196447, + "grad_norm": 7.092482566833496, + "learning_rate": 2.5854312418516686e-05, + "loss": 1.5361, + "step": 32350 + }, + { + "epoch": 5.347655443090271, + "grad_norm": 10.106451988220215, + "learning_rate": 2.584513119961806e-05, + "loss": 1.4626, + "step": 32360 + }, + { + "epoch": 5.349307994216071, + "grad_norm": 7.64391565322876, + "learning_rate": 2.5835949980719444e-05, + "loss": 1.5781, + "step": 32370 + }, + { + "epoch": 5.350960545341872, + "grad_norm": 10.09816837310791, + "learning_rate": 2.582676876182082e-05, + "loss": 1.5339, + "step": 32380 + }, + { + "epoch": 5.352613096467672, + "grad_norm": 12.53437328338623, + "learning_rate": 2.58175875429222e-05, + "loss": 1.4137, + "step": 32390 + }, + { + "epoch": 5.354265647593472, + "grad_norm": 9.417365074157715, + "learning_rate": 2.5808406324023575e-05, + "loss": 1.4312, + "step": 32400 + }, + { + "epoch": 5.355918198719273, + "grad_norm": 8.617480278015137, + "learning_rate": 2.5799225105124958e-05, + "loss": 1.527, + "step": 32410 + }, + { + "epoch": 5.357570749845073, + "grad_norm": 13.1574068069458, + "learning_rate": 2.579004388622634e-05, + "loss": 1.6434, + "step": 32420 + }, + { + "epoch": 5.359223300970874, + "grad_norm": 7.331067085266113, + "learning_rate": 2.5780862667327716e-05, + "loss": 1.4821, + "step": 32430 + }, + { + "epoch": 5.360875852096674, + "grad_norm": 9.73812484741211, + "learning_rate": 2.5771681448429096e-05, + "loss": 1.4768, + "step": 32440 + }, + { + "epoch": 5.362528403222475, + "grad_norm": 9.941301345825195, + "learning_rate": 2.576250022953047e-05, + "loss": 1.5879, + "step": 32450 + }, + { + "epoch": 5.364180954348275, + "grad_norm": 12.169427871704102, + "learning_rate": 2.5753319010631854e-05, + "loss": 1.4923, + "step": 32460 + }, + { + "epoch": 5.365833505474075, + "grad_norm": 12.982132911682129, + "learning_rate": 2.574413779173323e-05, + "loss": 1.3733, + "step": 32470 + }, + { + "epoch": 5.367486056599876, + "grad_norm": 7.65261173248291, + "learning_rate": 2.5734956572834613e-05, + "loss": 1.436, + "step": 32480 + }, + { + "epoch": 5.369138607725676, + "grad_norm": 9.102943420410156, + "learning_rate": 2.572577535393599e-05, + "loss": 1.515, + "step": 32490 + }, + { + "epoch": 5.3707911588514765, + "grad_norm": 5.440056800842285, + "learning_rate": 2.571659413503737e-05, + "loss": 1.4411, + "step": 32500 + }, + { + "epoch": 5.3724437099772775, + "grad_norm": 6.815498352050781, + "learning_rate": 2.5707412916138747e-05, + "loss": 1.4195, + "step": 32510 + }, + { + "epoch": 5.374096261103078, + "grad_norm": 9.051548957824707, + "learning_rate": 2.5698231697240126e-05, + "loss": 1.4627, + "step": 32520 + }, + { + "epoch": 5.375748812228879, + "grad_norm": 10.097661972045898, + "learning_rate": 2.5689050478341502e-05, + "loss": 1.5081, + "step": 32530 + }, + { + "epoch": 5.377401363354679, + "grad_norm": 8.773504257202148, + "learning_rate": 2.5679869259442885e-05, + "loss": 1.5666, + "step": 32540 + }, + { + "epoch": 5.37905391448048, + "grad_norm": 9.091877937316895, + "learning_rate": 2.5670688040544267e-05, + "loss": 1.5096, + "step": 32550 + }, + { + "epoch": 5.38070646560628, + "grad_norm": 6.017947196960449, + "learning_rate": 2.5661506821645643e-05, + "loss": 1.4604, + "step": 32560 + }, + { + "epoch": 5.38235901673208, + "grad_norm": 8.248211860656738, + "learning_rate": 2.5652325602747022e-05, + "loss": 1.4782, + "step": 32570 + }, + { + "epoch": 5.384011567857881, + "grad_norm": 8.899044036865234, + "learning_rate": 2.5643144383848398e-05, + 
"loss": 1.4193, + "step": 32580 + }, + { + "epoch": 5.385664118983681, + "grad_norm": 10.997679710388184, + "learning_rate": 2.563396316494978e-05, + "loss": 1.469, + "step": 32590 + }, + { + "epoch": 5.387316670109482, + "grad_norm": 14.518561363220215, + "learning_rate": 2.5624781946051157e-05, + "loss": 1.447, + "step": 32600 + }, + { + "epoch": 5.388969221235282, + "grad_norm": 12.715332984924316, + "learning_rate": 2.561560072715254e-05, + "loss": 1.55, + "step": 32610 + }, + { + "epoch": 5.390621772361082, + "grad_norm": 74.79273223876953, + "learning_rate": 2.5606419508253915e-05, + "loss": 1.3885, + "step": 32620 + }, + { + "epoch": 5.392274323486883, + "grad_norm": 13.330283164978027, + "learning_rate": 2.5597238289355298e-05, + "loss": 1.5348, + "step": 32630 + }, + { + "epoch": 5.393926874612683, + "grad_norm": 11.709308624267578, + "learning_rate": 2.5588057070456674e-05, + "loss": 1.5496, + "step": 32640 + }, + { + "epoch": 5.395579425738484, + "grad_norm": 7.356724739074707, + "learning_rate": 2.5578875851558053e-05, + "loss": 1.3092, + "step": 32650 + }, + { + "epoch": 5.397231976864284, + "grad_norm": 7.576127529144287, + "learning_rate": 2.5569694632659436e-05, + "loss": 1.3922, + "step": 32660 + }, + { + "epoch": 5.398884527990084, + "grad_norm": 8.769586563110352, + "learning_rate": 2.556051341376081e-05, + "loss": 1.4997, + "step": 32670 + }, + { + "epoch": 5.400537079115885, + "grad_norm": 8.266812324523926, + "learning_rate": 2.5551332194862194e-05, + "loss": 1.504, + "step": 32680 + }, + { + "epoch": 5.402189630241685, + "grad_norm": 11.372197151184082, + "learning_rate": 2.554215097596357e-05, + "loss": 1.4409, + "step": 32690 + }, + { + "epoch": 5.403842181367486, + "grad_norm": 19.209890365600586, + "learning_rate": 2.553296975706495e-05, + "loss": 1.5131, + "step": 32700 + }, + { + "epoch": 5.4054947324932865, + "grad_norm": 11.207270622253418, + "learning_rate": 2.5523788538166325e-05, + "loss": 1.4532, + "step": 32710 + }, + { + "epoch": 5.407147283619087, + "grad_norm": 7.834964752197266, + "learning_rate": 2.5514607319267708e-05, + "loss": 1.4809, + "step": 32720 + }, + { + "epoch": 5.408799834744888, + "grad_norm": 10.717996597290039, + "learning_rate": 2.5505426100369084e-05, + "loss": 1.4442, + "step": 32730 + }, + { + "epoch": 5.410452385870688, + "grad_norm": 7.571774005889893, + "learning_rate": 2.5496244881470466e-05, + "loss": 1.5859, + "step": 32740 + }, + { + "epoch": 5.412104936996489, + "grad_norm": 9.474056243896484, + "learning_rate": 2.5487063662571842e-05, + "loss": 1.5151, + "step": 32750 + }, + { + "epoch": 5.413757488122289, + "grad_norm": 8.241455078125, + "learning_rate": 2.5477882443673225e-05, + "loss": 1.4181, + "step": 32760 + }, + { + "epoch": 5.415410039248089, + "grad_norm": 8.962974548339844, + "learning_rate": 2.5468701224774604e-05, + "loss": 1.4505, + "step": 32770 + }, + { + "epoch": 5.41706259037389, + "grad_norm": 9.275091171264648, + "learning_rate": 2.545952000587598e-05, + "loss": 1.4723, + "step": 32780 + }, + { + "epoch": 5.41871514149969, + "grad_norm": 9.588011741638184, + "learning_rate": 2.5450338786977362e-05, + "loss": 1.5176, + "step": 32790 + }, + { + "epoch": 5.420367692625491, + "grad_norm": 6.4946441650390625, + "learning_rate": 2.544115756807874e-05, + "loss": 1.4376, + "step": 32800 + }, + { + "epoch": 5.422020243751291, + "grad_norm": 13.272873878479004, + "learning_rate": 2.543197634918012e-05, + "loss": 1.5343, + "step": 32810 + }, + { + "epoch": 5.423672794877092, + "grad_norm": 9.417807579040527, + 
"learning_rate": 2.5422795130281497e-05, + "loss": 1.4349, + "step": 32820 + }, + { + "epoch": 5.425325346002892, + "grad_norm": 16.028039932250977, + "learning_rate": 2.5413613911382876e-05, + "loss": 1.464, + "step": 32830 + }, + { + "epoch": 5.426977897128692, + "grad_norm": 6.217875003814697, + "learning_rate": 2.5404432692484252e-05, + "loss": 1.4224, + "step": 32840 + }, + { + "epoch": 5.428630448254493, + "grad_norm": 4.6173529624938965, + "learning_rate": 2.5395251473585635e-05, + "loss": 1.389, + "step": 32850 + }, + { + "epoch": 5.430282999380293, + "grad_norm": 16.59287452697754, + "learning_rate": 2.538607025468701e-05, + "loss": 1.406, + "step": 32860 + }, + { + "epoch": 5.431935550506093, + "grad_norm": 28.710718154907227, + "learning_rate": 2.5376889035788393e-05, + "loss": 1.5599, + "step": 32870 + }, + { + "epoch": 5.433588101631894, + "grad_norm": 6.974539756774902, + "learning_rate": 2.5367707816889776e-05, + "loss": 1.5168, + "step": 32880 + }, + { + "epoch": 5.435240652757694, + "grad_norm": 18.166751861572266, + "learning_rate": 2.535852659799115e-05, + "loss": 1.4325, + "step": 32890 + }, + { + "epoch": 5.436893203883495, + "grad_norm": 13.46557331085205, + "learning_rate": 2.534934537909253e-05, + "loss": 1.3934, + "step": 32900 + }, + { + "epoch": 5.4385457550092955, + "grad_norm": 11.49219036102295, + "learning_rate": 2.5340164160193907e-05, + "loss": 1.4228, + "step": 32910 + }, + { + "epoch": 5.4401983061350965, + "grad_norm": 12.140527725219727, + "learning_rate": 2.533098294129529e-05, + "loss": 1.5574, + "step": 32920 + }, + { + "epoch": 5.441850857260897, + "grad_norm": 13.28624153137207, + "learning_rate": 2.5321801722396665e-05, + "loss": 1.4256, + "step": 32930 + }, + { + "epoch": 5.443503408386697, + "grad_norm": 19.24834632873535, + "learning_rate": 2.5312620503498048e-05, + "loss": 1.4024, + "step": 32940 + }, + { + "epoch": 5.445155959512498, + "grad_norm": 12.811565399169922, + "learning_rate": 2.5303439284599424e-05, + "loss": 1.5038, + "step": 32950 + }, + { + "epoch": 5.446808510638298, + "grad_norm": 135.61062622070312, + "learning_rate": 2.5294258065700803e-05, + "loss": 1.5554, + "step": 32960 + }, + { + "epoch": 5.448461061764099, + "grad_norm": 13.871460914611816, + "learning_rate": 2.528507684680218e-05, + "loss": 1.5125, + "step": 32970 + }, + { + "epoch": 5.450113612889899, + "grad_norm": 7.033801078796387, + "learning_rate": 2.527589562790356e-05, + "loss": 1.6154, + "step": 32980 + }, + { + "epoch": 5.451766164015699, + "grad_norm": 17.240575790405273, + "learning_rate": 2.5266714409004944e-05, + "loss": 1.4924, + "step": 32990 + }, + { + "epoch": 5.4534187151415, + "grad_norm": 11.401257514953613, + "learning_rate": 2.525753319010632e-05, + "loss": 1.449, + "step": 33000 + }, + { + "epoch": 5.4550712662673, + "grad_norm": 23.046972274780273, + "learning_rate": 2.5248351971207703e-05, + "loss": 1.3946, + "step": 33010 + }, + { + "epoch": 5.456723817393101, + "grad_norm": 13.394057273864746, + "learning_rate": 2.523917075230908e-05, + "loss": 1.4518, + "step": 33020 + }, + { + "epoch": 5.458376368518901, + "grad_norm": 7.417761325836182, + "learning_rate": 2.5229989533410458e-05, + "loss": 1.5553, + "step": 33030 + }, + { + "epoch": 5.460028919644701, + "grad_norm": 15.697338104248047, + "learning_rate": 2.5220808314511834e-05, + "loss": 1.3501, + "step": 33040 + }, + { + "epoch": 5.461681470770502, + "grad_norm": 13.63474178314209, + "learning_rate": 2.5211627095613216e-05, + "loss": 1.5208, + "step": 33050 + }, + { + "epoch": 
5.463334021896302, + "grad_norm": 15.581018447875977, + "learning_rate": 2.5202445876714592e-05, + "loss": 1.3335, + "step": 33060 + }, + { + "epoch": 5.464986573022103, + "grad_norm": 15.496585845947266, + "learning_rate": 2.5193264657815975e-05, + "loss": 1.3965, + "step": 33070 + }, + { + "epoch": 5.466639124147903, + "grad_norm": 11.815439224243164, + "learning_rate": 2.518408343891735e-05, + "loss": 1.4156, + "step": 33080 + }, + { + "epoch": 5.468291675273703, + "grad_norm": 15.371979713439941, + "learning_rate": 2.517490222001873e-05, + "loss": 1.452, + "step": 33090 + }, + { + "epoch": 5.469944226399504, + "grad_norm": 14.599471092224121, + "learning_rate": 2.5165721001120106e-05, + "loss": 1.4849, + "step": 33100 + }, + { + "epoch": 5.4715967775253045, + "grad_norm": 16.83903694152832, + "learning_rate": 2.5156539782221488e-05, + "loss": 1.3825, + "step": 33110 + }, + { + "epoch": 5.4732493286511055, + "grad_norm": 11.542963027954102, + "learning_rate": 2.514735856332287e-05, + "loss": 1.4628, + "step": 33120 + }, + { + "epoch": 5.474901879776906, + "grad_norm": 44.345481872558594, + "learning_rate": 2.5138177344424247e-05, + "loss": 1.3834, + "step": 33130 + }, + { + "epoch": 5.476554430902706, + "grad_norm": 13.720561981201172, + "learning_rate": 2.512899612552563e-05, + "loss": 1.3351, + "step": 33140 + }, + { + "epoch": 5.478206982028507, + "grad_norm": 11.957843780517578, + "learning_rate": 2.5119814906627005e-05, + "loss": 1.4726, + "step": 33150 + }, + { + "epoch": 5.479859533154307, + "grad_norm": 11.13582706451416, + "learning_rate": 2.5110633687728384e-05, + "loss": 1.3245, + "step": 33160 + }, + { + "epoch": 5.481512084280108, + "grad_norm": 33.86077117919922, + "learning_rate": 2.510145246882976e-05, + "loss": 1.6398, + "step": 33170 + }, + { + "epoch": 5.483164635405908, + "grad_norm": 19.655872344970703, + "learning_rate": 2.5092271249931143e-05, + "loss": 1.5043, + "step": 33180 + }, + { + "epoch": 5.484817186531708, + "grad_norm": 10.309099197387695, + "learning_rate": 2.508309003103252e-05, + "loss": 1.4599, + "step": 33190 + }, + { + "epoch": 5.486469737657509, + "grad_norm": 11.701153755187988, + "learning_rate": 2.50739088121339e-05, + "loss": 1.4324, + "step": 33200 + }, + { + "epoch": 5.488122288783309, + "grad_norm": 12.09907054901123, + "learning_rate": 2.5064727593235277e-05, + "loss": 1.4819, + "step": 33210 + }, + { + "epoch": 5.48977483990911, + "grad_norm": 11.532211303710938, + "learning_rate": 2.5055546374336657e-05, + "loss": 1.4336, + "step": 33220 + }, + { + "epoch": 5.49142739103491, + "grad_norm": 7.926141738891602, + "learning_rate": 2.504636515543804e-05, + "loss": 1.4861, + "step": 33230 + }, + { + "epoch": 5.49307994216071, + "grad_norm": 9.352069854736328, + "learning_rate": 2.5037183936539415e-05, + "loss": 1.403, + "step": 33240 + }, + { + "epoch": 5.494732493286511, + "grad_norm": 8.540033340454102, + "learning_rate": 2.5028002717640798e-05, + "loss": 1.4863, + "step": 33250 + }, + { + "epoch": 5.496385044412311, + "grad_norm": 119.00782775878906, + "learning_rate": 2.5018821498742174e-05, + "loss": 1.4733, + "step": 33260 + }, + { + "epoch": 5.498037595538112, + "grad_norm": 9.270196914672852, + "learning_rate": 2.5009640279843556e-05, + "loss": 1.5347, + "step": 33270 + }, + { + "epoch": 5.499690146663912, + "grad_norm": 7.050109386444092, + "learning_rate": 2.5000459060944932e-05, + "loss": 1.4698, + "step": 33280 + }, + { + "epoch": 5.501342697789713, + "grad_norm": 15.336756706237793, + "learning_rate": 2.499127784204631e-05, + 
"loss": 1.3084, + "step": 33290 + }, + { + "epoch": 5.502995248915513, + "grad_norm": 7.765586853027344, + "learning_rate": 2.498209662314769e-05, + "loss": 1.46, + "step": 33300 + }, + { + "epoch": 5.5046478000413135, + "grad_norm": 22.45989227294922, + "learning_rate": 2.497291540424907e-05, + "loss": 1.5559, + "step": 33310 + }, + { + "epoch": 5.5063003511671145, + "grad_norm": 16.461627960205078, + "learning_rate": 2.496373418535045e-05, + "loss": 1.5358, + "step": 33320 + }, + { + "epoch": 5.507952902292915, + "grad_norm": 13.17129898071289, + "learning_rate": 2.4954552966451828e-05, + "loss": 1.4679, + "step": 33330 + }, + { + "epoch": 5.509605453418715, + "grad_norm": 8.983606338500977, + "learning_rate": 2.4945371747553208e-05, + "loss": 1.4101, + "step": 33340 + }, + { + "epoch": 5.511258004544516, + "grad_norm": 16.77591323852539, + "learning_rate": 2.4936190528654583e-05, + "loss": 1.4592, + "step": 33350 + }, + { + "epoch": 5.512910555670316, + "grad_norm": 20.5835018157959, + "learning_rate": 2.4927009309755963e-05, + "loss": 1.4214, + "step": 33360 + }, + { + "epoch": 5.514563106796117, + "grad_norm": 14.100784301757812, + "learning_rate": 2.4917828090857342e-05, + "loss": 1.4042, + "step": 33370 + }, + { + "epoch": 5.516215657921917, + "grad_norm": 8.669175148010254, + "learning_rate": 2.490864687195872e-05, + "loss": 1.3463, + "step": 33380 + }, + { + "epoch": 5.517868209047718, + "grad_norm": 19.604291915893555, + "learning_rate": 2.4899465653060104e-05, + "loss": 1.3455, + "step": 33390 + }, + { + "epoch": 5.519520760173518, + "grad_norm": 13.911442756652832, + "learning_rate": 2.4890284434161483e-05, + "loss": 1.5867, + "step": 33400 + }, + { + "epoch": 5.521173311299318, + "grad_norm": 9.792315483093262, + "learning_rate": 2.488110321526286e-05, + "loss": 1.4837, + "step": 33410 + }, + { + "epoch": 5.522825862425119, + "grad_norm": 11.161901473999023, + "learning_rate": 2.4871921996364238e-05, + "loss": 1.4366, + "step": 33420 + }, + { + "epoch": 5.524478413550919, + "grad_norm": 12.950854301452637, + "learning_rate": 2.4862740777465617e-05, + "loss": 1.3821, + "step": 33430 + }, + { + "epoch": 5.52613096467672, + "grad_norm": 49.98655700683594, + "learning_rate": 2.4853559558566997e-05, + "loss": 1.5996, + "step": 33440 + }, + { + "epoch": 5.52778351580252, + "grad_norm": 12.21288013458252, + "learning_rate": 2.4844378339668376e-05, + "loss": 1.5804, + "step": 33450 + }, + { + "epoch": 5.52943606692832, + "grad_norm": 7.666236400604248, + "learning_rate": 2.4835197120769755e-05, + "loss": 1.665, + "step": 33460 + }, + { + "epoch": 5.531088618054121, + "grad_norm": 11.325262069702148, + "learning_rate": 2.4826015901871134e-05, + "loss": 1.5233, + "step": 33470 + }, + { + "epoch": 5.532741169179921, + "grad_norm": 9.03773021697998, + "learning_rate": 2.481683468297251e-05, + "loss": 1.5621, + "step": 33480 + }, + { + "epoch": 5.534393720305722, + "grad_norm": 9.082817077636719, + "learning_rate": 2.480765346407389e-05, + "loss": 1.5058, + "step": 33490 + }, + { + "epoch": 5.536046271431522, + "grad_norm": 7.868617534637451, + "learning_rate": 2.479847224517527e-05, + "loss": 1.4172, + "step": 33500 + }, + { + "epoch": 5.5376988225573225, + "grad_norm": 16.261873245239258, + "learning_rate": 2.478929102627665e-05, + "loss": 1.4935, + "step": 33510 + }, + { + "epoch": 5.5393513736831235, + "grad_norm": 8.341659545898438, + "learning_rate": 2.478010980737803e-05, + "loss": 1.6001, + "step": 33520 + }, + { + "epoch": 5.541003924808924, + "grad_norm": 8.13469123840332, + 
"learning_rate": 2.477092858847941e-05, + "loss": 1.5278, + "step": 33530 + }, + { + "epoch": 5.5426564759347245, + "grad_norm": 8.067828178405762, + "learning_rate": 2.4761747369580786e-05, + "loss": 1.3339, + "step": 33540 + }, + { + "epoch": 5.544309027060525, + "grad_norm": 9.125925064086914, + "learning_rate": 2.4752566150682165e-05, + "loss": 1.4147, + "step": 33550 + }, + { + "epoch": 5.545961578186326, + "grad_norm": 8.341336250305176, + "learning_rate": 2.4743384931783544e-05, + "loss": 1.4189, + "step": 33560 + }, + { + "epoch": 5.547614129312126, + "grad_norm": 10.543508529663086, + "learning_rate": 2.4734203712884923e-05, + "loss": 1.4079, + "step": 33570 + }, + { + "epoch": 5.549266680437926, + "grad_norm": 16.04317283630371, + "learning_rate": 2.4725022493986303e-05, + "loss": 1.6624, + "step": 33580 + }, + { + "epoch": 5.550919231563727, + "grad_norm": 10.512799263000488, + "learning_rate": 2.4715841275087682e-05, + "loss": 1.4323, + "step": 33590 + }, + { + "epoch": 5.552571782689527, + "grad_norm": 8.222831726074219, + "learning_rate": 2.470666005618906e-05, + "loss": 1.604, + "step": 33600 + }, + { + "epoch": 5.554224333815327, + "grad_norm": 8.435620307922363, + "learning_rate": 2.4697478837290437e-05, + "loss": 1.5879, + "step": 33610 + }, + { + "epoch": 5.555876884941128, + "grad_norm": 16.17806625366211, + "learning_rate": 2.468829761839182e-05, + "loss": 1.4183, + "step": 33620 + }, + { + "epoch": 5.557529436066928, + "grad_norm": 10.076927185058594, + "learning_rate": 2.46791163994932e-05, + "loss": 1.332, + "step": 33630 + }, + { + "epoch": 5.559181987192729, + "grad_norm": 8.407470703125, + "learning_rate": 2.4669935180594578e-05, + "loss": 1.5482, + "step": 33640 + }, + { + "epoch": 5.560834538318529, + "grad_norm": 5.60792350769043, + "learning_rate": 2.4660753961695957e-05, + "loss": 1.4146, + "step": 33650 + }, + { + "epoch": 5.56248708944433, + "grad_norm": 9.023984909057617, + "learning_rate": 2.4651572742797337e-05, + "loss": 1.4729, + "step": 33660 + }, + { + "epoch": 5.56413964057013, + "grad_norm": 9.371933937072754, + "learning_rate": 2.4642391523898713e-05, + "loss": 1.4933, + "step": 33670 + }, + { + "epoch": 5.56579219169593, + "grad_norm": 11.463881492614746, + "learning_rate": 2.4633210305000092e-05, + "loss": 1.3909, + "step": 33680 + }, + { + "epoch": 5.567444742821731, + "grad_norm": 22.128860473632812, + "learning_rate": 2.462402908610147e-05, + "loss": 1.5586, + "step": 33690 + }, + { + "epoch": 5.569097293947531, + "grad_norm": 7.879482746124268, + "learning_rate": 2.461484786720285e-05, + "loss": 1.4793, + "step": 33700 + }, + { + "epoch": 5.5707498450733315, + "grad_norm": 16.057891845703125, + "learning_rate": 2.460566664830423e-05, + "loss": 1.4825, + "step": 33710 + }, + { + "epoch": 5.5724023961991325, + "grad_norm": 21.956098556518555, + "learning_rate": 2.459648542940561e-05, + "loss": 1.4789, + "step": 33720 + }, + { + "epoch": 5.574054947324933, + "grad_norm": 10.628647804260254, + "learning_rate": 2.4587304210506988e-05, + "loss": 1.4485, + "step": 33730 + }, + { + "epoch": 5.5757074984507335, + "grad_norm": 22.851346969604492, + "learning_rate": 2.4578122991608367e-05, + "loss": 1.5087, + "step": 33740 + }, + { + "epoch": 5.577360049576534, + "grad_norm": 7.7167744636535645, + "learning_rate": 2.4568941772709747e-05, + "loss": 1.4648, + "step": 33750 + }, + { + "epoch": 5.579012600702335, + "grad_norm": 32.65045166015625, + "learning_rate": 2.4559760553811126e-05, + "loss": 1.4757, + "step": 33760 + }, + { + "epoch": 
5.580665151828135, + "grad_norm": 9.209506034851074, + "learning_rate": 2.4550579334912505e-05, + "loss": 1.4859, + "step": 33770 + }, + { + "epoch": 5.582317702953935, + "grad_norm": 10.900243759155273, + "learning_rate": 2.4541398116013884e-05, + "loss": 1.5101, + "step": 33780 + }, + { + "epoch": 5.583970254079736, + "grad_norm": 13.830432891845703, + "learning_rate": 2.4532216897115264e-05, + "loss": 1.5847, + "step": 33790 + }, + { + "epoch": 5.585622805205536, + "grad_norm": 13.913232803344727, + "learning_rate": 2.452303567821664e-05, + "loss": 1.419, + "step": 33800 + }, + { + "epoch": 5.587275356331337, + "grad_norm": 19.409276962280273, + "learning_rate": 2.451385445931802e-05, + "loss": 1.5777, + "step": 33810 + }, + { + "epoch": 5.588927907457137, + "grad_norm": 14.467093467712402, + "learning_rate": 2.4504673240419398e-05, + "loss": 1.635, + "step": 33820 + }, + { + "epoch": 5.590580458582937, + "grad_norm": 9.140510559082031, + "learning_rate": 2.4495492021520777e-05, + "loss": 1.5292, + "step": 33830 + }, + { + "epoch": 5.592233009708738, + "grad_norm": 7.822348594665527, + "learning_rate": 2.4486310802622156e-05, + "loss": 1.3846, + "step": 33840 + }, + { + "epoch": 5.593885560834538, + "grad_norm": 9.017069816589355, + "learning_rate": 2.447712958372354e-05, + "loss": 1.4916, + "step": 33850 + }, + { + "epoch": 5.595538111960339, + "grad_norm": 11.385196685791016, + "learning_rate": 2.4467948364824915e-05, + "loss": 1.508, + "step": 33860 + }, + { + "epoch": 5.597190663086139, + "grad_norm": 7.629830837249756, + "learning_rate": 2.4458767145926294e-05, + "loss": 1.4346, + "step": 33870 + }, + { + "epoch": 5.598843214211939, + "grad_norm": 7.943394184112549, + "learning_rate": 2.4449585927027673e-05, + "loss": 1.4446, + "step": 33880 + }, + { + "epoch": 5.60049576533774, + "grad_norm": 12.818558692932129, + "learning_rate": 2.4440404708129053e-05, + "loss": 1.4871, + "step": 33890 + }, + { + "epoch": 5.60214831646354, + "grad_norm": 8.883275985717773, + "learning_rate": 2.4431223489230432e-05, + "loss": 1.4815, + "step": 33900 + }, + { + "epoch": 5.603800867589341, + "grad_norm": 12.792842864990234, + "learning_rate": 2.442204227033181e-05, + "loss": 1.4886, + "step": 33910 + }, + { + "epoch": 5.6054534187151415, + "grad_norm": 9.756999015808105, + "learning_rate": 2.441286105143319e-05, + "loss": 1.4582, + "step": 33920 + }, + { + "epoch": 5.6071059698409424, + "grad_norm": 11.55717658996582, + "learning_rate": 2.4403679832534566e-05, + "loss": 1.6197, + "step": 33930 + }, + { + "epoch": 5.6087585209667425, + "grad_norm": 6.848897933959961, + "learning_rate": 2.4394498613635945e-05, + "loss": 1.4918, + "step": 33940 + }, + { + "epoch": 5.610411072092543, + "grad_norm": 7.045119762420654, + "learning_rate": 2.4385317394737325e-05, + "loss": 1.6231, + "step": 33950 + }, + { + "epoch": 5.612063623218344, + "grad_norm": 12.93820571899414, + "learning_rate": 2.4376136175838707e-05, + "loss": 1.4263, + "step": 33960 + }, + { + "epoch": 5.613716174344144, + "grad_norm": 11.486026763916016, + "learning_rate": 2.4366954956940087e-05, + "loss": 1.4914, + "step": 33970 + }, + { + "epoch": 5.615368725469944, + "grad_norm": 9.217084884643555, + "learning_rate": 2.4357773738041466e-05, + "loss": 1.4765, + "step": 33980 + }, + { + "epoch": 5.617021276595745, + "grad_norm": 20.32526206970215, + "learning_rate": 2.4348592519142842e-05, + "loss": 1.493, + "step": 33990 + }, + { + "epoch": 5.618673827721545, + "grad_norm": 12.086780548095703, + "learning_rate": 2.433941130024422e-05, + 
"loss": 1.5825, + "step": 34000 + }, + { + "epoch": 5.620326378847346, + "grad_norm": 11.141498565673828, + "learning_rate": 2.43302300813456e-05, + "loss": 1.4825, + "step": 34010 + }, + { + "epoch": 5.621978929973146, + "grad_norm": 6.86226224899292, + "learning_rate": 2.432104886244698e-05, + "loss": 1.5988, + "step": 34020 + }, + { + "epoch": 5.623631481098947, + "grad_norm": 7.528665542602539, + "learning_rate": 2.431186764354836e-05, + "loss": 1.5774, + "step": 34030 + }, + { + "epoch": 5.625284032224747, + "grad_norm": 11.368343353271484, + "learning_rate": 2.4302686424649738e-05, + "loss": 1.3792, + "step": 34040 + }, + { + "epoch": 5.626936583350547, + "grad_norm": 5.986963272094727, + "learning_rate": 2.4293505205751117e-05, + "loss": 1.4285, + "step": 34050 + }, + { + "epoch": 5.628589134476348, + "grad_norm": 9.011881828308105, + "learning_rate": 2.4284323986852493e-05, + "loss": 1.476, + "step": 34060 + }, + { + "epoch": 5.630241685602148, + "grad_norm": 8.9256010055542, + "learning_rate": 2.4275142767953872e-05, + "loss": 1.4706, + "step": 34070 + }, + { + "epoch": 5.631894236727948, + "grad_norm": 34.386566162109375, + "learning_rate": 2.4265961549055255e-05, + "loss": 1.3625, + "step": 34080 + }, + { + "epoch": 5.633546787853749, + "grad_norm": 15.83523178100586, + "learning_rate": 2.4256780330156634e-05, + "loss": 1.3465, + "step": 34090 + }, + { + "epoch": 5.635199338979549, + "grad_norm": 9.349414825439453, + "learning_rate": 2.4247599111258013e-05, + "loss": 1.4396, + "step": 34100 + }, + { + "epoch": 5.63685189010535, + "grad_norm": 13.072619438171387, + "learning_rate": 2.4238417892359393e-05, + "loss": 1.5748, + "step": 34110 + }, + { + "epoch": 5.6385044412311505, + "grad_norm": 8.265012741088867, + "learning_rate": 2.422923667346077e-05, + "loss": 1.4326, + "step": 34120 + }, + { + "epoch": 5.6401569923569514, + "grad_norm": 14.93253231048584, + "learning_rate": 2.4220055454562148e-05, + "loss": 1.4903, + "step": 34130 + }, + { + "epoch": 5.6418095434827515, + "grad_norm": 8.096601486206055, + "learning_rate": 2.4210874235663527e-05, + "loss": 1.384, + "step": 34140 + }, + { + "epoch": 5.643462094608552, + "grad_norm": 7.2971343994140625, + "learning_rate": 2.4201693016764906e-05, + "loss": 1.4753, + "step": 34150 + }, + { + "epoch": 5.645114645734353, + "grad_norm": 7.540863990783691, + "learning_rate": 2.4192511797866286e-05, + "loss": 1.3667, + "step": 34160 + }, + { + "epoch": 5.646767196860153, + "grad_norm": 5.347985744476318, + "learning_rate": 2.4183330578967665e-05, + "loss": 1.5807, + "step": 34170 + }, + { + "epoch": 5.648419747985954, + "grad_norm": 10.622434616088867, + "learning_rate": 2.4174149360069044e-05, + "loss": 1.5232, + "step": 34180 + }, + { + "epoch": 5.650072299111754, + "grad_norm": 17.396007537841797, + "learning_rate": 2.4164968141170423e-05, + "loss": 1.4468, + "step": 34190 + }, + { + "epoch": 5.651724850237554, + "grad_norm": 17.4361629486084, + "learning_rate": 2.4155786922271803e-05, + "loss": 1.5216, + "step": 34200 + }, + { + "epoch": 5.653377401363355, + "grad_norm": 12.362852096557617, + "learning_rate": 2.4146605703373182e-05, + "loss": 1.5796, + "step": 34210 + }, + { + "epoch": 5.655029952489155, + "grad_norm": 8.334657669067383, + "learning_rate": 2.413742448447456e-05, + "loss": 1.5931, + "step": 34220 + }, + { + "epoch": 5.656682503614956, + "grad_norm": 15.316583633422852, + "learning_rate": 2.412824326557594e-05, + "loss": 1.5869, + "step": 34230 + }, + { + "epoch": 5.658335054740756, + "grad_norm": 17.858875274658203, 
+ "learning_rate": 2.411906204667732e-05, + "loss": 1.5354, + "step": 34240 + }, + { + "epoch": 5.659987605866556, + "grad_norm": 15.654756546020508, + "learning_rate": 2.4109880827778695e-05, + "loss": 1.5952, + "step": 34250 + }, + { + "epoch": 5.661640156992357, + "grad_norm": 5.612504005432129, + "learning_rate": 2.4100699608880075e-05, + "loss": 1.3402, + "step": 34260 + }, + { + "epoch": 5.663292708118157, + "grad_norm": 12.783700942993164, + "learning_rate": 2.4091518389981454e-05, + "loss": 1.4176, + "step": 34270 + }, + { + "epoch": 5.664945259243958, + "grad_norm": 7.64247989654541, + "learning_rate": 2.4082337171082833e-05, + "loss": 1.4333, + "step": 34280 + }, + { + "epoch": 5.666597810369758, + "grad_norm": 7.911691665649414, + "learning_rate": 2.4073155952184212e-05, + "loss": 1.3645, + "step": 34290 + }, + { + "epoch": 5.668250361495558, + "grad_norm": 8.747984886169434, + "learning_rate": 2.406397473328559e-05, + "loss": 1.5304, + "step": 34300 + }, + { + "epoch": 5.669902912621359, + "grad_norm": 12.237468719482422, + "learning_rate": 2.405479351438697e-05, + "loss": 1.3955, + "step": 34310 + }, + { + "epoch": 5.6715554637471595, + "grad_norm": 9.102463722229004, + "learning_rate": 2.404561229548835e-05, + "loss": 1.4228, + "step": 34320 + }, + { + "epoch": 5.6732080148729604, + "grad_norm": 11.882768630981445, + "learning_rate": 2.403643107658973e-05, + "loss": 1.3596, + "step": 34330 + }, + { + "epoch": 5.6748605659987605, + "grad_norm": 10.731307029724121, + "learning_rate": 2.402724985769111e-05, + "loss": 1.6394, + "step": 34340 + }, + { + "epoch": 5.676513117124561, + "grad_norm": 13.682147026062012, + "learning_rate": 2.4018068638792488e-05, + "loss": 1.4626, + "step": 34350 + }, + { + "epoch": 5.678165668250362, + "grad_norm": 14.700006484985352, + "learning_rate": 2.4008887419893867e-05, + "loss": 1.5381, + "step": 34360 + }, + { + "epoch": 5.679818219376162, + "grad_norm": 8.677534103393555, + "learning_rate": 2.3999706200995246e-05, + "loss": 1.3861, + "step": 34370 + }, + { + "epoch": 5.681470770501963, + "grad_norm": 9.908595085144043, + "learning_rate": 2.3990524982096622e-05, + "loss": 1.3655, + "step": 34380 + }, + { + "epoch": 5.683123321627763, + "grad_norm": 22.74381446838379, + "learning_rate": 2.3981343763198e-05, + "loss": 1.479, + "step": 34390 + }, + { + "epoch": 5.684775872753564, + "grad_norm": 13.582764625549316, + "learning_rate": 2.397216254429938e-05, + "loss": 1.3896, + "step": 34400 + }, + { + "epoch": 5.686428423879364, + "grad_norm": 21.719499588012695, + "learning_rate": 2.396298132540076e-05, + "loss": 1.6054, + "step": 34410 + }, + { + "epoch": 5.688080975005164, + "grad_norm": 12.603348731994629, + "learning_rate": 2.3953800106502143e-05, + "loss": 1.4277, + "step": 34420 + }, + { + "epoch": 5.689733526130965, + "grad_norm": 9.63592529296875, + "learning_rate": 2.3944618887603522e-05, + "loss": 1.485, + "step": 34430 + }, + { + "epoch": 5.691386077256765, + "grad_norm": 13.966578483581543, + "learning_rate": 2.3935437668704898e-05, + "loss": 1.5003, + "step": 34440 + }, + { + "epoch": 5.693038628382565, + "grad_norm": 13.241275787353516, + "learning_rate": 2.3926256449806277e-05, + "loss": 1.5059, + "step": 34450 + }, + { + "epoch": 5.694691179508366, + "grad_norm": 18.274063110351562, + "learning_rate": 2.3917075230907656e-05, + "loss": 1.4191, + "step": 34460 + }, + { + "epoch": 5.696343730634166, + "grad_norm": 6.728181838989258, + "learning_rate": 2.3907894012009035e-05, + "loss": 1.5287, + "step": 34470 + }, + { + "epoch": 
5.697996281759967, + "grad_norm": 9.49169921875, + "learning_rate": 2.3898712793110415e-05, + "loss": 1.5762, + "step": 34480 + }, + { + "epoch": 5.699648832885767, + "grad_norm": 13.1586332321167, + "learning_rate": 2.3889531574211794e-05, + "loss": 1.4773, + "step": 34490 + }, + { + "epoch": 5.701301384011568, + "grad_norm": 36.238487243652344, + "learning_rate": 2.3880350355313173e-05, + "loss": 1.4757, + "step": 34500 + }, + { + "epoch": 5.702953935137368, + "grad_norm": 14.713821411132812, + "learning_rate": 2.387116913641455e-05, + "loss": 1.3241, + "step": 34510 + }, + { + "epoch": 5.7046064862631685, + "grad_norm": 8.172168731689453, + "learning_rate": 2.3861987917515928e-05, + "loss": 1.4352, + "step": 34520 + }, + { + "epoch": 5.7062590373889694, + "grad_norm": 12.12569808959961, + "learning_rate": 2.3852806698617308e-05, + "loss": 1.6369, + "step": 34530 + }, + { + "epoch": 5.7079115885147695, + "grad_norm": 8.741676330566406, + "learning_rate": 2.384362547971869e-05, + "loss": 1.4149, + "step": 34540 + }, + { + "epoch": 5.70956413964057, + "grad_norm": 15.845980644226074, + "learning_rate": 2.383444426082007e-05, + "loss": 1.4781, + "step": 34550 + }, + { + "epoch": 5.711216690766371, + "grad_norm": 10.60511302947998, + "learning_rate": 2.382526304192145e-05, + "loss": 1.4277, + "step": 34560 + }, + { + "epoch": 5.712869241892171, + "grad_norm": 12.912707328796387, + "learning_rate": 2.3816081823022825e-05, + "loss": 1.4863, + "step": 34570 + }, + { + "epoch": 5.714521793017972, + "grad_norm": 7.532742977142334, + "learning_rate": 2.3806900604124204e-05, + "loss": 1.3594, + "step": 34580 + }, + { + "epoch": 5.716174344143772, + "grad_norm": 109.51636505126953, + "learning_rate": 2.3797719385225583e-05, + "loss": 1.4491, + "step": 34590 + }, + { + "epoch": 5.717826895269573, + "grad_norm": 25.65382957458496, + "learning_rate": 2.3788538166326962e-05, + "loss": 1.5347, + "step": 34600 + }, + { + "epoch": 5.719479446395373, + "grad_norm": 6.745087146759033, + "learning_rate": 2.377935694742834e-05, + "loss": 1.5059, + "step": 34610 + }, + { + "epoch": 5.721131997521173, + "grad_norm": 8.282033920288086, + "learning_rate": 2.377017572852972e-05, + "loss": 1.4472, + "step": 34620 + }, + { + "epoch": 5.722784548646974, + "grad_norm": 12.051273345947266, + "learning_rate": 2.37609945096311e-05, + "loss": 1.4371, + "step": 34630 + }, + { + "epoch": 5.724437099772774, + "grad_norm": 7.050853252410889, + "learning_rate": 2.3751813290732476e-05, + "loss": 1.5519, + "step": 34640 + }, + { + "epoch": 5.726089650898575, + "grad_norm": 29.751380920410156, + "learning_rate": 2.374263207183386e-05, + "loss": 1.38, + "step": 34650 + }, + { + "epoch": 5.727742202024375, + "grad_norm": 17.988632202148438, + "learning_rate": 2.3733450852935238e-05, + "loss": 1.4125, + "step": 34660 + }, + { + "epoch": 5.729394753150175, + "grad_norm": 19.773149490356445, + "learning_rate": 2.3724269634036617e-05, + "loss": 1.3835, + "step": 34670 + }, + { + "epoch": 5.731047304275976, + "grad_norm": 23.994436264038086, + "learning_rate": 2.3715088415137996e-05, + "loss": 1.4931, + "step": 34680 + }, + { + "epoch": 5.732699855401776, + "grad_norm": 24.97893714904785, + "learning_rate": 2.3705907196239375e-05, + "loss": 1.3949, + "step": 34690 + }, + { + "epoch": 5.734352406527577, + "grad_norm": 6.368709564208984, + "learning_rate": 2.369672597734075e-05, + "loss": 1.3729, + "step": 34700 + }, + { + "epoch": 5.736004957653377, + "grad_norm": 14.183878898620605, + "learning_rate": 2.368754475844213e-05, + "loss": 
1.5165, + "step": 34710 + }, + { + "epoch": 5.7376575087791775, + "grad_norm": 11.956380844116211, + "learning_rate": 2.367836353954351e-05, + "loss": 1.4883, + "step": 34720 + }, + { + "epoch": 5.7393100599049784, + "grad_norm": 16.989643096923828, + "learning_rate": 2.366918232064489e-05, + "loss": 1.5531, + "step": 34730 + }, + { + "epoch": 5.7409626110307785, + "grad_norm": 14.136730194091797, + "learning_rate": 2.366000110174627e-05, + "loss": 1.4212, + "step": 34740 + }, + { + "epoch": 5.7426151621565795, + "grad_norm": 10.436883926391602, + "learning_rate": 2.3650819882847648e-05, + "loss": 1.5093, + "step": 34750 + }, + { + "epoch": 5.74426771328238, + "grad_norm": 23.723215103149414, + "learning_rate": 2.3641638663949027e-05, + "loss": 1.5963, + "step": 34760 + }, + { + "epoch": 5.745920264408181, + "grad_norm": 29.90015983581543, + "learning_rate": 2.3632457445050406e-05, + "loss": 1.5902, + "step": 34770 + }, + { + "epoch": 5.747572815533981, + "grad_norm": 20.19734764099121, + "learning_rate": 2.3623276226151785e-05, + "loss": 1.3964, + "step": 34780 + }, + { + "epoch": 5.749225366659781, + "grad_norm": 7.617761611938477, + "learning_rate": 2.3614095007253165e-05, + "loss": 1.4068, + "step": 34790 + }, + { + "epoch": 5.750877917785582, + "grad_norm": 12.926080703735352, + "learning_rate": 2.3604913788354544e-05, + "loss": 1.4668, + "step": 34800 + }, + { + "epoch": 5.752530468911382, + "grad_norm": 10.092638969421387, + "learning_rate": 2.3595732569455923e-05, + "loss": 1.526, + "step": 34810 + }, + { + "epoch": 5.754183020037182, + "grad_norm": 16.717660903930664, + "learning_rate": 2.3586551350557302e-05, + "loss": 1.5049, + "step": 34820 + }, + { + "epoch": 5.755835571162983, + "grad_norm": 9.57276439666748, + "learning_rate": 2.3577370131658678e-05, + "loss": 1.4629, + "step": 34830 + }, + { + "epoch": 5.757488122288783, + "grad_norm": 8.01769733428955, + "learning_rate": 2.3568188912760057e-05, + "loss": 1.4832, + "step": 34840 + }, + { + "epoch": 5.759140673414584, + "grad_norm": 11.659310340881348, + "learning_rate": 2.3559007693861437e-05, + "loss": 1.4063, + "step": 34850 + }, + { + "epoch": 5.760793224540384, + "grad_norm": 11.5115327835083, + "learning_rate": 2.3549826474962816e-05, + "loss": 1.4312, + "step": 34860 + }, + { + "epoch": 5.762445775666185, + "grad_norm": 8.317569732666016, + "learning_rate": 2.3540645256064195e-05, + "loss": 1.4929, + "step": 34870 + }, + { + "epoch": 5.764098326791985, + "grad_norm": 12.994110107421875, + "learning_rate": 2.3531464037165578e-05, + "loss": 1.4944, + "step": 34880 + }, + { + "epoch": 5.765750877917785, + "grad_norm": 8.37218189239502, + "learning_rate": 2.3522282818266954e-05, + "loss": 1.4133, + "step": 34890 + }, + { + "epoch": 5.767403429043586, + "grad_norm": 9.333100318908691, + "learning_rate": 2.3513101599368333e-05, + "loss": 1.4405, + "step": 34900 + }, + { + "epoch": 5.769055980169386, + "grad_norm": 17.137237548828125, + "learning_rate": 2.3503920380469712e-05, + "loss": 1.513, + "step": 34910 + }, + { + "epoch": 5.7707085312951865, + "grad_norm": 12.368675231933594, + "learning_rate": 2.349473916157109e-05, + "loss": 1.3646, + "step": 34920 + }, + { + "epoch": 5.772361082420987, + "grad_norm": 14.37025260925293, + "learning_rate": 2.348555794267247e-05, + "loss": 1.5304, + "step": 34930 + }, + { + "epoch": 5.7740136335467875, + "grad_norm": 40.79022216796875, + "learning_rate": 2.347637672377385e-05, + "loss": 1.3919, + "step": 34940 + }, + { + "epoch": 5.7756661846725885, + "grad_norm": 8.884847640991211, + 
"learning_rate": 2.346719550487523e-05, + "loss": 1.4815, + "step": 34950 + }, + { + "epoch": 5.777318735798389, + "grad_norm": 9.6146879196167, + "learning_rate": 2.3458014285976605e-05, + "loss": 1.4746, + "step": 34960 + }, + { + "epoch": 5.77897128692419, + "grad_norm": 18.530406951904297, + "learning_rate": 2.3448833067077984e-05, + "loss": 1.644, + "step": 34970 + }, + { + "epoch": 5.78062383804999, + "grad_norm": 26.01064109802246, + "learning_rate": 2.3439651848179364e-05, + "loss": 1.5561, + "step": 34980 + }, + { + "epoch": 5.78227638917579, + "grad_norm": 6.7076239585876465, + "learning_rate": 2.3430470629280746e-05, + "loss": 1.441, + "step": 34990 + }, + { + "epoch": 5.783928940301591, + "grad_norm": 58.96223831176758, + "learning_rate": 2.3421289410382125e-05, + "loss": 1.4621, + "step": 35000 + }, + { + "epoch": 5.785581491427391, + "grad_norm": 12.242283821105957, + "learning_rate": 2.3412108191483505e-05, + "loss": 1.4991, + "step": 35010 + }, + { + "epoch": 5.787234042553192, + "grad_norm": 60.0484619140625, + "learning_rate": 2.340292697258488e-05, + "loss": 1.4799, + "step": 35020 + }, + { + "epoch": 5.788886593678992, + "grad_norm": 15.15187931060791, + "learning_rate": 2.339374575368626e-05, + "loss": 1.5587, + "step": 35030 + }, + { + "epoch": 5.790539144804792, + "grad_norm": 14.363666534423828, + "learning_rate": 2.338456453478764e-05, + "loss": 1.5531, + "step": 35040 + }, + { + "epoch": 5.792191695930593, + "grad_norm": 9.732187271118164, + "learning_rate": 2.3375383315889018e-05, + "loss": 1.5367, + "step": 35050 + }, + { + "epoch": 5.793844247056393, + "grad_norm": 8.981781005859375, + "learning_rate": 2.3366202096990397e-05, + "loss": 1.4947, + "step": 35060 + }, + { + "epoch": 5.795496798182194, + "grad_norm": 10.503220558166504, + "learning_rate": 2.3357020878091777e-05, + "loss": 1.3545, + "step": 35070 + }, + { + "epoch": 5.797149349307994, + "grad_norm": 8.12704849243164, + "learning_rate": 2.3347839659193156e-05, + "loss": 1.5702, + "step": 35080 + }, + { + "epoch": 5.798801900433794, + "grad_norm": 13.842459678649902, + "learning_rate": 2.3338658440294532e-05, + "loss": 1.4373, + "step": 35090 + }, + { + "epoch": 5.800454451559595, + "grad_norm": 11.6895170211792, + "learning_rate": 2.332947722139591e-05, + "loss": 1.3984, + "step": 35100 + }, + { + "epoch": 5.802107002685395, + "grad_norm": 55.09220886230469, + "learning_rate": 2.3320296002497294e-05, + "loss": 1.5888, + "step": 35110 + }, + { + "epoch": 5.803759553811196, + "grad_norm": 10.366085052490234, + "learning_rate": 2.3311114783598673e-05, + "loss": 1.4724, + "step": 35120 + }, + { + "epoch": 5.805412104936996, + "grad_norm": 12.669951438903809, + "learning_rate": 2.3301933564700052e-05, + "loss": 1.595, + "step": 35130 + }, + { + "epoch": 5.807064656062797, + "grad_norm": 9.713029861450195, + "learning_rate": 2.329275234580143e-05, + "loss": 1.3403, + "step": 35140 + }, + { + "epoch": 5.8087172071885975, + "grad_norm": 10.147992134094238, + "learning_rate": 2.3283571126902807e-05, + "loss": 1.4291, + "step": 35150 + }, + { + "epoch": 5.810369758314398, + "grad_norm": 8.817466735839844, + "learning_rate": 2.3274389908004187e-05, + "loss": 1.4716, + "step": 35160 + }, + { + "epoch": 5.812022309440199, + "grad_norm": 17.47516441345215, + "learning_rate": 2.3265208689105566e-05, + "loss": 1.4725, + "step": 35170 + }, + { + "epoch": 5.813674860565999, + "grad_norm": 33.834625244140625, + "learning_rate": 2.3256027470206945e-05, + "loss": 1.5657, + "step": 35180 + }, + { + "epoch": 
5.815327411691799, + "grad_norm": 14.494001388549805, + "learning_rate": 2.3246846251308324e-05, + "loss": 1.486, + "step": 35190 + }, + { + "epoch": 5.8169799628176, + "grad_norm": 10.042065620422363, + "learning_rate": 2.3237665032409704e-05, + "loss": 1.4782, + "step": 35200 + }, + { + "epoch": 5.8186325139434, + "grad_norm": 8.86281681060791, + "learning_rate": 2.3228483813511083e-05, + "loss": 1.5148, + "step": 35210 + }, + { + "epoch": 5.820285065069201, + "grad_norm": 17.04343605041504, + "learning_rate": 2.3219302594612462e-05, + "loss": 1.3849, + "step": 35220 + }, + { + "epoch": 5.821937616195001, + "grad_norm": 11.681707382202148, + "learning_rate": 2.321012137571384e-05, + "loss": 1.4524, + "step": 35230 + }, + { + "epoch": 5.823590167320802, + "grad_norm": 15.696507453918457, + "learning_rate": 2.320094015681522e-05, + "loss": 1.4256, + "step": 35240 + }, + { + "epoch": 5.825242718446602, + "grad_norm": 21.752723693847656, + "learning_rate": 2.31917589379166e-05, + "loss": 1.376, + "step": 35250 + }, + { + "epoch": 5.826895269572402, + "grad_norm": 16.060373306274414, + "learning_rate": 2.318257771901798e-05, + "loss": 1.514, + "step": 35260 + }, + { + "epoch": 5.828547820698203, + "grad_norm": 51.685455322265625, + "learning_rate": 2.3173396500119358e-05, + "loss": 1.433, + "step": 35270 + }, + { + "epoch": 5.830200371824003, + "grad_norm": 8.148418426513672, + "learning_rate": 2.3164215281220734e-05, + "loss": 1.4283, + "step": 35280 + }, + { + "epoch": 5.831852922949803, + "grad_norm": 7.561429977416992, + "learning_rate": 2.3155034062322113e-05, + "loss": 1.4483, + "step": 35290 + }, + { + "epoch": 5.833505474075604, + "grad_norm": 13.371988296508789, + "learning_rate": 2.3145852843423493e-05, + "loss": 1.4895, + "step": 35300 + }, + { + "epoch": 5.835158025201404, + "grad_norm": 15.92422103881836, + "learning_rate": 2.3136671624524872e-05, + "loss": 1.3263, + "step": 35310 + }, + { + "epoch": 5.836810576327205, + "grad_norm": 49.8977165222168, + "learning_rate": 2.312749040562625e-05, + "loss": 1.5131, + "step": 35320 + }, + { + "epoch": 5.838463127453005, + "grad_norm": 11.140594482421875, + "learning_rate": 2.311830918672763e-05, + "loss": 1.5284, + "step": 35330 + }, + { + "epoch": 5.840115678578806, + "grad_norm": 9.05160903930664, + "learning_rate": 2.310912796782901e-05, + "loss": 1.515, + "step": 35340 + }, + { + "epoch": 5.8417682297046065, + "grad_norm": 11.951143264770508, + "learning_rate": 2.309994674893039e-05, + "loss": 1.5105, + "step": 35350 + }, + { + "epoch": 5.843420780830407, + "grad_norm": 10.511120796203613, + "learning_rate": 2.3090765530031768e-05, + "loss": 1.4417, + "step": 35360 + }, + { + "epoch": 5.845073331956208, + "grad_norm": 18.432544708251953, + "learning_rate": 2.3081584311133147e-05, + "loss": 1.5113, + "step": 35370 + }, + { + "epoch": 5.846725883082008, + "grad_norm": 12.873449325561523, + "learning_rate": 2.3072403092234527e-05, + "loss": 1.4663, + "step": 35380 + }, + { + "epoch": 5.848378434207809, + "grad_norm": 7.534992218017578, + "learning_rate": 2.3063221873335906e-05, + "loss": 1.4698, + "step": 35390 + }, + { + "epoch": 5.850030985333609, + "grad_norm": 6.996005535125732, + "learning_rate": 2.3054040654437285e-05, + "loss": 1.3313, + "step": 35400 + }, + { + "epoch": 5.851683536459409, + "grad_norm": 122.31879425048828, + "learning_rate": 2.304485943553866e-05, + "loss": 1.3871, + "step": 35410 + }, + { + "epoch": 5.85333608758521, + "grad_norm": 10.604686737060547, + "learning_rate": 2.303567821664004e-05, + "loss": 
1.5325, + "step": 35420 + }, + { + "epoch": 5.85498863871101, + "grad_norm": 16.7020263671875, + "learning_rate": 2.302649699774142e-05, + "loss": 1.5011, + "step": 35430 + }, + { + "epoch": 5.856641189836811, + "grad_norm": 17.591352462768555, + "learning_rate": 2.30173157788428e-05, + "loss": 1.4792, + "step": 35440 + }, + { + "epoch": 5.858293740962611, + "grad_norm": 10.369803428649902, + "learning_rate": 2.300813455994418e-05, + "loss": 1.4176, + "step": 35450 + }, + { + "epoch": 5.859946292088411, + "grad_norm": 41.722450256347656, + "learning_rate": 2.299895334104556e-05, + "loss": 1.3517, + "step": 35460 + }, + { + "epoch": 5.861598843214212, + "grad_norm": 13.766722679138184, + "learning_rate": 2.2989772122146936e-05, + "loss": 1.5139, + "step": 35470 + }, + { + "epoch": 5.863251394340012, + "grad_norm": 11.477117538452148, + "learning_rate": 2.2980590903248316e-05, + "loss": 1.3467, + "step": 35480 + }, + { + "epoch": 5.864903945465813, + "grad_norm": 7.684751033782959, + "learning_rate": 2.2971409684349695e-05, + "loss": 1.3977, + "step": 35490 + }, + { + "epoch": 5.866556496591613, + "grad_norm": 19.736038208007812, + "learning_rate": 2.2962228465451074e-05, + "loss": 1.4578, + "step": 35500 + }, + { + "epoch": 5.868209047717414, + "grad_norm": 9.840457916259766, + "learning_rate": 2.2953047246552453e-05, + "loss": 1.5332, + "step": 35510 + }, + { + "epoch": 5.869861598843214, + "grad_norm": 10.037358283996582, + "learning_rate": 2.2943866027653833e-05, + "loss": 1.4446, + "step": 35520 + }, + { + "epoch": 5.871514149969014, + "grad_norm": 12.01092529296875, + "learning_rate": 2.2934684808755212e-05, + "loss": 1.4489, + "step": 35530 + }, + { + "epoch": 5.873166701094815, + "grad_norm": 11.730144500732422, + "learning_rate": 2.292550358985659e-05, + "loss": 1.4665, + "step": 35540 + }, + { + "epoch": 5.8748192522206155, + "grad_norm": 9.427178382873535, + "learning_rate": 2.2916322370957967e-05, + "loss": 1.4231, + "step": 35550 + }, + { + "epoch": 5.876471803346416, + "grad_norm": 9.14688491821289, + "learning_rate": 2.290714115205935e-05, + "loss": 1.4917, + "step": 35560 + }, + { + "epoch": 5.878124354472217, + "grad_norm": 8.813817977905273, + "learning_rate": 2.289795993316073e-05, + "loss": 1.386, + "step": 35570 + }, + { + "epoch": 5.879776905598017, + "grad_norm": 13.4219970703125, + "learning_rate": 2.2888778714262108e-05, + "loss": 1.6158, + "step": 35580 + }, + { + "epoch": 5.881429456723818, + "grad_norm": 18.55006217956543, + "learning_rate": 2.2879597495363487e-05, + "loss": 1.5049, + "step": 35590 + }, + { + "epoch": 5.883082007849618, + "grad_norm": 10.324186325073242, + "learning_rate": 2.2870416276464863e-05, + "loss": 1.4228, + "step": 35600 + }, + { + "epoch": 5.884734558975419, + "grad_norm": 14.822036743164062, + "learning_rate": 2.2861235057566243e-05, + "loss": 1.4369, + "step": 35610 + }, + { + "epoch": 5.886387110101219, + "grad_norm": 9.402215003967285, + "learning_rate": 2.2852053838667622e-05, + "loss": 1.5191, + "step": 35620 + }, + { + "epoch": 5.888039661227019, + "grad_norm": 15.71996021270752, + "learning_rate": 2.2842872619769e-05, + "loss": 1.4482, + "step": 35630 + }, + { + "epoch": 5.88969221235282, + "grad_norm": 6.093387126922607, + "learning_rate": 2.283369140087038e-05, + "loss": 1.4423, + "step": 35640 + }, + { + "epoch": 5.89134476347862, + "grad_norm": 9.502908706665039, + "learning_rate": 2.282451018197176e-05, + "loss": 1.4896, + "step": 35650 + }, + { + "epoch": 5.89299731460442, + "grad_norm": 10.822142601013184, + 
"learning_rate": 2.281532896307314e-05, + "loss": 1.5373, + "step": 35660 + }, + { + "epoch": 5.894649865730221, + "grad_norm": 12.028356552124023, + "learning_rate": 2.2806147744174518e-05, + "loss": 1.6253, + "step": 35670 + }, + { + "epoch": 5.896302416856021, + "grad_norm": 9.975237846374512, + "learning_rate": 2.2796966525275897e-05, + "loss": 1.57, + "step": 35680 + }, + { + "epoch": 5.897954967981822, + "grad_norm": 8.391766548156738, + "learning_rate": 2.2787785306377277e-05, + "loss": 1.5282, + "step": 35690 + }, + { + "epoch": 5.899607519107622, + "grad_norm": 13.360061645507812, + "learning_rate": 2.2778604087478656e-05, + "loss": 1.5431, + "step": 35700 + }, + { + "epoch": 5.901260070233423, + "grad_norm": 6.314427375793457, + "learning_rate": 2.2769422868580035e-05, + "loss": 1.5607, + "step": 35710 + }, + { + "epoch": 5.902912621359223, + "grad_norm": 7.808109283447266, + "learning_rate": 2.2760241649681414e-05, + "loss": 1.4794, + "step": 35720 + }, + { + "epoch": 5.904565172485023, + "grad_norm": 9.234506607055664, + "learning_rate": 2.275106043078279e-05, + "loss": 1.3842, + "step": 35730 + }, + { + "epoch": 5.906217723610824, + "grad_norm": 35.93871307373047, + "learning_rate": 2.274187921188417e-05, + "loss": 1.3852, + "step": 35740 + }, + { + "epoch": 5.9078702747366245, + "grad_norm": 10.612404823303223, + "learning_rate": 2.273269799298555e-05, + "loss": 1.5685, + "step": 35750 + }, + { + "epoch": 5.9095228258624255, + "grad_norm": 43.04104232788086, + "learning_rate": 2.2723516774086928e-05, + "loss": 1.3909, + "step": 35760 + }, + { + "epoch": 5.911175376988226, + "grad_norm": 16.839719772338867, + "learning_rate": 2.2714335555188307e-05, + "loss": 1.4255, + "step": 35770 + }, + { + "epoch": 5.912827928114026, + "grad_norm": 16.71356773376465, + "learning_rate": 2.2705154336289686e-05, + "loss": 1.6229, + "step": 35780 + }, + { + "epoch": 5.914480479239827, + "grad_norm": 28.990924835205078, + "learning_rate": 2.2695973117391066e-05, + "loss": 1.5036, + "step": 35790 + }, + { + "epoch": 5.916133030365627, + "grad_norm": 8.961358070373535, + "learning_rate": 2.2686791898492445e-05, + "loss": 1.4317, + "step": 35800 + }, + { + "epoch": 5.917785581491428, + "grad_norm": 9.525544166564941, + "learning_rate": 2.2677610679593824e-05, + "loss": 1.5382, + "step": 35810 + }, + { + "epoch": 5.919438132617228, + "grad_norm": 16.726709365844727, + "learning_rate": 2.2668429460695203e-05, + "loss": 1.4789, + "step": 35820 + }, + { + "epoch": 5.921090683743028, + "grad_norm": 26.31761360168457, + "learning_rate": 2.2659248241796583e-05, + "loss": 1.53, + "step": 35830 + }, + { + "epoch": 5.922743234868829, + "grad_norm": 18.97273826599121, + "learning_rate": 2.2650067022897962e-05, + "loss": 1.4395, + "step": 35840 + }, + { + "epoch": 5.924395785994629, + "grad_norm": 6.1228928565979, + "learning_rate": 2.264088580399934e-05, + "loss": 1.3988, + "step": 35850 + }, + { + "epoch": 5.92604833712043, + "grad_norm": 16.22532081604004, + "learning_rate": 2.263170458510072e-05, + "loss": 1.5295, + "step": 35860 + }, + { + "epoch": 5.92770088824623, + "grad_norm": 7.841466426849365, + "learning_rate": 2.2622523366202096e-05, + "loss": 1.2468, + "step": 35870 + }, + { + "epoch": 5.92935343937203, + "grad_norm": 9.497376441955566, + "learning_rate": 2.2613342147303475e-05, + "loss": 1.5498, + "step": 35880 + }, + { + "epoch": 5.931005990497831, + "grad_norm": 13.135749816894531, + "learning_rate": 2.2604160928404855e-05, + "loss": 1.3572, + "step": 35890 + }, + { + "epoch": 
5.932658541623631, + "grad_norm": 12.444751739501953, + "learning_rate": 2.2594979709506234e-05, + "loss": 1.4146, + "step": 35900 + }, + { + "epoch": 5.934311092749432, + "grad_norm": 10.811918258666992, + "learning_rate": 2.2585798490607617e-05, + "loss": 1.5179, + "step": 35910 + }, + { + "epoch": 5.935963643875232, + "grad_norm": 14.414387702941895, + "learning_rate": 2.2576617271708992e-05, + "loss": 1.4109, + "step": 35920 + }, + { + "epoch": 5.937616195001032, + "grad_norm": 10.46023178100586, + "learning_rate": 2.2567436052810372e-05, + "loss": 1.4051, + "step": 35930 + }, + { + "epoch": 5.939268746126833, + "grad_norm": 46.42742156982422, + "learning_rate": 2.255825483391175e-05, + "loss": 1.4066, + "step": 35940 + }, + { + "epoch": 5.9409212972526335, + "grad_norm": 18.5954532623291, + "learning_rate": 2.254907361501313e-05, + "loss": 1.4041, + "step": 35950 + }, + { + "epoch": 5.9425738483784345, + "grad_norm": 8.255085945129395, + "learning_rate": 2.253989239611451e-05, + "loss": 1.4775, + "step": 35960 + }, + { + "epoch": 5.944226399504235, + "grad_norm": 16.39137840270996, + "learning_rate": 2.253071117721589e-05, + "loss": 1.495, + "step": 35970 + }, + { + "epoch": 5.945878950630036, + "grad_norm": 22.78189468383789, + "learning_rate": 2.2521529958317268e-05, + "loss": 1.5671, + "step": 35980 + }, + { + "epoch": 5.947531501755836, + "grad_norm": 10.982939720153809, + "learning_rate": 2.2512348739418647e-05, + "loss": 1.5311, + "step": 35990 + }, + { + "epoch": 5.949184052881636, + "grad_norm": 8.388236045837402, + "learning_rate": 2.2503167520520023e-05, + "loss": 1.5759, + "step": 36000 + }, + { + "epoch": 5.950836604007437, + "grad_norm": 19.174211502075195, + "learning_rate": 2.2493986301621402e-05, + "loss": 1.4469, + "step": 36010 + }, + { + "epoch": 5.952489155133237, + "grad_norm": 9.777421951293945, + "learning_rate": 2.2484805082722785e-05, + "loss": 1.4646, + "step": 36020 + }, + { + "epoch": 5.954141706259037, + "grad_norm": 33.97954559326172, + "learning_rate": 2.2475623863824164e-05, + "loss": 1.2839, + "step": 36030 + }, + { + "epoch": 5.955794257384838, + "grad_norm": 10.13986587524414, + "learning_rate": 2.2466442644925543e-05, + "loss": 1.4924, + "step": 36040 + }, + { + "epoch": 5.957446808510638, + "grad_norm": 11.5507173538208, + "learning_rate": 2.245726142602692e-05, + "loss": 1.53, + "step": 36050 + }, + { + "epoch": 5.959099359636439, + "grad_norm": 7.484362602233887, + "learning_rate": 2.24480802071283e-05, + "loss": 1.3517, + "step": 36060 + }, + { + "epoch": 5.960751910762239, + "grad_norm": 49.174068450927734, + "learning_rate": 2.2438898988229678e-05, + "loss": 1.4585, + "step": 36070 + }, + { + "epoch": 5.96240446188804, + "grad_norm": 9.194089889526367, + "learning_rate": 2.2429717769331057e-05, + "loss": 1.5553, + "step": 36080 + }, + { + "epoch": 5.96405701301384, + "grad_norm": 20.3260440826416, + "learning_rate": 2.2420536550432436e-05, + "loss": 1.5759, + "step": 36090 + }, + { + "epoch": 5.96570956413964, + "grad_norm": 10.215295791625977, + "learning_rate": 2.2411355331533816e-05, + "loss": 1.4843, + "step": 36100 + }, + { + "epoch": 5.967362115265441, + "grad_norm": 12.862039566040039, + "learning_rate": 2.2402174112635195e-05, + "loss": 1.5045, + "step": 36110 + }, + { + "epoch": 5.969014666391241, + "grad_norm": 22.09560775756836, + "learning_rate": 2.2392992893736574e-05, + "loss": 1.4349, + "step": 36120 + }, + { + "epoch": 5.970667217517041, + "grad_norm": 11.517844200134277, + "learning_rate": 2.2383811674837953e-05, + "loss": 
1.3689, + "step": 36130 + }, + { + "epoch": 5.972319768642842, + "grad_norm": 14.348694801330566, + "learning_rate": 2.2374630455939333e-05, + "loss": 1.4601, + "step": 36140 + }, + { + "epoch": 5.9739723197686425, + "grad_norm": 9.823168754577637, + "learning_rate": 2.2365449237040712e-05, + "loss": 1.4595, + "step": 36150 + }, + { + "epoch": 5.9756248708944435, + "grad_norm": 19.39566421508789, + "learning_rate": 2.235626801814209e-05, + "loss": 1.4817, + "step": 36160 + }, + { + "epoch": 5.977277422020244, + "grad_norm": 12.078182220458984, + "learning_rate": 2.234708679924347e-05, + "loss": 1.5023, + "step": 36170 + }, + { + "epoch": 5.978929973146045, + "grad_norm": 27.949277877807617, + "learning_rate": 2.2337905580344846e-05, + "loss": 1.4331, + "step": 36180 + }, + { + "epoch": 5.980582524271845, + "grad_norm": 12.060958862304688, + "learning_rate": 2.2328724361446225e-05, + "loss": 1.4613, + "step": 36190 + }, + { + "epoch": 5.982235075397645, + "grad_norm": 12.409534454345703, + "learning_rate": 2.2319543142547605e-05, + "loss": 1.3824, + "step": 36200 + }, + { + "epoch": 5.983887626523446, + "grad_norm": 11.61839485168457, + "learning_rate": 2.2310361923648984e-05, + "loss": 1.4633, + "step": 36210 + }, + { + "epoch": 5.985540177649246, + "grad_norm": 17.775489807128906, + "learning_rate": 2.2301180704750363e-05, + "loss": 1.5373, + "step": 36220 + }, + { + "epoch": 5.987192728775047, + "grad_norm": 13.473231315612793, + "learning_rate": 2.2291999485851742e-05, + "loss": 1.282, + "step": 36230 + }, + { + "epoch": 5.988845279900847, + "grad_norm": 11.443365097045898, + "learning_rate": 2.228281826695312e-05, + "loss": 1.4227, + "step": 36240 + }, + { + "epoch": 5.990497831026647, + "grad_norm": 9.717024803161621, + "learning_rate": 2.22736370480545e-05, + "loss": 1.5218, + "step": 36250 + }, + { + "epoch": 5.992150382152448, + "grad_norm": 17.857152938842773, + "learning_rate": 2.226445582915588e-05, + "loss": 1.5035, + "step": 36260 + }, + { + "epoch": 5.993802933278248, + "grad_norm": 10.381898880004883, + "learning_rate": 2.225527461025726e-05, + "loss": 1.4807, + "step": 36270 + }, + { + "epoch": 5.995455484404049, + "grad_norm": 16.007429122924805, + "learning_rate": 2.224609339135864e-05, + "loss": 1.6168, + "step": 36280 + }, + { + "epoch": 5.997108035529849, + "grad_norm": 8.93297290802002, + "learning_rate": 2.2236912172460018e-05, + "loss": 1.4956, + "step": 36290 + }, + { + "epoch": 5.998760586655649, + "grad_norm": 10.40009880065918, + "learning_rate": 2.2227730953561397e-05, + "loss": 1.4969, + "step": 36300 + }, + { + "epoch": 5.99991737244371, + "eval_accuracy": 0.3295502908213931, + "eval_loss": 2.154688835144043, + "eval_runtime": 818.3335, + "eval_samples_per_second": 34.455, + "eval_steps_per_second": 8.614, + "step": 36307 + }, + { + "epoch": 6.00041313778145, + "grad_norm": 9.945453643798828, + "learning_rate": 2.2218549734662776e-05, + "loss": 1.5087, + "step": 36310 + }, + { + "epoch": 6.00206568890725, + "grad_norm": 9.44550609588623, + "learning_rate": 2.2209368515764152e-05, + "loss": 1.2805, + "step": 36320 + }, + { + "epoch": 6.003718240033051, + "grad_norm": 52.20871353149414, + "learning_rate": 2.220018729686553e-05, + "loss": 1.32, + "step": 36330 + }, + { + "epoch": 6.005370791158851, + "grad_norm": 9.028908729553223, + "learning_rate": 2.219100607796691e-05, + "loss": 1.3254, + "step": 36340 + }, + { + "epoch": 6.0070233422846515, + "grad_norm": 10.057682991027832, + "learning_rate": 2.218182485906829e-05, + "loss": 1.3984, + "step": 36350 + }, + 
{ + "epoch": 6.0086758934104525, + "grad_norm": 13.148024559020996, + "learning_rate": 2.2172643640169673e-05, + "loss": 1.4118, + "step": 36360 + }, + { + "epoch": 6.010328444536253, + "grad_norm": 7.585305690765381, + "learning_rate": 2.216346242127105e-05, + "loss": 1.4126, + "step": 36370 + }, + { + "epoch": 6.011980995662054, + "grad_norm": 16.880674362182617, + "learning_rate": 2.2154281202372428e-05, + "loss": 1.4296, + "step": 36380 + }, + { + "epoch": 6.013633546787854, + "grad_norm": 9.740089416503906, + "learning_rate": 2.2145099983473807e-05, + "loss": 1.4044, + "step": 36390 + }, + { + "epoch": 6.015286097913654, + "grad_norm": 16.309249877929688, + "learning_rate": 2.2135918764575186e-05, + "loss": 1.3367, + "step": 36400 + }, + { + "epoch": 6.016938649039455, + "grad_norm": 88.99636840820312, + "learning_rate": 2.2126737545676565e-05, + "loss": 1.4252, + "step": 36410 + }, + { + "epoch": 6.018591200165255, + "grad_norm": 20.119401931762695, + "learning_rate": 2.2117556326777945e-05, + "loss": 1.5106, + "step": 36420 + }, + { + "epoch": 6.020243751291056, + "grad_norm": 26.70507049560547, + "learning_rate": 2.2108375107879324e-05, + "loss": 1.4082, + "step": 36430 + }, + { + "epoch": 6.021896302416856, + "grad_norm": 10.413867950439453, + "learning_rate": 2.2099193888980703e-05, + "loss": 1.4838, + "step": 36440 + }, + { + "epoch": 6.023548853542657, + "grad_norm": 20.84398078918457, + "learning_rate": 2.209001267008208e-05, + "loss": 1.4627, + "step": 36450 + }, + { + "epoch": 6.025201404668457, + "grad_norm": 7.607133388519287, + "learning_rate": 2.2080831451183458e-05, + "loss": 1.5133, + "step": 36460 + }, + { + "epoch": 6.026853955794257, + "grad_norm": 9.936842918395996, + "learning_rate": 2.2071650232284838e-05, + "loss": 1.4638, + "step": 36470 + }, + { + "epoch": 6.028506506920058, + "grad_norm": 9.79660415649414, + "learning_rate": 2.206246901338622e-05, + "loss": 1.3432, + "step": 36480 + }, + { + "epoch": 6.030159058045858, + "grad_norm": 14.434847831726074, + "learning_rate": 2.20532877944876e-05, + "loss": 1.4072, + "step": 36490 + }, + { + "epoch": 6.031811609171659, + "grad_norm": 11.613893508911133, + "learning_rate": 2.2044106575588975e-05, + "loss": 1.3748, + "step": 36500 + }, + { + "epoch": 6.033464160297459, + "grad_norm": 14.374581336975098, + "learning_rate": 2.2034925356690355e-05, + "loss": 1.5375, + "step": 36510 + }, + { + "epoch": 6.035116711423259, + "grad_norm": 10.08636474609375, + "learning_rate": 2.2025744137791734e-05, + "loss": 1.4055, + "step": 36520 + }, + { + "epoch": 6.03676926254906, + "grad_norm": 11.865565299987793, + "learning_rate": 2.2016562918893113e-05, + "loss": 1.3775, + "step": 36530 + }, + { + "epoch": 6.03842181367486, + "grad_norm": 88.45521545410156, + "learning_rate": 2.2007381699994492e-05, + "loss": 1.5659, + "step": 36540 + }, + { + "epoch": 6.040074364800661, + "grad_norm": 9.256975173950195, + "learning_rate": 2.199820048109587e-05, + "loss": 1.3508, + "step": 36550 + }, + { + "epoch": 6.0417269159264615, + "grad_norm": 14.002177238464355, + "learning_rate": 2.198901926219725e-05, + "loss": 1.4102, + "step": 36560 + }, + { + "epoch": 6.043379467052262, + "grad_norm": 21.786453247070312, + "learning_rate": 2.197983804329863e-05, + "loss": 1.5863, + "step": 36570 + }, + { + "epoch": 6.045032018178063, + "grad_norm": 9.75375747680664, + "learning_rate": 2.1970656824400006e-05, + "loss": 1.3894, + "step": 36580 + }, + { + "epoch": 6.046684569303863, + "grad_norm": 7.86843729019165, + "learning_rate": 
2.196147560550139e-05, + "loss": 1.3645, + "step": 36590 + }, + { + "epoch": 6.048337120429664, + "grad_norm": 7.051876068115234, + "learning_rate": 2.1952294386602768e-05, + "loss": 1.3968, + "step": 36600 + }, + { + "epoch": 6.049989671555464, + "grad_norm": 53.84910202026367, + "learning_rate": 2.1943113167704147e-05, + "loss": 1.4543, + "step": 36610 + }, + { + "epoch": 6.051642222681264, + "grad_norm": 8.938406944274902, + "learning_rate": 2.1933931948805526e-05, + "loss": 1.3795, + "step": 36620 + }, + { + "epoch": 6.053294773807065, + "grad_norm": 10.21081829071045, + "learning_rate": 2.1924750729906905e-05, + "loss": 1.5895, + "step": 36630 + }, + { + "epoch": 6.054947324932865, + "grad_norm": 11.11033821105957, + "learning_rate": 2.191556951100828e-05, + "loss": 1.4376, + "step": 36640 + }, + { + "epoch": 6.056599876058666, + "grad_norm": 12.69739818572998, + "learning_rate": 2.190638829210966e-05, + "loss": 1.4503, + "step": 36650 + }, + { + "epoch": 6.058252427184466, + "grad_norm": 14.584735870361328, + "learning_rate": 2.189720707321104e-05, + "loss": 1.3427, + "step": 36660 + }, + { + "epoch": 6.059904978310266, + "grad_norm": 10.828611373901367, + "learning_rate": 2.188802585431242e-05, + "loss": 1.3451, + "step": 36670 + }, + { + "epoch": 6.061557529436067, + "grad_norm": 7.586915493011475, + "learning_rate": 2.18788446354138e-05, + "loss": 1.3483, + "step": 36680 + }, + { + "epoch": 6.063210080561867, + "grad_norm": 11.18470573425293, + "learning_rate": 2.1869663416515178e-05, + "loss": 1.4135, + "step": 36690 + }, + { + "epoch": 6.064862631687668, + "grad_norm": 12.794292449951172, + "learning_rate": 2.1860482197616557e-05, + "loss": 1.3996, + "step": 36700 + }, + { + "epoch": 6.066515182813468, + "grad_norm": 10.54262924194336, + "learning_rate": 2.1851300978717936e-05, + "loss": 1.5032, + "step": 36710 + }, + { + "epoch": 6.068167733939268, + "grad_norm": 16.40874671936035, + "learning_rate": 2.1842119759819315e-05, + "loss": 1.4283, + "step": 36720 + }, + { + "epoch": 6.069820285065069, + "grad_norm": 10.831640243530273, + "learning_rate": 2.1832938540920695e-05, + "loss": 1.472, + "step": 36730 + }, + { + "epoch": 6.071472836190869, + "grad_norm": 10.649149894714355, + "learning_rate": 2.1823757322022074e-05, + "loss": 1.294, + "step": 36740 + }, + { + "epoch": 6.07312538731667, + "grad_norm": 30.559856414794922, + "learning_rate": 2.1814576103123453e-05, + "loss": 1.4514, + "step": 36750 + }, + { + "epoch": 6.0747779384424705, + "grad_norm": 10.63766860961914, + "learning_rate": 2.1805394884224832e-05, + "loss": 1.4639, + "step": 36760 + }, + { + "epoch": 6.076430489568271, + "grad_norm": 14.730107307434082, + "learning_rate": 2.1796213665326208e-05, + "loss": 1.4646, + "step": 36770 + }, + { + "epoch": 6.078083040694072, + "grad_norm": 15.621095657348633, + "learning_rate": 2.1787032446427587e-05, + "loss": 1.3344, + "step": 36780 + }, + { + "epoch": 6.079735591819872, + "grad_norm": 15.240421295166016, + "learning_rate": 2.1777851227528967e-05, + "loss": 1.4834, + "step": 36790 + }, + { + "epoch": 6.081388142945673, + "grad_norm": 8.048723220825195, + "learning_rate": 2.1768670008630346e-05, + "loss": 1.3243, + "step": 36800 + }, + { + "epoch": 6.083040694071473, + "grad_norm": 14.045221328735352, + "learning_rate": 2.1759488789731725e-05, + "loss": 1.4613, + "step": 36810 + }, + { + "epoch": 6.084693245197274, + "grad_norm": 9.798041343688965, + "learning_rate": 2.1750307570833104e-05, + "loss": 1.4506, + "step": 36820 + }, + { + "epoch": 6.086345796323074, + 
"grad_norm": 9.147562980651855, + "learning_rate": 2.1741126351934484e-05, + "loss": 1.4779, + "step": 36830 + }, + { + "epoch": 6.087998347448874, + "grad_norm": 7.114458084106445, + "learning_rate": 2.1731945133035863e-05, + "loss": 1.3534, + "step": 36840 + }, + { + "epoch": 6.089650898574675, + "grad_norm": 15.428686141967773, + "learning_rate": 2.1722763914137242e-05, + "loss": 1.4559, + "step": 36850 + }, + { + "epoch": 6.091303449700475, + "grad_norm": 8.084449768066406, + "learning_rate": 2.171358269523862e-05, + "loss": 1.4538, + "step": 36860 + }, + { + "epoch": 6.092956000826276, + "grad_norm": 13.73076343536377, + "learning_rate": 2.170440147634e-05, + "loss": 1.3203, + "step": 36870 + }, + { + "epoch": 6.094608551952076, + "grad_norm": 12.01843547821045, + "learning_rate": 2.169522025744138e-05, + "loss": 1.3937, + "step": 36880 + }, + { + "epoch": 6.096261103077876, + "grad_norm": 11.336709022521973, + "learning_rate": 2.168603903854276e-05, + "loss": 1.4382, + "step": 36890 + }, + { + "epoch": 6.097913654203677, + "grad_norm": 20.88669776916504, + "learning_rate": 2.1676857819644135e-05, + "loss": 1.3326, + "step": 36900 + }, + { + "epoch": 6.099566205329477, + "grad_norm": 12.064225196838379, + "learning_rate": 2.1667676600745514e-05, + "loss": 1.4931, + "step": 36910 + }, + { + "epoch": 6.101218756455278, + "grad_norm": 10.633503913879395, + "learning_rate": 2.1658495381846894e-05, + "loss": 1.5304, + "step": 36920 + }, + { + "epoch": 6.102871307581078, + "grad_norm": 11.907389640808105, + "learning_rate": 2.1649314162948276e-05, + "loss": 1.4893, + "step": 36930 + }, + { + "epoch": 6.104523858706878, + "grad_norm": 11.007550239562988, + "learning_rate": 2.1640132944049655e-05, + "loss": 1.4218, + "step": 36940 + }, + { + "epoch": 6.106176409832679, + "grad_norm": 7.1882405281066895, + "learning_rate": 2.1630951725151035e-05, + "loss": 1.347, + "step": 36950 + }, + { + "epoch": 6.1078289609584795, + "grad_norm": 21.802770614624023, + "learning_rate": 2.162177050625241e-05, + "loss": 1.3269, + "step": 36960 + }, + { + "epoch": 6.1094815120842805, + "grad_norm": 28.251907348632812, + "learning_rate": 2.161258928735379e-05, + "loss": 1.5196, + "step": 36970 + }, + { + "epoch": 6.111134063210081, + "grad_norm": 13.803258895874023, + "learning_rate": 2.160340806845517e-05, + "loss": 1.45, + "step": 36980 + }, + { + "epoch": 6.112786614335881, + "grad_norm": 26.397541046142578, + "learning_rate": 2.1594226849556548e-05, + "loss": 1.4174, + "step": 36990 + }, + { + "epoch": 6.114439165461682, + "grad_norm": 16.90498924255371, + "learning_rate": 2.1585045630657927e-05, + "loss": 1.4125, + "step": 37000 + }, + { + "epoch": 6.116091716587482, + "grad_norm": 17.150217056274414, + "learning_rate": 2.1575864411759307e-05, + "loss": 1.411, + "step": 37010 + }, + { + "epoch": 6.117744267713283, + "grad_norm": 11.79568862915039, + "learning_rate": 2.1566683192860686e-05, + "loss": 1.4386, + "step": 37020 + }, + { + "epoch": 6.119396818839083, + "grad_norm": 10.997562408447266, + "learning_rate": 2.1557501973962062e-05, + "loss": 1.4482, + "step": 37030 + }, + { + "epoch": 6.121049369964883, + "grad_norm": 14.64095687866211, + "learning_rate": 2.154832075506344e-05, + "loss": 1.4078, + "step": 37040 + }, + { + "epoch": 6.122701921090684, + "grad_norm": 7.124085426330566, + "learning_rate": 2.1539139536164824e-05, + "loss": 1.489, + "step": 37050 + }, + { + "epoch": 6.124354472216484, + "grad_norm": 16.14177894592285, + "learning_rate": 2.1529958317266203e-05, + "loss": 1.449, + "step": 
37060 + }, + { + "epoch": 6.126007023342285, + "grad_norm": 14.747572898864746, + "learning_rate": 2.1520777098367582e-05, + "loss": 1.4958, + "step": 37070 + }, + { + "epoch": 6.127659574468085, + "grad_norm": 15.584579467773438, + "learning_rate": 2.151159587946896e-05, + "loss": 1.4513, + "step": 37080 + }, + { + "epoch": 6.129312125593885, + "grad_norm": 28.18446922302246, + "learning_rate": 2.1502414660570337e-05, + "loss": 1.3798, + "step": 37090 + }, + { + "epoch": 6.130964676719686, + "grad_norm": 37.316654205322266, + "learning_rate": 2.1493233441671717e-05, + "loss": 1.5111, + "step": 37100 + }, + { + "epoch": 6.132617227845486, + "grad_norm": 8.39609432220459, + "learning_rate": 2.1484052222773096e-05, + "loss": 1.5339, + "step": 37110 + }, + { + "epoch": 6.134269778971287, + "grad_norm": 14.053455352783203, + "learning_rate": 2.1474871003874475e-05, + "loss": 1.4811, + "step": 37120 + }, + { + "epoch": 6.135922330097087, + "grad_norm": 10.345773696899414, + "learning_rate": 2.1465689784975854e-05, + "loss": 1.4089, + "step": 37130 + }, + { + "epoch": 6.137574881222887, + "grad_norm": 7.838363170623779, + "learning_rate": 2.1456508566077234e-05, + "loss": 1.4742, + "step": 37140 + }, + { + "epoch": 6.139227432348688, + "grad_norm": 22.944093704223633, + "learning_rate": 2.1447327347178613e-05, + "loss": 1.4821, + "step": 37150 + }, + { + "epoch": 6.1408799834744885, + "grad_norm": 8.07544994354248, + "learning_rate": 2.1438146128279992e-05, + "loss": 1.3206, + "step": 37160 + }, + { + "epoch": 6.1425325346002895, + "grad_norm": 99.47127532958984, + "learning_rate": 2.142896490938137e-05, + "loss": 1.3522, + "step": 37170 + }, + { + "epoch": 6.14418508572609, + "grad_norm": 10.434372901916504, + "learning_rate": 2.141978369048275e-05, + "loss": 1.4843, + "step": 37180 + }, + { + "epoch": 6.1458376368518906, + "grad_norm": 9.524206161499023, + "learning_rate": 2.141060247158413e-05, + "loss": 1.4098, + "step": 37190 + }, + { + "epoch": 6.147490187977691, + "grad_norm": 7.026861667633057, + "learning_rate": 2.140142125268551e-05, + "loss": 1.3957, + "step": 37200 + }, + { + "epoch": 6.149142739103491, + "grad_norm": 12.716541290283203, + "learning_rate": 2.1392240033786888e-05, + "loss": 1.5463, + "step": 37210 + }, + { + "epoch": 6.150795290229292, + "grad_norm": 38.672367095947266, + "learning_rate": 2.1383058814888264e-05, + "loss": 1.4292, + "step": 37220 + }, + { + "epoch": 6.152447841355092, + "grad_norm": 14.989725112915039, + "learning_rate": 2.1373877595989643e-05, + "loss": 1.4267, + "step": 37230 + }, + { + "epoch": 6.154100392480893, + "grad_norm": 9.272750854492188, + "learning_rate": 2.1364696377091023e-05, + "loss": 1.4773, + "step": 37240 + }, + { + "epoch": 6.155752943606693, + "grad_norm": 13.461223602294922, + "learning_rate": 2.1355515158192402e-05, + "loss": 1.3765, + "step": 37250 + }, + { + "epoch": 6.157405494732493, + "grad_norm": 11.42111587524414, + "learning_rate": 2.134633393929378e-05, + "loss": 1.399, + "step": 37260 + }, + { + "epoch": 6.159058045858294, + "grad_norm": 9.016616821289062, + "learning_rate": 2.133715272039516e-05, + "loss": 1.596, + "step": 37270 + }, + { + "epoch": 6.160710596984094, + "grad_norm": 8.332453727722168, + "learning_rate": 2.132797150149654e-05, + "loss": 1.388, + "step": 37280 + }, + { + "epoch": 6.162363148109895, + "grad_norm": 17.8102970123291, + "learning_rate": 2.131879028259792e-05, + "loss": 1.3235, + "step": 37290 + }, + { + "epoch": 6.164015699235695, + "grad_norm": 11.08936882019043, + "learning_rate": 
2.1309609063699298e-05, + "loss": 1.3742, + "step": 37300 + }, + { + "epoch": 6.165668250361495, + "grad_norm": 8.341262817382812, + "learning_rate": 2.1300427844800677e-05, + "loss": 1.4266, + "step": 37310 + }, + { + "epoch": 6.167320801487296, + "grad_norm": 18.066402435302734, + "learning_rate": 2.1291246625902057e-05, + "loss": 1.6203, + "step": 37320 + }, + { + "epoch": 6.168973352613096, + "grad_norm": 19.090791702270508, + "learning_rate": 2.1282065407003436e-05, + "loss": 1.4881, + "step": 37330 + }, + { + "epoch": 6.170625903738897, + "grad_norm": 7.528576850891113, + "learning_rate": 2.1272884188104815e-05, + "loss": 1.3519, + "step": 37340 + }, + { + "epoch": 6.172278454864697, + "grad_norm": 19.304250717163086, + "learning_rate": 2.126370296920619e-05, + "loss": 1.4546, + "step": 37350 + }, + { + "epoch": 6.1739310059904975, + "grad_norm": 7.075794219970703, + "learning_rate": 2.125452175030757e-05, + "loss": 1.4554, + "step": 37360 + }, + { + "epoch": 6.1755835571162985, + "grad_norm": 12.549452781677246, + "learning_rate": 2.124534053140895e-05, + "loss": 1.5573, + "step": 37370 + }, + { + "epoch": 6.177236108242099, + "grad_norm": 9.695381164550781, + "learning_rate": 2.123615931251033e-05, + "loss": 1.3744, + "step": 37380 + }, + { + "epoch": 6.1788886593678995, + "grad_norm": 21.600099563598633, + "learning_rate": 2.122697809361171e-05, + "loss": 1.5132, + "step": 37390 + }, + { + "epoch": 6.1805412104937, + "grad_norm": 9.879249572753906, + "learning_rate": 2.121779687471309e-05, + "loss": 1.4469, + "step": 37400 + }, + { + "epoch": 6.1821937616195, + "grad_norm": 12.330972671508789, + "learning_rate": 2.1208615655814466e-05, + "loss": 1.5801, + "step": 37410 + }, + { + "epoch": 6.183846312745301, + "grad_norm": 12.194676399230957, + "learning_rate": 2.1199434436915846e-05, + "loss": 1.3992, + "step": 37420 + }, + { + "epoch": 6.185498863871101, + "grad_norm": 14.351995468139648, + "learning_rate": 2.1190253218017225e-05, + "loss": 1.5508, + "step": 37430 + }, + { + "epoch": 6.187151414996902, + "grad_norm": 9.946985244750977, + "learning_rate": 2.1181071999118604e-05, + "loss": 1.5336, + "step": 37440 + }, + { + "epoch": 6.188803966122702, + "grad_norm": 10.623150825500488, + "learning_rate": 2.1171890780219983e-05, + "loss": 1.4028, + "step": 37450 + }, + { + "epoch": 6.190456517248502, + "grad_norm": 12.726574897766113, + "learning_rate": 2.1162709561321363e-05, + "loss": 1.4419, + "step": 37460 + }, + { + "epoch": 6.192109068374303, + "grad_norm": 11.567181587219238, + "learning_rate": 2.1153528342422742e-05, + "loss": 1.3462, + "step": 37470 + }, + { + "epoch": 6.193761619500103, + "grad_norm": 15.38206672668457, + "learning_rate": 2.1144347123524118e-05, + "loss": 1.4159, + "step": 37480 + }, + { + "epoch": 6.195414170625904, + "grad_norm": 9.006597518920898, + "learning_rate": 2.1135165904625497e-05, + "loss": 1.3282, + "step": 37490 + }, + { + "epoch": 6.197066721751704, + "grad_norm": 16.817569732666016, + "learning_rate": 2.112598468572688e-05, + "loss": 1.3807, + "step": 37500 + }, + { + "epoch": 6.198719272877504, + "grad_norm": 14.372156143188477, + "learning_rate": 2.111680346682826e-05, + "loss": 1.5622, + "step": 37510 + }, + { + "epoch": 6.200371824003305, + "grad_norm": 10.266152381896973, + "learning_rate": 2.1107622247929638e-05, + "loss": 1.4466, + "step": 37520 + }, + { + "epoch": 6.202024375129105, + "grad_norm": 15.889741897583008, + "learning_rate": 2.1098441029031017e-05, + "loss": 1.4577, + "step": 37530 + }, + { + "epoch": 6.203676926254906, + 
"grad_norm": 11.235896110534668, + "learning_rate": 2.1089259810132393e-05, + "loss": 1.4971, + "step": 37540 + }, + { + "epoch": 6.205329477380706, + "grad_norm": 10.462620735168457, + "learning_rate": 2.1080078591233773e-05, + "loss": 1.3453, + "step": 37550 + }, + { + "epoch": 6.206982028506507, + "grad_norm": 9.256996154785156, + "learning_rate": 2.1070897372335152e-05, + "loss": 1.4294, + "step": 37560 + }, + { + "epoch": 6.2086345796323075, + "grad_norm": 9.670038223266602, + "learning_rate": 2.106171615343653e-05, + "loss": 1.498, + "step": 37570 + }, + { + "epoch": 6.210287130758108, + "grad_norm": 10.330695152282715, + "learning_rate": 2.105253493453791e-05, + "loss": 1.4457, + "step": 37580 + }, + { + "epoch": 6.2119396818839085, + "grad_norm": 13.623739242553711, + "learning_rate": 2.104335371563929e-05, + "loss": 1.52, + "step": 37590 + }, + { + "epoch": 6.213592233009709, + "grad_norm": 8.723990440368652, + "learning_rate": 2.103417249674067e-05, + "loss": 1.4207, + "step": 37600 + }, + { + "epoch": 6.215244784135509, + "grad_norm": 10.08510684967041, + "learning_rate": 2.1024991277842045e-05, + "loss": 1.4864, + "step": 37610 + }, + { + "epoch": 6.21689733526131, + "grad_norm": 13.55770206451416, + "learning_rate": 2.1015810058943427e-05, + "loss": 1.3481, + "step": 37620 + }, + { + "epoch": 6.21854988638711, + "grad_norm": 26.282560348510742, + "learning_rate": 2.1006628840044807e-05, + "loss": 1.4423, + "step": 37630 + }, + { + "epoch": 6.220202437512911, + "grad_norm": 13.998091697692871, + "learning_rate": 2.0997447621146186e-05, + "loss": 1.5324, + "step": 37640 + }, + { + "epoch": 6.221854988638711, + "grad_norm": 10.550564765930176, + "learning_rate": 2.0988266402247565e-05, + "loss": 1.4653, + "step": 37650 + }, + { + "epoch": 6.223507539764512, + "grad_norm": 9.701068878173828, + "learning_rate": 2.0979085183348944e-05, + "loss": 1.4896, + "step": 37660 + }, + { + "epoch": 6.225160090890312, + "grad_norm": 19.163440704345703, + "learning_rate": 2.096990396445032e-05, + "loss": 1.3897, + "step": 37670 + }, + { + "epoch": 6.226812642016112, + "grad_norm": 21.417760848999023, + "learning_rate": 2.09607227455517e-05, + "loss": 1.3892, + "step": 37680 + }, + { + "epoch": 6.228465193141913, + "grad_norm": 13.961776733398438, + "learning_rate": 2.095154152665308e-05, + "loss": 1.3522, + "step": 37690 + }, + { + "epoch": 6.230117744267713, + "grad_norm": 7.012969493865967, + "learning_rate": 2.0942360307754458e-05, + "loss": 1.3648, + "step": 37700 + }, + { + "epoch": 6.231770295393514, + "grad_norm": 15.90501594543457, + "learning_rate": 2.0933179088855837e-05, + "loss": 1.4158, + "step": 37710 + }, + { + "epoch": 6.233422846519314, + "grad_norm": 16.21110725402832, + "learning_rate": 2.0923997869957216e-05, + "loss": 1.5681, + "step": 37720 + }, + { + "epoch": 6.235075397645114, + "grad_norm": 11.805452346801758, + "learning_rate": 2.0914816651058596e-05, + "loss": 1.387, + "step": 37730 + }, + { + "epoch": 6.236727948770915, + "grad_norm": 13.68657398223877, + "learning_rate": 2.0905635432159975e-05, + "loss": 1.3717, + "step": 37740 + }, + { + "epoch": 6.238380499896715, + "grad_norm": 9.461950302124023, + "learning_rate": 2.0896454213261354e-05, + "loss": 1.3141, + "step": 37750 + }, + { + "epoch": 6.240033051022516, + "grad_norm": 9.578817367553711, + "learning_rate": 2.0887272994362733e-05, + "loss": 1.3581, + "step": 37760 + }, + { + "epoch": 6.2416856021483165, + "grad_norm": 11.16032600402832, + "learning_rate": 2.0878091775464113e-05, + "loss": 1.4463, + "step": 
37770 + }, + { + "epoch": 6.243338153274117, + "grad_norm": 36.12056350708008, + "learning_rate": 2.0868910556565492e-05, + "loss": 1.3249, + "step": 37780 + }, + { + "epoch": 6.2449907043999175, + "grad_norm": 17.16036605834961, + "learning_rate": 2.085972933766687e-05, + "loss": 1.4926, + "step": 37790 + }, + { + "epoch": 6.246643255525718, + "grad_norm": 8.647866249084473, + "learning_rate": 2.0850548118768247e-05, + "loss": 1.3782, + "step": 37800 + }, + { + "epoch": 6.248295806651519, + "grad_norm": 8.385272026062012, + "learning_rate": 2.0841366899869626e-05, + "loss": 1.4198, + "step": 37810 + }, + { + "epoch": 6.249948357777319, + "grad_norm": 14.405436515808105, + "learning_rate": 2.0832185680971005e-05, + "loss": 1.3887, + "step": 37820 + }, + { + "epoch": 6.251600908903119, + "grad_norm": 7.519594669342041, + "learning_rate": 2.0823004462072385e-05, + "loss": 1.3571, + "step": 37830 + }, + { + "epoch": 6.25325346002892, + "grad_norm": 27.60797882080078, + "learning_rate": 2.0813823243173764e-05, + "loss": 1.5992, + "step": 37840 + }, + { + "epoch": 6.25490601115472, + "grad_norm": 11.991599082946777, + "learning_rate": 2.0804642024275147e-05, + "loss": 1.4627, + "step": 37850 + }, + { + "epoch": 6.256558562280521, + "grad_norm": 12.27054500579834, + "learning_rate": 2.0795460805376522e-05, + "loss": 1.3893, + "step": 37860 + }, + { + "epoch": 6.258211113406321, + "grad_norm": 33.28194808959961, + "learning_rate": 2.0786279586477902e-05, + "loss": 1.4389, + "step": 37870 + }, + { + "epoch": 6.259863664532121, + "grad_norm": 22.726533889770508, + "learning_rate": 2.077709836757928e-05, + "loss": 1.5418, + "step": 37880 + }, + { + "epoch": 6.261516215657922, + "grad_norm": 13.192795753479004, + "learning_rate": 2.076791714868066e-05, + "loss": 1.3905, + "step": 37890 + }, + { + "epoch": 6.263168766783722, + "grad_norm": 14.053077697753906, + "learning_rate": 2.075873592978204e-05, + "loss": 1.4147, + "step": 37900 + }, + { + "epoch": 6.264821317909523, + "grad_norm": 16.81157875061035, + "learning_rate": 2.074955471088342e-05, + "loss": 1.3977, + "step": 37910 + }, + { + "epoch": 6.266473869035323, + "grad_norm": 7.91226863861084, + "learning_rate": 2.0740373491984798e-05, + "loss": 1.3966, + "step": 37920 + }, + { + "epoch": 6.268126420161124, + "grad_norm": 13.343158721923828, + "learning_rate": 2.0731192273086174e-05, + "loss": 1.4248, + "step": 37930 + }, + { + "epoch": 6.269778971286924, + "grad_norm": 10.185911178588867, + "learning_rate": 2.0722011054187553e-05, + "loss": 1.4735, + "step": 37940 + }, + { + "epoch": 6.271431522412724, + "grad_norm": 8.653183937072754, + "learning_rate": 2.0712829835288932e-05, + "loss": 1.4104, + "step": 37950 + }, + { + "epoch": 6.273084073538525, + "grad_norm": 16.011350631713867, + "learning_rate": 2.0703648616390315e-05, + "loss": 1.4787, + "step": 37960 + }, + { + "epoch": 6.2747366246643255, + "grad_norm": 11.425875663757324, + "learning_rate": 2.0694467397491694e-05, + "loss": 1.5623, + "step": 37970 + }, + { + "epoch": 6.276389175790126, + "grad_norm": 10.677700996398926, + "learning_rate": 2.0685286178593073e-05, + "loss": 1.4261, + "step": 37980 + }, + { + "epoch": 6.2780417269159265, + "grad_norm": 19.281002044677734, + "learning_rate": 2.067610495969445e-05, + "loss": 1.7146, + "step": 37990 + }, + { + "epoch": 6.279694278041727, + "grad_norm": 16.140913009643555, + "learning_rate": 2.066692374079583e-05, + "loss": 1.4536, + "step": 38000 + }, + { + "epoch": 6.281346829167528, + "grad_norm": 11.096719741821289, + "learning_rate": 
2.0657742521897208e-05, + "loss": 1.3705, + "step": 38010 + }, + { + "epoch": 6.282999380293328, + "grad_norm": 18.13045883178711, + "learning_rate": 2.0648561302998587e-05, + "loss": 1.4394, + "step": 38020 + }, + { + "epoch": 6.284651931419129, + "grad_norm": 14.277056694030762, + "learning_rate": 2.0639380084099966e-05, + "loss": 1.46, + "step": 38030 + }, + { + "epoch": 6.286304482544929, + "grad_norm": 12.03947925567627, + "learning_rate": 2.0630198865201346e-05, + "loss": 1.2961, + "step": 38040 + }, + { + "epoch": 6.287957033670729, + "grad_norm": 11.471107482910156, + "learning_rate": 2.0621017646302725e-05, + "loss": 1.3797, + "step": 38050 + }, + { + "epoch": 6.28960958479653, + "grad_norm": 14.753610610961914, + "learning_rate": 2.06118364274041e-05, + "loss": 1.5292, + "step": 38060 + }, + { + "epoch": 6.29126213592233, + "grad_norm": 9.602330207824707, + "learning_rate": 2.060265520850548e-05, + "loss": 1.3428, + "step": 38070 + }, + { + "epoch": 6.292914687048131, + "grad_norm": 12.456007957458496, + "learning_rate": 2.0593473989606863e-05, + "loss": 1.4892, + "step": 38080 + }, + { + "epoch": 6.294567238173931, + "grad_norm": 8.093803405761719, + "learning_rate": 2.0584292770708242e-05, + "loss": 1.4064, + "step": 38090 + }, + { + "epoch": 6.296219789299731, + "grad_norm": 10.36359691619873, + "learning_rate": 2.057511155180962e-05, + "loss": 1.4547, + "step": 38100 + }, + { + "epoch": 6.297872340425532, + "grad_norm": 10.041078567504883, + "learning_rate": 2.0565930332911e-05, + "loss": 1.4408, + "step": 38110 + }, + { + "epoch": 6.299524891551332, + "grad_norm": 12.253533363342285, + "learning_rate": 2.0556749114012376e-05, + "loss": 1.589, + "step": 38120 + }, + { + "epoch": 6.301177442677133, + "grad_norm": 10.570530891418457, + "learning_rate": 2.0547567895113755e-05, + "loss": 1.3889, + "step": 38130 + }, + { + "epoch": 6.302829993802933, + "grad_norm": 13.251456260681152, + "learning_rate": 2.0538386676215135e-05, + "loss": 1.5385, + "step": 38140 + }, + { + "epoch": 6.304482544928733, + "grad_norm": 27.627914428710938, + "learning_rate": 2.0529205457316514e-05, + "loss": 1.5747, + "step": 38150 + }, + { + "epoch": 6.306135096054534, + "grad_norm": 9.486604690551758, + "learning_rate": 2.0520024238417893e-05, + "loss": 1.4378, + "step": 38160 + }, + { + "epoch": 6.3077876471803345, + "grad_norm": 6.460193157196045, + "learning_rate": 2.0510843019519272e-05, + "loss": 1.444, + "step": 38170 + }, + { + "epoch": 6.3094401983061354, + "grad_norm": 48.902530670166016, + "learning_rate": 2.050166180062065e-05, + "loss": 1.3742, + "step": 38180 + }, + { + "epoch": 6.3110927494319355, + "grad_norm": 8.491707801818848, + "learning_rate": 2.049248058172203e-05, + "loss": 1.4993, + "step": 38190 + }, + { + "epoch": 6.312745300557736, + "grad_norm": 14.18362808227539, + "learning_rate": 2.048329936282341e-05, + "loss": 1.5047, + "step": 38200 + }, + { + "epoch": 6.314397851683537, + "grad_norm": 15.740824699401855, + "learning_rate": 2.047411814392479e-05, + "loss": 1.3403, + "step": 38210 + }, + { + "epoch": 6.316050402809337, + "grad_norm": 11.892169952392578, + "learning_rate": 2.046493692502617e-05, + "loss": 1.5607, + "step": 38220 + }, + { + "epoch": 6.317702953935138, + "grad_norm": 8.451154708862305, + "learning_rate": 2.0455755706127548e-05, + "loss": 1.3493, + "step": 38230 + }, + { + "epoch": 6.319355505060938, + "grad_norm": 14.7618408203125, + "learning_rate": 2.0446574487228927e-05, + "loss": 1.504, + "step": 38240 + }, + { + "epoch": 6.321008056186738, + 
"grad_norm": 10.903997421264648, + "learning_rate": 2.0437393268330303e-05, + "loss": 1.3993, + "step": 38250 + }, + { + "epoch": 6.322660607312539, + "grad_norm": 12.13855266571045, + "learning_rate": 2.0428212049431682e-05, + "loss": 1.478, + "step": 38260 + }, + { + "epoch": 6.324313158438339, + "grad_norm": 13.594361305236816, + "learning_rate": 2.041903083053306e-05, + "loss": 1.5079, + "step": 38270 + }, + { + "epoch": 6.32596570956414, + "grad_norm": 9.111920356750488, + "learning_rate": 2.040984961163444e-05, + "loss": 1.5341, + "step": 38280 + }, + { + "epoch": 6.32761826068994, + "grad_norm": 8.151607513427734, + "learning_rate": 2.040066839273582e-05, + "loss": 1.4063, + "step": 38290 + }, + { + "epoch": 6.32927081181574, + "grad_norm": 9.332478523254395, + "learning_rate": 2.0391487173837203e-05, + "loss": 1.5013, + "step": 38300 + }, + { + "epoch": 6.330923362941541, + "grad_norm": 20.929075241088867, + "learning_rate": 2.038230595493858e-05, + "loss": 1.3706, + "step": 38310 + }, + { + "epoch": 6.332575914067341, + "grad_norm": 16.67856788635254, + "learning_rate": 2.0373124736039958e-05, + "loss": 1.4256, + "step": 38320 + }, + { + "epoch": 6.334228465193142, + "grad_norm": 9.978659629821777, + "learning_rate": 2.0363943517141337e-05, + "loss": 1.5506, + "step": 38330 + }, + { + "epoch": 6.335881016318942, + "grad_norm": 7.140017032623291, + "learning_rate": 2.0354762298242716e-05, + "loss": 1.3496, + "step": 38340 + }, + { + "epoch": 6.337533567444742, + "grad_norm": 7.583073616027832, + "learning_rate": 2.0345581079344095e-05, + "loss": 1.2338, + "step": 38350 + }, + { + "epoch": 6.339186118570543, + "grad_norm": 27.26841163635254, + "learning_rate": 2.0336399860445475e-05, + "loss": 1.3861, + "step": 38360 + }, + { + "epoch": 6.3408386696963435, + "grad_norm": 22.0368595123291, + "learning_rate": 2.0327218641546854e-05, + "loss": 1.4712, + "step": 38370 + }, + { + "epoch": 6.3424912208221444, + "grad_norm": 13.927277565002441, + "learning_rate": 2.031803742264823e-05, + "loss": 1.5026, + "step": 38380 + }, + { + "epoch": 6.3441437719479445, + "grad_norm": 13.221909523010254, + "learning_rate": 2.030885620374961e-05, + "loss": 1.3972, + "step": 38390 + }, + { + "epoch": 6.3457963230737455, + "grad_norm": 13.610274314880371, + "learning_rate": 2.0299674984850988e-05, + "loss": 1.5108, + "step": 38400 + }, + { + "epoch": 6.347448874199546, + "grad_norm": 16.176225662231445, + "learning_rate": 2.0290493765952368e-05, + "loss": 1.3653, + "step": 38410 + }, + { + "epoch": 6.349101425325346, + "grad_norm": 21.335020065307617, + "learning_rate": 2.028131254705375e-05, + "loss": 1.5748, + "step": 38420 + }, + { + "epoch": 6.350753976451147, + "grad_norm": 17.27007293701172, + "learning_rate": 2.027213132815513e-05, + "loss": 1.4135, + "step": 38430 + }, + { + "epoch": 6.352406527576947, + "grad_norm": 19.558349609375, + "learning_rate": 2.0262950109256505e-05, + "loss": 1.398, + "step": 38440 + }, + { + "epoch": 6.354059078702748, + "grad_norm": 7.04690408706665, + "learning_rate": 2.0253768890357885e-05, + "loss": 1.3721, + "step": 38450 + }, + { + "epoch": 6.355711629828548, + "grad_norm": 9.508496284484863, + "learning_rate": 2.0244587671459264e-05, + "loss": 1.3303, + "step": 38460 + }, + { + "epoch": 6.357364180954348, + "grad_norm": 21.893522262573242, + "learning_rate": 2.0235406452560643e-05, + "loss": 1.3006, + "step": 38470 + }, + { + "epoch": 6.359016732080149, + "grad_norm": 10.605900764465332, + "learning_rate": 2.0226225233662022e-05, + "loss": 1.5355, + "step": 
38480 + }, + { + "epoch": 6.360669283205949, + "grad_norm": 43.09657669067383, + "learning_rate": 2.02170440147634e-05, + "loss": 1.4263, + "step": 38490 + }, + { + "epoch": 6.36232183433175, + "grad_norm": 11.316243171691895, + "learning_rate": 2.020786279586478e-05, + "loss": 1.435, + "step": 38500 + }, + { + "epoch": 6.36397438545755, + "grad_norm": 13.281632423400879, + "learning_rate": 2.0198681576966157e-05, + "loss": 1.496, + "step": 38510 + }, + { + "epoch": 6.36562693658335, + "grad_norm": 11.309965133666992, + "learning_rate": 2.0189500358067536e-05, + "loss": 1.4598, + "step": 38520 + }, + { + "epoch": 6.367279487709151, + "grad_norm": 13.029101371765137, + "learning_rate": 2.018031913916892e-05, + "loss": 1.5399, + "step": 38530 + }, + { + "epoch": 6.368932038834951, + "grad_norm": 112.3958740234375, + "learning_rate": 2.0171137920270298e-05, + "loss": 1.3838, + "step": 38540 + }, + { + "epoch": 6.370584589960752, + "grad_norm": 9.24577808380127, + "learning_rate": 2.0161956701371677e-05, + "loss": 1.3194, + "step": 38550 + }, + { + "epoch": 6.372237141086552, + "grad_norm": 14.375222206115723, + "learning_rate": 2.0152775482473056e-05, + "loss": 1.1996, + "step": 38560 + }, + { + "epoch": 6.3738896922123525, + "grad_norm": 19.481122970581055, + "learning_rate": 2.0143594263574432e-05, + "loss": 1.5943, + "step": 38570 + }, + { + "epoch": 6.3755422433381534, + "grad_norm": 13.95479679107666, + "learning_rate": 2.013441304467581e-05, + "loss": 1.4706, + "step": 38580 + }, + { + "epoch": 6.3771947944639535, + "grad_norm": 13.144954681396484, + "learning_rate": 2.012523182577719e-05, + "loss": 1.3673, + "step": 38590 + }, + { + "epoch": 6.3788473455897545, + "grad_norm": 12.10051155090332, + "learning_rate": 2.011605060687857e-05, + "loss": 1.2891, + "step": 38600 + }, + { + "epoch": 6.380499896715555, + "grad_norm": 27.080062866210938, + "learning_rate": 2.010686938797995e-05, + "loss": 1.4499, + "step": 38610 + }, + { + "epoch": 6.382152447841355, + "grad_norm": 14.484885215759277, + "learning_rate": 2.009768816908133e-05, + "loss": 1.3468, + "step": 38620 + }, + { + "epoch": 6.383804998967156, + "grad_norm": 10.011687278747559, + "learning_rate": 2.0088506950182708e-05, + "loss": 1.4775, + "step": 38630 + }, + { + "epoch": 6.385457550092956, + "grad_norm": 16.136798858642578, + "learning_rate": 2.0079325731284083e-05, + "loss": 1.4223, + "step": 38640 + }, + { + "epoch": 6.387110101218757, + "grad_norm": 14.336216926574707, + "learning_rate": 2.0070144512385466e-05, + "loss": 1.4514, + "step": 38650 + }, + { + "epoch": 6.388762652344557, + "grad_norm": 19.590391159057617, + "learning_rate": 2.0060963293486845e-05, + "loss": 1.5656, + "step": 38660 + }, + { + "epoch": 6.390415203470357, + "grad_norm": 12.93675422668457, + "learning_rate": 2.0051782074588225e-05, + "loss": 1.4404, + "step": 38670 + }, + { + "epoch": 6.392067754596158, + "grad_norm": 12.619584083557129, + "learning_rate": 2.0042600855689604e-05, + "loss": 1.4569, + "step": 38680 + }, + { + "epoch": 6.393720305721958, + "grad_norm": 9.457258224487305, + "learning_rate": 2.0033419636790983e-05, + "loss": 1.3339, + "step": 38690 + }, + { + "epoch": 6.395372856847759, + "grad_norm": 13.199435234069824, + "learning_rate": 2.002423841789236e-05, + "loss": 1.3885, + "step": 38700 + }, + { + "epoch": 6.397025407973559, + "grad_norm": 19.85384178161621, + "learning_rate": 2.0015057198993738e-05, + "loss": 1.5067, + "step": 38710 + }, + { + "epoch": 6.398677959099359, + "grad_norm": 9.798479080200195, + "learning_rate": 
2.0005875980095117e-05, + "loss": 1.4391, + "step": 38720 + }, + { + "epoch": 6.40033051022516, + "grad_norm": 14.563024520874023, + "learning_rate": 1.9996694761196497e-05, + "loss": 1.4709, + "step": 38730 + }, + { + "epoch": 6.40198306135096, + "grad_norm": 13.90085506439209, + "learning_rate": 1.9987513542297876e-05, + "loss": 1.5339, + "step": 38740 + }, + { + "epoch": 6.403635612476761, + "grad_norm": 7.516844272613525, + "learning_rate": 1.9978332323399255e-05, + "loss": 1.3254, + "step": 38750 + }, + { + "epoch": 6.405288163602561, + "grad_norm": 10.863202095031738, + "learning_rate": 1.9969151104500634e-05, + "loss": 1.3872, + "step": 38760 + }, + { + "epoch": 6.406940714728362, + "grad_norm": 21.591825485229492, + "learning_rate": 1.9959969885602014e-05, + "loss": 1.5489, + "step": 38770 + }, + { + "epoch": 6.4085932658541624, + "grad_norm": 17.203176498413086, + "learning_rate": 1.9950788666703393e-05, + "loss": 1.4613, + "step": 38780 + }, + { + "epoch": 6.4102458169799625, + "grad_norm": 13.360187530517578, + "learning_rate": 1.9941607447804772e-05, + "loss": 1.4316, + "step": 38790 + }, + { + "epoch": 6.4118983681057635, + "grad_norm": 30.37958526611328, + "learning_rate": 1.993242622890615e-05, + "loss": 1.4888, + "step": 38800 + }, + { + "epoch": 6.413550919231564, + "grad_norm": 21.938684463500977, + "learning_rate": 1.992324501000753e-05, + "loss": 1.5436, + "step": 38810 + }, + { + "epoch": 6.415203470357364, + "grad_norm": 9.949905395507812, + "learning_rate": 1.991406379110891e-05, + "loss": 1.45, + "step": 38820 + }, + { + "epoch": 6.416856021483165, + "grad_norm": 15.345038414001465, + "learning_rate": 1.9904882572210286e-05, + "loss": 1.4534, + "step": 38830 + }, + { + "epoch": 6.418508572608965, + "grad_norm": 11.959953308105469, + "learning_rate": 1.9895701353311665e-05, + "loss": 1.4728, + "step": 38840 + }, + { + "epoch": 6.420161123734766, + "grad_norm": 10.856972694396973, + "learning_rate": 1.9886520134413044e-05, + "loss": 1.4479, + "step": 38850 + }, + { + "epoch": 6.421813674860566, + "grad_norm": 10.778406143188477, + "learning_rate": 1.9877338915514424e-05, + "loss": 1.4299, + "step": 38860 + }, + { + "epoch": 6.423466225986367, + "grad_norm": 10.005019187927246, + "learning_rate": 1.9868157696615803e-05, + "loss": 1.3999, + "step": 38870 + }, + { + "epoch": 6.425118777112167, + "grad_norm": 8.668060302734375, + "learning_rate": 1.9858976477717185e-05, + "loss": 1.425, + "step": 38880 + }, + { + "epoch": 6.426771328237967, + "grad_norm": 13.997286796569824, + "learning_rate": 1.984979525881856e-05, + "loss": 1.3868, + "step": 38890 + }, + { + "epoch": 6.428423879363768, + "grad_norm": 15.211793899536133, + "learning_rate": 1.984061403991994e-05, + "loss": 1.5317, + "step": 38900 + }, + { + "epoch": 6.430076430489568, + "grad_norm": 7.7943034172058105, + "learning_rate": 1.983143282102132e-05, + "loss": 1.299, + "step": 38910 + }, + { + "epoch": 6.431728981615369, + "grad_norm": 7.966866493225098, + "learning_rate": 1.98222516021227e-05, + "loss": 1.3767, + "step": 38920 + }, + { + "epoch": 6.433381532741169, + "grad_norm": 29.43184471130371, + "learning_rate": 1.9813070383224078e-05, + "loss": 1.3973, + "step": 38930 + }, + { + "epoch": 6.435034083866969, + "grad_norm": 15.053828239440918, + "learning_rate": 1.9803889164325457e-05, + "loss": 1.4583, + "step": 38940 + }, + { + "epoch": 6.43668663499277, + "grad_norm": 6.739525318145752, + "learning_rate": 1.9794707945426837e-05, + "loss": 1.2457, + "step": 38950 + }, + { + "epoch": 6.43833918611857, + 
"grad_norm": 15.069477081298828, + "learning_rate": 1.9785526726528213e-05, + "loss": 1.3477, + "step": 38960 + }, + { + "epoch": 6.439991737244371, + "grad_norm": 14.493611335754395, + "learning_rate": 1.9776345507629592e-05, + "loss": 1.4433, + "step": 38970 + }, + { + "epoch": 6.4416442883701714, + "grad_norm": 9.453902244567871, + "learning_rate": 1.976716428873097e-05, + "loss": 1.5945, + "step": 38980 + }, + { + "epoch": 6.4432968394959715, + "grad_norm": 42.37503433227539, + "learning_rate": 1.9757983069832354e-05, + "loss": 1.4771, + "step": 38990 + }, + { + "epoch": 6.4449493906217725, + "grad_norm": 13.014341354370117, + "learning_rate": 1.9748801850933733e-05, + "loss": 1.4682, + "step": 39000 + }, + { + "epoch": 6.446601941747573, + "grad_norm": 13.779463768005371, + "learning_rate": 1.9739620632035112e-05, + "loss": 1.3348, + "step": 39010 + }, + { + "epoch": 6.448254492873374, + "grad_norm": 27.29125213623047, + "learning_rate": 1.9730439413136488e-05, + "loss": 1.3392, + "step": 39020 + }, + { + "epoch": 6.449907043999174, + "grad_norm": 23.10088539123535, + "learning_rate": 1.9721258194237867e-05, + "loss": 1.4222, + "step": 39030 + }, + { + "epoch": 6.451559595124974, + "grad_norm": 9.619447708129883, + "learning_rate": 1.9712076975339247e-05, + "loss": 1.4682, + "step": 39040 + }, + { + "epoch": 6.453212146250775, + "grad_norm": 12.745689392089844, + "learning_rate": 1.9702895756440626e-05, + "loss": 1.4813, + "step": 39050 + }, + { + "epoch": 6.454864697376575, + "grad_norm": 13.98746109008789, + "learning_rate": 1.9693714537542005e-05, + "loss": 1.4522, + "step": 39060 + }, + { + "epoch": 6.456517248502376, + "grad_norm": 11.557574272155762, + "learning_rate": 1.9684533318643384e-05, + "loss": 1.4221, + "step": 39070 + }, + { + "epoch": 6.458169799628176, + "grad_norm": 10.058650016784668, + "learning_rate": 1.9675352099744764e-05, + "loss": 1.5567, + "step": 39080 + }, + { + "epoch": 6.459822350753976, + "grad_norm": 13.266910552978516, + "learning_rate": 1.966617088084614e-05, + "loss": 1.5052, + "step": 39090 + }, + { + "epoch": 6.461474901879777, + "grad_norm": 13.270020484924316, + "learning_rate": 1.9656989661947522e-05, + "loss": 1.4501, + "step": 39100 + }, + { + "epoch": 6.463127453005577, + "grad_norm": 9.27397346496582, + "learning_rate": 1.96478084430489e-05, + "loss": 1.5234, + "step": 39110 + }, + { + "epoch": 6.464780004131378, + "grad_norm": 12.057477951049805, + "learning_rate": 1.963862722415028e-05, + "loss": 1.3752, + "step": 39120 + }, + { + "epoch": 6.466432555257178, + "grad_norm": 8.3319730758667, + "learning_rate": 1.962944600525166e-05, + "loss": 1.4345, + "step": 39130 + }, + { + "epoch": 6.468085106382979, + "grad_norm": 9.282336235046387, + "learning_rate": 1.962026478635304e-05, + "loss": 1.3447, + "step": 39140 + }, + { + "epoch": 6.469737657508779, + "grad_norm": 30.67858123779297, + "learning_rate": 1.9611083567454415e-05, + "loss": 1.4171, + "step": 39150 + }, + { + "epoch": 6.471390208634579, + "grad_norm": 13.620590209960938, + "learning_rate": 1.9601902348555794e-05, + "loss": 1.4623, + "step": 39160 + }, + { + "epoch": 6.47304275976038, + "grad_norm": 10.19704818725586, + "learning_rate": 1.9592721129657173e-05, + "loss": 1.4476, + "step": 39170 + }, + { + "epoch": 6.47469531088618, + "grad_norm": 8.967962265014648, + "learning_rate": 1.9583539910758553e-05, + "loss": 1.502, + "step": 39180 + }, + { + "epoch": 6.4763478620119805, + "grad_norm": 68.98345184326172, + "learning_rate": 1.9574358691859932e-05, + "loss": 1.4766, + "step": 
39190 + }, + { + "epoch": 6.4780004131377815, + "grad_norm": 13.379420280456543, + "learning_rate": 1.956517747296131e-05, + "loss": 1.4901, + "step": 39200 + }, + { + "epoch": 6.479652964263582, + "grad_norm": 10.130301475524902, + "learning_rate": 1.955599625406269e-05, + "loss": 1.4408, + "step": 39210 + }, + { + "epoch": 6.481305515389383, + "grad_norm": 9.031115531921387, + "learning_rate": 1.954681503516407e-05, + "loss": 1.4477, + "step": 39220 + }, + { + "epoch": 6.482958066515183, + "grad_norm": 10.593781471252441, + "learning_rate": 1.953763381626545e-05, + "loss": 1.3162, + "step": 39230 + }, + { + "epoch": 6.484610617640984, + "grad_norm": 10.19831657409668, + "learning_rate": 1.9528452597366828e-05, + "loss": 1.393, + "step": 39240 + }, + { + "epoch": 6.486263168766784, + "grad_norm": 28.23524284362793, + "learning_rate": 1.9519271378468207e-05, + "loss": 1.4287, + "step": 39250 + }, + { + "epoch": 6.487915719892584, + "grad_norm": 10.968947410583496, + "learning_rate": 1.9510090159569587e-05, + "loss": 1.4497, + "step": 39260 + }, + { + "epoch": 6.489568271018385, + "grad_norm": 11.46053409576416, + "learning_rate": 1.9500908940670966e-05, + "loss": 1.511, + "step": 39270 + }, + { + "epoch": 6.491220822144185, + "grad_norm": 11.82854175567627, + "learning_rate": 1.9491727721772342e-05, + "loss": 1.3231, + "step": 39280 + }, + { + "epoch": 6.492873373269986, + "grad_norm": 14.076804161071777, + "learning_rate": 1.948254650287372e-05, + "loss": 1.4442, + "step": 39290 + }, + { + "epoch": 6.494525924395786, + "grad_norm": 8.95998477935791, + "learning_rate": 1.94733652839751e-05, + "loss": 1.4034, + "step": 39300 + }, + { + "epoch": 6.496178475521586, + "grad_norm": 18.003774642944336, + "learning_rate": 1.946418406507648e-05, + "loss": 1.5485, + "step": 39310 + }, + { + "epoch": 6.497831026647387, + "grad_norm": 44.281982421875, + "learning_rate": 1.945500284617786e-05, + "loss": 1.4739, + "step": 39320 + }, + { + "epoch": 6.499483577773187, + "grad_norm": 8.51934814453125, + "learning_rate": 1.944582162727924e-05, + "loss": 1.5113, + "step": 39330 + }, + { + "epoch": 6.501136128898988, + "grad_norm": 12.045053482055664, + "learning_rate": 1.9436640408380617e-05, + "loss": 1.4032, + "step": 39340 + }, + { + "epoch": 6.502788680024788, + "grad_norm": 9.922347068786621, + "learning_rate": 1.9427459189481996e-05, + "loss": 1.3881, + "step": 39350 + }, + { + "epoch": 6.504441231150588, + "grad_norm": 17.480976104736328, + "learning_rate": 1.9418277970583376e-05, + "loss": 1.4731, + "step": 39360 + }, + { + "epoch": 6.506093782276389, + "grad_norm": 7.395370006561279, + "learning_rate": 1.9409096751684755e-05, + "loss": 1.3666, + "step": 39370 + }, + { + "epoch": 6.507746333402189, + "grad_norm": 23.064151763916016, + "learning_rate": 1.9399915532786134e-05, + "loss": 1.4598, + "step": 39380 + }, + { + "epoch": 6.50939888452799, + "grad_norm": 8.329906463623047, + "learning_rate": 1.9390734313887513e-05, + "loss": 1.367, + "step": 39390 + }, + { + "epoch": 6.5110514356537905, + "grad_norm": 8.619010925292969, + "learning_rate": 1.9381553094988893e-05, + "loss": 1.4067, + "step": 39400 + }, + { + "epoch": 6.512703986779591, + "grad_norm": 9.927018165588379, + "learning_rate": 1.937237187609027e-05, + "loss": 1.4338, + "step": 39410 + }, + { + "epoch": 6.514356537905392, + "grad_norm": 13.935831069946289, + "learning_rate": 1.9363190657191648e-05, + "loss": 1.4524, + "step": 39420 + }, + { + "epoch": 6.516009089031192, + "grad_norm": 8.87132453918457, + "learning_rate": 
1.9354009438293027e-05, + "loss": 1.3038, + "step": 39430 + }, + { + "epoch": 6.517661640156993, + "grad_norm": 9.636089324951172, + "learning_rate": 1.9344828219394406e-05, + "loss": 1.4682, + "step": 39440 + }, + { + "epoch": 6.519314191282793, + "grad_norm": 21.424053192138672, + "learning_rate": 1.933564700049579e-05, + "loss": 1.3425, + "step": 39450 + }, + { + "epoch": 6.520966742408593, + "grad_norm": 53.614646911621094, + "learning_rate": 1.9326465781597168e-05, + "loss": 1.4947, + "step": 39460 + }, + { + "epoch": 6.522619293534394, + "grad_norm": 8.673842430114746, + "learning_rate": 1.9317284562698544e-05, + "loss": 1.4588, + "step": 39470 + }, + { + "epoch": 6.524271844660194, + "grad_norm": 47.06062316894531, + "learning_rate": 1.9308103343799923e-05, + "loss": 1.2771, + "step": 39480 + }, + { + "epoch": 6.525924395785995, + "grad_norm": 11.706918716430664, + "learning_rate": 1.9298922124901303e-05, + "loss": 1.4553, + "step": 39490 + }, + { + "epoch": 6.527576946911795, + "grad_norm": 9.595712661743164, + "learning_rate": 1.9289740906002682e-05, + "loss": 1.4247, + "step": 39500 + }, + { + "epoch": 6.529229498037596, + "grad_norm": 9.61540412902832, + "learning_rate": 1.928055968710406e-05, + "loss": 1.3642, + "step": 39510 + }, + { + "epoch": 6.530882049163396, + "grad_norm": 34.72698974609375, + "learning_rate": 1.927137846820544e-05, + "loss": 1.4828, + "step": 39520 + }, + { + "epoch": 6.532534600289196, + "grad_norm": 14.31153678894043, + "learning_rate": 1.926219724930682e-05, + "loss": 1.5188, + "step": 39530 + }, + { + "epoch": 6.534187151414997, + "grad_norm": 12.134452819824219, + "learning_rate": 1.9253016030408195e-05, + "loss": 1.5148, + "step": 39540 + }, + { + "epoch": 6.535839702540797, + "grad_norm": 8.781564712524414, + "learning_rate": 1.9243834811509575e-05, + "loss": 1.5178, + "step": 39550 + }, + { + "epoch": 6.537492253666597, + "grad_norm": 16.55754852294922, + "learning_rate": 1.9234653592610957e-05, + "loss": 1.4356, + "step": 39560 + }, + { + "epoch": 6.539144804792398, + "grad_norm": 14.849857330322266, + "learning_rate": 1.9225472373712337e-05, + "loss": 1.405, + "step": 39570 + }, + { + "epoch": 6.540797355918198, + "grad_norm": 12.2333402633667, + "learning_rate": 1.9216291154813716e-05, + "loss": 1.4223, + "step": 39580 + }, + { + "epoch": 6.542449907043999, + "grad_norm": 8.129743576049805, + "learning_rate": 1.9207109935915095e-05, + "loss": 1.4793, + "step": 39590 + }, + { + "epoch": 6.5441024581697995, + "grad_norm": 10.34343433380127, + "learning_rate": 1.919792871701647e-05, + "loss": 1.4236, + "step": 39600 + }, + { + "epoch": 6.5457550092956005, + "grad_norm": 18.99115753173828, + "learning_rate": 1.918874749811785e-05, + "loss": 1.4893, + "step": 39610 + }, + { + "epoch": 6.547407560421401, + "grad_norm": 13.560120582580566, + "learning_rate": 1.917956627921923e-05, + "loss": 1.5145, + "step": 39620 + }, + { + "epoch": 6.549060111547201, + "grad_norm": 20.85279655456543, + "learning_rate": 1.917038506032061e-05, + "loss": 1.3886, + "step": 39630 + }, + { + "epoch": 6.550712662673002, + "grad_norm": 11.048171997070312, + "learning_rate": 1.9161203841421988e-05, + "loss": 1.4902, + "step": 39640 + }, + { + "epoch": 6.552365213798802, + "grad_norm": 11.270243644714355, + "learning_rate": 1.9152022622523367e-05, + "loss": 1.4862, + "step": 39650 + }, + { + "epoch": 6.554017764924602, + "grad_norm": 10.38618278503418, + "learning_rate": 1.9142841403624746e-05, + "loss": 1.478, + "step": 39660 + }, + { + "epoch": 6.555670316050403, + 
"grad_norm": 11.706856727600098, + "learning_rate": 1.9133660184726126e-05, + "loss": 1.4313, + "step": 39670 + }, + { + "epoch": 6.557322867176203, + "grad_norm": 19.566499710083008, + "learning_rate": 1.9124478965827505e-05, + "loss": 1.4815, + "step": 39680 + }, + { + "epoch": 6.558975418302004, + "grad_norm": 10.643657684326172, + "learning_rate": 1.9115297746928884e-05, + "loss": 1.2795, + "step": 39690 + }, + { + "epoch": 6.560627969427804, + "grad_norm": 12.18120288848877, + "learning_rate": 1.9106116528030263e-05, + "loss": 1.2629, + "step": 39700 + }, + { + "epoch": 6.562280520553605, + "grad_norm": 9.885913848876953, + "learning_rate": 1.9096935309131643e-05, + "loss": 1.4513, + "step": 39710 + }, + { + "epoch": 6.563933071679405, + "grad_norm": 7.771469593048096, + "learning_rate": 1.9087754090233022e-05, + "loss": 1.4502, + "step": 39720 + }, + { + "epoch": 6.565585622805205, + "grad_norm": 8.415877342224121, + "learning_rate": 1.9078572871334398e-05, + "loss": 1.5465, + "step": 39730 + }, + { + "epoch": 6.567238173931006, + "grad_norm": 30.650428771972656, + "learning_rate": 1.9069391652435777e-05, + "loss": 1.3623, + "step": 39740 + }, + { + "epoch": 6.568890725056806, + "grad_norm": 19.83210563659668, + "learning_rate": 1.9060210433537156e-05, + "loss": 1.3031, + "step": 39750 + }, + { + "epoch": 6.570543276182607, + "grad_norm": 11.888619422912598, + "learning_rate": 1.9051029214638535e-05, + "loss": 1.55, + "step": 39760 + }, + { + "epoch": 6.572195827308407, + "grad_norm": 8.220413208007812, + "learning_rate": 1.9041847995739915e-05, + "loss": 1.395, + "step": 39770 + }, + { + "epoch": 6.573848378434207, + "grad_norm": 10.484262466430664, + "learning_rate": 1.9032666776841294e-05, + "loss": 1.3441, + "step": 39780 + }, + { + "epoch": 6.575500929560008, + "grad_norm": 8.3544921875, + "learning_rate": 1.9023485557942673e-05, + "loss": 1.4837, + "step": 39790 + }, + { + "epoch": 6.5771534806858085, + "grad_norm": 10.449460983276367, + "learning_rate": 1.9014304339044052e-05, + "loss": 1.4106, + "step": 39800 + }, + { + "epoch": 6.5788060318116095, + "grad_norm": 9.053803443908691, + "learning_rate": 1.9005123120145432e-05, + "loss": 1.309, + "step": 39810 + }, + { + "epoch": 6.58045858293741, + "grad_norm": 26.58787727355957, + "learning_rate": 1.899594190124681e-05, + "loss": 1.4609, + "step": 39820 + }, + { + "epoch": 6.58211113406321, + "grad_norm": 12.057016372680664, + "learning_rate": 1.898676068234819e-05, + "loss": 1.3254, + "step": 39830 + }, + { + "epoch": 6.583763685189011, + "grad_norm": 7.667273998260498, + "learning_rate": 1.897757946344957e-05, + "loss": 1.4452, + "step": 39840 + }, + { + "epoch": 6.585416236314811, + "grad_norm": 12.83427619934082, + "learning_rate": 1.896839824455095e-05, + "loss": 1.4146, + "step": 39850 + }, + { + "epoch": 6.587068787440612, + "grad_norm": 13.283857345581055, + "learning_rate": 1.8959217025652325e-05, + "loss": 1.4297, + "step": 39860 + }, + { + "epoch": 6.588721338566412, + "grad_norm": 13.097709655761719, + "learning_rate": 1.8950035806753704e-05, + "loss": 1.443, + "step": 39870 + }, + { + "epoch": 6.590373889692213, + "grad_norm": 10.823288917541504, + "learning_rate": 1.8940854587855083e-05, + "loss": 1.4325, + "step": 39880 + }, + { + "epoch": 6.592026440818013, + "grad_norm": 13.02402400970459, + "learning_rate": 1.8931673368956462e-05, + "loss": 1.4653, + "step": 39890 + }, + { + "epoch": 6.593678991943813, + "grad_norm": 11.030945777893066, + "learning_rate": 1.8922492150057845e-05, + "loss": 1.2672, + "step": 
39900 + }, + { + "epoch": 6.595331543069614, + "grad_norm": 8.03734302520752, + "learning_rate": 1.8913310931159224e-05, + "loss": 1.3499, + "step": 39910 + }, + { + "epoch": 6.596984094195414, + "grad_norm": 12.094477653503418, + "learning_rate": 1.89041297122606e-05, + "loss": 1.4214, + "step": 39920 + }, + { + "epoch": 6.598636645321214, + "grad_norm": 10.550955772399902, + "learning_rate": 1.889494849336198e-05, + "loss": 1.4045, + "step": 39930 + }, + { + "epoch": 6.600289196447015, + "grad_norm": 10.761007308959961, + "learning_rate": 1.888576727446336e-05, + "loss": 1.5118, + "step": 39940 + }, + { + "epoch": 6.601941747572815, + "grad_norm": 19.835403442382812, + "learning_rate": 1.8876586055564738e-05, + "loss": 1.4687, + "step": 39950 + }, + { + "epoch": 6.603594298698616, + "grad_norm": 11.144922256469727, + "learning_rate": 1.8867404836666117e-05, + "loss": 1.4642, + "step": 39960 + }, + { + "epoch": 6.605246849824416, + "grad_norm": 12.04179573059082, + "learning_rate": 1.8858223617767496e-05, + "loss": 1.6522, + "step": 39970 + }, + { + "epoch": 6.606899400950217, + "grad_norm": 8.051116943359375, + "learning_rate": 1.8849042398868876e-05, + "loss": 1.4801, + "step": 39980 + }, + { + "epoch": 6.608551952076017, + "grad_norm": 13.712382316589355, + "learning_rate": 1.883986117997025e-05, + "loss": 1.4568, + "step": 39990 + }, + { + "epoch": 6.6102045032018175, + "grad_norm": 7.790043830871582, + "learning_rate": 1.883067996107163e-05, + "loss": 1.3776, + "step": 40000 + }, + { + "epoch": 6.6118570543276185, + "grad_norm": 16.25408172607422, + "learning_rate": 1.882149874217301e-05, + "loss": 1.3283, + "step": 40010 + }, + { + "epoch": 6.613509605453419, + "grad_norm": 6.5616774559021, + "learning_rate": 1.8812317523274393e-05, + "loss": 1.3177, + "step": 40020 + }, + { + "epoch": 6.615162156579219, + "grad_norm": 8.824763298034668, + "learning_rate": 1.8803136304375772e-05, + "loss": 1.3024, + "step": 40030 + }, + { + "epoch": 6.61681470770502, + "grad_norm": 14.049736976623535, + "learning_rate": 1.879395508547715e-05, + "loss": 1.3356, + "step": 40040 + }, + { + "epoch": 6.61846725883082, + "grad_norm": 16.632484436035156, + "learning_rate": 1.8784773866578527e-05, + "loss": 1.5723, + "step": 40050 + }, + { + "epoch": 6.620119809956621, + "grad_norm": 21.031719207763672, + "learning_rate": 1.8775592647679906e-05, + "loss": 1.2901, + "step": 40060 + }, + { + "epoch": 6.621772361082421, + "grad_norm": 15.199872016906738, + "learning_rate": 1.8766411428781285e-05, + "loss": 1.4082, + "step": 40070 + }, + { + "epoch": 6.623424912208222, + "grad_norm": 10.436044692993164, + "learning_rate": 1.8757230209882665e-05, + "loss": 1.4145, + "step": 40080 + }, + { + "epoch": 6.625077463334022, + "grad_norm": 8.306492805480957, + "learning_rate": 1.8748048990984044e-05, + "loss": 1.3091, + "step": 40090 + }, + { + "epoch": 6.626730014459822, + "grad_norm": 10.09487247467041, + "learning_rate": 1.8738867772085423e-05, + "loss": 1.3551, + "step": 40100 + }, + { + "epoch": 6.628382565585623, + "grad_norm": 13.294191360473633, + "learning_rate": 1.8729686553186802e-05, + "loss": 1.6028, + "step": 40110 + }, + { + "epoch": 6.630035116711423, + "grad_norm": 11.726201057434082, + "learning_rate": 1.8720505334288178e-05, + "loss": 1.4088, + "step": 40120 + }, + { + "epoch": 6.631687667837224, + "grad_norm": 9.069192886352539, + "learning_rate": 1.871132411538956e-05, + "loss": 1.4661, + "step": 40130 + }, + { + "epoch": 6.633340218963024, + "grad_norm": 14.839059829711914, + "learning_rate": 
1.870214289649094e-05, + "loss": 1.5112, + "step": 40140 + }, + { + "epoch": 6.634992770088824, + "grad_norm": 15.229697227478027, + "learning_rate": 1.869296167759232e-05, + "loss": 1.289, + "step": 40150 + }, + { + "epoch": 6.636645321214625, + "grad_norm": 9.121678352355957, + "learning_rate": 1.86837804586937e-05, + "loss": 1.5177, + "step": 40160 + }, + { + "epoch": 6.638297872340425, + "grad_norm": 9.134458541870117, + "learning_rate": 1.8674599239795078e-05, + "loss": 1.3569, + "step": 40170 + }, + { + "epoch": 6.639950423466226, + "grad_norm": 9.136309623718262, + "learning_rate": 1.8665418020896454e-05, + "loss": 1.4439, + "step": 40180 + }, + { + "epoch": 6.641602974592026, + "grad_norm": 13.449373245239258, + "learning_rate": 1.8656236801997833e-05, + "loss": 1.4349, + "step": 40190 + }, + { + "epoch": 6.6432555257178265, + "grad_norm": 14.80142879486084, + "learning_rate": 1.8647055583099212e-05, + "loss": 1.379, + "step": 40200 + }, + { + "epoch": 6.6449080768436275, + "grad_norm": 12.268270492553711, + "learning_rate": 1.863787436420059e-05, + "loss": 1.4978, + "step": 40210 + }, + { + "epoch": 6.646560627969428, + "grad_norm": 20.85672378540039, + "learning_rate": 1.862869314530197e-05, + "loss": 1.3803, + "step": 40220 + }, + { + "epoch": 6.648213179095229, + "grad_norm": 15.535070419311523, + "learning_rate": 1.861951192640335e-05, + "loss": 1.383, + "step": 40230 + }, + { + "epoch": 6.649865730221029, + "grad_norm": 9.89908504486084, + "learning_rate": 1.861033070750473e-05, + "loss": 1.4263, + "step": 40240 + }, + { + "epoch": 6.65151828134683, + "grad_norm": 15.000544548034668, + "learning_rate": 1.860114948860611e-05, + "loss": 1.5781, + "step": 40250 + }, + { + "epoch": 6.65317083247263, + "grad_norm": 11.549114227294922, + "learning_rate": 1.8591968269707488e-05, + "loss": 1.4254, + "step": 40260 + }, + { + "epoch": 6.65482338359843, + "grad_norm": 16.330419540405273, + "learning_rate": 1.8582787050808867e-05, + "loss": 1.4272, + "step": 40270 + }, + { + "epoch": 6.656475934724231, + "grad_norm": 12.878795623779297, + "learning_rate": 1.8573605831910246e-05, + "loss": 1.4654, + "step": 40280 + }, + { + "epoch": 6.658128485850031, + "grad_norm": 35.6964111328125, + "learning_rate": 1.8564424613011625e-05, + "loss": 1.3814, + "step": 40290 + }, + { + "epoch": 6.659781036975831, + "grad_norm": 14.159383773803711, + "learning_rate": 1.8555243394113005e-05, + "loss": 1.5742, + "step": 40300 + }, + { + "epoch": 6.661433588101632, + "grad_norm": 8.858052253723145, + "learning_rate": 1.854606217521438e-05, + "loss": 1.4659, + "step": 40310 + }, + { + "epoch": 6.663086139227432, + "grad_norm": 9.946170806884766, + "learning_rate": 1.853688095631576e-05, + "loss": 1.2612, + "step": 40320 + }, + { + "epoch": 6.664738690353233, + "grad_norm": 9.768632888793945, + "learning_rate": 1.852769973741714e-05, + "loss": 1.4816, + "step": 40330 + }, + { + "epoch": 6.666391241479033, + "grad_norm": 11.27400016784668, + "learning_rate": 1.8518518518518518e-05, + "loss": 1.466, + "step": 40340 + }, + { + "epoch": 6.668043792604834, + "grad_norm": 13.281896591186523, + "learning_rate": 1.8509337299619898e-05, + "loss": 1.356, + "step": 40350 + }, + { + "epoch": 6.669696343730634, + "grad_norm": 14.230932235717773, + "learning_rate": 1.850015608072128e-05, + "loss": 1.3526, + "step": 40360 + }, + { + "epoch": 6.671348894856434, + "grad_norm": 9.921380043029785, + "learning_rate": 1.8490974861822656e-05, + "loss": 1.4302, + "step": 40370 + }, + { + "epoch": 6.673001445982235, + "grad_norm": 
13.660683631896973, + "learning_rate": 1.8481793642924035e-05, + "loss": 1.3974, + "step": 40380 + }, + { + "epoch": 6.674653997108035, + "grad_norm": 15.629735946655273, + "learning_rate": 1.8472612424025415e-05, + "loss": 1.4577, + "step": 40390 + }, + { + "epoch": 6.6763065482338355, + "grad_norm": 10.573319435119629, + "learning_rate": 1.8463431205126794e-05, + "loss": 1.425, + "step": 40400 + }, + { + "epoch": 6.6779590993596365, + "grad_norm": 9.454117774963379, + "learning_rate": 1.8454249986228173e-05, + "loss": 1.374, + "step": 40410 + }, + { + "epoch": 6.679611650485437, + "grad_norm": 21.72432518005371, + "learning_rate": 1.8445068767329552e-05, + "loss": 1.499, + "step": 40420 + }, + { + "epoch": 6.681264201611238, + "grad_norm": 10.876113891601562, + "learning_rate": 1.843588754843093e-05, + "loss": 1.5718, + "step": 40430 + }, + { + "epoch": 6.682916752737038, + "grad_norm": 21.28350067138672, + "learning_rate": 1.8426706329532307e-05, + "loss": 1.399, + "step": 40440 + }, + { + "epoch": 6.684569303862839, + "grad_norm": 10.992979049682617, + "learning_rate": 1.8417525110633687e-05, + "loss": 1.3749, + "step": 40450 + }, + { + "epoch": 6.686221854988639, + "grad_norm": 11.09716510772705, + "learning_rate": 1.8408343891735066e-05, + "loss": 1.3887, + "step": 40460 + }, + { + "epoch": 6.687874406114439, + "grad_norm": 14.070152282714844, + "learning_rate": 1.839916267283645e-05, + "loss": 1.541, + "step": 40470 + }, + { + "epoch": 6.68952695724024, + "grad_norm": 16.888887405395508, + "learning_rate": 1.8389981453937828e-05, + "loss": 1.4876, + "step": 40480 + }, + { + "epoch": 6.69117950836604, + "grad_norm": 13.392816543579102, + "learning_rate": 1.8380800235039207e-05, + "loss": 1.4925, + "step": 40490 + }, + { + "epoch": 6.692832059491841, + "grad_norm": 12.64699649810791, + "learning_rate": 1.8371619016140583e-05, + "loss": 1.4564, + "step": 40500 + }, + { + "epoch": 6.694484610617641, + "grad_norm": 48.2119026184082, + "learning_rate": 1.8362437797241962e-05, + "loss": 1.4863, + "step": 40510 + }, + { + "epoch": 6.696137161743441, + "grad_norm": 10.135701179504395, + "learning_rate": 1.835325657834334e-05, + "loss": 1.4801, + "step": 40520 + }, + { + "epoch": 6.697789712869242, + "grad_norm": 13.071991920471191, + "learning_rate": 1.834407535944472e-05, + "loss": 1.5195, + "step": 40530 + }, + { + "epoch": 6.699442263995042, + "grad_norm": 10.40569019317627, + "learning_rate": 1.83348941405461e-05, + "loss": 1.4392, + "step": 40540 + }, + { + "epoch": 6.701094815120843, + "grad_norm": 8.229533195495605, + "learning_rate": 1.832571292164748e-05, + "loss": 1.4115, + "step": 40550 + }, + { + "epoch": 6.702747366246643, + "grad_norm": 10.888386726379395, + "learning_rate": 1.831653170274886e-05, + "loss": 1.3083, + "step": 40560 + }, + { + "epoch": 6.704399917372443, + "grad_norm": 10.805375099182129, + "learning_rate": 1.8307350483850234e-05, + "loss": 1.4312, + "step": 40570 + }, + { + "epoch": 6.706052468498244, + "grad_norm": 21.973468780517578, + "learning_rate": 1.8298169264951613e-05, + "loss": 1.4178, + "step": 40580 + }, + { + "epoch": 6.707705019624044, + "grad_norm": 12.331151008605957, + "learning_rate": 1.8288988046052996e-05, + "loss": 1.361, + "step": 40590 + }, + { + "epoch": 6.709357570749845, + "grad_norm": 17.735149383544922, + "learning_rate": 1.8279806827154375e-05, + "loss": 1.5563, + "step": 40600 + }, + { + "epoch": 6.7110101218756455, + "grad_norm": 9.35239028930664, + "learning_rate": 1.8270625608255755e-05, + "loss": 1.2544, + "step": 40610 + }, + { 
+ "epoch": 6.7126626730014465, + "grad_norm": 10.340348243713379, + "learning_rate": 1.8261444389357134e-05, + "loss": 1.5049, + "step": 40620 + }, + { + "epoch": 6.714315224127247, + "grad_norm": 7.2036871910095215, + "learning_rate": 1.825226317045851e-05, + "loss": 1.3728, + "step": 40630 + }, + { + "epoch": 6.715967775253047, + "grad_norm": 21.527620315551758, + "learning_rate": 1.824308195155989e-05, + "loss": 1.4878, + "step": 40640 + }, + { + "epoch": 6.717620326378848, + "grad_norm": 16.127460479736328, + "learning_rate": 1.8233900732661268e-05, + "loss": 1.4064, + "step": 40650 + }, + { + "epoch": 6.719272877504648, + "grad_norm": 27.192962646484375, + "learning_rate": 1.8224719513762647e-05, + "loss": 1.3923, + "step": 40660 + }, + { + "epoch": 6.720925428630448, + "grad_norm": 13.89322280883789, + "learning_rate": 1.8215538294864027e-05, + "loss": 1.4246, + "step": 40670 + }, + { + "epoch": 6.722577979756249, + "grad_norm": 12.22103214263916, + "learning_rate": 1.8206357075965406e-05, + "loss": 1.3845, + "step": 40680 + }, + { + "epoch": 6.724230530882049, + "grad_norm": 10.441896438598633, + "learning_rate": 1.8197175857066785e-05, + "loss": 1.6125, + "step": 40690 + }, + { + "epoch": 6.72588308200785, + "grad_norm": 10.106922149658203, + "learning_rate": 1.8187994638168164e-05, + "loss": 1.3981, + "step": 40700 + }, + { + "epoch": 6.72753563313365, + "grad_norm": 10.453673362731934, + "learning_rate": 1.8178813419269544e-05, + "loss": 1.4177, + "step": 40710 + }, + { + "epoch": 6.729188184259451, + "grad_norm": 14.970731735229492, + "learning_rate": 1.8169632200370923e-05, + "loss": 1.4991, + "step": 40720 + }, + { + "epoch": 6.730840735385251, + "grad_norm": 8.335892677307129, + "learning_rate": 1.8160450981472302e-05, + "loss": 1.354, + "step": 40730 + }, + { + "epoch": 6.732493286511051, + "grad_norm": 15.514796257019043, + "learning_rate": 1.815126976257368e-05, + "loss": 1.4697, + "step": 40740 + }, + { + "epoch": 6.734145837636852, + "grad_norm": 11.07535457611084, + "learning_rate": 1.814208854367506e-05, + "loss": 1.4608, + "step": 40750 + }, + { + "epoch": 6.735798388762652, + "grad_norm": 14.842573165893555, + "learning_rate": 1.8132907324776437e-05, + "loss": 1.4753, + "step": 40760 + }, + { + "epoch": 6.737450939888452, + "grad_norm": 9.721745491027832, + "learning_rate": 1.8123726105877816e-05, + "loss": 1.5479, + "step": 40770 + }, + { + "epoch": 6.739103491014253, + "grad_norm": 8.41561222076416, + "learning_rate": 1.8114544886979195e-05, + "loss": 1.3348, + "step": 40780 + }, + { + "epoch": 6.740756042140053, + "grad_norm": 9.427742004394531, + "learning_rate": 1.8105363668080574e-05, + "loss": 1.4775, + "step": 40790 + }, + { + "epoch": 6.742408593265854, + "grad_norm": 11.878392219543457, + "learning_rate": 1.8096182449181954e-05, + "loss": 1.492, + "step": 40800 + }, + { + "epoch": 6.7440611443916545, + "grad_norm": 14.520692825317383, + "learning_rate": 1.8087001230283333e-05, + "loss": 1.5121, + "step": 40810 + }, + { + "epoch": 6.7457136955174555, + "grad_norm": 5.713458061218262, + "learning_rate": 1.8077820011384712e-05, + "loss": 1.4465, + "step": 40820 + }, + { + "epoch": 6.747366246643256, + "grad_norm": 11.001376152038574, + "learning_rate": 1.806863879248609e-05, + "loss": 1.3831, + "step": 40830 + }, + { + "epoch": 6.749018797769056, + "grad_norm": 11.784494400024414, + "learning_rate": 1.805945757358747e-05, + "loss": 1.4752, + "step": 40840 + }, + { + "epoch": 6.750671348894857, + "grad_norm": 16.32159996032715, + "learning_rate": 
1.805027635468885e-05, + "loss": 1.3969, + "step": 40850 + }, + { + "epoch": 6.752323900020657, + "grad_norm": 8.51651382446289, + "learning_rate": 1.804109513579023e-05, + "loss": 1.4283, + "step": 40860 + }, + { + "epoch": 6.753976451146458, + "grad_norm": 17.39286231994629, + "learning_rate": 1.8031913916891608e-05, + "loss": 1.482, + "step": 40870 + }, + { + "epoch": 6.755629002272258, + "grad_norm": 19.771865844726562, + "learning_rate": 1.8022732697992987e-05, + "loss": 1.3334, + "step": 40880 + }, + { + "epoch": 6.757281553398058, + "grad_norm": 8.667237281799316, + "learning_rate": 1.8013551479094363e-05, + "loss": 1.3109, + "step": 40890 + }, + { + "epoch": 6.758934104523859, + "grad_norm": 16.53960609436035, + "learning_rate": 1.8004370260195743e-05, + "loss": 1.4221, + "step": 40900 + }, + { + "epoch": 6.760586655649659, + "grad_norm": 18.91925048828125, + "learning_rate": 1.7995189041297122e-05, + "loss": 1.376, + "step": 40910 + }, + { + "epoch": 6.76223920677546, + "grad_norm": 14.152482986450195, + "learning_rate": 1.79860078223985e-05, + "loss": 1.4648, + "step": 40920 + }, + { + "epoch": 6.76389175790126, + "grad_norm": 10.383070945739746, + "learning_rate": 1.7976826603499884e-05, + "loss": 1.3391, + "step": 40930 + }, + { + "epoch": 6.76554430902706, + "grad_norm": 15.106592178344727, + "learning_rate": 1.7967645384601263e-05, + "loss": 1.4528, + "step": 40940 + }, + { + "epoch": 6.767196860152861, + "grad_norm": 13.787833213806152, + "learning_rate": 1.795846416570264e-05, + "loss": 1.3813, + "step": 40950 + }, + { + "epoch": 6.768849411278661, + "grad_norm": 6.263724327087402, + "learning_rate": 1.7949282946804018e-05, + "loss": 1.5428, + "step": 40960 + }, + { + "epoch": 6.770501962404462, + "grad_norm": 10.366127014160156, + "learning_rate": 1.7940101727905397e-05, + "loss": 1.4737, + "step": 40970 + }, + { + "epoch": 6.772154513530262, + "grad_norm": 27.11163902282715, + "learning_rate": 1.7930920509006777e-05, + "loss": 1.4403, + "step": 40980 + }, + { + "epoch": 6.773807064656062, + "grad_norm": 10.552127838134766, + "learning_rate": 1.7921739290108156e-05, + "loss": 1.4511, + "step": 40990 + }, + { + "epoch": 6.775459615781863, + "grad_norm": 17.98050308227539, + "learning_rate": 1.7912558071209535e-05, + "loss": 1.5128, + "step": 41000 + }, + { + "epoch": 6.7771121669076635, + "grad_norm": 9.192895889282227, + "learning_rate": 1.7903376852310914e-05, + "loss": 1.334, + "step": 41010 + }, + { + "epoch": 6.7787647180334645, + "grad_norm": 11.638321876525879, + "learning_rate": 1.789419563341229e-05, + "loss": 1.3764, + "step": 41020 + }, + { + "epoch": 6.780417269159265, + "grad_norm": 12.00686264038086, + "learning_rate": 1.788501441451367e-05, + "loss": 1.4813, + "step": 41030 + }, + { + "epoch": 6.782069820285065, + "grad_norm": 11.396140098571777, + "learning_rate": 1.7875833195615052e-05, + "loss": 1.438, + "step": 41040 + }, + { + "epoch": 6.783722371410866, + "grad_norm": 55.01451873779297, + "learning_rate": 1.786665197671643e-05, + "loss": 1.4997, + "step": 41050 + }, + { + "epoch": 6.785374922536666, + "grad_norm": 9.244829177856445, + "learning_rate": 1.785747075781781e-05, + "loss": 1.3627, + "step": 41060 + }, + { + "epoch": 6.787027473662467, + "grad_norm": 9.327887535095215, + "learning_rate": 1.784828953891919e-05, + "loss": 1.3883, + "step": 41070 + }, + { + "epoch": 6.788680024788267, + "grad_norm": 9.212366104125977, + "learning_rate": 1.7839108320020566e-05, + "loss": 1.4265, + "step": 41080 + }, + { + "epoch": 6.790332575914068, + "grad_norm": 
41.623443603515625, + "learning_rate": 1.7829927101121945e-05, + "loss": 1.4342, + "step": 41090 + }, + { + "epoch": 6.791985127039868, + "grad_norm": 10.537240982055664, + "learning_rate": 1.7820745882223324e-05, + "loss": 1.5631, + "step": 41100 + }, + { + "epoch": 6.793637678165668, + "grad_norm": 14.747405052185059, + "learning_rate": 1.7811564663324703e-05, + "loss": 1.4803, + "step": 41110 + }, + { + "epoch": 6.795290229291469, + "grad_norm": 10.192240715026855, + "learning_rate": 1.7802383444426083e-05, + "loss": 1.3918, + "step": 41120 + }, + { + "epoch": 6.796942780417269, + "grad_norm": 9.440613746643066, + "learning_rate": 1.7793202225527462e-05, + "loss": 1.4808, + "step": 41130 + }, + { + "epoch": 6.798595331543069, + "grad_norm": 14.262847900390625, + "learning_rate": 1.778402100662884e-05, + "loss": 1.3783, + "step": 41140 + }, + { + "epoch": 6.80024788266887, + "grad_norm": 10.99431324005127, + "learning_rate": 1.7774839787730217e-05, + "loss": 1.4737, + "step": 41150 + }, + { + "epoch": 6.80190043379467, + "grad_norm": 8.806312561035156, + "learning_rate": 1.77656585688316e-05, + "loss": 1.3779, + "step": 41160 + }, + { + "epoch": 6.803552984920471, + "grad_norm": 39.46382141113281, + "learning_rate": 1.775647734993298e-05, + "loss": 1.4395, + "step": 41170 + }, + { + "epoch": 6.805205536046271, + "grad_norm": 14.97314453125, + "learning_rate": 1.7747296131034358e-05, + "loss": 1.4276, + "step": 41180 + }, + { + "epoch": 6.806858087172072, + "grad_norm": 9.130488395690918, + "learning_rate": 1.7738114912135737e-05, + "loss": 1.3061, + "step": 41190 + }, + { + "epoch": 6.808510638297872, + "grad_norm": 9.451000213623047, + "learning_rate": 1.7728933693237117e-05, + "loss": 1.5621, + "step": 41200 + }, + { + "epoch": 6.8101631894236725, + "grad_norm": 10.108766555786133, + "learning_rate": 1.7719752474338492e-05, + "loss": 1.387, + "step": 41210 + }, + { + "epoch": 6.8118157405494735, + "grad_norm": 7.01558780670166, + "learning_rate": 1.7710571255439872e-05, + "loss": 1.3172, + "step": 41220 + }, + { + "epoch": 6.813468291675274, + "grad_norm": 10.980785369873047, + "learning_rate": 1.770139003654125e-05, + "loss": 1.4921, + "step": 41230 + }, + { + "epoch": 6.815120842801074, + "grad_norm": 12.635100364685059, + "learning_rate": 1.769220881764263e-05, + "loss": 1.5118, + "step": 41240 + }, + { + "epoch": 6.816773393926875, + "grad_norm": 21.64072036743164, + "learning_rate": 1.768302759874401e-05, + "loss": 1.3484, + "step": 41250 + }, + { + "epoch": 6.818425945052675, + "grad_norm": 10.687591552734375, + "learning_rate": 1.767384637984539e-05, + "loss": 1.3822, + "step": 41260 + }, + { + "epoch": 6.820078496178476, + "grad_norm": 31.336519241333008, + "learning_rate": 1.7664665160946768e-05, + "loss": 1.4516, + "step": 41270 + }, + { + "epoch": 6.821731047304276, + "grad_norm": 14.67437744140625, + "learning_rate": 1.7655483942048147e-05, + "loss": 1.4765, + "step": 41280 + }, + { + "epoch": 6.823383598430077, + "grad_norm": 18.006410598754883, + "learning_rate": 1.7646302723149526e-05, + "loss": 1.3861, + "step": 41290 + }, + { + "epoch": 6.825036149555877, + "grad_norm": 10.844664573669434, + "learning_rate": 1.7637121504250906e-05, + "loss": 1.4537, + "step": 41300 + }, + { + "epoch": 6.826688700681677, + "grad_norm": 11.941879272460938, + "learning_rate": 1.7627940285352285e-05, + "loss": 1.4432, + "step": 41310 + }, + { + "epoch": 6.828341251807478, + "grad_norm": 16.71599769592285, + "learning_rate": 1.7618759066453664e-05, + "loss": 1.4506, + "step": 41320 + }, + { 
+ "epoch": 6.829993802933278, + "grad_norm": 11.480480194091797, + "learning_rate": 1.7609577847555043e-05, + "loss": 1.5141, + "step": 41330 + }, + { + "epoch": 6.831646354059079, + "grad_norm": 13.7313814163208, + "learning_rate": 1.760039662865642e-05, + "loss": 1.3246, + "step": 41340 + }, + { + "epoch": 6.833298905184879, + "grad_norm": 6.67985725402832, + "learning_rate": 1.75912154097578e-05, + "loss": 1.5389, + "step": 41350 + }, + { + "epoch": 6.834951456310679, + "grad_norm": 35.11752700805664, + "learning_rate": 1.7582034190859178e-05, + "loss": 1.3679, + "step": 41360 + }, + { + "epoch": 6.83660400743648, + "grad_norm": 16.82154083251953, + "learning_rate": 1.7572852971960557e-05, + "loss": 1.5087, + "step": 41370 + }, + { + "epoch": 6.83825655856228, + "grad_norm": 11.635119438171387, + "learning_rate": 1.7563671753061936e-05, + "loss": 1.5188, + "step": 41380 + }, + { + "epoch": 6.839909109688081, + "grad_norm": 9.211424827575684, + "learning_rate": 1.755449053416332e-05, + "loss": 1.3765, + "step": 41390 + }, + { + "epoch": 6.841561660813881, + "grad_norm": 7.994271755218506, + "learning_rate": 1.7545309315264695e-05, + "loss": 1.4092, + "step": 41400 + }, + { + "epoch": 6.8432142119396815, + "grad_norm": 9.016519546508789, + "learning_rate": 1.7536128096366074e-05, + "loss": 1.5425, + "step": 41410 + }, + { + "epoch": 6.8448667630654825, + "grad_norm": 15.688495635986328, + "learning_rate": 1.7526946877467453e-05, + "loss": 1.4526, + "step": 41420 + }, + { + "epoch": 6.846519314191283, + "grad_norm": 11.424908638000488, + "learning_rate": 1.7517765658568833e-05, + "loss": 1.4075, + "step": 41430 + }, + { + "epoch": 6.8481718653170836, + "grad_norm": 11.070535659790039, + "learning_rate": 1.7508584439670212e-05, + "loss": 1.3919, + "step": 41440 + }, + { + "epoch": 6.849824416442884, + "grad_norm": 7.652846336364746, + "learning_rate": 1.749940322077159e-05, + "loss": 1.328, + "step": 41450 + }, + { + "epoch": 6.851476967568685, + "grad_norm": 9.631972312927246, + "learning_rate": 1.749022200187297e-05, + "loss": 1.4173, + "step": 41460 + }, + { + "epoch": 6.853129518694485, + "grad_norm": 41.709320068359375, + "learning_rate": 1.7481040782974346e-05, + "loss": 1.5274, + "step": 41470 + }, + { + "epoch": 6.854782069820285, + "grad_norm": 11.905040740966797, + "learning_rate": 1.7471859564075725e-05, + "loss": 1.4682, + "step": 41480 + }, + { + "epoch": 6.856434620946086, + "grad_norm": 20.44754981994629, + "learning_rate": 1.7462678345177105e-05, + "loss": 1.553, + "step": 41490 + }, + { + "epoch": 6.858087172071886, + "grad_norm": 8.729151725769043, + "learning_rate": 1.7453497126278487e-05, + "loss": 1.4357, + "step": 41500 + }, + { + "epoch": 6.859739723197686, + "grad_norm": 13.73019027709961, + "learning_rate": 1.7444315907379867e-05, + "loss": 1.4612, + "step": 41510 + }, + { + "epoch": 6.861392274323487, + "grad_norm": 9.946063995361328, + "learning_rate": 1.7435134688481246e-05, + "loss": 1.4276, + "step": 41520 + }, + { + "epoch": 6.863044825449287, + "grad_norm": 17.463390350341797, + "learning_rate": 1.742595346958262e-05, + "loss": 1.2973, + "step": 41530 + }, + { + "epoch": 6.864697376575088, + "grad_norm": 25.70206642150879, + "learning_rate": 1.7416772250684e-05, + "loss": 1.5196, + "step": 41540 + }, + { + "epoch": 6.866349927700888, + "grad_norm": 9.276856422424316, + "learning_rate": 1.740759103178538e-05, + "loss": 1.5073, + "step": 41550 + }, + { + "epoch": 6.868002478826689, + "grad_norm": 9.336898803710938, + "learning_rate": 1.739840981288676e-05, + 
"loss": 1.4767, + "step": 41560 + }, + { + "epoch": 6.869655029952489, + "grad_norm": 10.633883476257324, + "learning_rate": 1.738922859398814e-05, + "loss": 1.532, + "step": 41570 + }, + { + "epoch": 6.871307581078289, + "grad_norm": 9.510002136230469, + "learning_rate": 1.7380047375089518e-05, + "loss": 1.4982, + "step": 41580 + }, + { + "epoch": 6.87296013220409, + "grad_norm": 9.511191368103027, + "learning_rate": 1.7370866156190897e-05, + "loss": 1.4293, + "step": 41590 + }, + { + "epoch": 6.87461268332989, + "grad_norm": 12.487457275390625, + "learning_rate": 1.7361684937292273e-05, + "loss": 1.5126, + "step": 41600 + }, + { + "epoch": 6.8762652344556905, + "grad_norm": 11.388182640075684, + "learning_rate": 1.7352503718393652e-05, + "loss": 1.387, + "step": 41610 + }, + { + "epoch": 6.8779177855814915, + "grad_norm": 12.775744438171387, + "learning_rate": 1.7343322499495035e-05, + "loss": 1.5719, + "step": 41620 + }, + { + "epoch": 6.879570336707292, + "grad_norm": 9.344951629638672, + "learning_rate": 1.7334141280596414e-05, + "loss": 1.4063, + "step": 41630 + }, + { + "epoch": 6.8812228878330925, + "grad_norm": 13.491334915161133, + "learning_rate": 1.7324960061697793e-05, + "loss": 1.2582, + "step": 41640 + }, + { + "epoch": 6.882875438958893, + "grad_norm": 78.28813934326172, + "learning_rate": 1.7315778842799173e-05, + "loss": 1.5862, + "step": 41650 + }, + { + "epoch": 6.884527990084694, + "grad_norm": 9.13769245147705, + "learning_rate": 1.730659762390055e-05, + "loss": 1.4646, + "step": 41660 + }, + { + "epoch": 6.886180541210494, + "grad_norm": 7.546828269958496, + "learning_rate": 1.7297416405001928e-05, + "loss": 1.4563, + "step": 41670 + }, + { + "epoch": 6.887833092336294, + "grad_norm": 8.716988563537598, + "learning_rate": 1.7288235186103307e-05, + "loss": 1.4902, + "step": 41680 + }, + { + "epoch": 6.889485643462095, + "grad_norm": 11.223580360412598, + "learning_rate": 1.7279053967204686e-05, + "loss": 1.3886, + "step": 41690 + }, + { + "epoch": 6.891138194587895, + "grad_norm": 9.644621849060059, + "learning_rate": 1.7269872748306065e-05, + "loss": 1.398, + "step": 41700 + }, + { + "epoch": 6.892790745713696, + "grad_norm": 8.582539558410645, + "learning_rate": 1.7260691529407445e-05, + "loss": 1.3557, + "step": 41710 + }, + { + "epoch": 6.894443296839496, + "grad_norm": 31.303890228271484, + "learning_rate": 1.7251510310508824e-05, + "loss": 1.4995, + "step": 41720 + }, + { + "epoch": 6.896095847965296, + "grad_norm": 9.271944046020508, + "learning_rate": 1.7242329091610203e-05, + "loss": 1.2297, + "step": 41730 + }, + { + "epoch": 6.897748399091097, + "grad_norm": 12.003678321838379, + "learning_rate": 1.7233147872711582e-05, + "loss": 1.3033, + "step": 41740 + }, + { + "epoch": 6.899400950216897, + "grad_norm": 12.872206687927246, + "learning_rate": 1.7223966653812962e-05, + "loss": 1.5084, + "step": 41750 + }, + { + "epoch": 6.901053501342698, + "grad_norm": 12.925690650939941, + "learning_rate": 1.721478543491434e-05, + "loss": 1.476, + "step": 41760 + }, + { + "epoch": 6.902706052468498, + "grad_norm": 16.326433181762695, + "learning_rate": 1.720560421601572e-05, + "loss": 1.4717, + "step": 41770 + }, + { + "epoch": 6.904358603594298, + "grad_norm": 10.100279808044434, + "learning_rate": 1.71964229971171e-05, + "loss": 1.5529, + "step": 41780 + }, + { + "epoch": 6.906011154720099, + "grad_norm": 13.078716278076172, + "learning_rate": 1.7187241778218475e-05, + "loss": 1.4674, + "step": 41790 + }, + { + "epoch": 6.907663705845899, + "grad_norm": 
11.755579948425293, + "learning_rate": 1.7178060559319855e-05, + "loss": 1.3926, + "step": 41800 + }, + { + "epoch": 6.9093162569717, + "grad_norm": 11.553741455078125, + "learning_rate": 1.7168879340421234e-05, + "loss": 1.352, + "step": 41810 + }, + { + "epoch": 6.9109688080975005, + "grad_norm": 12.865326881408691, + "learning_rate": 1.7159698121522613e-05, + "loss": 1.3908, + "step": 41820 + }, + { + "epoch": 6.9126213592233015, + "grad_norm": 9.914249420166016, + "learning_rate": 1.7150516902623992e-05, + "loss": 1.3497, + "step": 41830 + }, + { + "epoch": 6.9142739103491015, + "grad_norm": 6.659285068511963, + "learning_rate": 1.7141335683725375e-05, + "loss": 1.3643, + "step": 41840 + }, + { + "epoch": 6.915926461474902, + "grad_norm": 9.710965156555176, + "learning_rate": 1.713215446482675e-05, + "loss": 1.4596, + "step": 41850 + }, + { + "epoch": 6.917579012600703, + "grad_norm": 9.427946090698242, + "learning_rate": 1.712297324592813e-05, + "loss": 1.4356, + "step": 41860 + }, + { + "epoch": 6.919231563726503, + "grad_norm": 10.323373794555664, + "learning_rate": 1.711379202702951e-05, + "loss": 1.4465, + "step": 41870 + }, + { + "epoch": 6.920884114852303, + "grad_norm": 19.84877586364746, + "learning_rate": 1.710461080813089e-05, + "loss": 1.4025, + "step": 41880 + }, + { + "epoch": 6.922536665978104, + "grad_norm": 9.801685333251953, + "learning_rate": 1.7095429589232268e-05, + "loss": 1.3275, + "step": 41890 + }, + { + "epoch": 6.924189217103904, + "grad_norm": 43.13996887207031, + "learning_rate": 1.7086248370333647e-05, + "loss": 1.4675, + "step": 41900 + }, + { + "epoch": 6.925841768229705, + "grad_norm": 37.639007568359375, + "learning_rate": 1.7077067151435026e-05, + "loss": 1.2324, + "step": 41910 + }, + { + "epoch": 6.927494319355505, + "grad_norm": 12.643664360046387, + "learning_rate": 1.7067885932536402e-05, + "loss": 1.3959, + "step": 41920 + }, + { + "epoch": 6.929146870481306, + "grad_norm": 30.085914611816406, + "learning_rate": 1.705870471363778e-05, + "loss": 1.3784, + "step": 41930 + }, + { + "epoch": 6.930799421607106, + "grad_norm": 17.809532165527344, + "learning_rate": 1.704952349473916e-05, + "loss": 1.4568, + "step": 41940 + }, + { + "epoch": 6.932451972732906, + "grad_norm": 9.029561042785645, + "learning_rate": 1.704034227584054e-05, + "loss": 1.3388, + "step": 41950 + }, + { + "epoch": 6.934104523858707, + "grad_norm": 12.235958099365234, + "learning_rate": 1.7031161056941923e-05, + "loss": 1.4447, + "step": 41960 + }, + { + "epoch": 6.935757074984507, + "grad_norm": 12.634254455566406, + "learning_rate": 1.7021979838043302e-05, + "loss": 1.3982, + "step": 41970 + }, + { + "epoch": 6.937409626110307, + "grad_norm": 15.774003982543945, + "learning_rate": 1.7012798619144678e-05, + "loss": 1.3463, + "step": 41980 + }, + { + "epoch": 6.939062177236108, + "grad_norm": 8.786571502685547, + "learning_rate": 1.7003617400246057e-05, + "loss": 1.4888, + "step": 41990 + }, + { + "epoch": 6.940714728361908, + "grad_norm": 20.433345794677734, + "learning_rate": 1.6994436181347436e-05, + "loss": 1.3355, + "step": 42000 + }, + { + "epoch": 6.942367279487709, + "grad_norm": 7.788454055786133, + "learning_rate": 1.6985254962448815e-05, + "loss": 1.3896, + "step": 42010 + }, + { + "epoch": 6.9440198306135095, + "grad_norm": 7.458805084228516, + "learning_rate": 1.6976073743550195e-05, + "loss": 1.393, + "step": 42020 + }, + { + "epoch": 6.9456723817393105, + "grad_norm": 14.492402076721191, + "learning_rate": 1.6966892524651574e-05, + "loss": 1.5976, + "step": 42030 + 
}, + { + "epoch": 6.9473249328651105, + "grad_norm": 8.02368450164795, + "learning_rate": 1.6957711305752953e-05, + "loss": 1.3769, + "step": 42040 + }, + { + "epoch": 6.948977483990911, + "grad_norm": 8.434112548828125, + "learning_rate": 1.694853008685433e-05, + "loss": 1.4697, + "step": 42050 + }, + { + "epoch": 6.950630035116712, + "grad_norm": 10.042356491088867, + "learning_rate": 1.6939348867955708e-05, + "loss": 1.4125, + "step": 42060 + }, + { + "epoch": 6.952282586242512, + "grad_norm": 77.89820098876953, + "learning_rate": 1.693016764905709e-05, + "loss": 1.4607, + "step": 42070 + }, + { + "epoch": 6.953935137368313, + "grad_norm": 48.598690032958984, + "learning_rate": 1.692098643015847e-05, + "loss": 1.3837, + "step": 42080 + }, + { + "epoch": 6.955587688494113, + "grad_norm": 47.6584587097168, + "learning_rate": 1.691180521125985e-05, + "loss": 1.6172, + "step": 42090 + }, + { + "epoch": 6.957240239619913, + "grad_norm": 7.663386344909668, + "learning_rate": 1.690262399236123e-05, + "loss": 1.383, + "step": 42100 + }, + { + "epoch": 6.958892790745714, + "grad_norm": 7.526758670806885, + "learning_rate": 1.6893442773462604e-05, + "loss": 1.4312, + "step": 42110 + }, + { + "epoch": 6.960545341871514, + "grad_norm": 16.803466796875, + "learning_rate": 1.6884261554563984e-05, + "loss": 1.4376, + "step": 42120 + }, + { + "epoch": 6.962197892997315, + "grad_norm": 13.658576011657715, + "learning_rate": 1.6875080335665363e-05, + "loss": 1.5319, + "step": 42130 + }, + { + "epoch": 6.963850444123115, + "grad_norm": 8.935698509216309, + "learning_rate": 1.6865899116766742e-05, + "loss": 1.439, + "step": 42140 + }, + { + "epoch": 6.965502995248915, + "grad_norm": 11.672993659973145, + "learning_rate": 1.685671789786812e-05, + "loss": 1.4362, + "step": 42150 + }, + { + "epoch": 6.967155546374716, + "grad_norm": 13.540828704833984, + "learning_rate": 1.68475366789695e-05, + "loss": 1.4189, + "step": 42160 + }, + { + "epoch": 6.968808097500516, + "grad_norm": 11.229409217834473, + "learning_rate": 1.683835546007088e-05, + "loss": 1.4317, + "step": 42170 + }, + { + "epoch": 6.970460648626317, + "grad_norm": 12.30543327331543, + "learning_rate": 1.682917424117226e-05, + "loss": 1.4359, + "step": 42180 + }, + { + "epoch": 6.972113199752117, + "grad_norm": 11.764647483825684, + "learning_rate": 1.681999302227364e-05, + "loss": 1.5617, + "step": 42190 + }, + { + "epoch": 6.973765750877918, + "grad_norm": 10.844428062438965, + "learning_rate": 1.6810811803375018e-05, + "loss": 1.4419, + "step": 42200 + }, + { + "epoch": 6.975418302003718, + "grad_norm": 10.631561279296875, + "learning_rate": 1.6801630584476397e-05, + "loss": 1.3841, + "step": 42210 + }, + { + "epoch": 6.9770708531295185, + "grad_norm": 13.280062675476074, + "learning_rate": 1.6792449365577776e-05, + "loss": 1.35, + "step": 42220 + }, + { + "epoch": 6.9787234042553195, + "grad_norm": 15.788164138793945, + "learning_rate": 1.6783268146679155e-05, + "loss": 1.334, + "step": 42230 + }, + { + "epoch": 6.9803759553811195, + "grad_norm": 12.476130485534668, + "learning_rate": 1.677408692778053e-05, + "loss": 1.5234, + "step": 42240 + }, + { + "epoch": 6.98202850650692, + "grad_norm": 24.663625717163086, + "learning_rate": 1.676490570888191e-05, + "loss": 1.3422, + "step": 42250 + }, + { + "epoch": 6.983681057632721, + "grad_norm": 16.133569717407227, + "learning_rate": 1.675572448998329e-05, + "loss": 1.4352, + "step": 42260 + }, + { + "epoch": 6.985333608758521, + "grad_norm": 11.581048965454102, + "learning_rate": 
1.674654327108467e-05, + "loss": 1.4443, + "step": 42270 + }, + { + "epoch": 6.986986159884322, + "grad_norm": 10.542519569396973, + "learning_rate": 1.6737362052186048e-05, + "loss": 1.3883, + "step": 42280 + }, + { + "epoch": 6.988638711010122, + "grad_norm": 7.865337371826172, + "learning_rate": 1.6728180833287428e-05, + "loss": 1.493, + "step": 42290 + }, + { + "epoch": 6.990291262135923, + "grad_norm": 12.757125854492188, + "learning_rate": 1.6718999614388807e-05, + "loss": 1.4717, + "step": 42300 + }, + { + "epoch": 6.991943813261723, + "grad_norm": 13.490144729614258, + "learning_rate": 1.6709818395490186e-05, + "loss": 1.4115, + "step": 42310 + }, + { + "epoch": 6.993596364387523, + "grad_norm": 14.82015609741211, + "learning_rate": 1.6700637176591565e-05, + "loss": 1.4881, + "step": 42320 + }, + { + "epoch": 6.995248915513324, + "grad_norm": 23.44951057434082, + "learning_rate": 1.6691455957692945e-05, + "loss": 1.4348, + "step": 42330 + }, + { + "epoch": 6.996901466639124, + "grad_norm": 8.522165298461914, + "learning_rate": 1.6682274738794324e-05, + "loss": 1.4091, + "step": 42340 + }, + { + "epoch": 6.998554017764924, + "grad_norm": 26.870582580566406, + "learning_rate": 1.6673093519895703e-05, + "loss": 1.4368, + "step": 42350 + }, + { + "epoch": 6.999876058665565, + "eval_accuracy": 0.32497517378351537, + "eval_loss": 2.2579329013824463, + "eval_runtime": 821.7981, + "eval_samples_per_second": 34.31, + "eval_steps_per_second": 8.578, + "step": 42358 + }, + { + "epoch": 7.000206568890725, + "grad_norm": 7.999766826629639, + "learning_rate": 1.6663912300997082e-05, + "loss": 1.289, + "step": 42360 + }, + { + "epoch": 7.001859120016525, + "grad_norm": 8.853545188903809, + "learning_rate": 1.6654731082098458e-05, + "loss": 1.2426, + "step": 42370 + }, + { + "epoch": 7.003511671142326, + "grad_norm": 9.978639602661133, + "learning_rate": 1.6645549863199837e-05, + "loss": 1.4188, + "step": 42380 + }, + { + "epoch": 7.005164222268126, + "grad_norm": 12.857678413391113, + "learning_rate": 1.6636368644301217e-05, + "loss": 1.3589, + "step": 42390 + }, + { + "epoch": 7.006816773393927, + "grad_norm": 12.013607025146484, + "learning_rate": 1.6627187425402596e-05, + "loss": 1.4138, + "step": 42400 + }, + { + "epoch": 7.008469324519727, + "grad_norm": 6.538505554199219, + "learning_rate": 1.6618006206503975e-05, + "loss": 1.4131, + "step": 42410 + }, + { + "epoch": 7.0101218756455275, + "grad_norm": 7.0642595291137695, + "learning_rate": 1.6608824987605358e-05, + "loss": 1.3476, + "step": 42420 + }, + { + "epoch": 7.0117744267713285, + "grad_norm": 17.850061416625977, + "learning_rate": 1.6599643768706734e-05, + "loss": 1.4026, + "step": 42430 + }, + { + "epoch": 7.0134269778971285, + "grad_norm": 12.280431747436523, + "learning_rate": 1.6590462549808113e-05, + "loss": 1.3851, + "step": 42440 + }, + { + "epoch": 7.0150795290229295, + "grad_norm": 9.028722763061523, + "learning_rate": 1.6581281330909492e-05, + "loss": 1.3367, + "step": 42450 + }, + { + "epoch": 7.01673208014873, + "grad_norm": 15.112545013427734, + "learning_rate": 1.657210011201087e-05, + "loss": 1.4602, + "step": 42460 + }, + { + "epoch": 7.01838463127453, + "grad_norm": 13.523941993713379, + "learning_rate": 1.656291889311225e-05, + "loss": 1.4501, + "step": 42470 + }, + { + "epoch": 7.020037182400331, + "grad_norm": 8.846076011657715, + "learning_rate": 1.655373767421363e-05, + "loss": 1.4001, + "step": 42480 + }, + { + "epoch": 7.021689733526131, + "grad_norm": 7.614100456237793, + "learning_rate": 
1.654455645531501e-05, + "loss": 1.3485, + "step": 42490 + }, + { + "epoch": 7.023342284651932, + "grad_norm": 11.066787719726562, + "learning_rate": 1.6535375236416385e-05, + "loss": 1.4282, + "step": 42500 + }, + { + "epoch": 7.024994835777732, + "grad_norm": 11.001472473144531, + "learning_rate": 1.6526194017517764e-05, + "loss": 1.3722, + "step": 42510 + }, + { + "epoch": 7.026647386903532, + "grad_norm": 12.738889694213867, + "learning_rate": 1.6517012798619143e-05, + "loss": 1.4098, + "step": 42520 + }, + { + "epoch": 7.028299938029333, + "grad_norm": 11.663647651672363, + "learning_rate": 1.6507831579720526e-05, + "loss": 1.4311, + "step": 42530 + }, + { + "epoch": 7.029952489155133, + "grad_norm": 22.87346076965332, + "learning_rate": 1.6498650360821905e-05, + "loss": 1.316, + "step": 42540 + }, + { + "epoch": 7.031605040280934, + "grad_norm": 8.73465347290039, + "learning_rate": 1.6489469141923285e-05, + "loss": 1.4358, + "step": 42550 + }, + { + "epoch": 7.033257591406734, + "grad_norm": 10.015544891357422, + "learning_rate": 1.648028792302466e-05, + "loss": 1.3549, + "step": 42560 + }, + { + "epoch": 7.034910142532534, + "grad_norm": 10.383803367614746, + "learning_rate": 1.647110670412604e-05, + "loss": 1.2933, + "step": 42570 + }, + { + "epoch": 7.036562693658335, + "grad_norm": 6.737658977508545, + "learning_rate": 1.646192548522742e-05, + "loss": 1.3442, + "step": 42580 + }, + { + "epoch": 7.038215244784135, + "grad_norm": 20.091114044189453, + "learning_rate": 1.6452744266328798e-05, + "loss": 1.4975, + "step": 42590 + }, + { + "epoch": 7.039867795909936, + "grad_norm": 27.708118438720703, + "learning_rate": 1.6443563047430177e-05, + "loss": 1.5107, + "step": 42600 + }, + { + "epoch": 7.041520347035736, + "grad_norm": 15.870881080627441, + "learning_rate": 1.6434381828531557e-05, + "loss": 1.4584, + "step": 42610 + }, + { + "epoch": 7.0431728981615365, + "grad_norm": 20.80101203918457, + "learning_rate": 1.6425200609632936e-05, + "loss": 1.4254, + "step": 42620 + }, + { + "epoch": 7.0448254492873374, + "grad_norm": 17.202617645263672, + "learning_rate": 1.6416019390734315e-05, + "loss": 1.3899, + "step": 42630 + }, + { + "epoch": 7.0464780004131375, + "grad_norm": 10.804183006286621, + "learning_rate": 1.6406838171835694e-05, + "loss": 1.3267, + "step": 42640 + }, + { + "epoch": 7.0481305515389385, + "grad_norm": 26.866113662719727, + "learning_rate": 1.6397656952937074e-05, + "loss": 1.4871, + "step": 42650 + }, + { + "epoch": 7.049783102664739, + "grad_norm": 13.534538269042969, + "learning_rate": 1.6388475734038453e-05, + "loss": 1.5625, + "step": 42660 + }, + { + "epoch": 7.051435653790539, + "grad_norm": 9.568449020385742, + "learning_rate": 1.6379294515139832e-05, + "loss": 1.3345, + "step": 42670 + }, + { + "epoch": 7.05308820491634, + "grad_norm": 6.132664680480957, + "learning_rate": 1.637011329624121e-05, + "loss": 1.4646, + "step": 42680 + }, + { + "epoch": 7.05474075604214, + "grad_norm": 13.083625793457031, + "learning_rate": 1.6360932077342587e-05, + "loss": 1.3717, + "step": 42690 + }, + { + "epoch": 7.056393307167941, + "grad_norm": 22.688730239868164, + "learning_rate": 1.6351750858443967e-05, + "loss": 1.321, + "step": 42700 + }, + { + "epoch": 7.058045858293741, + "grad_norm": 13.621474266052246, + "learning_rate": 1.6342569639545346e-05, + "loss": 1.4109, + "step": 42710 + }, + { + "epoch": 7.059698409419542, + "grad_norm": 16.44967269897461, + "learning_rate": 1.6333388420646725e-05, + "loss": 1.3616, + "step": 42720 + }, + { + "epoch": 
7.061350960545342, + "grad_norm": 16.369356155395508, + "learning_rate": 1.6324207201748104e-05, + "loss": 1.4134, + "step": 42730 + }, + { + "epoch": 7.063003511671142, + "grad_norm": 11.338850975036621, + "learning_rate": 1.6315025982849484e-05, + "loss": 1.3899, + "step": 42740 + }, + { + "epoch": 7.064656062796943, + "grad_norm": 13.041457176208496, + "learning_rate": 1.6305844763950863e-05, + "loss": 1.3229, + "step": 42750 + }, + { + "epoch": 7.066308613922743, + "grad_norm": 18.309200286865234, + "learning_rate": 1.6296663545052242e-05, + "loss": 1.4822, + "step": 42760 + }, + { + "epoch": 7.067961165048544, + "grad_norm": 14.844568252563477, + "learning_rate": 1.628748232615362e-05, + "loss": 1.5301, + "step": 42770 + }, + { + "epoch": 7.069613716174344, + "grad_norm": 8.260869979858398, + "learning_rate": 1.6278301107255e-05, + "loss": 1.3789, + "step": 42780 + }, + { + "epoch": 7.071266267300144, + "grad_norm": 20.73078727722168, + "learning_rate": 1.626911988835638e-05, + "loss": 1.3875, + "step": 42790 + }, + { + "epoch": 7.072918818425945, + "grad_norm": 7.812931537628174, + "learning_rate": 1.625993866945776e-05, + "loss": 1.3335, + "step": 42800 + }, + { + "epoch": 7.074571369551745, + "grad_norm": 10.699761390686035, + "learning_rate": 1.6250757450559138e-05, + "loss": 1.2201, + "step": 42810 + }, + { + "epoch": 7.076223920677546, + "grad_norm": 13.993300437927246, + "learning_rate": 1.6241576231660514e-05, + "loss": 1.3617, + "step": 42820 + }, + { + "epoch": 7.0778764718033464, + "grad_norm": 7.880179405212402, + "learning_rate": 1.6232395012761893e-05, + "loss": 1.3303, + "step": 42830 + }, + { + "epoch": 7.0795290229291465, + "grad_norm": 14.917696952819824, + "learning_rate": 1.6223213793863273e-05, + "loss": 1.383, + "step": 42840 + }, + { + "epoch": 7.0811815740549475, + "grad_norm": 62.78154373168945, + "learning_rate": 1.6214032574964652e-05, + "loss": 1.5036, + "step": 42850 + }, + { + "epoch": 7.082834125180748, + "grad_norm": 50.27464294433594, + "learning_rate": 1.620485135606603e-05, + "loss": 1.3522, + "step": 42860 + }, + { + "epoch": 7.084486676306549, + "grad_norm": 19.16926383972168, + "learning_rate": 1.6195670137167414e-05, + "loss": 1.4757, + "step": 42870 + }, + { + "epoch": 7.086139227432349, + "grad_norm": 13.433306694030762, + "learning_rate": 1.618648891826879e-05, + "loss": 1.3875, + "step": 42880 + }, + { + "epoch": 7.087791778558149, + "grad_norm": 13.95352554321289, + "learning_rate": 1.617730769937017e-05, + "loss": 1.4077, + "step": 42890 + }, + { + "epoch": 7.08944432968395, + "grad_norm": 11.004186630249023, + "learning_rate": 1.6168126480471548e-05, + "loss": 1.2435, + "step": 42900 + }, + { + "epoch": 7.09109688080975, + "grad_norm": 9.82064151763916, + "learning_rate": 1.6158945261572927e-05, + "loss": 1.3743, + "step": 42910 + }, + { + "epoch": 7.092749431935551, + "grad_norm": 11.183409690856934, + "learning_rate": 1.6149764042674307e-05, + "loss": 1.2179, + "step": 42920 + }, + { + "epoch": 7.094401983061351, + "grad_norm": 12.038310050964355, + "learning_rate": 1.6140582823775686e-05, + "loss": 1.4433, + "step": 42930 + }, + { + "epoch": 7.096054534187151, + "grad_norm": 14.014615058898926, + "learning_rate": 1.6131401604877065e-05, + "loss": 1.4148, + "step": 42940 + }, + { + "epoch": 7.097707085312952, + "grad_norm": 11.169940948486328, + "learning_rate": 1.6122220385978444e-05, + "loss": 1.4004, + "step": 42950 + }, + { + "epoch": 7.099359636438752, + "grad_norm": 12.410701751708984, + "learning_rate": 1.611303916707982e-05, + 
"loss": 1.4521, + "step": 42960 + }, + { + "epoch": 7.101012187564553, + "grad_norm": 13.273151397705078, + "learning_rate": 1.61038579481812e-05, + "loss": 1.5151, + "step": 42970 + }, + { + "epoch": 7.102664738690353, + "grad_norm": 8.128931999206543, + "learning_rate": 1.609467672928258e-05, + "loss": 1.4828, + "step": 42980 + }, + { + "epoch": 7.104317289816153, + "grad_norm": 26.178449630737305, + "learning_rate": 1.608549551038396e-05, + "loss": 1.3804, + "step": 42990 + }, + { + "epoch": 7.105969840941954, + "grad_norm": 9.145550727844238, + "learning_rate": 1.607631429148534e-05, + "loss": 1.3358, + "step": 43000 + }, + { + "epoch": 7.107622392067754, + "grad_norm": 10.598913192749023, + "learning_rate": 1.6067133072586716e-05, + "loss": 1.4074, + "step": 43010 + }, + { + "epoch": 7.109274943193555, + "grad_norm": 13.98646068572998, + "learning_rate": 1.6057951853688096e-05, + "loss": 1.4561, + "step": 43020 + }, + { + "epoch": 7.1109274943193554, + "grad_norm": 15.383737564086914, + "learning_rate": 1.6048770634789475e-05, + "loss": 1.3876, + "step": 43030 + }, + { + "epoch": 7.1125800454451555, + "grad_norm": 14.499407768249512, + "learning_rate": 1.6039589415890854e-05, + "loss": 1.3743, + "step": 43040 + }, + { + "epoch": 7.1142325965709565, + "grad_norm": 21.623125076293945, + "learning_rate": 1.6030408196992233e-05, + "loss": 1.3869, + "step": 43050 + }, + { + "epoch": 7.115885147696757, + "grad_norm": 19.47028350830078, + "learning_rate": 1.6021226978093613e-05, + "loss": 1.4454, + "step": 43060 + }, + { + "epoch": 7.117537698822558, + "grad_norm": 12.078246116638184, + "learning_rate": 1.6012045759194992e-05, + "loss": 1.396, + "step": 43070 + }, + { + "epoch": 7.119190249948358, + "grad_norm": 11.143622398376465, + "learning_rate": 1.600286454029637e-05, + "loss": 1.5037, + "step": 43080 + }, + { + "epoch": 7.120842801074158, + "grad_norm": 35.760719299316406, + "learning_rate": 1.5993683321397747e-05, + "loss": 1.4383, + "step": 43090 + }, + { + "epoch": 7.122495352199959, + "grad_norm": 23.67930030822754, + "learning_rate": 1.598450210249913e-05, + "loss": 1.415, + "step": 43100 + }, + { + "epoch": 7.124147903325759, + "grad_norm": 8.731281280517578, + "learning_rate": 1.597532088360051e-05, + "loss": 1.4276, + "step": 43110 + }, + { + "epoch": 7.12580045445156, + "grad_norm": 16.28318214416504, + "learning_rate": 1.5966139664701888e-05, + "loss": 1.4406, + "step": 43120 + }, + { + "epoch": 7.12745300557736, + "grad_norm": 15.253238677978516, + "learning_rate": 1.5956958445803267e-05, + "loss": 1.4224, + "step": 43130 + }, + { + "epoch": 7.129105556703161, + "grad_norm": 9.880094528198242, + "learning_rate": 1.5947777226904643e-05, + "loss": 1.289, + "step": 43140 + }, + { + "epoch": 7.130758107828961, + "grad_norm": 45.78981399536133, + "learning_rate": 1.5938596008006022e-05, + "loss": 1.4237, + "step": 43150 + }, + { + "epoch": 7.132410658954761, + "grad_norm": 15.530732154846191, + "learning_rate": 1.5929414789107402e-05, + "loss": 1.3385, + "step": 43160 + }, + { + "epoch": 7.134063210080562, + "grad_norm": 7.943971157073975, + "learning_rate": 1.592023357020878e-05, + "loss": 1.3658, + "step": 43170 + }, + { + "epoch": 7.135715761206362, + "grad_norm": 8.042201042175293, + "learning_rate": 1.591105235131016e-05, + "loss": 1.3703, + "step": 43180 + }, + { + "epoch": 7.137368312332163, + "grad_norm": 14.290609359741211, + "learning_rate": 1.590187113241154e-05, + "loss": 1.433, + "step": 43190 + }, + { + "epoch": 7.139020863457963, + "grad_norm": 12.84965991973877, + 
"learning_rate": 1.589268991351292e-05, + "loss": 1.3675, + "step": 43200 + }, + { + "epoch": 7.140673414583763, + "grad_norm": 9.75621509552002, + "learning_rate": 1.5883508694614298e-05, + "loss": 1.3423, + "step": 43210 + }, + { + "epoch": 7.142325965709564, + "grad_norm": 16.249305725097656, + "learning_rate": 1.5874327475715677e-05, + "loss": 1.5104, + "step": 43220 + }, + { + "epoch": 7.1439785168353644, + "grad_norm": 9.561155319213867, + "learning_rate": 1.5865146256817056e-05, + "loss": 1.4316, + "step": 43230 + }, + { + "epoch": 7.145631067961165, + "grad_norm": 9.332865715026855, + "learning_rate": 1.5855965037918436e-05, + "loss": 1.3956, + "step": 43240 + }, + { + "epoch": 7.1472836190869655, + "grad_norm": 14.026449203491211, + "learning_rate": 1.5846783819019815e-05, + "loss": 1.3913, + "step": 43250 + }, + { + "epoch": 7.148936170212766, + "grad_norm": 16.361709594726562, + "learning_rate": 1.5837602600121194e-05, + "loss": 1.4388, + "step": 43260 + }, + { + "epoch": 7.150588721338567, + "grad_norm": 34.313411712646484, + "learning_rate": 1.5828421381222573e-05, + "loss": 1.4463, + "step": 43270 + }, + { + "epoch": 7.152241272464367, + "grad_norm": 62.70059585571289, + "learning_rate": 1.581924016232395e-05, + "loss": 1.5965, + "step": 43280 + }, + { + "epoch": 7.153893823590168, + "grad_norm": 12.427000999450684, + "learning_rate": 1.581005894342533e-05, + "loss": 1.4107, + "step": 43290 + }, + { + "epoch": 7.155546374715968, + "grad_norm": 38.85627365112305, + "learning_rate": 1.5800877724526708e-05, + "loss": 1.4769, + "step": 43300 + }, + { + "epoch": 7.157198925841768, + "grad_norm": 13.122133255004883, + "learning_rate": 1.5791696505628087e-05, + "loss": 1.3648, + "step": 43310 + }, + { + "epoch": 7.158851476967569, + "grad_norm": 44.62786102294922, + "learning_rate": 1.5782515286729466e-05, + "loss": 1.5429, + "step": 43320 + }, + { + "epoch": 7.160504028093369, + "grad_norm": 12.482089042663574, + "learning_rate": 1.5773334067830846e-05, + "loss": 1.3703, + "step": 43330 + }, + { + "epoch": 7.16215657921917, + "grad_norm": 10.335782051086426, + "learning_rate": 1.5764152848932225e-05, + "loss": 1.3795, + "step": 43340 + }, + { + "epoch": 7.16380913034497, + "grad_norm": 11.581258773803711, + "learning_rate": 1.5754971630033604e-05, + "loss": 1.3594, + "step": 43350 + }, + { + "epoch": 7.16546168147077, + "grad_norm": 9.168930053710938, + "learning_rate": 1.5745790411134983e-05, + "loss": 1.3988, + "step": 43360 + }, + { + "epoch": 7.167114232596571, + "grad_norm": 35.34038543701172, + "learning_rate": 1.5736609192236363e-05, + "loss": 1.367, + "step": 43370 + }, + { + "epoch": 7.168766783722371, + "grad_norm": 13.737029075622559, + "learning_rate": 1.5727427973337742e-05, + "loss": 1.4333, + "step": 43380 + }, + { + "epoch": 7.170419334848172, + "grad_norm": 11.130227088928223, + "learning_rate": 1.571824675443912e-05, + "loss": 1.3601, + "step": 43390 + }, + { + "epoch": 7.172071885973972, + "grad_norm": 14.647340774536133, + "learning_rate": 1.57090655355405e-05, + "loss": 1.4556, + "step": 43400 + }, + { + "epoch": 7.173724437099772, + "grad_norm": 11.120412826538086, + "learning_rate": 1.5699884316641876e-05, + "loss": 1.4372, + "step": 43410 + }, + { + "epoch": 7.175376988225573, + "grad_norm": 51.608375549316406, + "learning_rate": 1.5690703097743255e-05, + "loss": 1.4654, + "step": 43420 + }, + { + "epoch": 7.177029539351373, + "grad_norm": 10.668750762939453, + "learning_rate": 1.5681521878844635e-05, + "loss": 1.2519, + "step": 43430 + }, + { + "epoch": 
7.178682090477174, + "grad_norm": 12.485282897949219, + "learning_rate": 1.5672340659946017e-05, + "loss": 1.322, + "step": 43440 + }, + { + "epoch": 7.1803346416029745, + "grad_norm": 12.183808326721191, + "learning_rate": 1.5663159441047397e-05, + "loss": 1.4264, + "step": 43450 + }, + { + "epoch": 7.181987192728775, + "grad_norm": 13.21921443939209, + "learning_rate": 1.5653978222148772e-05, + "loss": 1.3703, + "step": 43460 + }, + { + "epoch": 7.183639743854576, + "grad_norm": 10.9345064163208, + "learning_rate": 1.564479700325015e-05, + "loss": 1.2281, + "step": 43470 + }, + { + "epoch": 7.185292294980376, + "grad_norm": 14.405402183532715, + "learning_rate": 1.563561578435153e-05, + "loss": 1.4111, + "step": 43480 + }, + { + "epoch": 7.186944846106177, + "grad_norm": 17.535526275634766, + "learning_rate": 1.562643456545291e-05, + "loss": 1.3301, + "step": 43490 + }, + { + "epoch": 7.188597397231977, + "grad_norm": 15.74835205078125, + "learning_rate": 1.561725334655429e-05, + "loss": 1.5023, + "step": 43500 + }, + { + "epoch": 7.190249948357778, + "grad_norm": 29.075929641723633, + "learning_rate": 1.560807212765567e-05, + "loss": 1.443, + "step": 43510 + }, + { + "epoch": 7.191902499483578, + "grad_norm": 8.865483283996582, + "learning_rate": 1.5598890908757048e-05, + "loss": 1.3097, + "step": 43520 + }, + { + "epoch": 7.193555050609378, + "grad_norm": 14.490361213684082, + "learning_rate": 1.5589709689858427e-05, + "loss": 1.3695, + "step": 43530 + }, + { + "epoch": 7.195207601735179, + "grad_norm": 21.584367752075195, + "learning_rate": 1.5580528470959803e-05, + "loss": 1.3773, + "step": 43540 + }, + { + "epoch": 7.196860152860979, + "grad_norm": 10.539822578430176, + "learning_rate": 1.5571347252061182e-05, + "loss": 1.4, + "step": 43550 + }, + { + "epoch": 7.19851270398678, + "grad_norm": 12.214553833007812, + "learning_rate": 1.5562166033162565e-05, + "loss": 1.3988, + "step": 43560 + }, + { + "epoch": 7.20016525511258, + "grad_norm": 13.141803741455078, + "learning_rate": 1.5552984814263944e-05, + "loss": 1.3828, + "step": 43570 + }, + { + "epoch": 7.20181780623838, + "grad_norm": 12.777125358581543, + "learning_rate": 1.5543803595365323e-05, + "loss": 1.3906, + "step": 43580 + }, + { + "epoch": 7.203470357364181, + "grad_norm": 15.37719440460205, + "learning_rate": 1.5534622376466703e-05, + "loss": 1.4042, + "step": 43590 + }, + { + "epoch": 7.205122908489981, + "grad_norm": 12.876169204711914, + "learning_rate": 1.552544115756808e-05, + "loss": 1.582, + "step": 43600 + }, + { + "epoch": 7.206775459615782, + "grad_norm": 21.468862533569336, + "learning_rate": 1.5516259938669458e-05, + "loss": 1.4693, + "step": 43610 + }, + { + "epoch": 7.208428010741582, + "grad_norm": 10.771493911743164, + "learning_rate": 1.5507078719770837e-05, + "loss": 1.4106, + "step": 43620 + }, + { + "epoch": 7.210080561867382, + "grad_norm": 16.791486740112305, + "learning_rate": 1.5497897500872216e-05, + "loss": 1.4429, + "step": 43630 + }, + { + "epoch": 7.211733112993183, + "grad_norm": 16.35462760925293, + "learning_rate": 1.5488716281973595e-05, + "loss": 1.3854, + "step": 43640 + }, + { + "epoch": 7.2133856641189835, + "grad_norm": 12.617788314819336, + "learning_rate": 1.5479535063074975e-05, + "loss": 1.3077, + "step": 43650 + }, + { + "epoch": 7.2150382152447845, + "grad_norm": 12.411581993103027, + "learning_rate": 1.5470353844176354e-05, + "loss": 1.3623, + "step": 43660 + }, + { + "epoch": 7.216690766370585, + "grad_norm": 19.033109664916992, + "learning_rate": 1.5461172625277733e-05, + 
"loss": 1.4943, + "step": 43670 + }, + { + "epoch": 7.218343317496385, + "grad_norm": 8.353110313415527, + "learning_rate": 1.5451991406379112e-05, + "loss": 1.3783, + "step": 43680 + }, + { + "epoch": 7.219995868622186, + "grad_norm": 8.418465614318848, + "learning_rate": 1.5442810187480492e-05, + "loss": 1.3545, + "step": 43690 + }, + { + "epoch": 7.221648419747986, + "grad_norm": 8.319928169250488, + "learning_rate": 1.543362896858187e-05, + "loss": 1.3794, + "step": 43700 + }, + { + "epoch": 7.223300970873787, + "grad_norm": 13.188324928283691, + "learning_rate": 1.542444774968325e-05, + "loss": 1.2745, + "step": 43710 + }, + { + "epoch": 7.224953521999587, + "grad_norm": 10.732872009277344, + "learning_rate": 1.541526653078463e-05, + "loss": 1.3477, + "step": 43720 + }, + { + "epoch": 7.226606073125387, + "grad_norm": 47.982398986816406, + "learning_rate": 1.5406085311886005e-05, + "loss": 1.4172, + "step": 43730 + }, + { + "epoch": 7.228258624251188, + "grad_norm": 44.85279846191406, + "learning_rate": 1.5396904092987385e-05, + "loss": 1.4017, + "step": 43740 + }, + { + "epoch": 7.229911175376988, + "grad_norm": 9.955687522888184, + "learning_rate": 1.5387722874088764e-05, + "loss": 1.349, + "step": 43750 + }, + { + "epoch": 7.231563726502789, + "grad_norm": 8.512934684753418, + "learning_rate": 1.5378541655190143e-05, + "loss": 1.4837, + "step": 43760 + }, + { + "epoch": 7.233216277628589, + "grad_norm": 17.035743713378906, + "learning_rate": 1.5369360436291522e-05, + "loss": 1.4043, + "step": 43770 + }, + { + "epoch": 7.234868828754389, + "grad_norm": 45.515899658203125, + "learning_rate": 1.53601792173929e-05, + "loss": 1.2978, + "step": 43780 + }, + { + "epoch": 7.23652137988019, + "grad_norm": 13.9974946975708, + "learning_rate": 1.535099799849428e-05, + "loss": 1.4702, + "step": 43790 + }, + { + "epoch": 7.23817393100599, + "grad_norm": 16.096546173095703, + "learning_rate": 1.534181677959566e-05, + "loss": 1.4132, + "step": 43800 + }, + { + "epoch": 7.239826482131791, + "grad_norm": 11.778483390808105, + "learning_rate": 1.533263556069704e-05, + "loss": 1.3275, + "step": 43810 + }, + { + "epoch": 7.241479033257591, + "grad_norm": 7.991126537322998, + "learning_rate": 1.532345434179842e-05, + "loss": 1.2556, + "step": 43820 + }, + { + "epoch": 7.243131584383391, + "grad_norm": 12.626638412475586, + "learning_rate": 1.5314273122899798e-05, + "loss": 1.38, + "step": 43830 + }, + { + "epoch": 7.244784135509192, + "grad_norm": 7.913274765014648, + "learning_rate": 1.5305091904001177e-05, + "loss": 1.253, + "step": 43840 + }, + { + "epoch": 7.2464366866349925, + "grad_norm": 11.45130729675293, + "learning_rate": 1.5295910685102556e-05, + "loss": 1.4744, + "step": 43850 + }, + { + "epoch": 7.2480892377607935, + "grad_norm": 7.596304416656494, + "learning_rate": 1.5286729466203932e-05, + "loss": 1.3495, + "step": 43860 + }, + { + "epoch": 7.249741788886594, + "grad_norm": 16.072736740112305, + "learning_rate": 1.527754824730531e-05, + "loss": 1.3415, + "step": 43870 + }, + { + "epoch": 7.251394340012395, + "grad_norm": 11.912080764770508, + "learning_rate": 1.526836702840669e-05, + "loss": 1.4074, + "step": 43880 + }, + { + "epoch": 7.253046891138195, + "grad_norm": 14.602355003356934, + "learning_rate": 1.525918580950807e-05, + "loss": 1.4189, + "step": 43890 + }, + { + "epoch": 7.254699442263995, + "grad_norm": 9.636602401733398, + "learning_rate": 1.525000459060945e-05, + "loss": 1.37, + "step": 43900 + }, + { + "epoch": 7.256351993389796, + "grad_norm": 22.76931381225586, + 
"learning_rate": 1.524082337171083e-05, + "loss": 1.3947, + "step": 43910 + }, + { + "epoch": 7.258004544515596, + "grad_norm": 15.332125663757324, + "learning_rate": 1.523164215281221e-05, + "loss": 1.433, + "step": 43920 + }, + { + "epoch": 7.259657095641396, + "grad_norm": 13.930269241333008, + "learning_rate": 1.5222460933913587e-05, + "loss": 1.4699, + "step": 43930 + }, + { + "epoch": 7.261309646767197, + "grad_norm": 13.538143157958984, + "learning_rate": 1.5213279715014966e-05, + "loss": 1.398, + "step": 43940 + }, + { + "epoch": 7.262962197892997, + "grad_norm": 12.98928451538086, + "learning_rate": 1.5204098496116345e-05, + "loss": 1.2564, + "step": 43950 + }, + { + "epoch": 7.264614749018798, + "grad_norm": 17.9935359954834, + "learning_rate": 1.5194917277217725e-05, + "loss": 1.4769, + "step": 43960 + }, + { + "epoch": 7.266267300144598, + "grad_norm": 13.365751266479492, + "learning_rate": 1.5185736058319102e-05, + "loss": 1.4451, + "step": 43970 + }, + { + "epoch": 7.267919851270399, + "grad_norm": 16.52039337158203, + "learning_rate": 1.5176554839420481e-05, + "loss": 1.4016, + "step": 43980 + }, + { + "epoch": 7.269572402396199, + "grad_norm": 15.098515510559082, + "learning_rate": 1.516737362052186e-05, + "loss": 1.369, + "step": 43990 + }, + { + "epoch": 7.271224953521999, + "grad_norm": 8.429466247558594, + "learning_rate": 1.5158192401623238e-05, + "loss": 1.4621, + "step": 44000 + }, + { + "epoch": 7.2728775046478, + "grad_norm": 13.037130355834961, + "learning_rate": 1.5149011182724621e-05, + "loss": 1.4036, + "step": 44010 + }, + { + "epoch": 7.2745300557736, + "grad_norm": 11.745615005493164, + "learning_rate": 1.5139829963826e-05, + "loss": 1.3655, + "step": 44020 + }, + { + "epoch": 7.276182606899401, + "grad_norm": 25.232398986816406, + "learning_rate": 1.5130648744927378e-05, + "loss": 1.424, + "step": 44030 + }, + { + "epoch": 7.277835158025201, + "grad_norm": 10.532912254333496, + "learning_rate": 1.5121467526028757e-05, + "loss": 1.4246, + "step": 44040 + }, + { + "epoch": 7.2794877091510015, + "grad_norm": 9.035202026367188, + "learning_rate": 1.5112286307130136e-05, + "loss": 1.3259, + "step": 44050 + }, + { + "epoch": 7.2811402602768025, + "grad_norm": 10.761393547058105, + "learning_rate": 1.5103105088231514e-05, + "loss": 1.4091, + "step": 44060 + }, + { + "epoch": 7.282792811402603, + "grad_norm": 21.04129409790039, + "learning_rate": 1.5093923869332893e-05, + "loss": 1.3684, + "step": 44070 + }, + { + "epoch": 7.284445362528404, + "grad_norm": 6.939519882202148, + "learning_rate": 1.5084742650434272e-05, + "loss": 1.5599, + "step": 44080 + }, + { + "epoch": 7.286097913654204, + "grad_norm": 74.00979614257812, + "learning_rate": 1.5075561431535651e-05, + "loss": 1.4212, + "step": 44090 + }, + { + "epoch": 7.287750464780004, + "grad_norm": 11.299165725708008, + "learning_rate": 1.5066380212637029e-05, + "loss": 1.3967, + "step": 44100 + }, + { + "epoch": 7.289403015905805, + "grad_norm": 9.076863288879395, + "learning_rate": 1.5057198993738408e-05, + "loss": 1.3786, + "step": 44110 + }, + { + "epoch": 7.291055567031605, + "grad_norm": 10.693045616149902, + "learning_rate": 1.5048017774839787e-05, + "loss": 1.4005, + "step": 44120 + }, + { + "epoch": 7.292708118157406, + "grad_norm": 11.115889549255371, + "learning_rate": 1.5038836555941168e-05, + "loss": 1.512, + "step": 44130 + }, + { + "epoch": 7.294360669283206, + "grad_norm": 10.336817741394043, + "learning_rate": 1.5029655337042548e-05, + "loss": 1.3774, + "step": 44140 + }, + { + "epoch": 
7.296013220409006, + "grad_norm": 11.22537612915039, + "learning_rate": 1.5020474118143927e-05, + "loss": 1.4058, + "step": 44150 + }, + { + "epoch": 7.297665771534807, + "grad_norm": 43.39937210083008, + "learning_rate": 1.5011292899245304e-05, + "loss": 1.3649, + "step": 44160 + }, + { + "epoch": 7.299318322660607, + "grad_norm": 22.82792091369629, + "learning_rate": 1.5002111680346684e-05, + "loss": 1.5085, + "step": 44170 + }, + { + "epoch": 7.300970873786408, + "grad_norm": 30.476633071899414, + "learning_rate": 1.4992930461448063e-05, + "loss": 1.32, + "step": 44180 + }, + { + "epoch": 7.302623424912208, + "grad_norm": 10.81251049041748, + "learning_rate": 1.498374924254944e-05, + "loss": 1.3552, + "step": 44190 + }, + { + "epoch": 7.304275976038008, + "grad_norm": 17.479726791381836, + "learning_rate": 1.497456802365082e-05, + "loss": 1.437, + "step": 44200 + }, + { + "epoch": 7.305928527163809, + "grad_norm": 14.184757232666016, + "learning_rate": 1.4965386804752199e-05, + "loss": 1.3092, + "step": 44210 + }, + { + "epoch": 7.307581078289609, + "grad_norm": 16.718236923217773, + "learning_rate": 1.4956205585853578e-05, + "loss": 1.4491, + "step": 44220 + }, + { + "epoch": 7.30923362941541, + "grad_norm": 13.425004005432129, + "learning_rate": 1.4947024366954956e-05, + "loss": 1.4319, + "step": 44230 + }, + { + "epoch": 7.31088618054121, + "grad_norm": 12.716981887817383, + "learning_rate": 1.4937843148056338e-05, + "loss": 1.365, + "step": 44240 + }, + { + "epoch": 7.312538731667011, + "grad_norm": 11.309150695800781, + "learning_rate": 1.4928661929157716e-05, + "loss": 1.3449, + "step": 44250 + }, + { + "epoch": 7.3141912827928115, + "grad_norm": 9.994693756103516, + "learning_rate": 1.4919480710259095e-05, + "loss": 1.3902, + "step": 44260 + }, + { + "epoch": 7.315843833918612, + "grad_norm": 12.569499015808105, + "learning_rate": 1.4910299491360475e-05, + "loss": 1.2932, + "step": 44270 + }, + { + "epoch": 7.317496385044413, + "grad_norm": 14.061833381652832, + "learning_rate": 1.4901118272461854e-05, + "loss": 1.3171, + "step": 44280 + }, + { + "epoch": 7.319148936170213, + "grad_norm": 17.114004135131836, + "learning_rate": 1.4891937053563231e-05, + "loss": 1.4204, + "step": 44290 + }, + { + "epoch": 7.320801487296013, + "grad_norm": 20.603870391845703, + "learning_rate": 1.488275583466461e-05, + "loss": 1.4114, + "step": 44300 + }, + { + "epoch": 7.322454038421814, + "grad_norm": 11.82662582397461, + "learning_rate": 1.487357461576599e-05, + "loss": 1.3978, + "step": 44310 + }, + { + "epoch": 7.324106589547614, + "grad_norm": 8.087780952453613, + "learning_rate": 1.4864393396867367e-05, + "loss": 1.4033, + "step": 44320 + }, + { + "epoch": 7.325759140673415, + "grad_norm": 14.538772583007812, + "learning_rate": 1.4855212177968747e-05, + "loss": 1.4692, + "step": 44330 + }, + { + "epoch": 7.327411691799215, + "grad_norm": 11.816965103149414, + "learning_rate": 1.4846030959070126e-05, + "loss": 1.4003, + "step": 44340 + }, + { + "epoch": 7.329064242925016, + "grad_norm": 11.297624588012695, + "learning_rate": 1.4836849740171505e-05, + "loss": 1.3688, + "step": 44350 + }, + { + "epoch": 7.330716794050816, + "grad_norm": 29.10480308532715, + "learning_rate": 1.4827668521272886e-05, + "loss": 1.3417, + "step": 44360 + }, + { + "epoch": 7.332369345176616, + "grad_norm": 16.348495483398438, + "learning_rate": 1.4818487302374265e-05, + "loss": 1.3799, + "step": 44370 + }, + { + "epoch": 7.334021896302417, + "grad_norm": 15.20343017578125, + "learning_rate": 1.4809306083475643e-05, + 
"loss": 1.3703, + "step": 44380 + }, + { + "epoch": 7.335674447428217, + "grad_norm": 10.832907676696777, + "learning_rate": 1.4800124864577022e-05, + "loss": 1.5541, + "step": 44390 + }, + { + "epoch": 7.337326998554018, + "grad_norm": 13.253135681152344, + "learning_rate": 1.4790943645678401e-05, + "loss": 1.346, + "step": 44400 + }, + { + "epoch": 7.338979549679818, + "grad_norm": 12.137068748474121, + "learning_rate": 1.478176242677978e-05, + "loss": 1.4282, + "step": 44410 + }, + { + "epoch": 7.340632100805618, + "grad_norm": 17.6302547454834, + "learning_rate": 1.4772581207881158e-05, + "loss": 1.2844, + "step": 44420 + }, + { + "epoch": 7.342284651931419, + "grad_norm": 8.67888355255127, + "learning_rate": 1.4763399988982537e-05, + "loss": 1.2272, + "step": 44430 + }, + { + "epoch": 7.343937203057219, + "grad_norm": 12.13434886932373, + "learning_rate": 1.4754218770083917e-05, + "loss": 1.4605, + "step": 44440 + }, + { + "epoch": 7.34558975418302, + "grad_norm": 13.558122634887695, + "learning_rate": 1.4745037551185296e-05, + "loss": 1.3914, + "step": 44450 + }, + { + "epoch": 7.3472423053088205, + "grad_norm": 11.792350769042969, + "learning_rate": 1.4735856332286673e-05, + "loss": 1.3819, + "step": 44460 + }, + { + "epoch": 7.348894856434621, + "grad_norm": 15.687477111816406, + "learning_rate": 1.4726675113388056e-05, + "loss": 1.4004, + "step": 44470 + }, + { + "epoch": 7.350547407560422, + "grad_norm": 14.300453186035156, + "learning_rate": 1.4717493894489434e-05, + "loss": 1.3687, + "step": 44480 + }, + { + "epoch": 7.352199958686222, + "grad_norm": 12.354840278625488, + "learning_rate": 1.4708312675590813e-05, + "loss": 1.3209, + "step": 44490 + }, + { + "epoch": 7.353852509812023, + "grad_norm": 14.35772705078125, + "learning_rate": 1.4699131456692192e-05, + "loss": 1.428, + "step": 44500 + }, + { + "epoch": 7.355505060937823, + "grad_norm": 16.01991844177246, + "learning_rate": 1.468995023779357e-05, + "loss": 1.3319, + "step": 44510 + }, + { + "epoch": 7.357157612063623, + "grad_norm": 14.260498046875, + "learning_rate": 1.4680769018894949e-05, + "loss": 1.3704, + "step": 44520 + }, + { + "epoch": 7.358810163189424, + "grad_norm": 17.772220611572266, + "learning_rate": 1.4671587799996328e-05, + "loss": 1.552, + "step": 44530 + }, + { + "epoch": 7.360462714315224, + "grad_norm": 13.813607215881348, + "learning_rate": 1.4662406581097707e-05, + "loss": 1.4813, + "step": 44540 + }, + { + "epoch": 7.362115265441025, + "grad_norm": 11.745838165283203, + "learning_rate": 1.4653225362199085e-05, + "loss": 1.396, + "step": 44550 + }, + { + "epoch": 7.363767816566825, + "grad_norm": 24.77311897277832, + "learning_rate": 1.4644044143300464e-05, + "loss": 1.4673, + "step": 44560 + }, + { + "epoch": 7.365420367692625, + "grad_norm": 12.793754577636719, + "learning_rate": 1.4634862924401843e-05, + "loss": 1.3443, + "step": 44570 + }, + { + "epoch": 7.367072918818426, + "grad_norm": 10.67249870300293, + "learning_rate": 1.4625681705503224e-05, + "loss": 1.3569, + "step": 44580 + }, + { + "epoch": 7.368725469944226, + "grad_norm": 15.902300834655762, + "learning_rate": 1.4616500486604604e-05, + "loss": 1.5068, + "step": 44590 + }, + { + "epoch": 7.370378021070027, + "grad_norm": 9.883173942565918, + "learning_rate": 1.4607319267705983e-05, + "loss": 1.2706, + "step": 44600 + }, + { + "epoch": 7.372030572195827, + "grad_norm": 17.580493927001953, + "learning_rate": 1.459813804880736e-05, + "loss": 1.3805, + "step": 44610 + }, + { + "epoch": 7.373683123321628, + "grad_norm": 
20.290407180786133, + "learning_rate": 1.458895682990874e-05, + "loss": 1.5256, + "step": 44620 + }, + { + "epoch": 7.375335674447428, + "grad_norm": 11.272130012512207, + "learning_rate": 1.4579775611010119e-05, + "loss": 1.4099, + "step": 44630 + }, + { + "epoch": 7.376988225573228, + "grad_norm": 14.19814395904541, + "learning_rate": 1.4570594392111497e-05, + "loss": 1.4281, + "step": 44640 + }, + { + "epoch": 7.378640776699029, + "grad_norm": 13.382776260375977, + "learning_rate": 1.4561413173212876e-05, + "loss": 1.3977, + "step": 44650 + }, + { + "epoch": 7.3802933278248295, + "grad_norm": 13.100677490234375, + "learning_rate": 1.4552231954314255e-05, + "loss": 1.3892, + "step": 44660 + }, + { + "epoch": 7.38194587895063, + "grad_norm": 23.83889389038086, + "learning_rate": 1.4543050735415634e-05, + "loss": 1.3258, + "step": 44670 + }, + { + "epoch": 7.383598430076431, + "grad_norm": 15.009169578552246, + "learning_rate": 1.4533869516517012e-05, + "loss": 1.3531, + "step": 44680 + }, + { + "epoch": 7.385250981202231, + "grad_norm": 12.051997184753418, + "learning_rate": 1.4524688297618391e-05, + "loss": 1.3819, + "step": 44690 + }, + { + "epoch": 7.386903532328032, + "grad_norm": 15.543947219848633, + "learning_rate": 1.4515507078719772e-05, + "loss": 1.3308, + "step": 44700 + }, + { + "epoch": 7.388556083453832, + "grad_norm": 10.858241081237793, + "learning_rate": 1.4506325859821151e-05, + "loss": 1.32, + "step": 44710 + }, + { + "epoch": 7.390208634579633, + "grad_norm": 11.575108528137207, + "learning_rate": 1.449714464092253e-05, + "loss": 1.3483, + "step": 44720 + }, + { + "epoch": 7.391861185705433, + "grad_norm": 14.963944435119629, + "learning_rate": 1.448796342202391e-05, + "loss": 1.5092, + "step": 44730 + }, + { + "epoch": 7.393513736831233, + "grad_norm": 24.254188537597656, + "learning_rate": 1.4478782203125287e-05, + "loss": 1.5216, + "step": 44740 + }, + { + "epoch": 7.395166287957034, + "grad_norm": 11.076996803283691, + "learning_rate": 1.4469600984226667e-05, + "loss": 1.3176, + "step": 44750 + }, + { + "epoch": 7.396818839082834, + "grad_norm": 17.357913970947266, + "learning_rate": 1.4460419765328046e-05, + "loss": 1.4716, + "step": 44760 + }, + { + "epoch": 7.398471390208635, + "grad_norm": 15.097737312316895, + "learning_rate": 1.4451238546429425e-05, + "loss": 1.328, + "step": 44770 + }, + { + "epoch": 7.400123941334435, + "grad_norm": 30.251018524169922, + "learning_rate": 1.4442057327530803e-05, + "loss": 1.3193, + "step": 44780 + }, + { + "epoch": 7.401776492460235, + "grad_norm": 11.265031814575195, + "learning_rate": 1.4432876108632182e-05, + "loss": 1.3426, + "step": 44790 + }, + { + "epoch": 7.403429043586036, + "grad_norm": 13.859474182128906, + "learning_rate": 1.4423694889733561e-05, + "loss": 1.4333, + "step": 44800 + }, + { + "epoch": 7.405081594711836, + "grad_norm": 9.234789848327637, + "learning_rate": 1.4414513670834942e-05, + "loss": 1.3847, + "step": 44810 + }, + { + "epoch": 7.406734145837637, + "grad_norm": 7.183691024780273, + "learning_rate": 1.4405332451936321e-05, + "loss": 1.3156, + "step": 44820 + }, + { + "epoch": 7.408386696963437, + "grad_norm": 16.125627517700195, + "learning_rate": 1.4396151233037699e-05, + "loss": 1.4502, + "step": 44830 + }, + { + "epoch": 7.410039248089237, + "grad_norm": 12.971248626708984, + "learning_rate": 1.4386970014139078e-05, + "loss": 1.4037, + "step": 44840 + }, + { + "epoch": 7.411691799215038, + "grad_norm": 13.183744430541992, + "learning_rate": 1.4377788795240457e-05, + "loss": 1.3773, + "step": 
44850 + }, + { + "epoch": 7.4133443503408385, + "grad_norm": 12.956613540649414, + "learning_rate": 1.4368607576341837e-05, + "loss": 1.4324, + "step": 44860 + }, + { + "epoch": 7.4149969014666395, + "grad_norm": 13.061296463012695, + "learning_rate": 1.4359426357443214e-05, + "loss": 1.3835, + "step": 44870 + }, + { + "epoch": 7.41664945259244, + "grad_norm": 16.211750030517578, + "learning_rate": 1.4350245138544593e-05, + "loss": 1.2608, + "step": 44880 + }, + { + "epoch": 7.41830200371824, + "grad_norm": 27.55082893371582, + "learning_rate": 1.4341063919645973e-05, + "loss": 1.3529, + "step": 44890 + }, + { + "epoch": 7.419954554844041, + "grad_norm": 10.643492698669434, + "learning_rate": 1.4331882700747352e-05, + "loss": 1.3314, + "step": 44900 + }, + { + "epoch": 7.421607105969841, + "grad_norm": 17.30533790588379, + "learning_rate": 1.432270148184873e-05, + "loss": 1.426, + "step": 44910 + }, + { + "epoch": 7.423259657095642, + "grad_norm": 14.52156925201416, + "learning_rate": 1.4313520262950109e-05, + "loss": 1.3467, + "step": 44920 + }, + { + "epoch": 7.424912208221442, + "grad_norm": 14.510608673095703, + "learning_rate": 1.430433904405149e-05, + "loss": 1.3018, + "step": 44930 + }, + { + "epoch": 7.426564759347242, + "grad_norm": 15.366539001464844, + "learning_rate": 1.4295157825152869e-05, + "loss": 1.4342, + "step": 44940 + }, + { + "epoch": 7.428217310473043, + "grad_norm": 66.76383972167969, + "learning_rate": 1.4285976606254248e-05, + "loss": 1.2281, + "step": 44950 + }, + { + "epoch": 7.429869861598843, + "grad_norm": 12.815417289733887, + "learning_rate": 1.4276795387355626e-05, + "loss": 1.3599, + "step": 44960 + }, + { + "epoch": 7.431522412724644, + "grad_norm": 11.356534004211426, + "learning_rate": 1.4267614168457005e-05, + "loss": 1.3074, + "step": 44970 + }, + { + "epoch": 7.433174963850444, + "grad_norm": 12.682281494140625, + "learning_rate": 1.4258432949558384e-05, + "loss": 1.4034, + "step": 44980 + }, + { + "epoch": 7.434827514976244, + "grad_norm": 217.0580596923828, + "learning_rate": 1.4249251730659763e-05, + "loss": 1.495, + "step": 44990 + }, + { + "epoch": 7.436480066102045, + "grad_norm": 13.032721519470215, + "learning_rate": 1.4240070511761141e-05, + "loss": 1.373, + "step": 45000 + }, + { + "epoch": 7.438132617227845, + "grad_norm": 13.059027671813965, + "learning_rate": 1.423088929286252e-05, + "loss": 1.4275, + "step": 45010 + }, + { + "epoch": 7.439785168353646, + "grad_norm": 21.048809051513672, + "learning_rate": 1.42217080739639e-05, + "loss": 1.4316, + "step": 45020 + }, + { + "epoch": 7.441437719479446, + "grad_norm": 15.526588439941406, + "learning_rate": 1.4212526855065279e-05, + "loss": 1.4637, + "step": 45030 + }, + { + "epoch": 7.443090270605246, + "grad_norm": 12.340840339660645, + "learning_rate": 1.420334563616666e-05, + "loss": 1.4365, + "step": 45040 + }, + { + "epoch": 7.444742821731047, + "grad_norm": 13.411223411560059, + "learning_rate": 1.4194164417268039e-05, + "loss": 1.4682, + "step": 45050 + }, + { + "epoch": 7.4463953728568475, + "grad_norm": 9.226353645324707, + "learning_rate": 1.4184983198369416e-05, + "loss": 1.2974, + "step": 45060 + }, + { + "epoch": 7.4480479239826485, + "grad_norm": 11.580143928527832, + "learning_rate": 1.4175801979470796e-05, + "loss": 1.517, + "step": 45070 + }, + { + "epoch": 7.449700475108449, + "grad_norm": 20.38673973083496, + "learning_rate": 1.4166620760572175e-05, + "loss": 1.3698, + "step": 45080 + }, + { + "epoch": 7.4513530262342496, + "grad_norm": 8.279924392700195, + 
"learning_rate": 1.4157439541673554e-05, + "loss": 1.5279, + "step": 45090 + }, + { + "epoch": 7.45300557736005, + "grad_norm": 12.932108879089355, + "learning_rate": 1.4148258322774932e-05, + "loss": 1.5431, + "step": 45100 + }, + { + "epoch": 7.45465812848585, + "grad_norm": 16.382911682128906, + "learning_rate": 1.4139077103876311e-05, + "loss": 1.3692, + "step": 45110 + }, + { + "epoch": 7.456310679611651, + "grad_norm": 9.782100677490234, + "learning_rate": 1.412989588497769e-05, + "loss": 1.3877, + "step": 45120 + }, + { + "epoch": 7.457963230737451, + "grad_norm": 13.853610038757324, + "learning_rate": 1.4120714666079068e-05, + "loss": 1.3601, + "step": 45130 + }, + { + "epoch": 7.459615781863252, + "grad_norm": 15.717327117919922, + "learning_rate": 1.4111533447180447e-05, + "loss": 1.5425, + "step": 45140 + }, + { + "epoch": 7.461268332989052, + "grad_norm": 11.447760581970215, + "learning_rate": 1.4102352228281826e-05, + "loss": 1.3657, + "step": 45150 + }, + { + "epoch": 7.462920884114852, + "grad_norm": 15.681619644165039, + "learning_rate": 1.4093171009383207e-05, + "loss": 1.3663, + "step": 45160 + }, + { + "epoch": 7.464573435240653, + "grad_norm": 11.067183494567871, + "learning_rate": 1.4083989790484586e-05, + "loss": 1.4642, + "step": 45170 + }, + { + "epoch": 7.466225986366453, + "grad_norm": 21.230554580688477, + "learning_rate": 1.4074808571585966e-05, + "loss": 1.2956, + "step": 45180 + }, + { + "epoch": 7.467878537492254, + "grad_norm": 12.558381080627441, + "learning_rate": 1.4065627352687343e-05, + "loss": 1.48, + "step": 45190 + }, + { + "epoch": 7.469531088618054, + "grad_norm": 8.757758140563965, + "learning_rate": 1.4056446133788723e-05, + "loss": 1.1943, + "step": 45200 + }, + { + "epoch": 7.471183639743854, + "grad_norm": 16.4716796875, + "learning_rate": 1.4047264914890102e-05, + "loss": 1.3091, + "step": 45210 + }, + { + "epoch": 7.472836190869655, + "grad_norm": 22.595905303955078, + "learning_rate": 1.4038083695991481e-05, + "loss": 1.5014, + "step": 45220 + }, + { + "epoch": 7.474488741995455, + "grad_norm": 12.184328079223633, + "learning_rate": 1.4028902477092859e-05, + "loss": 1.2568, + "step": 45230 + }, + { + "epoch": 7.476141293121256, + "grad_norm": 14.307938575744629, + "learning_rate": 1.4019721258194238e-05, + "loss": 1.366, + "step": 45240 + }, + { + "epoch": 7.477793844247056, + "grad_norm": 15.05473518371582, + "learning_rate": 1.4010540039295617e-05, + "loss": 1.3869, + "step": 45250 + }, + { + "epoch": 7.4794463953728565, + "grad_norm": 12.070158958435059, + "learning_rate": 1.4001358820396995e-05, + "loss": 1.296, + "step": 45260 + }, + { + "epoch": 7.4810989464986575, + "grad_norm": 23.16536521911621, + "learning_rate": 1.3992177601498377e-05, + "loss": 1.3242, + "step": 45270 + }, + { + "epoch": 7.482751497624458, + "grad_norm": 11.528528213500977, + "learning_rate": 1.3982996382599755e-05, + "loss": 1.4554, + "step": 45280 + }, + { + "epoch": 7.4844040487502586, + "grad_norm": 10.557455062866211, + "learning_rate": 1.3973815163701134e-05, + "loss": 1.3526, + "step": 45290 + }, + { + "epoch": 7.486056599876059, + "grad_norm": 6.784778594970703, + "learning_rate": 1.3964633944802513e-05, + "loss": 1.3421, + "step": 45300 + }, + { + "epoch": 7.487709151001859, + "grad_norm": 10.750931739807129, + "learning_rate": 1.3955452725903893e-05, + "loss": 1.305, + "step": 45310 + }, + { + "epoch": 7.48936170212766, + "grad_norm": 16.1955623626709, + "learning_rate": 1.394627150700527e-05, + "loss": 1.4377, + "step": 45320 + }, + { + "epoch": 
7.49101425325346, + "grad_norm": 14.3265962600708, + "learning_rate": 1.393709028810665e-05, + "loss": 1.3428, + "step": 45330 + }, + { + "epoch": 7.492666804379261, + "grad_norm": 31.921222686767578, + "learning_rate": 1.3927909069208029e-05, + "loss": 1.4143, + "step": 45340 + }, + { + "epoch": 7.494319355505061, + "grad_norm": 20.369794845581055, + "learning_rate": 1.3918727850309408e-05, + "loss": 1.3892, + "step": 45350 + }, + { + "epoch": 7.495971906630861, + "grad_norm": 14.980151176452637, + "learning_rate": 1.3909546631410785e-05, + "loss": 1.3714, + "step": 45360 + }, + { + "epoch": 7.497624457756662, + "grad_norm": 21.37506675720215, + "learning_rate": 1.3900365412512165e-05, + "loss": 1.3442, + "step": 45370 + }, + { + "epoch": 7.499277008882462, + "grad_norm": 9.15767765045166, + "learning_rate": 1.3891184193613546e-05, + "loss": 1.2651, + "step": 45380 + }, + { + "epoch": 7.500929560008263, + "grad_norm": 19.919265747070312, + "learning_rate": 1.3882002974714925e-05, + "loss": 1.4399, + "step": 45390 + }, + { + "epoch": 7.502582111134063, + "grad_norm": 18.08030891418457, + "learning_rate": 1.3872821755816304e-05, + "loss": 1.2924, + "step": 45400 + }, + { + "epoch": 7.504234662259863, + "grad_norm": 14.912238121032715, + "learning_rate": 1.3863640536917683e-05, + "loss": 1.356, + "step": 45410 + }, + { + "epoch": 7.505887213385664, + "grad_norm": 18.654098510742188, + "learning_rate": 1.3854459318019061e-05, + "loss": 1.3739, + "step": 45420 + }, + { + "epoch": 7.507539764511464, + "grad_norm": 37.90325164794922, + "learning_rate": 1.384527809912044e-05, + "loss": 1.4239, + "step": 45430 + }, + { + "epoch": 7.509192315637265, + "grad_norm": 10.297859191894531, + "learning_rate": 1.383609688022182e-05, + "loss": 1.3303, + "step": 45440 + }, + { + "epoch": 7.510844866763065, + "grad_norm": 16.992605209350586, + "learning_rate": 1.3826915661323197e-05, + "loss": 1.4006, + "step": 45450 + }, + { + "epoch": 7.512497417888866, + "grad_norm": 11.61043930053711, + "learning_rate": 1.3817734442424576e-05, + "loss": 1.3269, + "step": 45460 + }, + { + "epoch": 7.5141499690146665, + "grad_norm": 11.616755485534668, + "learning_rate": 1.3808553223525955e-05, + "loss": 1.2896, + "step": 45470 + }, + { + "epoch": 7.515802520140467, + "grad_norm": 13.678464889526367, + "learning_rate": 1.3799372004627335e-05, + "loss": 1.4308, + "step": 45480 + }, + { + "epoch": 7.5174550712662676, + "grad_norm": 15.36746597290039, + "learning_rate": 1.3790190785728712e-05, + "loss": 1.2909, + "step": 45490 + }, + { + "epoch": 7.519107622392068, + "grad_norm": 17.820960998535156, + "learning_rate": 1.3781009566830095e-05, + "loss": 1.273, + "step": 45500 + }, + { + "epoch": 7.520760173517868, + "grad_norm": 34.905059814453125, + "learning_rate": 1.3771828347931472e-05, + "loss": 1.3707, + "step": 45510 + }, + { + "epoch": 7.522412724643669, + "grad_norm": 9.051143646240234, + "learning_rate": 1.3762647129032852e-05, + "loss": 1.3682, + "step": 45520 + }, + { + "epoch": 7.524065275769469, + "grad_norm": 12.66955852508545, + "learning_rate": 1.3753465910134231e-05, + "loss": 1.361, + "step": 45530 + }, + { + "epoch": 7.52571782689527, + "grad_norm": 12.091620445251465, + "learning_rate": 1.374428469123561e-05, + "loss": 1.3856, + "step": 45540 + }, + { + "epoch": 7.52737037802107, + "grad_norm": 9.986820220947266, + "learning_rate": 1.3735103472336988e-05, + "loss": 1.362, + "step": 45550 + }, + { + "epoch": 7.529022929146871, + "grad_norm": 40.41029357910156, + "learning_rate": 1.3725922253438367e-05, + 
"loss": 1.4566, + "step": 45560 + }, + { + "epoch": 7.530675480272671, + "grad_norm": 51.40917205810547, + "learning_rate": 1.3716741034539746e-05, + "loss": 1.4305, + "step": 45570 + }, + { + "epoch": 7.532328031398471, + "grad_norm": 12.97027587890625, + "learning_rate": 1.3707559815641124e-05, + "loss": 1.444, + "step": 45580 + }, + { + "epoch": 7.533980582524272, + "grad_norm": 10.593807220458984, + "learning_rate": 1.3698378596742503e-05, + "loss": 1.4671, + "step": 45590 + }, + { + "epoch": 7.535633133650072, + "grad_norm": 14.09102725982666, + "learning_rate": 1.3689197377843882e-05, + "loss": 1.2288, + "step": 45600 + }, + { + "epoch": 7.537285684775873, + "grad_norm": 13.20085620880127, + "learning_rate": 1.3680016158945263e-05, + "loss": 1.5422, + "step": 45610 + }, + { + "epoch": 7.538938235901673, + "grad_norm": 13.723567008972168, + "learning_rate": 1.3670834940046642e-05, + "loss": 1.3606, + "step": 45620 + }, + { + "epoch": 7.540590787027473, + "grad_norm": 12.14212703704834, + "learning_rate": 1.3661653721148022e-05, + "loss": 1.3494, + "step": 45630 + }, + { + "epoch": 7.542243338153274, + "grad_norm": 18.38138771057129, + "learning_rate": 1.36524725022494e-05, + "loss": 1.3763, + "step": 45640 + }, + { + "epoch": 7.543895889279074, + "grad_norm": 9.355786323547363, + "learning_rate": 1.3643291283350778e-05, + "loss": 1.4087, + "step": 45650 + }, + { + "epoch": 7.545548440404875, + "grad_norm": 17.49809455871582, + "learning_rate": 1.3634110064452158e-05, + "loss": 1.3953, + "step": 45660 + }, + { + "epoch": 7.5472009915306755, + "grad_norm": 29.25096893310547, + "learning_rate": 1.3624928845553537e-05, + "loss": 1.3762, + "step": 45670 + }, + { + "epoch": 7.548853542656476, + "grad_norm": 21.453706741333008, + "learning_rate": 1.3615747626654915e-05, + "loss": 1.4828, + "step": 45680 + }, + { + "epoch": 7.5505060937822766, + "grad_norm": 9.639589309692383, + "learning_rate": 1.3606566407756294e-05, + "loss": 1.4189, + "step": 45690 + }, + { + "epoch": 7.552158644908077, + "grad_norm": 18.783720016479492, + "learning_rate": 1.3597385188857673e-05, + "loss": 1.3584, + "step": 45700 + }, + { + "epoch": 7.553811196033878, + "grad_norm": 10.537060737609863, + "learning_rate": 1.358820396995905e-05, + "loss": 1.3764, + "step": 45710 + }, + { + "epoch": 7.555463747159678, + "grad_norm": 17.940895080566406, + "learning_rate": 1.357902275106043e-05, + "loss": 1.5005, + "step": 45720 + }, + { + "epoch": 7.557116298285479, + "grad_norm": 13.430806159973145, + "learning_rate": 1.3569841532161812e-05, + "loss": 1.4229, + "step": 45730 + }, + { + "epoch": 7.558768849411279, + "grad_norm": 37.275325775146484, + "learning_rate": 1.356066031326319e-05, + "loss": 1.332, + "step": 45740 + }, + { + "epoch": 7.560421400537079, + "grad_norm": 14.86922550201416, + "learning_rate": 1.355147909436457e-05, + "loss": 1.3686, + "step": 45750 + }, + { + "epoch": 7.56207395166288, + "grad_norm": 87.05618286132812, + "learning_rate": 1.3542297875465949e-05, + "loss": 1.4918, + "step": 45760 + }, + { + "epoch": 7.56372650278868, + "grad_norm": 11.52795696258545, + "learning_rate": 1.3533116656567326e-05, + "loss": 1.3878, + "step": 45770 + }, + { + "epoch": 7.56537905391448, + "grad_norm": 13.127462387084961, + "learning_rate": 1.3523935437668705e-05, + "loss": 1.3601, + "step": 45780 + }, + { + "epoch": 7.567031605040281, + "grad_norm": 14.65783977508545, + "learning_rate": 1.3514754218770085e-05, + "loss": 1.3726, + "step": 45790 + }, + { + "epoch": 7.568684156166081, + "grad_norm": 
12.145511627197266, + "learning_rate": 1.3505572999871464e-05, + "loss": 1.4446, + "step": 45800 + }, + { + "epoch": 7.570336707291882, + "grad_norm": 20.19838523864746, + "learning_rate": 1.3496391780972841e-05, + "loss": 1.472, + "step": 45810 + }, + { + "epoch": 7.571989258417682, + "grad_norm": 8.90953254699707, + "learning_rate": 1.348721056207422e-05, + "loss": 1.3158, + "step": 45820 + }, + { + "epoch": 7.573641809543483, + "grad_norm": 14.088125228881836, + "learning_rate": 1.34780293431756e-05, + "loss": 1.2182, + "step": 45830 + }, + { + "epoch": 7.575294360669283, + "grad_norm": 14.895604133605957, + "learning_rate": 1.346884812427698e-05, + "loss": 1.501, + "step": 45840 + }, + { + "epoch": 7.576946911795083, + "grad_norm": 11.624994277954102, + "learning_rate": 1.345966690537836e-05, + "loss": 1.4957, + "step": 45850 + }, + { + "epoch": 7.578599462920884, + "grad_norm": 19.928787231445312, + "learning_rate": 1.345048568647974e-05, + "loss": 1.2905, + "step": 45860 + }, + { + "epoch": 7.5802520140466845, + "grad_norm": 12.225404739379883, + "learning_rate": 1.3441304467581117e-05, + "loss": 1.5096, + "step": 45870 + }, + { + "epoch": 7.581904565172485, + "grad_norm": 12.218575477600098, + "learning_rate": 1.3432123248682496e-05, + "loss": 1.289, + "step": 45880 + }, + { + "epoch": 7.5835571162982855, + "grad_norm": 17.526824951171875, + "learning_rate": 1.3422942029783875e-05, + "loss": 1.4192, + "step": 45890 + }, + { + "epoch": 7.585209667424086, + "grad_norm": 13.405013084411621, + "learning_rate": 1.3413760810885253e-05, + "loss": 1.4625, + "step": 45900 + }, + { + "epoch": 7.586862218549887, + "grad_norm": 10.898892402648926, + "learning_rate": 1.3404579591986632e-05, + "loss": 1.3915, + "step": 45910 + }, + { + "epoch": 7.588514769675687, + "grad_norm": 17.272350311279297, + "learning_rate": 1.3395398373088011e-05, + "loss": 1.409, + "step": 45920 + }, + { + "epoch": 7.590167320801488, + "grad_norm": 13.411367416381836, + "learning_rate": 1.338621715418939e-05, + "loss": 1.4557, + "step": 45930 + }, + { + "epoch": 7.591819871927288, + "grad_norm": 33.857017517089844, + "learning_rate": 1.3377035935290768e-05, + "loss": 1.2233, + "step": 45940 + }, + { + "epoch": 7.593472423053088, + "grad_norm": 18.17672348022461, + "learning_rate": 1.3367854716392147e-05, + "loss": 1.3755, + "step": 45950 + }, + { + "epoch": 7.595124974178889, + "grad_norm": 11.827006340026855, + "learning_rate": 1.3358673497493528e-05, + "loss": 1.3968, + "step": 45960 + }, + { + "epoch": 7.596777525304689, + "grad_norm": 10.973575592041016, + "learning_rate": 1.3349492278594908e-05, + "loss": 1.4225, + "step": 45970 + }, + { + "epoch": 7.59843007643049, + "grad_norm": 13.734282493591309, + "learning_rate": 1.3340311059696287e-05, + "loss": 1.3545, + "step": 45980 + }, + { + "epoch": 7.60008262755629, + "grad_norm": 20.4477596282959, + "learning_rate": 1.3331129840797666e-05, + "loss": 1.4661, + "step": 45990 + }, + { + "epoch": 7.60173517868209, + "grad_norm": 11.852956771850586, + "learning_rate": 1.3321948621899044e-05, + "loss": 1.28, + "step": 46000 + }, + { + "epoch": 7.603387729807891, + "grad_norm": 11.33363151550293, + "learning_rate": 1.3312767403000423e-05, + "loss": 1.4586, + "step": 46010 + }, + { + "epoch": 7.605040280933691, + "grad_norm": 10.724862098693848, + "learning_rate": 1.3303586184101802e-05, + "loss": 1.5499, + "step": 46020 + }, + { + "epoch": 7.606692832059492, + "grad_norm": 12.029391288757324, + "learning_rate": 1.329440496520318e-05, + "loss": 1.3334, + "step": 46030 + }, + 
{ + "epoch": 7.608345383185292, + "grad_norm": 27.66431999206543, + "learning_rate": 1.3285223746304559e-05, + "loss": 1.4114, + "step": 46040 + }, + { + "epoch": 7.609997934311092, + "grad_norm": 11.550406455993652, + "learning_rate": 1.3276042527405938e-05, + "loss": 1.3458, + "step": 46050 + }, + { + "epoch": 7.611650485436893, + "grad_norm": 19.206697463989258, + "learning_rate": 1.3266861308507317e-05, + "loss": 1.4311, + "step": 46060 + }, + { + "epoch": 7.6133030365626935, + "grad_norm": 13.49929428100586, + "learning_rate": 1.3257680089608698e-05, + "loss": 1.4536, + "step": 46070 + }, + { + "epoch": 7.6149555876884945, + "grad_norm": 12.506552696228027, + "learning_rate": 1.3248498870710078e-05, + "loss": 1.4649, + "step": 46080 + }, + { + "epoch": 7.6166081388142945, + "grad_norm": 16.31134605407715, + "learning_rate": 1.3239317651811455e-05, + "loss": 1.4549, + "step": 46090 + }, + { + "epoch": 7.618260689940095, + "grad_norm": 9.935333251953125, + "learning_rate": 1.3230136432912834e-05, + "loss": 1.387, + "step": 46100 + }, + { + "epoch": 7.619913241065896, + "grad_norm": 14.679750442504883, + "learning_rate": 1.3220955214014214e-05, + "loss": 1.4526, + "step": 46110 + }, + { + "epoch": 7.621565792191696, + "grad_norm": 11.663803100585938, + "learning_rate": 1.3211773995115593e-05, + "loss": 1.4375, + "step": 46120 + }, + { + "epoch": 7.623218343317497, + "grad_norm": 333.21502685546875, + "learning_rate": 1.320259277621697e-05, + "loss": 1.5769, + "step": 46130 + }, + { + "epoch": 7.624870894443297, + "grad_norm": 17.140153884887695, + "learning_rate": 1.319341155731835e-05, + "loss": 1.3195, + "step": 46140 + }, + { + "epoch": 7.626523445569097, + "grad_norm": 7.242269039154053, + "learning_rate": 1.3184230338419729e-05, + "loss": 1.4281, + "step": 46150 + }, + { + "epoch": 7.628175996694898, + "grad_norm": 13.725931167602539, + "learning_rate": 1.3175049119521107e-05, + "loss": 1.3323, + "step": 46160 + }, + { + "epoch": 7.629828547820698, + "grad_norm": 14.671615600585938, + "learning_rate": 1.3165867900622486e-05, + "loss": 1.4224, + "step": 46170 + }, + { + "epoch": 7.631481098946499, + "grad_norm": 12.745248794555664, + "learning_rate": 1.3156686681723868e-05, + "loss": 1.3534, + "step": 46180 + }, + { + "epoch": 7.633133650072299, + "grad_norm": 24.55314064025879, + "learning_rate": 1.3147505462825246e-05, + "loss": 1.2832, + "step": 46190 + }, + { + "epoch": 7.6347862011981, + "grad_norm": 17.937116622924805, + "learning_rate": 1.3138324243926625e-05, + "loss": 1.2754, + "step": 46200 + }, + { + "epoch": 7.6364387523239, + "grad_norm": 26.932025909423828, + "learning_rate": 1.3129143025028005e-05, + "loss": 1.3059, + "step": 46210 + }, + { + "epoch": 7.6380913034497, + "grad_norm": 11.607211112976074, + "learning_rate": 1.3119961806129382e-05, + "loss": 1.4191, + "step": 46220 + }, + { + "epoch": 7.639743854575501, + "grad_norm": 30.498897552490234, + "learning_rate": 1.3110780587230761e-05, + "loss": 1.6439, + "step": 46230 + }, + { + "epoch": 7.641396405701301, + "grad_norm": 9.308640480041504, + "learning_rate": 1.310159936833214e-05, + "loss": 1.3958, + "step": 46240 + }, + { + "epoch": 7.643048956827101, + "grad_norm": 12.674652099609375, + "learning_rate": 1.309241814943352e-05, + "loss": 1.3947, + "step": 46250 + }, + { + "epoch": 7.644701507952902, + "grad_norm": 18.442964553833008, + "learning_rate": 1.3083236930534897e-05, + "loss": 1.3829, + "step": 46260 + }, + { + "epoch": 7.6463540590787025, + "grad_norm": 9.009620666503906, + "learning_rate": 
1.3074055711636277e-05, + "loss": 1.382, + "step": 46270 + }, + { + "epoch": 7.6480066102045035, + "grad_norm": 11.820587158203125, + "learning_rate": 1.3064874492737656e-05, + "loss": 1.2625, + "step": 46280 + }, + { + "epoch": 7.6496591613303035, + "grad_norm": 13.428104400634766, + "learning_rate": 1.3055693273839033e-05, + "loss": 1.2081, + "step": 46290 + }, + { + "epoch": 7.6513117124561045, + "grad_norm": 226.61851501464844, + "learning_rate": 1.3046512054940416e-05, + "loss": 1.3743, + "step": 46300 + }, + { + "epoch": 7.652964263581905, + "grad_norm": 19.351016998291016, + "learning_rate": 1.3037330836041795e-05, + "loss": 1.3831, + "step": 46310 + }, + { + "epoch": 7.654616814707705, + "grad_norm": 9.168930053710938, + "learning_rate": 1.3028149617143173e-05, + "loss": 1.3555, + "step": 46320 + }, + { + "epoch": 7.656269365833506, + "grad_norm": 17.819650650024414, + "learning_rate": 1.3018968398244552e-05, + "loss": 1.4432, + "step": 46330 + }, + { + "epoch": 7.657921916959306, + "grad_norm": 57.033180236816406, + "learning_rate": 1.3009787179345931e-05, + "loss": 1.3021, + "step": 46340 + }, + { + "epoch": 7.659574468085106, + "grad_norm": 25.294410705566406, + "learning_rate": 1.3000605960447309e-05, + "loss": 1.4219, + "step": 46350 + }, + { + "epoch": 7.661227019210907, + "grad_norm": 22.175817489624023, + "learning_rate": 1.2991424741548688e-05, + "loss": 1.4499, + "step": 46360 + }, + { + "epoch": 7.662879570336707, + "grad_norm": 12.002792358398438, + "learning_rate": 1.2982243522650067e-05, + "loss": 1.3654, + "step": 46370 + }, + { + "epoch": 7.664532121462508, + "grad_norm": 9.988709449768066, + "learning_rate": 1.2973062303751447e-05, + "loss": 1.4325, + "step": 46380 + }, + { + "epoch": 7.666184672588308, + "grad_norm": 14.610112190246582, + "learning_rate": 1.2963881084852824e-05, + "loss": 1.2868, + "step": 46390 + }, + { + "epoch": 7.667837223714109, + "grad_norm": 14.141910552978516, + "learning_rate": 1.2954699865954203e-05, + "loss": 1.3978, + "step": 46400 + }, + { + "epoch": 7.669489774839909, + "grad_norm": 24.699037551879883, + "learning_rate": 1.2945518647055584e-05, + "loss": 1.3868, + "step": 46410 + }, + { + "epoch": 7.671142325965709, + "grad_norm": 13.397317886352539, + "learning_rate": 1.2936337428156964e-05, + "loss": 1.329, + "step": 46420 + }, + { + "epoch": 7.67279487709151, + "grad_norm": 13.475739479064941, + "learning_rate": 1.2927156209258343e-05, + "loss": 1.4917, + "step": 46430 + }, + { + "epoch": 7.67444742821731, + "grad_norm": 14.52394962310791, + "learning_rate": 1.2917974990359722e-05, + "loss": 1.3478, + "step": 46440 + }, + { + "epoch": 7.676099979343111, + "grad_norm": 20.19123077392578, + "learning_rate": 1.29087937714611e-05, + "loss": 1.5876, + "step": 46450 + }, + { + "epoch": 7.677752530468911, + "grad_norm": 10.427851676940918, + "learning_rate": 1.2899612552562479e-05, + "loss": 1.3178, + "step": 46460 + }, + { + "epoch": 7.6794050815947115, + "grad_norm": 15.901928901672363, + "learning_rate": 1.2890431333663858e-05, + "loss": 1.4723, + "step": 46470 + }, + { + "epoch": 7.6810576327205125, + "grad_norm": 16.793914794921875, + "learning_rate": 1.2881250114765236e-05, + "loss": 1.3749, + "step": 46480 + }, + { + "epoch": 7.6827101838463125, + "grad_norm": 9.323737144470215, + "learning_rate": 1.2872068895866615e-05, + "loss": 1.3149, + "step": 46490 + }, + { + "epoch": 7.6843627349721135, + "grad_norm": 13.843609809875488, + "learning_rate": 1.2862887676967994e-05, + "loss": 1.4424, + "step": 46500 + }, + { + "epoch": 
7.686015286097914, + "grad_norm": 16.021211624145508, + "learning_rate": 1.2853706458069373e-05, + "loss": 1.3349, + "step": 46510 + }, + { + "epoch": 7.687667837223714, + "grad_norm": 12.02914810180664, + "learning_rate": 1.2844525239170751e-05, + "loss": 1.3076, + "step": 46520 + }, + { + "epoch": 7.689320388349515, + "grad_norm": 9.748140335083008, + "learning_rate": 1.2835344020272134e-05, + "loss": 1.3444, + "step": 46530 + }, + { + "epoch": 7.690972939475315, + "grad_norm": 17.1309757232666, + "learning_rate": 1.2826162801373511e-05, + "loss": 1.3255, + "step": 46540 + }, + { + "epoch": 7.692625490601116, + "grad_norm": 10.874885559082031, + "learning_rate": 1.281698158247489e-05, + "loss": 1.3793, + "step": 46550 + }, + { + "epoch": 7.694278041726916, + "grad_norm": 13.18276596069336, + "learning_rate": 1.280780036357627e-05, + "loss": 1.399, + "step": 46560 + }, + { + "epoch": 7.695930592852717, + "grad_norm": 12.042601585388184, + "learning_rate": 1.2798619144677649e-05, + "loss": 1.3615, + "step": 46570 + }, + { + "epoch": 7.697583143978517, + "grad_norm": 25.975971221923828, + "learning_rate": 1.2789437925779027e-05, + "loss": 1.5137, + "step": 46580 + }, + { + "epoch": 7.699235695104317, + "grad_norm": 16.21059226989746, + "learning_rate": 1.2780256706880406e-05, + "loss": 1.3351, + "step": 46590 + }, + { + "epoch": 7.700888246230118, + "grad_norm": 13.045463562011719, + "learning_rate": 1.2771075487981785e-05, + "loss": 1.3001, + "step": 46600 + }, + { + "epoch": 7.702540797355918, + "grad_norm": 11.22043228149414, + "learning_rate": 1.2761894269083163e-05, + "loss": 1.4261, + "step": 46610 + }, + { + "epoch": 7.704193348481718, + "grad_norm": 36.950538635253906, + "learning_rate": 1.2752713050184542e-05, + "loss": 1.3339, + "step": 46620 + }, + { + "epoch": 7.705845899607519, + "grad_norm": 9.229752540588379, + "learning_rate": 1.2743531831285921e-05, + "loss": 1.3431, + "step": 46630 + }, + { + "epoch": 7.707498450733319, + "grad_norm": 11.805033683776855, + "learning_rate": 1.2734350612387302e-05, + "loss": 1.3491, + "step": 46640 + }, + { + "epoch": 7.70915100185912, + "grad_norm": 23.916683197021484, + "learning_rate": 1.2725169393488681e-05, + "loss": 1.3354, + "step": 46650 + }, + { + "epoch": 7.71080355298492, + "grad_norm": 15.571776390075684, + "learning_rate": 1.271598817459006e-05, + "loss": 1.4053, + "step": 46660 + }, + { + "epoch": 7.712456104110721, + "grad_norm": 16.37409782409668, + "learning_rate": 1.2706806955691438e-05, + "loss": 1.2922, + "step": 46670 + }, + { + "epoch": 7.7141086552365215, + "grad_norm": 10.659138679504395, + "learning_rate": 1.2697625736792817e-05, + "loss": 1.3695, + "step": 46680 + }, + { + "epoch": 7.7157612063623215, + "grad_norm": 20.0853328704834, + "learning_rate": 1.2688444517894197e-05, + "loss": 1.4891, + "step": 46690 + }, + { + "epoch": 7.7174137574881225, + "grad_norm": 16.797853469848633, + "learning_rate": 1.2679263298995576e-05, + "loss": 1.4373, + "step": 46700 + }, + { + "epoch": 7.719066308613923, + "grad_norm": 14.921588897705078, + "learning_rate": 1.2670082080096953e-05, + "loss": 1.4576, + "step": 46710 + }, + { + "epoch": 7.720718859739723, + "grad_norm": 9.122919082641602, + "learning_rate": 1.2660900861198333e-05, + "loss": 1.3505, + "step": 46720 + }, + { + "epoch": 7.722371410865524, + "grad_norm": 18.881772994995117, + "learning_rate": 1.2651719642299712e-05, + "loss": 1.5416, + "step": 46730 + }, + { + "epoch": 7.724023961991324, + "grad_norm": 9.798796653747559, + "learning_rate": 1.264253842340109e-05, + 
"loss": 1.3828, + "step": 46740 + }, + { + "epoch": 7.725676513117125, + "grad_norm": 21.321590423583984, + "learning_rate": 1.2633357204502472e-05, + "loss": 1.4042, + "step": 46750 + }, + { + "epoch": 7.727329064242925, + "grad_norm": 16.886615753173828, + "learning_rate": 1.2624175985603851e-05, + "loss": 1.3997, + "step": 46760 + }, + { + "epoch": 7.728981615368726, + "grad_norm": 16.772817611694336, + "learning_rate": 1.2614994766705229e-05, + "loss": 1.4211, + "step": 46770 + }, + { + "epoch": 7.730634166494526, + "grad_norm": 19.1043758392334, + "learning_rate": 1.2605813547806608e-05, + "loss": 1.3403, + "step": 46780 + }, + { + "epoch": 7.732286717620326, + "grad_norm": 17.032512664794922, + "learning_rate": 1.2596632328907987e-05, + "loss": 1.3278, + "step": 46790 + }, + { + "epoch": 7.733939268746127, + "grad_norm": 9.3512544631958, + "learning_rate": 1.2587451110009365e-05, + "loss": 1.4196, + "step": 46800 + }, + { + "epoch": 7.735591819871927, + "grad_norm": 14.732444763183594, + "learning_rate": 1.2578269891110744e-05, + "loss": 1.4069, + "step": 46810 + }, + { + "epoch": 7.737244370997728, + "grad_norm": 19.343429565429688, + "learning_rate": 1.2569088672212123e-05, + "loss": 1.3892, + "step": 46820 + }, + { + "epoch": 7.738896922123528, + "grad_norm": 15.403067588806152, + "learning_rate": 1.2559907453313503e-05, + "loss": 1.433, + "step": 46830 + }, + { + "epoch": 7.740549473249328, + "grad_norm": 14.680621147155762, + "learning_rate": 1.255072623441488e-05, + "loss": 1.5072, + "step": 46840 + }, + { + "epoch": 7.742202024375129, + "grad_norm": 21.64932632446289, + "learning_rate": 1.254154501551626e-05, + "loss": 1.4388, + "step": 46850 + }, + { + "epoch": 7.743854575500929, + "grad_norm": 12.506025314331055, + "learning_rate": 1.2532363796617639e-05, + "loss": 1.3015, + "step": 46860 + }, + { + "epoch": 7.74550712662673, + "grad_norm": 18.126253128051758, + "learning_rate": 1.252318257771902e-05, + "loss": 1.2921, + "step": 46870 + }, + { + "epoch": 7.7471596777525304, + "grad_norm": 8.994416236877441, + "learning_rate": 1.2514001358820399e-05, + "loss": 1.3792, + "step": 46880 + }, + { + "epoch": 7.7488122288783305, + "grad_norm": 14.529839515686035, + "learning_rate": 1.2504820139921778e-05, + "loss": 1.3501, + "step": 46890 + }, + { + "epoch": 7.7504647800041315, + "grad_norm": 8.697839736938477, + "learning_rate": 1.2495638921023156e-05, + "loss": 1.3575, + "step": 46900 + }, + { + "epoch": 7.752117331129932, + "grad_norm": 14.368610382080078, + "learning_rate": 1.2486457702124535e-05, + "loss": 1.3781, + "step": 46910 + }, + { + "epoch": 7.753769882255733, + "grad_norm": 11.302849769592285, + "learning_rate": 1.2477276483225914e-05, + "loss": 1.4043, + "step": 46920 + }, + { + "epoch": 7.755422433381533, + "grad_norm": 13.063495635986328, + "learning_rate": 1.2468095264327292e-05, + "loss": 1.3099, + "step": 46930 + }, + { + "epoch": 7.757074984507334, + "grad_norm": 11.363663673400879, + "learning_rate": 1.2458914045428671e-05, + "loss": 1.2627, + "step": 46940 + }, + { + "epoch": 7.758727535633134, + "grad_norm": 16.601898193359375, + "learning_rate": 1.2449732826530052e-05, + "loss": 1.2975, + "step": 46950 + }, + { + "epoch": 7.760380086758934, + "grad_norm": 15.275944709777832, + "learning_rate": 1.244055160763143e-05, + "loss": 1.3798, + "step": 46960 + }, + { + "epoch": 7.762032637884735, + "grad_norm": 15.088699340820312, + "learning_rate": 1.2431370388732809e-05, + "loss": 1.4236, + "step": 46970 + }, + { + "epoch": 7.763685189010535, + "grad_norm": 
49.174373626708984, + "learning_rate": 1.2422189169834188e-05, + "loss": 1.446, + "step": 46980 + }, + { + "epoch": 7.765337740136335, + "grad_norm": 10.12486457824707, + "learning_rate": 1.2413007950935567e-05, + "loss": 1.3574, + "step": 46990 + }, + { + "epoch": 7.766990291262136, + "grad_norm": 13.83552074432373, + "learning_rate": 1.2403826732036945e-05, + "loss": 1.3296, + "step": 47000 + }, + { + "epoch": 7.768642842387936, + "grad_norm": 7.309926986694336, + "learning_rate": 1.2394645513138326e-05, + "loss": 1.2785, + "step": 47010 + }, + { + "epoch": 7.770295393513737, + "grad_norm": 11.483392715454102, + "learning_rate": 1.2385464294239705e-05, + "loss": 1.3437, + "step": 47020 + }, + { + "epoch": 7.771947944639537, + "grad_norm": 14.15250301361084, + "learning_rate": 1.2376283075341082e-05, + "loss": 1.1891, + "step": 47030 + }, + { + "epoch": 7.773600495765338, + "grad_norm": 8.438793182373047, + "learning_rate": 1.2367101856442462e-05, + "loss": 1.2479, + "step": 47040 + }, + { + "epoch": 7.775253046891138, + "grad_norm": 11.011880874633789, + "learning_rate": 1.2357920637543841e-05, + "loss": 1.2589, + "step": 47050 + }, + { + "epoch": 7.776905598016938, + "grad_norm": 12.733864784240723, + "learning_rate": 1.2348739418645219e-05, + "loss": 1.3427, + "step": 47060 + }, + { + "epoch": 7.778558149142739, + "grad_norm": 11.575126647949219, + "learning_rate": 1.23395581997466e-05, + "loss": 1.3099, + "step": 47070 + }, + { + "epoch": 7.7802107002685394, + "grad_norm": 9.306955337524414, + "learning_rate": 1.2330376980847979e-05, + "loss": 1.3208, + "step": 47080 + }, + { + "epoch": 7.7818632513943395, + "grad_norm": 21.35601234436035, + "learning_rate": 1.2321195761949356e-05, + "loss": 1.4267, + "step": 47090 + }, + { + "epoch": 7.7835158025201405, + "grad_norm": 9.708170890808105, + "learning_rate": 1.2312014543050736e-05, + "loss": 1.3281, + "step": 47100 + }, + { + "epoch": 7.785168353645941, + "grad_norm": 11.418980598449707, + "learning_rate": 1.2302833324152115e-05, + "loss": 1.4658, + "step": 47110 + }, + { + "epoch": 7.786820904771742, + "grad_norm": 33.159698486328125, + "learning_rate": 1.2293652105253494e-05, + "loss": 1.4893, + "step": 47120 + }, + { + "epoch": 7.788473455897542, + "grad_norm": 16.78141212463379, + "learning_rate": 1.2284470886354873e-05, + "loss": 1.5537, + "step": 47130 + }, + { + "epoch": 7.790126007023343, + "grad_norm": 8.051682472229004, + "learning_rate": 1.2275289667456253e-05, + "loss": 1.3073, + "step": 47140 + }, + { + "epoch": 7.791778558149143, + "grad_norm": 10.035185813903809, + "learning_rate": 1.2266108448557632e-05, + "loss": 1.3287, + "step": 47150 + }, + { + "epoch": 7.793431109274943, + "grad_norm": 16.550947189331055, + "learning_rate": 1.225692722965901e-05, + "loss": 1.3027, + "step": 47160 + }, + { + "epoch": 7.795083660400744, + "grad_norm": 13.783156394958496, + "learning_rate": 1.2247746010760389e-05, + "loss": 1.5014, + "step": 47170 + }, + { + "epoch": 7.796736211526544, + "grad_norm": 11.391876220703125, + "learning_rate": 1.223856479186177e-05, + "loss": 1.3918, + "step": 47180 + }, + { + "epoch": 7.798388762652345, + "grad_norm": 16.179780960083008, + "learning_rate": 1.2229383572963147e-05, + "loss": 1.3255, + "step": 47190 + }, + { + "epoch": 7.800041313778145, + "grad_norm": 20.307268142700195, + "learning_rate": 1.2220202354064526e-05, + "loss": 1.3893, + "step": 47200 + }, + { + "epoch": 7.801693864903945, + "grad_norm": 32.367427825927734, + "learning_rate": 1.2211021135165906e-05, + "loss": 1.4139, + "step": 
47210 + }, + { + "epoch": 7.803346416029746, + "grad_norm": 9.915732383728027, + "learning_rate": 1.2201839916267283e-05, + "loss": 1.5902, + "step": 47220 + }, + { + "epoch": 7.804998967155546, + "grad_norm": 10.596333503723145, + "learning_rate": 1.2192658697368662e-05, + "loss": 1.4343, + "step": 47230 + }, + { + "epoch": 7.806651518281347, + "grad_norm": 16.008501052856445, + "learning_rate": 1.2183477478470043e-05, + "loss": 1.3559, + "step": 47240 + }, + { + "epoch": 7.808304069407147, + "grad_norm": 14.380460739135742, + "learning_rate": 1.2174296259571421e-05, + "loss": 1.4441, + "step": 47250 + }, + { + "epoch": 7.809956620532947, + "grad_norm": 18.642454147338867, + "learning_rate": 1.21651150406728e-05, + "loss": 1.3636, + "step": 47260 + }, + { + "epoch": 7.811609171658748, + "grad_norm": 10.425969123840332, + "learning_rate": 1.215593382177418e-05, + "loss": 1.4597, + "step": 47270 + }, + { + "epoch": 7.8132617227845484, + "grad_norm": 15.50149917602539, + "learning_rate": 1.2146752602875559e-05, + "loss": 1.3136, + "step": 47280 + }, + { + "epoch": 7.814914273910349, + "grad_norm": 7.004604816436768, + "learning_rate": 1.2137571383976936e-05, + "loss": 1.3023, + "step": 47290 + }, + { + "epoch": 7.8165668250361495, + "grad_norm": 10.380492210388184, + "learning_rate": 1.2128390165078317e-05, + "loss": 1.2798, + "step": 47300 + }, + { + "epoch": 7.8182193761619505, + "grad_norm": 13.373892784118652, + "learning_rate": 1.2119208946179696e-05, + "loss": 1.4054, + "step": 47310 + }, + { + "epoch": 7.819871927287751, + "grad_norm": 18.053157806396484, + "learning_rate": 1.2110027727281074e-05, + "loss": 1.5367, + "step": 47320 + }, + { + "epoch": 7.821524478413551, + "grad_norm": 8.14160442352295, + "learning_rate": 1.2100846508382453e-05, + "loss": 1.2874, + "step": 47330 + }, + { + "epoch": 7.823177029539352, + "grad_norm": 13.822787284851074, + "learning_rate": 1.2091665289483832e-05, + "loss": 1.3485, + "step": 47340 + }, + { + "epoch": 7.824829580665152, + "grad_norm": 44.869102478027344, + "learning_rate": 1.2082484070585212e-05, + "loss": 1.4024, + "step": 47350 + }, + { + "epoch": 7.826482131790952, + "grad_norm": 12.257972717285156, + "learning_rate": 1.2073302851686591e-05, + "loss": 1.3067, + "step": 47360 + }, + { + "epoch": 7.828134682916753, + "grad_norm": 13.056764602661133, + "learning_rate": 1.206412163278797e-05, + "loss": 1.3647, + "step": 47370 + }, + { + "epoch": 7.829787234042553, + "grad_norm": 9.314446449279785, + "learning_rate": 1.2054940413889348e-05, + "loss": 1.4241, + "step": 47380 + }, + { + "epoch": 7.831439785168354, + "grad_norm": 8.149978637695312, + "learning_rate": 1.2045759194990727e-05, + "loss": 1.3955, + "step": 47390 + }, + { + "epoch": 7.833092336294154, + "grad_norm": 25.435317993164062, + "learning_rate": 1.2036577976092106e-05, + "loss": 1.4185, + "step": 47400 + }, + { + "epoch": 7.834744887419955, + "grad_norm": 15.4861478805542, + "learning_rate": 1.2027396757193485e-05, + "loss": 1.4882, + "step": 47410 + }, + { + "epoch": 7.836397438545755, + "grad_norm": 8.784843444824219, + "learning_rate": 1.2018215538294865e-05, + "loss": 1.33, + "step": 47420 + }, + { + "epoch": 7.838049989671555, + "grad_norm": 14.153306007385254, + "learning_rate": 1.2009034319396244e-05, + "loss": 1.3612, + "step": 47430 + }, + { + "epoch": 7.839702540797356, + "grad_norm": 10.80195426940918, + "learning_rate": 1.1999853100497623e-05, + "loss": 1.48, + "step": 47440 + }, + { + "epoch": 7.841355091923156, + "grad_norm": 8.831650733947754, + "learning_rate": 
1.1990671881599e-05, + "loss": 1.3938, + "step": 47450 + }, + { + "epoch": 7.843007643048956, + "grad_norm": 8.44402027130127, + "learning_rate": 1.198149066270038e-05, + "loss": 1.4246, + "step": 47460 + }, + { + "epoch": 7.844660194174757, + "grad_norm": 10.85715389251709, + "learning_rate": 1.1972309443801761e-05, + "loss": 1.3261, + "step": 47470 + }, + { + "epoch": 7.8463127453005574, + "grad_norm": 17.19044303894043, + "learning_rate": 1.1963128224903138e-05, + "loss": 1.5413, + "step": 47480 + }, + { + "epoch": 7.847965296426358, + "grad_norm": 7.725858211517334, + "learning_rate": 1.1953947006004518e-05, + "loss": 1.4338, + "step": 47490 + }, + { + "epoch": 7.8496178475521585, + "grad_norm": 9.736605644226074, + "learning_rate": 1.1944765787105897e-05, + "loss": 1.2932, + "step": 47500 + }, + { + "epoch": 7.8512703986779595, + "grad_norm": 14.61507511138916, + "learning_rate": 1.1935584568207275e-05, + "loss": 1.4587, + "step": 47510 + }, + { + "epoch": 7.85292294980376, + "grad_norm": 12.031658172607422, + "learning_rate": 1.1926403349308654e-05, + "loss": 1.4079, + "step": 47520 + }, + { + "epoch": 7.85457550092956, + "grad_norm": 16.91095542907715, + "learning_rate": 1.1917222130410035e-05, + "loss": 1.3452, + "step": 47530 + }, + { + "epoch": 7.856228052055361, + "grad_norm": 15.591407775878906, + "learning_rate": 1.1908040911511412e-05, + "loss": 1.4318, + "step": 47540 + }, + { + "epoch": 7.857880603181161, + "grad_norm": 15.299722671508789, + "learning_rate": 1.1898859692612792e-05, + "loss": 1.3371, + "step": 47550 + }, + { + "epoch": 7.859533154306962, + "grad_norm": 8.688148498535156, + "learning_rate": 1.188967847371417e-05, + "loss": 1.4927, + "step": 47560 + }, + { + "epoch": 7.861185705432762, + "grad_norm": 14.59116268157959, + "learning_rate": 1.188049725481555e-05, + "loss": 1.3456, + "step": 47570 + }, + { + "epoch": 7.862838256558562, + "grad_norm": 30.086807250976562, + "learning_rate": 1.187131603591693e-05, + "loss": 1.2876, + "step": 47580 + }, + { + "epoch": 7.864490807684363, + "grad_norm": 25.886877059936523, + "learning_rate": 1.1862134817018308e-05, + "loss": 1.491, + "step": 47590 + }, + { + "epoch": 7.866143358810163, + "grad_norm": 11.027227401733398, + "learning_rate": 1.1852953598119688e-05, + "loss": 1.4246, + "step": 47600 + }, + { + "epoch": 7.867795909935964, + "grad_norm": 10.795921325683594, + "learning_rate": 1.1843772379221065e-05, + "loss": 1.3966, + "step": 47610 + }, + { + "epoch": 7.869448461061764, + "grad_norm": 9.404576301574707, + "learning_rate": 1.1834591160322445e-05, + "loss": 1.3629, + "step": 47620 + }, + { + "epoch": 7.871101012187564, + "grad_norm": 11.95871353149414, + "learning_rate": 1.1825409941423824e-05, + "loss": 1.4068, + "step": 47630 + }, + { + "epoch": 7.872753563313365, + "grad_norm": 11.290194511413574, + "learning_rate": 1.1816228722525203e-05, + "loss": 1.4184, + "step": 47640 + }, + { + "epoch": 7.874406114439165, + "grad_norm": 32.461849212646484, + "learning_rate": 1.1807047503626582e-05, + "loss": 1.4621, + "step": 47650 + }, + { + "epoch": 7.876058665564966, + "grad_norm": 10.396666526794434, + "learning_rate": 1.1797866284727962e-05, + "loss": 1.3957, + "step": 47660 + }, + { + "epoch": 7.877711216690766, + "grad_norm": 12.307036399841309, + "learning_rate": 1.1788685065829339e-05, + "loss": 1.3876, + "step": 47670 + }, + { + "epoch": 7.879363767816566, + "grad_norm": 10.1054048538208, + "learning_rate": 1.1779503846930718e-05, + "loss": 1.4137, + "step": 47680 + }, + { + "epoch": 7.881016318942367, + 
"grad_norm": 16.99799346923828, + "learning_rate": 1.1770322628032098e-05, + "loss": 1.3882, + "step": 47690 + }, + { + "epoch": 7.8826688700681675, + "grad_norm": 14.859070777893066, + "learning_rate": 1.1761141409133477e-05, + "loss": 1.3793, + "step": 47700 + }, + { + "epoch": 7.8843214211939685, + "grad_norm": 13.93166446685791, + "learning_rate": 1.1751960190234856e-05, + "loss": 1.4492, + "step": 47710 + }, + { + "epoch": 7.885973972319769, + "grad_norm": 12.025364875793457, + "learning_rate": 1.1742778971336235e-05, + "loss": 1.5158, + "step": 47720 + }, + { + "epoch": 7.887626523445569, + "grad_norm": 15.031390190124512, + "learning_rate": 1.1733597752437615e-05, + "loss": 1.3162, + "step": 47730 + }, + { + "epoch": 7.88927907457137, + "grad_norm": 13.956084251403809, + "learning_rate": 1.1724416533538992e-05, + "loss": 1.2951, + "step": 47740 + }, + { + "epoch": 7.89093162569717, + "grad_norm": 12.361295700073242, + "learning_rate": 1.1715235314640373e-05, + "loss": 1.5406, + "step": 47750 + }, + { + "epoch": 7.892584176822971, + "grad_norm": 8.774741172790527, + "learning_rate": 1.1706054095741752e-05, + "loss": 1.2487, + "step": 47760 + }, + { + "epoch": 7.894236727948771, + "grad_norm": 31.390975952148438, + "learning_rate": 1.169687287684313e-05, + "loss": 1.3818, + "step": 47770 + }, + { + "epoch": 7.895889279074572, + "grad_norm": 7.857132911682129, + "learning_rate": 1.1687691657944509e-05, + "loss": 1.4169, + "step": 47780 + }, + { + "epoch": 7.897541830200372, + "grad_norm": 27.877517700195312, + "learning_rate": 1.1678510439045888e-05, + "loss": 1.4917, + "step": 47790 + }, + { + "epoch": 7.899194381326172, + "grad_norm": 21.92901611328125, + "learning_rate": 1.1669329220147266e-05, + "loss": 1.4045, + "step": 47800 + }, + { + "epoch": 7.900846932451973, + "grad_norm": 13.53458023071289, + "learning_rate": 1.1660148001248647e-05, + "loss": 1.4553, + "step": 47810 + }, + { + "epoch": 7.902499483577773, + "grad_norm": 13.368227005004883, + "learning_rate": 1.1650966782350026e-05, + "loss": 1.3294, + "step": 47820 + }, + { + "epoch": 7.904152034703573, + "grad_norm": 9.063852310180664, + "learning_rate": 1.1641785563451404e-05, + "loss": 1.4244, + "step": 47830 + }, + { + "epoch": 7.905804585829374, + "grad_norm": 62.96859359741211, + "learning_rate": 1.1632604344552783e-05, + "loss": 1.3679, + "step": 47840 + }, + { + "epoch": 7.907457136955174, + "grad_norm": 11.219415664672852, + "learning_rate": 1.1623423125654162e-05, + "loss": 1.4244, + "step": 47850 + }, + { + "epoch": 7.909109688080975, + "grad_norm": 13.625591278076172, + "learning_rate": 1.1614241906755541e-05, + "loss": 1.4893, + "step": 47860 + }, + { + "epoch": 7.910762239206775, + "grad_norm": 7.387325763702393, + "learning_rate": 1.160506068785692e-05, + "loss": 1.3991, + "step": 47870 + }, + { + "epoch": 7.912414790332576, + "grad_norm": 11.21784782409668, + "learning_rate": 1.15958794689583e-05, + "loss": 1.3294, + "step": 47880 + }, + { + "epoch": 7.914067341458376, + "grad_norm": 17.778669357299805, + "learning_rate": 1.1586698250059679e-05, + "loss": 1.4258, + "step": 47890 + }, + { + "epoch": 7.9157198925841765, + "grad_norm": 12.979781150817871, + "learning_rate": 1.1577517031161057e-05, + "loss": 1.2635, + "step": 47900 + }, + { + "epoch": 7.9173724437099775, + "grad_norm": 13.952737808227539, + "learning_rate": 1.1568335812262436e-05, + "loss": 1.4153, + "step": 47910 + }, + { + "epoch": 7.919024994835778, + "grad_norm": 10.576740264892578, + "learning_rate": 1.1559154593363815e-05, + "loss": 1.455, 
+ "step": 47920 + }, + { + "epoch": 7.920677545961578, + "grad_norm": 9.399680137634277, + "learning_rate": 1.1549973374465194e-05, + "loss": 1.3299, + "step": 47930 + }, + { + "epoch": 7.922330097087379, + "grad_norm": 18.13553237915039, + "learning_rate": 1.1540792155566574e-05, + "loss": 1.3659, + "step": 47940 + }, + { + "epoch": 7.923982648213179, + "grad_norm": 17.09833335876465, + "learning_rate": 1.1531610936667953e-05, + "loss": 1.3799, + "step": 47950 + }, + { + "epoch": 7.92563519933898, + "grad_norm": 11.579339027404785, + "learning_rate": 1.152242971776933e-05, + "loss": 1.4604, + "step": 47960 + }, + { + "epoch": 7.92728775046478, + "grad_norm": 17.407424926757812, + "learning_rate": 1.151324849887071e-05, + "loss": 1.4101, + "step": 47970 + }, + { + "epoch": 7.928940301590581, + "grad_norm": 17.47738265991211, + "learning_rate": 1.150406727997209e-05, + "loss": 1.3574, + "step": 47980 + }, + { + "epoch": 7.930592852716381, + "grad_norm": 10.31318473815918, + "learning_rate": 1.1494886061073468e-05, + "loss": 1.457, + "step": 47990 + }, + { + "epoch": 7.932245403842181, + "grad_norm": 14.748434066772461, + "learning_rate": 1.1485704842174847e-05, + "loss": 1.3569, + "step": 48000 + }, + { + "epoch": 7.933897954967982, + "grad_norm": 10.245794296264648, + "learning_rate": 1.1476523623276227e-05, + "loss": 1.4359, + "step": 48010 + }, + { + "epoch": 7.935550506093782, + "grad_norm": 14.848341941833496, + "learning_rate": 1.1467342404377606e-05, + "loss": 1.365, + "step": 48020 + }, + { + "epoch": 7.937203057219583, + "grad_norm": 11.871755599975586, + "learning_rate": 1.1458161185478984e-05, + "loss": 1.4103, + "step": 48030 + }, + { + "epoch": 7.938855608345383, + "grad_norm": 13.944067001342773, + "learning_rate": 1.1448979966580364e-05, + "loss": 1.4093, + "step": 48040 + }, + { + "epoch": 7.940508159471183, + "grad_norm": 67.12018585205078, + "learning_rate": 1.1439798747681744e-05, + "loss": 1.3977, + "step": 48050 + }, + { + "epoch": 7.942160710596984, + "grad_norm": 13.307299613952637, + "learning_rate": 1.1430617528783121e-05, + "loss": 1.2964, + "step": 48060 + }, + { + "epoch": 7.943813261722784, + "grad_norm": 7.652238845825195, + "learning_rate": 1.14214363098845e-05, + "loss": 1.221, + "step": 48070 + }, + { + "epoch": 7.945465812848585, + "grad_norm": 11.270803451538086, + "learning_rate": 1.141225509098588e-05, + "loss": 1.3174, + "step": 48080 + }, + { + "epoch": 7.947118363974385, + "grad_norm": 16.249711990356445, + "learning_rate": 1.1403073872087259e-05, + "loss": 1.3507, + "step": 48090 + }, + { + "epoch": 7.9487709151001855, + "grad_norm": 12.031152725219727, + "learning_rate": 1.1393892653188638e-05, + "loss": 1.4643, + "step": 48100 + }, + { + "epoch": 7.9504234662259865, + "grad_norm": 9.80186939239502, + "learning_rate": 1.1384711434290018e-05, + "loss": 1.2806, + "step": 48110 + }, + { + "epoch": 7.952076017351787, + "grad_norm": 8.662875175476074, + "learning_rate": 1.1375530215391395e-05, + "loss": 1.3472, + "step": 48120 + }, + { + "epoch": 7.953728568477588, + "grad_norm": 10.309189796447754, + "learning_rate": 1.1366348996492774e-05, + "loss": 1.3387, + "step": 48130 + }, + { + "epoch": 7.955381119603388, + "grad_norm": 16.453157424926758, + "learning_rate": 1.1357167777594154e-05, + "loss": 1.3604, + "step": 48140 + }, + { + "epoch": 7.957033670729189, + "grad_norm": 16.2752628326416, + "learning_rate": 1.1347986558695533e-05, + "loss": 1.4941, + "step": 48150 + }, + { + "epoch": 7.958686221854989, + "grad_norm": 9.665366172790527, + 
"learning_rate": 1.1338805339796912e-05, + "loss": 1.3735, + "step": 48160 + }, + { + "epoch": 7.960338772980789, + "grad_norm": 12.39187240600586, + "learning_rate": 1.1329624120898291e-05, + "loss": 1.3077, + "step": 48170 + }, + { + "epoch": 7.96199132410659, + "grad_norm": 12.81519603729248, + "learning_rate": 1.132044290199967e-05, + "loss": 1.3662, + "step": 48180 + }, + { + "epoch": 7.96364387523239, + "grad_norm": 23.483640670776367, + "learning_rate": 1.1311261683101048e-05, + "loss": 1.4731, + "step": 48190 + }, + { + "epoch": 7.96529642635819, + "grad_norm": 26.04778480529785, + "learning_rate": 1.1302080464202427e-05, + "loss": 1.5032, + "step": 48200 + }, + { + "epoch": 7.966948977483991, + "grad_norm": 7.779325485229492, + "learning_rate": 1.1292899245303808e-05, + "loss": 1.412, + "step": 48210 + }, + { + "epoch": 7.968601528609791, + "grad_norm": 9.814796447753906, + "learning_rate": 1.1283718026405186e-05, + "loss": 1.4677, + "step": 48220 + }, + { + "epoch": 7.970254079735592, + "grad_norm": 12.185991287231445, + "learning_rate": 1.1274536807506565e-05, + "loss": 1.3061, + "step": 48230 + }, + { + "epoch": 7.971906630861392, + "grad_norm": 15.006294250488281, + "learning_rate": 1.1265355588607944e-05, + "loss": 1.3275, + "step": 48240 + }, + { + "epoch": 7.973559181987193, + "grad_norm": 12.14893627166748, + "learning_rate": 1.1256174369709324e-05, + "loss": 1.3091, + "step": 48250 + }, + { + "epoch": 7.975211733112993, + "grad_norm": 14.654484748840332, + "learning_rate": 1.1246993150810701e-05, + "loss": 1.4328, + "step": 48260 + }, + { + "epoch": 7.976864284238793, + "grad_norm": 13.88418960571289, + "learning_rate": 1.1237811931912082e-05, + "loss": 1.3212, + "step": 48270 + }, + { + "epoch": 7.978516835364594, + "grad_norm": 8.910808563232422, + "learning_rate": 1.122863071301346e-05, + "loss": 1.2907, + "step": 48280 + }, + { + "epoch": 7.980169386490394, + "grad_norm": 9.435023307800293, + "learning_rate": 1.1219449494114839e-05, + "loss": 1.3946, + "step": 48290 + }, + { + "epoch": 7.9818219376161945, + "grad_norm": 23.680673599243164, + "learning_rate": 1.1210268275216218e-05, + "loss": 1.401, + "step": 48300 + }, + { + "epoch": 7.9834744887419955, + "grad_norm": 14.382912635803223, + "learning_rate": 1.1201087056317597e-05, + "loss": 1.367, + "step": 48310 + }, + { + "epoch": 7.985127039867796, + "grad_norm": 18.664398193359375, + "learning_rate": 1.1191905837418977e-05, + "loss": 1.4019, + "step": 48320 + }, + { + "epoch": 7.986779590993597, + "grad_norm": 13.360762596130371, + "learning_rate": 1.1182724618520356e-05, + "loss": 1.5117, + "step": 48330 + }, + { + "epoch": 7.988432142119397, + "grad_norm": 13.729475975036621, + "learning_rate": 1.1173543399621735e-05, + "loss": 1.5005, + "step": 48340 + }, + { + "epoch": 7.990084693245198, + "grad_norm": 9.402355194091797, + "learning_rate": 1.1164362180723113e-05, + "loss": 1.4522, + "step": 48350 + }, + { + "epoch": 7.991737244370998, + "grad_norm": 10.234284400939941, + "learning_rate": 1.1155180961824492e-05, + "loss": 1.4535, + "step": 48360 + }, + { + "epoch": 7.993389795496798, + "grad_norm": 14.257830619812012, + "learning_rate": 1.1145999742925871e-05, + "loss": 1.3578, + "step": 48370 + }, + { + "epoch": 7.995042346622599, + "grad_norm": 16.891815185546875, + "learning_rate": 1.113681852402725e-05, + "loss": 1.46, + "step": 48380 + }, + { + "epoch": 7.996694897748399, + "grad_norm": 11.595968246459961, + "learning_rate": 1.112763730512863e-05, + "loss": 1.3274, + "step": 48390 + }, + { + "epoch": 
7.9983474488742, + "grad_norm": 12.209437370300293, + "learning_rate": 1.1118456086230009e-05, + "loss": 1.3268, + "step": 48400 + }, + { + "epoch": 8.0, + "grad_norm": 10.217020988464355, + "learning_rate": 1.1109274867331388e-05, + "loss": 1.3077, + "step": 48410 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.3359696410838417, + "eval_loss": 2.232658863067627, + "eval_runtime": 833.0289, + "eval_samples_per_second": 33.848, + "eval_steps_per_second": 8.462, + "step": 48410 + }, + { + "epoch": 8.001652551125801, + "grad_norm": 13.642919540405273, + "learning_rate": 1.1100093648432766e-05, + "loss": 1.2788, + "step": 48420 + }, + { + "epoch": 8.0033051022516, + "grad_norm": 11.345486640930176, + "learning_rate": 1.1090912429534145e-05, + "loss": 1.3731, + "step": 48430 + }, + { + "epoch": 8.004957653377401, + "grad_norm": 13.825784683227539, + "learning_rate": 1.1081731210635524e-05, + "loss": 1.398, + "step": 48440 + }, + { + "epoch": 8.006610204503202, + "grad_norm": 22.2542667388916, + "learning_rate": 1.1072549991736903e-05, + "loss": 1.3321, + "step": 48450 + }, + { + "epoch": 8.008262755629003, + "grad_norm": 12.9199800491333, + "learning_rate": 1.1063368772838283e-05, + "loss": 1.3493, + "step": 48460 + }, + { + "epoch": 8.009915306754802, + "grad_norm": 9.398885726928711, + "learning_rate": 1.1054187553939662e-05, + "loss": 1.292, + "step": 48470 + }, + { + "epoch": 8.011567857880603, + "grad_norm": 17.111080169677734, + "learning_rate": 1.104500633504104e-05, + "loss": 1.3765, + "step": 48480 + }, + { + "epoch": 8.013220409006404, + "grad_norm": 268.5393981933594, + "learning_rate": 1.1035825116142419e-05, + "loss": 1.3245, + "step": 48490 + }, + { + "epoch": 8.014872960132204, + "grad_norm": 11.041481971740723, + "learning_rate": 1.10266438972438e-05, + "loss": 1.3954, + "step": 48500 + }, + { + "epoch": 8.016525511258004, + "grad_norm": 13.948131561279297, + "learning_rate": 1.1017462678345177e-05, + "loss": 1.2804, + "step": 48510 + }, + { + "epoch": 8.018178062383805, + "grad_norm": 10.275325775146484, + "learning_rate": 1.1008281459446557e-05, + "loss": 1.4867, + "step": 48520 + }, + { + "epoch": 8.019830613509605, + "grad_norm": 9.993414878845215, + "learning_rate": 1.0999100240547936e-05, + "loss": 1.4039, + "step": 48530 + }, + { + "epoch": 8.021483164635406, + "grad_norm": 10.115413665771484, + "learning_rate": 1.0989919021649315e-05, + "loss": 1.2562, + "step": 48540 + }, + { + "epoch": 8.023135715761207, + "grad_norm": 12.741392135620117, + "learning_rate": 1.0980737802750694e-05, + "loss": 1.1463, + "step": 48550 + }, + { + "epoch": 8.024788266887008, + "grad_norm": 21.065656661987305, + "learning_rate": 1.0971556583852073e-05, + "loss": 1.3402, + "step": 48560 + }, + { + "epoch": 8.026440818012807, + "grad_norm": 13.77077865600586, + "learning_rate": 1.0962375364953453e-05, + "loss": 1.4906, + "step": 48570 + }, + { + "epoch": 8.028093369138608, + "grad_norm": 10.071250915527344, + "learning_rate": 1.095319414605483e-05, + "loss": 1.2456, + "step": 48580 + }, + { + "epoch": 8.029745920264409, + "grad_norm": 41.76918029785156, + "learning_rate": 1.094401292715621e-05, + "loss": 1.3994, + "step": 48590 + }, + { + "epoch": 8.031398471390208, + "grad_norm": 12.07766342163086, + "learning_rate": 1.0934831708257589e-05, + "loss": 1.5174, + "step": 48600 + }, + { + "epoch": 8.033051022516009, + "grad_norm": 12.890829086303711, + "learning_rate": 1.0925650489358968e-05, + "loss": 1.398, + "step": 48610 + }, + { + "epoch": 8.03470357364181, + "grad_norm": 18.99813461303711, + 
"learning_rate": 1.0916469270460347e-05, + "loss": 1.4505, + "step": 48620 + }, + { + "epoch": 8.03635612476761, + "grad_norm": 13.51767635345459, + "learning_rate": 1.0907288051561727e-05, + "loss": 1.2901, + "step": 48630 + }, + { + "epoch": 8.03800867589341, + "grad_norm": 11.078587532043457, + "learning_rate": 1.0898106832663104e-05, + "loss": 1.273, + "step": 48640 + }, + { + "epoch": 8.039661227019211, + "grad_norm": 9.125107765197754, + "learning_rate": 1.0888925613764483e-05, + "loss": 1.2565, + "step": 48650 + }, + { + "epoch": 8.041313778145012, + "grad_norm": 17.25629997253418, + "learning_rate": 1.0879744394865863e-05, + "loss": 1.4344, + "step": 48660 + }, + { + "epoch": 8.042966329270811, + "grad_norm": 10.981740951538086, + "learning_rate": 1.0870563175967242e-05, + "loss": 1.4435, + "step": 48670 + }, + { + "epoch": 8.044618880396612, + "grad_norm": 24.005199432373047, + "learning_rate": 1.0861381957068621e-05, + "loss": 1.3133, + "step": 48680 + }, + { + "epoch": 8.046271431522413, + "grad_norm": 10.016214370727539, + "learning_rate": 1.085220073817e-05, + "loss": 1.3351, + "step": 48690 + }, + { + "epoch": 8.047923982648213, + "grad_norm": 19.678804397583008, + "learning_rate": 1.084301951927138e-05, + "loss": 1.5142, + "step": 48700 + }, + { + "epoch": 8.049576533774013, + "grad_norm": 12.911602020263672, + "learning_rate": 1.0833838300372757e-05, + "loss": 1.3535, + "step": 48710 + }, + { + "epoch": 8.051229084899814, + "grad_norm": 28.89436149597168, + "learning_rate": 1.0824657081474138e-05, + "loss": 1.3765, + "step": 48720 + }, + { + "epoch": 8.052881636025614, + "grad_norm": 9.610832214355469, + "learning_rate": 1.0815475862575517e-05, + "loss": 1.3013, + "step": 48730 + }, + { + "epoch": 8.054534187151415, + "grad_norm": 10.988263130187988, + "learning_rate": 1.0806294643676895e-05, + "loss": 1.4339, + "step": 48740 + }, + { + "epoch": 8.056186738277216, + "grad_norm": 8.778138160705566, + "learning_rate": 1.0797113424778274e-05, + "loss": 1.4137, + "step": 48750 + }, + { + "epoch": 8.057839289403017, + "grad_norm": 14.911858558654785, + "learning_rate": 1.0787932205879653e-05, + "loss": 1.4565, + "step": 48760 + }, + { + "epoch": 8.059491840528816, + "grad_norm": 11.017518043518066, + "learning_rate": 1.0778750986981031e-05, + "loss": 1.2947, + "step": 48770 + }, + { + "epoch": 8.061144391654617, + "grad_norm": 10.435925483703613, + "learning_rate": 1.0769569768082412e-05, + "loss": 1.2544, + "step": 48780 + }, + { + "epoch": 8.062796942780418, + "grad_norm": 15.238306999206543, + "learning_rate": 1.0760388549183791e-05, + "loss": 1.4135, + "step": 48790 + }, + { + "epoch": 8.064449493906217, + "grad_norm": 19.062891006469727, + "learning_rate": 1.0751207330285169e-05, + "loss": 1.375, + "step": 48800 + }, + { + "epoch": 8.066102045032018, + "grad_norm": 32.4384651184082, + "learning_rate": 1.0742026111386548e-05, + "loss": 1.5354, + "step": 48810 + }, + { + "epoch": 8.067754596157819, + "grad_norm": 12.400315284729004, + "learning_rate": 1.0732844892487927e-05, + "loss": 1.3206, + "step": 48820 + }, + { + "epoch": 8.06940714728362, + "grad_norm": 21.19147491455078, + "learning_rate": 1.0723663673589306e-05, + "loss": 1.4587, + "step": 48830 + }, + { + "epoch": 8.07105969840942, + "grad_norm": 14.054428100585938, + "learning_rate": 1.0714482454690686e-05, + "loss": 1.3973, + "step": 48840 + }, + { + "epoch": 8.07271224953522, + "grad_norm": 9.821606636047363, + "learning_rate": 1.0705301235792065e-05, + "loss": 1.3624, + "step": 48850 + }, + { + "epoch": 
8.074364800661021, + "grad_norm": 17.55989646911621, + "learning_rate": 1.0696120016893444e-05, + "loss": 1.4144, + "step": 48860 + }, + { + "epoch": 8.07601735178682, + "grad_norm": 14.98707103729248, + "learning_rate": 1.0686938797994822e-05, + "loss": 1.3011, + "step": 48870 + }, + { + "epoch": 8.077669902912621, + "grad_norm": 20.7563419342041, + "learning_rate": 1.0677757579096201e-05, + "loss": 1.282, + "step": 48880 + }, + { + "epoch": 8.079322454038422, + "grad_norm": 12.075446128845215, + "learning_rate": 1.066857636019758e-05, + "loss": 1.3504, + "step": 48890 + }, + { + "epoch": 8.080975005164222, + "grad_norm": 41.509918212890625, + "learning_rate": 1.065939514129896e-05, + "loss": 1.2301, + "step": 48900 + }, + { + "epoch": 8.082627556290022, + "grad_norm": 14.281842231750488, + "learning_rate": 1.0650213922400339e-05, + "loss": 1.3586, + "step": 48910 + }, + { + "epoch": 8.084280107415823, + "grad_norm": 11.7456693649292, + "learning_rate": 1.0641032703501718e-05, + "loss": 1.3069, + "step": 48920 + }, + { + "epoch": 8.085932658541624, + "grad_norm": 19.099252700805664, + "learning_rate": 1.0631851484603095e-05, + "loss": 1.4793, + "step": 48930 + }, + { + "epoch": 8.087585209667424, + "grad_norm": 24.131771087646484, + "learning_rate": 1.0622670265704475e-05, + "loss": 1.2569, + "step": 48940 + }, + { + "epoch": 8.089237760793225, + "grad_norm": 11.209100723266602, + "learning_rate": 1.0613489046805856e-05, + "loss": 1.2394, + "step": 48950 + }, + { + "epoch": 8.090890311919026, + "grad_norm": 13.452157020568848, + "learning_rate": 1.0604307827907233e-05, + "loss": 1.2686, + "step": 48960 + }, + { + "epoch": 8.092542863044825, + "grad_norm": 15.363450050354004, + "learning_rate": 1.0595126609008612e-05, + "loss": 1.4572, + "step": 48970 + }, + { + "epoch": 8.094195414170626, + "grad_norm": 10.773612022399902, + "learning_rate": 1.0585945390109992e-05, + "loss": 1.3411, + "step": 48980 + }, + { + "epoch": 8.095847965296427, + "grad_norm": 142.82435607910156, + "learning_rate": 1.0576764171211371e-05, + "loss": 1.3155, + "step": 48990 + }, + { + "epoch": 8.097500516422226, + "grad_norm": 67.02763366699219, + "learning_rate": 1.0567582952312749e-05, + "loss": 1.39, + "step": 49000 + }, + { + "epoch": 8.099153067548027, + "grad_norm": 38.37979507446289, + "learning_rate": 1.055840173341413e-05, + "loss": 1.3099, + "step": 49010 + }, + { + "epoch": 8.100805618673828, + "grad_norm": 12.719371795654297, + "learning_rate": 1.0549220514515509e-05, + "loss": 1.3926, + "step": 49020 + }, + { + "epoch": 8.102458169799629, + "grad_norm": 18.666179656982422, + "learning_rate": 1.0540039295616886e-05, + "loss": 1.3513, + "step": 49030 + }, + { + "epoch": 8.104110720925428, + "grad_norm": 11.054634094238281, + "learning_rate": 1.0530858076718266e-05, + "loss": 1.3234, + "step": 49040 + }, + { + "epoch": 8.10576327205123, + "grad_norm": 14.501553535461426, + "learning_rate": 1.0521676857819645e-05, + "loss": 1.352, + "step": 49050 + }, + { + "epoch": 8.10741582317703, + "grad_norm": 12.57413101196289, + "learning_rate": 1.0512495638921022e-05, + "loss": 1.2829, + "step": 49060 + }, + { + "epoch": 8.10906837430283, + "grad_norm": 19.41373062133789, + "learning_rate": 1.0503314420022403e-05, + "loss": 1.2808, + "step": 49070 + }, + { + "epoch": 8.11072092542863, + "grad_norm": 12.489398956298828, + "learning_rate": 1.0494133201123783e-05, + "loss": 1.3782, + "step": 49080 + }, + { + "epoch": 8.112373476554431, + "grad_norm": 17.002321243286133, + "learning_rate": 1.048495198222516e-05, + 
"loss": 1.4326, + "step": 49090 + }, + { + "epoch": 8.11402602768023, + "grad_norm": 10.533157348632812, + "learning_rate": 1.047577076332654e-05, + "loss": 1.2274, + "step": 49100 + }, + { + "epoch": 8.115678578806031, + "grad_norm": 14.494053840637207, + "learning_rate": 1.0466589544427919e-05, + "loss": 1.3649, + "step": 49110 + }, + { + "epoch": 8.117331129931832, + "grad_norm": 8.390114784240723, + "learning_rate": 1.0457408325529298e-05, + "loss": 1.3052, + "step": 49120 + }, + { + "epoch": 8.118983681057633, + "grad_norm": 23.97467803955078, + "learning_rate": 1.0448227106630677e-05, + "loss": 1.3362, + "step": 49130 + }, + { + "epoch": 8.120636232183433, + "grad_norm": 18.547996520996094, + "learning_rate": 1.0439045887732056e-05, + "loss": 1.3471, + "step": 49140 + }, + { + "epoch": 8.122288783309234, + "grad_norm": 17.564769744873047, + "learning_rate": 1.0429864668833436e-05, + "loss": 1.3934, + "step": 49150 + }, + { + "epoch": 8.123941334435035, + "grad_norm": 25.809839248657227, + "learning_rate": 1.0420683449934813e-05, + "loss": 1.5178, + "step": 49160 + }, + { + "epoch": 8.125593885560834, + "grad_norm": 24.299781799316406, + "learning_rate": 1.0411502231036192e-05, + "loss": 1.352, + "step": 49170 + }, + { + "epoch": 8.127246436686635, + "grad_norm": 13.150251388549805, + "learning_rate": 1.0402321012137573e-05, + "loss": 1.4525, + "step": 49180 + }, + { + "epoch": 8.128898987812436, + "grad_norm": 14.097673416137695, + "learning_rate": 1.0393139793238951e-05, + "loss": 1.4137, + "step": 49190 + }, + { + "epoch": 8.130551538938235, + "grad_norm": 13.112128257751465, + "learning_rate": 1.038395857434033e-05, + "loss": 1.2533, + "step": 49200 + }, + { + "epoch": 8.132204090064036, + "grad_norm": 12.603325843811035, + "learning_rate": 1.037477735544171e-05, + "loss": 1.4444, + "step": 49210 + }, + { + "epoch": 8.133856641189837, + "grad_norm": 41.19782638549805, + "learning_rate": 1.0365596136543087e-05, + "loss": 1.45, + "step": 49220 + }, + { + "epoch": 8.135509192315638, + "grad_norm": 15.660469055175781, + "learning_rate": 1.0356414917644466e-05, + "loss": 1.436, + "step": 49230 + }, + { + "epoch": 8.137161743441437, + "grad_norm": 12.267075538635254, + "learning_rate": 1.0347233698745847e-05, + "loss": 1.4064, + "step": 49240 + }, + { + "epoch": 8.138814294567238, + "grad_norm": 12.525456428527832, + "learning_rate": 1.0338052479847225e-05, + "loss": 1.3886, + "step": 49250 + }, + { + "epoch": 8.14046684569304, + "grad_norm": 12.728673934936523, + "learning_rate": 1.0328871260948604e-05, + "loss": 1.3165, + "step": 49260 + }, + { + "epoch": 8.142119396818838, + "grad_norm": 14.614742279052734, + "learning_rate": 1.0319690042049983e-05, + "loss": 1.2583, + "step": 49270 + }, + { + "epoch": 8.14377194794464, + "grad_norm": 14.37644100189209, + "learning_rate": 1.0310508823151362e-05, + "loss": 1.2297, + "step": 49280 + }, + { + "epoch": 8.14542449907044, + "grad_norm": 10.755502700805664, + "learning_rate": 1.030132760425274e-05, + "loss": 1.3726, + "step": 49290 + }, + { + "epoch": 8.147077050196241, + "grad_norm": 156.68991088867188, + "learning_rate": 1.0292146385354121e-05, + "loss": 1.4638, + "step": 49300 + }, + { + "epoch": 8.14872960132204, + "grad_norm": 16.2905330657959, + "learning_rate": 1.02829651664555e-05, + "loss": 1.3056, + "step": 49310 + }, + { + "epoch": 8.150382152447841, + "grad_norm": 12.603002548217773, + "learning_rate": 1.0273783947556878e-05, + "loss": 1.4877, + "step": 49320 + }, + { + "epoch": 8.152034703573642, + "grad_norm": 
14.49899673461914, + "learning_rate": 1.0264602728658257e-05, + "loss": 1.3918, + "step": 49330 + }, + { + "epoch": 8.153687254699442, + "grad_norm": 12.177115440368652, + "learning_rate": 1.0255421509759636e-05, + "loss": 1.2091, + "step": 49340 + }, + { + "epoch": 8.155339805825243, + "grad_norm": 9.79865550994873, + "learning_rate": 1.0246240290861015e-05, + "loss": 1.2554, + "step": 49350 + }, + { + "epoch": 8.156992356951044, + "grad_norm": 8.341238975524902, + "learning_rate": 1.0237059071962395e-05, + "loss": 1.2596, + "step": 49360 + }, + { + "epoch": 8.158644908076843, + "grad_norm": 15.849066734313965, + "learning_rate": 1.0227877853063774e-05, + "loss": 1.2234, + "step": 49370 + }, + { + "epoch": 8.160297459202644, + "grad_norm": 8.827754974365234, + "learning_rate": 1.0218696634165151e-05, + "loss": 1.3488, + "step": 49380 + }, + { + "epoch": 8.161950010328445, + "grad_norm": 10.364538192749023, + "learning_rate": 1.020951541526653e-05, + "loss": 1.356, + "step": 49390 + }, + { + "epoch": 8.163602561454246, + "grad_norm": 33.56261444091797, + "learning_rate": 1.020033419636791e-05, + "loss": 1.3118, + "step": 49400 + }, + { + "epoch": 8.165255112580045, + "grad_norm": 21.711645126342773, + "learning_rate": 1.019115297746929e-05, + "loss": 1.2635, + "step": 49410 + }, + { + "epoch": 8.166907663705846, + "grad_norm": 11.296979904174805, + "learning_rate": 1.0181971758570668e-05, + "loss": 1.3455, + "step": 49420 + }, + { + "epoch": 8.168560214831647, + "grad_norm": 22.215770721435547, + "learning_rate": 1.0172790539672048e-05, + "loss": 1.29, + "step": 49430 + }, + { + "epoch": 8.170212765957446, + "grad_norm": 16.198862075805664, + "learning_rate": 1.0163609320773427e-05, + "loss": 1.4448, + "step": 49440 + }, + { + "epoch": 8.171865317083247, + "grad_norm": 15.688820838928223, + "learning_rate": 1.0154428101874805e-05, + "loss": 1.4095, + "step": 49450 + }, + { + "epoch": 8.173517868209048, + "grad_norm": 18.601253509521484, + "learning_rate": 1.0145246882976184e-05, + "loss": 1.3167, + "step": 49460 + }, + { + "epoch": 8.175170419334847, + "grad_norm": 14.552800178527832, + "learning_rate": 1.0136065664077565e-05, + "loss": 1.3402, + "step": 49470 + }, + { + "epoch": 8.176822970460648, + "grad_norm": 25.716659545898438, + "learning_rate": 1.0126884445178942e-05, + "loss": 1.2686, + "step": 49480 + }, + { + "epoch": 8.17847552158645, + "grad_norm": 14.601028442382812, + "learning_rate": 1.0117703226280322e-05, + "loss": 1.3386, + "step": 49490 + }, + { + "epoch": 8.18012807271225, + "grad_norm": 19.567943572998047, + "learning_rate": 1.01085220073817e-05, + "loss": 1.3595, + "step": 49500 + }, + { + "epoch": 8.18178062383805, + "grad_norm": 16.065189361572266, + "learning_rate": 1.0099340788483078e-05, + "loss": 1.4657, + "step": 49510 + }, + { + "epoch": 8.18343317496385, + "grad_norm": 21.156858444213867, + "learning_rate": 1.009015956958446e-05, + "loss": 1.3717, + "step": 49520 + }, + { + "epoch": 8.185085726089651, + "grad_norm": 9.819899559020996, + "learning_rate": 1.0080978350685838e-05, + "loss": 1.3775, + "step": 49530 + }, + { + "epoch": 8.18673827721545, + "grad_norm": 20.710819244384766, + "learning_rate": 1.0071797131787216e-05, + "loss": 1.4204, + "step": 49540 + }, + { + "epoch": 8.188390828341252, + "grad_norm": 8.26885986328125, + "learning_rate": 1.0062615912888595e-05, + "loss": 1.3977, + "step": 49550 + }, + { + "epoch": 8.190043379467053, + "grad_norm": 23.709930419921875, + "learning_rate": 1.0053434693989975e-05, + "loss": 1.3835, + "step": 49560 + }, + 
{ + "epoch": 8.191695930592854, + "grad_norm": 13.372426986694336, + "learning_rate": 1.0044253475091354e-05, + "loss": 1.3282, + "step": 49570 + }, + { + "epoch": 8.193348481718653, + "grad_norm": 14.474409103393555, + "learning_rate": 1.0035072256192733e-05, + "loss": 1.4683, + "step": 49580 + }, + { + "epoch": 8.195001032844454, + "grad_norm": 19.034423828125, + "learning_rate": 1.0025891037294112e-05, + "loss": 1.5369, + "step": 49590 + }, + { + "epoch": 8.196653583970255, + "grad_norm": 13.448150634765625, + "learning_rate": 1.0016709818395492e-05, + "loss": 1.3128, + "step": 49600 + }, + { + "epoch": 8.198306135096054, + "grad_norm": 16.181913375854492, + "learning_rate": 1.0007528599496869e-05, + "loss": 1.2877, + "step": 49610 + }, + { + "epoch": 8.199958686221855, + "grad_norm": 10.915853500366211, + "learning_rate": 9.998347380598248e-06, + "loss": 1.237, + "step": 49620 + }, + { + "epoch": 8.201611237347656, + "grad_norm": 11.362007141113281, + "learning_rate": 9.989166161699628e-06, + "loss": 1.5008, + "step": 49630 + }, + { + "epoch": 8.203263788473455, + "grad_norm": 11.269021034240723, + "learning_rate": 9.979984942801007e-06, + "loss": 1.2711, + "step": 49640 + }, + { + "epoch": 8.204916339599256, + "grad_norm": 18.16535758972168, + "learning_rate": 9.970803723902386e-06, + "loss": 1.3525, + "step": 49650 + }, + { + "epoch": 8.206568890725057, + "grad_norm": 16.32088279724121, + "learning_rate": 9.961622505003765e-06, + "loss": 1.2825, + "step": 49660 + }, + { + "epoch": 8.208221441850858, + "grad_norm": 10.657417297363281, + "learning_rate": 9.952441286105143e-06, + "loss": 1.4107, + "step": 49670 + }, + { + "epoch": 8.209873992976657, + "grad_norm": 8.144217491149902, + "learning_rate": 9.943260067206522e-06, + "loss": 1.3751, + "step": 49680 + }, + { + "epoch": 8.211526544102458, + "grad_norm": 11.399767875671387, + "learning_rate": 9.934078848307901e-06, + "loss": 1.3849, + "step": 49690 + }, + { + "epoch": 8.21317909522826, + "grad_norm": 12.942078590393066, + "learning_rate": 9.92489762940928e-06, + "loss": 1.3903, + "step": 49700 + }, + { + "epoch": 8.214831646354058, + "grad_norm": 14.669422149658203, + "learning_rate": 9.91571641051066e-06, + "loss": 1.2849, + "step": 49710 + }, + { + "epoch": 8.21648419747986, + "grad_norm": 14.220985412597656, + "learning_rate": 9.906535191612039e-06, + "loss": 1.2295, + "step": 49720 + }, + { + "epoch": 8.21813674860566, + "grad_norm": 15.08796215057373, + "learning_rate": 9.897353972713418e-06, + "loss": 1.2791, + "step": 49730 + }, + { + "epoch": 8.21978929973146, + "grad_norm": 19.543704986572266, + "learning_rate": 9.888172753814796e-06, + "loss": 1.2695, + "step": 49740 + }, + { + "epoch": 8.22144185085726, + "grad_norm": 12.316389083862305, + "learning_rate": 9.878991534916177e-06, + "loss": 1.4851, + "step": 49750 + }, + { + "epoch": 8.223094401983062, + "grad_norm": 15.752791404724121, + "learning_rate": 9.869810316017556e-06, + "loss": 1.4293, + "step": 49760 + }, + { + "epoch": 8.224746953108863, + "grad_norm": 12.644379615783691, + "learning_rate": 9.860629097118934e-06, + "loss": 1.2978, + "step": 49770 + }, + { + "epoch": 8.226399504234662, + "grad_norm": 22.362035751342773, + "learning_rate": 9.851447878220313e-06, + "loss": 1.3376, + "step": 49780 + }, + { + "epoch": 8.228052055360463, + "grad_norm": 12.713665008544922, + "learning_rate": 9.842266659321692e-06, + "loss": 1.4388, + "step": 49790 + }, + { + "epoch": 8.229704606486264, + "grad_norm": 12.972372055053711, + "learning_rate": 9.83308544042307e-06, + 
"loss": 1.5245, + "step": 49800 + }, + { + "epoch": 8.231357157612063, + "grad_norm": 18.19927215576172, + "learning_rate": 9.82390422152445e-06, + "loss": 1.3927, + "step": 49810 + }, + { + "epoch": 8.233009708737864, + "grad_norm": 36.67835998535156, + "learning_rate": 9.81472300262583e-06, + "loss": 1.3945, + "step": 49820 + }, + { + "epoch": 8.234662259863665, + "grad_norm": 13.975602149963379, + "learning_rate": 9.805541783727207e-06, + "loss": 1.5244, + "step": 49830 + }, + { + "epoch": 8.236314810989464, + "grad_norm": 11.509796142578125, + "learning_rate": 9.796360564828587e-06, + "loss": 1.3487, + "step": 49840 + }, + { + "epoch": 8.237967362115265, + "grad_norm": 13.768054008483887, + "learning_rate": 9.787179345929966e-06, + "loss": 1.3423, + "step": 49850 + }, + { + "epoch": 8.239619913241066, + "grad_norm": 24.76049041748047, + "learning_rate": 9.777998127031345e-06, + "loss": 1.3772, + "step": 49860 + }, + { + "epoch": 8.241272464366867, + "grad_norm": 19.85248374938965, + "learning_rate": 9.768816908132724e-06, + "loss": 1.364, + "step": 49870 + }, + { + "epoch": 8.242925015492666, + "grad_norm": 10.143357276916504, + "learning_rate": 9.759635689234104e-06, + "loss": 1.3958, + "step": 49880 + }, + { + "epoch": 8.244577566618467, + "grad_norm": 11.001799583435059, + "learning_rate": 9.750454470335483e-06, + "loss": 1.3621, + "step": 49890 + }, + { + "epoch": 8.246230117744268, + "grad_norm": 9.355253219604492, + "learning_rate": 9.74127325143686e-06, + "loss": 1.2735, + "step": 49900 + }, + { + "epoch": 8.247882668870067, + "grad_norm": 31.860811233520508, + "learning_rate": 9.73209203253824e-06, + "loss": 1.3465, + "step": 49910 + }, + { + "epoch": 8.249535219995868, + "grad_norm": 9.52219295501709, + "learning_rate": 9.72291081363962e-06, + "loss": 1.2591, + "step": 49920 + }, + { + "epoch": 8.25118777112167, + "grad_norm": 16.92643165588379, + "learning_rate": 9.713729594740998e-06, + "loss": 1.5207, + "step": 49930 + }, + { + "epoch": 8.252840322247469, + "grad_norm": 24.846473693847656, + "learning_rate": 9.704548375842377e-06, + "loss": 1.1995, + "step": 49940 + }, + { + "epoch": 8.25449287337327, + "grad_norm": 18.698997497558594, + "learning_rate": 9.695367156943757e-06, + "loss": 1.409, + "step": 49950 + }, + { + "epoch": 8.25614542449907, + "grad_norm": 9.128297805786133, + "learning_rate": 9.686185938045134e-06, + "loss": 1.4773, + "step": 49960 + }, + { + "epoch": 8.257797975624872, + "grad_norm": 8.733552932739258, + "learning_rate": 9.677004719146514e-06, + "loss": 1.3389, + "step": 49970 + }, + { + "epoch": 8.25945052675067, + "grad_norm": 14.439172744750977, + "learning_rate": 9.667823500247894e-06, + "loss": 1.4054, + "step": 49980 + }, + { + "epoch": 8.261103077876472, + "grad_norm": 25.46586036682129, + "learning_rate": 9.658642281349272e-06, + "loss": 1.385, + "step": 49990 + }, + { + "epoch": 8.262755629002273, + "grad_norm": 15.000371932983398, + "learning_rate": 9.649461062450651e-06, + "loss": 1.3654, + "step": 50000 + }, + { + "epoch": 8.264408180128072, + "grad_norm": 15.230158805847168, + "learning_rate": 9.64027984355203e-06, + "loss": 1.2959, + "step": 50010 + }, + { + "epoch": 8.266060731253873, + "grad_norm": 20.123403549194336, + "learning_rate": 9.63109862465341e-06, + "loss": 1.4118, + "step": 50020 + }, + { + "epoch": 8.267713282379674, + "grad_norm": 9.338974952697754, + "learning_rate": 9.621917405754787e-06, + "loss": 1.2685, + "step": 50030 + }, + { + "epoch": 8.269365833505475, + "grad_norm": 18.35788917541504, + "learning_rate": 
9.612736186856168e-06, + "loss": 1.3901, + "step": 50040 + }, + { + "epoch": 8.271018384631274, + "grad_norm": 22.194643020629883, + "learning_rate": 9.603554967957548e-06, + "loss": 1.278, + "step": 50050 + }, + { + "epoch": 8.272670935757075, + "grad_norm": 10.933758735656738, + "learning_rate": 9.594373749058925e-06, + "loss": 1.4638, + "step": 50060 + }, + { + "epoch": 8.274323486882876, + "grad_norm": 13.151747703552246, + "learning_rate": 9.585192530160304e-06, + "loss": 1.4557, + "step": 50070 + }, + { + "epoch": 8.275976038008675, + "grad_norm": 16.250370025634766, + "learning_rate": 9.576011311261684e-06, + "loss": 1.3638, + "step": 50080 + }, + { + "epoch": 8.277628589134476, + "grad_norm": 14.143457412719727, + "learning_rate": 9.566830092363063e-06, + "loss": 1.2704, + "step": 50090 + }, + { + "epoch": 8.279281140260277, + "grad_norm": 15.831094741821289, + "learning_rate": 9.557648873464442e-06, + "loss": 1.4091, + "step": 50100 + }, + { + "epoch": 8.280933691386076, + "grad_norm": 11.630001068115234, + "learning_rate": 9.548467654565821e-06, + "loss": 1.3446, + "step": 50110 + }, + { + "epoch": 8.282586242511877, + "grad_norm": 9.2128324508667, + "learning_rate": 9.539286435667199e-06, + "loss": 1.2162, + "step": 50120 + }, + { + "epoch": 8.284238793637678, + "grad_norm": 16.4133243560791, + "learning_rate": 9.530105216768578e-06, + "loss": 1.3735, + "step": 50130 + }, + { + "epoch": 8.28589134476348, + "grad_norm": 8.486006736755371, + "learning_rate": 9.520923997869957e-06, + "loss": 1.2581, + "step": 50140 + }, + { + "epoch": 8.287543895889279, + "grad_norm": 25.598995208740234, + "learning_rate": 9.511742778971337e-06, + "loss": 1.2991, + "step": 50150 + }, + { + "epoch": 8.28919644701508, + "grad_norm": 13.834522247314453, + "learning_rate": 9.502561560072716e-06, + "loss": 1.4145, + "step": 50160 + }, + { + "epoch": 8.29084899814088, + "grad_norm": 13.25239086151123, + "learning_rate": 9.493380341174095e-06, + "loss": 1.4416, + "step": 50170 + }, + { + "epoch": 8.29250154926668, + "grad_norm": 15.901601791381836, + "learning_rate": 9.484199122275474e-06, + "loss": 1.4722, + "step": 50180 + }, + { + "epoch": 8.29415410039248, + "grad_norm": 12.437292098999023, + "learning_rate": 9.475017903376852e-06, + "loss": 1.3519, + "step": 50190 + }, + { + "epoch": 8.295806651518282, + "grad_norm": 13.174797058105469, + "learning_rate": 9.465836684478231e-06, + "loss": 1.3125, + "step": 50200 + }, + { + "epoch": 8.297459202644081, + "grad_norm": 18.76152229309082, + "learning_rate": 9.456655465579612e-06, + "loss": 1.3622, + "step": 50210 + }, + { + "epoch": 8.299111753769882, + "grad_norm": 13.118831634521484, + "learning_rate": 9.44747424668099e-06, + "loss": 1.3004, + "step": 50220 + }, + { + "epoch": 8.300764304895683, + "grad_norm": 18.371763229370117, + "learning_rate": 9.438293027782369e-06, + "loss": 1.3731, + "step": 50230 + }, + { + "epoch": 8.302416856021484, + "grad_norm": 12.924381256103516, + "learning_rate": 9.429111808883748e-06, + "loss": 1.3149, + "step": 50240 + }, + { + "epoch": 8.304069407147283, + "grad_norm": 10.467041015625, + "learning_rate": 9.419930589985126e-06, + "loss": 1.3262, + "step": 50250 + }, + { + "epoch": 8.305721958273084, + "grad_norm": 18.903898239135742, + "learning_rate": 9.410749371086505e-06, + "loss": 1.2989, + "step": 50260 + }, + { + "epoch": 8.307374509398885, + "grad_norm": 13.446491241455078, + "learning_rate": 9.401568152187886e-06, + "loss": 1.4296, + "step": 50270 + }, + { + "epoch": 8.309027060524684, + "grad_norm": 
14.688339233398438, + "learning_rate": 9.392386933289263e-06, + "loss": 1.2989, + "step": 50280 + }, + { + "epoch": 8.310679611650485, + "grad_norm": 17.203166961669922, + "learning_rate": 9.383205714390643e-06, + "loss": 1.3036, + "step": 50290 + }, + { + "epoch": 8.312332162776286, + "grad_norm": 83.26315307617188, + "learning_rate": 9.374024495492022e-06, + "loss": 1.3526, + "step": 50300 + }, + { + "epoch": 8.313984713902085, + "grad_norm": 23.8687686920166, + "learning_rate": 9.364843276593401e-06, + "loss": 1.2074, + "step": 50310 + }, + { + "epoch": 8.315637265027886, + "grad_norm": 13.990741729736328, + "learning_rate": 9.35566205769478e-06, + "loss": 1.3218, + "step": 50320 + }, + { + "epoch": 8.317289816153687, + "grad_norm": 34.39853286743164, + "learning_rate": 9.34648083879616e-06, + "loss": 1.338, + "step": 50330 + }, + { + "epoch": 8.318942367279488, + "grad_norm": 22.453739166259766, + "learning_rate": 9.337299619897539e-06, + "loss": 1.4889, + "step": 50340 + }, + { + "epoch": 8.320594918405288, + "grad_norm": 10.095982551574707, + "learning_rate": 9.328118400998916e-06, + "loss": 1.336, + "step": 50350 + }, + { + "epoch": 8.322247469531089, + "grad_norm": 74.14069366455078, + "learning_rate": 9.318937182100296e-06, + "loss": 1.4469, + "step": 50360 + }, + { + "epoch": 8.32390002065689, + "grad_norm": 23.241098403930664, + "learning_rate": 9.309755963201675e-06, + "loss": 1.4088, + "step": 50370 + }, + { + "epoch": 8.325552571782689, + "grad_norm": 9.130888938903809, + "learning_rate": 9.300574744303054e-06, + "loss": 1.3539, + "step": 50380 + }, + { + "epoch": 8.32720512290849, + "grad_norm": 28.220802307128906, + "learning_rate": 9.291393525404433e-06, + "loss": 1.4253, + "step": 50390 + }, + { + "epoch": 8.32885767403429, + "grad_norm": 14.703250885009766, + "learning_rate": 9.282212306505813e-06, + "loss": 1.2751, + "step": 50400 + }, + { + "epoch": 8.33051022516009, + "grad_norm": 17.532804489135742, + "learning_rate": 9.27303108760719e-06, + "loss": 1.3413, + "step": 50410 + }, + { + "epoch": 8.332162776285891, + "grad_norm": 11.652085304260254, + "learning_rate": 9.26384986870857e-06, + "loss": 1.2881, + "step": 50420 + }, + { + "epoch": 8.333815327411692, + "grad_norm": 16.633052825927734, + "learning_rate": 9.254668649809949e-06, + "loss": 1.3163, + "step": 50430 + }, + { + "epoch": 8.335467878537493, + "grad_norm": 17.8756046295166, + "learning_rate": 9.245487430911328e-06, + "loss": 1.2875, + "step": 50440 + }, + { + "epoch": 8.337120429663292, + "grad_norm": 14.60150146484375, + "learning_rate": 9.236306212012707e-06, + "loss": 1.3227, + "step": 50450 + }, + { + "epoch": 8.338772980789093, + "grad_norm": 10.519417762756348, + "learning_rate": 9.227124993114087e-06, + "loss": 1.1588, + "step": 50460 + }, + { + "epoch": 8.340425531914894, + "grad_norm": 12.479300498962402, + "learning_rate": 9.217943774215466e-06, + "loss": 1.2749, + "step": 50470 + }, + { + "epoch": 8.342078083040693, + "grad_norm": 15.454779624938965, + "learning_rate": 9.208762555316843e-06, + "loss": 1.4351, + "step": 50480 + }, + { + "epoch": 8.343730634166494, + "grad_norm": 22.32270050048828, + "learning_rate": 9.199581336418224e-06, + "loss": 1.242, + "step": 50490 + }, + { + "epoch": 8.345383185292295, + "grad_norm": 12.450278282165527, + "learning_rate": 9.190400117519603e-06, + "loss": 1.3128, + "step": 50500 + }, + { + "epoch": 8.347035736418096, + "grad_norm": 12.18801498413086, + "learning_rate": 9.181218898620981e-06, + "loss": 1.3458, + "step": 50510 + }, + { + "epoch": 
8.348688287543895, + "grad_norm": 11.80329704284668, + "learning_rate": 9.17203767972236e-06, + "loss": 1.3676, + "step": 50520 + }, + { + "epoch": 8.350340838669696, + "grad_norm": 16.792072296142578, + "learning_rate": 9.16285646082374e-06, + "loss": 1.3725, + "step": 50530 + }, + { + "epoch": 8.351993389795497, + "grad_norm": 20.232120513916016, + "learning_rate": 9.153675241925117e-06, + "loss": 1.2889, + "step": 50540 + }, + { + "epoch": 8.353645940921297, + "grad_norm": 10.46639347076416, + "learning_rate": 9.144494023026498e-06, + "loss": 1.3758, + "step": 50550 + }, + { + "epoch": 8.355298492047098, + "grad_norm": 11.493830680847168, + "learning_rate": 9.135312804127877e-06, + "loss": 1.3593, + "step": 50560 + }, + { + "epoch": 8.356951043172899, + "grad_norm": 9.20073413848877, + "learning_rate": 9.126131585229255e-06, + "loss": 1.3475, + "step": 50570 + }, + { + "epoch": 8.358603594298698, + "grad_norm": 19.450620651245117, + "learning_rate": 9.116950366330634e-06, + "loss": 1.4109, + "step": 50580 + }, + { + "epoch": 8.360256145424499, + "grad_norm": 13.948092460632324, + "learning_rate": 9.107769147432013e-06, + "loss": 1.3133, + "step": 50590 + }, + { + "epoch": 8.3619086965503, + "grad_norm": 26.458175659179688, + "learning_rate": 9.098587928533393e-06, + "loss": 1.5351, + "step": 50600 + }, + { + "epoch": 8.3635612476761, + "grad_norm": 11.807082176208496, + "learning_rate": 9.089406709634772e-06, + "loss": 1.3635, + "step": 50610 + }, + { + "epoch": 8.3652137988019, + "grad_norm": 13.108859062194824, + "learning_rate": 9.080225490736151e-06, + "loss": 1.411, + "step": 50620 + }, + { + "epoch": 8.366866349927701, + "grad_norm": 14.530641555786133, + "learning_rate": 9.07104427183753e-06, + "loss": 1.4175, + "step": 50630 + }, + { + "epoch": 8.368518901053502, + "grad_norm": 21.881153106689453, + "learning_rate": 9.061863052938908e-06, + "loss": 1.4125, + "step": 50640 + }, + { + "epoch": 8.370171452179301, + "grad_norm": 15.157549858093262, + "learning_rate": 9.052681834040287e-06, + "loss": 1.2924, + "step": 50650 + }, + { + "epoch": 8.371824003305102, + "grad_norm": 15.397344589233398, + "learning_rate": 9.043500615141666e-06, + "loss": 1.2743, + "step": 50660 + }, + { + "epoch": 8.373476554430903, + "grad_norm": 13.956780433654785, + "learning_rate": 9.034319396243046e-06, + "loss": 1.2906, + "step": 50670 + }, + { + "epoch": 8.375129105556702, + "grad_norm": 15.764337539672852, + "learning_rate": 9.025138177344425e-06, + "loss": 1.362, + "step": 50680 + }, + { + "epoch": 8.376781656682503, + "grad_norm": 18.606061935424805, + "learning_rate": 9.015956958445804e-06, + "loss": 1.3092, + "step": 50690 + }, + { + "epoch": 8.378434207808304, + "grad_norm": 17.503150939941406, + "learning_rate": 9.006775739547182e-06, + "loss": 1.3202, + "step": 50700 + }, + { + "epoch": 8.380086758934105, + "grad_norm": 7.5358967781066895, + "learning_rate": 8.997594520648561e-06, + "loss": 1.3524, + "step": 50710 + }, + { + "epoch": 8.381739310059904, + "grad_norm": 19.432466506958008, + "learning_rate": 8.988413301749942e-06, + "loss": 1.344, + "step": 50720 + }, + { + "epoch": 8.383391861185705, + "grad_norm": 10.042672157287598, + "learning_rate": 8.97923208285132e-06, + "loss": 1.3504, + "step": 50730 + }, + { + "epoch": 8.385044412311506, + "grad_norm": 14.225571632385254, + "learning_rate": 8.970050863952699e-06, + "loss": 1.2284, + "step": 50740 + }, + { + "epoch": 8.386696963437306, + "grad_norm": 11.462421417236328, + "learning_rate": 8.960869645054078e-06, + "loss": 1.371, + 
"step": 50750 + }, + { + "epoch": 8.388349514563107, + "grad_norm": 9.020613670349121, + "learning_rate": 8.951688426155457e-06, + "loss": 1.262, + "step": 50760 + }, + { + "epoch": 8.390002065688908, + "grad_norm": 17.618812561035156, + "learning_rate": 8.942507207256835e-06, + "loss": 1.393, + "step": 50770 + }, + { + "epoch": 8.391654616814709, + "grad_norm": 19.001405715942383, + "learning_rate": 8.933325988358216e-06, + "loss": 1.3542, + "step": 50780 + }, + { + "epoch": 8.393307167940508, + "grad_norm": 19.39535140991211, + "learning_rate": 8.924144769459595e-06, + "loss": 1.4777, + "step": 50790 + }, + { + "epoch": 8.394959719066309, + "grad_norm": 11.611642837524414, + "learning_rate": 8.914963550560972e-06, + "loss": 1.2758, + "step": 50800 + }, + { + "epoch": 8.39661227019211, + "grad_norm": 25.300411224365234, + "learning_rate": 8.905782331662352e-06, + "loss": 1.4082, + "step": 50810 + }, + { + "epoch": 8.398264821317909, + "grad_norm": 11.443560600280762, + "learning_rate": 8.896601112763731e-06, + "loss": 1.2994, + "step": 50820 + }, + { + "epoch": 8.39991737244371, + "grad_norm": 11.911940574645996, + "learning_rate": 8.887419893865109e-06, + "loss": 1.4266, + "step": 50830 + }, + { + "epoch": 8.401569923569511, + "grad_norm": 10.640558242797852, + "learning_rate": 8.87823867496649e-06, + "loss": 1.3858, + "step": 50840 + }, + { + "epoch": 8.40322247469531, + "grad_norm": 14.38443660736084, + "learning_rate": 8.869057456067869e-06, + "loss": 1.3228, + "step": 50850 + }, + { + "epoch": 8.404875025821111, + "grad_norm": 11.794403076171875, + "learning_rate": 8.859876237169246e-06, + "loss": 1.3956, + "step": 50860 + }, + { + "epoch": 8.406527576946912, + "grad_norm": 10.348832130432129, + "learning_rate": 8.850695018270625e-06, + "loss": 1.4676, + "step": 50870 + }, + { + "epoch": 8.408180128072713, + "grad_norm": 10.430915832519531, + "learning_rate": 8.841513799372005e-06, + "loss": 1.3324, + "step": 50880 + }, + { + "epoch": 8.409832679198512, + "grad_norm": 11.54885482788086, + "learning_rate": 8.832332580473384e-06, + "loss": 1.4735, + "step": 50890 + }, + { + "epoch": 8.411485230324313, + "grad_norm": 19.26331901550293, + "learning_rate": 8.823151361574763e-06, + "loss": 1.2549, + "step": 50900 + }, + { + "epoch": 8.413137781450114, + "grad_norm": 12.017107963562012, + "learning_rate": 8.813970142676142e-06, + "loss": 1.3315, + "step": 50910 + }, + { + "epoch": 8.414790332575913, + "grad_norm": 13.017470359802246, + "learning_rate": 8.804788923777522e-06, + "loss": 1.2567, + "step": 50920 + }, + { + "epoch": 8.416442883701714, + "grad_norm": 14.95610523223877, + "learning_rate": 8.7956077048789e-06, + "loss": 1.3519, + "step": 50930 + }, + { + "epoch": 8.418095434827515, + "grad_norm": 18.328262329101562, + "learning_rate": 8.786426485980279e-06, + "loss": 1.3535, + "step": 50940 + }, + { + "epoch": 8.419747985953315, + "grad_norm": 11.61627197265625, + "learning_rate": 8.77724526708166e-06, + "loss": 1.4133, + "step": 50950 + }, + { + "epoch": 8.421400537079116, + "grad_norm": 17.8658390045166, + "learning_rate": 8.768064048183037e-06, + "loss": 1.3322, + "step": 50960 + }, + { + "epoch": 8.423053088204917, + "grad_norm": 12.533629417419434, + "learning_rate": 8.758882829284416e-06, + "loss": 1.2034, + "step": 50970 + }, + { + "epoch": 8.424705639330718, + "grad_norm": 10.913763046264648, + "learning_rate": 8.749701610385796e-06, + "loss": 1.3928, + "step": 50980 + }, + { + "epoch": 8.426358190456517, + "grad_norm": 12.7138090133667, + "learning_rate": 
8.740520391487173e-06, + "loss": 1.2596, + "step": 50990 + }, + { + "epoch": 8.428010741582318, + "grad_norm": 69.10160827636719, + "learning_rate": 8.731339172588552e-06, + "loss": 1.3902, + "step": 51000 + }, + { + "epoch": 8.429663292708119, + "grad_norm": 18.809534072875977, + "learning_rate": 8.722157953689933e-06, + "loss": 1.343, + "step": 51010 + }, + { + "epoch": 8.431315843833918, + "grad_norm": 17.767322540283203, + "learning_rate": 8.71297673479131e-06, + "loss": 1.241, + "step": 51020 + }, + { + "epoch": 8.432968394959719, + "grad_norm": 18.070133209228516, + "learning_rate": 8.70379551589269e-06, + "loss": 1.2567, + "step": 51030 + }, + { + "epoch": 8.43462094608552, + "grad_norm": 17.62630271911621, + "learning_rate": 8.69461429699407e-06, + "loss": 1.3031, + "step": 51040 + }, + { + "epoch": 8.43627349721132, + "grad_norm": 19.49014663696289, + "learning_rate": 8.685433078095449e-06, + "loss": 1.4, + "step": 51050 + }, + { + "epoch": 8.43792604833712, + "grad_norm": 22.65168571472168, + "learning_rate": 8.676251859196826e-06, + "loss": 1.3422, + "step": 51060 + }, + { + "epoch": 8.439578599462921, + "grad_norm": 15.873695373535156, + "learning_rate": 8.667070640298207e-06, + "loss": 1.5412, + "step": 51070 + }, + { + "epoch": 8.441231150588722, + "grad_norm": 20.69378089904785, + "learning_rate": 8.657889421399586e-06, + "loss": 1.2421, + "step": 51080 + }, + { + "epoch": 8.442883701714521, + "grad_norm": 14.681087493896484, + "learning_rate": 8.648708202500964e-06, + "loss": 1.2936, + "step": 51090 + }, + { + "epoch": 8.444536252840322, + "grad_norm": 23.839170455932617, + "learning_rate": 8.639526983602343e-06, + "loss": 1.3616, + "step": 51100 + }, + { + "epoch": 8.446188803966123, + "grad_norm": 12.453177452087402, + "learning_rate": 8.630345764703722e-06, + "loss": 1.4633, + "step": 51110 + }, + { + "epoch": 8.447841355091922, + "grad_norm": 10.4721040725708, + "learning_rate": 8.621164545805102e-06, + "loss": 1.3924, + "step": 51120 + }, + { + "epoch": 8.449493906217723, + "grad_norm": 11.576587677001953, + "learning_rate": 8.611983326906481e-06, + "loss": 1.2731, + "step": 51130 + }, + { + "epoch": 8.451146457343524, + "grad_norm": 16.914947509765625, + "learning_rate": 8.60280210800786e-06, + "loss": 1.4043, + "step": 51140 + }, + { + "epoch": 8.452799008469324, + "grad_norm": 14.741705894470215, + "learning_rate": 8.593620889109238e-06, + "loss": 1.3298, + "step": 51150 + }, + { + "epoch": 8.454451559595125, + "grad_norm": 10.166900634765625, + "learning_rate": 8.584439670210617e-06, + "loss": 1.3858, + "step": 51160 + }, + { + "epoch": 8.456104110720926, + "grad_norm": 16.264799118041992, + "learning_rate": 8.575258451311996e-06, + "loss": 1.3903, + "step": 51170 + }, + { + "epoch": 8.457756661846727, + "grad_norm": 11.90115737915039, + "learning_rate": 8.566077232413375e-06, + "loss": 1.3022, + "step": 51180 + }, + { + "epoch": 8.459409212972526, + "grad_norm": 14.521602630615234, + "learning_rate": 8.556896013514755e-06, + "loss": 1.2753, + "step": 51190 + }, + { + "epoch": 8.461061764098327, + "grad_norm": 29.15642547607422, + "learning_rate": 8.547714794616134e-06, + "loss": 1.3347, + "step": 51200 + }, + { + "epoch": 8.462714315224128, + "grad_norm": 20.147611618041992, + "learning_rate": 8.538533575717513e-06, + "loss": 1.4416, + "step": 51210 + }, + { + "epoch": 8.464366866349927, + "grad_norm": 30.82805633544922, + "learning_rate": 8.52935235681889e-06, + "loss": 1.3531, + "step": 51220 + }, + { + "epoch": 8.466019417475728, + "grad_norm": 
21.31821632385254, + "learning_rate": 8.52017113792027e-06, + "loss": 1.2286, + "step": 51230 + }, + { + "epoch": 8.467671968601529, + "grad_norm": 19.59657096862793, + "learning_rate": 8.510989919021651e-06, + "loss": 1.3915, + "step": 51240 + }, + { + "epoch": 8.46932451972733, + "grad_norm": 23.90947151184082, + "learning_rate": 8.501808700123028e-06, + "loss": 1.3517, + "step": 51250 + }, + { + "epoch": 8.47097707085313, + "grad_norm": 22.817235946655273, + "learning_rate": 8.492627481224408e-06, + "loss": 1.3708, + "step": 51260 + }, + { + "epoch": 8.47262962197893, + "grad_norm": 11.758684158325195, + "learning_rate": 8.483446262325787e-06, + "loss": 1.2679, + "step": 51270 + }, + { + "epoch": 8.474282173104731, + "grad_norm": 9.888134002685547, + "learning_rate": 8.474265043427164e-06, + "loss": 1.3361, + "step": 51280 + }, + { + "epoch": 8.47593472423053, + "grad_norm": 11.831820487976074, + "learning_rate": 8.465083824528545e-06, + "loss": 1.274, + "step": 51290 + }, + { + "epoch": 8.477587275356331, + "grad_norm": 42.835636138916016, + "learning_rate": 8.455902605629925e-06, + "loss": 1.3636, + "step": 51300 + }, + { + "epoch": 8.479239826482132, + "grad_norm": 10.630398750305176, + "learning_rate": 8.446721386731302e-06, + "loss": 1.1861, + "step": 51310 + }, + { + "epoch": 8.480892377607931, + "grad_norm": 53.76140594482422, + "learning_rate": 8.437540167832681e-06, + "loss": 1.3159, + "step": 51320 + }, + { + "epoch": 8.482544928733732, + "grad_norm": 25.955413818359375, + "learning_rate": 8.42835894893406e-06, + "loss": 1.2953, + "step": 51330 + }, + { + "epoch": 8.484197479859533, + "grad_norm": 15.469341278076172, + "learning_rate": 8.41917773003544e-06, + "loss": 1.2313, + "step": 51340 + }, + { + "epoch": 8.485850030985334, + "grad_norm": 15.408190727233887, + "learning_rate": 8.40999651113682e-06, + "loss": 1.4626, + "step": 51350 + }, + { + "epoch": 8.487502582111134, + "grad_norm": 18.029531478881836, + "learning_rate": 8.400815292238198e-06, + "loss": 1.3921, + "step": 51360 + }, + { + "epoch": 8.489155133236935, + "grad_norm": 12.669473648071289, + "learning_rate": 8.391634073339578e-06, + "loss": 1.203, + "step": 51370 + }, + { + "epoch": 8.490807684362736, + "grad_norm": 20.233211517333984, + "learning_rate": 8.382452854440955e-06, + "loss": 1.5235, + "step": 51380 + }, + { + "epoch": 8.492460235488535, + "grad_norm": 18.365198135375977, + "learning_rate": 8.373271635542335e-06, + "loss": 1.5077, + "step": 51390 + }, + { + "epoch": 8.494112786614336, + "grad_norm": 9.256744384765625, + "learning_rate": 8.364090416643714e-06, + "loss": 1.328, + "step": 51400 + }, + { + "epoch": 8.495765337740137, + "grad_norm": 13.671814918518066, + "learning_rate": 8.354909197745093e-06, + "loss": 1.4123, + "step": 51410 + }, + { + "epoch": 8.497417888865936, + "grad_norm": 48.41630172729492, + "learning_rate": 8.345727978846472e-06, + "loss": 1.3484, + "step": 51420 + }, + { + "epoch": 8.499070439991737, + "grad_norm": 19.150592803955078, + "learning_rate": 8.336546759947852e-06, + "loss": 1.3481, + "step": 51430 + }, + { + "epoch": 8.500722991117538, + "grad_norm": 20.59627914428711, + "learning_rate": 8.327365541049229e-06, + "loss": 1.281, + "step": 51440 + }, + { + "epoch": 8.502375542243339, + "grad_norm": 16.762386322021484, + "learning_rate": 8.318184322150608e-06, + "loss": 1.4107, + "step": 51450 + }, + { + "epoch": 8.504028093369138, + "grad_norm": 15.31123161315918, + "learning_rate": 8.309003103251988e-06, + "loss": 1.3289, + "step": 51460 + }, + { + "epoch": 
8.505680644494939, + "grad_norm": 13.639659881591797, + "learning_rate": 8.299821884353367e-06, + "loss": 1.2842, + "step": 51470 + }, + { + "epoch": 8.50733319562074, + "grad_norm": 15.621134757995605, + "learning_rate": 8.290640665454746e-06, + "loss": 1.3049, + "step": 51480 + }, + { + "epoch": 8.50898574674654, + "grad_norm": 11.561347961425781, + "learning_rate": 8.281459446556125e-06, + "loss": 1.3786, + "step": 51490 + }, + { + "epoch": 8.51063829787234, + "grad_norm": 6.681066989898682, + "learning_rate": 8.272278227657505e-06, + "loss": 1.1806, + "step": 51500 + }, + { + "epoch": 8.512290848998141, + "grad_norm": 13.842896461486816, + "learning_rate": 8.263097008758882e-06, + "loss": 1.4607, + "step": 51510 + }, + { + "epoch": 8.513943400123942, + "grad_norm": 16.961971282958984, + "learning_rate": 8.253915789860263e-06, + "loss": 1.2977, + "step": 51520 + }, + { + "epoch": 8.515595951249741, + "grad_norm": 11.850485801696777, + "learning_rate": 8.244734570961642e-06, + "loss": 1.3939, + "step": 51530 + }, + { + "epoch": 8.517248502375542, + "grad_norm": 20.252544403076172, + "learning_rate": 8.23555335206302e-06, + "loss": 1.4132, + "step": 51540 + }, + { + "epoch": 8.518901053501343, + "grad_norm": 11.936491966247559, + "learning_rate": 8.226372133164399e-06, + "loss": 1.2569, + "step": 51550 + }, + { + "epoch": 8.520553604627143, + "grad_norm": 15.113805770874023, + "learning_rate": 8.217190914265778e-06, + "loss": 1.3359, + "step": 51560 + }, + { + "epoch": 8.522206155752944, + "grad_norm": 14.202859878540039, + "learning_rate": 8.208009695367158e-06, + "loss": 1.2987, + "step": 51570 + }, + { + "epoch": 8.523858706878745, + "grad_norm": 22.607847213745117, + "learning_rate": 8.198828476468537e-06, + "loss": 1.3453, + "step": 51580 + }, + { + "epoch": 8.525511258004544, + "grad_norm": 12.42171859741211, + "learning_rate": 8.189647257569916e-06, + "loss": 1.3669, + "step": 51590 + }, + { + "epoch": 8.527163809130345, + "grad_norm": 19.58823585510254, + "learning_rate": 8.180466038671294e-06, + "loss": 1.2657, + "step": 51600 + }, + { + "epoch": 8.528816360256146, + "grad_norm": 8.663179397583008, + "learning_rate": 8.171284819772673e-06, + "loss": 1.2291, + "step": 51610 + }, + { + "epoch": 8.530468911381945, + "grad_norm": 52.90243148803711, + "learning_rate": 8.162103600874052e-06, + "loss": 1.2141, + "step": 51620 + }, + { + "epoch": 8.532121462507746, + "grad_norm": 17.209096908569336, + "learning_rate": 8.152922381975431e-06, + "loss": 1.3594, + "step": 51630 + }, + { + "epoch": 8.533774013633547, + "grad_norm": 14.850648880004883, + "learning_rate": 8.14374116307681e-06, + "loss": 1.3939, + "step": 51640 + }, + { + "epoch": 8.535426564759348, + "grad_norm": 11.91596794128418, + "learning_rate": 8.13455994417819e-06, + "loss": 1.4166, + "step": 51650 + }, + { + "epoch": 8.537079115885147, + "grad_norm": 19.062267303466797, + "learning_rate": 8.125378725279569e-06, + "loss": 1.2763, + "step": 51660 + }, + { + "epoch": 8.538731667010948, + "grad_norm": 39.40817642211914, + "learning_rate": 8.116197506380947e-06, + "loss": 1.4173, + "step": 51670 + }, + { + "epoch": 8.540384218136749, + "grad_norm": 9.898063659667969, + "learning_rate": 8.107016287482326e-06, + "loss": 1.2107, + "step": 51680 + }, + { + "epoch": 8.542036769262548, + "grad_norm": 19.64806365966797, + "learning_rate": 8.097835068583707e-06, + "loss": 1.4224, + "step": 51690 + }, + { + "epoch": 8.54368932038835, + "grad_norm": 12.104853630065918, + "learning_rate": 8.088653849685084e-06, + "loss": 1.3676, + 
"step": 51700 + }, + { + "epoch": 8.54534187151415, + "grad_norm": 20.285003662109375, + "learning_rate": 8.079472630786464e-06, + "loss": 1.3047, + "step": 51710 + }, + { + "epoch": 8.546994422639951, + "grad_norm": 18.16466522216797, + "learning_rate": 8.070291411887843e-06, + "loss": 1.494, + "step": 51720 + }, + { + "epoch": 8.54864697376575, + "grad_norm": 14.75426197052002, + "learning_rate": 8.061110192989222e-06, + "loss": 1.3873, + "step": 51730 + }, + { + "epoch": 8.550299524891551, + "grad_norm": 13.810311317443848, + "learning_rate": 8.0519289740906e-06, + "loss": 1.3675, + "step": 51740 + }, + { + "epoch": 8.551952076017352, + "grad_norm": 10.323554992675781, + "learning_rate": 8.04274775519198e-06, + "loss": 1.3622, + "step": 51750 + }, + { + "epoch": 8.553604627143152, + "grad_norm": 13.959274291992188, + "learning_rate": 8.033566536293358e-06, + "loss": 1.2956, + "step": 51760 + }, + { + "epoch": 8.555257178268953, + "grad_norm": 12.601279258728027, + "learning_rate": 8.024385317394737e-06, + "loss": 1.2695, + "step": 51770 + }, + { + "epoch": 8.556909729394754, + "grad_norm": 11.207502365112305, + "learning_rate": 8.015204098496117e-06, + "loss": 1.3684, + "step": 51780 + }, + { + "epoch": 8.558562280520553, + "grad_norm": 28.540502548217773, + "learning_rate": 8.006022879597496e-06, + "loss": 1.319, + "step": 51790 + }, + { + "epoch": 8.560214831646354, + "grad_norm": 15.684926986694336, + "learning_rate": 7.996841660698874e-06, + "loss": 1.3478, + "step": 51800 + }, + { + "epoch": 8.561867382772155, + "grad_norm": 9.565122604370117, + "learning_rate": 7.987660441800254e-06, + "loss": 1.2809, + "step": 51810 + }, + { + "epoch": 8.563519933897956, + "grad_norm": 12.698163986206055, + "learning_rate": 7.978479222901634e-06, + "loss": 1.2952, + "step": 51820 + }, + { + "epoch": 8.565172485023755, + "grad_norm": 30.729351043701172, + "learning_rate": 7.969298004003011e-06, + "loss": 1.5078, + "step": 51830 + }, + { + "epoch": 8.566825036149556, + "grad_norm": 14.974106788635254, + "learning_rate": 7.96011678510439e-06, + "loss": 1.3052, + "step": 51840 + }, + { + "epoch": 8.568477587275357, + "grad_norm": 6.864882946014404, + "learning_rate": 7.95093556620577e-06, + "loss": 1.3145, + "step": 51850 + }, + { + "epoch": 8.570130138401156, + "grad_norm": 17.064834594726562, + "learning_rate": 7.941754347307149e-06, + "loss": 1.3939, + "step": 51860 + }, + { + "epoch": 8.571782689526957, + "grad_norm": 9.994977951049805, + "learning_rate": 7.932573128408528e-06, + "loss": 1.3526, + "step": 51870 + }, + { + "epoch": 8.573435240652758, + "grad_norm": 14.07989501953125, + "learning_rate": 7.923391909509907e-06, + "loss": 1.3424, + "step": 51880 + }, + { + "epoch": 8.575087791778557, + "grad_norm": 12.24575138092041, + "learning_rate": 7.914210690611287e-06, + "loss": 1.2845, + "step": 51890 + }, + { + "epoch": 8.576740342904358, + "grad_norm": 13.51680850982666, + "learning_rate": 7.905029471712664e-06, + "loss": 1.3452, + "step": 51900 + }, + { + "epoch": 8.57839289403016, + "grad_norm": 13.959500312805176, + "learning_rate": 7.895848252814044e-06, + "loss": 1.2149, + "step": 51910 + }, + { + "epoch": 8.58004544515596, + "grad_norm": 28.394367218017578, + "learning_rate": 7.886667033915423e-06, + "loss": 1.463, + "step": 51920 + }, + { + "epoch": 8.58169799628176, + "grad_norm": 14.02746868133545, + "learning_rate": 7.877485815016802e-06, + "loss": 1.4335, + "step": 51930 + }, + { + "epoch": 8.58335054740756, + "grad_norm": 11.631068229675293, + "learning_rate": 
7.868304596118181e-06, + "loss": 1.2435, + "step": 51940 + }, + { + "epoch": 8.585003098533361, + "grad_norm": 13.741445541381836, + "learning_rate": 7.85912337721956e-06, + "loss": 1.2622, + "step": 51950 + }, + { + "epoch": 8.58665564965916, + "grad_norm": 21.316083908081055, + "learning_rate": 7.849942158320938e-06, + "loss": 1.484, + "step": 51960 + }, + { + "epoch": 8.588308200784962, + "grad_norm": 17.675111770629883, + "learning_rate": 7.840760939422317e-06, + "loss": 1.38, + "step": 51970 + }, + { + "epoch": 8.589960751910763, + "grad_norm": 21.218908309936523, + "learning_rate": 7.831579720523698e-06, + "loss": 1.247, + "step": 51980 + }, + { + "epoch": 8.591613303036564, + "grad_norm": 18.68570899963379, + "learning_rate": 7.822398501625076e-06, + "loss": 1.3275, + "step": 51990 + }, + { + "epoch": 8.593265854162363, + "grad_norm": 12.028681755065918, + "learning_rate": 7.813217282726455e-06, + "loss": 1.2789, + "step": 52000 + }, + { + "epoch": 8.594918405288164, + "grad_norm": 16.868160247802734, + "learning_rate": 7.804036063827834e-06, + "loss": 1.1928, + "step": 52010 + }, + { + "epoch": 8.596570956413965, + "grad_norm": 14.895330429077148, + "learning_rate": 7.794854844929214e-06, + "loss": 1.3861, + "step": 52020 + }, + { + "epoch": 8.598223507539764, + "grad_norm": 17.00758934020996, + "learning_rate": 7.785673626030591e-06, + "loss": 1.3761, + "step": 52030 + }, + { + "epoch": 8.599876058665565, + "grad_norm": 21.712890625, + "learning_rate": 7.776492407131972e-06, + "loss": 1.2694, + "step": 52040 + }, + { + "epoch": 8.601528609791366, + "grad_norm": 21.000192642211914, + "learning_rate": 7.767311188233351e-06, + "loss": 1.3491, + "step": 52050 + }, + { + "epoch": 8.603181160917165, + "grad_norm": 19.828134536743164, + "learning_rate": 7.758129969334729e-06, + "loss": 1.2993, + "step": 52060 + }, + { + "epoch": 8.604833712042966, + "grad_norm": 9.904111862182617, + "learning_rate": 7.748948750436108e-06, + "loss": 1.3062, + "step": 52070 + }, + { + "epoch": 8.606486263168767, + "grad_norm": 19.431184768676758, + "learning_rate": 7.739767531537487e-06, + "loss": 1.3353, + "step": 52080 + }, + { + "epoch": 8.608138814294568, + "grad_norm": 16.284452438354492, + "learning_rate": 7.730586312638867e-06, + "loss": 1.3189, + "step": 52090 + }, + { + "epoch": 8.609791365420367, + "grad_norm": 20.377105712890625, + "learning_rate": 7.721405093740246e-06, + "loss": 1.2256, + "step": 52100 + }, + { + "epoch": 8.611443916546168, + "grad_norm": 16.32294273376465, + "learning_rate": 7.712223874841625e-06, + "loss": 1.3875, + "step": 52110 + }, + { + "epoch": 8.61309646767197, + "grad_norm": 17.102628707885742, + "learning_rate": 7.703042655943003e-06, + "loss": 1.3282, + "step": 52120 + }, + { + "epoch": 8.614749018797768, + "grad_norm": 7.808797359466553, + "learning_rate": 7.693861437044382e-06, + "loss": 1.2726, + "step": 52130 + }, + { + "epoch": 8.61640156992357, + "grad_norm": 44.669883728027344, + "learning_rate": 7.684680218145761e-06, + "loss": 1.3213, + "step": 52140 + }, + { + "epoch": 8.61805412104937, + "grad_norm": 10.58360767364502, + "learning_rate": 7.67549899924714e-06, + "loss": 1.2849, + "step": 52150 + }, + { + "epoch": 8.61970667217517, + "grad_norm": 11.978372573852539, + "learning_rate": 7.66631778034852e-06, + "loss": 1.392, + "step": 52160 + }, + { + "epoch": 8.62135922330097, + "grad_norm": 19.33893585205078, + "learning_rate": 7.657136561449899e-06, + "loss": 1.3544, + "step": 52170 + }, + { + "epoch": 8.623011774426772, + "grad_norm": 15.807573318481445, 
+ "learning_rate": 7.647955342551278e-06, + "loss": 1.4272, + "step": 52180 + }, + { + "epoch": 8.624664325552573, + "grad_norm": 22.08311653137207, + "learning_rate": 7.638774123652656e-06, + "loss": 1.3413, + "step": 52190 + }, + { + "epoch": 8.626316876678372, + "grad_norm": 18.999937057495117, + "learning_rate": 7.629592904754035e-06, + "loss": 1.4418, + "step": 52200 + }, + { + "epoch": 8.627969427804173, + "grad_norm": 18.321889877319336, + "learning_rate": 7.620411685855415e-06, + "loss": 1.4389, + "step": 52210 + }, + { + "epoch": 8.629621978929974, + "grad_norm": 18.07795524597168, + "learning_rate": 7.6112304669567934e-06, + "loss": 1.2659, + "step": 52220 + }, + { + "epoch": 8.631274530055773, + "grad_norm": 17.448719024658203, + "learning_rate": 7.602049248058173e-06, + "loss": 1.3933, + "step": 52230 + }, + { + "epoch": 8.632927081181574, + "grad_norm": 10.875787734985352, + "learning_rate": 7.592868029159551e-06, + "loss": 1.313, + "step": 52240 + }, + { + "epoch": 8.634579632307375, + "grad_norm": 17.80232810974121, + "learning_rate": 7.58368681026093e-06, + "loss": 1.3124, + "step": 52250 + }, + { + "epoch": 8.636232183433176, + "grad_norm": 21.202545166015625, + "learning_rate": 7.5745055913623104e-06, + "loss": 1.4396, + "step": 52260 + }, + { + "epoch": 8.637884734558975, + "grad_norm": 22.008880615234375, + "learning_rate": 7.565324372463689e-06, + "loss": 1.3628, + "step": 52270 + }, + { + "epoch": 8.639537285684776, + "grad_norm": 31.674184799194336, + "learning_rate": 7.556143153565068e-06, + "loss": 1.3047, + "step": 52280 + }, + { + "epoch": 8.641189836810577, + "grad_norm": 13.10922908782959, + "learning_rate": 7.5469619346664465e-06, + "loss": 1.3011, + "step": 52290 + }, + { + "epoch": 8.642842387936376, + "grad_norm": 43.33464813232422, + "learning_rate": 7.537780715767826e-06, + "loss": 1.4427, + "step": 52300 + }, + { + "epoch": 8.644494939062177, + "grad_norm": 11.043828964233398, + "learning_rate": 7.528599496869204e-06, + "loss": 1.3129, + "step": 52310 + }, + { + "epoch": 8.646147490187978, + "grad_norm": 17.55091667175293, + "learning_rate": 7.519418277970584e-06, + "loss": 1.3026, + "step": 52320 + }, + { + "epoch": 8.647800041313777, + "grad_norm": 11.531242370605469, + "learning_rate": 7.5102370590719635e-06, + "loss": 1.343, + "step": 52330 + }, + { + "epoch": 8.649452592439578, + "grad_norm": 7.957118988037109, + "learning_rate": 7.501055840173342e-06, + "loss": 1.2908, + "step": 52340 + }, + { + "epoch": 8.65110514356538, + "grad_norm": 15.16044807434082, + "learning_rate": 7.49187462127472e-06, + "loss": 1.4025, + "step": 52350 + }, + { + "epoch": 8.652757694691179, + "grad_norm": 12.470380783081055, + "learning_rate": 7.4826934023760995e-06, + "loss": 1.2541, + "step": 52360 + }, + { + "epoch": 8.65441024581698, + "grad_norm": 14.842360496520996, + "learning_rate": 7.473512183477478e-06, + "loss": 1.4249, + "step": 52370 + }, + { + "epoch": 8.65606279694278, + "grad_norm": 10.60826587677002, + "learning_rate": 7.464330964578858e-06, + "loss": 1.2648, + "step": 52380 + }, + { + "epoch": 8.657715348068582, + "grad_norm": 20.237024307250977, + "learning_rate": 7.455149745680237e-06, + "loss": 1.3097, + "step": 52390 + }, + { + "epoch": 8.65936789919438, + "grad_norm": 15.583151817321777, + "learning_rate": 7.445968526781616e-06, + "loss": 1.3364, + "step": 52400 + }, + { + "epoch": 8.661020450320182, + "grad_norm": 22.51787757873535, + "learning_rate": 7.436787307882995e-06, + "loss": 1.3017, + "step": 52410 + }, + { + "epoch": 8.662673001445983, + 
"grad_norm": 8.777755737304688, + "learning_rate": 7.427606088984373e-06, + "loss": 1.2984, + "step": 52420 + }, + { + "epoch": 8.664325552571782, + "grad_norm": 43.127593994140625, + "learning_rate": 7.4184248700857526e-06, + "loss": 1.3406, + "step": 52430 + }, + { + "epoch": 8.665978103697583, + "grad_norm": 10.117231369018555, + "learning_rate": 7.409243651187133e-06, + "loss": 1.261, + "step": 52440 + }, + { + "epoch": 8.667630654823384, + "grad_norm": 12.074843406677246, + "learning_rate": 7.400062432288511e-06, + "loss": 1.4873, + "step": 52450 + }, + { + "epoch": 8.669283205949185, + "grad_norm": 13.140725135803223, + "learning_rate": 7.39088121338989e-06, + "loss": 1.3359, + "step": 52460 + }, + { + "epoch": 8.670935757074984, + "grad_norm": 15.065147399902344, + "learning_rate": 7.381699994491269e-06, + "loss": 1.2755, + "step": 52470 + }, + { + "epoch": 8.672588308200785, + "grad_norm": 10.053617477416992, + "learning_rate": 7.372518775592648e-06, + "loss": 1.3763, + "step": 52480 + }, + { + "epoch": 8.674240859326586, + "grad_norm": 15.998075485229492, + "learning_rate": 7.363337556694028e-06, + "loss": 1.3625, + "step": 52490 + }, + { + "epoch": 8.675893410452385, + "grad_norm": 11.436149597167969, + "learning_rate": 7.3541563377954064e-06, + "loss": 1.3587, + "step": 52500 + }, + { + "epoch": 8.677545961578186, + "grad_norm": 12.246932983398438, + "learning_rate": 7.344975118896785e-06, + "loss": 1.3444, + "step": 52510 + }, + { + "epoch": 8.679198512703987, + "grad_norm": 10.678253173828125, + "learning_rate": 7.335793899998164e-06, + "loss": 1.3673, + "step": 52520 + }, + { + "epoch": 8.680851063829786, + "grad_norm": 11.65112018585205, + "learning_rate": 7.3266126810995425e-06, + "loss": 1.3642, + "step": 52530 + }, + { + "epoch": 8.682503614955587, + "grad_norm": 17.31778335571289, + "learning_rate": 7.317431462200922e-06, + "loss": 1.3697, + "step": 52540 + }, + { + "epoch": 8.684156166081388, + "grad_norm": 14.801467895507812, + "learning_rate": 7.308250243302302e-06, + "loss": 1.3962, + "step": 52550 + }, + { + "epoch": 8.68580871720719, + "grad_norm": 11.221620559692383, + "learning_rate": 7.29906902440368e-06, + "loss": 1.2398, + "step": 52560 + }, + { + "epoch": 8.687461268332989, + "grad_norm": 19.58458137512207, + "learning_rate": 7.2898878055050595e-06, + "loss": 1.3754, + "step": 52570 + }, + { + "epoch": 8.68911381945879, + "grad_norm": 12.247591972351074, + "learning_rate": 7.280706586606438e-06, + "loss": 1.3815, + "step": 52580 + }, + { + "epoch": 8.69076637058459, + "grad_norm": 9.798544883728027, + "learning_rate": 7.271525367707817e-06, + "loss": 1.3341, + "step": 52590 + }, + { + "epoch": 8.69241892171039, + "grad_norm": 12.177785873413086, + "learning_rate": 7.2623441488091955e-06, + "loss": 1.2962, + "step": 52600 + }, + { + "epoch": 8.69407147283619, + "grad_norm": 13.61369800567627, + "learning_rate": 7.253162929910576e-06, + "loss": 1.4009, + "step": 52610 + }, + { + "epoch": 8.695724023961992, + "grad_norm": 11.679938316345215, + "learning_rate": 7.243981711011955e-06, + "loss": 1.2359, + "step": 52620 + }, + { + "epoch": 8.697376575087791, + "grad_norm": 17.74927520751953, + "learning_rate": 7.234800492113333e-06, + "loss": 1.3443, + "step": 52630 + }, + { + "epoch": 8.699029126213592, + "grad_norm": 20.92237091064453, + "learning_rate": 7.2256192732147125e-06, + "loss": 1.3608, + "step": 52640 + }, + { + "epoch": 8.700681677339393, + "grad_norm": 14.271102905273438, + "learning_rate": 7.216438054316091e-06, + "loss": 1.2804, + "step": 52650 + }, 
+ { + "epoch": 8.702334228465194, + "grad_norm": 16.352445602416992, + "learning_rate": 7.207256835417471e-06, + "loss": 1.3786, + "step": 52660 + }, + { + "epoch": 8.703986779590993, + "grad_norm": 11.873323440551758, + "learning_rate": 7.198075616518849e-06, + "loss": 1.3908, + "step": 52670 + }, + { + "epoch": 8.705639330716794, + "grad_norm": 15.193153381347656, + "learning_rate": 7.188894397620229e-06, + "loss": 1.2402, + "step": 52680 + }, + { + "epoch": 8.707291881842595, + "grad_norm": 22.531282424926758, + "learning_rate": 7.179713178721607e-06, + "loss": 1.2404, + "step": 52690 + }, + { + "epoch": 8.708944432968394, + "grad_norm": 10.484509468078613, + "learning_rate": 7.170531959822986e-06, + "loss": 1.3803, + "step": 52700 + }, + { + "epoch": 8.710596984094195, + "grad_norm": 21.465103149414062, + "learning_rate": 7.161350740924365e-06, + "loss": 1.458, + "step": 52710 + }, + { + "epoch": 8.712249535219996, + "grad_norm": 16.147686004638672, + "learning_rate": 7.152169522025745e-06, + "loss": 1.3513, + "step": 52720 + }, + { + "epoch": 8.713902086345797, + "grad_norm": 14.028088569641113, + "learning_rate": 7.142988303127124e-06, + "loss": 1.2286, + "step": 52730 + }, + { + "epoch": 8.715554637471596, + "grad_norm": 15.7096586227417, + "learning_rate": 7.1338070842285025e-06, + "loss": 1.2122, + "step": 52740 + }, + { + "epoch": 8.717207188597397, + "grad_norm": 21.39398956298828, + "learning_rate": 7.124625865329882e-06, + "loss": 1.274, + "step": 52750 + }, + { + "epoch": 8.718859739723198, + "grad_norm": 15.905527114868164, + "learning_rate": 7.11544464643126e-06, + "loss": 1.323, + "step": 52760 + }, + { + "epoch": 8.720512290848998, + "grad_norm": 13.975886344909668, + "learning_rate": 7.106263427532639e-06, + "loss": 1.5179, + "step": 52770 + }, + { + "epoch": 8.722164841974799, + "grad_norm": 20.37264060974121, + "learning_rate": 7.0970822086340194e-06, + "loss": 1.2484, + "step": 52780 + }, + { + "epoch": 8.7238173931006, + "grad_norm": 10.651167869567871, + "learning_rate": 7.087900989735398e-06, + "loss": 1.3944, + "step": 52790 + }, + { + "epoch": 8.725469944226399, + "grad_norm": 12.802726745605469, + "learning_rate": 7.078719770836777e-06, + "loss": 1.4366, + "step": 52800 + }, + { + "epoch": 8.7271224953522, + "grad_norm": 13.142681121826172, + "learning_rate": 7.0695385519381555e-06, + "loss": 1.3166, + "step": 52810 + }, + { + "epoch": 8.728775046478, + "grad_norm": 14.891148567199707, + "learning_rate": 7.060357333039534e-06, + "loss": 1.3012, + "step": 52820 + }, + { + "epoch": 8.7304275976038, + "grad_norm": 13.529729843139648, + "learning_rate": 7.051176114140913e-06, + "loss": 1.4384, + "step": 52830 + }, + { + "epoch": 8.732080148729601, + "grad_norm": 14.417351722717285, + "learning_rate": 7.041994895242293e-06, + "loss": 1.4955, + "step": 52840 + }, + { + "epoch": 8.733732699855402, + "grad_norm": 11.694726943969727, + "learning_rate": 7.032813676343672e-06, + "loss": 1.3628, + "step": 52850 + }, + { + "epoch": 8.735385250981203, + "grad_norm": 8.318774223327637, + "learning_rate": 7.023632457445051e-06, + "loss": 1.2791, + "step": 52860 + }, + { + "epoch": 8.737037802107002, + "grad_norm": 29.99515724182129, + "learning_rate": 7.014451238546429e-06, + "loss": 1.357, + "step": 52870 + }, + { + "epoch": 8.738690353232803, + "grad_norm": 8.45457649230957, + "learning_rate": 7.0052700196478085e-06, + "loss": 1.2616, + "step": 52880 + }, + { + "epoch": 8.740342904358604, + "grad_norm": 15.457688331604004, + "learning_rate": 6.996088800749189e-06, + "loss": 
1.4261, + "step": 52890 + }, + { + "epoch": 8.741995455484403, + "grad_norm": 23.088054656982422, + "learning_rate": 6.986907581850567e-06, + "loss": 1.3279, + "step": 52900 + }, + { + "epoch": 8.743648006610204, + "grad_norm": 10.44580364227295, + "learning_rate": 6.977726362951946e-06, + "loss": 1.2399, + "step": 52910 + }, + { + "epoch": 8.745300557736005, + "grad_norm": 15.399632453918457, + "learning_rate": 6.968545144053325e-06, + "loss": 1.3267, + "step": 52920 + }, + { + "epoch": 8.746953108861806, + "grad_norm": 11.569892883300781, + "learning_rate": 6.959363925154704e-06, + "loss": 1.2217, + "step": 52930 + }, + { + "epoch": 8.748605659987605, + "grad_norm": 13.182928085327148, + "learning_rate": 6.950182706256082e-06, + "loss": 1.3053, + "step": 52940 + }, + { + "epoch": 8.750258211113406, + "grad_norm": 10.968775749206543, + "learning_rate": 6.941001487357462e-06, + "loss": 1.2109, + "step": 52950 + }, + { + "epoch": 8.751910762239207, + "grad_norm": 28.66567611694336, + "learning_rate": 6.931820268458842e-06, + "loss": 1.2991, + "step": 52960 + }, + { + "epoch": 8.753563313365007, + "grad_norm": 14.520941734313965, + "learning_rate": 6.92263904956022e-06, + "loss": 1.3108, + "step": 52970 + }, + { + "epoch": 8.755215864490808, + "grad_norm": 18.797256469726562, + "learning_rate": 6.9134578306615985e-06, + "loss": 1.3105, + "step": 52980 + }, + { + "epoch": 8.756868415616609, + "grad_norm": 21.74106788635254, + "learning_rate": 6.904276611762978e-06, + "loss": 1.4593, + "step": 52990 + }, + { + "epoch": 8.758520966742408, + "grad_norm": 21.514694213867188, + "learning_rate": 6.895095392864356e-06, + "loss": 1.2157, + "step": 53000 + }, + { + "epoch": 8.760173517868209, + "grad_norm": 15.12600040435791, + "learning_rate": 6.885914173965736e-06, + "loss": 1.4009, + "step": 53010 + }, + { + "epoch": 8.76182606899401, + "grad_norm": 17.038434982299805, + "learning_rate": 6.8767329550671155e-06, + "loss": 1.3185, + "step": 53020 + }, + { + "epoch": 8.76347862011981, + "grad_norm": 16.833236694335938, + "learning_rate": 6.867551736168494e-06, + "loss": 1.2934, + "step": 53030 + }, + { + "epoch": 8.76513117124561, + "grad_norm": 13.726874351501465, + "learning_rate": 6.858370517269873e-06, + "loss": 1.4518, + "step": 53040 + }, + { + "epoch": 8.766783722371411, + "grad_norm": 42.266380310058594, + "learning_rate": 6.8491892983712515e-06, + "loss": 1.3364, + "step": 53050 + }, + { + "epoch": 8.768436273497212, + "grad_norm": 22.413843154907227, + "learning_rate": 6.840008079472632e-06, + "loss": 1.3972, + "step": 53060 + }, + { + "epoch": 8.770088824623011, + "grad_norm": 19.03530502319336, + "learning_rate": 6.830826860574011e-06, + "loss": 1.322, + "step": 53070 + }, + { + "epoch": 8.771741375748812, + "grad_norm": 22.070907592773438, + "learning_rate": 6.821645641675389e-06, + "loss": 1.314, + "step": 53080 + }, + { + "epoch": 8.773393926874613, + "grad_norm": 11.455080032348633, + "learning_rate": 6.8124644227767685e-06, + "loss": 1.3339, + "step": 53090 + }, + { + "epoch": 8.775046478000412, + "grad_norm": 9.250655174255371, + "learning_rate": 6.803283203878147e-06, + "loss": 1.3523, + "step": 53100 + }, + { + "epoch": 8.776699029126213, + "grad_norm": 11.893872261047363, + "learning_rate": 6.794101984979525e-06, + "loss": 1.3746, + "step": 53110 + }, + { + "epoch": 8.778351580252014, + "grad_norm": 16.446348190307617, + "learning_rate": 6.784920766080906e-06, + "loss": 1.3863, + "step": 53120 + }, + { + "epoch": 8.780004131377815, + "grad_norm": 13.59485912322998, + 
"learning_rate": 6.775739547182285e-06, + "loss": 1.2766, + "step": 53130 + }, + { + "epoch": 8.781656682503614, + "grad_norm": 11.817255020141602, + "learning_rate": 6.766558328283663e-06, + "loss": 1.3529, + "step": 53140 + }, + { + "epoch": 8.783309233629415, + "grad_norm": 14.739175796508789, + "learning_rate": 6.757377109385042e-06, + "loss": 1.4092, + "step": 53150 + }, + { + "epoch": 8.784961784755216, + "grad_norm": 16.03285789489746, + "learning_rate": 6.748195890486421e-06, + "loss": 1.4103, + "step": 53160 + }, + { + "epoch": 8.786614335881016, + "grad_norm": 14.885622024536133, + "learning_rate": 6.7390146715878e-06, + "loss": 1.2313, + "step": 53170 + }, + { + "epoch": 8.788266887006817, + "grad_norm": 17.540298461914062, + "learning_rate": 6.72983345268918e-06, + "loss": 1.2694, + "step": 53180 + }, + { + "epoch": 8.789919438132618, + "grad_norm": 12.274869918823242, + "learning_rate": 6.7206522337905584e-06, + "loss": 1.3668, + "step": 53190 + }, + { + "epoch": 8.791571989258419, + "grad_norm": 12.353447914123535, + "learning_rate": 6.711471014891938e-06, + "loss": 1.3208, + "step": 53200 + }, + { + "epoch": 8.793224540384218, + "grad_norm": 10.738207817077637, + "learning_rate": 6.702289795993316e-06, + "loss": 1.1065, + "step": 53210 + }, + { + "epoch": 8.794877091510019, + "grad_norm": 11.016992568969727, + "learning_rate": 6.693108577094695e-06, + "loss": 1.3443, + "step": 53220 + }, + { + "epoch": 8.79652964263582, + "grad_norm": 15.448955535888672, + "learning_rate": 6.683927358196074e-06, + "loss": 1.3079, + "step": 53230 + }, + { + "epoch": 8.798182193761619, + "grad_norm": 12.838058471679688, + "learning_rate": 6.674746139297454e-06, + "loss": 1.2739, + "step": 53240 + }, + { + "epoch": 8.79983474488742, + "grad_norm": 6.943477630615234, + "learning_rate": 6.665564920398833e-06, + "loss": 1.2321, + "step": 53250 + }, + { + "epoch": 8.80148729601322, + "grad_norm": 22.82369041442871, + "learning_rate": 6.6563837015002115e-06, + "loss": 1.4648, + "step": 53260 + }, + { + "epoch": 8.80313984713902, + "grad_norm": 11.41329288482666, + "learning_rate": 6.64720248260159e-06, + "loss": 1.2926, + "step": 53270 + }, + { + "epoch": 8.804792398264821, + "grad_norm": 16.04501724243164, + "learning_rate": 6.638021263702969e-06, + "loss": 1.3558, + "step": 53280 + }, + { + "epoch": 8.806444949390622, + "grad_norm": 11.551048278808594, + "learning_rate": 6.628840044804349e-06, + "loss": 1.3032, + "step": 53290 + }, + { + "epoch": 8.808097500516423, + "grad_norm": 16.122791290283203, + "learning_rate": 6.619658825905728e-06, + "loss": 1.4187, + "step": 53300 + }, + { + "epoch": 8.809750051642222, + "grad_norm": 19.001909255981445, + "learning_rate": 6.610477607007107e-06, + "loss": 1.3634, + "step": 53310 + }, + { + "epoch": 8.811402602768023, + "grad_norm": 16.42638397216797, + "learning_rate": 6.601296388108485e-06, + "loss": 1.2488, + "step": 53320 + }, + { + "epoch": 8.813055153893824, + "grad_norm": 18.04165267944336, + "learning_rate": 6.5921151692098645e-06, + "loss": 1.3113, + "step": 53330 + }, + { + "epoch": 8.814707705019623, + "grad_norm": 14.503774642944336, + "learning_rate": 6.582933950311243e-06, + "loss": 1.2873, + "step": 53340 + }, + { + "epoch": 8.816360256145424, + "grad_norm": 12.262187957763672, + "learning_rate": 6.573752731412623e-06, + "loss": 1.3056, + "step": 53350 + }, + { + "epoch": 8.818012807271225, + "grad_norm": 16.321773529052734, + "learning_rate": 6.564571512514002e-06, + "loss": 1.2377, + "step": 53360 + }, + { + "epoch": 8.819665358397025, + 
"grad_norm": 8.0360107421875, + "learning_rate": 6.555390293615381e-06, + "loss": 1.3937, + "step": 53370 + }, + { + "epoch": 8.821317909522826, + "grad_norm": 9.701162338256836, + "learning_rate": 6.54620907471676e-06, + "loss": 1.2723, + "step": 53380 + }, + { + "epoch": 8.822970460648627, + "grad_norm": 63.11784362792969, + "learning_rate": 6.537027855818138e-06, + "loss": 1.4564, + "step": 53390 + }, + { + "epoch": 8.824623011774428, + "grad_norm": 13.213398933410645, + "learning_rate": 6.527846636919517e-06, + "loss": 1.3917, + "step": 53400 + }, + { + "epoch": 8.826275562900227, + "grad_norm": 29.3795166015625, + "learning_rate": 6.518665418020898e-06, + "loss": 1.1673, + "step": 53410 + }, + { + "epoch": 8.827928114026028, + "grad_norm": 20.529861450195312, + "learning_rate": 6.509484199122276e-06, + "loss": 1.3965, + "step": 53420 + }, + { + "epoch": 8.829580665151829, + "grad_norm": 12.665575981140137, + "learning_rate": 6.5003029802236544e-06, + "loss": 1.2904, + "step": 53430 + }, + { + "epoch": 8.831233216277628, + "grad_norm": 23.68876838684082, + "learning_rate": 6.491121761325034e-06, + "loss": 1.4163, + "step": 53440 + }, + { + "epoch": 8.832885767403429, + "grad_norm": 13.767816543579102, + "learning_rate": 6.481940542426412e-06, + "loss": 1.2772, + "step": 53450 + }, + { + "epoch": 8.83453831852923, + "grad_norm": 38.02272415161133, + "learning_rate": 6.472759323527792e-06, + "loss": 1.3729, + "step": 53460 + }, + { + "epoch": 8.83619086965503, + "grad_norm": 18.688528060913086, + "learning_rate": 6.4635781046291714e-06, + "loss": 1.4323, + "step": 53470 + }, + { + "epoch": 8.83784342078083, + "grad_norm": 8.972434043884277, + "learning_rate": 6.45439688573055e-06, + "loss": 1.2365, + "step": 53480 + }, + { + "epoch": 8.839495971906631, + "grad_norm": 12.999137878417969, + "learning_rate": 6.445215666831929e-06, + "loss": 1.2838, + "step": 53490 + }, + { + "epoch": 8.841148523032432, + "grad_norm": 12.123946189880371, + "learning_rate": 6.4360344479333075e-06, + "loss": 1.3943, + "step": 53500 + }, + { + "epoch": 8.842801074158231, + "grad_norm": 8.562018394470215, + "learning_rate": 6.426853229034687e-06, + "loss": 1.4237, + "step": 53510 + }, + { + "epoch": 8.844453625284032, + "grad_norm": 19.84341049194336, + "learning_rate": 6.417672010136067e-06, + "loss": 1.3901, + "step": 53520 + }, + { + "epoch": 8.846106176409833, + "grad_norm": 11.601472854614258, + "learning_rate": 6.408490791237445e-06, + "loss": 1.2849, + "step": 53530 + }, + { + "epoch": 8.847758727535632, + "grad_norm": 16.48011589050293, + "learning_rate": 6.3993095723388245e-06, + "loss": 1.4206, + "step": 53540 + }, + { + "epoch": 8.849411278661433, + "grad_norm": 10.797418594360352, + "learning_rate": 6.390128353440203e-06, + "loss": 1.516, + "step": 53550 + }, + { + "epoch": 8.851063829787234, + "grad_norm": 13.68311595916748, + "learning_rate": 6.380947134541581e-06, + "loss": 1.3379, + "step": 53560 + }, + { + "epoch": 8.852716380913034, + "grad_norm": 11.746673583984375, + "learning_rate": 6.3717659156429605e-06, + "loss": 1.3683, + "step": 53570 + }, + { + "epoch": 8.854368932038835, + "grad_norm": 16.722267150878906, + "learning_rate": 6.362584696744341e-06, + "loss": 1.4114, + "step": 53580 + }, + { + "epoch": 8.856021483164636, + "grad_norm": 13.57430362701416, + "learning_rate": 6.353403477845719e-06, + "loss": 1.3794, + "step": 53590 + }, + { + "epoch": 8.857674034290437, + "grad_norm": 19.50122833251953, + "learning_rate": 6.344222258947098e-06, + "loss": 1.3773, + "step": 53600 + }, + { + 
"epoch": 8.859326585416236, + "grad_norm": 13.851290702819824, + "learning_rate": 6.335041040048477e-06, + "loss": 1.2712, + "step": 53610 + }, + { + "epoch": 8.860979136542037, + "grad_norm": 11.362812995910645, + "learning_rate": 6.325859821149856e-06, + "loss": 1.2669, + "step": 53620 + }, + { + "epoch": 8.862631687667838, + "grad_norm": 13.138988494873047, + "learning_rate": 6.316678602251236e-06, + "loss": 1.3556, + "step": 53630 + }, + { + "epoch": 8.864284238793637, + "grad_norm": 21.056747436523438, + "learning_rate": 6.307497383352614e-06, + "loss": 1.4155, + "step": 53640 + }, + { + "epoch": 8.865936789919438, + "grad_norm": 16.05164909362793, + "learning_rate": 6.298316164453994e-06, + "loss": 1.4202, + "step": 53650 + }, + { + "epoch": 8.867589341045239, + "grad_norm": 30.52538299560547, + "learning_rate": 6.289134945555372e-06, + "loss": 1.2314, + "step": 53660 + }, + { + "epoch": 8.86924189217104, + "grad_norm": 27.325618743896484, + "learning_rate": 6.279953726656751e-06, + "loss": 1.3327, + "step": 53670 + }, + { + "epoch": 8.870894443296839, + "grad_norm": 14.13979434967041, + "learning_rate": 6.27077250775813e-06, + "loss": 1.4423, + "step": 53680 + }, + { + "epoch": 8.87254699442264, + "grad_norm": 9.841453552246094, + "learning_rate": 6.26159128885951e-06, + "loss": 1.4869, + "step": 53690 + }, + { + "epoch": 8.874199545548441, + "grad_norm": 10.473819732666016, + "learning_rate": 6.252410069960889e-06, + "loss": 1.2138, + "step": 53700 + }, + { + "epoch": 8.87585209667424, + "grad_norm": 20.32805824279785, + "learning_rate": 6.2432288510622675e-06, + "loss": 1.3251, + "step": 53710 + }, + { + "epoch": 8.877504647800041, + "grad_norm": 26.516998291015625, + "learning_rate": 6.234047632163646e-06, + "loss": 1.3455, + "step": 53720 + }, + { + "epoch": 8.879157198925842, + "grad_norm": 10.170158386230469, + "learning_rate": 6.224866413265026e-06, + "loss": 1.2815, + "step": 53730 + }, + { + "epoch": 8.880809750051641, + "grad_norm": 13.213897705078125, + "learning_rate": 6.215685194366404e-06, + "loss": 1.3635, + "step": 53740 + }, + { + "epoch": 8.882462301177442, + "grad_norm": 15.690320014953613, + "learning_rate": 6.206503975467784e-06, + "loss": 1.3606, + "step": 53750 + }, + { + "epoch": 8.884114852303243, + "grad_norm": 8.988179206848145, + "learning_rate": 6.197322756569163e-06, + "loss": 1.3001, + "step": 53760 + }, + { + "epoch": 8.885767403429044, + "grad_norm": 15.613768577575684, + "learning_rate": 6.188141537670541e-06, + "loss": 1.2319, + "step": 53770 + }, + { + "epoch": 8.887419954554844, + "grad_norm": 22.44986343383789, + "learning_rate": 6.1789603187719205e-06, + "loss": 1.2765, + "step": 53780 + }, + { + "epoch": 8.889072505680645, + "grad_norm": 19.037084579467773, + "learning_rate": 6.1697790998733e-06, + "loss": 1.3022, + "step": 53790 + }, + { + "epoch": 8.890725056806446, + "grad_norm": 20.137554168701172, + "learning_rate": 6.160597880974678e-06, + "loss": 1.4232, + "step": 53800 + }, + { + "epoch": 8.892377607932245, + "grad_norm": 10.645371437072754, + "learning_rate": 6.151416662076057e-06, + "loss": 1.2907, + "step": 53810 + }, + { + "epoch": 8.894030159058046, + "grad_norm": 11.387445449829102, + "learning_rate": 6.142235443177437e-06, + "loss": 1.2386, + "step": 53820 + }, + { + "epoch": 8.895682710183847, + "grad_norm": 13.088994979858398, + "learning_rate": 6.133054224278816e-06, + "loss": 1.379, + "step": 53830 + }, + { + "epoch": 8.897335261309646, + "grad_norm": 17.280851364135742, + "learning_rate": 6.123873005380194e-06, + "loss": 
1.2731, + "step": 53840 + }, + { + "epoch": 8.898987812435447, + "grad_norm": 13.71379566192627, + "learning_rate": 6.1146917864815735e-06, + "loss": 1.3099, + "step": 53850 + }, + { + "epoch": 8.900640363561248, + "grad_norm": 27.96197509765625, + "learning_rate": 6.105510567582953e-06, + "loss": 1.3606, + "step": 53860 + }, + { + "epoch": 8.902292914687049, + "grad_norm": 14.15559196472168, + "learning_rate": 6.096329348684331e-06, + "loss": 1.3055, + "step": 53870 + }, + { + "epoch": 8.903945465812848, + "grad_norm": 14.177240371704102, + "learning_rate": 6.0871481297857104e-06, + "loss": 1.3035, + "step": 53880 + }, + { + "epoch": 8.905598016938649, + "grad_norm": 12.994119644165039, + "learning_rate": 6.07796691088709e-06, + "loss": 1.3279, + "step": 53890 + }, + { + "epoch": 8.90725056806445, + "grad_norm": 15.457225799560547, + "learning_rate": 6.068785691988468e-06, + "loss": 1.3741, + "step": 53900 + }, + { + "epoch": 8.90890311919025, + "grad_norm": 9.61684799194336, + "learning_rate": 6.059604473089848e-06, + "loss": 1.3281, + "step": 53910 + }, + { + "epoch": 8.91055567031605, + "grad_norm": 10.564475059509277, + "learning_rate": 6.0504232541912266e-06, + "loss": 1.3453, + "step": 53920 + }, + { + "epoch": 8.912208221441851, + "grad_norm": 12.618019104003906, + "learning_rate": 6.041242035292606e-06, + "loss": 1.2849, + "step": 53930 + }, + { + "epoch": 8.913860772567652, + "grad_norm": 19.173622131347656, + "learning_rate": 6.032060816393985e-06, + "loss": 1.4845, + "step": 53940 + }, + { + "epoch": 8.915513323693451, + "grad_norm": 11.999756813049316, + "learning_rate": 6.0228795974953635e-06, + "loss": 1.3252, + "step": 53950 + }, + { + "epoch": 8.917165874819252, + "grad_norm": 25.721511840820312, + "learning_rate": 6.013698378596743e-06, + "loss": 1.3001, + "step": 53960 + }, + { + "epoch": 8.918818425945053, + "grad_norm": 14.201374053955078, + "learning_rate": 6.004517159698122e-06, + "loss": 1.4462, + "step": 53970 + }, + { + "epoch": 8.920470977070853, + "grad_norm": 14.782429695129395, + "learning_rate": 5.9953359407995e-06, + "loss": 1.5245, + "step": 53980 + }, + { + "epoch": 8.922123528196654, + "grad_norm": 13.691256523132324, + "learning_rate": 5.9861547219008805e-06, + "loss": 1.3555, + "step": 53990 + }, + { + "epoch": 8.923776079322455, + "grad_norm": 18.531856536865234, + "learning_rate": 5.976973503002259e-06, + "loss": 1.3047, + "step": 54000 + }, + { + "epoch": 8.925428630448254, + "grad_norm": 11.382560729980469, + "learning_rate": 5.967792284103637e-06, + "loss": 1.2516, + "step": 54010 + }, + { + "epoch": 8.927081181574055, + "grad_norm": 12.062386512756348, + "learning_rate": 5.958611065205017e-06, + "loss": 1.3224, + "step": 54020 + }, + { + "epoch": 8.928733732699856, + "grad_norm": 26.916439056396484, + "learning_rate": 5.949429846306396e-06, + "loss": 1.3987, + "step": 54030 + }, + { + "epoch": 8.930386283825655, + "grad_norm": 13.287517547607422, + "learning_rate": 5.940248627407775e-06, + "loss": 1.339, + "step": 54040 + }, + { + "epoch": 8.932038834951456, + "grad_norm": 27.684982299804688, + "learning_rate": 5.931067408509154e-06, + "loss": 1.2144, + "step": 54050 + }, + { + "epoch": 8.933691386077257, + "grad_norm": 12.099579811096191, + "learning_rate": 5.921886189610533e-06, + "loss": 1.2829, + "step": 54060 + }, + { + "epoch": 8.935343937203058, + "grad_norm": 12.295833587646484, + "learning_rate": 5.912704970711912e-06, + "loss": 1.3906, + "step": 54070 + }, + { + "epoch": 8.936996488328857, + "grad_norm": 15.061843872070312, + 
"learning_rate": 5.903523751813291e-06, + "loss": 1.4865, + "step": 54080 + }, + { + "epoch": 8.938649039454658, + "grad_norm": 10.409856796264648, + "learning_rate": 5.8943425329146695e-06, + "loss": 1.4299, + "step": 54090 + }, + { + "epoch": 8.940301590580459, + "grad_norm": 7.9486212730407715, + "learning_rate": 5.885161314016049e-06, + "loss": 1.2411, + "step": 54100 + }, + { + "epoch": 8.941954141706258, + "grad_norm": 11.08385944366455, + "learning_rate": 5.875980095117428e-06, + "loss": 1.3165, + "step": 54110 + }, + { + "epoch": 8.94360669283206, + "grad_norm": 15.459141731262207, + "learning_rate": 5.866798876218807e-06, + "loss": 1.431, + "step": 54120 + }, + { + "epoch": 8.94525924395786, + "grad_norm": 12.070756912231445, + "learning_rate": 5.8576176573201865e-06, + "loss": 1.3148, + "step": 54130 + }, + { + "epoch": 8.946911795083661, + "grad_norm": 15.25113582611084, + "learning_rate": 5.848436438421565e-06, + "loss": 1.4825, + "step": 54140 + }, + { + "epoch": 8.94856434620946, + "grad_norm": 14.48801326751709, + "learning_rate": 5.839255219522944e-06, + "loss": 1.3299, + "step": 54150 + }, + { + "epoch": 8.950216897335261, + "grad_norm": 40.4534797668457, + "learning_rate": 5.8300740006243234e-06, + "loss": 1.2875, + "step": 54160 + }, + { + "epoch": 8.951869448461062, + "grad_norm": 10.24796199798584, + "learning_rate": 5.820892781725702e-06, + "loss": 1.296, + "step": 54170 + }, + { + "epoch": 8.953521999586862, + "grad_norm": 12.344043731689453, + "learning_rate": 5.811711562827081e-06, + "loss": 1.4524, + "step": 54180 + }, + { + "epoch": 8.955174550712663, + "grad_norm": 16.066959381103516, + "learning_rate": 5.80253034392846e-06, + "loss": 1.2011, + "step": 54190 + }, + { + "epoch": 8.956827101838464, + "grad_norm": 16.067623138427734, + "learning_rate": 5.7933491250298396e-06, + "loss": 1.4874, + "step": 54200 + }, + { + "epoch": 8.958479652964265, + "grad_norm": 15.692643165588379, + "learning_rate": 5.784167906131218e-06, + "loss": 1.2578, + "step": 54210 + }, + { + "epoch": 8.960132204090064, + "grad_norm": 29.125408172607422, + "learning_rate": 5.774986687232597e-06, + "loss": 1.4732, + "step": 54220 + }, + { + "epoch": 8.961784755215865, + "grad_norm": 14.437169075012207, + "learning_rate": 5.7658054683339765e-06, + "loss": 1.3453, + "step": 54230 + }, + { + "epoch": 8.963437306341666, + "grad_norm": 14.426401138305664, + "learning_rate": 5.756624249435355e-06, + "loss": 1.3487, + "step": 54240 + }, + { + "epoch": 8.965089857467465, + "grad_norm": 17.095333099365234, + "learning_rate": 5.747443030536734e-06, + "loss": 1.3297, + "step": 54250 + }, + { + "epoch": 8.966742408593266, + "grad_norm": 15.15270709991455, + "learning_rate": 5.738261811638113e-06, + "loss": 1.2081, + "step": 54260 + }, + { + "epoch": 8.968394959719067, + "grad_norm": 17.099987030029297, + "learning_rate": 5.729080592739492e-06, + "loss": 1.2349, + "step": 54270 + }, + { + "epoch": 8.970047510844866, + "grad_norm": 19.881269454956055, + "learning_rate": 5.719899373840872e-06, + "loss": 1.3661, + "step": 54280 + }, + { + "epoch": 8.971700061970667, + "grad_norm": 21.131669998168945, + "learning_rate": 5.71071815494225e-06, + "loss": 1.2879, + "step": 54290 + }, + { + "epoch": 8.973352613096468, + "grad_norm": 19.318918228149414, + "learning_rate": 5.7015369360436295e-06, + "loss": 1.2724, + "step": 54300 + }, + { + "epoch": 8.975005164222267, + "grad_norm": 22.20404624938965, + "learning_rate": 5.692355717145009e-06, + "loss": 1.3058, + "step": 54310 + }, + { + "epoch": 8.976657715348068, 
+ "grad_norm": 9.28625202178955, + "learning_rate": 5.683174498246387e-06, + "loss": 1.3284, + "step": 54320 + }, + { + "epoch": 8.97831026647387, + "grad_norm": 18.054636001586914, + "learning_rate": 5.673993279347766e-06, + "loss": 1.2838, + "step": 54330 + }, + { + "epoch": 8.97996281759967, + "grad_norm": 13.409037590026855, + "learning_rate": 5.664812060449146e-06, + "loss": 1.2346, + "step": 54340 + }, + { + "epoch": 8.98161536872547, + "grad_norm": 15.619589805603027, + "learning_rate": 5.655630841550524e-06, + "loss": 1.3766, + "step": 54350 + }, + { + "epoch": 8.98326791985127, + "grad_norm": 11.095884323120117, + "learning_rate": 5.646449622651904e-06, + "loss": 1.3703, + "step": 54360 + }, + { + "epoch": 8.984920470977071, + "grad_norm": 14.479853630065918, + "learning_rate": 5.6372684037532826e-06, + "loss": 1.3841, + "step": 54370 + }, + { + "epoch": 8.98657302210287, + "grad_norm": 12.613340377807617, + "learning_rate": 5.628087184854662e-06, + "loss": 1.2455, + "step": 54380 + }, + { + "epoch": 8.988225573228672, + "grad_norm": 9.666672706604004, + "learning_rate": 5.618905965956041e-06, + "loss": 1.1269, + "step": 54390 + }, + { + "epoch": 8.989878124354473, + "grad_norm": 32.712371826171875, + "learning_rate": 5.6097247470574194e-06, + "loss": 1.4111, + "step": 54400 + }, + { + "epoch": 8.991530675480274, + "grad_norm": 13.545608520507812, + "learning_rate": 5.600543528158799e-06, + "loss": 1.3404, + "step": 54410 + }, + { + "epoch": 8.993183226606073, + "grad_norm": 12.60755729675293, + "learning_rate": 5.591362309260178e-06, + "loss": 1.2749, + "step": 54420 + }, + { + "epoch": 8.994835777731874, + "grad_norm": 17.72956657409668, + "learning_rate": 5.582181090361556e-06, + "loss": 1.3757, + "step": 54430 + }, + { + "epoch": 8.996488328857675, + "grad_norm": 8.734169006347656, + "learning_rate": 5.572999871462936e-06, + "loss": 1.2355, + "step": 54440 + }, + { + "epoch": 8.998140879983474, + "grad_norm": 10.11967945098877, + "learning_rate": 5.563818652564315e-06, + "loss": 1.1991, + "step": 54450 + }, + { + "epoch": 8.999793431109275, + "grad_norm": 13.364051818847656, + "learning_rate": 5.554637433665694e-06, + "loss": 1.3775, + "step": 54460 + }, + { + "epoch": 8.999958686221856, + "eval_accuracy": 0.33997730174492835, + "eval_loss": 2.3859617710113525, + "eval_runtime": 823.2799, + "eval_samples_per_second": 34.248, + "eval_steps_per_second": 8.562, + "step": 54461 + }, + { + "epoch": 9.001445982235076, + "grad_norm": 18.1331844329834, + "learning_rate": 5.5454562147670725e-06, + "loss": 1.433, + "step": 54470 + }, + { + "epoch": 9.003098533360875, + "grad_norm": 19.45754051208496, + "learning_rate": 5.536274995868452e-06, + "loss": 1.199, + "step": 54480 + }, + { + "epoch": 9.004751084486676, + "grad_norm": 6.5627970695495605, + "learning_rate": 5.527093776969831e-06, + "loss": 1.2614, + "step": 54490 + }, + { + "epoch": 9.006403635612477, + "grad_norm": 13.369138717651367, + "learning_rate": 5.517912558071209e-06, + "loss": 1.3375, + "step": 54500 + }, + { + "epoch": 9.008056186738278, + "grad_norm": 21.747480392456055, + "learning_rate": 5.508731339172589e-06, + "loss": 1.2666, + "step": 54510 + }, + { + "epoch": 9.009708737864077, + "grad_norm": 13.323081970214844, + "learning_rate": 5.499550120273968e-06, + "loss": 1.3027, + "step": 54520 + }, + { + "epoch": 9.011361288989878, + "grad_norm": 8.494196891784668, + "learning_rate": 5.490368901375347e-06, + "loss": 1.1626, + "step": 54530 + }, + { + "epoch": 9.01301384011568, + "grad_norm": 8.807066917419434, + 
"learning_rate": 5.481187682476726e-06, + "loss": 1.142, + "step": 54540 + }, + { + "epoch": 9.014666391241478, + "grad_norm": 21.88263702392578, + "learning_rate": 5.472006463578105e-06, + "loss": 1.3858, + "step": 54550 + }, + { + "epoch": 9.01631894236728, + "grad_norm": 16.97997283935547, + "learning_rate": 5.462825244679484e-06, + "loss": 1.4277, + "step": 54560 + }, + { + "epoch": 9.01797149349308, + "grad_norm": 17.69019889831543, + "learning_rate": 5.453644025780863e-06, + "loss": 1.4564, + "step": 54570 + }, + { + "epoch": 9.01962404461888, + "grad_norm": 21.331026077270508, + "learning_rate": 5.444462806882242e-06, + "loss": 1.2666, + "step": 54580 + }, + { + "epoch": 9.02127659574468, + "grad_norm": 21.188798904418945, + "learning_rate": 5.435281587983621e-06, + "loss": 1.3853, + "step": 54590 + }, + { + "epoch": 9.022929146870482, + "grad_norm": 17.578445434570312, + "learning_rate": 5.426100369085e-06, + "loss": 1.2737, + "step": 54600 + }, + { + "epoch": 9.024581697996283, + "grad_norm": 21.36452293395996, + "learning_rate": 5.4169191501863786e-06, + "loss": 1.319, + "step": 54610 + }, + { + "epoch": 9.026234249122082, + "grad_norm": 23.47857666015625, + "learning_rate": 5.407737931287759e-06, + "loss": 1.3926, + "step": 54620 + }, + { + "epoch": 9.027886800247883, + "grad_norm": 11.379927635192871, + "learning_rate": 5.398556712389137e-06, + "loss": 1.2875, + "step": 54630 + }, + { + "epoch": 9.029539351373684, + "grad_norm": 12.81651496887207, + "learning_rate": 5.3893754934905155e-06, + "loss": 1.1859, + "step": 54640 + }, + { + "epoch": 9.031191902499483, + "grad_norm": 13.319893836975098, + "learning_rate": 5.3801942745918956e-06, + "loss": 1.3343, + "step": 54650 + }, + { + "epoch": 9.032844453625284, + "grad_norm": 10.670450210571289, + "learning_rate": 5.371013055693274e-06, + "loss": 1.3357, + "step": 54660 + }, + { + "epoch": 9.034497004751085, + "grad_norm": 12.764323234558105, + "learning_rate": 5.361831836794653e-06, + "loss": 1.3346, + "step": 54670 + }, + { + "epoch": 9.036149555876884, + "grad_norm": 11.50831127166748, + "learning_rate": 5.3526506178960325e-06, + "loss": 1.2738, + "step": 54680 + }, + { + "epoch": 9.037802107002685, + "grad_norm": 14.689154624938965, + "learning_rate": 5.343469398997411e-06, + "loss": 1.3225, + "step": 54690 + }, + { + "epoch": 9.039454658128486, + "grad_norm": 12.108161926269531, + "learning_rate": 5.33428818009879e-06, + "loss": 1.2694, + "step": 54700 + }, + { + "epoch": 9.041107209254287, + "grad_norm": 12.230198860168457, + "learning_rate": 5.325106961200169e-06, + "loss": 1.336, + "step": 54710 + }, + { + "epoch": 9.042759760380086, + "grad_norm": 16.08524513244629, + "learning_rate": 5.315925742301548e-06, + "loss": 1.2942, + "step": 54720 + }, + { + "epoch": 9.044412311505887, + "grad_norm": 36.26041030883789, + "learning_rate": 5.306744523402928e-06, + "loss": 1.2881, + "step": 54730 + }, + { + "epoch": 9.046064862631688, + "grad_norm": 10.63325309753418, + "learning_rate": 5.297563304504306e-06, + "loss": 1.533, + "step": 54740 + }, + { + "epoch": 9.047717413757487, + "grad_norm": 20.543127059936523, + "learning_rate": 5.2883820856056855e-06, + "loss": 1.429, + "step": 54750 + }, + { + "epoch": 9.049369964883288, + "grad_norm": 16.287906646728516, + "learning_rate": 5.279200866707065e-06, + "loss": 1.3495, + "step": 54760 + }, + { + "epoch": 9.05102251600909, + "grad_norm": 18.479795455932617, + "learning_rate": 5.270019647808443e-06, + "loss": 1.3766, + "step": 54770 + }, + { + "epoch": 9.05267506713489, + 
"grad_norm": 12.131267547607422, + "learning_rate": 5.260838428909822e-06, + "loss": 1.388, + "step": 54780 + }, + { + "epoch": 9.05432761826069, + "grad_norm": 17.166824340820312, + "learning_rate": 5.251657210011202e-06, + "loss": 1.3733, + "step": 54790 + }, + { + "epoch": 9.05598016938649, + "grad_norm": 14.11315631866455, + "learning_rate": 5.24247599111258e-06, + "loss": 1.3204, + "step": 54800 + }, + { + "epoch": 9.057632720512292, + "grad_norm": 11.092327117919922, + "learning_rate": 5.233294772213959e-06, + "loss": 1.2731, + "step": 54810 + }, + { + "epoch": 9.05928527163809, + "grad_norm": 11.508016586303711, + "learning_rate": 5.2241135533153385e-06, + "loss": 1.3763, + "step": 54820 + }, + { + "epoch": 9.060937822763892, + "grad_norm": 13.264908790588379, + "learning_rate": 5.214932334416718e-06, + "loss": 1.1948, + "step": 54830 + }, + { + "epoch": 9.062590373889693, + "grad_norm": 16.212055206298828, + "learning_rate": 5.205751115518096e-06, + "loss": 1.318, + "step": 54840 + }, + { + "epoch": 9.064242925015492, + "grad_norm": 33.09046173095703, + "learning_rate": 5.1965698966194754e-06, + "loss": 1.28, + "step": 54850 + }, + { + "epoch": 9.065895476141293, + "grad_norm": 12.761731147766113, + "learning_rate": 5.187388677720855e-06, + "loss": 1.2515, + "step": 54860 + }, + { + "epoch": 9.067548027267094, + "grad_norm": 11.305936813354492, + "learning_rate": 5.178207458822233e-06, + "loss": 1.1792, + "step": 54870 + }, + { + "epoch": 9.069200578392895, + "grad_norm": 19.43776512145996, + "learning_rate": 5.169026239923612e-06, + "loss": 1.3766, + "step": 54880 + }, + { + "epoch": 9.070853129518694, + "grad_norm": 11.29062557220459, + "learning_rate": 5.1598450210249916e-06, + "loss": 1.2883, + "step": 54890 + }, + { + "epoch": 9.072505680644495, + "grad_norm": 12.61872673034668, + "learning_rate": 5.15066380212637e-06, + "loss": 1.245, + "step": 54900 + }, + { + "epoch": 9.074158231770296, + "grad_norm": 10.980611801147461, + "learning_rate": 5.14148258322775e-06, + "loss": 1.42, + "step": 54910 + }, + { + "epoch": 9.075810782896095, + "grad_norm": 13.058303833007812, + "learning_rate": 5.1323013643291285e-06, + "loss": 1.21, + "step": 54920 + }, + { + "epoch": 9.077463334021896, + "grad_norm": 21.789306640625, + "learning_rate": 5.123120145430508e-06, + "loss": 1.3207, + "step": 54930 + }, + { + "epoch": 9.079115885147697, + "grad_norm": 10.95693588256836, + "learning_rate": 5.113938926531887e-06, + "loss": 1.3505, + "step": 54940 + }, + { + "epoch": 9.080768436273496, + "grad_norm": 21.274913787841797, + "learning_rate": 5.104757707633265e-06, + "loss": 1.2573, + "step": 54950 + }, + { + "epoch": 9.082420987399297, + "grad_norm": 17.98501205444336, + "learning_rate": 5.095576488734645e-06, + "loss": 1.2962, + "step": 54960 + }, + { + "epoch": 9.084073538525098, + "grad_norm": 21.35386848449707, + "learning_rate": 5.086395269836024e-06, + "loss": 1.3419, + "step": 54970 + }, + { + "epoch": 9.0857260896509, + "grad_norm": 19.58464241027832, + "learning_rate": 5.077214050937402e-06, + "loss": 1.4037, + "step": 54980 + }, + { + "epoch": 9.087378640776699, + "grad_norm": 19.767635345458984, + "learning_rate": 5.068032832038782e-06, + "loss": 1.2972, + "step": 54990 + }, + { + "epoch": 9.0890311919025, + "grad_norm": 10.111676216125488, + "learning_rate": 5.058851613140161e-06, + "loss": 1.1995, + "step": 55000 + }, + { + "epoch": 9.0906837430283, + "grad_norm": 11.100005149841309, + "learning_rate": 5.049670394241539e-06, + "loss": 1.2342, + "step": 55010 + }, + { + "epoch": 
9.0923362941541, + "grad_norm": 13.454795837402344, + "learning_rate": 5.040489175342919e-06, + "loss": 1.2795, + "step": 55020 + }, + { + "epoch": 9.0939888452799, + "grad_norm": 16.521162033081055, + "learning_rate": 5.031307956444298e-06, + "loss": 1.3299, + "step": 55030 + }, + { + "epoch": 9.095641396405702, + "grad_norm": 12.513435363769531, + "learning_rate": 5.022126737545677e-06, + "loss": 1.2241, + "step": 55040 + }, + { + "epoch": 9.097293947531501, + "grad_norm": 33.393882751464844, + "learning_rate": 5.012945518647056e-06, + "loss": 1.3831, + "step": 55050 + }, + { + "epoch": 9.098946498657302, + "grad_norm": 12.040635108947754, + "learning_rate": 5.0037642997484345e-06, + "loss": 1.2954, + "step": 55060 + }, + { + "epoch": 9.100599049783103, + "grad_norm": 16.64748764038086, + "learning_rate": 4.994583080849814e-06, + "loss": 1.2867, + "step": 55070 + }, + { + "epoch": 9.102251600908904, + "grad_norm": 20.921030044555664, + "learning_rate": 4.985401861951193e-06, + "loss": 1.3136, + "step": 55080 + }, + { + "epoch": 9.103904152034703, + "grad_norm": 17.621395111083984, + "learning_rate": 4.9762206430525714e-06, + "loss": 1.2214, + "step": 55090 + }, + { + "epoch": 9.105556703160504, + "grad_norm": 13.227643013000488, + "learning_rate": 4.967039424153951e-06, + "loss": 1.3465, + "step": 55100 + }, + { + "epoch": 9.107209254286305, + "grad_norm": 12.755581855773926, + "learning_rate": 4.95785820525533e-06, + "loss": 1.3946, + "step": 55110 + }, + { + "epoch": 9.108861805412104, + "grad_norm": 19.701791763305664, + "learning_rate": 4.948676986356709e-06, + "loss": 1.2807, + "step": 55120 + }, + { + "epoch": 9.110514356537905, + "grad_norm": 16.122726440429688, + "learning_rate": 4.9394957674580884e-06, + "loss": 1.3864, + "step": 55130 + }, + { + "epoch": 9.112166907663706, + "grad_norm": 14.194814682006836, + "learning_rate": 4.930314548559467e-06, + "loss": 1.3725, + "step": 55140 + }, + { + "epoch": 9.113819458789507, + "grad_norm": 16.94427490234375, + "learning_rate": 4.921133329660846e-06, + "loss": 1.311, + "step": 55150 + }, + { + "epoch": 9.115472009915306, + "grad_norm": 41.79347229003906, + "learning_rate": 4.911952110762225e-06, + "loss": 1.498, + "step": 55160 + }, + { + "epoch": 9.117124561041107, + "grad_norm": 26.90255355834961, + "learning_rate": 4.902770891863604e-06, + "loss": 1.2616, + "step": 55170 + }, + { + "epoch": 9.118777112166908, + "grad_norm": 15.550649642944336, + "learning_rate": 4.893589672964983e-06, + "loss": 1.2621, + "step": 55180 + }, + { + "epoch": 9.120429663292708, + "grad_norm": 25.298446655273438, + "learning_rate": 4.884408454066362e-06, + "loss": 1.2517, + "step": 55190 + }, + { + "epoch": 9.122082214418509, + "grad_norm": 19.14631462097168, + "learning_rate": 4.8752272351677415e-06, + "loss": 1.353, + "step": 55200 + }, + { + "epoch": 9.12373476554431, + "grad_norm": 16.164554595947266, + "learning_rate": 4.86604601626912e-06, + "loss": 1.2734, + "step": 55210 + }, + { + "epoch": 9.125387316670109, + "grad_norm": 16.599605560302734, + "learning_rate": 4.856864797370499e-06, + "loss": 1.2069, + "step": 55220 + }, + { + "epoch": 9.12703986779591, + "grad_norm": 22.704082489013672, + "learning_rate": 4.847683578471878e-06, + "loss": 1.3562, + "step": 55230 + }, + { + "epoch": 9.12869241892171, + "grad_norm": 9.706205368041992, + "learning_rate": 4.838502359573257e-06, + "loss": 1.239, + "step": 55240 + }, + { + "epoch": 9.130344970047512, + "grad_norm": 19.193674087524414, + "learning_rate": 4.829321140674636e-06, + "loss": 1.3882, + 
"step": 55250 + }, + { + "epoch": 9.13199752117331, + "grad_norm": 9.436002731323242, + "learning_rate": 4.820139921776015e-06, + "loss": 1.3104, + "step": 55260 + }, + { + "epoch": 9.133650072299112, + "grad_norm": 12.764290809631348, + "learning_rate": 4.810958702877394e-06, + "loss": 1.238, + "step": 55270 + }, + { + "epoch": 9.135302623424913, + "grad_norm": 13.602463722229004, + "learning_rate": 4.801777483978774e-06, + "loss": 1.4013, + "step": 55280 + }, + { + "epoch": 9.136955174550712, + "grad_norm": 18.19413185119629, + "learning_rate": 4.792596265080152e-06, + "loss": 1.2348, + "step": 55290 + }, + { + "epoch": 9.138607725676513, + "grad_norm": 11.962725639343262, + "learning_rate": 4.783415046181531e-06, + "loss": 1.1874, + "step": 55300 + }, + { + "epoch": 9.140260276802314, + "grad_norm": 14.87602710723877, + "learning_rate": 4.774233827282911e-06, + "loss": 1.3184, + "step": 55310 + }, + { + "epoch": 9.141912827928113, + "grad_norm": 17.802339553833008, + "learning_rate": 4.765052608384289e-06, + "loss": 1.2428, + "step": 55320 + }, + { + "epoch": 9.143565379053914, + "grad_norm": 23.761425018310547, + "learning_rate": 4.755871389485668e-06, + "loss": 1.2894, + "step": 55330 + }, + { + "epoch": 9.145217930179715, + "grad_norm": 10.590774536132812, + "learning_rate": 4.7466901705870475e-06, + "loss": 1.2307, + "step": 55340 + }, + { + "epoch": 9.146870481305516, + "grad_norm": 11.037496566772461, + "learning_rate": 4.737508951688426e-06, + "loss": 1.2873, + "step": 55350 + }, + { + "epoch": 9.148523032431315, + "grad_norm": 14.964154243469238, + "learning_rate": 4.728327732789806e-06, + "loss": 1.295, + "step": 55360 + }, + { + "epoch": 9.150175583557116, + "grad_norm": 15.096177101135254, + "learning_rate": 4.7191465138911844e-06, + "loss": 1.2046, + "step": 55370 + }, + { + "epoch": 9.151828134682917, + "grad_norm": 8.96634578704834, + "learning_rate": 4.709965294992563e-06, + "loss": 1.382, + "step": 55380 + }, + { + "epoch": 9.153480685808717, + "grad_norm": 16.687162399291992, + "learning_rate": 4.700784076093943e-06, + "loss": 1.317, + "step": 55390 + }, + { + "epoch": 9.155133236934518, + "grad_norm": 15.783407211303711, + "learning_rate": 4.691602857195321e-06, + "loss": 1.3151, + "step": 55400 + }, + { + "epoch": 9.156785788060319, + "grad_norm": 18.55613136291504, + "learning_rate": 4.682421638296701e-06, + "loss": 1.402, + "step": 55410 + }, + { + "epoch": 9.158438339186118, + "grad_norm": 11.569503784179688, + "learning_rate": 4.67324041939808e-06, + "loss": 1.3886, + "step": 55420 + }, + { + "epoch": 9.160090890311919, + "grad_norm": 49.667762756347656, + "learning_rate": 4.664059200499458e-06, + "loss": 1.0859, + "step": 55430 + }, + { + "epoch": 9.16174344143772, + "grad_norm": 11.705397605895996, + "learning_rate": 4.6548779816008375e-06, + "loss": 1.2273, + "step": 55440 + }, + { + "epoch": 9.16339599256352, + "grad_norm": 15.881692886352539, + "learning_rate": 4.645696762702217e-06, + "loss": 1.388, + "step": 55450 + }, + { + "epoch": 9.16504854368932, + "grad_norm": 18.31934928894043, + "learning_rate": 4.636515543803595e-06, + "loss": 1.3677, + "step": 55460 + }, + { + "epoch": 9.16670109481512, + "grad_norm": 14.108166694641113, + "learning_rate": 4.627334324904974e-06, + "loss": 1.2068, + "step": 55470 + }, + { + "epoch": 9.168353645940922, + "grad_norm": 10.275944709777832, + "learning_rate": 4.618153106006354e-06, + "loss": 1.3015, + "step": 55480 + }, + { + "epoch": 9.170006197066721, + "grad_norm": 11.239326477050781, + "learning_rate": 
4.608971887107733e-06, + "loss": 1.3207, + "step": 55490 + }, + { + "epoch": 9.171658748192522, + "grad_norm": 16.125225067138672, + "learning_rate": 4.599790668209112e-06, + "loss": 1.2972, + "step": 55500 + }, + { + "epoch": 9.173311299318323, + "grad_norm": 15.761225700378418, + "learning_rate": 4.5906094493104905e-06, + "loss": 1.2291, + "step": 55510 + }, + { + "epoch": 9.174963850444122, + "grad_norm": 12.649540901184082, + "learning_rate": 4.58142823041187e-06, + "loss": 1.2749, + "step": 55520 + }, + { + "epoch": 9.176616401569923, + "grad_norm": 17.501537322998047, + "learning_rate": 4.572247011513249e-06, + "loss": 1.3203, + "step": 55530 + }, + { + "epoch": 9.178268952695724, + "grad_norm": 18.077924728393555, + "learning_rate": 4.563065792614627e-06, + "loss": 1.397, + "step": 55540 + }, + { + "epoch": 9.179921503821525, + "grad_norm": 16.778873443603516, + "learning_rate": 4.553884573716007e-06, + "loss": 1.2976, + "step": 55550 + }, + { + "epoch": 9.181574054947324, + "grad_norm": 13.572968482971191, + "learning_rate": 4.544703354817386e-06, + "loss": 1.2331, + "step": 55560 + }, + { + "epoch": 9.183226606073125, + "grad_norm": 13.623981475830078, + "learning_rate": 4.535522135918765e-06, + "loss": 1.3081, + "step": 55570 + }, + { + "epoch": 9.184879157198926, + "grad_norm": 11.336007118225098, + "learning_rate": 4.5263409170201436e-06, + "loss": 1.2609, + "step": 55580 + }, + { + "epoch": 9.186531708324726, + "grad_norm": 14.808525085449219, + "learning_rate": 4.517159698121523e-06, + "loss": 1.2853, + "step": 55590 + }, + { + "epoch": 9.188184259450527, + "grad_norm": 11.729154586791992, + "learning_rate": 4.507978479222902e-06, + "loss": 1.3402, + "step": 55600 + }, + { + "epoch": 9.189836810576328, + "grad_norm": 25.362701416015625, + "learning_rate": 4.4987972603242805e-06, + "loss": 1.5593, + "step": 55610 + }, + { + "epoch": 9.191489361702128, + "grad_norm": 10.813699722290039, + "learning_rate": 4.48961604142566e-06, + "loss": 1.233, + "step": 55620 + }, + { + "epoch": 9.193141912827928, + "grad_norm": 11.491537094116211, + "learning_rate": 4.480434822527039e-06, + "loss": 1.3878, + "step": 55630 + }, + { + "epoch": 9.194794463953729, + "grad_norm": 17.993120193481445, + "learning_rate": 4.471253603628417e-06, + "loss": 1.2426, + "step": 55640 + }, + { + "epoch": 9.19644701507953, + "grad_norm": 14.781805038452148, + "learning_rate": 4.4620723847297975e-06, + "loss": 1.2122, + "step": 55650 + }, + { + "epoch": 9.198099566205329, + "grad_norm": 12.504164695739746, + "learning_rate": 4.452891165831176e-06, + "loss": 1.3508, + "step": 55660 + }, + { + "epoch": 9.19975211733113, + "grad_norm": 14.548686027526855, + "learning_rate": 4.443709946932554e-06, + "loss": 1.3797, + "step": 55670 + }, + { + "epoch": 9.20140466845693, + "grad_norm": 14.047411918640137, + "learning_rate": 4.434528728033934e-06, + "loss": 1.1652, + "step": 55680 + }, + { + "epoch": 9.20305721958273, + "grad_norm": 27.93585205078125, + "learning_rate": 4.425347509135313e-06, + "loss": 1.2276, + "step": 55690 + }, + { + "epoch": 9.204709770708531, + "grad_norm": 14.484628677368164, + "learning_rate": 4.416166290236692e-06, + "loss": 1.2675, + "step": 55700 + }, + { + "epoch": 9.206362321834332, + "grad_norm": 23.827726364135742, + "learning_rate": 4.406985071338071e-06, + "loss": 1.306, + "step": 55710 + }, + { + "epoch": 9.208014872960133, + "grad_norm": 22.634979248046875, + "learning_rate": 4.39780385243945e-06, + "loss": 1.4343, + "step": 55720 + }, + { + "epoch": 9.209667424085932, + "grad_norm": 
17.955224990844727, + "learning_rate": 4.38862263354083e-06, + "loss": 1.3497, + "step": 55730 + }, + { + "epoch": 9.211319975211733, + "grad_norm": 12.252691268920898, + "learning_rate": 4.379441414642208e-06, + "loss": 1.2658, + "step": 55740 + }, + { + "epoch": 9.212972526337534, + "grad_norm": 29.16194725036621, + "learning_rate": 4.3702601957435865e-06, + "loss": 1.1545, + "step": 55750 + }, + { + "epoch": 9.214625077463333, + "grad_norm": 19.66663932800293, + "learning_rate": 4.361078976844967e-06, + "loss": 1.3138, + "step": 55760 + }, + { + "epoch": 9.216277628589134, + "grad_norm": 11.54080867767334, + "learning_rate": 4.351897757946345e-06, + "loss": 1.2657, + "step": 55770 + }, + { + "epoch": 9.217930179714935, + "grad_norm": 17.858362197875977, + "learning_rate": 4.342716539047724e-06, + "loss": 1.3622, + "step": 55780 + }, + { + "epoch": 9.219582730840735, + "grad_norm": 10.797994613647461, + "learning_rate": 4.3335353201491035e-06, + "loss": 1.4025, + "step": 55790 + }, + { + "epoch": 9.221235281966536, + "grad_norm": 12.868209838867188, + "learning_rate": 4.324354101250482e-06, + "loss": 1.3193, + "step": 55800 + }, + { + "epoch": 9.222887833092337, + "grad_norm": 12.669856071472168, + "learning_rate": 4.315172882351861e-06, + "loss": 1.3091, + "step": 55810 + }, + { + "epoch": 9.224540384218137, + "grad_norm": 14.927458763122559, + "learning_rate": 4.3059916634532404e-06, + "loss": 1.3628, + "step": 55820 + }, + { + "epoch": 9.226192935343937, + "grad_norm": 13.37768840789795, + "learning_rate": 4.296810444554619e-06, + "loss": 1.3404, + "step": 55830 + }, + { + "epoch": 9.227845486469738, + "grad_norm": 21.847307205200195, + "learning_rate": 4.287629225655998e-06, + "loss": 1.306, + "step": 55840 + }, + { + "epoch": 9.229498037595539, + "grad_norm": 40.5103759765625, + "learning_rate": 4.278448006757377e-06, + "loss": 1.3563, + "step": 55850 + }, + { + "epoch": 9.231150588721338, + "grad_norm": 9.23901653289795, + "learning_rate": 4.2692667878587566e-06, + "loss": 1.1886, + "step": 55860 + }, + { + "epoch": 9.232803139847139, + "grad_norm": 21.267831802368164, + "learning_rate": 4.260085568960135e-06, + "loss": 1.3202, + "step": 55870 + }, + { + "epoch": 9.23445569097294, + "grad_norm": 32.90507888793945, + "learning_rate": 4.250904350061514e-06, + "loss": 1.3244, + "step": 55880 + }, + { + "epoch": 9.23610824209874, + "grad_norm": 44.84469223022461, + "learning_rate": 4.2417231311628935e-06, + "loss": 1.3374, + "step": 55890 + }, + { + "epoch": 9.23776079322454, + "grad_norm": 21.209402084350586, + "learning_rate": 4.232541912264273e-06, + "loss": 1.3268, + "step": 55900 + }, + { + "epoch": 9.239413344350341, + "grad_norm": 10.94561767578125, + "learning_rate": 4.223360693365651e-06, + "loss": 1.3878, + "step": 55910 + }, + { + "epoch": 9.241065895476142, + "grad_norm": 24.05000114440918, + "learning_rate": 4.21417947446703e-06, + "loss": 1.2205, + "step": 55920 + }, + { + "epoch": 9.242718446601941, + "grad_norm": 17.849794387817383, + "learning_rate": 4.20499825556841e-06, + "loss": 1.2642, + "step": 55930 + }, + { + "epoch": 9.244370997727742, + "grad_norm": 22.153186798095703, + "learning_rate": 4.195817036669789e-06, + "loss": 1.2733, + "step": 55940 + }, + { + "epoch": 9.246023548853543, + "grad_norm": 15.071115493774414, + "learning_rate": 4.186635817771167e-06, + "loss": 1.2561, + "step": 55950 + }, + { + "epoch": 9.247676099979342, + "grad_norm": 22.48716926574707, + "learning_rate": 4.1774545988725465e-06, + "loss": 1.3194, + "step": 55960 + }, + { + "epoch": 
9.249328651105143, + "grad_norm": 18.10262680053711, + "learning_rate": 4.168273379973926e-06, + "loss": 1.3629, + "step": 55970 + }, + { + "epoch": 9.250981202230944, + "grad_norm": 14.13875961303711, + "learning_rate": 4.159092161075304e-06, + "loss": 1.286, + "step": 55980 + }, + { + "epoch": 9.252633753356745, + "grad_norm": 16.74396324157715, + "learning_rate": 4.149910942176683e-06, + "loss": 1.2687, + "step": 55990 + }, + { + "epoch": 9.254286304482545, + "grad_norm": 33.51347732543945, + "learning_rate": 4.140729723278063e-06, + "loss": 1.4942, + "step": 56000 + }, + { + "epoch": 9.255938855608346, + "grad_norm": 15.115519523620605, + "learning_rate": 4.131548504379441e-06, + "loss": 1.36, + "step": 56010 + }, + { + "epoch": 9.257591406734146, + "grad_norm": 25.56139373779297, + "learning_rate": 4.122367285480821e-06, + "loss": 1.4605, + "step": 56020 + }, + { + "epoch": 9.259243957859946, + "grad_norm": 20.03956413269043, + "learning_rate": 4.1131860665821995e-06, + "loss": 1.3127, + "step": 56030 + }, + { + "epoch": 9.260896508985747, + "grad_norm": 14.782129287719727, + "learning_rate": 4.104004847683579e-06, + "loss": 1.4149, + "step": 56040 + }, + { + "epoch": 9.262549060111548, + "grad_norm": 16.4676570892334, + "learning_rate": 4.094823628784958e-06, + "loss": 1.3314, + "step": 56050 + }, + { + "epoch": 9.264201611237347, + "grad_norm": 18.199886322021484, + "learning_rate": 4.0856424098863364e-06, + "loss": 1.3737, + "step": 56060 + }, + { + "epoch": 9.265854162363148, + "grad_norm": 19.445627212524414, + "learning_rate": 4.076461190987716e-06, + "loss": 1.3664, + "step": 56070 + }, + { + "epoch": 9.267506713488949, + "grad_norm": 14.629560470581055, + "learning_rate": 4.067279972089095e-06, + "loss": 1.3397, + "step": 56080 + }, + { + "epoch": 9.26915926461475, + "grad_norm": 22.30562400817871, + "learning_rate": 4.058098753190473e-06, + "loss": 1.3311, + "step": 56090 + }, + { + "epoch": 9.270811815740549, + "grad_norm": 9.496840476989746, + "learning_rate": 4.0489175342918534e-06, + "loss": 1.1589, + "step": 56100 + }, + { + "epoch": 9.27246436686635, + "grad_norm": 13.570216178894043, + "learning_rate": 4.039736315393232e-06, + "loss": 1.3121, + "step": 56110 + }, + { + "epoch": 9.274116917992151, + "grad_norm": 23.2502384185791, + "learning_rate": 4.030555096494611e-06, + "loss": 1.3267, + "step": 56120 + }, + { + "epoch": 9.27576946911795, + "grad_norm": 15.193344116210938, + "learning_rate": 4.02137387759599e-06, + "loss": 1.308, + "step": 56130 + }, + { + "epoch": 9.277422020243751, + "grad_norm": 18.408641815185547, + "learning_rate": 4.012192658697369e-06, + "loss": 1.3684, + "step": 56140 + }, + { + "epoch": 9.279074571369552, + "grad_norm": 16.26186180114746, + "learning_rate": 4.003011439798748e-06, + "loss": 1.3184, + "step": 56150 + }, + { + "epoch": 9.280727122495351, + "grad_norm": 15.509997367858887, + "learning_rate": 3.993830220900127e-06, + "loss": 1.3755, + "step": 56160 + }, + { + "epoch": 9.282379673621152, + "grad_norm": 10.246735572814941, + "learning_rate": 3.984649002001506e-06, + "loss": 1.4041, + "step": 56170 + }, + { + "epoch": 9.284032224746953, + "grad_norm": 14.112131118774414, + "learning_rate": 3.975467783102885e-06, + "loss": 1.2511, + "step": 56180 + }, + { + "epoch": 9.285684775872754, + "grad_norm": 14.345402717590332, + "learning_rate": 3.966286564204264e-06, + "loss": 1.3432, + "step": 56190 + }, + { + "epoch": 9.287337326998554, + "grad_norm": 13.019506454467773, + "learning_rate": 3.957105345305643e-06, + "loss": 1.4026, + 
"step": 56200 + }, + { + "epoch": 9.288989878124355, + "grad_norm": 10.218276023864746, + "learning_rate": 3.947924126407022e-06, + "loss": 1.3788, + "step": 56210 + }, + { + "epoch": 9.290642429250155, + "grad_norm": 10.781867980957031, + "learning_rate": 3.938742907508401e-06, + "loss": 1.2091, + "step": 56220 + }, + { + "epoch": 9.292294980375955, + "grad_norm": 8.732402801513672, + "learning_rate": 3.92956168860978e-06, + "loss": 1.2941, + "step": 56230 + }, + { + "epoch": 9.293947531501756, + "grad_norm": 19.886987686157227, + "learning_rate": 3.920380469711159e-06, + "loss": 1.2725, + "step": 56240 + }, + { + "epoch": 9.295600082627557, + "grad_norm": 13.052376747131348, + "learning_rate": 3.911199250812538e-06, + "loss": 1.2563, + "step": 56250 + }, + { + "epoch": 9.297252633753356, + "grad_norm": 9.23311710357666, + "learning_rate": 3.902018031913917e-06, + "loss": 1.2777, + "step": 56260 + }, + { + "epoch": 9.298905184879157, + "grad_norm": 17.674257278442383, + "learning_rate": 3.8928368130152956e-06, + "loss": 1.1415, + "step": 56270 + }, + { + "epoch": 9.300557736004958, + "grad_norm": 13.015941619873047, + "learning_rate": 3.883655594116676e-06, + "loss": 1.2718, + "step": 56280 + }, + { + "epoch": 9.302210287130759, + "grad_norm": 13.032365798950195, + "learning_rate": 3.874474375218054e-06, + "loss": 1.3148, + "step": 56290 + }, + { + "epoch": 9.303862838256558, + "grad_norm": 13.440291404724121, + "learning_rate": 3.865293156319433e-06, + "loss": 1.2976, + "step": 56300 + }, + { + "epoch": 9.305515389382359, + "grad_norm": 20.601293563842773, + "learning_rate": 3.8561119374208125e-06, + "loss": 1.2242, + "step": 56310 + }, + { + "epoch": 9.30716794050816, + "grad_norm": 14.70266342163086, + "learning_rate": 3.846930718522191e-06, + "loss": 1.3573, + "step": 56320 + }, + { + "epoch": 9.30882049163396, + "grad_norm": 18.35739517211914, + "learning_rate": 3.83774949962357e-06, + "loss": 1.2484, + "step": 56330 + }, + { + "epoch": 9.31047304275976, + "grad_norm": 14.594882011413574, + "learning_rate": 3.8285682807249494e-06, + "loss": 1.3821, + "step": 56340 + }, + { + "epoch": 9.312125593885561, + "grad_norm": 17.77569007873535, + "learning_rate": 3.819387061826328e-06, + "loss": 1.2919, + "step": 56350 + }, + { + "epoch": 9.313778145011362, + "grad_norm": 15.527255058288574, + "learning_rate": 3.8102058429277075e-06, + "loss": 1.29, + "step": 56360 + }, + { + "epoch": 9.315430696137161, + "grad_norm": 15.60554313659668, + "learning_rate": 3.8010246240290863e-06, + "loss": 1.2916, + "step": 56370 + }, + { + "epoch": 9.317083247262962, + "grad_norm": 14.84151554107666, + "learning_rate": 3.791843405130465e-06, + "loss": 1.2193, + "step": 56380 + }, + { + "epoch": 9.318735798388763, + "grad_norm": 15.908535957336426, + "learning_rate": 3.7826621862318444e-06, + "loss": 1.3779, + "step": 56390 + }, + { + "epoch": 9.320388349514563, + "grad_norm": 20.001300811767578, + "learning_rate": 3.7734809673332232e-06, + "loss": 1.3879, + "step": 56400 + }, + { + "epoch": 9.322040900640364, + "grad_norm": 25.633216857910156, + "learning_rate": 3.764299748434602e-06, + "loss": 1.4137, + "step": 56410 + }, + { + "epoch": 9.323693451766164, + "grad_norm": 14.804731369018555, + "learning_rate": 3.7551185295359817e-06, + "loss": 1.3807, + "step": 56420 + }, + { + "epoch": 9.325346002891964, + "grad_norm": 17.46161651611328, + "learning_rate": 3.74593731063736e-06, + "loss": 1.2958, + "step": 56430 + }, + { + "epoch": 9.326998554017765, + "grad_norm": 29.73974609375, + "learning_rate": 
3.736756091738739e-06, + "loss": 1.3127, + "step": 56440 + }, + { + "epoch": 9.328651105143566, + "grad_norm": 14.454090118408203, + "learning_rate": 3.7275748728401186e-06, + "loss": 1.3697, + "step": 56450 + }, + { + "epoch": 9.330303656269367, + "grad_norm": 16.23573875427246, + "learning_rate": 3.7183936539414975e-06, + "loss": 1.3834, + "step": 56460 + }, + { + "epoch": 9.331956207395166, + "grad_norm": 31.776811599731445, + "learning_rate": 3.7092124350428763e-06, + "loss": 1.3624, + "step": 56470 + }, + { + "epoch": 9.333608758520967, + "grad_norm": 17.408411026000977, + "learning_rate": 3.7000312161442555e-06, + "loss": 1.13, + "step": 56480 + }, + { + "epoch": 9.335261309646768, + "grad_norm": 14.618858337402344, + "learning_rate": 3.6908499972456343e-06, + "loss": 1.2647, + "step": 56490 + }, + { + "epoch": 9.336913860772567, + "grad_norm": 15.05654239654541, + "learning_rate": 3.681668778347014e-06, + "loss": 1.2852, + "step": 56500 + }, + { + "epoch": 9.338566411898368, + "grad_norm": 12.420156478881836, + "learning_rate": 3.6724875594483924e-06, + "loss": 1.43, + "step": 56510 + }, + { + "epoch": 9.340218963024169, + "grad_norm": 15.04156494140625, + "learning_rate": 3.6633063405497712e-06, + "loss": 1.207, + "step": 56520 + }, + { + "epoch": 9.341871514149968, + "grad_norm": 8.728355407714844, + "learning_rate": 3.654125121651151e-06, + "loss": 1.2025, + "step": 56530 + }, + { + "epoch": 9.34352406527577, + "grad_norm": 21.201353073120117, + "learning_rate": 3.6449439027525297e-06, + "loss": 1.0984, + "step": 56540 + }, + { + "epoch": 9.34517661640157, + "grad_norm": 11.79217529296875, + "learning_rate": 3.6357626838539086e-06, + "loss": 1.2759, + "step": 56550 + }, + { + "epoch": 9.346829167527371, + "grad_norm": 13.807501792907715, + "learning_rate": 3.626581464955288e-06, + "loss": 1.2994, + "step": 56560 + }, + { + "epoch": 9.34848171865317, + "grad_norm": 19.914011001586914, + "learning_rate": 3.6174002460566666e-06, + "loss": 1.3291, + "step": 56570 + }, + { + "epoch": 9.350134269778971, + "grad_norm": 16.1053409576416, + "learning_rate": 3.6082190271580455e-06, + "loss": 1.2639, + "step": 56580 + }, + { + "epoch": 9.351786820904772, + "grad_norm": 17.01410484313965, + "learning_rate": 3.5990378082594247e-06, + "loss": 1.4563, + "step": 56590 + }, + { + "epoch": 9.353439372030572, + "grad_norm": 12.761663436889648, + "learning_rate": 3.5898565893608035e-06, + "loss": 1.4376, + "step": 56600 + }, + { + "epoch": 9.355091923156373, + "grad_norm": 12.494538307189941, + "learning_rate": 3.5806753704621824e-06, + "loss": 1.4416, + "step": 56610 + }, + { + "epoch": 9.356744474282173, + "grad_norm": 15.235856056213379, + "learning_rate": 3.571494151563562e-06, + "loss": 1.2971, + "step": 56620 + }, + { + "epoch": 9.358397025407974, + "grad_norm": 22.249408721923828, + "learning_rate": 3.562312932664941e-06, + "loss": 1.3123, + "step": 56630 + }, + { + "epoch": 9.360049576533774, + "grad_norm": 13.691697120666504, + "learning_rate": 3.5531317137663197e-06, + "loss": 1.4081, + "step": 56640 + }, + { + "epoch": 9.361702127659575, + "grad_norm": 14.822437286376953, + "learning_rate": 3.543950494867699e-06, + "loss": 1.2504, + "step": 56650 + }, + { + "epoch": 9.363354678785376, + "grad_norm": 13.317008018493652, + "learning_rate": 3.5347692759690777e-06, + "loss": 1.2689, + "step": 56660 + }, + { + "epoch": 9.365007229911175, + "grad_norm": 19.28122901916504, + "learning_rate": 3.5255880570704566e-06, + "loss": 1.3562, + "step": 56670 + }, + { + "epoch": 9.366659781036976, + 
"grad_norm": 14.012068748474121, + "learning_rate": 3.516406838171836e-06, + "loss": 1.3528, + "step": 56680 + }, + { + "epoch": 9.368312332162777, + "grad_norm": 16.407472610473633, + "learning_rate": 3.5072256192732146e-06, + "loss": 1.1581, + "step": 56690 + }, + { + "epoch": 9.369964883288576, + "grad_norm": 13.19459342956543, + "learning_rate": 3.4980444003745943e-06, + "loss": 1.2654, + "step": 56700 + }, + { + "epoch": 9.371617434414377, + "grad_norm": 11.365413665771484, + "learning_rate": 3.488863181475973e-06, + "loss": 1.3031, + "step": 56710 + }, + { + "epoch": 9.373269985540178, + "grad_norm": 11.131091117858887, + "learning_rate": 3.479681962577352e-06, + "loss": 1.3596, + "step": 56720 + }, + { + "epoch": 9.374922536665977, + "grad_norm": 13.381197929382324, + "learning_rate": 3.470500743678731e-06, + "loss": 1.3861, + "step": 56730 + }, + { + "epoch": 9.376575087791778, + "grad_norm": 9.68941879272461, + "learning_rate": 3.46131952478011e-06, + "loss": 1.3179, + "step": 56740 + }, + { + "epoch": 9.37822763891758, + "grad_norm": 15.212055206298828, + "learning_rate": 3.452138305881489e-06, + "loss": 1.2758, + "step": 56750 + }, + { + "epoch": 9.37988019004338, + "grad_norm": 17.098337173461914, + "learning_rate": 3.442957086982868e-06, + "loss": 1.1934, + "step": 56760 + }, + { + "epoch": 9.38153274116918, + "grad_norm": 10.168761253356934, + "learning_rate": 3.433775868084247e-06, + "loss": 1.3831, + "step": 56770 + }, + { + "epoch": 9.38318529229498, + "grad_norm": 50.241641998291016, + "learning_rate": 3.4245946491856258e-06, + "loss": 1.4488, + "step": 56780 + }, + { + "epoch": 9.384837843420781, + "grad_norm": 18.76384162902832, + "learning_rate": 3.4154134302870054e-06, + "loss": 1.3276, + "step": 56790 + }, + { + "epoch": 9.38649039454658, + "grad_norm": 16.12409782409668, + "learning_rate": 3.4062322113883842e-06, + "loss": 1.2093, + "step": 56800 + }, + { + "epoch": 9.388142945672382, + "grad_norm": 16.562042236328125, + "learning_rate": 3.3970509924897626e-06, + "loss": 1.1709, + "step": 56810 + }, + { + "epoch": 9.389795496798182, + "grad_norm": 26.49726104736328, + "learning_rate": 3.3878697735911423e-06, + "loss": 1.3279, + "step": 56820 + }, + { + "epoch": 9.391448047923983, + "grad_norm": 13.345320701599121, + "learning_rate": 3.378688554692521e-06, + "loss": 1.2046, + "step": 56830 + }, + { + "epoch": 9.393100599049783, + "grad_norm": 16.027231216430664, + "learning_rate": 3.3695073357939e-06, + "loss": 1.3354, + "step": 56840 + }, + { + "epoch": 9.394753150175584, + "grad_norm": 10.626627922058105, + "learning_rate": 3.3603261168952792e-06, + "loss": 1.168, + "step": 56850 + }, + { + "epoch": 9.396405701301385, + "grad_norm": 18.5012264251709, + "learning_rate": 3.351144897996658e-06, + "loss": 1.4278, + "step": 56860 + }, + { + "epoch": 9.398058252427184, + "grad_norm": 21.47645378112793, + "learning_rate": 3.341963679098037e-06, + "loss": 1.3033, + "step": 56870 + }, + { + "epoch": 9.399710803552985, + "grad_norm": 17.78287696838379, + "learning_rate": 3.3327824601994165e-06, + "loss": 1.2744, + "step": 56880 + }, + { + "epoch": 9.401363354678786, + "grad_norm": 14.139810562133789, + "learning_rate": 3.323601241300795e-06, + "loss": 1.4479, + "step": 56890 + }, + { + "epoch": 9.403015905804585, + "grad_norm": 28.748075485229492, + "learning_rate": 3.3144200224021746e-06, + "loss": 1.2902, + "step": 56900 + }, + { + "epoch": 9.404668456930386, + "grad_norm": 46.95508575439453, + "learning_rate": 3.3052388035035534e-06, + "loss": 1.4943, + "step": 56910 + }, 
+ { + "epoch": 9.406321008056187, + "grad_norm": 15.258516311645508, + "learning_rate": 3.2960575846049323e-06, + "loss": 1.1904, + "step": 56920 + }, + { + "epoch": 9.407973559181988, + "grad_norm": 23.977991104125977, + "learning_rate": 3.2868763657063115e-06, + "loss": 1.2751, + "step": 56930 + }, + { + "epoch": 9.409626110307787, + "grad_norm": 12.30024242401123, + "learning_rate": 3.2776951468076903e-06, + "loss": 1.2913, + "step": 56940 + }, + { + "epoch": 9.411278661433588, + "grad_norm": 11.82300853729248, + "learning_rate": 3.268513927909069e-06, + "loss": 1.4331, + "step": 56950 + }, + { + "epoch": 9.41293121255939, + "grad_norm": 17.451065063476562, + "learning_rate": 3.259332709010449e-06, + "loss": 1.2351, + "step": 56960 + }, + { + "epoch": 9.414583763685188, + "grad_norm": 13.529333114624023, + "learning_rate": 3.2501514901118272e-06, + "loss": 1.1954, + "step": 56970 + }, + { + "epoch": 9.41623631481099, + "grad_norm": 14.515869140625, + "learning_rate": 3.240970271213206e-06, + "loss": 1.3682, + "step": 56980 + }, + { + "epoch": 9.41788886593679, + "grad_norm": 11.694517135620117, + "learning_rate": 3.2317890523145857e-06, + "loss": 1.3659, + "step": 56990 + }, + { + "epoch": 9.41954141706259, + "grad_norm": 27.488210678100586, + "learning_rate": 3.2226078334159645e-06, + "loss": 1.1186, + "step": 57000 + }, + { + "epoch": 9.42119396818839, + "grad_norm": 15.681142807006836, + "learning_rate": 3.2134266145173434e-06, + "loss": 1.2438, + "step": 57010 + }, + { + "epoch": 9.422846519314191, + "grad_norm": 19.548660278320312, + "learning_rate": 3.2042453956187226e-06, + "loss": 1.3773, + "step": 57020 + }, + { + "epoch": 9.424499070439992, + "grad_norm": 23.549972534179688, + "learning_rate": 3.1950641767201014e-06, + "loss": 1.308, + "step": 57030 + }, + { + "epoch": 9.426151621565792, + "grad_norm": 19.313648223876953, + "learning_rate": 3.1858829578214803e-06, + "loss": 1.2348, + "step": 57040 + }, + { + "epoch": 9.427804172691593, + "grad_norm": 32.71291732788086, + "learning_rate": 3.1767017389228595e-06, + "loss": 1.1344, + "step": 57050 + }, + { + "epoch": 9.429456723817394, + "grad_norm": 14.130952835083008, + "learning_rate": 3.1675205200242383e-06, + "loss": 1.3653, + "step": 57060 + }, + { + "epoch": 9.431109274943193, + "grad_norm": 17.445035934448242, + "learning_rate": 3.158339301125618e-06, + "loss": 1.3794, + "step": 57070 + }, + { + "epoch": 9.432761826068994, + "grad_norm": 14.413025856018066, + "learning_rate": 3.149158082226997e-06, + "loss": 1.3055, + "step": 57080 + }, + { + "epoch": 9.434414377194795, + "grad_norm": 17.203859329223633, + "learning_rate": 3.1399768633283757e-06, + "loss": 1.3073, + "step": 57090 + }, + { + "epoch": 9.436066928320596, + "grad_norm": 13.873775482177734, + "learning_rate": 3.130795644429755e-06, + "loss": 1.2085, + "step": 57100 + }, + { + "epoch": 9.437719479446395, + "grad_norm": 15.375999450683594, + "learning_rate": 3.1216144255311337e-06, + "loss": 1.326, + "step": 57110 + }, + { + "epoch": 9.439372030572196, + "grad_norm": 17.9966983795166, + "learning_rate": 3.112433206632513e-06, + "loss": 1.2816, + "step": 57120 + }, + { + "epoch": 9.441024581697997, + "grad_norm": 19.590351104736328, + "learning_rate": 3.103251987733892e-06, + "loss": 1.2491, + "step": 57130 + }, + { + "epoch": 9.442677132823796, + "grad_norm": 19.17728614807129, + "learning_rate": 3.0940707688352706e-06, + "loss": 1.3321, + "step": 57140 + }, + { + "epoch": 9.444329683949597, + "grad_norm": 17.96892547607422, + "learning_rate": 
3.08488954993665e-06, + "loss": 1.2419, + "step": 57150 + }, + { + "epoch": 9.445982235075398, + "grad_norm": 19.752483367919922, + "learning_rate": 3.0757083310380287e-06, + "loss": 1.3689, + "step": 57160 + }, + { + "epoch": 9.447634786201197, + "grad_norm": 16.997968673706055, + "learning_rate": 3.066527112139408e-06, + "loss": 1.3388, + "step": 57170 + }, + { + "epoch": 9.449287337326998, + "grad_norm": 13.03810977935791, + "learning_rate": 3.0573458932407868e-06, + "loss": 1.2395, + "step": 57180 + }, + { + "epoch": 9.4509398884528, + "grad_norm": 21.81022071838379, + "learning_rate": 3.0481646743421656e-06, + "loss": 1.2308, + "step": 57190 + }, + { + "epoch": 9.4525924395786, + "grad_norm": 19.233882904052734, + "learning_rate": 3.038983455443545e-06, + "loss": 1.135, + "step": 57200 + }, + { + "epoch": 9.4542449907044, + "grad_norm": 19.100370407104492, + "learning_rate": 3.029802236544924e-06, + "loss": 1.1726, + "step": 57210 + }, + { + "epoch": 9.4558975418302, + "grad_norm": 10.6184720993042, + "learning_rate": 3.020621017646303e-06, + "loss": 1.2419, + "step": 57220 + }, + { + "epoch": 9.457550092956001, + "grad_norm": 23.119884490966797, + "learning_rate": 3.0114397987476817e-06, + "loss": 1.253, + "step": 57230 + }, + { + "epoch": 9.4592026440818, + "grad_norm": 15.129219055175781, + "learning_rate": 3.002258579849061e-06, + "loss": 1.3053, + "step": 57240 + }, + { + "epoch": 9.460855195207602, + "grad_norm": 16.50132179260254, + "learning_rate": 2.9930773609504402e-06, + "loss": 1.2244, + "step": 57250 + }, + { + "epoch": 9.462507746333403, + "grad_norm": 12.94861125946045, + "learning_rate": 2.9838961420518186e-06, + "loss": 1.3387, + "step": 57260 + }, + { + "epoch": 9.464160297459202, + "grad_norm": 17.04841423034668, + "learning_rate": 2.974714923153198e-06, + "loss": 1.3652, + "step": 57270 + }, + { + "epoch": 9.465812848585003, + "grad_norm": 10.876708984375, + "learning_rate": 2.965533704254577e-06, + "loss": 1.2098, + "step": 57280 + }, + { + "epoch": 9.467465399710804, + "grad_norm": 11.755070686340332, + "learning_rate": 2.956352485355956e-06, + "loss": 1.299, + "step": 57290 + }, + { + "epoch": 9.469117950836605, + "grad_norm": 10.822213172912598, + "learning_rate": 2.9471712664573348e-06, + "loss": 1.2336, + "step": 57300 + }, + { + "epoch": 9.470770501962404, + "grad_norm": 19.71744155883789, + "learning_rate": 2.937990047558714e-06, + "loss": 1.3242, + "step": 57310 + }, + { + "epoch": 9.472423053088205, + "grad_norm": 21.902231216430664, + "learning_rate": 2.9288088286600933e-06, + "loss": 1.4042, + "step": 57320 + }, + { + "epoch": 9.474075604214006, + "grad_norm": 28.398181915283203, + "learning_rate": 2.919627609761472e-06, + "loss": 1.292, + "step": 57330 + }, + { + "epoch": 9.475728155339805, + "grad_norm": 17.9105281829834, + "learning_rate": 2.910446390862851e-06, + "loss": 1.4072, + "step": 57340 + }, + { + "epoch": 9.477380706465606, + "grad_norm": 24.03833770751953, + "learning_rate": 2.90126517196423e-06, + "loss": 1.3059, + "step": 57350 + }, + { + "epoch": 9.479033257591407, + "grad_norm": 18.128108978271484, + "learning_rate": 2.892083953065609e-06, + "loss": 1.3698, + "step": 57360 + }, + { + "epoch": 9.480685808717206, + "grad_norm": 15.403632164001465, + "learning_rate": 2.8829027341669882e-06, + "loss": 1.2558, + "step": 57370 + }, + { + "epoch": 9.482338359843007, + "grad_norm": 12.787527084350586, + "learning_rate": 2.873721515268367e-06, + "loss": 1.3369, + "step": 57380 + }, + { + "epoch": 9.483990910968808, + "grad_norm": 
13.58300495147705, + "learning_rate": 2.864540296369746e-06, + "loss": 1.2305, + "step": 57390 + }, + { + "epoch": 9.48564346209461, + "grad_norm": 8.516387939453125, + "learning_rate": 2.855359077471125e-06, + "loss": 1.184, + "step": 57400 + }, + { + "epoch": 9.487296013220408, + "grad_norm": 12.774700164794922, + "learning_rate": 2.8461778585725044e-06, + "loss": 1.3685, + "step": 57410 + }, + { + "epoch": 9.48894856434621, + "grad_norm": 18.586366653442383, + "learning_rate": 2.836996639673883e-06, + "loss": 1.3613, + "step": 57420 + }, + { + "epoch": 9.49060111547201, + "grad_norm": 19.410852432250977, + "learning_rate": 2.827815420775262e-06, + "loss": 1.3254, + "step": 57430 + }, + { + "epoch": 9.49225366659781, + "grad_norm": 37.648040771484375, + "learning_rate": 2.8186342018766413e-06, + "loss": 1.3388, + "step": 57440 + }, + { + "epoch": 9.49390621772361, + "grad_norm": 11.228830337524414, + "learning_rate": 2.8094529829780205e-06, + "loss": 1.2954, + "step": 57450 + }, + { + "epoch": 9.495558768849412, + "grad_norm": 17.186487197875977, + "learning_rate": 2.8002717640793993e-06, + "loss": 1.3855, + "step": 57460 + }, + { + "epoch": 9.49721131997521, + "grad_norm": 14.050568580627441, + "learning_rate": 2.791090545180778e-06, + "loss": 1.3872, + "step": 57470 + }, + { + "epoch": 9.498863871101012, + "grad_norm": 57.6312370300293, + "learning_rate": 2.7819093262821574e-06, + "loss": 1.3453, + "step": 57480 + }, + { + "epoch": 9.500516422226813, + "grad_norm": 16.523967742919922, + "learning_rate": 2.7727281073835362e-06, + "loss": 1.3147, + "step": 57490 + }, + { + "epoch": 9.502168973352614, + "grad_norm": 18.444759368896484, + "learning_rate": 2.7635468884849155e-06, + "loss": 1.3901, + "step": 57500 + }, + { + "epoch": 9.503821524478413, + "grad_norm": 10.994654655456543, + "learning_rate": 2.7543656695862943e-06, + "loss": 1.3167, + "step": 57510 + }, + { + "epoch": 9.505474075604214, + "grad_norm": 12.720823287963867, + "learning_rate": 2.7451844506876736e-06, + "loss": 1.2899, + "step": 57520 + }, + { + "epoch": 9.507126626730015, + "grad_norm": 16.59524917602539, + "learning_rate": 2.7360032317890524e-06, + "loss": 1.25, + "step": 57530 + }, + { + "epoch": 9.508779177855814, + "grad_norm": 14.175232887268066, + "learning_rate": 2.7268220128904316e-06, + "loss": 1.3248, + "step": 57540 + }, + { + "epoch": 9.510431728981615, + "grad_norm": 16.468807220458984, + "learning_rate": 2.7176407939918105e-06, + "loss": 1.3374, + "step": 57550 + }, + { + "epoch": 9.512084280107416, + "grad_norm": 26.731430053710938, + "learning_rate": 2.7084595750931893e-06, + "loss": 1.2489, + "step": 57560 + }, + { + "epoch": 9.513736831233217, + "grad_norm": 9.305521011352539, + "learning_rate": 2.6992783561945685e-06, + "loss": 1.3049, + "step": 57570 + }, + { + "epoch": 9.515389382359016, + "grad_norm": 12.240220069885254, + "learning_rate": 2.6900971372959478e-06, + "loss": 1.3189, + "step": 57580 + }, + { + "epoch": 9.517041933484817, + "grad_norm": 76.72401428222656, + "learning_rate": 2.6809159183973266e-06, + "loss": 1.3185, + "step": 57590 + }, + { + "epoch": 9.518694484610618, + "grad_norm": 13.349190711975098, + "learning_rate": 2.6717346994987054e-06, + "loss": 1.4297, + "step": 57600 + }, + { + "epoch": 9.520347035736417, + "grad_norm": 21.3134765625, + "learning_rate": 2.6625534806000847e-06, + "loss": 1.1601, + "step": 57610 + }, + { + "epoch": 9.521999586862218, + "grad_norm": 11.682439804077148, + "learning_rate": 2.653372261701464e-06, + "loss": 1.2994, + "step": 57620 + }, + { + 
"epoch": 9.52365213798802, + "grad_norm": 17.29874038696289, + "learning_rate": 2.6441910428028427e-06, + "loss": 1.3704, + "step": 57630 + }, + { + "epoch": 9.525304689113819, + "grad_norm": 9.633611679077148, + "learning_rate": 2.6350098239042216e-06, + "loss": 1.2424, + "step": 57640 + }, + { + "epoch": 9.52695724023962, + "grad_norm": 73.63542175292969, + "learning_rate": 2.625828605005601e-06, + "loss": 1.3749, + "step": 57650 + }, + { + "epoch": 9.52860979136542, + "grad_norm": 20.629478454589844, + "learning_rate": 2.6166473861069796e-06, + "loss": 1.2985, + "step": 57660 + }, + { + "epoch": 9.530262342491222, + "grad_norm": 15.100749969482422, + "learning_rate": 2.607466167208359e-06, + "loss": 1.272, + "step": 57670 + }, + { + "epoch": 9.53191489361702, + "grad_norm": 16.76715850830078, + "learning_rate": 2.5982849483097377e-06, + "loss": 1.283, + "step": 57680 + }, + { + "epoch": 9.533567444742822, + "grad_norm": 23.314260482788086, + "learning_rate": 2.5891037294111165e-06, + "loss": 1.3745, + "step": 57690 + }, + { + "epoch": 9.535219995868623, + "grad_norm": 25.0333194732666, + "learning_rate": 2.5799225105124958e-06, + "loss": 1.1921, + "step": 57700 + }, + { + "epoch": 9.536872546994422, + "grad_norm": 14.260005950927734, + "learning_rate": 2.570741291613875e-06, + "loss": 1.2287, + "step": 57710 + }, + { + "epoch": 9.538525098120223, + "grad_norm": 20.597248077392578, + "learning_rate": 2.561560072715254e-06, + "loss": 1.3879, + "step": 57720 + }, + { + "epoch": 9.540177649246024, + "grad_norm": 15.62580680847168, + "learning_rate": 2.5523788538166327e-06, + "loss": 1.2893, + "step": 57730 + }, + { + "epoch": 9.541830200371823, + "grad_norm": 11.68388557434082, + "learning_rate": 2.543197634918012e-06, + "loss": 1.2051, + "step": 57740 + }, + { + "epoch": 9.543482751497624, + "grad_norm": 15.208742141723633, + "learning_rate": 2.534016416019391e-06, + "loss": 1.3074, + "step": 57750 + }, + { + "epoch": 9.545135302623425, + "grad_norm": 15.40304946899414, + "learning_rate": 2.5248351971207696e-06, + "loss": 1.3454, + "step": 57760 + }, + { + "epoch": 9.546787853749226, + "grad_norm": 21.844449996948242, + "learning_rate": 2.515653978222149e-06, + "loss": 1.1692, + "step": 57770 + }, + { + "epoch": 9.548440404875025, + "grad_norm": 16.35841178894043, + "learning_rate": 2.506472759323528e-06, + "loss": 1.386, + "step": 57780 + }, + { + "epoch": 9.550092956000826, + "grad_norm": 15.275668144226074, + "learning_rate": 2.497291540424907e-06, + "loss": 1.3673, + "step": 57790 + }, + { + "epoch": 9.551745507126627, + "grad_norm": 13.76074504852295, + "learning_rate": 2.4881103215262857e-06, + "loss": 1.3039, + "step": 57800 + }, + { + "epoch": 9.553398058252426, + "grad_norm": 14.345163345336914, + "learning_rate": 2.478929102627665e-06, + "loss": 1.4154, + "step": 57810 + }, + { + "epoch": 9.555050609378227, + "grad_norm": 12.346527099609375, + "learning_rate": 2.4697478837290442e-06, + "loss": 1.2726, + "step": 57820 + }, + { + "epoch": 9.556703160504028, + "grad_norm": 19.1199893951416, + "learning_rate": 2.460566664830423e-06, + "loss": 1.263, + "step": 57830 + }, + { + "epoch": 9.55835571162983, + "grad_norm": 14.129487037658691, + "learning_rate": 2.451385445931802e-06, + "loss": 1.3791, + "step": 57840 + }, + { + "epoch": 9.560008262755629, + "grad_norm": 15.209273338317871, + "learning_rate": 2.442204227033181e-06, + "loss": 1.2314, + "step": 57850 + }, + { + "epoch": 9.56166081388143, + "grad_norm": 15.329292297363281, + "learning_rate": 2.43302300813456e-06, + "loss": 
1.2872, + "step": 57860 + }, + { + "epoch": 9.56331336500723, + "grad_norm": 16.666093826293945, + "learning_rate": 2.423841789235939e-06, + "loss": 1.3597, + "step": 57870 + }, + { + "epoch": 9.56496591613303, + "grad_norm": 13.654024124145508, + "learning_rate": 2.414660570337318e-06, + "loss": 1.2514, + "step": 57880 + }, + { + "epoch": 9.56661846725883, + "grad_norm": 12.923748016357422, + "learning_rate": 2.405479351438697e-06, + "loss": 1.1992, + "step": 57890 + }, + { + "epoch": 9.568271018384632, + "grad_norm": 12.820647239685059, + "learning_rate": 2.396298132540076e-06, + "loss": 1.3571, + "step": 57900 + }, + { + "epoch": 9.569923569510431, + "grad_norm": 16.327159881591797, + "learning_rate": 2.3871169136414553e-06, + "loss": 1.3175, + "step": 57910 + }, + { + "epoch": 9.571576120636232, + "grad_norm": 24.59794807434082, + "learning_rate": 2.377935694742834e-06, + "loss": 1.3611, + "step": 57920 + }, + { + "epoch": 9.573228671762033, + "grad_norm": 12.740336418151855, + "learning_rate": 2.368754475844213e-06, + "loss": 1.344, + "step": 57930 + }, + { + "epoch": 9.574881222887832, + "grad_norm": 17.685501098632812, + "learning_rate": 2.3595732569455922e-06, + "loss": 1.394, + "step": 57940 + }, + { + "epoch": 9.576533774013633, + "grad_norm": 19.866283416748047, + "learning_rate": 2.3503920380469715e-06, + "loss": 1.1501, + "step": 57950 + }, + { + "epoch": 9.578186325139434, + "grad_norm": 14.622546195983887, + "learning_rate": 2.3412108191483503e-06, + "loss": 1.2882, + "step": 57960 + }, + { + "epoch": 9.579838876265235, + "grad_norm": 16.59194564819336, + "learning_rate": 2.332029600249729e-06, + "loss": 1.3115, + "step": 57970 + }, + { + "epoch": 9.581491427391034, + "grad_norm": 18.110830307006836, + "learning_rate": 2.3228483813511084e-06, + "loss": 1.2563, + "step": 57980 + }, + { + "epoch": 9.583143978516835, + "grad_norm": 14.122636795043945, + "learning_rate": 2.313667162452487e-06, + "loss": 1.3451, + "step": 57990 + }, + { + "epoch": 9.584796529642636, + "grad_norm": 14.215288162231445, + "learning_rate": 2.3044859435538664e-06, + "loss": 1.2917, + "step": 58000 + }, + { + "epoch": 9.586449080768435, + "grad_norm": 100.20287322998047, + "learning_rate": 2.2953047246552453e-06, + "loss": 1.2025, + "step": 58010 + }, + { + "epoch": 9.588101631894236, + "grad_norm": 12.98987102508545, + "learning_rate": 2.2861235057566245e-06, + "loss": 1.2286, + "step": 58020 + }, + { + "epoch": 9.589754183020037, + "grad_norm": 17.084440231323242, + "learning_rate": 2.2769422868580033e-06, + "loss": 1.2378, + "step": 58030 + }, + { + "epoch": 9.591406734145838, + "grad_norm": 17.176267623901367, + "learning_rate": 2.2677610679593826e-06, + "loss": 1.4078, + "step": 58040 + }, + { + "epoch": 9.593059285271638, + "grad_norm": 38.53559112548828, + "learning_rate": 2.2585798490607614e-06, + "loss": 1.3422, + "step": 58050 + }, + { + "epoch": 9.594711836397439, + "grad_norm": 11.070537567138672, + "learning_rate": 2.2493986301621402e-06, + "loss": 1.3479, + "step": 58060 + }, + { + "epoch": 9.59636438752324, + "grad_norm": 19.474384307861328, + "learning_rate": 2.2402174112635195e-06, + "loss": 1.2747, + "step": 58070 + }, + { + "epoch": 9.598016938649039, + "grad_norm": 15.456579208374023, + "learning_rate": 2.2310361923648987e-06, + "loss": 1.3125, + "step": 58080 + }, + { + "epoch": 9.59966948977484, + "grad_norm": 15.075743675231934, + "learning_rate": 2.221854973466277e-06, + "loss": 1.3164, + "step": 58090 + }, + { + "epoch": 9.60132204090064, + "grad_norm": 25.315460205078125, + 
"learning_rate": 2.2126737545676564e-06, + "loss": 1.347, + "step": 58100 + }, + { + "epoch": 9.60297459202644, + "grad_norm": 21.648412704467773, + "learning_rate": 2.2034925356690356e-06, + "loss": 1.332, + "step": 58110 + }, + { + "epoch": 9.604627143152241, + "grad_norm": 15.424687385559082, + "learning_rate": 2.194311316770415e-06, + "loss": 1.3377, + "step": 58120 + }, + { + "epoch": 9.606279694278042, + "grad_norm": 36.263980865478516, + "learning_rate": 2.1851300978717933e-06, + "loss": 1.3506, + "step": 58130 + }, + { + "epoch": 9.607932245403843, + "grad_norm": 28.31026840209961, + "learning_rate": 2.1759488789731725e-06, + "loss": 1.2666, + "step": 58140 + }, + { + "epoch": 9.609584796529642, + "grad_norm": 17.763492584228516, + "learning_rate": 2.1667676600745518e-06, + "loss": 1.3172, + "step": 58150 + }, + { + "epoch": 9.611237347655443, + "grad_norm": 16.161304473876953, + "learning_rate": 2.1575864411759306e-06, + "loss": 1.2002, + "step": 58160 + }, + { + "epoch": 9.612889898781244, + "grad_norm": 15.862537384033203, + "learning_rate": 2.1484052222773094e-06, + "loss": 1.4035, + "step": 58170 + }, + { + "epoch": 9.614542449907043, + "grad_norm": 13.134415626525879, + "learning_rate": 2.1392240033786887e-06, + "loss": 1.2955, + "step": 58180 + }, + { + "epoch": 9.616195001032844, + "grad_norm": 14.605551719665527, + "learning_rate": 2.1300427844800675e-06, + "loss": 1.4105, + "step": 58190 + }, + { + "epoch": 9.617847552158645, + "grad_norm": 18.962343215942383, + "learning_rate": 2.1208615655814467e-06, + "loss": 1.2919, + "step": 58200 + }, + { + "epoch": 9.619500103284444, + "grad_norm": 13.857765197753906, + "learning_rate": 2.1116803466828256e-06, + "loss": 1.2513, + "step": 58210 + }, + { + "epoch": 9.621152654410245, + "grad_norm": 23.394207000732422, + "learning_rate": 2.102499127784205e-06, + "loss": 1.3572, + "step": 58220 + }, + { + "epoch": 9.622805205536046, + "grad_norm": 14.980634689331055, + "learning_rate": 2.0933179088855836e-06, + "loss": 1.2912, + "step": 58230 + }, + { + "epoch": 9.624457756661847, + "grad_norm": 20.3941593170166, + "learning_rate": 2.084136689986963e-06, + "loss": 1.3388, + "step": 58240 + }, + { + "epoch": 9.626110307787647, + "grad_norm": 14.584732055664062, + "learning_rate": 2.0749554710883417e-06, + "loss": 1.252, + "step": 58250 + }, + { + "epoch": 9.627762858913448, + "grad_norm": 23.42914581298828, + "learning_rate": 2.0657742521897205e-06, + "loss": 1.2462, + "step": 58260 + }, + { + "epoch": 9.629415410039249, + "grad_norm": 10.313939094543457, + "learning_rate": 2.0565930332910998e-06, + "loss": 1.3573, + "step": 58270 + }, + { + "epoch": 9.631067961165048, + "grad_norm": 11.711010932922363, + "learning_rate": 2.047411814392479e-06, + "loss": 1.3872, + "step": 58280 + }, + { + "epoch": 9.632720512290849, + "grad_norm": 16.497787475585938, + "learning_rate": 2.038230595493858e-06, + "loss": 1.3352, + "step": 58290 + }, + { + "epoch": 9.63437306341665, + "grad_norm": 9.749482154846191, + "learning_rate": 2.0290493765952367e-06, + "loss": 1.2596, + "step": 58300 + }, + { + "epoch": 9.63602561454245, + "grad_norm": 27.247074127197266, + "learning_rate": 2.019868157696616e-06, + "loss": 1.1729, + "step": 58310 + }, + { + "epoch": 9.63767816566825, + "grad_norm": 28.22972869873047, + "learning_rate": 2.010686938797995e-06, + "loss": 1.2017, + "step": 58320 + }, + { + "epoch": 9.639330716794051, + "grad_norm": 16.79508399963379, + "learning_rate": 2.001505719899374e-06, + "loss": 1.2413, + "step": 58330 + }, + { + "epoch": 
9.640983267919852, + "grad_norm": 15.924692153930664, + "learning_rate": 1.992324501000753e-06, + "loss": 1.2305, + "step": 58340 + }, + { + "epoch": 9.642635819045651, + "grad_norm": 12.7982816696167, + "learning_rate": 1.983143282102132e-06, + "loss": 1.3867, + "step": 58350 + }, + { + "epoch": 9.644288370171452, + "grad_norm": 15.45654296875, + "learning_rate": 1.973962063203511e-06, + "loss": 1.2799, + "step": 58360 + }, + { + "epoch": 9.645940921297253, + "grad_norm": 16.585647583007812, + "learning_rate": 1.96478084430489e-06, + "loss": 1.3226, + "step": 58370 + }, + { + "epoch": 9.647593472423052, + "grad_norm": 23.49603843688965, + "learning_rate": 1.955599625406269e-06, + "loss": 1.2294, + "step": 58380 + }, + { + "epoch": 9.649246023548853, + "grad_norm": 15.694708824157715, + "learning_rate": 1.9464184065076478e-06, + "loss": 1.1692, + "step": 58390 + }, + { + "epoch": 9.650898574674654, + "grad_norm": 13.695332527160645, + "learning_rate": 1.937237187609027e-06, + "loss": 1.2909, + "step": 58400 + }, + { + "epoch": 9.652551125800455, + "grad_norm": 9.972579002380371, + "learning_rate": 1.9280559687104063e-06, + "loss": 1.2341, + "step": 58410 + }, + { + "epoch": 9.654203676926254, + "grad_norm": 14.860621452331543, + "learning_rate": 1.918874749811785e-06, + "loss": 1.2629, + "step": 58420 + }, + { + "epoch": 9.655856228052055, + "grad_norm": 11.948851585388184, + "learning_rate": 1.909693530913164e-06, + "loss": 1.2555, + "step": 58430 + }, + { + "epoch": 9.657508779177856, + "grad_norm": 22.112714767456055, + "learning_rate": 1.9005123120145432e-06, + "loss": 1.3156, + "step": 58440 + }, + { + "epoch": 9.659161330303656, + "grad_norm": 24.64064598083496, + "learning_rate": 1.8913310931159222e-06, + "loss": 1.4058, + "step": 58450 + }, + { + "epoch": 9.660813881429457, + "grad_norm": 14.680100440979004, + "learning_rate": 1.882149874217301e-06, + "loss": 1.357, + "step": 58460 + }, + { + "epoch": 9.662466432555258, + "grad_norm": 13.727468490600586, + "learning_rate": 1.87296865531868e-06, + "loss": 1.3138, + "step": 58470 + }, + { + "epoch": 9.664118983681057, + "grad_norm": 17.917619705200195, + "learning_rate": 1.8637874364200593e-06, + "loss": 1.286, + "step": 58480 + }, + { + "epoch": 9.665771534806858, + "grad_norm": 13.282672882080078, + "learning_rate": 1.8546062175214381e-06, + "loss": 1.253, + "step": 58490 + }, + { + "epoch": 9.667424085932659, + "grad_norm": 13.979046821594238, + "learning_rate": 1.8454249986228172e-06, + "loss": 1.2243, + "step": 58500 + }, + { + "epoch": 9.66907663705846, + "grad_norm": 16.821653366088867, + "learning_rate": 1.8362437797241962e-06, + "loss": 1.2361, + "step": 58510 + }, + { + "epoch": 9.670729188184259, + "grad_norm": 14.560995101928711, + "learning_rate": 1.8270625608255755e-06, + "loss": 1.2431, + "step": 58520 + }, + { + "epoch": 9.67238173931006, + "grad_norm": 15.146513938903809, + "learning_rate": 1.8178813419269543e-06, + "loss": 1.1976, + "step": 58530 + }, + { + "epoch": 9.674034290435861, + "grad_norm": 15.486896514892578, + "learning_rate": 1.8087001230283333e-06, + "loss": 1.3143, + "step": 58540 + }, + { + "epoch": 9.67568684156166, + "grad_norm": 18.165647506713867, + "learning_rate": 1.7995189041297124e-06, + "loss": 1.2991, + "step": 58550 + }, + { + "epoch": 9.677339392687461, + "grad_norm": 17.75432586669922, + "learning_rate": 1.7903376852310912e-06, + "loss": 1.1735, + "step": 58560 + }, + { + "epoch": 9.678991943813262, + "grad_norm": 17.992963790893555, + "learning_rate": 1.7811564663324704e-06, + "loss": 
1.3001, + "step": 58570 + }, + { + "epoch": 9.680644494939063, + "grad_norm": 65.19438171386719, + "learning_rate": 1.7719752474338495e-06, + "loss": 1.3303, + "step": 58580 + }, + { + "epoch": 9.682297046064862, + "grad_norm": 18.94664192199707, + "learning_rate": 1.7627940285352283e-06, + "loss": 1.2772, + "step": 58590 + }, + { + "epoch": 9.683949597190663, + "grad_norm": 22.916940689086914, + "learning_rate": 1.7536128096366073e-06, + "loss": 1.3774, + "step": 58600 + }, + { + "epoch": 9.685602148316464, + "grad_norm": 17.095354080200195, + "learning_rate": 1.7444315907379866e-06, + "loss": 1.2863, + "step": 58610 + }, + { + "epoch": 9.687254699442263, + "grad_norm": 12.96331787109375, + "learning_rate": 1.7352503718393656e-06, + "loss": 1.4177, + "step": 58620 + }, + { + "epoch": 9.688907250568064, + "grad_norm": 20.66131019592285, + "learning_rate": 1.7260691529407444e-06, + "loss": 1.3123, + "step": 58630 + }, + { + "epoch": 9.690559801693865, + "grad_norm": 15.276062965393066, + "learning_rate": 1.7168879340421235e-06, + "loss": 1.334, + "step": 58640 + }, + { + "epoch": 9.692212352819665, + "grad_norm": 30.0185546875, + "learning_rate": 1.7077067151435027e-06, + "loss": 1.3818, + "step": 58650 + }, + { + "epoch": 9.693864903945466, + "grad_norm": 14.805960655212402, + "learning_rate": 1.6985254962448813e-06, + "loss": 1.3392, + "step": 58660 + }, + { + "epoch": 9.695517455071267, + "grad_norm": 17.313312530517578, + "learning_rate": 1.6893442773462606e-06, + "loss": 1.421, + "step": 58670 + }, + { + "epoch": 9.697170006197066, + "grad_norm": 15.689269065856934, + "learning_rate": 1.6801630584476396e-06, + "loss": 1.375, + "step": 58680 + }, + { + "epoch": 9.698822557322867, + "grad_norm": 27.808439254760742, + "learning_rate": 1.6709818395490184e-06, + "loss": 1.3845, + "step": 58690 + }, + { + "epoch": 9.700475108448668, + "grad_norm": 16.34505271911621, + "learning_rate": 1.6618006206503975e-06, + "loss": 1.2094, + "step": 58700 + }, + { + "epoch": 9.702127659574469, + "grad_norm": 30.347618103027344, + "learning_rate": 1.6526194017517767e-06, + "loss": 1.4331, + "step": 58710 + }, + { + "epoch": 9.703780210700268, + "grad_norm": 32.67634201049805, + "learning_rate": 1.6434381828531558e-06, + "loss": 1.3411, + "step": 58720 + }, + { + "epoch": 9.705432761826069, + "grad_norm": 13.503914833068848, + "learning_rate": 1.6342569639545346e-06, + "loss": 1.2915, + "step": 58730 + }, + { + "epoch": 9.70708531295187, + "grad_norm": 30.984130859375, + "learning_rate": 1.6250757450559136e-06, + "loss": 1.2693, + "step": 58740 + }, + { + "epoch": 9.70873786407767, + "grad_norm": 22.482847213745117, + "learning_rate": 1.6158945261572929e-06, + "loss": 1.2009, + "step": 58750 + }, + { + "epoch": 9.71039041520347, + "grad_norm": 51.00187301635742, + "learning_rate": 1.6067133072586717e-06, + "loss": 1.3464, + "step": 58760 + }, + { + "epoch": 9.712042966329271, + "grad_norm": 16.98764991760254, + "learning_rate": 1.5975320883600507e-06, + "loss": 1.2747, + "step": 58770 + }, + { + "epoch": 9.713695517455072, + "grad_norm": 14.666571617126465, + "learning_rate": 1.5883508694614298e-06, + "loss": 1.2513, + "step": 58780 + }, + { + "epoch": 9.715348068580871, + "grad_norm": 11.236737251281738, + "learning_rate": 1.579169650562809e-06, + "loss": 1.2613, + "step": 58790 + }, + { + "epoch": 9.717000619706672, + "grad_norm": 22.248416900634766, + "learning_rate": 1.5699884316641878e-06, + "loss": 1.3245, + "step": 58800 + }, + { + "epoch": 9.718653170832473, + "grad_norm": 10.243622779846191, + 
"learning_rate": 1.5608072127655669e-06, + "loss": 1.3724, + "step": 58810 + }, + { + "epoch": 9.720305721958272, + "grad_norm": 16.06927490234375, + "learning_rate": 1.551625993866946e-06, + "loss": 1.3333, + "step": 58820 + }, + { + "epoch": 9.721958273084073, + "grad_norm": 15.943928718566895, + "learning_rate": 1.542444774968325e-06, + "loss": 1.261, + "step": 58830 + }, + { + "epoch": 9.723610824209874, + "grad_norm": 17.844417572021484, + "learning_rate": 1.533263556069704e-06, + "loss": 1.3135, + "step": 58840 + }, + { + "epoch": 9.725263375335674, + "grad_norm": 16.729440689086914, + "learning_rate": 1.5240823371710828e-06, + "loss": 1.3187, + "step": 58850 + }, + { + "epoch": 9.726915926461475, + "grad_norm": 15.814526557922363, + "learning_rate": 1.514901118272462e-06, + "loss": 1.3089, + "step": 58860 + }, + { + "epoch": 9.728568477587276, + "grad_norm": 12.397698402404785, + "learning_rate": 1.5057198993738409e-06, + "loss": 1.3365, + "step": 58870 + }, + { + "epoch": 9.730221028713077, + "grad_norm": 19.588912963867188, + "learning_rate": 1.4965386804752201e-06, + "loss": 1.3647, + "step": 58880 + }, + { + "epoch": 9.731873579838876, + "grad_norm": 19.40388298034668, + "learning_rate": 1.487357461576599e-06, + "loss": 1.228, + "step": 58890 + }, + { + "epoch": 9.733526130964677, + "grad_norm": 9.690013885498047, + "learning_rate": 1.478176242677978e-06, + "loss": 1.198, + "step": 58900 + }, + { + "epoch": 9.735178682090478, + "grad_norm": 21.138465881347656, + "learning_rate": 1.468995023779357e-06, + "loss": 1.2491, + "step": 58910 + }, + { + "epoch": 9.736831233216277, + "grad_norm": 13.240848541259766, + "learning_rate": 1.459813804880736e-06, + "loss": 1.0939, + "step": 58920 + }, + { + "epoch": 9.738483784342078, + "grad_norm": 11.486268043518066, + "learning_rate": 1.450632585982115e-06, + "loss": 1.2259, + "step": 58930 + }, + { + "epoch": 9.740136335467879, + "grad_norm": 18.410409927368164, + "learning_rate": 1.4414513670834941e-06, + "loss": 1.2794, + "step": 58940 + }, + { + "epoch": 9.741788886593678, + "grad_norm": 13.786377906799316, + "learning_rate": 1.432270148184873e-06, + "loss": 1.2376, + "step": 58950 + }, + { + "epoch": 9.74344143771948, + "grad_norm": 17.038511276245117, + "learning_rate": 1.4230889292862522e-06, + "loss": 1.2173, + "step": 58960 + }, + { + "epoch": 9.74509398884528, + "grad_norm": 15.250676155090332, + "learning_rate": 1.413907710387631e-06, + "loss": 1.3068, + "step": 58970 + }, + { + "epoch": 9.746746539971081, + "grad_norm": 28.942930221557617, + "learning_rate": 1.4047264914890103e-06, + "loss": 1.3075, + "step": 58980 + }, + { + "epoch": 9.74839909109688, + "grad_norm": 12.493021965026855, + "learning_rate": 1.395545272590389e-06, + "loss": 1.3052, + "step": 58990 + }, + { + "epoch": 9.750051642222681, + "grad_norm": 15.28077220916748, + "learning_rate": 1.3863640536917681e-06, + "loss": 1.3504, + "step": 59000 + }, + { + "epoch": 9.751704193348482, + "grad_norm": 20.08050537109375, + "learning_rate": 1.3771828347931472e-06, + "loss": 1.2934, + "step": 59010 + }, + { + "epoch": 9.753356744474281, + "grad_norm": 25.60179901123047, + "learning_rate": 1.3680016158945262e-06, + "loss": 1.2961, + "step": 59020 + }, + { + "epoch": 9.755009295600082, + "grad_norm": 14.154158592224121, + "learning_rate": 1.3588203969959052e-06, + "loss": 1.4102, + "step": 59030 + }, + { + "epoch": 9.756661846725883, + "grad_norm": 12.870734214782715, + "learning_rate": 1.3496391780972843e-06, + "loss": 1.134, + "step": 59040 + }, + { + "epoch": 
9.758314397851684, + "grad_norm": 9.939929962158203, + "learning_rate": 1.3404579591986633e-06, + "loss": 1.2146, + "step": 59050 + }, + { + "epoch": 9.759966948977484, + "grad_norm": 14.322998046875, + "learning_rate": 1.3312767403000423e-06, + "loss": 1.1763, + "step": 59060 + }, + { + "epoch": 9.761619500103285, + "grad_norm": 18.574724197387695, + "learning_rate": 1.3220955214014214e-06, + "loss": 1.3397, + "step": 59070 + }, + { + "epoch": 9.763272051229086, + "grad_norm": 12.602153778076172, + "learning_rate": 1.3129143025028004e-06, + "loss": 1.245, + "step": 59080 + }, + { + "epoch": 9.764924602354885, + "grad_norm": 21.25696563720703, + "learning_rate": 1.3037330836041794e-06, + "loss": 1.2831, + "step": 59090 + }, + { + "epoch": 9.766577153480686, + "grad_norm": 24.119489669799805, + "learning_rate": 1.2945518647055583e-06, + "loss": 1.2615, + "step": 59100 + }, + { + "epoch": 9.768229704606487, + "grad_norm": 20.59731101989746, + "learning_rate": 1.2853706458069375e-06, + "loss": 1.2008, + "step": 59110 + }, + { + "epoch": 9.769882255732286, + "grad_norm": 17.279531478881836, + "learning_rate": 1.2761894269083163e-06, + "loss": 1.2945, + "step": 59120 + }, + { + "epoch": 9.771534806858087, + "grad_norm": 15.187520027160645, + "learning_rate": 1.2670082080096956e-06, + "loss": 1.3367, + "step": 59130 + }, + { + "epoch": 9.773187357983888, + "grad_norm": 24.268413543701172, + "learning_rate": 1.2578269891110744e-06, + "loss": 1.2033, + "step": 59140 + }, + { + "epoch": 9.774839909109687, + "grad_norm": 10.84234619140625, + "learning_rate": 1.2486457702124534e-06, + "loss": 1.1307, + "step": 59150 + }, + { + "epoch": 9.776492460235488, + "grad_norm": 19.090682983398438, + "learning_rate": 1.2394645513138325e-06, + "loss": 1.2864, + "step": 59160 + }, + { + "epoch": 9.778145011361289, + "grad_norm": 16.51657485961914, + "learning_rate": 1.2302833324152115e-06, + "loss": 1.2448, + "step": 59170 + }, + { + "epoch": 9.77979756248709, + "grad_norm": 10.925396919250488, + "learning_rate": 1.2211021135165906e-06, + "loss": 1.2297, + "step": 59180 + }, + { + "epoch": 9.78145011361289, + "grad_norm": 19.890026092529297, + "learning_rate": 1.2119208946179696e-06, + "loss": 1.2474, + "step": 59190 + }, + { + "epoch": 9.78310266473869, + "grad_norm": 21.863983154296875, + "learning_rate": 1.2027396757193484e-06, + "loss": 1.3076, + "step": 59200 + }, + { + "epoch": 9.784755215864491, + "grad_norm": 20.163433074951172, + "learning_rate": 1.1935584568207277e-06, + "loss": 1.253, + "step": 59210 + }, + { + "epoch": 9.78640776699029, + "grad_norm": 13.780261039733887, + "learning_rate": 1.1843772379221065e-06, + "loss": 1.2075, + "step": 59220 + }, + { + "epoch": 9.788060318116091, + "grad_norm": 15.498116493225098, + "learning_rate": 1.1751960190234857e-06, + "loss": 1.3903, + "step": 59230 + }, + { + "epoch": 9.789712869241892, + "grad_norm": 12.257755279541016, + "learning_rate": 1.1660148001248646e-06, + "loss": 1.379, + "step": 59240 + }, + { + "epoch": 9.791365420367693, + "grad_norm": 15.42638111114502, + "learning_rate": 1.1568335812262436e-06, + "loss": 1.2928, + "step": 59250 + }, + { + "epoch": 9.793017971493493, + "grad_norm": 17.679750442504883, + "learning_rate": 1.1476523623276226e-06, + "loss": 1.2155, + "step": 59260 + }, + { + "epoch": 9.794670522619294, + "grad_norm": 13.531704902648926, + "learning_rate": 1.1384711434290017e-06, + "loss": 1.396, + "step": 59270 + }, + { + "epoch": 9.796323073745095, + "grad_norm": 21.790496826171875, + "learning_rate": 1.1292899245303807e-06, + 
"loss": 1.3374, + "step": 59280 + }, + { + "epoch": 9.797975624870894, + "grad_norm": 20.123689651489258, + "learning_rate": 1.1201087056317597e-06, + "loss": 1.3, + "step": 59290 + }, + { + "epoch": 9.799628175996695, + "grad_norm": 16.618236541748047, + "learning_rate": 1.1109274867331386e-06, + "loss": 1.2193, + "step": 59300 + }, + { + "epoch": 9.801280727122496, + "grad_norm": 87.83232879638672, + "learning_rate": 1.1017462678345178e-06, + "loss": 1.283, + "step": 59310 + }, + { + "epoch": 9.802933278248297, + "grad_norm": 19.9451847076416, + "learning_rate": 1.0925650489358966e-06, + "loss": 1.2853, + "step": 59320 + }, + { + "epoch": 9.804585829374096, + "grad_norm": 20.5788631439209, + "learning_rate": 1.0833838300372759e-06, + "loss": 1.3428, + "step": 59330 + }, + { + "epoch": 9.806238380499897, + "grad_norm": 13.178668975830078, + "learning_rate": 1.0742026111386547e-06, + "loss": 1.3449, + "step": 59340 + }, + { + "epoch": 9.807890931625698, + "grad_norm": 9.858376502990723, + "learning_rate": 1.0650213922400337e-06, + "loss": 1.2549, + "step": 59350 + }, + { + "epoch": 9.809543482751497, + "grad_norm": 14.950618743896484, + "learning_rate": 1.0558401733414128e-06, + "loss": 1.1921, + "step": 59360 + }, + { + "epoch": 9.811196033877298, + "grad_norm": 15.29289436340332, + "learning_rate": 1.0466589544427918e-06, + "loss": 1.2611, + "step": 59370 + }, + { + "epoch": 9.812848585003099, + "grad_norm": 10.011371612548828, + "learning_rate": 1.0374777355441709e-06, + "loss": 1.3245, + "step": 59380 + }, + { + "epoch": 9.814501136128898, + "grad_norm": 39.90605545043945, + "learning_rate": 1.0282965166455499e-06, + "loss": 1.1861, + "step": 59390 + }, + { + "epoch": 9.8161536872547, + "grad_norm": 23.458520889282227, + "learning_rate": 1.019115297746929e-06, + "loss": 1.3771, + "step": 59400 + }, + { + "epoch": 9.8178062383805, + "grad_norm": 24.068479537963867, + "learning_rate": 1.009934078848308e-06, + "loss": 1.1251, + "step": 59410 + }, + { + "epoch": 9.8194587895063, + "grad_norm": 20.833627700805664, + "learning_rate": 1.000752859949687e-06, + "loss": 1.4204, + "step": 59420 + }, + { + "epoch": 9.8211113406321, + "grad_norm": 13.489704132080078, + "learning_rate": 9.91571641051066e-07, + "loss": 1.3008, + "step": 59430 + }, + { + "epoch": 9.822763891757901, + "grad_norm": 35.63694381713867, + "learning_rate": 9.82390422152445e-07, + "loss": 1.3852, + "step": 59440 + }, + { + "epoch": 9.824416442883702, + "grad_norm": 17.55329132080078, + "learning_rate": 9.732092032538239e-07, + "loss": 1.3671, + "step": 59450 + }, + { + "epoch": 9.826068994009502, + "grad_norm": 19.870128631591797, + "learning_rate": 9.640279843552031e-07, + "loss": 1.3378, + "step": 59460 + }, + { + "epoch": 9.827721545135303, + "grad_norm": 31.158733367919922, + "learning_rate": 9.54846765456582e-07, + "loss": 1.2725, + "step": 59470 + }, + { + "epoch": 9.829374096261104, + "grad_norm": 23.401962280273438, + "learning_rate": 9.456655465579611e-07, + "loss": 1.3456, + "step": 59480 + }, + { + "epoch": 9.831026647386903, + "grad_norm": 11.159757614135742, + "learning_rate": 9.3648432765934e-07, + "loss": 1.2201, + "step": 59490 + }, + { + "epoch": 9.832679198512704, + "grad_norm": 18.284528732299805, + "learning_rate": 9.273031087607191e-07, + "loss": 1.3702, + "step": 59500 + }, + { + "epoch": 9.834331749638505, + "grad_norm": 22.221506118774414, + "learning_rate": 9.181218898620981e-07, + "loss": 1.1929, + "step": 59510 + }, + { + "epoch": 9.835984300764306, + "grad_norm": 14.718652725219727, + 
"learning_rate": 9.089406709634771e-07, + "loss": 1.3242, + "step": 59520 + }, + { + "epoch": 9.837636851890105, + "grad_norm": 30.458106994628906, + "learning_rate": 8.997594520648562e-07, + "loss": 1.2832, + "step": 59530 + }, + { + "epoch": 9.839289403015906, + "grad_norm": 17.770442962646484, + "learning_rate": 8.905782331662352e-07, + "loss": 1.375, + "step": 59540 + }, + { + "epoch": 9.840941954141707, + "grad_norm": 13.754855155944824, + "learning_rate": 8.813970142676141e-07, + "loss": 1.1981, + "step": 59550 + }, + { + "epoch": 9.842594505267506, + "grad_norm": 28.639034271240234, + "learning_rate": 8.722157953689933e-07, + "loss": 1.3355, + "step": 59560 + }, + { + "epoch": 9.844247056393307, + "grad_norm": 34.25663757324219, + "learning_rate": 8.630345764703722e-07, + "loss": 1.5129, + "step": 59570 + }, + { + "epoch": 9.845899607519108, + "grad_norm": 8.183585166931152, + "learning_rate": 8.538533575717514e-07, + "loss": 1.262, + "step": 59580 + }, + { + "epoch": 9.847552158644907, + "grad_norm": 27.922536849975586, + "learning_rate": 8.446721386731303e-07, + "loss": 1.2981, + "step": 59590 + }, + { + "epoch": 9.849204709770708, + "grad_norm": 11.618725776672363, + "learning_rate": 8.354909197745092e-07, + "loss": 1.2361, + "step": 59600 + }, + { + "epoch": 9.85085726089651, + "grad_norm": 21.355430603027344, + "learning_rate": 8.263097008758884e-07, + "loss": 1.3164, + "step": 59610 + }, + { + "epoch": 9.85250981202231, + "grad_norm": 17.563817977905273, + "learning_rate": 8.171284819772673e-07, + "loss": 1.1633, + "step": 59620 + }, + { + "epoch": 9.85416236314811, + "grad_norm": 15.469801902770996, + "learning_rate": 8.079472630786464e-07, + "loss": 1.3304, + "step": 59630 + }, + { + "epoch": 9.85581491427391, + "grad_norm": 12.664761543273926, + "learning_rate": 7.987660441800254e-07, + "loss": 1.3019, + "step": 59640 + }, + { + "epoch": 9.857467465399711, + "grad_norm": 18.20703125, + "learning_rate": 7.895848252814045e-07, + "loss": 1.3006, + "step": 59650 + }, + { + "epoch": 9.85912001652551, + "grad_norm": 14.753276824951172, + "learning_rate": 7.804036063827834e-07, + "loss": 1.3308, + "step": 59660 + }, + { + "epoch": 9.860772567651312, + "grad_norm": 12.871907234191895, + "learning_rate": 7.712223874841625e-07, + "loss": 1.3225, + "step": 59670 + }, + { + "epoch": 9.862425118777113, + "grad_norm": 10.84438419342041, + "learning_rate": 7.620411685855414e-07, + "loss": 1.1678, + "step": 59680 + }, + { + "epoch": 9.864077669902912, + "grad_norm": 15.910385131835938, + "learning_rate": 7.528599496869204e-07, + "loss": 1.3399, + "step": 59690 + }, + { + "epoch": 9.865730221028713, + "grad_norm": 15.912435531616211, + "learning_rate": 7.436787307882995e-07, + "loss": 1.251, + "step": 59700 + }, + { + "epoch": 9.867382772154514, + "grad_norm": 14.724613189697266, + "learning_rate": 7.344975118896785e-07, + "loss": 1.3633, + "step": 59710 + }, + { + "epoch": 9.869035323280315, + "grad_norm": 19.542469024658203, + "learning_rate": 7.253162929910575e-07, + "loss": 1.4641, + "step": 59720 + }, + { + "epoch": 9.870687874406114, + "grad_norm": 15.426322937011719, + "learning_rate": 7.161350740924365e-07, + "loss": 1.4261, + "step": 59730 + }, + { + "epoch": 9.872340425531915, + "grad_norm": 12.275348663330078, + "learning_rate": 7.069538551938155e-07, + "loss": 1.3722, + "step": 59740 + }, + { + "epoch": 9.873992976657716, + "grad_norm": 13.315044403076172, + "learning_rate": 6.977726362951945e-07, + "loss": 1.3351, + "step": 59750 + }, + { + "epoch": 9.875645527783515, + 
"grad_norm": 24.9056453704834, + "learning_rate": 6.885914173965736e-07, + "loss": 1.3371, + "step": 59760 + }, + { + "epoch": 9.877298078909316, + "grad_norm": 14.293803215026855, + "learning_rate": 6.794101984979526e-07, + "loss": 1.1908, + "step": 59770 + }, + { + "epoch": 9.878950630035117, + "grad_norm": 20.57039451599121, + "learning_rate": 6.702289795993317e-07, + "loss": 1.2033, + "step": 59780 + }, + { + "epoch": 9.880603181160918, + "grad_norm": 13.297779083251953, + "learning_rate": 6.610477607007107e-07, + "loss": 1.163, + "step": 59790 + }, + { + "epoch": 9.882255732286717, + "grad_norm": 13.230541229248047, + "learning_rate": 6.518665418020897e-07, + "loss": 1.4123, + "step": 59800 + }, + { + "epoch": 9.883908283412518, + "grad_norm": 24.931100845336914, + "learning_rate": 6.426853229034688e-07, + "loss": 1.3383, + "step": 59810 + }, + { + "epoch": 9.88556083453832, + "grad_norm": 17.547151565551758, + "learning_rate": 6.335041040048478e-07, + "loss": 1.3894, + "step": 59820 + }, + { + "epoch": 9.887213385664118, + "grad_norm": 16.77924346923828, + "learning_rate": 6.243228851062267e-07, + "loss": 1.32, + "step": 59830 + }, + { + "epoch": 9.88886593678992, + "grad_norm": 16.8702449798584, + "learning_rate": 6.151416662076058e-07, + "loss": 1.3243, + "step": 59840 + }, + { + "epoch": 9.89051848791572, + "grad_norm": 15.003113746643066, + "learning_rate": 6.059604473089848e-07, + "loss": 1.2222, + "step": 59850 + }, + { + "epoch": 9.89217103904152, + "grad_norm": 15.460107803344727, + "learning_rate": 5.967792284103638e-07, + "loss": 1.2588, + "step": 59860 + }, + { + "epoch": 9.89382359016732, + "grad_norm": 17.36907958984375, + "learning_rate": 5.875980095117429e-07, + "loss": 1.3467, + "step": 59870 + }, + { + "epoch": 9.895476141293122, + "grad_norm": 13.089510917663574, + "learning_rate": 5.784167906131218e-07, + "loss": 1.3112, + "step": 59880 + }, + { + "epoch": 9.89712869241892, + "grad_norm": 22.797977447509766, + "learning_rate": 5.692355717145008e-07, + "loss": 1.379, + "step": 59890 + }, + { + "epoch": 9.898781243544722, + "grad_norm": 18.484411239624023, + "learning_rate": 5.600543528158799e-07, + "loss": 1.3037, + "step": 59900 + }, + { + "epoch": 9.900433794670523, + "grad_norm": 10.942911148071289, + "learning_rate": 5.508731339172589e-07, + "loss": 1.3234, + "step": 59910 + }, + { + "epoch": 9.902086345796324, + "grad_norm": 17.38446044921875, + "learning_rate": 5.416919150186379e-07, + "loss": 1.3274, + "step": 59920 + }, + { + "epoch": 9.903738896922123, + "grad_norm": 140.1362762451172, + "learning_rate": 5.325106961200169e-07, + "loss": 1.3595, + "step": 59930 + }, + { + "epoch": 9.905391448047924, + "grad_norm": 16.535696029663086, + "learning_rate": 5.233294772213959e-07, + "loss": 1.3435, + "step": 59940 + }, + { + "epoch": 9.907043999173725, + "grad_norm": 17.555944442749023, + "learning_rate": 5.141482583227749e-07, + "loss": 1.3091, + "step": 59950 + }, + { + "epoch": 9.908696550299524, + "grad_norm": 13.292182922363281, + "learning_rate": 5.04967039424154e-07, + "loss": 1.3891, + "step": 59960 + }, + { + "epoch": 9.910349101425325, + "grad_norm": 15.89194393157959, + "learning_rate": 4.95785820525533e-07, + "loss": 1.2585, + "step": 59970 + }, + { + "epoch": 9.912001652551126, + "grad_norm": 11.141946792602539, + "learning_rate": 4.866046016269119e-07, + "loss": 1.3311, + "step": 59980 + }, + { + "epoch": 9.913654203676927, + "grad_norm": 18.01121711730957, + "learning_rate": 4.77423382728291e-07, + "loss": 1.2728, + "step": 59990 + }, + { + "epoch": 
9.915306754802726, + "grad_norm": 14.889471054077148, + "learning_rate": 4.6824216382967e-07, + "loss": 1.2427, + "step": 60000 + }, + { + "epoch": 9.916959305928527, + "grad_norm": 24.612850189208984, + "learning_rate": 4.5906094493104905e-07, + "loss": 1.2339, + "step": 60010 + }, + { + "epoch": 9.918611857054328, + "grad_norm": 11.734923362731934, + "learning_rate": 4.498797260324281e-07, + "loss": 1.3032, + "step": 60020 + }, + { + "epoch": 9.920264408180127, + "grad_norm": 20.81572723388672, + "learning_rate": 4.4069850713380707e-07, + "loss": 1.2575, + "step": 60030 + }, + { + "epoch": 9.921916959305928, + "grad_norm": 17.165307998657227, + "learning_rate": 4.315172882351861e-07, + "loss": 1.3268, + "step": 60040 + }, + { + "epoch": 9.92356951043173, + "grad_norm": 10.70225715637207, + "learning_rate": 4.2233606933656514e-07, + "loss": 1.3622, + "step": 60050 + }, + { + "epoch": 9.925222061557529, + "grad_norm": 36.28559494018555, + "learning_rate": 4.131548504379442e-07, + "loss": 1.2658, + "step": 60060 + }, + { + "epoch": 9.92687461268333, + "grad_norm": 17.466758728027344, + "learning_rate": 4.039736315393232e-07, + "loss": 1.3854, + "step": 60070 + }, + { + "epoch": 9.92852716380913, + "grad_norm": 19.143348693847656, + "learning_rate": 3.9479241264070225e-07, + "loss": 1.0911, + "step": 60080 + }, + { + "epoch": 9.930179714934932, + "grad_norm": 18.350982666015625, + "learning_rate": 3.8561119374208123e-07, + "loss": 1.2973, + "step": 60090 + }, + { + "epoch": 9.93183226606073, + "grad_norm": 12.689515113830566, + "learning_rate": 3.764299748434602e-07, + "loss": 1.3574, + "step": 60100 + }, + { + "epoch": 9.933484817186532, + "grad_norm": 8.145671844482422, + "learning_rate": 3.6724875594483925e-07, + "loss": 1.4449, + "step": 60110 + }, + { + "epoch": 9.935137368312333, + "grad_norm": 23.987871170043945, + "learning_rate": 3.5806753704621824e-07, + "loss": 1.298, + "step": 60120 + }, + { + "epoch": 9.936789919438132, + "grad_norm": 18.037437438964844, + "learning_rate": 3.4888631814759727e-07, + "loss": 1.2382, + "step": 60130 + }, + { + "epoch": 9.938442470563933, + "grad_norm": 16.41983413696289, + "learning_rate": 3.397050992489763e-07, + "loss": 1.3517, + "step": 60140 + }, + { + "epoch": 9.940095021689734, + "grad_norm": 16.38405418395996, + "learning_rate": 3.3052388035035534e-07, + "loss": 1.2899, + "step": 60150 + }, + { + "epoch": 9.941747572815533, + "grad_norm": 16.4738826751709, + "learning_rate": 3.213426614517344e-07, + "loss": 1.29, + "step": 60160 + }, + { + "epoch": 9.943400123941334, + "grad_norm": 22.556604385375977, + "learning_rate": 3.1216144255311336e-07, + "loss": 1.2654, + "step": 60170 + }, + { + "epoch": 9.945052675067135, + "grad_norm": 11.890191078186035, + "learning_rate": 3.029802236544924e-07, + "loss": 1.0929, + "step": 60180 + }, + { + "epoch": 9.946705226192936, + "grad_norm": 72.20085906982422, + "learning_rate": 2.9379900475587143e-07, + "loss": 1.2483, + "step": 60190 + }, + { + "epoch": 9.948357777318735, + "grad_norm": 17.972667694091797, + "learning_rate": 2.846177858572504e-07, + "loss": 1.2809, + "step": 60200 + }, + { + "epoch": 9.950010328444536, + "grad_norm": 21.140539169311523, + "learning_rate": 2.7543656695862945e-07, + "loss": 1.4348, + "step": 60210 + }, + { + "epoch": 9.951662879570337, + "grad_norm": 17.385190963745117, + "learning_rate": 2.6625534806000844e-07, + "loss": 1.3182, + "step": 60220 + }, + { + "epoch": 9.953315430696136, + "grad_norm": 9.468734741210938, + "learning_rate": 2.5707412916138747e-07, + "loss": 
1.1968, + "step": 60230 + }, + { + "epoch": 9.954967981821937, + "grad_norm": 9.555684089660645, + "learning_rate": 2.478929102627665e-07, + "loss": 1.3367, + "step": 60240 + }, + { + "epoch": 9.956620532947738, + "grad_norm": 11.08018684387207, + "learning_rate": 2.387116913641455e-07, + "loss": 1.392, + "step": 60250 + }, + { + "epoch": 9.95827308407354, + "grad_norm": 16.899229049682617, + "learning_rate": 2.2953047246552453e-07, + "loss": 1.2918, + "step": 60260 + }, + { + "epoch": 9.959925635199339, + "grad_norm": 15.151360511779785, + "learning_rate": 2.2034925356690354e-07, + "loss": 1.2946, + "step": 60270 + }, + { + "epoch": 9.96157818632514, + "grad_norm": 8.768255233764648, + "learning_rate": 2.1116803466828257e-07, + "loss": 1.3185, + "step": 60280 + }, + { + "epoch": 9.96323073745094, + "grad_norm": 11.71931266784668, + "learning_rate": 2.019868157696616e-07, + "loss": 1.3727, + "step": 60290 + }, + { + "epoch": 9.96488328857674, + "grad_norm": 25.73476219177246, + "learning_rate": 1.9280559687104062e-07, + "loss": 1.2354, + "step": 60300 + }, + { + "epoch": 9.96653583970254, + "grad_norm": 25.98105239868164, + "learning_rate": 1.8362437797241963e-07, + "loss": 1.2941, + "step": 60310 + }, + { + "epoch": 9.968188390828342, + "grad_norm": 35.79945373535156, + "learning_rate": 1.7444315907379864e-07, + "loss": 1.2131, + "step": 60320 + }, + { + "epoch": 9.969840941954141, + "grad_norm": 13.044464111328125, + "learning_rate": 1.6526194017517767e-07, + "loss": 1.2468, + "step": 60330 + }, + { + "epoch": 9.971493493079942, + "grad_norm": 25.407047271728516, + "learning_rate": 1.5608072127655668e-07, + "loss": 1.379, + "step": 60340 + }, + { + "epoch": 9.973146044205743, + "grad_norm": 13.202648162841797, + "learning_rate": 1.4689950237793572e-07, + "loss": 1.185, + "step": 60350 + }, + { + "epoch": 9.974798595331544, + "grad_norm": 38.73662567138672, + "learning_rate": 1.3771828347931473e-07, + "loss": 1.2843, + "step": 60360 + }, + { + "epoch": 9.976451146457343, + "grad_norm": 14.0643310546875, + "learning_rate": 1.2853706458069374e-07, + "loss": 1.294, + "step": 60370 + }, + { + "epoch": 9.978103697583144, + "grad_norm": 12.296748161315918, + "learning_rate": 1.1935584568207275e-07, + "loss": 1.2442, + "step": 60380 + }, + { + "epoch": 9.979756248708945, + "grad_norm": 19.61980628967285, + "learning_rate": 1.1017462678345177e-07, + "loss": 1.1628, + "step": 60390 + }, + { + "epoch": 9.981408799834744, + "grad_norm": 15.85641860961914, + "learning_rate": 1.009934078848308e-07, + "loss": 1.2077, + "step": 60400 + }, + { + "epoch": 9.983061350960545, + "grad_norm": 18.99192237854004, + "learning_rate": 9.181218898620981e-08, + "loss": 1.1939, + "step": 60410 + }, + { + "epoch": 9.984713902086346, + "grad_norm": 34.235374450683594, + "learning_rate": 8.263097008758884e-08, + "loss": 1.3344, + "step": 60420 + }, + { + "epoch": 9.986366453212145, + "grad_norm": 13.520135879516602, + "learning_rate": 7.344975118896786e-08, + "loss": 1.2839, + "step": 60430 + }, + { + "epoch": 9.988019004337946, + "grad_norm": 18.85852813720703, + "learning_rate": 6.426853229034687e-08, + "loss": 1.2802, + "step": 60440 + }, + { + "epoch": 9.989671555463747, + "grad_norm": 16.009300231933594, + "learning_rate": 5.5087313391725884e-08, + "loss": 1.2258, + "step": 60450 + }, + { + "epoch": 9.991324106589548, + "grad_norm": 14.094069480895996, + "learning_rate": 4.5906094493104907e-08, + "loss": 1.3012, + "step": 60460 + }, + { + "epoch": 9.992976657715348, + "grad_norm": 23.66718101501465, + 
"learning_rate": 3.672487559448393e-08, + "loss": 1.329, + "step": 60470 + }, + { + "epoch": 9.994629208841149, + "grad_norm": 18.014339447021484, + "learning_rate": 2.7543656695862942e-08, + "loss": 1.3193, + "step": 60480 + }, + { + "epoch": 9.99628175996695, + "grad_norm": 12.735496520996094, + "learning_rate": 1.8362437797241965e-08, + "loss": 1.2747, + "step": 60490 + }, + { + "epoch": 9.997934311092749, + "grad_norm": 29.70014190673828, + "learning_rate": 9.181218898620982e-09, + "loss": 1.3249, + "step": 60500 + }, + { + "epoch": 9.99958686221855, + "grad_norm": 17.789243698120117, + "learning_rate": 0.0, + "loss": 1.3595, + "step": 60510 + }, + { + "epoch": 9.99958686221855, + "eval_accuracy": 0.33699815576677544, + "eval_loss": 2.4853203296661377, + "eval_runtime": 861.2243, + "eval_samples_per_second": 32.739, + "eval_steps_per_second": 8.185, + "step": 60510 + }, + { + "epoch": 9.99958686221855, + "step": 60510, + "total_flos": 9.925024630549355e+19, + "train_loss": 1.502550286699811, + "train_runtime": 81421.0241, + "train_samples_per_second": 11.891, + "train_steps_per_second": 0.743 + } + ], + "logging_steps": 10, + "max_steps": 60510, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 9.925024630549355e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}