{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08041134872168292, "eval_steps": 500, "global_step": 18000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.4672971512046065e-05, "grad_norm": 11.733072280883789, "learning_rate": 9.999553270284881e-06, "loss": 0.164, "step": 10 }, { "epoch": 8.934594302409213e-05, "grad_norm": 0.08407776802778244, "learning_rate": 9.99910654056976e-06, "loss": 0.0147, "step": 20 }, { "epoch": 0.0001340189145361382, "grad_norm": 0.00012560053437482566, "learning_rate": 9.99865981085464e-06, "loss": 0.0002, "step": 30 }, { "epoch": 0.00017869188604818426, "grad_norm": 3.125946022919379e-05, "learning_rate": 9.99821308113952e-06, "loss": 0.0004, "step": 40 }, { "epoch": 0.00022336485756023033, "grad_norm": 0.0017562335124239326, "learning_rate": 9.997766351424398e-06, "loss": 0.0, "step": 50 }, { "epoch": 0.0002680378290722764, "grad_norm": 10.499456405639648, "learning_rate": 9.997319621709278e-06, "loss": 0.0697, "step": 60 }, { "epoch": 0.0003127108005843225, "grad_norm": 3.6197413919580868e-06, "learning_rate": 9.996872891994157e-06, "loss": 0.0082, "step": 70 }, { "epoch": 0.0003573837720963685, "grad_norm": 0.0009690591250546277, "learning_rate": 9.996426162279037e-06, "loss": 0.0379, "step": 80 }, { "epoch": 0.0004020567436084146, "grad_norm": 0.011041563004255295, "learning_rate": 9.995979432563917e-06, "loss": 0.0014, "step": 90 }, { "epoch": 0.00044672971512046066, "grad_norm": 1.1426632227085065e-05, "learning_rate": 9.995532702848795e-06, "loss": 0.2133, "step": 100 }, { "epoch": 0.0004914026866325068, "grad_norm": 4.7969253500923514e-05, "learning_rate": 9.995085973133675e-06, "loss": 0.0001, "step": 110 }, { "epoch": 0.0005360756581445528, "grad_norm": 0.019292959943413734, "learning_rate": 9.994639243418556e-06, "loss": 0.0002, "step": 120 }, { "epoch": 0.0005807486296565989, "grad_norm": 135.5595245361328, "learning_rate": 9.994192513703434e-06, "loss": 0.0579, "step": 130 }, { "epoch": 0.000625421601168645, "grad_norm": 2.2290277001957293e-07, "learning_rate": 9.993745783988314e-06, "loss": 0.2915, "step": 140 }, { "epoch": 0.000670094572680691, "grad_norm": 1.4152267802103324e-08, "learning_rate": 9.993299054273193e-06, "loss": 0.0096, "step": 150 }, { "epoch": 0.000714767544192737, "grad_norm": 0.0003992164565715939, "learning_rate": 9.992852324558073e-06, "loss": 0.0015, "step": 160 }, { "epoch": 0.0007594405157047831, "grad_norm": 4.117830485483864e-06, "learning_rate": 9.992405594842953e-06, "loss": 0.0128, "step": 170 }, { "epoch": 0.0008041134872168292, "grad_norm": 0.0052748871967196465, "learning_rate": 9.991958865127833e-06, "loss": 0.0005, "step": 180 }, { "epoch": 0.0008487864587288752, "grad_norm": 0.4053354859352112, "learning_rate": 9.991512135412711e-06, "loss": 0.0575, "step": 190 }, { "epoch": 0.0008934594302409213, "grad_norm": 1.9831970348604955e-05, "learning_rate": 9.991065405697591e-06, "loss": 0.0001, "step": 200 }, { "epoch": 0.0009381324017529674, "grad_norm": 2.868847381876094e-08, "learning_rate": 9.990618675982472e-06, "loss": 0.0102, "step": 210 }, { "epoch": 0.0009828053732650135, "grad_norm": 0.007543394807726145, "learning_rate": 9.99017194626735e-06, "loss": 0.0001, "step": 220 }, { "epoch": 0.0010274783447770596, "grad_norm": 0.04161032661795616, "learning_rate": 9.98972521655223e-06, "loss": 0.0, "step": 230 }, { "epoch": 0.0010721513162891055, "grad_norm": 0.07510162144899368, "learning_rate": 9.98927848683711e-06, "loss": 0.0002, "step": 240 }, { "epoch": 0.0011168242878011516, "grad_norm": 0.00014790371642448008, "learning_rate": 9.988831757121989e-06, "loss": 0.005, "step": 250 }, { "epoch": 0.0011614972593131977, "grad_norm": 2.0253996808605734e-06, "learning_rate": 9.988385027406869e-06, "loss": 0.0166, "step": 260 }, { "epoch": 0.0012061702308252438, "grad_norm": 0.8337207436561584, "learning_rate": 9.987938297691749e-06, "loss": 0.1077, "step": 270 }, { "epoch": 0.00125084320233729, "grad_norm": 1.2096836599084781e-06, "learning_rate": 9.987491567976627e-06, "loss": 0.0005, "step": 280 }, { "epoch": 0.001295516173849336, "grad_norm": 6.229875725694001e-05, "learning_rate": 9.987044838261508e-06, "loss": 0.0021, "step": 290 }, { "epoch": 0.001340189145361382, "grad_norm": 52.171180725097656, "learning_rate": 9.986598108546388e-06, "loss": 0.014, "step": 300 }, { "epoch": 0.001384862116873428, "grad_norm": 0.01654650643467903, "learning_rate": 9.986151378831268e-06, "loss": 0.0013, "step": 310 }, { "epoch": 0.001429535088385474, "grad_norm": 0.00010301181464456022, "learning_rate": 9.985704649116146e-06, "loss": 0.0003, "step": 320 }, { "epoch": 0.0014742080598975202, "grad_norm": 0.10489977151155472, "learning_rate": 9.985257919401026e-06, "loss": 0.0079, "step": 330 }, { "epoch": 0.0015188810314095663, "grad_norm": 1.3773581031273352e-06, "learning_rate": 9.984811189685906e-06, "loss": 0.0329, "step": 340 }, { "epoch": 0.0015635540029216124, "grad_norm": 0.01485914271324873, "learning_rate": 9.984364459970785e-06, "loss": 0.0001, "step": 350 }, { "epoch": 0.0016082269744336585, "grad_norm": 30.461793899536133, "learning_rate": 9.983917730255665e-06, "loss": 0.0949, "step": 360 }, { "epoch": 0.0016528999459457046, "grad_norm": 6.307673454284668, "learning_rate": 9.983471000540543e-06, "loss": 0.0017, "step": 370 }, { "epoch": 0.0016975729174577505, "grad_norm": 0.014393389225006104, "learning_rate": 9.983024270825424e-06, "loss": 0.0147, "step": 380 }, { "epoch": 0.0017422458889697966, "grad_norm": 0.019219743087887764, "learning_rate": 9.982577541110304e-06, "loss": 0.0021, "step": 390 }, { "epoch": 0.0017869188604818427, "grad_norm": 0.05191327631473541, "learning_rate": 9.982130811395182e-06, "loss": 0.0, "step": 400 }, { "epoch": 0.0018315918319938887, "grad_norm": 5.0406357331667095e-05, "learning_rate": 9.981684081680062e-06, "loss": 0.0037, "step": 410 }, { "epoch": 0.0018762648035059348, "grad_norm": 5.040341420681216e-05, "learning_rate": 9.981237351964942e-06, "loss": 0.0066, "step": 420 }, { "epoch": 0.001920937775017981, "grad_norm": 0.006360863335430622, "learning_rate": 9.98079062224982e-06, "loss": 0.0002, "step": 430 }, { "epoch": 0.001965610746530027, "grad_norm": 4.391232266698353e-08, "learning_rate": 9.980343892534701e-06, "loss": 0.1804, "step": 440 }, { "epoch": 0.002010283718042073, "grad_norm": 2.598305854917271e-06, "learning_rate": 9.97989716281958e-06, "loss": 0.0063, "step": 450 }, { "epoch": 0.0020549566895541192, "grad_norm": 2.415512678766163e-09, "learning_rate": 9.97945043310446e-06, "loss": 0.0001, "step": 460 }, { "epoch": 0.0020996296610661653, "grad_norm": 0.00015337311197072268, "learning_rate": 9.97900370338934e-06, "loss": 0.0, "step": 470 }, { "epoch": 0.002144302632578211, "grad_norm": 0.00045923213474452496, "learning_rate": 9.978556973674218e-06, "loss": 0.0478, "step": 480 }, { "epoch": 0.002188975604090257, "grad_norm": 6.670333174696452e-09, "learning_rate": 9.978110243959098e-06, "loss": 0.0002, "step": 490 }, { "epoch": 0.002233648575602303, "grad_norm": 5.148401396581903e-05, "learning_rate": 9.977663514243978e-06, "loss": 0.0009, "step": 500 }, { "epoch": 0.0022783215471143493, "grad_norm": 3.0508829240716295e-06, "learning_rate": 9.977216784528857e-06, "loss": 0.0001, "step": 510 }, { "epoch": 0.0023229945186263954, "grad_norm": 3.840111457975581e-05, "learning_rate": 9.976770054813737e-06, "loss": 0.0006, "step": 520 }, { "epoch": 0.0023676674901384415, "grad_norm": 0.0005538268014788628, "learning_rate": 9.976323325098617e-06, "loss": 0.0, "step": 530 }, { "epoch": 0.0024123404616504876, "grad_norm": 2.515283483717212e-07, "learning_rate": 9.975876595383495e-06, "loss": 0.0003, "step": 540 }, { "epoch": 0.0024570134331625337, "grad_norm": 4.5527631300501525e-05, "learning_rate": 9.975429865668376e-06, "loss": 0.5001, "step": 550 }, { "epoch": 0.00250168640467458, "grad_norm": 0.0001504126121290028, "learning_rate": 9.974983135953254e-06, "loss": 0.003, "step": 560 }, { "epoch": 0.002546359376186626, "grad_norm": 0.06473321467638016, "learning_rate": 9.974536406238134e-06, "loss": 0.1604, "step": 570 }, { "epoch": 0.002591032347698672, "grad_norm": 0.06711556017398834, "learning_rate": 9.974089676523014e-06, "loss": 0.0, "step": 580 }, { "epoch": 0.002635705319210718, "grad_norm": 0.2837385833263397, "learning_rate": 9.973642946807893e-06, "loss": 0.0, "step": 590 }, { "epoch": 0.002680378290722764, "grad_norm": 0.00018509538494981825, "learning_rate": 9.973196217092773e-06, "loss": 0.0005, "step": 600 }, { "epoch": 0.00272505126223481, "grad_norm": 0.49453040957450867, "learning_rate": 9.972749487377653e-06, "loss": 0.0145, "step": 610 }, { "epoch": 0.002769724233746856, "grad_norm": 0.7000535130500793, "learning_rate": 9.972302757662531e-06, "loss": 0.0025, "step": 620 }, { "epoch": 0.002814397205258902, "grad_norm": 1.611105382437472e-08, "learning_rate": 9.971856027947411e-06, "loss": 0.0, "step": 630 }, { "epoch": 0.002859070176770948, "grad_norm": 2.75453260201175e-07, "learning_rate": 9.971409298232292e-06, "loss": 0.0081, "step": 640 }, { "epoch": 0.0029037431482829943, "grad_norm": 1.9412634344462276e-07, "learning_rate": 9.97096256851717e-06, "loss": 0.0693, "step": 650 }, { "epoch": 0.0029484161197950404, "grad_norm": 2.8128763762680364e-09, "learning_rate": 9.97051583880205e-06, "loss": 0.1002, "step": 660 }, { "epoch": 0.0029930890913070865, "grad_norm": 0.0005962385912425816, "learning_rate": 9.97006910908693e-06, "loss": 0.0, "step": 670 }, { "epoch": 0.0030377620628191325, "grad_norm": 2.213113307952881, "learning_rate": 9.969622379371809e-06, "loss": 0.0093, "step": 680 }, { "epoch": 0.0030824350343311786, "grad_norm": 1.1586568149368759e-07, "learning_rate": 9.969175649656689e-06, "loss": 0.0, "step": 690 }, { "epoch": 0.0031271080058432247, "grad_norm": 2.6724624633789062, "learning_rate": 9.968728919941569e-06, "loss": 0.0286, "step": 700 }, { "epoch": 0.003171780977355271, "grad_norm": 1.0748630074886023e-06, "learning_rate": 9.968282190226447e-06, "loss": 0.0226, "step": 710 }, { "epoch": 0.003216453948867317, "grad_norm": 0.5267998576164246, "learning_rate": 9.967835460511328e-06, "loss": 0.2235, "step": 720 }, { "epoch": 0.003261126920379363, "grad_norm": 9.127699485134144e-09, "learning_rate": 9.967388730796208e-06, "loss": 0.2336, "step": 730 }, { "epoch": 0.003305799891891409, "grad_norm": 6.590168482034642e-07, "learning_rate": 9.966942001081086e-06, "loss": 0.0006, "step": 740 }, { "epoch": 0.003350472863403455, "grad_norm": 6.971930588406394e-09, "learning_rate": 9.966495271365966e-06, "loss": 0.0232, "step": 750 }, { "epoch": 0.003395145834915501, "grad_norm": 0.01525203138589859, "learning_rate": 9.966048541650846e-06, "loss": 0.1315, "step": 760 }, { "epoch": 0.003439818806427547, "grad_norm": 1.4364684375323122e-07, "learning_rate": 9.965601811935726e-06, "loss": 0.0452, "step": 770 }, { "epoch": 0.003484491777939593, "grad_norm": 0.013514714315533638, "learning_rate": 9.965155082220605e-06, "loss": 0.0001, "step": 780 }, { "epoch": 0.003529164749451639, "grad_norm": 6.487604724270124e-12, "learning_rate": 9.964708352505485e-06, "loss": 0.0, "step": 790 }, { "epoch": 0.0035738377209636853, "grad_norm": 0.0036428929306566715, "learning_rate": 9.964261622790365e-06, "loss": 0.0, "step": 800 }, { "epoch": 0.0036185106924757314, "grad_norm": 0.00026434988831169903, "learning_rate": 9.963814893075244e-06, "loss": 0.0079, "step": 810 }, { "epoch": 0.0036631836639877775, "grad_norm": 5.467930532176979e-05, "learning_rate": 9.963368163360124e-06, "loss": 0.0002, "step": 820 }, { "epoch": 0.0037078566354998236, "grad_norm": 1.1106082098422121e-07, "learning_rate": 9.962921433645004e-06, "loss": 0.0001, "step": 830 }, { "epoch": 0.0037525296070118697, "grad_norm": 31.38213348388672, "learning_rate": 9.962474703929882e-06, "loss": 0.0064, "step": 840 }, { "epoch": 0.003797202578523916, "grad_norm": 0.00040082065970636904, "learning_rate": 9.962027974214762e-06, "loss": 0.0001, "step": 850 }, { "epoch": 0.003841875550035962, "grad_norm": 2.0785208276752343e-10, "learning_rate": 9.96158124449964e-06, "loss": 0.0, "step": 860 }, { "epoch": 0.003886548521548008, "grad_norm": 21.16458511352539, "learning_rate": 9.961134514784521e-06, "loss": 0.011, "step": 870 }, { "epoch": 0.003931221493060054, "grad_norm": 5.773109212903194e-10, "learning_rate": 9.960687785069401e-06, "loss": 0.0001, "step": 880 }, { "epoch": 0.0039758944645721, "grad_norm": 8.707175993549754e-07, "learning_rate": 9.96024105535428e-06, "loss": 0.0001, "step": 890 }, { "epoch": 0.004020567436084146, "grad_norm": 6.301308867057154e-12, "learning_rate": 9.95979432563916e-06, "loss": 0.0388, "step": 900 }, { "epoch": 0.004065240407596192, "grad_norm": 2.1161547514303436e-10, "learning_rate": 9.95934759592404e-06, "loss": 0.0004, "step": 910 }, { "epoch": 0.0041099133791082385, "grad_norm": 0.0002068611647700891, "learning_rate": 9.958900866208918e-06, "loss": 0.0001, "step": 920 }, { "epoch": 0.004154586350620285, "grad_norm": 13.312015533447266, "learning_rate": 9.958454136493798e-06, "loss": 0.1857, "step": 930 }, { "epoch": 0.004199259322132331, "grad_norm": 2.75353984058313e-09, "learning_rate": 9.958007406778678e-06, "loss": 0.0001, "step": 940 }, { "epoch": 0.004243932293644376, "grad_norm": 243.2834014892578, "learning_rate": 9.957560677063557e-06, "loss": 0.1703, "step": 950 }, { "epoch": 0.004288605265156422, "grad_norm": 5.9964871956808e-10, "learning_rate": 9.957113947348437e-06, "loss": 0.0, "step": 960 }, { "epoch": 0.004333278236668468, "grad_norm": 3.727791181518114e-06, "learning_rate": 9.956667217633315e-06, "loss": 0.0201, "step": 970 }, { "epoch": 0.004377951208180514, "grad_norm": 2.396371030499722e-07, "learning_rate": 9.956220487918196e-06, "loss": 0.4625, "step": 980 }, { "epoch": 0.00442262417969256, "grad_norm": 7.116894948921981e-07, "learning_rate": 9.955773758203076e-06, "loss": 0.0, "step": 990 }, { "epoch": 0.004467297151204606, "grad_norm": 0.00417350372299552, "learning_rate": 9.955327028487954e-06, "loss": 0.0003, "step": 1000 }, { "epoch": 0.0045119701227166525, "grad_norm": 1.3091346583171681e-11, "learning_rate": 9.954880298772834e-06, "loss": 0.0002, "step": 1010 }, { "epoch": 0.004556643094228699, "grad_norm": 8.930554629138499e-10, "learning_rate": 9.954433569057714e-06, "loss": 0.8625, "step": 1020 }, { "epoch": 0.004601316065740745, "grad_norm": 4.606064067047555e-06, "learning_rate": 9.953986839342593e-06, "loss": 0.0114, "step": 1030 }, { "epoch": 0.004645989037252791, "grad_norm": 0.09143206477165222, "learning_rate": 9.953540109627473e-06, "loss": 0.0002, "step": 1040 }, { "epoch": 0.004690662008764837, "grad_norm": 1.3204090595245361, "learning_rate": 9.953093379912351e-06, "loss": 0.0082, "step": 1050 }, { "epoch": 0.004735334980276883, "grad_norm": 0.011992010287940502, "learning_rate": 9.952646650197231e-06, "loss": 0.0, "step": 1060 }, { "epoch": 0.004780007951788929, "grad_norm": 2.1359237223350647e-07, "learning_rate": 9.952199920482112e-06, "loss": 0.0, "step": 1070 }, { "epoch": 0.004824680923300975, "grad_norm": 2.1936364191788016e-06, "learning_rate": 9.95175319076699e-06, "loss": 0.0018, "step": 1080 }, { "epoch": 0.004869353894813021, "grad_norm": 6.708241961916883e-09, "learning_rate": 9.95130646105187e-06, "loss": 0.0, "step": 1090 }, { "epoch": 0.004914026866325067, "grad_norm": 1.3402251313951452e-12, "learning_rate": 9.95085973133675e-06, "loss": 0.0, "step": 1100 }, { "epoch": 0.0049586998378371135, "grad_norm": 0.0038719906006008387, "learning_rate": 9.950413001621629e-06, "loss": 0.0003, "step": 1110 }, { "epoch": 0.00500337280934916, "grad_norm": 2.7270777991361683e-07, "learning_rate": 9.949966271906509e-06, "loss": 0.3289, "step": 1120 }, { "epoch": 0.005048045780861206, "grad_norm": 5.544622421264648, "learning_rate": 9.949519542191389e-06, "loss": 0.0357, "step": 1130 }, { "epoch": 0.005092718752373252, "grad_norm": 1.9362450984772295e-06, "learning_rate": 9.949072812476267e-06, "loss": 0.0, "step": 1140 }, { "epoch": 0.005137391723885298, "grad_norm": 3.2831758145501766e-11, "learning_rate": 9.948626082761147e-06, "loss": 0.1835, "step": 1150 }, { "epoch": 0.005182064695397344, "grad_norm": 3.0839785836178635e-08, "learning_rate": 9.948179353046028e-06, "loss": 0.0, "step": 1160 }, { "epoch": 0.00522673766690939, "grad_norm": 4.25630517497666e-08, "learning_rate": 9.947732623330906e-06, "loss": 0.0765, "step": 1170 }, { "epoch": 0.005271410638421436, "grad_norm": 0.04964405670762062, "learning_rate": 9.947285893615786e-06, "loss": 0.0049, "step": 1180 }, { "epoch": 0.005316083609933482, "grad_norm": 4.989205081074033e-06, "learning_rate": 9.946839163900666e-06, "loss": 0.0089, "step": 1190 }, { "epoch": 0.005360756581445528, "grad_norm": 9.702245951093147e-11, "learning_rate": 9.946392434185545e-06, "loss": 0.0, "step": 1200 }, { "epoch": 0.0054054295529575745, "grad_norm": 13.150774955749512, "learning_rate": 9.945945704470425e-06, "loss": 0.0056, "step": 1210 }, { "epoch": 0.00545010252446962, "grad_norm": 9.329108252131846e-06, "learning_rate": 9.945498974755305e-06, "loss": 0.018, "step": 1220 }, { "epoch": 0.005494775495981666, "grad_norm": 9.888663043966517e-06, "learning_rate": 9.945052245040185e-06, "loss": 0.4156, "step": 1230 }, { "epoch": 0.005539448467493712, "grad_norm": 1.0036829006798698e-11, "learning_rate": 9.944605515325064e-06, "loss": 0.0, "step": 1240 }, { "epoch": 0.005584121439005758, "grad_norm": 5.702725047740387e-06, "learning_rate": 9.944158785609944e-06, "loss": 0.0, "step": 1250 }, { "epoch": 0.005628794410517804, "grad_norm": 0.0011835768818855286, "learning_rate": 9.943712055894824e-06, "loss": 0.1227, "step": 1260 }, { "epoch": 0.00567346738202985, "grad_norm": 4.868632572652132e-07, "learning_rate": 9.943265326179702e-06, "loss": 0.0002, "step": 1270 }, { "epoch": 0.005718140353541896, "grad_norm": 0.0006190579733811319, "learning_rate": 9.942818596464582e-06, "loss": 0.0014, "step": 1280 }, { "epoch": 0.005762813325053942, "grad_norm": 9.045915589922515e-07, "learning_rate": 9.942371866749462e-06, "loss": 0.0143, "step": 1290 }, { "epoch": 0.0058074862965659885, "grad_norm": 0.0010632362682372332, "learning_rate": 9.941925137034341e-06, "loss": 0.0001, "step": 1300 }, { "epoch": 0.005852159268078035, "grad_norm": 1.4616702515013458e-08, "learning_rate": 9.941478407319221e-06, "loss": 0.0003, "step": 1310 }, { "epoch": 0.005896832239590081, "grad_norm": 0.003920422866940498, "learning_rate": 9.941031677604101e-06, "loss": 0.0, "step": 1320 }, { "epoch": 0.005941505211102127, "grad_norm": 1.2216974987211415e-08, "learning_rate": 9.94058494788898e-06, "loss": 0.1611, "step": 1330 }, { "epoch": 0.005986178182614173, "grad_norm": 2.7995277207537583e-08, "learning_rate": 9.94013821817386e-06, "loss": 0.0002, "step": 1340 }, { "epoch": 0.006030851154126219, "grad_norm": 5.323033333559657e-11, "learning_rate": 9.939691488458738e-06, "loss": 0.0, "step": 1350 }, { "epoch": 0.006075524125638265, "grad_norm": 8.650657079556368e-09, "learning_rate": 9.939244758743618e-06, "loss": 0.1313, "step": 1360 }, { "epoch": 0.006120197097150311, "grad_norm": 7.414421816065442e-06, "learning_rate": 9.938798029028498e-06, "loss": 0.0001, "step": 1370 }, { "epoch": 0.006164870068662357, "grad_norm": 0.0003329328028485179, "learning_rate": 9.938351299313377e-06, "loss": 0.0018, "step": 1380 }, { "epoch": 0.006209543040174403, "grad_norm": 0.00023930691531859338, "learning_rate": 9.937904569598257e-06, "loss": 0.0013, "step": 1390 }, { "epoch": 0.0062542160116864495, "grad_norm": 166.027099609375, "learning_rate": 9.937457839883137e-06, "loss": 0.0341, "step": 1400 }, { "epoch": 0.006298888983198496, "grad_norm": 2.805340500344755e-06, "learning_rate": 9.937011110168016e-06, "loss": 0.0001, "step": 1410 }, { "epoch": 0.006343561954710542, "grad_norm": 8.33417709844575e-12, "learning_rate": 9.936564380452896e-06, "loss": 0.0312, "step": 1420 }, { "epoch": 0.006388234926222588, "grad_norm": 1.616840124130249, "learning_rate": 9.936117650737776e-06, "loss": 0.0003, "step": 1430 }, { "epoch": 0.006432907897734634, "grad_norm": 7.233583346533123e-06, "learning_rate": 9.935670921022654e-06, "loss": 0.0016, "step": 1440 }, { "epoch": 0.00647758086924668, "grad_norm": 0.0003906420897692442, "learning_rate": 9.935224191307534e-06, "loss": 0.0015, "step": 1450 }, { "epoch": 0.006522253840758726, "grad_norm": 4.490878200158477e-05, "learning_rate": 9.934777461592413e-06, "loss": 0.0067, "step": 1460 }, { "epoch": 0.006566926812270772, "grad_norm": 0.0002814727777149528, "learning_rate": 9.934330731877293e-06, "loss": 0.0003, "step": 1470 }, { "epoch": 0.006611599783782818, "grad_norm": 0.12526680529117584, "learning_rate": 9.933884002162173e-06, "loss": 0.0985, "step": 1480 }, { "epoch": 0.0066562727552948635, "grad_norm": 0.0004512220621109009, "learning_rate": 9.933437272447051e-06, "loss": 0.0432, "step": 1490 }, { "epoch": 0.00670094572680691, "grad_norm": 0.0018296745838597417, "learning_rate": 9.932990542731932e-06, "loss": 0.0001, "step": 1500 }, { "epoch": 0.006745618698318956, "grad_norm": 0.008827965706586838, "learning_rate": 9.932543813016812e-06, "loss": 0.0001, "step": 1510 }, { "epoch": 0.006790291669831002, "grad_norm": 0.643787682056427, "learning_rate": 9.93209708330169e-06, "loss": 0.0001, "step": 1520 }, { "epoch": 0.006834964641343048, "grad_norm": 1.456542668165639e-05, "learning_rate": 9.93165035358657e-06, "loss": 0.0, "step": 1530 }, { "epoch": 0.006879637612855094, "grad_norm": 0.004160408861935139, "learning_rate": 9.931203623871449e-06, "loss": 0.0129, "step": 1540 }, { "epoch": 0.00692431058436714, "grad_norm": 1.7304720878601074, "learning_rate": 9.930756894156329e-06, "loss": 0.0004, "step": 1550 }, { "epoch": 0.006968983555879186, "grad_norm": 3.7183988094329834, "learning_rate": 9.930310164441209e-06, "loss": 0.0009, "step": 1560 }, { "epoch": 0.007013656527391232, "grad_norm": 8.051643817452714e-05, "learning_rate": 9.929863434726087e-06, "loss": 0.0, "step": 1570 }, { "epoch": 0.007058329498903278, "grad_norm": 0.0009096893481910229, "learning_rate": 9.929416705010967e-06, "loss": 0.0, "step": 1580 }, { "epoch": 0.0071030024704153245, "grad_norm": 8.303599088321789e-07, "learning_rate": 9.928969975295848e-06, "loss": 0.0101, "step": 1590 }, { "epoch": 0.007147675441927371, "grad_norm": 5.950789017106217e-08, "learning_rate": 9.928523245580726e-06, "loss": 0.076, "step": 1600 }, { "epoch": 0.007192348413439417, "grad_norm": 6.889599717396777e-07, "learning_rate": 9.928076515865606e-06, "loss": 0.0125, "step": 1610 }, { "epoch": 0.007237021384951463, "grad_norm": 1.637503856954936e-07, "learning_rate": 9.927629786150486e-06, "loss": 0.1141, "step": 1620 }, { "epoch": 0.007281694356463509, "grad_norm": 0.27094534039497375, "learning_rate": 9.927183056435365e-06, "loss": 0.0001, "step": 1630 }, { "epoch": 0.007326367327975555, "grad_norm": 4.483622433326673e-06, "learning_rate": 9.926736326720245e-06, "loss": 0.0, "step": 1640 }, { "epoch": 0.007371040299487601, "grad_norm": 121.53789520263672, "learning_rate": 9.926289597005125e-06, "loss": 0.0389, "step": 1650 }, { "epoch": 0.007415713270999647, "grad_norm": 0.02433842420578003, "learning_rate": 9.925842867290003e-06, "loss": 0.0003, "step": 1660 }, { "epoch": 0.007460386242511693, "grad_norm": 0.00012709507427643985, "learning_rate": 9.925396137574884e-06, "loss": 0.0001, "step": 1670 }, { "epoch": 0.007505059214023739, "grad_norm": 0.0830000564455986, "learning_rate": 9.924949407859764e-06, "loss": 0.0001, "step": 1680 }, { "epoch": 0.0075497321855357855, "grad_norm": 4.0808568330108486e-11, "learning_rate": 9.924502678144644e-06, "loss": 0.0692, "step": 1690 }, { "epoch": 0.007594405157047832, "grad_norm": 4.6427681809291244e-05, "learning_rate": 9.924055948429522e-06, "loss": 0.0691, "step": 1700 }, { "epoch": 0.007639078128559878, "grad_norm": 1.8825395107269287, "learning_rate": 9.923609218714402e-06, "loss": 0.0005, "step": 1710 }, { "epoch": 0.007683751100071924, "grad_norm": 0.0066458964720368385, "learning_rate": 9.923162488999282e-06, "loss": 0.0, "step": 1720 }, { "epoch": 0.00772842407158397, "grad_norm": 5.8484480279508144e-12, "learning_rate": 9.922715759284161e-06, "loss": 0.0001, "step": 1730 }, { "epoch": 0.007773097043096016, "grad_norm": 3.3582630157470703, "learning_rate": 9.922269029569041e-06, "loss": 0.0009, "step": 1740 }, { "epoch": 0.007817770014608061, "grad_norm": 9.292478895736589e-13, "learning_rate": 9.921822299853921e-06, "loss": 0.0, "step": 1750 }, { "epoch": 0.007862442986120108, "grad_norm": 6.707903139613336e-06, "learning_rate": 9.9213755701388e-06, "loss": 0.0002, "step": 1760 }, { "epoch": 0.007907115957632153, "grad_norm": 3.6095958036788667e-13, "learning_rate": 9.92092884042368e-06, "loss": 0.0, "step": 1770 }, { "epoch": 0.0079517889291442, "grad_norm": 4.903984063275857e-08, "learning_rate": 9.92048211070856e-06, "loss": 0.0001, "step": 1780 }, { "epoch": 0.007996461900656246, "grad_norm": 8.984227008620148e-17, "learning_rate": 9.920035380993438e-06, "loss": 0.0031, "step": 1790 }, { "epoch": 0.008041134872168293, "grad_norm": 5.623330253001768e-06, "learning_rate": 9.919588651278318e-06, "loss": 0.0006, "step": 1800 }, { "epoch": 0.008085807843680338, "grad_norm": 14.42357063293457, "learning_rate": 9.919141921563199e-06, "loss": 0.0009, "step": 1810 }, { "epoch": 0.008130480815192385, "grad_norm": 2.3612351223584183e-12, "learning_rate": 9.918695191848077e-06, "loss": 0.0001, "step": 1820 }, { "epoch": 0.00817515378670443, "grad_norm": 9.99601297735353e-07, "learning_rate": 9.918248462132957e-06, "loss": 0.0266, "step": 1830 }, { "epoch": 0.008219826758216477, "grad_norm": 0.016000457108020782, "learning_rate": 9.917801732417836e-06, "loss": 0.0006, "step": 1840 }, { "epoch": 0.008264499729728522, "grad_norm": 2.5022452859735367e-11, "learning_rate": 9.917355002702716e-06, "loss": 0.0022, "step": 1850 }, { "epoch": 0.00830917270124057, "grad_norm": 1.409413086456493e-13, "learning_rate": 9.916908272987596e-06, "loss": 0.0018, "step": 1860 }, { "epoch": 0.008353845672752614, "grad_norm": 0.0018817445961758494, "learning_rate": 9.916461543272474e-06, "loss": 0.2938, "step": 1870 }, { "epoch": 0.008398518644264661, "grad_norm": 1.8390163631920586e-06, "learning_rate": 9.916014813557354e-06, "loss": 0.0, "step": 1880 }, { "epoch": 0.008443191615776707, "grad_norm": 2.352197325805605e-09, "learning_rate": 9.915568083842234e-06, "loss": 0.0, "step": 1890 }, { "epoch": 0.008487864587288752, "grad_norm": 2.6651733264770883e-08, "learning_rate": 9.915121354127113e-06, "loss": 0.0, "step": 1900 }, { "epoch": 0.008532537558800799, "grad_norm": 1.7570675817091264e-11, "learning_rate": 9.914674624411993e-06, "loss": 0.012, "step": 1910 }, { "epoch": 0.008577210530312844, "grad_norm": 5.639159553538775e-06, "learning_rate": 9.914227894696873e-06, "loss": 0.0008, "step": 1920 }, { "epoch": 0.008621883501824891, "grad_norm": 1.478918534303375e-06, "learning_rate": 9.913781164981752e-06, "loss": 0.0, "step": 1930 }, { "epoch": 0.008666556473336936, "grad_norm": 34.57212829589844, "learning_rate": 9.913334435266632e-06, "loss": 0.005, "step": 1940 }, { "epoch": 0.008711229444848983, "grad_norm": 9.411427527084015e-06, "learning_rate": 9.91288770555151e-06, "loss": 0.0, "step": 1950 }, { "epoch": 0.008755902416361028, "grad_norm": 2.57602250641753e-09, "learning_rate": 9.91244097583639e-06, "loss": 0.0008, "step": 1960 }, { "epoch": 0.008800575387873075, "grad_norm": 0.00012505015183705837, "learning_rate": 9.91199424612127e-06, "loss": 0.0001, "step": 1970 }, { "epoch": 0.00884524835938512, "grad_norm": 1.6882954696484376e-06, "learning_rate": 9.911547516406149e-06, "loss": 0.0041, "step": 1980 }, { "epoch": 0.008889921330897168, "grad_norm": 85.14512634277344, "learning_rate": 9.911100786691029e-06, "loss": 0.5186, "step": 1990 }, { "epoch": 0.008934594302409213, "grad_norm": 5.102280283608707e-06, "learning_rate": 9.910654056975909e-06, "loss": 0.0001, "step": 2000 }, { "epoch": 0.00897926727392126, "grad_norm": 2.827220337182912e-13, "learning_rate": 9.910207327260787e-06, "loss": 0.0003, "step": 2010 }, { "epoch": 0.009023940245433305, "grad_norm": 0.0008906282018870115, "learning_rate": 9.909760597545668e-06, "loss": 0.0022, "step": 2020 }, { "epoch": 0.009068613216945352, "grad_norm": 0.013927340507507324, "learning_rate": 9.909313867830548e-06, "loss": 0.0, "step": 2030 }, { "epoch": 0.009113286188457397, "grad_norm": 1.2370523272409173e-08, "learning_rate": 9.908867138115426e-06, "loss": 0.0, "step": 2040 }, { "epoch": 0.009157959159969444, "grad_norm": 5.075939043308608e-06, "learning_rate": 9.908420408400306e-06, "loss": 0.0167, "step": 2050 }, { "epoch": 0.00920263213148149, "grad_norm": 1.1170839115948183e-06, "learning_rate": 9.907973678685185e-06, "loss": 0.0001, "step": 2060 }, { "epoch": 0.009247305102993536, "grad_norm": 1.5858339565966162e-06, "learning_rate": 9.907526948970065e-06, "loss": 0.0, "step": 2070 }, { "epoch": 0.009291978074505582, "grad_norm": 5.236922788753873e-06, "learning_rate": 9.907080219254945e-06, "loss": 0.0281, "step": 2080 }, { "epoch": 0.009336651046017629, "grad_norm": 0.10250507295131683, "learning_rate": 9.906633489539823e-06, "loss": 0.0001, "step": 2090 }, { "epoch": 0.009381324017529674, "grad_norm": 3.122114383431046e-13, "learning_rate": 9.906186759824704e-06, "loss": 0.0252, "step": 2100 }, { "epoch": 0.00942599698904172, "grad_norm": 1.3368267381963506e-08, "learning_rate": 9.905740030109584e-06, "loss": 0.0002, "step": 2110 }, { "epoch": 0.009470669960553766, "grad_norm": 1.0595232238222105e-13, "learning_rate": 9.905293300394462e-06, "loss": 0.0141, "step": 2120 }, { "epoch": 0.009515342932065813, "grad_norm": 0.006708197295665741, "learning_rate": 9.904846570679342e-06, "loss": 0.0005, "step": 2130 }, { "epoch": 0.009560015903577858, "grad_norm": 2.7285839678370394e-05, "learning_rate": 9.904399840964222e-06, "loss": 0.0, "step": 2140 }, { "epoch": 0.009604688875089905, "grad_norm": 1.811926007270813, "learning_rate": 9.903953111249102e-06, "loss": 0.0004, "step": 2150 }, { "epoch": 0.00964936184660195, "grad_norm": 1.2543156782426706e-20, "learning_rate": 9.903506381533981e-06, "loss": 0.0001, "step": 2160 }, { "epoch": 0.009694034818113996, "grad_norm": 9.28156551616155e-10, "learning_rate": 9.903059651818861e-06, "loss": 0.001, "step": 2170 }, { "epoch": 0.009738707789626043, "grad_norm": 7.503916992104438e-18, "learning_rate": 9.902612922103741e-06, "loss": 0.0, "step": 2180 }, { "epoch": 0.009783380761138088, "grad_norm": 2.061022496491205e-05, "learning_rate": 9.90216619238862e-06, "loss": 0.0, "step": 2190 }, { "epoch": 0.009828053732650135, "grad_norm": 1.511832594871521, "learning_rate": 9.9017194626735e-06, "loss": 0.0003, "step": 2200 }, { "epoch": 0.00987272670416218, "grad_norm": 4.14693022321444e-06, "learning_rate": 9.90127273295838e-06, "loss": 0.0252, "step": 2210 }, { "epoch": 0.009917399675674227, "grad_norm": 2.3049189223911526e-07, "learning_rate": 9.900826003243258e-06, "loss": 0.0101, "step": 2220 }, { "epoch": 0.009962072647186272, "grad_norm": 2.308055400135345e-06, "learning_rate": 9.900379273528138e-06, "loss": 0.0002, "step": 2230 }, { "epoch": 0.01000674561869832, "grad_norm": 2.468515626283252e-12, "learning_rate": 9.899932543813019e-06, "loss": 0.0015, "step": 2240 }, { "epoch": 0.010051418590210364, "grad_norm": 1.3838086926343252e-11, "learning_rate": 9.899485814097897e-06, "loss": 0.0008, "step": 2250 }, { "epoch": 0.010096091561722411, "grad_norm": 0.0163019560277462, "learning_rate": 9.899039084382777e-06, "loss": 0.1336, "step": 2260 }, { "epoch": 0.010140764533234457, "grad_norm": 1.2105209634682979e-07, "learning_rate": 9.898592354667657e-06, "loss": 0.0002, "step": 2270 }, { "epoch": 0.010185437504746504, "grad_norm": 0.01281829085201025, "learning_rate": 9.898145624952536e-06, "loss": 0.0021, "step": 2280 }, { "epoch": 0.010230110476258549, "grad_norm": 8.710338050832434e-08, "learning_rate": 9.897698895237416e-06, "loss": 0.0, "step": 2290 }, { "epoch": 0.010274783447770596, "grad_norm": 49.676658630371094, "learning_rate": 9.897252165522296e-06, "loss": 0.0219, "step": 2300 }, { "epoch": 0.010319456419282641, "grad_norm": 2.708988589006367e-11, "learning_rate": 9.896805435807174e-06, "loss": 0.0, "step": 2310 }, { "epoch": 0.010364129390794688, "grad_norm": 1.1903916075084453e-08, "learning_rate": 9.896358706092054e-06, "loss": 0.0252, "step": 2320 }, { "epoch": 0.010408802362306733, "grad_norm": 5.5912947573233396e-05, "learning_rate": 9.895911976376935e-06, "loss": 0.0, "step": 2330 }, { "epoch": 0.01045347533381878, "grad_norm": 6.648967487699053e-11, "learning_rate": 9.895465246661813e-06, "loss": 0.0582, "step": 2340 }, { "epoch": 0.010498148305330825, "grad_norm": 1.705614749880624e-06, "learning_rate": 9.895018516946693e-06, "loss": 0.0008, "step": 2350 }, { "epoch": 0.010542821276842872, "grad_norm": 0.09579768031835556, "learning_rate": 9.894571787231572e-06, "loss": 0.0, "step": 2360 }, { "epoch": 0.010587494248354918, "grad_norm": 1.360779151582392e-07, "learning_rate": 9.894125057516452e-06, "loss": 0.0001, "step": 2370 }, { "epoch": 0.010632167219866965, "grad_norm": 2.5863171160267484e-08, "learning_rate": 9.893678327801332e-06, "loss": 0.0, "step": 2380 }, { "epoch": 0.01067684019137901, "grad_norm": 1.5213099718093872, "learning_rate": 9.89323159808621e-06, "loss": 0.0006, "step": 2390 }, { "epoch": 0.010721513162891057, "grad_norm": 1.3281054207781029e-11, "learning_rate": 9.89278486837109e-06, "loss": 0.0004, "step": 2400 }, { "epoch": 0.010766186134403102, "grad_norm": 0.18338826298713684, "learning_rate": 9.89233813865597e-06, "loss": 0.0, "step": 2410 }, { "epoch": 0.010810859105915149, "grad_norm": 0.8633919954299927, "learning_rate": 9.891891408940849e-06, "loss": 0.0006, "step": 2420 }, { "epoch": 0.010855532077427194, "grad_norm": 0.0002038137463387102, "learning_rate": 9.891444679225729e-06, "loss": 0.0016, "step": 2430 }, { "epoch": 0.01090020504893924, "grad_norm": 2.7525827590579866e-06, "learning_rate": 9.890997949510607e-06, "loss": 0.0252, "step": 2440 }, { "epoch": 0.010944878020451286, "grad_norm": 2.0097404296315347e-10, "learning_rate": 9.890551219795488e-06, "loss": 0.0007, "step": 2450 }, { "epoch": 0.010989550991963332, "grad_norm": 6.054831657754944e-13, "learning_rate": 9.890104490080368e-06, "loss": 0.0002, "step": 2460 }, { "epoch": 0.011034223963475379, "grad_norm": 1.084228706815793e-08, "learning_rate": 9.889657760365246e-06, "loss": 0.0001, "step": 2470 }, { "epoch": 0.011078896934987424, "grad_norm": 0.005000013392418623, "learning_rate": 9.889211030650126e-06, "loss": 0.0036, "step": 2480 }, { "epoch": 0.01112356990649947, "grad_norm": 1.300892371602913e-08, "learning_rate": 9.888764300935006e-06, "loss": 0.0604, "step": 2490 }, { "epoch": 0.011168242878011516, "grad_norm": 2.1114540100097656, "learning_rate": 9.888317571219885e-06, "loss": 0.0057, "step": 2500 }, { "epoch": 0.011212915849523563, "grad_norm": 0.0629802942276001, "learning_rate": 9.887870841504765e-06, "loss": 0.0027, "step": 2510 }, { "epoch": 0.011257588821035608, "grad_norm": 108.7916488647461, "learning_rate": 9.887424111789645e-06, "loss": 0.038, "step": 2520 }, { "epoch": 0.011302261792547655, "grad_norm": 1.681610115156218e-06, "learning_rate": 9.886977382074524e-06, "loss": 0.0996, "step": 2530 }, { "epoch": 0.0113469347640597, "grad_norm": 3.4575478036202867e-09, "learning_rate": 9.886530652359404e-06, "loss": 0.0004, "step": 2540 }, { "epoch": 0.011391607735571747, "grad_norm": 0.0012267986312508583, "learning_rate": 9.886083922644282e-06, "loss": 0.0062, "step": 2550 }, { "epoch": 0.011436280707083793, "grad_norm": 1.681084826898882e-09, "learning_rate": 9.885637192929162e-06, "loss": 0.0004, "step": 2560 }, { "epoch": 0.01148095367859584, "grad_norm": 5.801022995299832e-10, "learning_rate": 9.885190463214042e-06, "loss": 0.0, "step": 2570 }, { "epoch": 0.011525626650107885, "grad_norm": 2.1600253603537567e-05, "learning_rate": 9.88474373349892e-06, "loss": 0.0, "step": 2580 }, { "epoch": 0.011570299621619932, "grad_norm": 1.1640495678477691e-08, "learning_rate": 9.884297003783801e-06, "loss": 0.0, "step": 2590 }, { "epoch": 0.011614972593131977, "grad_norm": 0.010300342924892902, "learning_rate": 9.883850274068681e-06, "loss": 0.0, "step": 2600 }, { "epoch": 0.011659645564644024, "grad_norm": 0.0024663745425641537, "learning_rate": 9.883403544353561e-06, "loss": 0.0002, "step": 2610 }, { "epoch": 0.01170431853615607, "grad_norm": 0.00015290798910427839, "learning_rate": 9.88295681463844e-06, "loss": 0.305, "step": 2620 }, { "epoch": 0.011748991507668116, "grad_norm": 4.006568815384526e-06, "learning_rate": 9.88251008492332e-06, "loss": 0.4032, "step": 2630 }, { "epoch": 0.011793664479180161, "grad_norm": 7.545190783275757e-08, "learning_rate": 9.8820633552082e-06, "loss": 0.0001, "step": 2640 }, { "epoch": 0.011838337450692208, "grad_norm": 0.004153342917561531, "learning_rate": 9.881616625493078e-06, "loss": 0.001, "step": 2650 }, { "epoch": 0.011883010422204254, "grad_norm": 0.3302571177482605, "learning_rate": 9.881169895777958e-06, "loss": 0.0003, "step": 2660 }, { "epoch": 0.0119276833937163, "grad_norm": 7.340275609557523e-10, "learning_rate": 9.880723166062838e-06, "loss": 0.0001, "step": 2670 }, { "epoch": 0.011972356365228346, "grad_norm": 0.0007693750667385757, "learning_rate": 9.880276436347717e-06, "loss": 0.0, "step": 2680 }, { "epoch": 0.012017029336740393, "grad_norm": 9.321666355477376e-11, "learning_rate": 9.879829706632597e-06, "loss": 0.0024, "step": 2690 }, { "epoch": 0.012061702308252438, "grad_norm": 4.1744015572507936e-15, "learning_rate": 9.879382976917477e-06, "loss": 0.0, "step": 2700 }, { "epoch": 0.012106375279764483, "grad_norm": 0.05857926979660988, "learning_rate": 9.878936247202356e-06, "loss": 0.0014, "step": 2710 }, { "epoch": 0.01215104825127653, "grad_norm": 1.411705312648337e-07, "learning_rate": 9.878489517487236e-06, "loss": 0.0, "step": 2720 }, { "epoch": 0.012195721222788575, "grad_norm": 0.00032469897996634245, "learning_rate": 9.878042787772116e-06, "loss": 0.0, "step": 2730 }, { "epoch": 0.012240394194300622, "grad_norm": 2.2813771920482395e-06, "learning_rate": 9.877596058056994e-06, "loss": 0.0, "step": 2740 }, { "epoch": 0.012285067165812668, "grad_norm": 1.0996114241379473e-08, "learning_rate": 9.877149328341874e-06, "loss": 0.0, "step": 2750 }, { "epoch": 0.012329740137324715, "grad_norm": 1.8043495586539393e-09, "learning_rate": 9.876702598626755e-06, "loss": 0.003, "step": 2760 }, { "epoch": 0.01237441310883676, "grad_norm": 5.025076461606659e-07, "learning_rate": 9.876255868911633e-06, "loss": 0.0, "step": 2770 }, { "epoch": 0.012419086080348807, "grad_norm": 4.728721414437587e-09, "learning_rate": 9.875809139196513e-06, "loss": 0.0002, "step": 2780 }, { "epoch": 0.012463759051860852, "grad_norm": 5.2914412663085386e-05, "learning_rate": 9.875362409481393e-06, "loss": 0.0038, "step": 2790 }, { "epoch": 0.012508432023372899, "grad_norm": 9.6832536655711e-07, "learning_rate": 9.874915679766272e-06, "loss": 0.0113, "step": 2800 }, { "epoch": 0.012553104994884944, "grad_norm": 7.181570981629193e-05, "learning_rate": 9.874468950051152e-06, "loss": 0.0, "step": 2810 }, { "epoch": 0.012597777966396991, "grad_norm": 3.6548785828927066e-06, "learning_rate": 9.874022220336032e-06, "loss": 0.0, "step": 2820 }, { "epoch": 0.012642450937909036, "grad_norm": 1.1662586044472456e-12, "learning_rate": 9.87357549062091e-06, "loss": 0.0, "step": 2830 }, { "epoch": 0.012687123909421083, "grad_norm": 8.896884537534788e-05, "learning_rate": 9.87312876090579e-06, "loss": 0.0, "step": 2840 }, { "epoch": 0.012731796880933129, "grad_norm": 3.573189487349193e-12, "learning_rate": 9.872682031190669e-06, "loss": 0.0002, "step": 2850 }, { "epoch": 0.012776469852445176, "grad_norm": 0.005769051611423492, "learning_rate": 9.872235301475549e-06, "loss": 0.0, "step": 2860 }, { "epoch": 0.01282114282395722, "grad_norm": 1.0245492648719434e-10, "learning_rate": 9.871788571760429e-06, "loss": 0.0004, "step": 2870 }, { "epoch": 0.012865815795469268, "grad_norm": 8.58198109199293e-05, "learning_rate": 9.871341842045308e-06, "loss": 0.0019, "step": 2880 }, { "epoch": 0.012910488766981313, "grad_norm": 7.715765418074561e-13, "learning_rate": 9.870895112330188e-06, "loss": 0.0, "step": 2890 }, { "epoch": 0.01295516173849336, "grad_norm": 3.119744405921665e-06, "learning_rate": 9.870448382615068e-06, "loss": 0.0, "step": 2900 }, { "epoch": 0.012999834710005405, "grad_norm": 5.282707604692405e-10, "learning_rate": 9.870001652899946e-06, "loss": 0.0057, "step": 2910 }, { "epoch": 0.013044507681517452, "grad_norm": 3.762822085168205e-12, "learning_rate": 9.869554923184826e-06, "loss": 0.01, "step": 2920 }, { "epoch": 0.013089180653029497, "grad_norm": 0.226803719997406, "learning_rate": 9.869108193469705e-06, "loss": 0.007, "step": 2930 }, { "epoch": 0.013133853624541544, "grad_norm": 6.065501434449061e-09, "learning_rate": 9.868661463754585e-06, "loss": 0.0, "step": 2940 }, { "epoch": 0.01317852659605359, "grad_norm": 0.18184098601341248, "learning_rate": 9.868214734039465e-06, "loss": 0.0348, "step": 2950 }, { "epoch": 0.013223199567565637, "grad_norm": 0.1139160618185997, "learning_rate": 9.867768004324343e-06, "loss": 0.1315, "step": 2960 }, { "epoch": 0.013267872539077682, "grad_norm": 0.00014653371181339025, "learning_rate": 9.867321274609224e-06, "loss": 0.0001, "step": 2970 }, { "epoch": 0.013312545510589727, "grad_norm": 1.2791061543282467e-09, "learning_rate": 9.866874544894104e-06, "loss": 0.0041, "step": 2980 }, { "epoch": 0.013357218482101774, "grad_norm": 5.588849383286743e-10, "learning_rate": 9.866427815178982e-06, "loss": 0.0016, "step": 2990 }, { "epoch": 0.01340189145361382, "grad_norm": 551.7684936523438, "learning_rate": 9.865981085463862e-06, "loss": 0.1721, "step": 3000 }, { "epoch": 0.013446564425125866, "grad_norm": 1.8122448799864216e-13, "learning_rate": 9.865534355748742e-06, "loss": 0.0, "step": 3010 }, { "epoch": 0.013491237396637911, "grad_norm": 0.0015737615758553147, "learning_rate": 9.865087626033621e-06, "loss": 0.0001, "step": 3020 }, { "epoch": 0.013535910368149958, "grad_norm": 1.6226993982354856e-16, "learning_rate": 9.864640896318501e-06, "loss": 0.0, "step": 3030 }, { "epoch": 0.013580583339662004, "grad_norm": 0.00010395060235168785, "learning_rate": 9.86419416660338e-06, "loss": 0.0, "step": 3040 }, { "epoch": 0.01362525631117405, "grad_norm": 1.976358632305164e-08, "learning_rate": 9.86374743688826e-06, "loss": 0.0, "step": 3050 }, { "epoch": 0.013669929282686096, "grad_norm": 3.838881923456938e-08, "learning_rate": 9.86330070717314e-06, "loss": 0.0, "step": 3060 }, { "epoch": 0.013714602254198143, "grad_norm": 8.503144499627524e-07, "learning_rate": 9.86285397745802e-06, "loss": 0.0005, "step": 3070 }, { "epoch": 0.013759275225710188, "grad_norm": 2.1550865259012397e-10, "learning_rate": 9.862407247742898e-06, "loss": 0.0, "step": 3080 }, { "epoch": 0.013803948197222235, "grad_norm": 1.3286899358533333e-11, "learning_rate": 9.861960518027778e-06, "loss": 0.0037, "step": 3090 }, { "epoch": 0.01384862116873428, "grad_norm": 9.956566645996645e-05, "learning_rate": 9.861513788312658e-06, "loss": 0.0, "step": 3100 }, { "epoch": 0.013893294140246327, "grad_norm": 1.4003201931345188e-09, "learning_rate": 9.861067058597537e-06, "loss": 0.0, "step": 3110 }, { "epoch": 0.013937967111758372, "grad_norm": 3.2533531424405737e-09, "learning_rate": 9.860620328882417e-06, "loss": 0.0, "step": 3120 }, { "epoch": 0.01398264008327042, "grad_norm": 7.113734046271247e-10, "learning_rate": 9.860173599167297e-06, "loss": 0.1255, "step": 3130 }, { "epoch": 0.014027313054782465, "grad_norm": 3.964849426552064e-08, "learning_rate": 9.859726869452176e-06, "loss": 0.0, "step": 3140 }, { "epoch": 0.014071986026294512, "grad_norm": 6.418162001864403e-07, "learning_rate": 9.859280139737056e-06, "loss": 0.0, "step": 3150 }, { "epoch": 0.014116658997806557, "grad_norm": 3.987723928844389e-09, "learning_rate": 9.858833410021936e-06, "loss": 0.0005, "step": 3160 }, { "epoch": 0.014161331969318604, "grad_norm": 0.5856716632843018, "learning_rate": 9.858386680306814e-06, "loss": 0.0002, "step": 3170 }, { "epoch": 0.014206004940830649, "grad_norm": 8.413451185740328e-11, "learning_rate": 9.857939950591694e-06, "loss": 0.0005, "step": 3180 }, { "epoch": 0.014250677912342696, "grad_norm": 1.8648626046169348e-11, "learning_rate": 9.857493220876575e-06, "loss": 0.0001, "step": 3190 }, { "epoch": 0.014295350883854741, "grad_norm": 5.774042910466903e-10, "learning_rate": 9.857046491161455e-06, "loss": 0.0, "step": 3200 }, { "epoch": 0.014340023855366788, "grad_norm": 0.00016052935097832233, "learning_rate": 9.856599761446333e-06, "loss": 0.0, "step": 3210 }, { "epoch": 0.014384696826878833, "grad_norm": 1.0157315018645505e-14, "learning_rate": 9.856153031731213e-06, "loss": 0.0034, "step": 3220 }, { "epoch": 0.01442936979839088, "grad_norm": 0.029599646106362343, "learning_rate": 9.855706302016092e-06, "loss": 0.0267, "step": 3230 }, { "epoch": 0.014474042769902926, "grad_norm": 9.77450180053711, "learning_rate": 9.855259572300972e-06, "loss": 0.0019, "step": 3240 }, { "epoch": 0.014518715741414973, "grad_norm": 1.0505848990760569e-07, "learning_rate": 9.854812842585852e-06, "loss": 0.0006, "step": 3250 }, { "epoch": 0.014563388712927018, "grad_norm": 0.002367907902225852, "learning_rate": 9.85436611287073e-06, "loss": 0.0, "step": 3260 }, { "epoch": 0.014608061684439063, "grad_norm": 1.1953168232992084e-08, "learning_rate": 9.85391938315561e-06, "loss": 0.0005, "step": 3270 }, { "epoch": 0.01465273465595111, "grad_norm": 4.748066089604874e-12, "learning_rate": 9.85347265344049e-06, "loss": 0.0001, "step": 3280 }, { "epoch": 0.014697407627463155, "grad_norm": 0.01604483276605606, "learning_rate": 9.853025923725369e-06, "loss": 0.0, "step": 3290 }, { "epoch": 0.014742080598975202, "grad_norm": 2.076117743854411e-05, "learning_rate": 9.852579194010249e-06, "loss": 0.0002, "step": 3300 }, { "epoch": 0.014786753570487247, "grad_norm": 0.000603312742896378, "learning_rate": 9.85213246429513e-06, "loss": 0.425, "step": 3310 }, { "epoch": 0.014831426541999294, "grad_norm": 0.004995096940547228, "learning_rate": 9.851685734580008e-06, "loss": 0.5148, "step": 3320 }, { "epoch": 0.01487609951351134, "grad_norm": 0.03883814066648483, "learning_rate": 9.851239004864888e-06, "loss": 0.9627, "step": 3330 }, { "epoch": 0.014920772485023387, "grad_norm": 8.102724677883089e-06, "learning_rate": 9.850792275149766e-06, "loss": 0.0048, "step": 3340 }, { "epoch": 0.014965445456535432, "grad_norm": 0.31644752621650696, "learning_rate": 9.850345545434646e-06, "loss": 0.0001, "step": 3350 }, { "epoch": 0.015010118428047479, "grad_norm": 3.0332233905792236, "learning_rate": 9.849898815719526e-06, "loss": 0.0058, "step": 3360 }, { "epoch": 0.015054791399559524, "grad_norm": 6.062638391313158e-09, "learning_rate": 9.849452086004405e-06, "loss": 0.0013, "step": 3370 }, { "epoch": 0.015099464371071571, "grad_norm": 0.0011684580240398645, "learning_rate": 9.849005356289285e-06, "loss": 0.0002, "step": 3380 }, { "epoch": 0.015144137342583616, "grad_norm": 5.77909929688758e-07, "learning_rate": 9.848558626574165e-06, "loss": 0.0, "step": 3390 }, { "epoch": 0.015188810314095663, "grad_norm": 4.860052058575093e-07, "learning_rate": 9.848111896859044e-06, "loss": 0.0, "step": 3400 }, { "epoch": 0.015233483285607708, "grad_norm": 1.0624749847920612e-05, "learning_rate": 9.847665167143924e-06, "loss": 0.0, "step": 3410 }, { "epoch": 0.015278156257119755, "grad_norm": 3.9735247969567566e-13, "learning_rate": 9.847218437428804e-06, "loss": 0.0005, "step": 3420 }, { "epoch": 0.0153228292286318, "grad_norm": 0.0007657803362235427, "learning_rate": 9.846771707713682e-06, "loss": 0.0003, "step": 3430 }, { "epoch": 0.015367502200143848, "grad_norm": 9.241873377696663e-10, "learning_rate": 9.846324977998562e-06, "loss": 0.0004, "step": 3440 }, { "epoch": 0.015412175171655893, "grad_norm": 1.5009467233539908e-06, "learning_rate": 9.845878248283441e-06, "loss": 0.0001, "step": 3450 }, { "epoch": 0.01545684814316794, "grad_norm": 1.1387187193179216e-08, "learning_rate": 9.845431518568321e-06, "loss": 0.0, "step": 3460 }, { "epoch": 0.015501521114679985, "grad_norm": 6.789093731640605e-09, "learning_rate": 9.844984788853201e-06, "loss": 0.0001, "step": 3470 }, { "epoch": 0.015546194086192032, "grad_norm": 1.0673472772282366e-08, "learning_rate": 9.84453805913808e-06, "loss": 0.0, "step": 3480 }, { "epoch": 0.015590867057704077, "grad_norm": 0.0025754093658179045, "learning_rate": 9.84409132942296e-06, "loss": 0.0, "step": 3490 }, { "epoch": 0.015635540029216122, "grad_norm": 1.62450643215184e-11, "learning_rate": 9.84364459970784e-06, "loss": 0.0, "step": 3500 }, { "epoch": 0.01568021300072817, "grad_norm": 7.053716277738431e-09, "learning_rate": 9.843197869992718e-06, "loss": 0.0, "step": 3510 }, { "epoch": 0.015724885972240216, "grad_norm": 0.03826324641704559, "learning_rate": 9.842751140277598e-06, "loss": 0.0, "step": 3520 }, { "epoch": 0.01576955894375226, "grad_norm": 4.0738819961916306e-07, "learning_rate": 9.842304410562478e-06, "loss": 0.0, "step": 3530 }, { "epoch": 0.015814231915264307, "grad_norm": 1.7018284351189017e-11, "learning_rate": 9.841857680847357e-06, "loss": 0.0, "step": 3540 }, { "epoch": 0.015858904886776354, "grad_norm": 0.0078599127009511, "learning_rate": 9.841410951132237e-06, "loss": 0.0001, "step": 3550 }, { "epoch": 0.0159035778582884, "grad_norm": 4.419483182194206e-11, "learning_rate": 9.840964221417117e-06, "loss": 0.4157, "step": 3560 }, { "epoch": 0.015948250829800444, "grad_norm": 6.113156268838793e-05, "learning_rate": 9.840517491701996e-06, "loss": 0.018, "step": 3570 }, { "epoch": 0.01599292380131249, "grad_norm": 1.637298852052993e-09, "learning_rate": 9.840070761986876e-06, "loss": 0.0633, "step": 3580 }, { "epoch": 0.016037596772824538, "grad_norm": 1.0059465012091096e-06, "learning_rate": 9.839624032271756e-06, "loss": 0.0297, "step": 3590 }, { "epoch": 0.016082269744336585, "grad_norm": 1.3263518283679332e-08, "learning_rate": 9.839177302556634e-06, "loss": 0.0, "step": 3600 }, { "epoch": 0.01612694271584863, "grad_norm": 3.2096600932618458e-09, "learning_rate": 9.838730572841514e-06, "loss": 0.0593, "step": 3610 }, { "epoch": 0.016171615687360676, "grad_norm": 0.5415199995040894, "learning_rate": 9.838283843126395e-06, "loss": 0.0341, "step": 3620 }, { "epoch": 0.016216288658872723, "grad_norm": 5.8153847959374616e-08, "learning_rate": 9.837837113411273e-06, "loss": 0.778, "step": 3630 }, { "epoch": 0.01626096163038477, "grad_norm": 0.0023709749802947044, "learning_rate": 9.837390383696153e-06, "loss": 0.0, "step": 3640 }, { "epoch": 0.016305634601896813, "grad_norm": 2.2431919603077555e-12, "learning_rate": 9.836943653981033e-06, "loss": 0.0002, "step": 3650 }, { "epoch": 0.01635030757340886, "grad_norm": 1.8902798891067505, "learning_rate": 9.836496924265913e-06, "loss": 0.3081, "step": 3660 }, { "epoch": 0.016394980544920907, "grad_norm": 7.196881979254499e-10, "learning_rate": 9.836050194550792e-06, "loss": 0.0, "step": 3670 }, { "epoch": 0.016439653516432954, "grad_norm": 1.9319632053375244, "learning_rate": 9.835603464835672e-06, "loss": 0.0006, "step": 3680 }, { "epoch": 0.016484326487944997, "grad_norm": 6.603991550946375e-06, "learning_rate": 9.835156735120552e-06, "loss": 0.0001, "step": 3690 }, { "epoch": 0.016528999459457044, "grad_norm": 1.0397147552632102e-12, "learning_rate": 9.83471000540543e-06, "loss": 0.0016, "step": 3700 }, { "epoch": 0.01657367243096909, "grad_norm": 6.637133651565819e-07, "learning_rate": 9.83426327569031e-06, "loss": 0.0, "step": 3710 }, { "epoch": 0.01661834540248114, "grad_norm": 2.6404313757666387e-07, "learning_rate": 9.83381654597519e-06, "loss": 0.0023, "step": 3720 }, { "epoch": 0.016663018373993182, "grad_norm": 5.428249642136507e-05, "learning_rate": 9.833369816260069e-06, "loss": 0.0003, "step": 3730 }, { "epoch": 0.01670769134550523, "grad_norm": 1.8128656620319816e-07, "learning_rate": 9.83292308654495e-06, "loss": 0.9128, "step": 3740 }, { "epoch": 0.016752364317017276, "grad_norm": 1.0186456165683921e-05, "learning_rate": 9.832476356829828e-06, "loss": 0.0, "step": 3750 }, { "epoch": 0.016797037288529323, "grad_norm": 1.559259033001581e-07, "learning_rate": 9.832029627114708e-06, "loss": 0.0231, "step": 3760 }, { "epoch": 0.016841710260041366, "grad_norm": 7.261510745593114e-06, "learning_rate": 9.831582897399588e-06, "loss": 0.0, "step": 3770 }, { "epoch": 0.016886383231553413, "grad_norm": 4.436628557868971e-07, "learning_rate": 9.831136167684466e-06, "loss": 0.0013, "step": 3780 }, { "epoch": 0.01693105620306546, "grad_norm": 1.7237377166748047, "learning_rate": 9.830689437969346e-06, "loss": 0.0007, "step": 3790 }, { "epoch": 0.016975729174577504, "grad_norm": 0.7090981006622314, "learning_rate": 9.830242708254227e-06, "loss": 0.0003, "step": 3800 }, { "epoch": 0.01702040214608955, "grad_norm": 2.859711685232469e-07, "learning_rate": 9.829795978539105e-06, "loss": 0.0, "step": 3810 }, { "epoch": 0.017065075117601598, "grad_norm": 4.5942306314827874e-05, "learning_rate": 9.829349248823985e-06, "loss": 0.2813, "step": 3820 }, { "epoch": 0.017109748089113645, "grad_norm": 0.026525896042585373, "learning_rate": 9.828902519108864e-06, "loss": 0.0, "step": 3830 }, { "epoch": 0.017154421060625688, "grad_norm": 173.7867889404297, "learning_rate": 9.828455789393744e-06, "loss": 0.0828, "step": 3840 }, { "epoch": 0.017199094032137735, "grad_norm": 0.0008094139629974961, "learning_rate": 9.828009059678624e-06, "loss": 0.0015, "step": 3850 }, { "epoch": 0.017243767003649782, "grad_norm": 2.362374473818818e-08, "learning_rate": 9.827562329963502e-06, "loss": 0.0202, "step": 3860 }, { "epoch": 0.01728843997516183, "grad_norm": 1.1910659090952347e-11, "learning_rate": 9.827115600248382e-06, "loss": 0.0011, "step": 3870 }, { "epoch": 0.017333112946673872, "grad_norm": 0.009401158429682255, "learning_rate": 9.826668870533263e-06, "loss": 0.0, "step": 3880 }, { "epoch": 0.01737778591818592, "grad_norm": 8.214187552368912e-09, "learning_rate": 9.826222140818141e-06, "loss": 0.0, "step": 3890 }, { "epoch": 0.017422458889697966, "grad_norm": 5.333662556950003e-05, "learning_rate": 9.825775411103021e-06, "loss": 0.0004, "step": 3900 }, { "epoch": 0.017467131861210013, "grad_norm": 0.00015062151942402124, "learning_rate": 9.825328681387901e-06, "loss": 0.0001, "step": 3910 }, { "epoch": 0.017511804832722057, "grad_norm": 0.4469970762729645, "learning_rate": 9.82488195167278e-06, "loss": 0.0002, "step": 3920 }, { "epoch": 0.017556477804234104, "grad_norm": 0.12761425971984863, "learning_rate": 9.82443522195766e-06, "loss": 0.0, "step": 3930 }, { "epoch": 0.01760115077574615, "grad_norm": 3.445596308271348e-10, "learning_rate": 9.823988492242538e-06, "loss": 0.0021, "step": 3940 }, { "epoch": 0.017645823747258198, "grad_norm": 3.4900170930995955e-07, "learning_rate": 9.823541762527418e-06, "loss": 0.0, "step": 3950 }, { "epoch": 0.01769049671877024, "grad_norm": 526.72021484375, "learning_rate": 9.823095032812298e-06, "loss": 0.1069, "step": 3960 }, { "epoch": 0.017735169690282288, "grad_norm": 5.073938369750977, "learning_rate": 9.822648303097177e-06, "loss": 0.0014, "step": 3970 }, { "epoch": 0.017779842661794335, "grad_norm": 23.13395118713379, "learning_rate": 9.822201573382057e-06, "loss": 0.0081, "step": 3980 }, { "epoch": 0.017824515633306382, "grad_norm": 1.453740151191596e-05, "learning_rate": 9.821754843666937e-06, "loss": 0.0007, "step": 3990 }, { "epoch": 0.017869188604818426, "grad_norm": 4.718764781951904, "learning_rate": 9.821308113951816e-06, "loss": 0.0015, "step": 4000 }, { "epoch": 0.017913861576330473, "grad_norm": 7.012652611382286e-12, "learning_rate": 9.820861384236696e-06, "loss": 0.0005, "step": 4010 }, { "epoch": 0.01795853454784252, "grad_norm": 9.016378409946318e-14, "learning_rate": 9.820414654521576e-06, "loss": 0.3409, "step": 4020 }, { "epoch": 0.018003207519354567, "grad_norm": 0.34797075390815735, "learning_rate": 9.819967924806454e-06, "loss": 0.0013, "step": 4030 }, { "epoch": 0.01804788049086661, "grad_norm": 2.998768522388673e-08, "learning_rate": 9.819521195091334e-06, "loss": 0.0006, "step": 4040 }, { "epoch": 0.018092553462378657, "grad_norm": 0.0008473226334899664, "learning_rate": 9.819074465376215e-06, "loss": 0.0, "step": 4050 }, { "epoch": 0.018137226433890704, "grad_norm": 6.374268474163003e-11, "learning_rate": 9.818627735661093e-06, "loss": 0.0313, "step": 4060 }, { "epoch": 0.018181899405402747, "grad_norm": 3.1651177323510638e-06, "learning_rate": 9.818181005945973e-06, "loss": 0.0, "step": 4070 }, { "epoch": 0.018226572376914794, "grad_norm": 0.00011180033470736817, "learning_rate": 9.817734276230853e-06, "loss": 0.0004, "step": 4080 }, { "epoch": 0.01827124534842684, "grad_norm": 3.584693502034497e-07, "learning_rate": 9.817287546515732e-06, "loss": 0.0, "step": 4090 }, { "epoch": 0.01831591831993889, "grad_norm": 752.3006591796875, "learning_rate": 9.816840816800612e-06, "loss": 0.3661, "step": 4100 }, { "epoch": 0.018360591291450932, "grad_norm": 26.972646713256836, "learning_rate": 9.816394087085492e-06, "loss": 0.0867, "step": 4110 }, { "epoch": 0.01840526426296298, "grad_norm": 3.087233810608603e-12, "learning_rate": 9.815947357370372e-06, "loss": 0.0, "step": 4120 }, { "epoch": 0.018449937234475026, "grad_norm": 3.907925025248282e-11, "learning_rate": 9.81550062765525e-06, "loss": 0.004, "step": 4130 }, { "epoch": 0.018494610205987073, "grad_norm": 0.0007196891237981617, "learning_rate": 9.81505389794013e-06, "loss": 0.0, "step": 4140 }, { "epoch": 0.018539283177499116, "grad_norm": 2.1690328139811754e-05, "learning_rate": 9.81460716822501e-06, "loss": 0.0, "step": 4150 }, { "epoch": 0.018583956149011163, "grad_norm": 4.64138536632186e-12, "learning_rate": 9.814160438509889e-06, "loss": 0.0002, "step": 4160 }, { "epoch": 0.01862862912052321, "grad_norm": 3.0848708152770996, "learning_rate": 9.81371370879477e-06, "loss": 0.0008, "step": 4170 }, { "epoch": 0.018673302092035257, "grad_norm": 376.42767333984375, "learning_rate": 9.81326697907965e-06, "loss": 0.0408, "step": 4180 }, { "epoch": 0.0187179750635473, "grad_norm": 1.2392396950191426e-13, "learning_rate": 9.812820249364528e-06, "loss": 0.0003, "step": 4190 }, { "epoch": 0.018762648035059348, "grad_norm": 1.0111125448020175e-06, "learning_rate": 9.812373519649408e-06, "loss": 0.0001, "step": 4200 }, { "epoch": 0.018807321006571395, "grad_norm": 1.8822409231233905e-07, "learning_rate": 9.811926789934288e-06, "loss": 0.0, "step": 4210 }, { "epoch": 0.01885199397808344, "grad_norm": 0.003501696279272437, "learning_rate": 9.811480060219166e-06, "loss": 0.0041, "step": 4220 }, { "epoch": 0.018896666949595485, "grad_norm": 5.014882844989188e-06, "learning_rate": 9.811033330504047e-06, "loss": 0.0025, "step": 4230 }, { "epoch": 0.018941339921107532, "grad_norm": 98.7721939086914, "learning_rate": 9.810586600788925e-06, "loss": 0.0182, "step": 4240 }, { "epoch": 0.01898601289261958, "grad_norm": 1.2174330549896695e-05, "learning_rate": 9.810139871073805e-06, "loss": 0.0, "step": 4250 }, { "epoch": 0.019030685864131626, "grad_norm": 0.00010244682925986126, "learning_rate": 9.809693141358685e-06, "loss": 0.0, "step": 4260 }, { "epoch": 0.01907535883564367, "grad_norm": 21.687419891357422, "learning_rate": 9.809246411643564e-06, "loss": 0.0187, "step": 4270 }, { "epoch": 0.019120031807155716, "grad_norm": 1.5209188075626656e-10, "learning_rate": 9.808799681928444e-06, "loss": 0.0134, "step": 4280 }, { "epoch": 0.019164704778667763, "grad_norm": 8.382775029680545e-10, "learning_rate": 9.808352952213324e-06, "loss": 0.0252, "step": 4290 }, { "epoch": 0.01920937775017981, "grad_norm": 1.1792429961124049e-14, "learning_rate": 9.807906222498202e-06, "loss": 0.0004, "step": 4300 }, { "epoch": 0.019254050721691854, "grad_norm": 0.051694102585315704, "learning_rate": 9.807459492783083e-06, "loss": 0.0, "step": 4310 }, { "epoch": 0.0192987236932039, "grad_norm": 7.386233291661881e-11, "learning_rate": 9.807012763067961e-06, "loss": 0.0001, "step": 4320 }, { "epoch": 0.019343396664715948, "grad_norm": 2.1222596596759402e-11, "learning_rate": 9.806566033352841e-06, "loss": 0.0001, "step": 4330 }, { "epoch": 0.01938806963622799, "grad_norm": 1.083875877938567e-09, "learning_rate": 9.806119303637721e-06, "loss": 0.0428, "step": 4340 }, { "epoch": 0.019432742607740038, "grad_norm": 2.3458659459407727e-09, "learning_rate": 9.8056725739226e-06, "loss": 0.0, "step": 4350 }, { "epoch": 0.019477415579252085, "grad_norm": 9.970278454837084e-14, "learning_rate": 9.80522584420748e-06, "loss": 0.0, "step": 4360 }, { "epoch": 0.019522088550764132, "grad_norm": 3.586633767760361e-10, "learning_rate": 9.80477911449236e-06, "loss": 0.6023, "step": 4370 }, { "epoch": 0.019566761522276176, "grad_norm": 0.00017252798716071993, "learning_rate": 9.804332384777238e-06, "loss": 0.0, "step": 4380 }, { "epoch": 0.019611434493788223, "grad_norm": 684.9390258789062, "learning_rate": 9.803885655062118e-06, "loss": 1.2474, "step": 4390 }, { "epoch": 0.01965610746530027, "grad_norm": 0.024622151628136635, "learning_rate": 9.803438925346999e-06, "loss": 0.0758, "step": 4400 }, { "epoch": 0.019700780436812317, "grad_norm": 52.290061950683594, "learning_rate": 9.802992195631877e-06, "loss": 0.0207, "step": 4410 }, { "epoch": 0.01974545340832436, "grad_norm": 0.05896748974919319, "learning_rate": 9.802545465916757e-06, "loss": 0.1313, "step": 4420 }, { "epoch": 0.019790126379836407, "grad_norm": 1.7239247540601355e-07, "learning_rate": 9.802098736201636e-06, "loss": 0.0758, "step": 4430 }, { "epoch": 0.019834799351348454, "grad_norm": 8.05067427801525e-18, "learning_rate": 9.801652006486516e-06, "loss": 0.0, "step": 4440 }, { "epoch": 0.0198794723228605, "grad_norm": 4.193351079424373e-20, "learning_rate": 9.801205276771396e-06, "loss": 0.0079, "step": 4450 }, { "epoch": 0.019924145294372544, "grad_norm": 8.296374654978567e-13, "learning_rate": 9.800758547056274e-06, "loss": 0.2641, "step": 4460 }, { "epoch": 0.01996881826588459, "grad_norm": 1.3925680377724348e-06, "learning_rate": 9.800311817341154e-06, "loss": 0.0, "step": 4470 }, { "epoch": 0.02001349123739664, "grad_norm": 86.02528381347656, "learning_rate": 9.799865087626034e-06, "loss": 0.0143, "step": 4480 }, { "epoch": 0.020058164208908685, "grad_norm": 51.72323989868164, "learning_rate": 9.799418357910913e-06, "loss": 0.0343, "step": 4490 }, { "epoch": 0.02010283718042073, "grad_norm": 1.158203190243512e-06, "learning_rate": 9.798971628195793e-06, "loss": 0.0001, "step": 4500 }, { "epoch": 0.020147510151932776, "grad_norm": 9.528105301276346e-13, "learning_rate": 9.798524898480673e-06, "loss": 0.0011, "step": 4510 }, { "epoch": 0.020192183123444823, "grad_norm": 4.934356638841564e-07, "learning_rate": 9.798078168765552e-06, "loss": 0.0003, "step": 4520 }, { "epoch": 0.02023685609495687, "grad_norm": 2.8366539478302, "learning_rate": 9.797631439050432e-06, "loss": 0.0005, "step": 4530 }, { "epoch": 0.020281529066468913, "grad_norm": 0.00037676902138628066, "learning_rate": 9.797184709335312e-06, "loss": 0.0031, "step": 4540 }, { "epoch": 0.02032620203798096, "grad_norm": 4.719042379086602e-10, "learning_rate": 9.79673797962019e-06, "loss": 0.008, "step": 4550 }, { "epoch": 0.020370875009493007, "grad_norm": 3.678450599675642e-11, "learning_rate": 9.79629124990507e-06, "loss": 0.0, "step": 4560 }, { "epoch": 0.020415547981005054, "grad_norm": 5.407119260780746e-06, "learning_rate": 9.79584452018995e-06, "loss": 0.0049, "step": 4570 }, { "epoch": 0.020460220952517098, "grad_norm": 1.160919282483519e-06, "learning_rate": 9.79539779047483e-06, "loss": 0.1253, "step": 4580 }, { "epoch": 0.020504893924029145, "grad_norm": 6.910381489433348e-05, "learning_rate": 9.794951060759709e-06, "loss": 0.0004, "step": 4590 }, { "epoch": 0.02054956689554119, "grad_norm": 0.00012307892029639333, "learning_rate": 9.79450433104459e-06, "loss": 0.0001, "step": 4600 }, { "epoch": 0.020594239867053235, "grad_norm": 1.16939194438892e-10, "learning_rate": 9.79405760132947e-06, "loss": 0.0483, "step": 4610 }, { "epoch": 0.020638912838565282, "grad_norm": 0.00010817075235536322, "learning_rate": 9.793610871614348e-06, "loss": 0.0134, "step": 4620 }, { "epoch": 0.02068358581007733, "grad_norm": 12.48432445526123, "learning_rate": 9.793164141899228e-06, "loss": 0.0017, "step": 4630 }, { "epoch": 0.020728258781589376, "grad_norm": 1.0494361646351535e-07, "learning_rate": 9.792717412184108e-06, "loss": 0.0, "step": 4640 }, { "epoch": 0.02077293175310142, "grad_norm": 4.475537718207079e-09, "learning_rate": 9.792270682468986e-06, "loss": 0.0, "step": 4650 }, { "epoch": 0.020817604724613466, "grad_norm": 0.003947664052248001, "learning_rate": 9.791823952753867e-06, "loss": 0.0007, "step": 4660 }, { "epoch": 0.020862277696125513, "grad_norm": 3.053561570265373e-10, "learning_rate": 9.791377223038747e-06, "loss": 0.0574, "step": 4670 }, { "epoch": 0.02090695066763756, "grad_norm": 0.3782574236392975, "learning_rate": 9.790930493323625e-06, "loss": 0.0002, "step": 4680 }, { "epoch": 0.020951623639149604, "grad_norm": 0.04846750944852829, "learning_rate": 9.790483763608505e-06, "loss": 0.6375, "step": 4690 }, { "epoch": 0.02099629661066165, "grad_norm": 0.0004883208894170821, "learning_rate": 9.790037033893385e-06, "loss": 0.0, "step": 4700 }, { "epoch": 0.021040969582173698, "grad_norm": 7.283672363161964e-13, "learning_rate": 9.789590304178264e-06, "loss": 0.0029, "step": 4710 }, { "epoch": 0.021085642553685745, "grad_norm": 5.044914264118461e-09, "learning_rate": 9.789143574463144e-06, "loss": 0.0357, "step": 4720 }, { "epoch": 0.021130315525197788, "grad_norm": 2.224873202338884e-10, "learning_rate": 9.788696844748022e-06, "loss": 0.003, "step": 4730 }, { "epoch": 0.021174988496709835, "grad_norm": 1.3932583216205285e-14, "learning_rate": 9.788250115032903e-06, "loss": 0.0004, "step": 4740 }, { "epoch": 0.021219661468221882, "grad_norm": 9.30095952512977e-13, "learning_rate": 9.787803385317783e-06, "loss": 0.0123, "step": 4750 }, { "epoch": 0.02126433443973393, "grad_norm": 0.00033034654916264117, "learning_rate": 9.787356655602661e-06, "loss": 0.0, "step": 4760 }, { "epoch": 0.021309007411245973, "grad_norm": 4.3573920513462205e-12, "learning_rate": 9.786909925887541e-06, "loss": 0.0, "step": 4770 }, { "epoch": 0.02135368038275802, "grad_norm": 0.025035222992300987, "learning_rate": 9.786463196172421e-06, "loss": 0.0, "step": 4780 }, { "epoch": 0.021398353354270067, "grad_norm": 7.293956105769439e-09, "learning_rate": 9.7860164664573e-06, "loss": 0.0001, "step": 4790 }, { "epoch": 0.021443026325782114, "grad_norm": 1.0322760035002057e-13, "learning_rate": 9.78556973674218e-06, "loss": 0.0, "step": 4800 }, { "epoch": 0.021487699297294157, "grad_norm": 7.134031321416601e-13, "learning_rate": 9.78512300702706e-06, "loss": 0.0018, "step": 4810 }, { "epoch": 0.021532372268806204, "grad_norm": 2.9583046853076667e-05, "learning_rate": 9.784676277311938e-06, "loss": 0.0, "step": 4820 }, { "epoch": 0.02157704524031825, "grad_norm": 1.6715544226553192e-10, "learning_rate": 9.784229547596819e-06, "loss": 0.1215, "step": 4830 }, { "epoch": 0.021621718211830298, "grad_norm": 0.0013644276186823845, "learning_rate": 9.783782817881697e-06, "loss": 0.0001, "step": 4840 }, { "epoch": 0.02166639118334234, "grad_norm": 1.2130669801990734e-09, "learning_rate": 9.783336088166577e-06, "loss": 0.0018, "step": 4850 }, { "epoch": 0.02171106415485439, "grad_norm": 1.6507818445532507e-09, "learning_rate": 9.782889358451457e-06, "loss": 0.0116, "step": 4860 }, { "epoch": 0.021755737126366435, "grad_norm": 8.015909602754334e-10, "learning_rate": 9.782442628736336e-06, "loss": 0.0001, "step": 4870 }, { "epoch": 0.02180041009787848, "grad_norm": 0.0021370230242609978, "learning_rate": 9.781995899021216e-06, "loss": 0.0002, "step": 4880 }, { "epoch": 0.021845083069390526, "grad_norm": 1.9856920590588658e-16, "learning_rate": 9.781549169306096e-06, "loss": 0.0, "step": 4890 }, { "epoch": 0.021889756040902573, "grad_norm": 7.834253807747354e-14, "learning_rate": 9.781102439590974e-06, "loss": 0.0, "step": 4900 }, { "epoch": 0.02193442901241462, "grad_norm": 1.1786222842147254e-07, "learning_rate": 9.780655709875854e-06, "loss": 0.0005, "step": 4910 }, { "epoch": 0.021979101983926663, "grad_norm": 3.896390199661255, "learning_rate": 9.780208980160733e-06, "loss": 0.0829, "step": 4920 }, { "epoch": 0.02202377495543871, "grad_norm": 1.0240219339152645e-08, "learning_rate": 9.779762250445613e-06, "loss": 0.0, "step": 4930 }, { "epoch": 0.022068447926950757, "grad_norm": 3.104096979456017e-09, "learning_rate": 9.779315520730493e-06, "loss": 0.0, "step": 4940 }, { "epoch": 0.022113120898462804, "grad_norm": 1.9251933736086357e-06, "learning_rate": 9.778868791015372e-06, "loss": 0.0002, "step": 4950 }, { "epoch": 0.022157793869974848, "grad_norm": 5.579614025919e-06, "learning_rate": 9.778422061300252e-06, "loss": 0.0003, "step": 4960 }, { "epoch": 0.022202466841486895, "grad_norm": 0.006300140172243118, "learning_rate": 9.777975331585132e-06, "loss": 0.0127, "step": 4970 }, { "epoch": 0.02224713981299894, "grad_norm": 1.7765872328823207e-08, "learning_rate": 9.77752860187001e-06, "loss": 0.0938, "step": 4980 }, { "epoch": 0.02229181278451099, "grad_norm": 7.047427061479539e-06, "learning_rate": 9.77708187215489e-06, "loss": 0.0, "step": 4990 }, { "epoch": 0.022336485756023032, "grad_norm": 0.0007222782005555928, "learning_rate": 9.77663514243977e-06, "loss": 0.0016, "step": 5000 }, { "epoch": 0.02238115872753508, "grad_norm": 2.581775269714184e-15, "learning_rate": 9.776188412724649e-06, "loss": 0.0226, "step": 5010 }, { "epoch": 0.022425831699047126, "grad_norm": 9.7462100256962e-07, "learning_rate": 9.775741683009529e-06, "loss": 0.0019, "step": 5020 }, { "epoch": 0.022470504670559173, "grad_norm": 0.04423893988132477, "learning_rate": 9.77529495329441e-06, "loss": 0.0, "step": 5030 }, { "epoch": 0.022515177642071216, "grad_norm": 6.549327736138366e-06, "learning_rate": 9.77484822357929e-06, "loss": 0.0, "step": 5040 }, { "epoch": 0.022559850613583263, "grad_norm": 4.1592920774213837e-10, "learning_rate": 9.774401493864168e-06, "loss": 0.0, "step": 5050 }, { "epoch": 0.02260452358509531, "grad_norm": 1.6939921687608717e-10, "learning_rate": 9.773954764149048e-06, "loss": 0.0009, "step": 5060 }, { "epoch": 0.022649196556607357, "grad_norm": 1.3087286561130895e-06, "learning_rate": 9.773508034433928e-06, "loss": 0.0451, "step": 5070 }, { "epoch": 0.0226938695281194, "grad_norm": 1.7216597938962686e-11, "learning_rate": 9.773061304718806e-06, "loss": 0.0005, "step": 5080 }, { "epoch": 0.022738542499631448, "grad_norm": 1.9706143383757535e-09, "learning_rate": 9.772614575003687e-06, "loss": 0.0003, "step": 5090 }, { "epoch": 0.022783215471143495, "grad_norm": 0.001979758031666279, "learning_rate": 9.772167845288567e-06, "loss": 0.0, "step": 5100 }, { "epoch": 0.02282788844265554, "grad_norm": 2.8083006782253506e-06, "learning_rate": 9.771721115573445e-06, "loss": 0.0003, "step": 5110 }, { "epoch": 0.022872561414167585, "grad_norm": 3.5032118717026606e-07, "learning_rate": 9.771274385858325e-06, "loss": 0.0005, "step": 5120 }, { "epoch": 0.022917234385679632, "grad_norm": 8.553373564978983e-10, "learning_rate": 9.770827656143205e-06, "loss": 0.0, "step": 5130 }, { "epoch": 0.02296190735719168, "grad_norm": 4.153927328312669e-17, "learning_rate": 9.770380926428084e-06, "loss": 0.0007, "step": 5140 }, { "epoch": 0.023006580328703723, "grad_norm": 6.216424944249788e-14, "learning_rate": 9.769934196712964e-06, "loss": 0.0003, "step": 5150 }, { "epoch": 0.02305125330021577, "grad_norm": 0.0009006602340377867, "learning_rate": 9.769487466997844e-06, "loss": 0.0, "step": 5160 }, { "epoch": 0.023095926271727817, "grad_norm": 0.04137864708900452, "learning_rate": 9.769040737282722e-06, "loss": 0.2688, "step": 5170 }, { "epoch": 0.023140599243239864, "grad_norm": 4.6671142699215157e-14, "learning_rate": 9.768594007567603e-06, "loss": 0.0005, "step": 5180 }, { "epoch": 0.023185272214751907, "grad_norm": 0.001031166291795671, "learning_rate": 9.768147277852483e-06, "loss": 0.0255, "step": 5190 }, { "epoch": 0.023229945186263954, "grad_norm": 5.5269151271249313e-17, "learning_rate": 9.767700548137361e-06, "loss": 0.0, "step": 5200 }, { "epoch": 0.023274618157776, "grad_norm": 4.2891007585865726e-12, "learning_rate": 9.767253818422241e-06, "loss": 0.1061, "step": 5210 }, { "epoch": 0.023319291129288048, "grad_norm": 3.626120985700254e-07, "learning_rate": 9.76680708870712e-06, "loss": 0.0, "step": 5220 }, { "epoch": 0.02336396410080009, "grad_norm": 7.203292540225448e-08, "learning_rate": 9.766360358992e-06, "loss": 0.6625, "step": 5230 }, { "epoch": 0.02340863707231214, "grad_norm": 4.1581765808373916e-10, "learning_rate": 9.76591362927688e-06, "loss": 0.0, "step": 5240 }, { "epoch": 0.023453310043824185, "grad_norm": 0.008564828895032406, "learning_rate": 9.765466899561758e-06, "loss": 0.0062, "step": 5250 }, { "epoch": 0.023497983015336232, "grad_norm": 0.0008313873549923301, "learning_rate": 9.765020169846639e-06, "loss": 0.0014, "step": 5260 }, { "epoch": 0.023542655986848276, "grad_norm": 2.6287691071047448e-05, "learning_rate": 9.764573440131519e-06, "loss": 0.0, "step": 5270 }, { "epoch": 0.023587328958360323, "grad_norm": 5.458162366522856e-09, "learning_rate": 9.764126710416397e-06, "loss": 0.0034, "step": 5280 }, { "epoch": 0.02363200192987237, "grad_norm": 2.566042810094732e-13, "learning_rate": 9.763679980701277e-06, "loss": 0.0, "step": 5290 }, { "epoch": 0.023676674901384417, "grad_norm": 1.3405983168013336e-07, "learning_rate": 9.763233250986157e-06, "loss": 0.0014, "step": 5300 }, { "epoch": 0.02372134787289646, "grad_norm": 2.0920136734048356e-09, "learning_rate": 9.762786521271036e-06, "loss": 0.0079, "step": 5310 }, { "epoch": 0.023766020844408507, "grad_norm": 2.4717706992305466e-08, "learning_rate": 9.762339791555916e-06, "loss": 0.0, "step": 5320 }, { "epoch": 0.023810693815920554, "grad_norm": 1.62759391741929e-07, "learning_rate": 9.761893061840794e-06, "loss": 0.0, "step": 5330 }, { "epoch": 0.0238553667874326, "grad_norm": 0.00256610126234591, "learning_rate": 9.761446332125674e-06, "loss": 0.0, "step": 5340 }, { "epoch": 0.023900039758944645, "grad_norm": 8.087974129011855e-05, "learning_rate": 9.760999602410555e-06, "loss": 0.0, "step": 5350 }, { "epoch": 0.02394471273045669, "grad_norm": 1.1636046672643813e-13, "learning_rate": 9.760552872695433e-06, "loss": 0.0002, "step": 5360 }, { "epoch": 0.02398938570196874, "grad_norm": 4.7322808865990496e-11, "learning_rate": 9.760106142980313e-06, "loss": 0.0003, "step": 5370 }, { "epoch": 0.024034058673480786, "grad_norm": 3.8606845009780955e-07, "learning_rate": 9.759659413265193e-06, "loss": 0.0016, "step": 5380 }, { "epoch": 0.02407873164499283, "grad_norm": 0.017653707414865494, "learning_rate": 9.759212683550072e-06, "loss": 0.0, "step": 5390 }, { "epoch": 0.024123404616504876, "grad_norm": 1.5549872841802426e-05, "learning_rate": 9.758765953834952e-06, "loss": 0.0, "step": 5400 }, { "epoch": 0.024168077588016923, "grad_norm": 0.2490101158618927, "learning_rate": 9.758319224119832e-06, "loss": 0.0615, "step": 5410 }, { "epoch": 0.024212750559528966, "grad_norm": 9.94989818536851e-07, "learning_rate": 9.75787249440471e-06, "loss": 0.0, "step": 5420 }, { "epoch": 0.024257423531041013, "grad_norm": 0.259600430727005, "learning_rate": 9.75742576468959e-06, "loss": 0.0575, "step": 5430 }, { "epoch": 0.02430209650255306, "grad_norm": 5.348607667876593e-10, "learning_rate": 9.756979034974469e-06, "loss": 0.0, "step": 5440 }, { "epoch": 0.024346769474065107, "grad_norm": 0.05933636799454689, "learning_rate": 9.756532305259349e-06, "loss": 0.0, "step": 5450 }, { "epoch": 0.02439144244557715, "grad_norm": 0.00012589334801305085, "learning_rate": 9.75608557554423e-06, "loss": 0.0002, "step": 5460 }, { "epoch": 0.024436115417089198, "grad_norm": 1.8503410609271853e-12, "learning_rate": 9.755638845829108e-06, "loss": 0.0, "step": 5470 }, { "epoch": 0.024480788388601245, "grad_norm": 0.014644503593444824, "learning_rate": 9.755192116113988e-06, "loss": 0.0014, "step": 5480 }, { "epoch": 0.024525461360113292, "grad_norm": 0.04739692062139511, "learning_rate": 9.754745386398868e-06, "loss": 0.0, "step": 5490 }, { "epoch": 0.024570134331625335, "grad_norm": 1.7644185845711036e-06, "learning_rate": 9.754298656683748e-06, "loss": 0.0, "step": 5500 }, { "epoch": 0.024614807303137382, "grad_norm": 0.0011657369323074818, "learning_rate": 9.753851926968626e-06, "loss": 0.0, "step": 5510 }, { "epoch": 0.02465948027464943, "grad_norm": 7.385870874067608e-18, "learning_rate": 9.753405197253507e-06, "loss": 0.0, "step": 5520 }, { "epoch": 0.024704153246161476, "grad_norm": 9.445318305267847e-09, "learning_rate": 9.752958467538387e-06, "loss": 0.4907, "step": 5530 }, { "epoch": 0.02474882621767352, "grad_norm": 2.0039067627486418e-11, "learning_rate": 9.752511737823265e-06, "loss": 0.0001, "step": 5540 }, { "epoch": 0.024793499189185567, "grad_norm": 1.243283948015872e-10, "learning_rate": 9.752065008108145e-06, "loss": 0.0948, "step": 5550 }, { "epoch": 0.024838172160697614, "grad_norm": 0.0030552446842193604, "learning_rate": 9.751618278393025e-06, "loss": 0.0, "step": 5560 }, { "epoch": 0.02488284513220966, "grad_norm": 21.20594596862793, "learning_rate": 9.751171548677904e-06, "loss": 0.0193, "step": 5570 }, { "epoch": 0.024927518103721704, "grad_norm": 7.996323221626245e-12, "learning_rate": 9.750724818962784e-06, "loss": 0.0002, "step": 5580 }, { "epoch": 0.02497219107523375, "grad_norm": 2.2440810099055852e-08, "learning_rate": 9.750278089247664e-06, "loss": 0.0, "step": 5590 }, { "epoch": 0.025016864046745798, "grad_norm": 8.148406777763739e-05, "learning_rate": 9.749831359532544e-06, "loss": 0.0119, "step": 5600 }, { "epoch": 0.025061537018257845, "grad_norm": 4.539280843568472e-12, "learning_rate": 9.749384629817423e-06, "loss": 0.0241, "step": 5610 }, { "epoch": 0.02510620998976989, "grad_norm": 2.1162943397712297e-08, "learning_rate": 9.748937900102303e-06, "loss": 0.0, "step": 5620 }, { "epoch": 0.025150882961281935, "grad_norm": 0.049990683794021606, "learning_rate": 9.748491170387181e-06, "loss": 0.0973, "step": 5630 }, { "epoch": 0.025195555932793982, "grad_norm": 1.8373774723912106e-11, "learning_rate": 9.748044440672061e-06, "loss": 0.0001, "step": 5640 }, { "epoch": 0.02524022890430603, "grad_norm": 6.4786890749812075e-18, "learning_rate": 9.747597710956941e-06, "loss": 0.0, "step": 5650 }, { "epoch": 0.025284901875818073, "grad_norm": 6.861407086944382e-08, "learning_rate": 9.74715098124182e-06, "loss": 0.0, "step": 5660 }, { "epoch": 0.02532957484733012, "grad_norm": 0.2536761164665222, "learning_rate": 9.7467042515267e-06, "loss": 0.0001, "step": 5670 }, { "epoch": 0.025374247818842167, "grad_norm": 6.602550983428955, "learning_rate": 9.74625752181158e-06, "loss": 0.0078, "step": 5680 }, { "epoch": 0.02541892079035421, "grad_norm": 1.3618071115217845e-09, "learning_rate": 9.745810792096459e-06, "loss": 0.0001, "step": 5690 }, { "epoch": 0.025463593761866257, "grad_norm": 3.1000786293589044e-07, "learning_rate": 9.745364062381339e-06, "loss": 0.0, "step": 5700 }, { "epoch": 0.025508266733378304, "grad_norm": 1.2080630540367565e-06, "learning_rate": 9.744917332666219e-06, "loss": 0.5865, "step": 5710 }, { "epoch": 0.02555293970489035, "grad_norm": 1.0851125807675999e-05, "learning_rate": 9.744470602951097e-06, "loss": 0.0005, "step": 5720 }, { "epoch": 0.025597612676402395, "grad_norm": 1.670174788159784e-05, "learning_rate": 9.744023873235977e-06, "loss": 0.0001, "step": 5730 }, { "epoch": 0.02564228564791444, "grad_norm": 1.4780018098292658e-08, "learning_rate": 9.743577143520856e-06, "loss": 0.0, "step": 5740 }, { "epoch": 0.02568695861942649, "grad_norm": 2.2874927646521215e-11, "learning_rate": 9.743130413805736e-06, "loss": 0.0089, "step": 5750 }, { "epoch": 0.025731631590938536, "grad_norm": 1.2863636129623046e-06, "learning_rate": 9.742683684090616e-06, "loss": 0.0, "step": 5760 }, { "epoch": 0.02577630456245058, "grad_norm": 9.311942267231643e-05, "learning_rate": 9.742236954375494e-06, "loss": 0.0001, "step": 5770 }, { "epoch": 0.025820977533962626, "grad_norm": 1.664363367126498e-07, "learning_rate": 9.741790224660375e-06, "loss": 0.2125, "step": 5780 }, { "epoch": 0.025865650505474673, "grad_norm": 1.3566239798201707e-10, "learning_rate": 9.741343494945255e-06, "loss": 0.7625, "step": 5790 }, { "epoch": 0.02591032347698672, "grad_norm": 1.458537894905021e-07, "learning_rate": 9.740896765230133e-06, "loss": 0.0, "step": 5800 }, { "epoch": 0.025954996448498763, "grad_norm": 5.571019937633537e-05, "learning_rate": 9.740450035515013e-06, "loss": 0.0002, "step": 5810 }, { "epoch": 0.02599966942001081, "grad_norm": 5.8732325669552665e-09, "learning_rate": 9.740003305799892e-06, "loss": 0.0523, "step": 5820 }, { "epoch": 0.026044342391522857, "grad_norm": 7.986272976268083e-07, "learning_rate": 9.739556576084772e-06, "loss": 0.0134, "step": 5830 }, { "epoch": 0.026089015363034904, "grad_norm": 2.1820655504711794e-08, "learning_rate": 9.739109846369652e-06, "loss": 0.0011, "step": 5840 }, { "epoch": 0.026133688334546948, "grad_norm": 0.3102538585662842, "learning_rate": 9.73866311665453e-06, "loss": 0.0001, "step": 5850 }, { "epoch": 0.026178361306058995, "grad_norm": 0.00025683510466478765, "learning_rate": 9.73821638693941e-06, "loss": 0.15, "step": 5860 }, { "epoch": 0.026223034277571042, "grad_norm": 1.50867836055113e-05, "learning_rate": 9.73776965722429e-06, "loss": 0.0, "step": 5870 }, { "epoch": 0.02626770724908309, "grad_norm": 0.005404121708124876, "learning_rate": 9.737322927509169e-06, "loss": 0.0002, "step": 5880 }, { "epoch": 0.026312380220595132, "grad_norm": 0.002749478444457054, "learning_rate": 9.73687619779405e-06, "loss": 0.0, "step": 5890 }, { "epoch": 0.02635705319210718, "grad_norm": 6.164871592773125e-05, "learning_rate": 9.73642946807893e-06, "loss": 0.0, "step": 5900 }, { "epoch": 0.026401726163619226, "grad_norm": 0.1678089052438736, "learning_rate": 9.735982738363808e-06, "loss": 0.0, "step": 5910 }, { "epoch": 0.026446399135131273, "grad_norm": 3.7444728206992295e-08, "learning_rate": 9.735536008648688e-06, "loss": 0.0143, "step": 5920 }, { "epoch": 0.026491072106643317, "grad_norm": 0.00017223106988240033, "learning_rate": 9.735089278933566e-06, "loss": 0.0, "step": 5930 }, { "epoch": 0.026535745078155364, "grad_norm": 1.9701846820652236e-09, "learning_rate": 9.734642549218446e-06, "loss": 0.0, "step": 5940 }, { "epoch": 0.02658041804966741, "grad_norm": 0.010152035392820835, "learning_rate": 9.734195819503327e-06, "loss": 0.2016, "step": 5950 }, { "epoch": 0.026625091021179454, "grad_norm": 3.5874224977305857e-06, "learning_rate": 9.733749089788207e-06, "loss": 0.035, "step": 5960 }, { "epoch": 0.0266697639926915, "grad_norm": 4.681384233983909e-15, "learning_rate": 9.733302360073085e-06, "loss": 0.0, "step": 5970 }, { "epoch": 0.026714436964203548, "grad_norm": 1.9966948912042426e-06, "learning_rate": 9.732855630357965e-06, "loss": 0.0002, "step": 5980 }, { "epoch": 0.026759109935715595, "grad_norm": 4.600369265972404e-06, "learning_rate": 9.732408900642845e-06, "loss": 0.0, "step": 5990 }, { "epoch": 0.02680378290722764, "grad_norm": 4.245321179041639e-06, "learning_rate": 9.731962170927724e-06, "loss": 0.0693, "step": 6000 }, { "epoch": 0.026848455878739685, "grad_norm": 6.09661583439447e-06, "learning_rate": 9.731515441212604e-06, "loss": 0.0049, "step": 6010 }, { "epoch": 0.026893128850251732, "grad_norm": 1.7047414075932465e-06, "learning_rate": 9.731068711497484e-06, "loss": 0.0004, "step": 6020 }, { "epoch": 0.02693780182176378, "grad_norm": 4.4253320452547484e-10, "learning_rate": 9.730621981782362e-06, "loss": 0.9938, "step": 6030 }, { "epoch": 0.026982474793275823, "grad_norm": 0.0007237186655402184, "learning_rate": 9.730175252067243e-06, "loss": 0.0, "step": 6040 }, { "epoch": 0.02702714776478787, "grad_norm": 0.001562108751386404, "learning_rate": 9.729728522352123e-06, "loss": 0.0004, "step": 6050 }, { "epoch": 0.027071820736299917, "grad_norm": 1.0216683769676796e-15, "learning_rate": 9.729281792637003e-06, "loss": 0.0574, "step": 6060 }, { "epoch": 0.027116493707811964, "grad_norm": 1.391899651093098e-10, "learning_rate": 9.728835062921881e-06, "loss": 0.0026, "step": 6070 }, { "epoch": 0.027161166679324007, "grad_norm": 0.054726582020521164, "learning_rate": 9.728388333206761e-06, "loss": 0.0014, "step": 6080 }, { "epoch": 0.027205839650836054, "grad_norm": 433.8678283691406, "learning_rate": 9.727941603491642e-06, "loss": 0.2813, "step": 6090 }, { "epoch": 0.0272505126223481, "grad_norm": 0.007730548270046711, "learning_rate": 9.72749487377652e-06, "loss": 0.0, "step": 6100 }, { "epoch": 0.027295185593860148, "grad_norm": 2.500319016499747e-13, "learning_rate": 9.7270481440614e-06, "loss": 0.0, "step": 6110 }, { "epoch": 0.02733985856537219, "grad_norm": 3.923983967618616e-12, "learning_rate": 9.726601414346279e-06, "loss": 0.0982, "step": 6120 }, { "epoch": 0.02738453153688424, "grad_norm": 3.476937490631826e-05, "learning_rate": 9.726154684631159e-06, "loss": 0.0114, "step": 6130 }, { "epoch": 0.027429204508396286, "grad_norm": 426.7615051269531, "learning_rate": 9.725707954916039e-06, "loss": 0.1055, "step": 6140 }, { "epoch": 0.027473877479908333, "grad_norm": 1.0694459056322689e-15, "learning_rate": 9.725261225200917e-06, "loss": 0.0691, "step": 6150 }, { "epoch": 0.027518550451420376, "grad_norm": 2.8464754997514774e-09, "learning_rate": 9.724814495485797e-06, "loss": 0.0004, "step": 6160 }, { "epoch": 0.027563223422932423, "grad_norm": 1.008872207997058e-09, "learning_rate": 9.724367765770677e-06, "loss": 0.0006, "step": 6170 }, { "epoch": 0.02760789639444447, "grad_norm": 3.2404907750194223e-12, "learning_rate": 9.723921036055556e-06, "loss": 0.0003, "step": 6180 }, { "epoch": 0.027652569365956517, "grad_norm": 4.715409052807873e-12, "learning_rate": 9.723474306340436e-06, "loss": 0.0222, "step": 6190 }, { "epoch": 0.02769724233746856, "grad_norm": 6.288278816928006e-13, "learning_rate": 9.723027576625316e-06, "loss": 0.0003, "step": 6200 }, { "epoch": 0.027741915308980607, "grad_norm": 1.877140675787814e-06, "learning_rate": 9.722580846910195e-06, "loss": 0.0001, "step": 6210 }, { "epoch": 0.027786588280492654, "grad_norm": 190.27633666992188, "learning_rate": 9.722134117195075e-06, "loss": 0.0348, "step": 6220 }, { "epoch": 0.0278312612520047, "grad_norm": 2.001493015768574e-07, "learning_rate": 9.721687387479953e-06, "loss": 0.0, "step": 6230 }, { "epoch": 0.027875934223516745, "grad_norm": 5.368600355937225e-11, "learning_rate": 9.721240657764833e-06, "loss": 0.0209, "step": 6240 }, { "epoch": 0.027920607195028792, "grad_norm": 0.00173946691211313, "learning_rate": 9.720793928049713e-06, "loss": 0.0003, "step": 6250 }, { "epoch": 0.02796528016654084, "grad_norm": 0.006585344672203064, "learning_rate": 9.720347198334592e-06, "loss": 0.5363, "step": 6260 }, { "epoch": 0.028009953138052882, "grad_norm": 0.013094873167574406, "learning_rate": 9.719900468619472e-06, "loss": 0.0, "step": 6270 }, { "epoch": 0.02805462610956493, "grad_norm": 0.002456572838127613, "learning_rate": 9.719453738904352e-06, "loss": 0.0, "step": 6280 }, { "epoch": 0.028099299081076976, "grad_norm": 1.4949612658909928e-09, "learning_rate": 9.71900700918923e-06, "loss": 0.615, "step": 6290 }, { "epoch": 0.028143972052589023, "grad_norm": 6.688063791671084e-08, "learning_rate": 9.71856027947411e-06, "loss": 0.0006, "step": 6300 }, { "epoch": 0.028188645024101067, "grad_norm": 0.0004353579424787313, "learning_rate": 9.718113549758989e-06, "loss": 0.0001, "step": 6310 }, { "epoch": 0.028233317995613114, "grad_norm": 0.00210509798489511, "learning_rate": 9.71766682004387e-06, "loss": 0.0, "step": 6320 }, { "epoch": 0.02827799096712516, "grad_norm": 9.322425285063218e-06, "learning_rate": 9.71722009032875e-06, "loss": 0.0, "step": 6330 }, { "epoch": 0.028322663938637208, "grad_norm": 5.350936044123955e-05, "learning_rate": 9.716773360613628e-06, "loss": 0.0, "step": 6340 }, { "epoch": 0.02836733691014925, "grad_norm": 3.720874630630533e-08, "learning_rate": 9.716326630898508e-06, "loss": 0.0, "step": 6350 }, { "epoch": 0.028412009881661298, "grad_norm": 33.51473617553711, "learning_rate": 9.715879901183388e-06, "loss": 0.0089, "step": 6360 }, { "epoch": 0.028456682853173345, "grad_norm": 7.0806942531476125e-09, "learning_rate": 9.715433171468266e-06, "loss": 0.0003, "step": 6370 }, { "epoch": 0.028501355824685392, "grad_norm": 0.007248702924698591, "learning_rate": 9.714986441753147e-06, "loss": 0.0, "step": 6380 }, { "epoch": 0.028546028796197435, "grad_norm": 1.2337777910431669e-08, "learning_rate": 9.714539712038027e-06, "loss": 0.0, "step": 6390 }, { "epoch": 0.028590701767709482, "grad_norm": 1.512154388250836e-13, "learning_rate": 9.714092982322905e-06, "loss": 0.0, "step": 6400 }, { "epoch": 0.02863537473922153, "grad_norm": 4.472500801086426, "learning_rate": 9.713646252607785e-06, "loss": 0.2022, "step": 6410 }, { "epoch": 0.028680047710733576, "grad_norm": 34.18728256225586, "learning_rate": 9.713199522892665e-06, "loss": 0.0081, "step": 6420 }, { "epoch": 0.02872472068224562, "grad_norm": 0.18630315363407135, "learning_rate": 9.712752793177544e-06, "loss": 0.0565, "step": 6430 }, { "epoch": 0.028769393653757667, "grad_norm": 2.7868030071258545, "learning_rate": 9.712306063462424e-06, "loss": 0.0005, "step": 6440 }, { "epoch": 0.028814066625269714, "grad_norm": 6.858974899159875e-08, "learning_rate": 9.711859333747304e-06, "loss": 0.0, "step": 6450 }, { "epoch": 0.02885873959678176, "grad_norm": 1.4235383434101773e-13, "learning_rate": 9.711412604032182e-06, "loss": 0.0575, "step": 6460 }, { "epoch": 0.028903412568293804, "grad_norm": 1.4090331124094746e-09, "learning_rate": 9.710965874317063e-06, "loss": 0.0, "step": 6470 }, { "epoch": 0.02894808553980585, "grad_norm": 2.0118551269376894e-10, "learning_rate": 9.710519144601943e-06, "loss": 1.243, "step": 6480 }, { "epoch": 0.028992758511317898, "grad_norm": 5.944572091520772e-13, "learning_rate": 9.710072414886821e-06, "loss": 0.0012, "step": 6490 }, { "epoch": 0.029037431482829945, "grad_norm": 1.8894002681846267e-11, "learning_rate": 9.709625685171701e-06, "loss": 0.0, "step": 6500 }, { "epoch": 0.02908210445434199, "grad_norm": 3.621100294615154e-11, "learning_rate": 9.709178955456581e-06, "loss": 0.0004, "step": 6510 }, { "epoch": 0.029126777425854036, "grad_norm": 1.755638117379751e-19, "learning_rate": 9.708732225741462e-06, "loss": 0.044, "step": 6520 }, { "epoch": 0.029171450397366083, "grad_norm": 2.4194950043465724e-08, "learning_rate": 9.70828549602634e-06, "loss": 0.0, "step": 6530 }, { "epoch": 0.029216123368878126, "grad_norm": 1.1459812299108307e-07, "learning_rate": 9.70783876631122e-06, "loss": 0.0001, "step": 6540 }, { "epoch": 0.029260796340390173, "grad_norm": 0.02885868027806282, "learning_rate": 9.7073920365961e-06, "loss": 0.0023, "step": 6550 }, { "epoch": 0.02930546931190222, "grad_norm": 3.1013897006104685e-10, "learning_rate": 9.706945306880979e-06, "loss": 0.0, "step": 6560 }, { "epoch": 0.029350142283414267, "grad_norm": 1.3184380254216421e-09, "learning_rate": 9.706498577165859e-06, "loss": 0.2098, "step": 6570 }, { "epoch": 0.02939481525492631, "grad_norm": 0.0006379721453413367, "learning_rate": 9.706051847450739e-06, "loss": 0.2688, "step": 6580 }, { "epoch": 0.029439488226438357, "grad_norm": 2.612810801050358e-10, "learning_rate": 9.705605117735617e-06, "loss": 0.0523, "step": 6590 }, { "epoch": 0.029484161197950404, "grad_norm": 0.006585241761058569, "learning_rate": 9.705158388020497e-06, "loss": 0.0, "step": 6600 }, { "epoch": 0.02952883416946245, "grad_norm": 3.1172658054856583e-05, "learning_rate": 9.704711658305376e-06, "loss": 0.0001, "step": 6610 }, { "epoch": 0.029573507140974495, "grad_norm": 1.5668823877268423e-11, "learning_rate": 9.704264928590256e-06, "loss": 0.0014, "step": 6620 }, { "epoch": 0.029618180112486542, "grad_norm": 9.767539449967444e-05, "learning_rate": 9.703818198875136e-06, "loss": 0.0113, "step": 6630 }, { "epoch": 0.02966285308399859, "grad_norm": 1.4886247299727984e-05, "learning_rate": 9.703371469160015e-06, "loss": 0.0001, "step": 6640 }, { "epoch": 0.029707526055510636, "grad_norm": 8.713771349443122e-13, "learning_rate": 9.702924739444895e-06, "loss": 0.0049, "step": 6650 }, { "epoch": 0.02975219902702268, "grad_norm": 0.00010709473281167448, "learning_rate": 9.702478009729775e-06, "loss": 0.071, "step": 6660 }, { "epoch": 0.029796871998534726, "grad_norm": 3.314019281219771e-08, "learning_rate": 9.702031280014653e-06, "loss": 0.0, "step": 6670 }, { "epoch": 0.029841544970046773, "grad_norm": 3.9084474545703074e-10, "learning_rate": 9.701584550299533e-06, "loss": 0.0062, "step": 6680 }, { "epoch": 0.02988621794155882, "grad_norm": 0.011260163970291615, "learning_rate": 9.701137820584413e-06, "loss": 0.0026, "step": 6690 }, { "epoch": 0.029930890913070864, "grad_norm": 5.194561936150421e-07, "learning_rate": 9.700691090869292e-06, "loss": 0.0, "step": 6700 }, { "epoch": 0.02997556388458291, "grad_norm": 2.75262749482863e-07, "learning_rate": 9.700244361154172e-06, "loss": 0.0, "step": 6710 }, { "epoch": 0.030020236856094958, "grad_norm": 3.266507701482624e-05, "learning_rate": 9.69979763143905e-06, "loss": 0.0, "step": 6720 }, { "epoch": 0.030064909827607005, "grad_norm": 0.0007317392737604678, "learning_rate": 9.69935090172393e-06, "loss": 0.0, "step": 6730 }, { "epoch": 0.030109582799119048, "grad_norm": 7.135286494985849e-08, "learning_rate": 9.69890417200881e-06, "loss": 0.0, "step": 6740 }, { "epoch": 0.030154255770631095, "grad_norm": 0.0005465031717903912, "learning_rate": 9.69845744229369e-06, "loss": 0.0128, "step": 6750 }, { "epoch": 0.030198928742143142, "grad_norm": 0.10797858983278275, "learning_rate": 9.69801071257857e-06, "loss": 0.45, "step": 6760 }, { "epoch": 0.03024360171365519, "grad_norm": 3.4820845127105713, "learning_rate": 9.69756398286345e-06, "loss": 0.002, "step": 6770 }, { "epoch": 0.030288274685167232, "grad_norm": 2.2667615481763337e-19, "learning_rate": 9.697117253148328e-06, "loss": 0.0005, "step": 6780 }, { "epoch": 0.03033294765667928, "grad_norm": 0.0035033426247537136, "learning_rate": 9.696670523433208e-06, "loss": 0.0049, "step": 6790 }, { "epoch": 0.030377620628191326, "grad_norm": 2.2869908633538216e-08, "learning_rate": 9.696223793718088e-06, "loss": 0.0018, "step": 6800 }, { "epoch": 0.03042229359970337, "grad_norm": 2.1959427613182925e-05, "learning_rate": 9.695777064002967e-06, "loss": 0.001, "step": 6810 }, { "epoch": 0.030466966571215417, "grad_norm": 3.3809324229139293e-09, "learning_rate": 9.695330334287847e-06, "loss": 0.7331, "step": 6820 }, { "epoch": 0.030511639542727464, "grad_norm": 0.0002111146313836798, "learning_rate": 9.694883604572725e-06, "loss": 0.0, "step": 6830 }, { "epoch": 0.03055631251423951, "grad_norm": 6.2108989826170635e-12, "learning_rate": 9.694436874857605e-06, "loss": 0.0038, "step": 6840 }, { "epoch": 0.030600985485751554, "grad_norm": 1.232419729232788, "learning_rate": 9.693990145142485e-06, "loss": 0.4186, "step": 6850 }, { "epoch": 0.0306456584572636, "grad_norm": 0.03285527974367142, "learning_rate": 9.693543415427364e-06, "loss": 0.0, "step": 6860 }, { "epoch": 0.030690331428775648, "grad_norm": 1.7257926858416095e-12, "learning_rate": 9.693096685712244e-06, "loss": 0.0, "step": 6870 }, { "epoch": 0.030735004400287695, "grad_norm": 3.952288596766919e-14, "learning_rate": 9.692649955997124e-06, "loss": 0.0, "step": 6880 }, { "epoch": 0.03077967737179974, "grad_norm": 1.4386819372543685e-13, "learning_rate": 9.692203226282002e-06, "loss": 0.0009, "step": 6890 }, { "epoch": 0.030824350343311786, "grad_norm": 2.87882137298584, "learning_rate": 9.691756496566883e-06, "loss": 0.1736, "step": 6900 }, { "epoch": 0.030869023314823833, "grad_norm": 2.567782653173367e-10, "learning_rate": 9.691309766851763e-06, "loss": 0.0, "step": 6910 }, { "epoch": 0.03091369628633588, "grad_norm": 8.343608470795516e-09, "learning_rate": 9.690863037136641e-06, "loss": 0.0018, "step": 6920 }, { "epoch": 0.030958369257847923, "grad_norm": 4.9662394523620605, "learning_rate": 9.690416307421521e-06, "loss": 0.0011, "step": 6930 }, { "epoch": 0.03100304222935997, "grad_norm": 0.02405603602528572, "learning_rate": 9.689969577706401e-06, "loss": 0.0001, "step": 6940 }, { "epoch": 0.031047715200872017, "grad_norm": 9.02886768017197e-07, "learning_rate": 9.68952284799128e-06, "loss": 0.0, "step": 6950 }, { "epoch": 0.031092388172384064, "grad_norm": 655.1307983398438, "learning_rate": 9.68907611827616e-06, "loss": 0.1703, "step": 6960 }, { "epoch": 0.031137061143896107, "grad_norm": 7.539566259661431e-16, "learning_rate": 9.68862938856104e-06, "loss": 0.0001, "step": 6970 }, { "epoch": 0.031181734115408154, "grad_norm": 1.45284036537352e-11, "learning_rate": 9.68818265884592e-06, "loss": 0.0113, "step": 6980 }, { "epoch": 0.0312264070869202, "grad_norm": 0.020145660266280174, "learning_rate": 9.687735929130799e-06, "loss": 1.7126, "step": 6990 }, { "epoch": 0.031271080058432245, "grad_norm": 4.409622000878244e-09, "learning_rate": 9.687289199415679e-06, "loss": 0.0, "step": 7000 }, { "epoch": 0.03131575302994429, "grad_norm": 2.2381243525160244e-06, "learning_rate": 9.686842469700559e-06, "loss": 0.0, "step": 7010 }, { "epoch": 0.03136042600145634, "grad_norm": 8.434130336354428e-07, "learning_rate": 9.686395739985437e-06, "loss": 0.0027, "step": 7020 }, { "epoch": 0.031405098972968386, "grad_norm": 6.848535627490548e-11, "learning_rate": 9.685949010270317e-06, "loss": 0.0023, "step": 7030 }, { "epoch": 0.03144977194448043, "grad_norm": 0.00046889035729691386, "learning_rate": 9.685502280555198e-06, "loss": 0.0, "step": 7040 }, { "epoch": 0.03149444491599248, "grad_norm": 4.915896655672755e-14, "learning_rate": 9.685055550840076e-06, "loss": 0.0, "step": 7050 }, { "epoch": 0.03153911788750452, "grad_norm": 0.000905799912288785, "learning_rate": 9.684608821124956e-06, "loss": 0.1114, "step": 7060 }, { "epoch": 0.03158379085901657, "grad_norm": 8.679131507873535, "learning_rate": 9.684162091409836e-06, "loss": 0.0016, "step": 7070 }, { "epoch": 0.031628463830528614, "grad_norm": 0.00016720079293008894, "learning_rate": 9.683715361694715e-06, "loss": 0.0001, "step": 7080 }, { "epoch": 0.03167313680204066, "grad_norm": 9.629166015656665e-06, "learning_rate": 9.683268631979595e-06, "loss": 0.0313, "step": 7090 }, { "epoch": 0.03171780977355271, "grad_norm": 1.7250875316676684e-05, "learning_rate": 9.682821902264475e-06, "loss": 0.0281, "step": 7100 }, { "epoch": 0.031762482745064755, "grad_norm": 4.952021370741022e-08, "learning_rate": 9.682375172549353e-06, "loss": 0.0023, "step": 7110 }, { "epoch": 0.0318071557165768, "grad_norm": 2.3499190149323113e-08, "learning_rate": 9.681928442834233e-06, "loss": 0.4156, "step": 7120 }, { "epoch": 0.03185182868808885, "grad_norm": 9.509724563372401e-09, "learning_rate": 9.681481713119112e-06, "loss": 0.0, "step": 7130 }, { "epoch": 0.03189650165960089, "grad_norm": 6.2214312492869794e-06, "learning_rate": 9.681034983403992e-06, "loss": 0.0, "step": 7140 }, { "epoch": 0.031941174631112935, "grad_norm": 0.1611732840538025, "learning_rate": 9.680588253688872e-06, "loss": 0.0, "step": 7150 }, { "epoch": 0.03198584760262498, "grad_norm": 1.358881404752918e-14, "learning_rate": 9.68014152397375e-06, "loss": 0.0001, "step": 7160 }, { "epoch": 0.03203052057413703, "grad_norm": 2.607118965158861e-09, "learning_rate": 9.67969479425863e-06, "loss": 0.0, "step": 7170 }, { "epoch": 0.032075193545649076, "grad_norm": 7.82560187441092e-14, "learning_rate": 9.67924806454351e-06, "loss": 0.0, "step": 7180 }, { "epoch": 0.03211986651716112, "grad_norm": 3.4912116007035365e-06, "learning_rate": 9.67880133482839e-06, "loss": 0.0009, "step": 7190 }, { "epoch": 0.03216453948867317, "grad_norm": 4.0497349939640215e-13, "learning_rate": 9.67835460511327e-06, "loss": 0.0011, "step": 7200 }, { "epoch": 0.03220921246018522, "grad_norm": 3.911545254364768e-10, "learning_rate": 9.677907875398148e-06, "loss": 0.0001, "step": 7210 }, { "epoch": 0.03225388543169726, "grad_norm": 6.038759181592468e-08, "learning_rate": 9.677461145683028e-06, "loss": 0.0, "step": 7220 }, { "epoch": 0.032298558403209304, "grad_norm": 2.0243858500634815e-07, "learning_rate": 9.677014415967908e-06, "loss": 0.0, "step": 7230 }, { "epoch": 0.03234323137472135, "grad_norm": 501.6820373535156, "learning_rate": 9.676567686252787e-06, "loss": 0.3256, "step": 7240 }, { "epoch": 0.0323879043462334, "grad_norm": 9.651954542277963e-07, "learning_rate": 9.676120956537667e-06, "loss": 0.0001, "step": 7250 }, { "epoch": 0.032432577317745445, "grad_norm": 5.384000604830193e-14, "learning_rate": 9.675674226822547e-06, "loss": 0.0002, "step": 7260 }, { "epoch": 0.03247725028925749, "grad_norm": 1.508151416251824e-11, "learning_rate": 9.675227497107425e-06, "loss": 0.0012, "step": 7270 }, { "epoch": 0.03252192326076954, "grad_norm": 7.18380704315541e-14, "learning_rate": 9.674780767392305e-06, "loss": 0.0, "step": 7280 }, { "epoch": 0.032566596232281586, "grad_norm": 4.494653239817126e-06, "learning_rate": 9.674334037677185e-06, "loss": 0.0, "step": 7290 }, { "epoch": 0.032611269203793626, "grad_norm": 1.3249834864836885e-06, "learning_rate": 9.673887307962064e-06, "loss": 0.0466, "step": 7300 }, { "epoch": 0.03265594217530567, "grad_norm": 2.338749647140503, "learning_rate": 9.673440578246944e-06, "loss": 0.0005, "step": 7310 }, { "epoch": 0.03270061514681772, "grad_norm": 4.7304103315643983e-14, "learning_rate": 9.672993848531822e-06, "loss": 0.0006, "step": 7320 }, { "epoch": 0.03274528811832977, "grad_norm": 1.2597056210650326e-11, "learning_rate": 9.672547118816703e-06, "loss": 0.3047, "step": 7330 }, { "epoch": 0.032789961089841814, "grad_norm": 8.864343178753564e-13, "learning_rate": 9.672100389101583e-06, "loss": 0.0, "step": 7340 }, { "epoch": 0.03283463406135386, "grad_norm": 4.640969683208929e-13, "learning_rate": 9.671653659386461e-06, "loss": 0.0, "step": 7350 }, { "epoch": 0.03287930703286591, "grad_norm": 1.1632348730084008e-11, "learning_rate": 9.671206929671341e-06, "loss": 0.0566, "step": 7360 }, { "epoch": 0.03292398000437795, "grad_norm": 1.163630145128991e-06, "learning_rate": 9.670760199956221e-06, "loss": 0.0387, "step": 7370 }, { "epoch": 0.032968652975889995, "grad_norm": 2.2575783020784002e-07, "learning_rate": 9.6703134702411e-06, "loss": 0.0006, "step": 7380 }, { "epoch": 0.03301332594740204, "grad_norm": 0.007182363886386156, "learning_rate": 9.66986674052598e-06, "loss": 1.0463, "step": 7390 }, { "epoch": 0.03305799891891409, "grad_norm": 20.120473861694336, "learning_rate": 9.66942001081086e-06, "loss": 0.0036, "step": 7400 }, { "epoch": 0.033102671890426136, "grad_norm": 1.0165902786241077e-08, "learning_rate": 9.668973281095738e-06, "loss": 0.0, "step": 7410 }, { "epoch": 0.03314734486193818, "grad_norm": 2.1038341424875484e-13, "learning_rate": 9.668526551380619e-06, "loss": 0.0, "step": 7420 }, { "epoch": 0.03319201783345023, "grad_norm": 1.064471462086658e-06, "learning_rate": 9.668079821665499e-06, "loss": 0.0, "step": 7430 }, { "epoch": 0.03323669080496228, "grad_norm": 5.4925614295479874e-14, "learning_rate": 9.667633091950379e-06, "loss": 0.0, "step": 7440 }, { "epoch": 0.03328136377647432, "grad_norm": 1.8044505776734776e-12, "learning_rate": 9.667186362235257e-06, "loss": 0.0006, "step": 7450 }, { "epoch": 0.033326036747986364, "grad_norm": 7.88182262567716e-07, "learning_rate": 9.666739632520137e-06, "loss": 0.0, "step": 7460 }, { "epoch": 0.03337070971949841, "grad_norm": 7.243825734803977e-07, "learning_rate": 9.666292902805018e-06, "loss": 0.0001, "step": 7470 }, { "epoch": 0.03341538269101046, "grad_norm": 7.187257779150968e-06, "learning_rate": 9.665846173089896e-06, "loss": 0.2984, "step": 7480 }, { "epoch": 0.033460055662522505, "grad_norm": 0.23065359890460968, "learning_rate": 9.665399443374776e-06, "loss": 0.0002, "step": 7490 }, { "epoch": 0.03350472863403455, "grad_norm": 0.00021304815891198814, "learning_rate": 9.664952713659656e-06, "loss": 0.01, "step": 7500 }, { "epoch": 0.0335494016055466, "grad_norm": 1.3281003900800226e-12, "learning_rate": 9.664505983944535e-06, "loss": 0.0824, "step": 7510 }, { "epoch": 0.033594074577058645, "grad_norm": 3.0334553770793027e-14, "learning_rate": 9.664059254229415e-06, "loss": 0.0016, "step": 7520 }, { "epoch": 0.033638747548570685, "grad_norm": 4.896755490335636e-05, "learning_rate": 9.663612524514295e-06, "loss": 0.0, "step": 7530 }, { "epoch": 0.03368342052008273, "grad_norm": 0.00037503300700336695, "learning_rate": 9.663165794799173e-06, "loss": 0.0001, "step": 7540 }, { "epoch": 0.03372809349159478, "grad_norm": 8.310901778243746e-15, "learning_rate": 9.662719065084053e-06, "loss": 0.0049, "step": 7550 }, { "epoch": 0.033772766463106826, "grad_norm": 2.1257954358588904e-05, "learning_rate": 9.662272335368934e-06, "loss": 0.0001, "step": 7560 }, { "epoch": 0.03381743943461887, "grad_norm": 0.00030382740078493953, "learning_rate": 9.661825605653812e-06, "loss": 0.0, "step": 7570 }, { "epoch": 0.03386211240613092, "grad_norm": 2.665811393853801e-08, "learning_rate": 9.661378875938692e-06, "loss": 0.0003, "step": 7580 }, { "epoch": 0.03390678537764297, "grad_norm": 17.635639190673828, "learning_rate": 9.660932146223572e-06, "loss": 0.0023, "step": 7590 }, { "epoch": 0.03395145834915501, "grad_norm": 2.347639194155704e-09, "learning_rate": 9.66048541650845e-06, "loss": 0.0001, "step": 7600 }, { "epoch": 0.033996131320667054, "grad_norm": 25.22496223449707, "learning_rate": 9.66003868679333e-06, "loss": 0.0054, "step": 7610 }, { "epoch": 0.0340408042921791, "grad_norm": 0.02057386375963688, "learning_rate": 9.65959195707821e-06, "loss": 0.0113, "step": 7620 }, { "epoch": 0.03408547726369115, "grad_norm": 1123.7857666015625, "learning_rate": 9.65914522736309e-06, "loss": 0.8286, "step": 7630 }, { "epoch": 0.034130150235203195, "grad_norm": 3.706858005091259e-15, "learning_rate": 9.65869849764797e-06, "loss": 0.018, "step": 7640 }, { "epoch": 0.03417482320671524, "grad_norm": 2.1704324806599146e-14, "learning_rate": 9.658251767932848e-06, "loss": 0.0, "step": 7650 }, { "epoch": 0.03421949617822729, "grad_norm": 7.746194023638964e-05, "learning_rate": 9.657805038217728e-06, "loss": 0.0, "step": 7660 }, { "epoch": 0.034264169149739336, "grad_norm": 3.4090260214725276e-07, "learning_rate": 9.657358308502608e-06, "loss": 0.0, "step": 7670 }, { "epoch": 0.034308842121251376, "grad_norm": 85.19580841064453, "learning_rate": 9.656911578787487e-06, "loss": 0.0253, "step": 7680 }, { "epoch": 0.03435351509276342, "grad_norm": 0.00017949531320482492, "learning_rate": 9.656464849072367e-06, "loss": 0.0, "step": 7690 }, { "epoch": 0.03439818806427547, "grad_norm": 2.522860813769512e-05, "learning_rate": 9.656018119357245e-06, "loss": 0.0001, "step": 7700 }, { "epoch": 0.03444286103578752, "grad_norm": 0.0251141469925642, "learning_rate": 9.655571389642125e-06, "loss": 0.6876, "step": 7710 }, { "epoch": 0.034487534007299564, "grad_norm": 0.000446300080511719, "learning_rate": 9.655124659927005e-06, "loss": 0.0, "step": 7720 }, { "epoch": 0.03453220697881161, "grad_norm": 3.106126467145032e-11, "learning_rate": 9.654677930211884e-06, "loss": 0.0, "step": 7730 }, { "epoch": 0.03457687995032366, "grad_norm": 7.441053458023816e-05, "learning_rate": 9.654231200496764e-06, "loss": 0.0016, "step": 7740 }, { "epoch": 0.034621552921835705, "grad_norm": 4.7033314742872534e-11, "learning_rate": 9.653784470781644e-06, "loss": 0.0001, "step": 7750 }, { "epoch": 0.034666225893347745, "grad_norm": 1.9171542589901946e-05, "learning_rate": 9.653337741066523e-06, "loss": 0.0001, "step": 7760 }, { "epoch": 0.03471089886485979, "grad_norm": 11.328216552734375, "learning_rate": 9.652891011351403e-06, "loss": 0.022, "step": 7770 }, { "epoch": 0.03475557183637184, "grad_norm": 1.524915471673438e-10, "learning_rate": 9.652444281636283e-06, "loss": 0.0, "step": 7780 }, { "epoch": 0.034800244807883886, "grad_norm": 8.888397830253414e-12, "learning_rate": 9.651997551921161e-06, "loss": 0.0, "step": 7790 }, { "epoch": 0.03484491777939593, "grad_norm": 2.0420384316821583e-06, "learning_rate": 9.651550822206041e-06, "loss": 0.0009, "step": 7800 }, { "epoch": 0.03488959075090798, "grad_norm": 0.04410456120967865, "learning_rate": 9.65110409249092e-06, "loss": 0.0035, "step": 7810 }, { "epoch": 0.03493426372242003, "grad_norm": 2.595260378129849e-12, "learning_rate": 9.6506573627758e-06, "loss": 0.0, "step": 7820 }, { "epoch": 0.034978936693932074, "grad_norm": 1.2043649133609596e-15, "learning_rate": 9.65021063306068e-06, "loss": 0.0, "step": 7830 }, { "epoch": 0.035023609665444114, "grad_norm": 2.8324453893446844e-09, "learning_rate": 9.649763903345558e-06, "loss": 0.0, "step": 7840 }, { "epoch": 0.03506828263695616, "grad_norm": 6.378841703735816e-07, "learning_rate": 9.649317173630439e-06, "loss": 0.0, "step": 7850 }, { "epoch": 0.03511295560846821, "grad_norm": 1.2266521132175967e-10, "learning_rate": 9.648870443915319e-06, "loss": 0.0041, "step": 7860 }, { "epoch": 0.035157628579980255, "grad_norm": 2.6962340780173344e-12, "learning_rate": 9.648423714200197e-06, "loss": 0.0, "step": 7870 }, { "epoch": 0.0352023015514923, "grad_norm": 6.922159373479175e-11, "learning_rate": 9.647976984485077e-06, "loss": 0.0, "step": 7880 }, { "epoch": 0.03524697452300435, "grad_norm": 1.943918137570222e-09, "learning_rate": 9.647530254769957e-06, "loss": 0.1805, "step": 7890 }, { "epoch": 0.035291647494516395, "grad_norm": 8.879051165422425e-06, "learning_rate": 9.647083525054838e-06, "loss": 0.0001, "step": 7900 }, { "epoch": 0.035336320466028436, "grad_norm": 2.9081201091685216e-07, "learning_rate": 9.646636795339716e-06, "loss": 0.0076, "step": 7910 }, { "epoch": 0.03538099343754048, "grad_norm": 1.1710764169692993, "learning_rate": 9.646190065624596e-06, "loss": 0.0002, "step": 7920 }, { "epoch": 0.03542566640905253, "grad_norm": 6.1495647983578205e-12, "learning_rate": 9.645743335909476e-06, "loss": 0.0001, "step": 7930 }, { "epoch": 0.035470339380564576, "grad_norm": 1.2547023288789205e-05, "learning_rate": 9.645296606194355e-06, "loss": 0.0026, "step": 7940 }, { "epoch": 0.03551501235207662, "grad_norm": 5.72323062982108e-16, "learning_rate": 9.644849876479235e-06, "loss": 0.0005, "step": 7950 }, { "epoch": 0.03555968532358867, "grad_norm": 1.514777814293211e-08, "learning_rate": 9.644403146764115e-06, "loss": 0.1914, "step": 7960 }, { "epoch": 0.03560435829510072, "grad_norm": 5.735700359110751e-09, "learning_rate": 9.643956417048993e-06, "loss": 0.0753, "step": 7970 }, { "epoch": 0.035649031266612764, "grad_norm": 1.7102655647249776e-07, "learning_rate": 9.643509687333873e-06, "loss": 0.5125, "step": 7980 }, { "epoch": 0.035693704238124804, "grad_norm": 1.2987326769575702e-09, "learning_rate": 9.643062957618754e-06, "loss": 0.0012, "step": 7990 }, { "epoch": 0.03573837720963685, "grad_norm": 9.948726287234422e-09, "learning_rate": 9.642616227903632e-06, "loss": 0.3656, "step": 8000 }, { "epoch": 0.0357830501811489, "grad_norm": 5.586425118053739e-07, "learning_rate": 9.642169498188512e-06, "loss": 0.0, "step": 8010 }, { "epoch": 0.035827723152660945, "grad_norm": 0.6900156140327454, "learning_rate": 9.641722768473392e-06, "loss": 0.0001, "step": 8020 }, { "epoch": 0.03587239612417299, "grad_norm": 1.6524243307003417e-08, "learning_rate": 9.64127603875827e-06, "loss": 0.0049, "step": 8030 }, { "epoch": 0.03591706909568504, "grad_norm": 0.005095589440315962, "learning_rate": 9.64082930904315e-06, "loss": 0.0, "step": 8040 }, { "epoch": 0.035961742067197086, "grad_norm": 0.005782588385045528, "learning_rate": 9.640382579328031e-06, "loss": 0.6813, "step": 8050 }, { "epoch": 0.03600641503870913, "grad_norm": 1.5047929286956787, "learning_rate": 9.63993584961291e-06, "loss": 0.0025, "step": 8060 }, { "epoch": 0.03605108801022117, "grad_norm": 6.845878885997081e-08, "learning_rate": 9.63948911989779e-06, "loss": 0.01, "step": 8070 }, { "epoch": 0.03609576098173322, "grad_norm": 6.9732931824446464e-15, "learning_rate": 9.63904239018267e-06, "loss": 0.0005, "step": 8080 }, { "epoch": 0.03614043395324527, "grad_norm": 0.029109634459018707, "learning_rate": 9.638595660467548e-06, "loss": 0.2125, "step": 8090 }, { "epoch": 0.036185106924757314, "grad_norm": 148.90367126464844, "learning_rate": 9.638148930752428e-06, "loss": 0.0226, "step": 8100 }, { "epoch": 0.03622977989626936, "grad_norm": 1.1183489898036747e-11, "learning_rate": 9.637702201037307e-06, "loss": 0.0, "step": 8110 }, { "epoch": 0.03627445286778141, "grad_norm": 1.0056037353933789e-06, "learning_rate": 9.637255471322187e-06, "loss": 0.4766, "step": 8120 }, { "epoch": 0.036319125839293455, "grad_norm": 1.895713808153232e-09, "learning_rate": 9.636808741607067e-06, "loss": 0.0023, "step": 8130 }, { "epoch": 0.036363798810805495, "grad_norm": 8.699790043920075e-08, "learning_rate": 9.636362011891945e-06, "loss": 0.0011, "step": 8140 }, { "epoch": 0.03640847178231754, "grad_norm": 6.252026755462423e-14, "learning_rate": 9.635915282176825e-06, "loss": 0.0001, "step": 8150 }, { "epoch": 0.03645314475382959, "grad_norm": 0.008534946478903294, "learning_rate": 9.635468552461706e-06, "loss": 0.0, "step": 8160 }, { "epoch": 0.036497817725341636, "grad_norm": 3.932518211513525e-06, "learning_rate": 9.635021822746584e-06, "loss": 0.0009, "step": 8170 }, { "epoch": 0.03654249069685368, "grad_norm": 8.627313218312338e-05, "learning_rate": 9.634575093031464e-06, "loss": 0.0, "step": 8180 }, { "epoch": 0.03658716366836573, "grad_norm": 1.372724489634436e-09, "learning_rate": 9.634128363316344e-06, "loss": 0.0, "step": 8190 }, { "epoch": 0.03663183663987778, "grad_norm": 5.55379529032507e-07, "learning_rate": 9.633681633601223e-06, "loss": 0.0578, "step": 8200 }, { "epoch": 0.036676509611389824, "grad_norm": 756.7450561523438, "learning_rate": 9.633234903886103e-06, "loss": 0.925, "step": 8210 }, { "epoch": 0.036721182582901864, "grad_norm": 3.7226468574402816e-09, "learning_rate": 9.632788174170981e-06, "loss": 0.0002, "step": 8220 }, { "epoch": 0.03676585555441391, "grad_norm": 2.771380781113608e-12, "learning_rate": 9.632341444455861e-06, "loss": 0.0, "step": 8230 }, { "epoch": 0.03681052852592596, "grad_norm": 3.6899330162556976e-13, "learning_rate": 9.631894714740741e-06, "loss": 0.0001, "step": 8240 }, { "epoch": 0.036855201497438005, "grad_norm": 5.4532566906573265e-08, "learning_rate": 9.63144798502562e-06, "loss": 0.0151, "step": 8250 }, { "epoch": 0.03689987446895005, "grad_norm": 8.034226630115882e-05, "learning_rate": 9.6310012553105e-06, "loss": 0.0, "step": 8260 }, { "epoch": 0.0369445474404621, "grad_norm": 2.354401112825144e-05, "learning_rate": 9.63055452559538e-06, "loss": 0.0, "step": 8270 }, { "epoch": 0.036989220411974146, "grad_norm": 592.7618408203125, "learning_rate": 9.630107795880259e-06, "loss": 0.1501, "step": 8280 }, { "epoch": 0.03703389338348619, "grad_norm": 1.201908162329346e-05, "learning_rate": 9.629661066165139e-06, "loss": 0.0043, "step": 8290 }, { "epoch": 0.03707856635499823, "grad_norm": 5.1015629742824035e-15, "learning_rate": 9.629214336450017e-06, "loss": 0.0, "step": 8300 }, { "epoch": 0.03712323932651028, "grad_norm": 3.6857750274066348e-06, "learning_rate": 9.628767606734897e-06, "loss": 0.0, "step": 8310 }, { "epoch": 0.037167912298022326, "grad_norm": 0.0037877659779042006, "learning_rate": 9.628320877019777e-06, "loss": 0.0424, "step": 8320 }, { "epoch": 0.03721258526953437, "grad_norm": 1.0160490883977941e-13, "learning_rate": 9.627874147304656e-06, "loss": 0.0007, "step": 8330 }, { "epoch": 0.03725725824104642, "grad_norm": 1.4340297178894179e-11, "learning_rate": 9.627427417589536e-06, "loss": 0.0, "step": 8340 }, { "epoch": 0.03730193121255847, "grad_norm": 1.898951662582249e-09, "learning_rate": 9.626980687874416e-06, "loss": 0.575, "step": 8350 }, { "epoch": 0.037346604184070514, "grad_norm": 2.65234375547152e-05, "learning_rate": 9.626533958159296e-06, "loss": 0.0005, "step": 8360 }, { "epoch": 0.03739127715558256, "grad_norm": 0.002871223958209157, "learning_rate": 9.626087228444175e-06, "loss": 0.0252, "step": 8370 }, { "epoch": 0.0374359501270946, "grad_norm": 2.3592594891597152e-11, "learning_rate": 9.625640498729055e-06, "loss": 0.0252, "step": 8380 }, { "epoch": 0.03748062309860665, "grad_norm": 10.255550384521484, "learning_rate": 9.625193769013935e-06, "loss": 0.1819, "step": 8390 }, { "epoch": 0.037525296070118695, "grad_norm": 5.704014691132464e-11, "learning_rate": 9.624747039298813e-06, "loss": 0.0, "step": 8400 }, { "epoch": 0.03756996904163074, "grad_norm": 3.851150864875308e-08, "learning_rate": 9.624300309583693e-06, "loss": 0.012, "step": 8410 }, { "epoch": 0.03761464201314279, "grad_norm": 1.6326265495081316e-06, "learning_rate": 9.623853579868574e-06, "loss": 0.0008, "step": 8420 }, { "epoch": 0.037659314984654836, "grad_norm": 1.937662703899168e-12, "learning_rate": 9.623406850153452e-06, "loss": 0.0016, "step": 8430 }, { "epoch": 0.03770398795616688, "grad_norm": 156.47518920898438, "learning_rate": 9.622960120438332e-06, "loss": 0.0313, "step": 8440 }, { "epoch": 0.03774866092767892, "grad_norm": 1.1682730161055588e-07, "learning_rate": 9.622513390723212e-06, "loss": 0.0006, "step": 8450 }, { "epoch": 0.03779333389919097, "grad_norm": 0.0061811706982553005, "learning_rate": 9.62206666100809e-06, "loss": 0.0, "step": 8460 }, { "epoch": 0.03783800687070302, "grad_norm": 6.299590626923646e-09, "learning_rate": 9.62161993129297e-06, "loss": 0.0018, "step": 8470 }, { "epoch": 0.037882679842215064, "grad_norm": 1.5485067706322297e-05, "learning_rate": 9.621173201577851e-06, "loss": 0.0, "step": 8480 }, { "epoch": 0.03792735281372711, "grad_norm": 5.004122627383367e-08, "learning_rate": 9.620726471862731e-06, "loss": 0.001, "step": 8490 }, { "epoch": 0.03797202578523916, "grad_norm": 1.0859906723892365e-10, "learning_rate": 9.62027974214761e-06, "loss": 0.0001, "step": 8500 }, { "epoch": 0.038016698756751205, "grad_norm": 9.935144576388248e-17, "learning_rate": 9.61983301243249e-06, "loss": 0.0001, "step": 8510 }, { "epoch": 0.03806137172826325, "grad_norm": 8.756306324381776e-17, "learning_rate": 9.619386282717368e-06, "loss": 0.0, "step": 8520 }, { "epoch": 0.03810604469977529, "grad_norm": 1.8787010048981756e-05, "learning_rate": 9.618939553002248e-06, "loss": 0.0001, "step": 8530 }, { "epoch": 0.03815071767128734, "grad_norm": 1.7396580781692738e-15, "learning_rate": 9.618492823287128e-06, "loss": 0.0, "step": 8540 }, { "epoch": 0.038195390642799386, "grad_norm": 8.588968945788533e-12, "learning_rate": 9.618046093572007e-06, "loss": 0.0898, "step": 8550 }, { "epoch": 0.03824006361431143, "grad_norm": 7.278631031226723e-10, "learning_rate": 9.617599363856887e-06, "loss": 0.0, "step": 8560 }, { "epoch": 0.03828473658582348, "grad_norm": 7.282357273652451e-06, "learning_rate": 9.617152634141767e-06, "loss": 0.0, "step": 8570 }, { "epoch": 0.03832940955733553, "grad_norm": 4.293304201041792e-08, "learning_rate": 9.616705904426645e-06, "loss": 0.0081, "step": 8580 }, { "epoch": 0.038374082528847574, "grad_norm": 5.928718564973678e-07, "learning_rate": 9.616259174711526e-06, "loss": 0.2127, "step": 8590 }, { "epoch": 0.03841875550035962, "grad_norm": 9.632236414730544e-14, "learning_rate": 9.615812444996404e-06, "loss": 0.0019, "step": 8600 }, { "epoch": 0.03846342847187166, "grad_norm": 1.7470816260356514e-07, "learning_rate": 9.615365715281284e-06, "loss": 0.0, "step": 8610 }, { "epoch": 0.03850810144338371, "grad_norm": 4.201944781706546e-12, "learning_rate": 9.614918985566164e-06, "loss": 0.0, "step": 8620 }, { "epoch": 0.038552774414895755, "grad_norm": 3.792194366455078, "learning_rate": 9.614472255851043e-06, "loss": 0.0005, "step": 8630 }, { "epoch": 0.0385974473864078, "grad_norm": 1.5693518706783038e-10, "learning_rate": 9.614025526135923e-06, "loss": 0.0, "step": 8640 }, { "epoch": 0.03864212035791985, "grad_norm": 6.817324482710774e-13, "learning_rate": 9.613578796420803e-06, "loss": 0.2689, "step": 8650 }, { "epoch": 0.038686793329431896, "grad_norm": 3.3513822828012962e-09, "learning_rate": 9.613132066705681e-06, "loss": 0.0, "step": 8660 }, { "epoch": 0.03873146630094394, "grad_norm": 1.621324187306059e-15, "learning_rate": 9.612685336990561e-06, "loss": 0.0, "step": 8670 }, { "epoch": 0.03877613927245598, "grad_norm": 0.06873009353876114, "learning_rate": 9.612238607275442e-06, "loss": 0.2078, "step": 8680 }, { "epoch": 0.03882081224396803, "grad_norm": 6.228573852240515e-07, "learning_rate": 9.61179187756032e-06, "loss": 0.0, "step": 8690 }, { "epoch": 0.038865485215480076, "grad_norm": 1.1779390661104117e-05, "learning_rate": 9.6113451478452e-06, "loss": 0.0, "step": 8700 }, { "epoch": 0.03891015818699212, "grad_norm": 1.770038920767547e-06, "learning_rate": 9.610898418130079e-06, "loss": 0.0001, "step": 8710 }, { "epoch": 0.03895483115850417, "grad_norm": 0.00015133483975660056, "learning_rate": 9.610451688414959e-06, "loss": 0.0297, "step": 8720 }, { "epoch": 0.03899950413001622, "grad_norm": 2.2596263079321943e-05, "learning_rate": 9.610004958699839e-06, "loss": 0.4875, "step": 8730 }, { "epoch": 0.039044177101528264, "grad_norm": 1.4790319255553186e-05, "learning_rate": 9.609558228984717e-06, "loss": 0.0, "step": 8740 }, { "epoch": 0.03908885007304031, "grad_norm": 0.060237348079681396, "learning_rate": 9.609111499269597e-06, "loss": 0.0, "step": 8750 }, { "epoch": 0.03913352304455235, "grad_norm": 0.00015527455252595246, "learning_rate": 9.608664769554478e-06, "loss": 0.0062, "step": 8760 }, { "epoch": 0.0391781960160644, "grad_norm": 1.2009442051930108e-10, "learning_rate": 9.608218039839356e-06, "loss": 0.0348, "step": 8770 }, { "epoch": 0.039222868987576445, "grad_norm": 7.036464158497043e-16, "learning_rate": 9.607771310124236e-06, "loss": 0.0001, "step": 8780 }, { "epoch": 0.03926754195908849, "grad_norm": 5.08871373858244e-12, "learning_rate": 9.607324580409115e-06, "loss": 0.0003, "step": 8790 }, { "epoch": 0.03931221493060054, "grad_norm": 1.72422603972322e-10, "learning_rate": 9.606877850693995e-06, "loss": 0.0403, "step": 8800 }, { "epoch": 0.039356887902112586, "grad_norm": 1.8882680141718566e-11, "learning_rate": 9.606431120978875e-06, "loss": 0.2578, "step": 8810 }, { "epoch": 0.03940156087362463, "grad_norm": 4.5700039753978836e-14, "learning_rate": 9.605984391263755e-06, "loss": 0.0, "step": 8820 }, { "epoch": 0.03944623384513668, "grad_norm": 6.765368701453639e-14, "learning_rate": 9.605537661548633e-06, "loss": 0.0226, "step": 8830 }, { "epoch": 0.03949090681664872, "grad_norm": 1.6490670574274792e-15, "learning_rate": 9.605090931833513e-06, "loss": 0.0007, "step": 8840 }, { "epoch": 0.03953557978816077, "grad_norm": 6.464263152539118e-12, "learning_rate": 9.604644202118394e-06, "loss": 0.0006, "step": 8850 }, { "epoch": 0.039580252759672814, "grad_norm": 7.716137950808388e-09, "learning_rate": 9.604197472403272e-06, "loss": 0.0, "step": 8860 }, { "epoch": 0.03962492573118486, "grad_norm": 4.2918849030293416e-10, "learning_rate": 9.603750742688152e-06, "loss": 0.675, "step": 8870 }, { "epoch": 0.03966959870269691, "grad_norm": 0.0005095364176668227, "learning_rate": 9.603304012973032e-06, "loss": 0.0014, "step": 8880 }, { "epoch": 0.039714271674208955, "grad_norm": 6.71338548988154e-12, "learning_rate": 9.60285728325791e-06, "loss": 0.0, "step": 8890 }, { "epoch": 0.039758944645721, "grad_norm": 1.603486089152284e-05, "learning_rate": 9.60241055354279e-06, "loss": 0.0006, "step": 8900 }, { "epoch": 0.03980361761723305, "grad_norm": 2.7107730602438096e-06, "learning_rate": 9.601963823827671e-06, "loss": 0.0, "step": 8910 }, { "epoch": 0.03984829058874509, "grad_norm": 2.2744011118902563e-07, "learning_rate": 9.60151709411255e-06, "loss": 0.0, "step": 8920 }, { "epoch": 0.039892963560257136, "grad_norm": 0.007210536394268274, "learning_rate": 9.60107036439743e-06, "loss": 0.3172, "step": 8930 }, { "epoch": 0.03993763653176918, "grad_norm": 7.9241566989228e-15, "learning_rate": 9.60062363468231e-06, "loss": 0.0046, "step": 8940 }, { "epoch": 0.03998230950328123, "grad_norm": 9.576701164245605, "learning_rate": 9.60017690496719e-06, "loss": 0.0036, "step": 8950 }, { "epoch": 0.04002698247479328, "grad_norm": 0.0009234817116521299, "learning_rate": 9.599730175252068e-06, "loss": 0.0014, "step": 8960 }, { "epoch": 0.040071655446305324, "grad_norm": 2.6690221588410168e-08, "learning_rate": 9.599283445536948e-06, "loss": 0.0049, "step": 8970 }, { "epoch": 0.04011632841781737, "grad_norm": 7.448632914019981e-06, "learning_rate": 9.598836715821828e-06, "loss": 0.0, "step": 8980 }, { "epoch": 0.04016100138932941, "grad_norm": 5.278381898232709e-13, "learning_rate": 9.598389986106707e-06, "loss": 0.0, "step": 8990 }, { "epoch": 0.04020567436084146, "grad_norm": 1.8674436996768407e-11, "learning_rate": 9.597943256391587e-06, "loss": 0.1055, "step": 9000 }, { "epoch": 0.040250347332353505, "grad_norm": 6.622378426884845e-13, "learning_rate": 9.597496526676465e-06, "loss": 0.0, "step": 9010 }, { "epoch": 0.04029502030386555, "grad_norm": 1.352628342665696e-09, "learning_rate": 9.597049796961346e-06, "loss": 0.0, "step": 9020 }, { "epoch": 0.0403396932753776, "grad_norm": 4.5851612640035455e-07, "learning_rate": 9.596603067246226e-06, "loss": 0.2813, "step": 9030 }, { "epoch": 0.040384366246889646, "grad_norm": 7.833050119643303e-08, "learning_rate": 9.596156337531104e-06, "loss": 0.0, "step": 9040 }, { "epoch": 0.04042903921840169, "grad_norm": 8.340951467289415e-07, "learning_rate": 9.595709607815984e-06, "loss": 0.0001, "step": 9050 }, { "epoch": 0.04047371218991374, "grad_norm": 4.396644506798886e-11, "learning_rate": 9.595262878100864e-06, "loss": 0.0, "step": 9060 }, { "epoch": 0.04051838516142578, "grad_norm": 1.6098958004060223e-17, "learning_rate": 9.594816148385743e-06, "loss": 0.0, "step": 9070 }, { "epoch": 0.040563058132937826, "grad_norm": 9.86034774541622e-06, "learning_rate": 9.594369418670623e-06, "loss": 0.0, "step": 9080 }, { "epoch": 0.04060773110444987, "grad_norm": 8.242366789090738e-09, "learning_rate": 9.593922688955501e-06, "loss": 0.0002, "step": 9090 }, { "epoch": 0.04065240407596192, "grad_norm": 5.484909333902492e-15, "learning_rate": 9.593475959240381e-06, "loss": 0.0, "step": 9100 }, { "epoch": 0.04069707704747397, "grad_norm": 2.234793322619666e-10, "learning_rate": 9.593029229525262e-06, "loss": 0.0001, "step": 9110 }, { "epoch": 0.040741750018986014, "grad_norm": 0.0003598829498514533, "learning_rate": 9.59258249981014e-06, "loss": 0.0, "step": 9120 }, { "epoch": 0.04078642299049806, "grad_norm": 1.4396250573079327e-21, "learning_rate": 9.59213577009502e-06, "loss": 0.0, "step": 9130 }, { "epoch": 0.04083109596201011, "grad_norm": 1.944572795764543e-05, "learning_rate": 9.5916890403799e-06, "loss": 0.0001, "step": 9140 }, { "epoch": 0.04087576893352215, "grad_norm": 6.553003040372929e-13, "learning_rate": 9.591242310664779e-06, "loss": 0.0, "step": 9150 }, { "epoch": 0.040920441905034195, "grad_norm": 1.1000043542708227e-08, "learning_rate": 9.590795580949659e-06, "loss": 0.0007, "step": 9160 }, { "epoch": 0.04096511487654624, "grad_norm": 1.991377332130617e-12, "learning_rate": 9.590348851234539e-06, "loss": 0.0062, "step": 9170 }, { "epoch": 0.04100978784805829, "grad_norm": 8.767239931544053e-16, "learning_rate": 9.589902121519417e-06, "loss": 0.0, "step": 9180 }, { "epoch": 0.041054460819570336, "grad_norm": 2.192102783737937e-06, "learning_rate": 9.589455391804297e-06, "loss": 0.0001, "step": 9190 }, { "epoch": 0.04109913379108238, "grad_norm": 3.0813654579298078e-12, "learning_rate": 9.589008662089176e-06, "loss": 0.0007, "step": 9200 }, { "epoch": 0.04114380676259443, "grad_norm": 0.022311272099614143, "learning_rate": 9.588561932374056e-06, "loss": 0.0001, "step": 9210 }, { "epoch": 0.04118847973410647, "grad_norm": 1.5432090284470124e-12, "learning_rate": 9.588115202658936e-06, "loss": 0.0, "step": 9220 }, { "epoch": 0.04123315270561852, "grad_norm": 1.624277388145856e-07, "learning_rate": 9.587668472943815e-06, "loss": 0.0, "step": 9230 }, { "epoch": 0.041277825677130564, "grad_norm": 7.818975689133367e-09, "learning_rate": 9.587221743228695e-06, "loss": 0.0244, "step": 9240 }, { "epoch": 0.04132249864864261, "grad_norm": 1.1187073809633061e-13, "learning_rate": 9.586775013513575e-06, "loss": 0.0, "step": 9250 }, { "epoch": 0.04136717162015466, "grad_norm": 1.5692712906911765e-07, "learning_rate": 9.586328283798453e-06, "loss": 0.4656, "step": 9260 }, { "epoch": 0.041411844591666705, "grad_norm": 447.6181640625, "learning_rate": 9.585881554083333e-06, "loss": 0.1313, "step": 9270 }, { "epoch": 0.04145651756317875, "grad_norm": 14.45638656616211, "learning_rate": 9.585434824368214e-06, "loss": 0.0018, "step": 9280 }, { "epoch": 0.0415011905346908, "grad_norm": 2.5550141429658835e-12, "learning_rate": 9.584988094653092e-06, "loss": 0.0153, "step": 9290 }, { "epoch": 0.04154586350620284, "grad_norm": 4.1291680335998535, "learning_rate": 9.584541364937972e-06, "loss": 0.0525, "step": 9300 }, { "epoch": 0.041590536477714886, "grad_norm": 68.95028686523438, "learning_rate": 9.584094635222852e-06, "loss": 0.0089, "step": 9310 }, { "epoch": 0.04163520944922693, "grad_norm": 9.987977023229178e-08, "learning_rate": 9.58364790550773e-06, "loss": 0.0, "step": 9320 }, { "epoch": 0.04167988242073898, "grad_norm": 2.3775720975338643e-11, "learning_rate": 9.58320117579261e-06, "loss": 0.0, "step": 9330 }, { "epoch": 0.04172455539225103, "grad_norm": 0.20666083693504333, "learning_rate": 9.582754446077491e-06, "loss": 0.0, "step": 9340 }, { "epoch": 0.041769228363763074, "grad_norm": 2.3314614736591466e-05, "learning_rate": 9.58230771636237e-06, "loss": 0.0, "step": 9350 }, { "epoch": 0.04181390133527512, "grad_norm": 1.173892390628553e-13, "learning_rate": 9.58186098664725e-06, "loss": 0.0049, "step": 9360 }, { "epoch": 0.04185857430678717, "grad_norm": 2.4579649107181467e-05, "learning_rate": 9.58141425693213e-06, "loss": 0.0005, "step": 9370 }, { "epoch": 0.04190324727829921, "grad_norm": 0.0003566561790648848, "learning_rate": 9.580967527217008e-06, "loss": 0.0049, "step": 9380 }, { "epoch": 0.041947920249811255, "grad_norm": 0.08571092039346695, "learning_rate": 9.580520797501888e-06, "loss": 0.0064, "step": 9390 }, { "epoch": 0.0419925932213233, "grad_norm": 0.00036160662421025336, "learning_rate": 9.580074067786768e-06, "loss": 0.0, "step": 9400 }, { "epoch": 0.04203726619283535, "grad_norm": 57.7559700012207, "learning_rate": 9.579627338071648e-06, "loss": 0.0127, "step": 9410 }, { "epoch": 0.042081939164347396, "grad_norm": 2.41027595831219e-14, "learning_rate": 9.579180608356527e-06, "loss": 0.0201, "step": 9420 }, { "epoch": 0.04212661213585944, "grad_norm": 1.1963922962130585e-13, "learning_rate": 9.578733878641407e-06, "loss": 0.0008, "step": 9430 }, { "epoch": 0.04217128510737149, "grad_norm": 14.161581039428711, "learning_rate": 9.578287148926287e-06, "loss": 0.2502, "step": 9440 }, { "epoch": 0.042215958078883536, "grad_norm": 1.318870568312036e-09, "learning_rate": 9.577840419211166e-06, "loss": 0.012, "step": 9450 }, { "epoch": 0.042260631050395576, "grad_norm": 1.5369117259979248, "learning_rate": 9.577393689496046e-06, "loss": 0.0004, "step": 9460 }, { "epoch": 0.04230530402190762, "grad_norm": 2.638370208071783e-12, "learning_rate": 9.576946959780926e-06, "loss": 0.0, "step": 9470 }, { "epoch": 0.04234997699341967, "grad_norm": 9.181162698252364e-16, "learning_rate": 9.576500230065804e-06, "loss": 0.0, "step": 9480 }, { "epoch": 0.04239464996493172, "grad_norm": 1.4507448895528796e-06, "learning_rate": 9.576053500350684e-06, "loss": 0.0, "step": 9490 }, { "epoch": 0.042439322936443764, "grad_norm": 1.1731444704565774e-09, "learning_rate": 9.575606770635563e-06, "loss": 0.0, "step": 9500 }, { "epoch": 0.04248399590795581, "grad_norm": 8.030391395805144e-19, "learning_rate": 9.575160040920443e-06, "loss": 0.0001, "step": 9510 }, { "epoch": 0.04252866887946786, "grad_norm": 5.895772457122803, "learning_rate": 9.574713311205323e-06, "loss": 0.0017, "step": 9520 }, { "epoch": 0.0425733418509799, "grad_norm": 2.221716463145923e-13, "learning_rate": 9.574266581490201e-06, "loss": 0.0, "step": 9530 }, { "epoch": 0.042618014822491945, "grad_norm": 0.000884619599673897, "learning_rate": 9.573819851775082e-06, "loss": 0.0018, "step": 9540 }, { "epoch": 0.04266268779400399, "grad_norm": 3.657691422631615e-06, "learning_rate": 9.573373122059962e-06, "loss": 0.0002, "step": 9550 }, { "epoch": 0.04270736076551604, "grad_norm": 4.2471978711766667e-10, "learning_rate": 9.57292639234484e-06, "loss": 0.0312, "step": 9560 }, { "epoch": 0.042752033737028086, "grad_norm": 2.2941121002872933e-08, "learning_rate": 9.57247966262972e-06, "loss": 0.0, "step": 9570 }, { "epoch": 0.04279670670854013, "grad_norm": 0.043591901659965515, "learning_rate": 9.5720329329146e-06, "loss": 0.0, "step": 9580 }, { "epoch": 0.04284137968005218, "grad_norm": 4.469314163202398e-08, "learning_rate": 9.571586203199479e-06, "loss": 0.0075, "step": 9590 }, { "epoch": 0.04288605265156423, "grad_norm": 3.9559712948512556e-16, "learning_rate": 9.571139473484359e-06, "loss": 0.001, "step": 9600 }, { "epoch": 0.04293072562307627, "grad_norm": 1.8759371833709793e-09, "learning_rate": 9.570692743769237e-06, "loss": 0.0, "step": 9610 }, { "epoch": 0.042975398594588314, "grad_norm": 8.860857880765849e-11, "learning_rate": 9.570246014054117e-06, "loss": 0.0, "step": 9620 }, { "epoch": 0.04302007156610036, "grad_norm": 3.235507195498144e-09, "learning_rate": 9.569799284338998e-06, "loss": 0.0, "step": 9630 }, { "epoch": 0.04306474453761241, "grad_norm": 1.045658893673862e-12, "learning_rate": 9.569352554623876e-06, "loss": 0.0001, "step": 9640 }, { "epoch": 0.043109417509124455, "grad_norm": 5.70101528919037e-13, "learning_rate": 9.568905824908756e-06, "loss": 0.0, "step": 9650 }, { "epoch": 0.0431540904806365, "grad_norm": 2.711625337600708, "learning_rate": 9.568459095193636e-06, "loss": 0.0005, "step": 9660 }, { "epoch": 0.04319876345214855, "grad_norm": 1.9815430277958512e-05, "learning_rate": 9.568012365478515e-06, "loss": 0.0, "step": 9670 }, { "epoch": 0.043243436423660596, "grad_norm": 2.702470137862928e-13, "learning_rate": 9.567565635763395e-06, "loss": 0.1424, "step": 9680 }, { "epoch": 0.043288109395172636, "grad_norm": 2.9452898786530568e-08, "learning_rate": 9.567118906048273e-06, "loss": 0.0, "step": 9690 }, { "epoch": 0.04333278236668468, "grad_norm": 1.0396380275778938e-05, "learning_rate": 9.566672176333153e-06, "loss": 0.0, "step": 9700 }, { "epoch": 0.04337745533819673, "grad_norm": 0.10802660882472992, "learning_rate": 9.566225446618034e-06, "loss": 0.0, "step": 9710 }, { "epoch": 0.04342212830970878, "grad_norm": 0.00029546706355176866, "learning_rate": 9.565778716902912e-06, "loss": 0.0007, "step": 9720 }, { "epoch": 0.043466801281220824, "grad_norm": 2.035315219575963e-16, "learning_rate": 9.565331987187792e-06, "loss": 0.1061, "step": 9730 }, { "epoch": 0.04351147425273287, "grad_norm": 1.4710154379587049e-15, "learning_rate": 9.564885257472672e-06, "loss": 0.0001, "step": 9740 }, { "epoch": 0.04355614722424492, "grad_norm": 3.584343325618855e-12, "learning_rate": 9.56443852775755e-06, "loss": 0.0008, "step": 9750 }, { "epoch": 0.04360082019575696, "grad_norm": 8.707693837095576e-07, "learning_rate": 9.56399179804243e-06, "loss": 0.0003, "step": 9760 }, { "epoch": 0.043645493167269005, "grad_norm": 95.9183578491211, "learning_rate": 9.563545068327311e-06, "loss": 0.4336, "step": 9770 }, { "epoch": 0.04369016613878105, "grad_norm": 5.015752813086749e-10, "learning_rate": 9.56309833861219e-06, "loss": 0.0001, "step": 9780 }, { "epoch": 0.0437348391102931, "grad_norm": 9.280040831072256e-05, "learning_rate": 9.56265160889707e-06, "loss": 0.0016, "step": 9790 }, { "epoch": 0.043779512081805146, "grad_norm": 2.6840293685381766e-06, "learning_rate": 9.56220487918195e-06, "loss": 0.0, "step": 9800 }, { "epoch": 0.04382418505331719, "grad_norm": 4.250260190019617e-06, "learning_rate": 9.561758149466828e-06, "loss": 0.0387, "step": 9810 }, { "epoch": 0.04386885802482924, "grad_norm": 4.036125392303802e-05, "learning_rate": 9.561311419751708e-06, "loss": 0.0016, "step": 9820 }, { "epoch": 0.043913530996341286, "grad_norm": 7.597957960570056e-07, "learning_rate": 9.560864690036588e-06, "loss": 0.0001, "step": 9830 }, { "epoch": 0.043958203967853327, "grad_norm": 4.901462014039737e-10, "learning_rate": 9.560417960321467e-06, "loss": 0.0002, "step": 9840 }, { "epoch": 0.044002876939365373, "grad_norm": 1.544968666888913e-11, "learning_rate": 9.559971230606347e-06, "loss": 0.0479, "step": 9850 }, { "epoch": 0.04404754991087742, "grad_norm": 6.470037006156559e-11, "learning_rate": 9.559524500891227e-06, "loss": 0.0, "step": 9860 }, { "epoch": 0.04409222288238947, "grad_norm": 4.1798831063033504e-10, "learning_rate": 9.559077771176107e-06, "loss": 0.0, "step": 9870 }, { "epoch": 0.044136895853901514, "grad_norm": 3.472337972709738e-10, "learning_rate": 9.558631041460986e-06, "loss": 0.0387, "step": 9880 }, { "epoch": 0.04418156882541356, "grad_norm": 1.1115545604182403e-15, "learning_rate": 9.558184311745866e-06, "loss": 0.0, "step": 9890 }, { "epoch": 0.04422624179692561, "grad_norm": 0.0023215063847601414, "learning_rate": 9.557737582030746e-06, "loss": 0.0072, "step": 9900 }, { "epoch": 0.044270914768437655, "grad_norm": 0.0007233454380184412, "learning_rate": 9.557290852315624e-06, "loss": 0.0, "step": 9910 }, { "epoch": 0.044315587739949695, "grad_norm": 373.0455017089844, "learning_rate": 9.556844122600504e-06, "loss": 0.1313, "step": 9920 }, { "epoch": 0.04436026071146174, "grad_norm": 4.1237175452124575e-08, "learning_rate": 9.556397392885384e-06, "loss": 0.0, "step": 9930 }, { "epoch": 0.04440493368297379, "grad_norm": 3.873950119026581e-11, "learning_rate": 9.555950663170263e-06, "loss": 0.0, "step": 9940 }, { "epoch": 0.044449606654485836, "grad_norm": 1.5232348442077637, "learning_rate": 9.555503933455143e-06, "loss": 0.0004, "step": 9950 }, { "epoch": 0.04449427962599788, "grad_norm": 0.0005546921165660024, "learning_rate": 9.555057203740023e-06, "loss": 0.0003, "step": 9960 }, { "epoch": 0.04453895259750993, "grad_norm": 2.327199405405267e-10, "learning_rate": 9.554610474024902e-06, "loss": 0.012, "step": 9970 }, { "epoch": 0.04458362556902198, "grad_norm": 2.1845164566158815e-11, "learning_rate": 9.554163744309782e-06, "loss": 0.0, "step": 9980 }, { "epoch": 0.044628298540534024, "grad_norm": 8.4572713276998e-09, "learning_rate": 9.55371701459466e-06, "loss": 0.002, "step": 9990 }, { "epoch": 0.044672971512046064, "grad_norm": 1.1127020528925446e-12, "learning_rate": 9.55327028487954e-06, "loss": 0.0011, "step": 10000 }, { "epoch": 0.04471764448355811, "grad_norm": 4.5678115156988497e-07, "learning_rate": 9.55282355516442e-06, "loss": 0.0031, "step": 10010 }, { "epoch": 0.04476231745507016, "grad_norm": 9.302107810974121, "learning_rate": 9.552376825449299e-06, "loss": 0.002, "step": 10020 }, { "epoch": 0.044806990426582205, "grad_norm": 1.2560059498803889e-14, "learning_rate": 9.551930095734179e-06, "loss": 0.0, "step": 10030 }, { "epoch": 0.04485166339809425, "grad_norm": 0.05222535878419876, "learning_rate": 9.551483366019059e-06, "loss": 0.0004, "step": 10040 }, { "epoch": 0.0448963363696063, "grad_norm": 0.3058798015117645, "learning_rate": 9.551036636303937e-06, "loss": 0.3173, "step": 10050 }, { "epoch": 0.044941009341118346, "grad_norm": 3.920735707652057e-06, "learning_rate": 9.550589906588818e-06, "loss": 0.0, "step": 10060 }, { "epoch": 0.044985682312630386, "grad_norm": 9.553231166137266e-07, "learning_rate": 9.550143176873698e-06, "loss": 0.0, "step": 10070 }, { "epoch": 0.04503035528414243, "grad_norm": 0.06729566305875778, "learning_rate": 9.549696447158576e-06, "loss": 0.001, "step": 10080 }, { "epoch": 0.04507502825565448, "grad_norm": 4.8741655156447905e-09, "learning_rate": 9.549249717443456e-06, "loss": 0.0, "step": 10090 }, { "epoch": 0.04511970122716653, "grad_norm": 5.887299980855687e-09, "learning_rate": 9.548802987728335e-06, "loss": 0.7375, "step": 10100 }, { "epoch": 0.045164374198678574, "grad_norm": 2.405948407613323e-06, "learning_rate": 9.548356258013215e-06, "loss": 0.0, "step": 10110 }, { "epoch": 0.04520904717019062, "grad_norm": 7.378541950608247e-14, "learning_rate": 9.547909528298095e-06, "loss": 0.0, "step": 10120 }, { "epoch": 0.04525372014170267, "grad_norm": 3.9999552281511086e-11, "learning_rate": 9.547462798582973e-06, "loss": 0.0, "step": 10130 }, { "epoch": 0.045298393113214715, "grad_norm": 1.4985769780651026e-07, "learning_rate": 9.547016068867854e-06, "loss": 0.0006, "step": 10140 }, { "epoch": 0.045343066084726755, "grad_norm": 5.634565353393555, "learning_rate": 9.546569339152734e-06, "loss": 0.0018, "step": 10150 }, { "epoch": 0.0453877390562388, "grad_norm": 9.927728861214291e-09, "learning_rate": 9.546122609437612e-06, "loss": 0.0079, "step": 10160 }, { "epoch": 0.04543241202775085, "grad_norm": 9.97686100401296e-11, "learning_rate": 9.545675879722492e-06, "loss": 0.0001, "step": 10170 }, { "epoch": 0.045477084999262896, "grad_norm": 2.851215263888207e-09, "learning_rate": 9.54522915000737e-06, "loss": 0.3344, "step": 10180 }, { "epoch": 0.04552175797077494, "grad_norm": 5.79595762246754e-05, "learning_rate": 9.54478242029225e-06, "loss": 0.0, "step": 10190 }, { "epoch": 0.04556643094228699, "grad_norm": 1.3934897680201175e-10, "learning_rate": 9.544335690577131e-06, "loss": 0.0049, "step": 10200 }, { "epoch": 0.045611103913799037, "grad_norm": 8.2845375004581e-10, "learning_rate": 9.54388896086201e-06, "loss": 0.0017, "step": 10210 }, { "epoch": 0.04565577688531108, "grad_norm": 6.951763165119473e-09, "learning_rate": 9.54344223114689e-06, "loss": 0.0, "step": 10220 }, { "epoch": 0.045700449856823123, "grad_norm": 5.074342638786145e-13, "learning_rate": 9.54299550143177e-06, "loss": 0.0011, "step": 10230 }, { "epoch": 0.04574512282833517, "grad_norm": 1.7860834589100705e-07, "learning_rate": 9.542548771716648e-06, "loss": 0.2344, "step": 10240 }, { "epoch": 0.04578979579984722, "grad_norm": 4.644238651962951e-05, "learning_rate": 9.542102042001528e-06, "loss": 0.0128, "step": 10250 }, { "epoch": 0.045834468771359264, "grad_norm": 2.7153768655807653e-07, "learning_rate": 9.541655312286408e-06, "loss": 0.0, "step": 10260 }, { "epoch": 0.04587914174287131, "grad_norm": 3.4437507565598935e-06, "learning_rate": 9.541208582571287e-06, "loss": 0.0, "step": 10270 }, { "epoch": 0.04592381471438336, "grad_norm": 0.2568793296813965, "learning_rate": 9.540761852856167e-06, "loss": 0.003, "step": 10280 }, { "epoch": 0.045968487685895405, "grad_norm": 1.8173045290836853e-08, "learning_rate": 9.540315123141047e-06, "loss": 0.0, "step": 10290 }, { "epoch": 0.046013160657407445, "grad_norm": 2.1706417316792043e-13, "learning_rate": 9.539868393425925e-06, "loss": 0.0, "step": 10300 }, { "epoch": 0.04605783362891949, "grad_norm": 1.2277396981008604e-13, "learning_rate": 9.539421663710805e-06, "loss": 0.0001, "step": 10310 }, { "epoch": 0.04610250660043154, "grad_norm": 1.0452093635038295e-09, "learning_rate": 9.538974933995686e-06, "loss": 0.0, "step": 10320 }, { "epoch": 0.046147179571943586, "grad_norm": 1.810579511568166e-14, "learning_rate": 9.538528204280566e-06, "loss": 0.0579, "step": 10330 }, { "epoch": 0.04619185254345563, "grad_norm": 1.3487329308947693e-10, "learning_rate": 9.538081474565444e-06, "loss": 0.0011, "step": 10340 }, { "epoch": 0.04623652551496768, "grad_norm": 1.3384500618940365e-07, "learning_rate": 9.537634744850324e-06, "loss": 0.0038, "step": 10350 }, { "epoch": 0.04628119848647973, "grad_norm": 0.007952453568577766, "learning_rate": 9.537188015135204e-06, "loss": 0.0, "step": 10360 }, { "epoch": 0.046325871457991774, "grad_norm": 1.5257786353806146e-11, "learning_rate": 9.536741285420083e-06, "loss": 0.0026, "step": 10370 }, { "epoch": 0.046370544429503814, "grad_norm": 1.2612064357149988e-15, "learning_rate": 9.536294555704963e-06, "loss": 0.0, "step": 10380 }, { "epoch": 0.04641521740101586, "grad_norm": 5.068566660132445e-12, "learning_rate": 9.535847825989843e-06, "loss": 0.0001, "step": 10390 }, { "epoch": 0.04645989037252791, "grad_norm": 1.833808141782356e-06, "learning_rate": 9.535401096274722e-06, "loss": 0.0, "step": 10400 }, { "epoch": 0.046504563344039955, "grad_norm": 1.5389470297069718e-12, "learning_rate": 9.534954366559602e-06, "loss": 0.0003, "step": 10410 }, { "epoch": 0.046549236315552, "grad_norm": 1.1092303742188392e-09, "learning_rate": 9.534507636844482e-06, "loss": 0.0, "step": 10420 }, { "epoch": 0.04659390928706405, "grad_norm": 2.9077733643134707e-07, "learning_rate": 9.53406090712936e-06, "loss": 1.6, "step": 10430 }, { "epoch": 0.046638582258576096, "grad_norm": 0.9567630887031555, "learning_rate": 9.53361417741424e-06, "loss": 0.0018, "step": 10440 }, { "epoch": 0.04668325523008814, "grad_norm": 2.428366697417234e-17, "learning_rate": 9.53316744769912e-06, "loss": 0.0011, "step": 10450 }, { "epoch": 0.04672792820160018, "grad_norm": 1.1588655368655054e-08, "learning_rate": 9.532720717983999e-06, "loss": 0.0, "step": 10460 }, { "epoch": 0.04677260117311223, "grad_norm": 2.9958778191655497e-15, "learning_rate": 9.532273988268879e-06, "loss": 0.0, "step": 10470 }, { "epoch": 0.04681727414462428, "grad_norm": 1.0342333933710535e-10, "learning_rate": 9.531827258553757e-06, "loss": 0.0, "step": 10480 }, { "epoch": 0.046861947116136324, "grad_norm": 1.4794973024478963e-09, "learning_rate": 9.531380528838638e-06, "loss": 0.1307, "step": 10490 }, { "epoch": 0.04690662008764837, "grad_norm": 7.921020135919117e-16, "learning_rate": 9.530933799123518e-06, "loss": 0.0537, "step": 10500 }, { "epoch": 0.04695129305916042, "grad_norm": 2.399570348643465e-06, "learning_rate": 9.530487069408396e-06, "loss": 0.0, "step": 10510 }, { "epoch": 0.046995966030672465, "grad_norm": 0.003332250751554966, "learning_rate": 9.530040339693276e-06, "loss": 0.1141, "step": 10520 }, { "epoch": 0.04704063900218451, "grad_norm": 0.001612229272723198, "learning_rate": 9.529593609978156e-06, "loss": 0.0, "step": 10530 }, { "epoch": 0.04708531197369655, "grad_norm": 0.017735157161951065, "learning_rate": 9.529146880263035e-06, "loss": 0.0006, "step": 10540 }, { "epoch": 0.0471299849452086, "grad_norm": 0.0006999396719038486, "learning_rate": 9.528700150547915e-06, "loss": 0.0, "step": 10550 }, { "epoch": 0.047174657916720646, "grad_norm": 0.00042073967051692307, "learning_rate": 9.528253420832795e-06, "loss": 0.0113, "step": 10560 }, { "epoch": 0.04721933088823269, "grad_norm": 2.5330312070082073e-08, "learning_rate": 9.527806691117674e-06, "loss": 0.0001, "step": 10570 }, { "epoch": 0.04726400385974474, "grad_norm": 0.04455679655075073, "learning_rate": 9.527359961402554e-06, "loss": 0.0201, "step": 10580 }, { "epoch": 0.047308676831256787, "grad_norm": 2.771469165274709e-10, "learning_rate": 9.526913231687432e-06, "loss": 0.0834, "step": 10590 }, { "epoch": 0.047353349802768833, "grad_norm": 3.395351555468551e-09, "learning_rate": 9.526466501972312e-06, "loss": 0.0908, "step": 10600 }, { "epoch": 0.047398022774280874, "grad_norm": 2.7118030629935674e-06, "learning_rate": 9.526019772257192e-06, "loss": 0.0008, "step": 10610 }, { "epoch": 0.04744269574579292, "grad_norm": 4.103829860687256, "learning_rate": 9.52557304254207e-06, "loss": 0.0011, "step": 10620 }, { "epoch": 0.04748736871730497, "grad_norm": 6.712851609336212e-05, "learning_rate": 9.525126312826951e-06, "loss": 0.0, "step": 10630 }, { "epoch": 0.047532041688817014, "grad_norm": 6.580361514352262e-06, "learning_rate": 9.524679583111831e-06, "loss": 0.0001, "step": 10640 }, { "epoch": 0.04757671466032906, "grad_norm": 0.016762414947152138, "learning_rate": 9.52423285339671e-06, "loss": 0.2344, "step": 10650 }, { "epoch": 0.04762138763184111, "grad_norm": 2.72741681670019e-12, "learning_rate": 9.52378612368159e-06, "loss": 0.0, "step": 10660 }, { "epoch": 0.047666060603353155, "grad_norm": 1.9086164940174122e-11, "learning_rate": 9.52333939396647e-06, "loss": 0.0, "step": 10670 }, { "epoch": 0.0477107335748652, "grad_norm": 2.0200214834176222e-09, "learning_rate": 9.522892664251348e-06, "loss": 0.0, "step": 10680 }, { "epoch": 0.04775540654637724, "grad_norm": 3.076793291256763e-06, "learning_rate": 9.522445934536228e-06, "loss": 0.2235, "step": 10690 }, { "epoch": 0.04780007951788929, "grad_norm": 1.306749939918518, "learning_rate": 9.521999204821107e-06, "loss": 0.0005, "step": 10700 }, { "epoch": 0.047844752489401336, "grad_norm": 0.004337243270128965, "learning_rate": 9.521552475105987e-06, "loss": 0.055, "step": 10710 }, { "epoch": 0.04788942546091338, "grad_norm": 1.5169650424695647e-08, "learning_rate": 9.521105745390867e-06, "loss": 0.5752, "step": 10720 }, { "epoch": 0.04793409843242543, "grad_norm": 1.403236971064814e-10, "learning_rate": 9.520659015675745e-06, "loss": 0.0062, "step": 10730 }, { "epoch": 0.04797877140393748, "grad_norm": 9.81935127826461e-13, "learning_rate": 9.520212285960625e-06, "loss": 0.0, "step": 10740 }, { "epoch": 0.048023444375449524, "grad_norm": 8.163741767930333e-06, "learning_rate": 9.519765556245506e-06, "loss": 0.0, "step": 10750 }, { "epoch": 0.04806811734696157, "grad_norm": 7.663325529705389e-10, "learning_rate": 9.519318826530384e-06, "loss": 0.0, "step": 10760 }, { "epoch": 0.04811279031847361, "grad_norm": 5.807218551635742, "learning_rate": 9.518872096815264e-06, "loss": 0.0007, "step": 10770 }, { "epoch": 0.04815746328998566, "grad_norm": 9.717787555554858e-13, "learning_rate": 9.518425367100144e-06, "loss": 0.0001, "step": 10780 }, { "epoch": 0.048202136261497705, "grad_norm": 2.0461568055907264e-05, "learning_rate": 9.517978637385024e-06, "loss": 0.0003, "step": 10790 }, { "epoch": 0.04824680923300975, "grad_norm": 0.04437157139182091, "learning_rate": 9.517531907669903e-06, "loss": 0.0003, "step": 10800 }, { "epoch": 0.0482914822045218, "grad_norm": 1.7951228618621826, "learning_rate": 9.517085177954783e-06, "loss": 0.2804, "step": 10810 }, { "epoch": 0.048336155176033846, "grad_norm": 3.078026722391769e-10, "learning_rate": 9.516638448239663e-06, "loss": 0.0, "step": 10820 }, { "epoch": 0.04838082814754589, "grad_norm": 0.0025664654094725847, "learning_rate": 9.516191718524542e-06, "loss": 0.0, "step": 10830 }, { "epoch": 0.04842550111905793, "grad_norm": 2.2002347860317073e-12, "learning_rate": 9.515744988809422e-06, "loss": 0.0, "step": 10840 }, { "epoch": 0.04847017409056998, "grad_norm": 0.11011355370283127, "learning_rate": 9.515298259094302e-06, "loss": 0.0011, "step": 10850 }, { "epoch": 0.04851484706208203, "grad_norm": 1.5746836652397178e-05, "learning_rate": 9.51485152937918e-06, "loss": 0.0, "step": 10860 }, { "epoch": 0.048559520033594074, "grad_norm": 4.677225717841793e-09, "learning_rate": 9.51440479966406e-06, "loss": 0.0, "step": 10870 }, { "epoch": 0.04860419300510612, "grad_norm": 5.160199931198914e-14, "learning_rate": 9.51395806994894e-06, "loss": 0.0, "step": 10880 }, { "epoch": 0.04864886597661817, "grad_norm": 8.101441184571456e-10, "learning_rate": 9.513511340233819e-06, "loss": 0.0036, "step": 10890 }, { "epoch": 0.048693538948130215, "grad_norm": 1.0413211626314478e-08, "learning_rate": 9.513064610518699e-06, "loss": 0.0001, "step": 10900 }, { "epoch": 0.04873821191964226, "grad_norm": 15.043107986450195, "learning_rate": 9.512617880803579e-06, "loss": 0.0028, "step": 10910 }, { "epoch": 0.0487828848911543, "grad_norm": 8.022602256184484e-15, "learning_rate": 9.512171151088458e-06, "loss": 0.0, "step": 10920 }, { "epoch": 0.04882755786266635, "grad_norm": 11.803953170776367, "learning_rate": 9.511724421373338e-06, "loss": 0.0745, "step": 10930 }, { "epoch": 0.048872230834178396, "grad_norm": 4.265514608192689e-09, "learning_rate": 9.511277691658218e-06, "loss": 0.0034, "step": 10940 }, { "epoch": 0.04891690380569044, "grad_norm": 0.0013865844812244177, "learning_rate": 9.510830961943096e-06, "loss": 0.0004, "step": 10950 }, { "epoch": 0.04896157677720249, "grad_norm": 2.2187156278885567e-10, "learning_rate": 9.510384232227976e-06, "loss": 0.0002, "step": 10960 }, { "epoch": 0.04900624974871454, "grad_norm": 4.963759181464411e-09, "learning_rate": 9.509937502512857e-06, "loss": 0.0202, "step": 10970 }, { "epoch": 0.049050922720226584, "grad_norm": 0.00019618085934780538, "learning_rate": 9.509490772797735e-06, "loss": 0.2945, "step": 10980 }, { "epoch": 0.04909559569173863, "grad_norm": 0.0008185982587747276, "learning_rate": 9.509044043082615e-06, "loss": 0.0, "step": 10990 }, { "epoch": 0.04914026866325067, "grad_norm": 0.4256277084350586, "learning_rate": 9.508597313367494e-06, "loss": 0.0, "step": 11000 }, { "epoch": 0.04918494163476272, "grad_norm": 5.677510305957355e-17, "learning_rate": 9.508150583652374e-06, "loss": 0.0, "step": 11010 }, { "epoch": 0.049229614606274764, "grad_norm": 1.41667380951227e-11, "learning_rate": 9.507703853937254e-06, "loss": 0.0039, "step": 11020 }, { "epoch": 0.04927428757778681, "grad_norm": 6.928135287687098e-10, "learning_rate": 9.507257124222132e-06, "loss": 0.0005, "step": 11030 }, { "epoch": 0.04931896054929886, "grad_norm": 5.0077971536666155e-05, "learning_rate": 9.506810394507012e-06, "loss": 0.0003, "step": 11040 }, { "epoch": 0.049363633520810905, "grad_norm": 1.8108100088842782e-11, "learning_rate": 9.506363664791892e-06, "loss": 0.0, "step": 11050 }, { "epoch": 0.04940830649232295, "grad_norm": 0.032544512301683426, "learning_rate": 9.505916935076771e-06, "loss": 0.0, "step": 11060 }, { "epoch": 0.049452979463835, "grad_norm": 2.02817362904284e-09, "learning_rate": 9.505470205361651e-06, "loss": 0.0, "step": 11070 }, { "epoch": 0.04949765243534704, "grad_norm": 7.679974820158642e-13, "learning_rate": 9.50502347564653e-06, "loss": 0.039, "step": 11080 }, { "epoch": 0.049542325406859086, "grad_norm": 1.672820673093156e-07, "learning_rate": 9.50457674593141e-06, "loss": 0.0, "step": 11090 }, { "epoch": 0.04958699837837113, "grad_norm": 3.4813847804571196e-08, "learning_rate": 9.50413001621629e-06, "loss": 0.0, "step": 11100 }, { "epoch": 0.04963167134988318, "grad_norm": 5.6135419229663697e-11, "learning_rate": 9.503683286501168e-06, "loss": 0.003, "step": 11110 }, { "epoch": 0.04967634432139523, "grad_norm": 1.9980174329248257e-05, "learning_rate": 9.503236556786048e-06, "loss": 0.0, "step": 11120 }, { "epoch": 0.049721017292907274, "grad_norm": 0.5705450177192688, "learning_rate": 9.502789827070928e-06, "loss": 0.0001, "step": 11130 }, { "epoch": 0.04976569026441932, "grad_norm": 0.0001016234455164522, "learning_rate": 9.502343097355807e-06, "loss": 0.0, "step": 11140 }, { "epoch": 0.04981036323593136, "grad_norm": 386.708740234375, "learning_rate": 9.501896367640687e-06, "loss": 0.1914, "step": 11150 }, { "epoch": 0.04985503620744341, "grad_norm": 0.0004175099020358175, "learning_rate": 9.501449637925567e-06, "loss": 0.106, "step": 11160 }, { "epoch": 0.049899709178955455, "grad_norm": 6.118619921835489e-07, "learning_rate": 9.501002908210445e-06, "loss": 0.001, "step": 11170 }, { "epoch": 0.0499443821504675, "grad_norm": 8.625532510464495e-10, "learning_rate": 9.500556178495326e-06, "loss": 0.001, "step": 11180 }, { "epoch": 0.04998905512197955, "grad_norm": 0.8630591630935669, "learning_rate": 9.500109448780204e-06, "loss": 0.0001, "step": 11190 }, { "epoch": 0.050033728093491596, "grad_norm": 1.435045252117309e-14, "learning_rate": 9.499662719065084e-06, "loss": 0.1915, "step": 11200 }, { "epoch": 0.05007840106500364, "grad_norm": 6.082538444272811e-13, "learning_rate": 9.499215989349964e-06, "loss": 0.0006, "step": 11210 }, { "epoch": 0.05012307403651569, "grad_norm": 1.1062415462565877e-14, "learning_rate": 9.498769259634843e-06, "loss": 0.0009, "step": 11220 }, { "epoch": 0.05016774700802773, "grad_norm": 5.843751864631486e-07, "learning_rate": 9.498322529919723e-06, "loss": 0.0, "step": 11230 }, { "epoch": 0.05021241997953978, "grad_norm": 3.578229046397041e-09, "learning_rate": 9.497875800204603e-06, "loss": 0.0143, "step": 11240 }, { "epoch": 0.050257092951051824, "grad_norm": 9.63085056149282e-10, "learning_rate": 9.497429070489483e-06, "loss": 0.0001, "step": 11250 }, { "epoch": 0.05030176592256387, "grad_norm": 2.2219956008484587e-06, "learning_rate": 9.496982340774362e-06, "loss": 0.0079, "step": 11260 }, { "epoch": 0.05034643889407592, "grad_norm": 0.00028614644543267787, "learning_rate": 9.496535611059242e-06, "loss": 0.0, "step": 11270 }, { "epoch": 0.050391111865587965, "grad_norm": 1.2245513200759888, "learning_rate": 9.496088881344122e-06, "loss": 0.0977, "step": 11280 }, { "epoch": 0.05043578483710001, "grad_norm": 1.8125715636535186e-14, "learning_rate": 9.495642151629e-06, "loss": 0.033, "step": 11290 }, { "epoch": 0.05048045780861206, "grad_norm": 2.919892637519514e-17, "learning_rate": 9.49519542191388e-06, "loss": 0.1024, "step": 11300 }, { "epoch": 0.0505251307801241, "grad_norm": 1.0463354798752062e-14, "learning_rate": 9.49474869219876e-06, "loss": 0.0049, "step": 11310 }, { "epoch": 0.050569803751636146, "grad_norm": 34.62099838256836, "learning_rate": 9.494301962483639e-06, "loss": 0.0073, "step": 11320 }, { "epoch": 0.05061447672314819, "grad_norm": 0.00027098608552478254, "learning_rate": 9.493855232768519e-06, "loss": 0.0001, "step": 11330 }, { "epoch": 0.05065914969466024, "grad_norm": 1.0252830806578572e-09, "learning_rate": 9.493408503053399e-06, "loss": 0.0002, "step": 11340 }, { "epoch": 0.05070382266617229, "grad_norm": 2.8664729959234844e-14, "learning_rate": 9.492961773338278e-06, "loss": 0.0002, "step": 11350 }, { "epoch": 0.050748495637684334, "grad_norm": 9.993117419071496e-05, "learning_rate": 9.492515043623158e-06, "loss": 0.0001, "step": 11360 }, { "epoch": 0.05079316860919638, "grad_norm": 5.520075515082907e-21, "learning_rate": 9.492068313908038e-06, "loss": 0.0, "step": 11370 }, { "epoch": 0.05083784158070842, "grad_norm": 5.1444808377709705e-06, "learning_rate": 9.491621584192916e-06, "loss": 0.0913, "step": 11380 }, { "epoch": 0.05088251455222047, "grad_norm": 1.1695376223030962e-08, "learning_rate": 9.491174854477796e-06, "loss": 0.0, "step": 11390 }, { "epoch": 0.050927187523732514, "grad_norm": 0.02048194594681263, "learning_rate": 9.490728124762677e-06, "loss": 0.0, "step": 11400 }, { "epoch": 0.05097186049524456, "grad_norm": 5.23670157530267e-14, "learning_rate": 9.490281395047555e-06, "loss": 0.0038, "step": 11410 }, { "epoch": 0.05101653346675661, "grad_norm": 0.026476258412003517, "learning_rate": 9.489834665332435e-06, "loss": 0.0001, "step": 11420 }, { "epoch": 0.051061206438268655, "grad_norm": 0.005338937975466251, "learning_rate": 9.489387935617315e-06, "loss": 0.0001, "step": 11430 }, { "epoch": 0.0511058794097807, "grad_norm": 4.433853519406739e-16, "learning_rate": 9.488941205902194e-06, "loss": 0.0, "step": 11440 }, { "epoch": 0.05115055238129275, "grad_norm": 7.238606358651143e-19, "learning_rate": 9.488494476187074e-06, "loss": 0.0001, "step": 11450 }, { "epoch": 0.05119522535280479, "grad_norm": 9.190580222318175e-15, "learning_rate": 9.488047746471954e-06, "loss": 0.0006, "step": 11460 }, { "epoch": 0.051239898324316836, "grad_norm": 3.4059677279062173e-13, "learning_rate": 9.487601016756832e-06, "loss": 0.0824, "step": 11470 }, { "epoch": 0.05128457129582888, "grad_norm": 2.744744051597081e-05, "learning_rate": 9.487154287041712e-06, "loss": 0.003, "step": 11480 }, { "epoch": 0.05132924426734093, "grad_norm": 7.354658548841542e-12, "learning_rate": 9.486707557326591e-06, "loss": 0.1917, "step": 11490 }, { "epoch": 0.05137391723885298, "grad_norm": 1.0190446924135594e-16, "learning_rate": 9.486260827611471e-06, "loss": 0.0, "step": 11500 }, { "epoch": 0.051418590210365024, "grad_norm": 1.3966475975735193e-08, "learning_rate": 9.485814097896351e-06, "loss": 0.0, "step": 11510 }, { "epoch": 0.05146326318187707, "grad_norm": 0.02896651066839695, "learning_rate": 9.48536736818123e-06, "loss": 0.0018, "step": 11520 }, { "epoch": 0.05150793615338912, "grad_norm": 4.734069852709716e-14, "learning_rate": 9.48492063846611e-06, "loss": 0.0004, "step": 11530 }, { "epoch": 0.05155260912490116, "grad_norm": 0.000538403692189604, "learning_rate": 9.48447390875099e-06, "loss": 0.0, "step": 11540 }, { "epoch": 0.051597282096413205, "grad_norm": 1.768347804070782e-14, "learning_rate": 9.484027179035868e-06, "loss": 0.0009, "step": 11550 }, { "epoch": 0.05164195506792525, "grad_norm": 1.6111727507706064e-09, "learning_rate": 9.483580449320748e-06, "loss": 0.0005, "step": 11560 }, { "epoch": 0.0516866280394373, "grad_norm": 5.3446917031019225e-11, "learning_rate": 9.483133719605627e-06, "loss": 0.0003, "step": 11570 }, { "epoch": 0.051731301010949346, "grad_norm": 2.6434767264049697e-08, "learning_rate": 9.482686989890507e-06, "loss": 0.1141, "step": 11580 }, { "epoch": 0.05177597398246139, "grad_norm": 1.753207024002279e-10, "learning_rate": 9.482240260175387e-06, "loss": 0.0062, "step": 11590 }, { "epoch": 0.05182064695397344, "grad_norm": 3.0608987566793644e-11, "learning_rate": 9.481793530460265e-06, "loss": 0.0002, "step": 11600 }, { "epoch": 0.05186531992548549, "grad_norm": 1.7744504754446666e-09, "learning_rate": 9.481346800745146e-06, "loss": 0.0022, "step": 11610 }, { "epoch": 0.05190999289699753, "grad_norm": 4.3283048967168725e-07, "learning_rate": 9.480900071030026e-06, "loss": 0.0, "step": 11620 }, { "epoch": 0.051954665868509574, "grad_norm": 3.8451598811661825e-05, "learning_rate": 9.480453341314904e-06, "loss": 0.0, "step": 11630 }, { "epoch": 0.05199933884002162, "grad_norm": 4.421446889169012e-11, "learning_rate": 9.480006611599784e-06, "loss": 0.0, "step": 11640 }, { "epoch": 0.05204401181153367, "grad_norm": 0.0026478637009859085, "learning_rate": 9.479559881884664e-06, "loss": 0.0899, "step": 11650 }, { "epoch": 0.052088684783045715, "grad_norm": 3.592525899875909e-05, "learning_rate": 9.479113152169543e-06, "loss": 0.0, "step": 11660 }, { "epoch": 0.05213335775455776, "grad_norm": 1.283037465850837e-16, "learning_rate": 9.478666422454423e-06, "loss": 0.0, "step": 11670 }, { "epoch": 0.05217803072606981, "grad_norm": 0.014142933301627636, "learning_rate": 9.478219692739301e-06, "loss": 0.0, "step": 11680 }, { "epoch": 0.05222270369758185, "grad_norm": 9.994188076234423e-06, "learning_rate": 9.477772963024182e-06, "loss": 0.0, "step": 11690 }, { "epoch": 0.052267376669093896, "grad_norm": 0.0016839229501783848, "learning_rate": 9.477326233309062e-06, "loss": 0.0, "step": 11700 }, { "epoch": 0.05231204964060594, "grad_norm": 0.00028825466870330274, "learning_rate": 9.476879503593942e-06, "loss": 0.0387, "step": 11710 }, { "epoch": 0.05235672261211799, "grad_norm": 2.1650969905806505e-10, "learning_rate": 9.47643277387882e-06, "loss": 0.0001, "step": 11720 }, { "epoch": 0.05240139558363004, "grad_norm": 8.670284842082765e-06, "learning_rate": 9.4759860441637e-06, "loss": 0.0, "step": 11730 }, { "epoch": 0.052446068555142084, "grad_norm": 1.1954272167713498e-06, "learning_rate": 9.47553931444858e-06, "loss": 0.0006, "step": 11740 }, { "epoch": 0.05249074152665413, "grad_norm": 4.82285622638301e-06, "learning_rate": 9.475092584733459e-06, "loss": 0.0826, "step": 11750 }, { "epoch": 0.05253541449816618, "grad_norm": 4.6414779575343346e-08, "learning_rate": 9.474645855018339e-06, "loss": 0.0035, "step": 11760 }, { "epoch": 0.05258008746967822, "grad_norm": 1.3316235891736028e-09, "learning_rate": 9.474199125303219e-06, "loss": 0.0001, "step": 11770 }, { "epoch": 0.052624760441190264, "grad_norm": 1.7648435601813617e-08, "learning_rate": 9.473752395588098e-06, "loss": 0.0005, "step": 11780 }, { "epoch": 0.05266943341270231, "grad_norm": 4.0121092559353144e-10, "learning_rate": 9.473305665872978e-06, "loss": 0.0, "step": 11790 }, { "epoch": 0.05271410638421436, "grad_norm": 1.308599273386335e-08, "learning_rate": 9.472858936157858e-06, "loss": 0.0, "step": 11800 }, { "epoch": 0.052758779355726405, "grad_norm": 800.6392211914062, "learning_rate": 9.472412206442736e-06, "loss": 0.8, "step": 11810 }, { "epoch": 0.05280345232723845, "grad_norm": 1.0625454187393188, "learning_rate": 9.471965476727616e-06, "loss": 0.0002, "step": 11820 }, { "epoch": 0.0528481252987505, "grad_norm": 0.2922259271144867, "learning_rate": 9.471518747012496e-06, "loss": 0.0537, "step": 11830 }, { "epoch": 0.052892798270262546, "grad_norm": 2.0052473814757832e-07, "learning_rate": 9.471072017297377e-06, "loss": 0.0, "step": 11840 }, { "epoch": 0.052937471241774586, "grad_norm": 0.00018748472211882472, "learning_rate": 9.470625287582255e-06, "loss": 0.0, "step": 11850 }, { "epoch": 0.05298214421328663, "grad_norm": 3.574021079089107e-09, "learning_rate": 9.470178557867135e-06, "loss": 0.0003, "step": 11860 }, { "epoch": 0.05302681718479868, "grad_norm": 3.325407726961771e-10, "learning_rate": 9.469731828152014e-06, "loss": 0.0117, "step": 11870 }, { "epoch": 0.05307149015631073, "grad_norm": 5.634159242617898e-05, "learning_rate": 9.469285098436894e-06, "loss": 0.0011, "step": 11880 }, { "epoch": 0.053116163127822774, "grad_norm": 6.641070649493486e-05, "learning_rate": 9.468838368721774e-06, "loss": 0.0, "step": 11890 }, { "epoch": 0.05316083609933482, "grad_norm": 3.0292048904811963e-05, "learning_rate": 9.468391639006652e-06, "loss": 0.0001, "step": 11900 }, { "epoch": 0.05320550907084687, "grad_norm": 0.0033375348430126905, "learning_rate": 9.467944909291532e-06, "loss": 0.0, "step": 11910 }, { "epoch": 0.05325018204235891, "grad_norm": 1.4492547961708624e-06, "learning_rate": 9.467498179576413e-06, "loss": 0.0217, "step": 11920 }, { "epoch": 0.053294855013870955, "grad_norm": 4.281428822583422e-10, "learning_rate": 9.467051449861291e-06, "loss": 0.0034, "step": 11930 }, { "epoch": 0.053339527985383, "grad_norm": 9.358368697576225e-05, "learning_rate": 9.466604720146171e-06, "loss": 0.0017, "step": 11940 }, { "epoch": 0.05338420095689505, "grad_norm": 1.4051488271549228e-11, "learning_rate": 9.466157990431051e-06, "loss": 0.0009, "step": 11950 }, { "epoch": 0.053428873928407096, "grad_norm": 8.350542479718115e-10, "learning_rate": 9.46571126071593e-06, "loss": 0.0, "step": 11960 }, { "epoch": 0.05347354689991914, "grad_norm": 3.462303332923966e-09, "learning_rate": 9.46526453100081e-06, "loss": 0.0, "step": 11970 }, { "epoch": 0.05351821987143119, "grad_norm": 0.0007217188831418753, "learning_rate": 9.464817801285688e-06, "loss": 0.0002, "step": 11980 }, { "epoch": 0.05356289284294324, "grad_norm": 3.165804285387708e-10, "learning_rate": 9.464371071570568e-06, "loss": 0.0, "step": 11990 }, { "epoch": 0.05360756581445528, "grad_norm": 4.4119716841478704e-20, "learning_rate": 9.463924341855448e-06, "loss": 0.0201, "step": 12000 }, { "epoch": 0.053652238785967324, "grad_norm": 7.734079002230665e-12, "learning_rate": 9.463477612140327e-06, "loss": 0.1211, "step": 12010 }, { "epoch": 0.05369691175747937, "grad_norm": 5.054994964281967e-11, "learning_rate": 9.463030882425207e-06, "loss": 0.0523, "step": 12020 }, { "epoch": 0.05374158472899142, "grad_norm": 3.923462596477911e-11, "learning_rate": 9.462584152710087e-06, "loss": 0.0141, "step": 12030 }, { "epoch": 0.053786257700503465, "grad_norm": 2.907965812593005e-10, "learning_rate": 9.462137422994966e-06, "loss": 0.0, "step": 12040 }, { "epoch": 0.05383093067201551, "grad_norm": 0.014994989149272442, "learning_rate": 9.461690693279846e-06, "loss": 0.0001, "step": 12050 }, { "epoch": 0.05387560364352756, "grad_norm": 8.538211915265492e-08, "learning_rate": 9.461243963564726e-06, "loss": 0.003, "step": 12060 }, { "epoch": 0.053920276615039606, "grad_norm": 1.8153332192868347e-16, "learning_rate": 9.460797233849604e-06, "loss": 0.0, "step": 12070 }, { "epoch": 0.053964949586551646, "grad_norm": 4.282461700793466e-13, "learning_rate": 9.460350504134484e-06, "loss": 0.0, "step": 12080 }, { "epoch": 0.05400962255806369, "grad_norm": 1.88760368473595e-05, "learning_rate": 9.459903774419363e-06, "loss": 0.7031, "step": 12090 }, { "epoch": 0.05405429552957574, "grad_norm": 1.0948496307423752e-14, "learning_rate": 9.459457044704243e-06, "loss": 0.0253, "step": 12100 }, { "epoch": 0.05409896850108779, "grad_norm": 1.64843222592026e-05, "learning_rate": 9.459010314989123e-06, "loss": 0.153, "step": 12110 }, { "epoch": 0.054143641472599834, "grad_norm": 6.418565630167058e-14, "learning_rate": 9.458563585274001e-06, "loss": 0.0, "step": 12120 }, { "epoch": 0.05418831444411188, "grad_norm": 1.8156572093891595e-16, "learning_rate": 9.458116855558882e-06, "loss": 0.0084, "step": 12130 }, { "epoch": 0.05423298741562393, "grad_norm": 2.4297784761984076e-07, "learning_rate": 9.457670125843762e-06, "loss": 0.01, "step": 12140 }, { "epoch": 0.054277660387135974, "grad_norm": 2.5206797999999253e-06, "learning_rate": 9.45722339612864e-06, "loss": 0.0002, "step": 12150 }, { "epoch": 0.054322333358648014, "grad_norm": 5.1032744002865726e-11, "learning_rate": 9.45677666641352e-06, "loss": 0.0001, "step": 12160 }, { "epoch": 0.05436700633016006, "grad_norm": 1.2425232398527442e-08, "learning_rate": 9.4563299366984e-06, "loss": 0.0, "step": 12170 }, { "epoch": 0.05441167930167211, "grad_norm": 9.334379491576783e-09, "learning_rate": 9.455883206983279e-06, "loss": 0.0038, "step": 12180 }, { "epoch": 0.054456352273184155, "grad_norm": 2.4313667381647974e-06, "learning_rate": 9.455436477268159e-06, "loss": 0.0, "step": 12190 }, { "epoch": 0.0545010252446962, "grad_norm": 2.281985176625155e-17, "learning_rate": 9.454989747553039e-06, "loss": 0.0, "step": 12200 }, { "epoch": 0.05454569821620825, "grad_norm": 1.9201582581303235e-15, "learning_rate": 9.454543017837918e-06, "loss": 0.0, "step": 12210 }, { "epoch": 0.054590371187720296, "grad_norm": 2.42331623517833e-11, "learning_rate": 9.454096288122798e-06, "loss": 0.0004, "step": 12220 }, { "epoch": 0.054635044159232336, "grad_norm": 1.804826723628139e-07, "learning_rate": 9.453649558407678e-06, "loss": 0.0, "step": 12230 }, { "epoch": 0.05467971713074438, "grad_norm": 0.00022683448332827538, "learning_rate": 9.453202828692556e-06, "loss": 0.0, "step": 12240 }, { "epoch": 0.05472439010225643, "grad_norm": 7.438219995492545e-07, "learning_rate": 9.452756098977436e-06, "loss": 0.4503, "step": 12250 }, { "epoch": 0.05476906307376848, "grad_norm": 2.3686347105922323e-08, "learning_rate": 9.452309369262316e-06, "loss": 0.8375, "step": 12260 }, { "epoch": 0.054813736045280524, "grad_norm": 2.794357897073496e-06, "learning_rate": 9.451862639547195e-06, "loss": 1.3188, "step": 12270 }, { "epoch": 0.05485840901679257, "grad_norm": 7.669413089752197, "learning_rate": 9.451415909832075e-06, "loss": 0.0016, "step": 12280 }, { "epoch": 0.05490308198830462, "grad_norm": 0.001205424894578755, "learning_rate": 9.450969180116955e-06, "loss": 0.004, "step": 12290 }, { "epoch": 0.054947754959816665, "grad_norm": 6.918762210261775e-06, "learning_rate": 9.450522450401835e-06, "loss": 0.0002, "step": 12300 }, { "epoch": 0.054992427931328705, "grad_norm": 2.9161958958745515e-10, "learning_rate": 9.450075720686714e-06, "loss": 0.525, "step": 12310 }, { "epoch": 0.05503710090284075, "grad_norm": 2.0024484911118634e-05, "learning_rate": 9.449628990971594e-06, "loss": 0.0007, "step": 12320 }, { "epoch": 0.0550817738743528, "grad_norm": 4.540103873296175e-06, "learning_rate": 9.449182261256474e-06, "loss": 0.0, "step": 12330 }, { "epoch": 0.055126446845864846, "grad_norm": 1.0630525348886977e-09, "learning_rate": 9.448735531541352e-06, "loss": 0.475, "step": 12340 }, { "epoch": 0.05517111981737689, "grad_norm": 618.4067993164062, "learning_rate": 9.448288801826233e-06, "loss": 0.3183, "step": 12350 }, { "epoch": 0.05521579278888894, "grad_norm": 7.532779733310235e-09, "learning_rate": 9.447842072111113e-06, "loss": 0.0213, "step": 12360 }, { "epoch": 0.05526046576040099, "grad_norm": 3.242120460988929e-11, "learning_rate": 9.447395342395991e-06, "loss": 0.0428, "step": 12370 }, { "epoch": 0.055305138731913034, "grad_norm": 2.8545087300824168e-12, "learning_rate": 9.446948612680871e-06, "loss": 0.0, "step": 12380 }, { "epoch": 0.055349811703425074, "grad_norm": 2.1261237179714954e-06, "learning_rate": 9.44650188296575e-06, "loss": 0.0, "step": 12390 }, { "epoch": 0.05539448467493712, "grad_norm": 1.7692869391794375e-10, "learning_rate": 9.44605515325063e-06, "loss": 0.001, "step": 12400 }, { "epoch": 0.05543915764644917, "grad_norm": 0.0017321183113381267, "learning_rate": 9.44560842353551e-06, "loss": 0.001, "step": 12410 }, { "epoch": 0.055483830617961215, "grad_norm": 1.5489741045371375e-08, "learning_rate": 9.445161693820388e-06, "loss": 0.0005, "step": 12420 }, { "epoch": 0.05552850358947326, "grad_norm": 1.0361794267055302e-07, "learning_rate": 9.444714964105268e-06, "loss": 0.0691, "step": 12430 }, { "epoch": 0.05557317656098531, "grad_norm": 2.675851619275693e-14, "learning_rate": 9.444268234390149e-06, "loss": 0.0, "step": 12440 }, { "epoch": 0.055617849532497356, "grad_norm": 3.5206210776544955e-12, "learning_rate": 9.443821504675027e-06, "loss": 0.0226, "step": 12450 }, { "epoch": 0.0556625225040094, "grad_norm": 0.00045068253530189395, "learning_rate": 9.443374774959907e-06, "loss": 0.0, "step": 12460 }, { "epoch": 0.05570719547552144, "grad_norm": 0.010806038975715637, "learning_rate": 9.442928045244786e-06, "loss": 0.1406, "step": 12470 }, { "epoch": 0.05575186844703349, "grad_norm": 0.07221142202615738, "learning_rate": 9.442481315529666e-06, "loss": 0.0049, "step": 12480 }, { "epoch": 0.05579654141854554, "grad_norm": 4.355013311779787e-12, "learning_rate": 9.442034585814546e-06, "loss": 0.003, "step": 12490 }, { "epoch": 0.055841214390057584, "grad_norm": 1.2977407095604576e-05, "learning_rate": 9.441587856099424e-06, "loss": 0.0, "step": 12500 }, { "epoch": 0.05588588736156963, "grad_norm": 0.007427062373608351, "learning_rate": 9.441141126384304e-06, "loss": 0.0, "step": 12510 }, { "epoch": 0.05593056033308168, "grad_norm": 2.1748248855146812e-06, "learning_rate": 9.440694396669184e-06, "loss": 0.0387, "step": 12520 }, { "epoch": 0.055975233304593724, "grad_norm": 1.2029813993910676e-13, "learning_rate": 9.440247666954063e-06, "loss": 0.0, "step": 12530 }, { "epoch": 0.056019906276105765, "grad_norm": 0.31753480434417725, "learning_rate": 9.439800937238943e-06, "loss": 0.0, "step": 12540 }, { "epoch": 0.05606457924761781, "grad_norm": 1.325088396697538e-05, "learning_rate": 9.439354207523823e-06, "loss": 0.0692, "step": 12550 }, { "epoch": 0.05610925221912986, "grad_norm": 2.330439174969001e-10, "learning_rate": 9.438907477808702e-06, "loss": 0.0003, "step": 12560 }, { "epoch": 0.056153925190641905, "grad_norm": 2.4827478939913605e-13, "learning_rate": 9.438460748093582e-06, "loss": 0.1055, "step": 12570 }, { "epoch": 0.05619859816215395, "grad_norm": 7.189043045043945, "learning_rate": 9.43801401837846e-06, "loss": 0.0013, "step": 12580 }, { "epoch": 0.056243271133666, "grad_norm": 5.677891490307729e-10, "learning_rate": 9.43756728866334e-06, "loss": 0.0, "step": 12590 }, { "epoch": 0.056287944105178046, "grad_norm": 0.00373545428737998, "learning_rate": 9.43712055894822e-06, "loss": 0.0, "step": 12600 }, { "epoch": 0.05633261707669009, "grad_norm": 8.608561984146945e-06, "learning_rate": 9.436673829233099e-06, "loss": 0.0, "step": 12610 }, { "epoch": 0.05637729004820213, "grad_norm": 3.579793322124221e-15, "learning_rate": 9.436227099517979e-06, "loss": 0.0, "step": 12620 }, { "epoch": 0.05642196301971418, "grad_norm": 0.127075657248497, "learning_rate": 9.435780369802859e-06, "loss": 0.0001, "step": 12630 }, { "epoch": 0.05646663599122623, "grad_norm": 2.5742030729247745e-08, "learning_rate": 9.435333640087738e-06, "loss": 0.0, "step": 12640 }, { "epoch": 0.056511308962738274, "grad_norm": 5.2089426390011795e-06, "learning_rate": 9.434886910372618e-06, "loss": 0.0, "step": 12650 }, { "epoch": 0.05655598193425032, "grad_norm": 6.528809421979531e-07, "learning_rate": 9.434440180657498e-06, "loss": 0.0, "step": 12660 }, { "epoch": 0.05660065490576237, "grad_norm": 36.38373947143555, "learning_rate": 9.433993450942376e-06, "loss": 0.013, "step": 12670 }, { "epoch": 0.056645327877274415, "grad_norm": 1.2341820612910871e-15, "learning_rate": 9.433546721227256e-06, "loss": 0.0023, "step": 12680 }, { "epoch": 0.05669000084878646, "grad_norm": 0.0046431561931967735, "learning_rate": 9.433099991512136e-06, "loss": 0.625, "step": 12690 }, { "epoch": 0.0567346738202985, "grad_norm": 0.017402734607458115, "learning_rate": 9.432653261797015e-06, "loss": 0.0001, "step": 12700 }, { "epoch": 0.05677934679181055, "grad_norm": 0.17794175446033478, "learning_rate": 9.432206532081895e-06, "loss": 0.0758, "step": 12710 }, { "epoch": 0.056824019763322596, "grad_norm": 2.2747856576188497e-07, "learning_rate": 9.431759802366775e-06, "loss": 0.0035, "step": 12720 }, { "epoch": 0.05686869273483464, "grad_norm": 0.004323522560298443, "learning_rate": 9.431313072651654e-06, "loss": 0.0, "step": 12730 }, { "epoch": 0.05691336570634669, "grad_norm": 2.525700892874738e-07, "learning_rate": 9.430866342936534e-06, "loss": 0.0001, "step": 12740 }, { "epoch": 0.05695803867785874, "grad_norm": 4.306988898861164e-08, "learning_rate": 9.430419613221414e-06, "loss": 0.0013, "step": 12750 }, { "epoch": 0.057002711649370784, "grad_norm": 2.449407121574154e-09, "learning_rate": 9.429972883506294e-06, "loss": 0.0001, "step": 12760 }, { "epoch": 0.057047384620882824, "grad_norm": 0.0009273152681998909, "learning_rate": 9.429526153791172e-06, "loss": 0.0079, "step": 12770 }, { "epoch": 0.05709205759239487, "grad_norm": 1.339035687886181e-10, "learning_rate": 9.429079424076053e-06, "loss": 0.0038, "step": 12780 }, { "epoch": 0.05713673056390692, "grad_norm": 3.203273150204078e-12, "learning_rate": 9.428632694360933e-06, "loss": 0.0002, "step": 12790 }, { "epoch": 0.057181403535418965, "grad_norm": 5.628530974632895e-09, "learning_rate": 9.428185964645811e-06, "loss": 0.0348, "step": 12800 }, { "epoch": 0.05722607650693101, "grad_norm": 2.965325620607473e-07, "learning_rate": 9.427739234930691e-06, "loss": 0.007, "step": 12810 }, { "epoch": 0.05727074947844306, "grad_norm": 6.876974545377834e-09, "learning_rate": 9.427292505215571e-06, "loss": 0.0127, "step": 12820 }, { "epoch": 0.057315422449955106, "grad_norm": 4.7581835360688274e-07, "learning_rate": 9.42684577550045e-06, "loss": 0.0055, "step": 12830 }, { "epoch": 0.05736009542146715, "grad_norm": 4.716447074315511e-05, "learning_rate": 9.42639904578533e-06, "loss": 0.0007, "step": 12840 }, { "epoch": 0.05740476839297919, "grad_norm": 9.436319281519445e-09, "learning_rate": 9.42595231607021e-06, "loss": 0.0022, "step": 12850 }, { "epoch": 0.05744944136449124, "grad_norm": 6.438230570893211e-07, "learning_rate": 9.425505586355088e-06, "loss": 0.0759, "step": 12860 }, { "epoch": 0.05749411433600329, "grad_norm": 6.83662833012022e-08, "learning_rate": 9.425058856639969e-06, "loss": 0.0049, "step": 12870 }, { "epoch": 0.057538787307515334, "grad_norm": 3.728951469383901e-06, "learning_rate": 9.424612126924847e-06, "loss": 0.0, "step": 12880 }, { "epoch": 0.05758346027902738, "grad_norm": 5.688684592830562e-11, "learning_rate": 9.424165397209727e-06, "loss": 0.0, "step": 12890 }, { "epoch": 0.05762813325053943, "grad_norm": 5.956259790940166e-14, "learning_rate": 9.423718667494607e-06, "loss": 0.0, "step": 12900 }, { "epoch": 0.057672806222051475, "grad_norm": 1.761552154144752e-10, "learning_rate": 9.423271937779486e-06, "loss": 0.0, "step": 12910 }, { "epoch": 0.05771747919356352, "grad_norm": 0.0009397334652021527, "learning_rate": 9.422825208064366e-06, "loss": 0.0014, "step": 12920 }, { "epoch": 0.05776215216507556, "grad_norm": 2.9478458785092698e-08, "learning_rate": 9.422378478349246e-06, "loss": 0.0001, "step": 12930 }, { "epoch": 0.05780682513658761, "grad_norm": 0.0010126801207661629, "learning_rate": 9.421931748634124e-06, "loss": 0.0, "step": 12940 }, { "epoch": 0.057851498108099655, "grad_norm": 0.05465909093618393, "learning_rate": 9.421485018919004e-06, "loss": 0.0079, "step": 12950 }, { "epoch": 0.0578961710796117, "grad_norm": 2.74059730465126e-09, "learning_rate": 9.421038289203883e-06, "loss": 0.0, "step": 12960 }, { "epoch": 0.05794084405112375, "grad_norm": 0.0038352487608790398, "learning_rate": 9.420591559488763e-06, "loss": 0.0, "step": 12970 }, { "epoch": 0.057985517022635796, "grad_norm": 1.9260965096619918e-14, "learning_rate": 9.420144829773643e-06, "loss": 0.0013, "step": 12980 }, { "epoch": 0.05803018999414784, "grad_norm": 6.832203205964227e-12, "learning_rate": 9.419698100058522e-06, "loss": 0.0, "step": 12990 }, { "epoch": 0.05807486296565989, "grad_norm": 2.557584699580673e-10, "learning_rate": 9.419251370343402e-06, "loss": 0.0, "step": 13000 }, { "epoch": 0.05811953593717193, "grad_norm": 5.910940070741333e-10, "learning_rate": 9.418804640628282e-06, "loss": 0.0001, "step": 13010 }, { "epoch": 0.05816420890868398, "grad_norm": 0.048840224742889404, "learning_rate": 9.41835791091316e-06, "loss": 0.0006, "step": 13020 }, { "epoch": 0.058208881880196024, "grad_norm": 3.80511566699937e-10, "learning_rate": 9.41791118119804e-06, "loss": 0.0, "step": 13030 }, { "epoch": 0.05825355485170807, "grad_norm": 1.9007886375987937e-09, "learning_rate": 9.41746445148292e-06, "loss": 0.0, "step": 13040 }, { "epoch": 0.05829822782322012, "grad_norm": 2.1054762378014402e-09, "learning_rate": 9.417017721767799e-06, "loss": 0.0, "step": 13050 }, { "epoch": 0.058342900794732165, "grad_norm": 6.012247411035787e-08, "learning_rate": 9.416570992052679e-06, "loss": 0.0362, "step": 13060 }, { "epoch": 0.05838757376624421, "grad_norm": 0.007417942397296429, "learning_rate": 9.416124262337558e-06, "loss": 0.0001, "step": 13070 }, { "epoch": 0.05843224673775625, "grad_norm": 9.17901132879706e-09, "learning_rate": 9.415677532622438e-06, "loss": 0.2469, "step": 13080 }, { "epoch": 0.0584769197092683, "grad_norm": 1.002669267058387e-10, "learning_rate": 9.415230802907318e-06, "loss": 0.0034, "step": 13090 }, { "epoch": 0.058521592680780346, "grad_norm": 2.3442614516255844e-10, "learning_rate": 9.414784073192196e-06, "loss": 0.0, "step": 13100 }, { "epoch": 0.05856626565229239, "grad_norm": 7.194295903900638e-05, "learning_rate": 9.414337343477076e-06, "loss": 0.0, "step": 13110 }, { "epoch": 0.05861093862380444, "grad_norm": 5.2120436438984896e-11, "learning_rate": 9.413890613761956e-06, "loss": 0.0038, "step": 13120 }, { "epoch": 0.05865561159531649, "grad_norm": 4.7373262646033254e-08, "learning_rate": 9.413443884046835e-06, "loss": 0.0043, "step": 13130 }, { "epoch": 0.058700284566828534, "grad_norm": 6.5466607829245405e-12, "learning_rate": 9.412997154331715e-06, "loss": 0.0047, "step": 13140 }, { "epoch": 0.05874495753834058, "grad_norm": 0.06896981596946716, "learning_rate": 9.412550424616595e-06, "loss": 0.0, "step": 13150 }, { "epoch": 0.05878963050985262, "grad_norm": 9.386536686806224e-11, "learning_rate": 9.412103694901474e-06, "loss": 0.0, "step": 13160 }, { "epoch": 0.05883430348136467, "grad_norm": 7.611852481514032e-18, "learning_rate": 9.411656965186354e-06, "loss": 0.0146, "step": 13170 }, { "epoch": 0.058878976452876715, "grad_norm": 1.1134966559911419e-10, "learning_rate": 9.411210235471234e-06, "loss": 0.0026, "step": 13180 }, { "epoch": 0.05892364942438876, "grad_norm": 0.00012055486877216026, "learning_rate": 9.410763505756112e-06, "loss": 0.0, "step": 13190 }, { "epoch": 0.05896832239590081, "grad_norm": 6.284484651425393e-13, "learning_rate": 9.410316776040992e-06, "loss": 0.0, "step": 13200 }, { "epoch": 0.059012995367412856, "grad_norm": 1.1680693079441085e-09, "learning_rate": 9.409870046325873e-06, "loss": 0.0, "step": 13210 }, { "epoch": 0.0590576683389249, "grad_norm": 3.1868826226855163e-06, "learning_rate": 9.409423316610753e-06, "loss": 0.0, "step": 13220 }, { "epoch": 0.05910234131043695, "grad_norm": 6.71885274366069e-16, "learning_rate": 9.408976586895631e-06, "loss": 0.0001, "step": 13230 }, { "epoch": 0.05914701428194899, "grad_norm": 24.487958908081055, "learning_rate": 9.408529857180511e-06, "loss": 0.6562, "step": 13240 }, { "epoch": 0.05919168725346104, "grad_norm": 56.855098724365234, "learning_rate": 9.408083127465391e-06, "loss": 0.0113, "step": 13250 }, { "epoch": 0.059236360224973084, "grad_norm": 0.00012145333312219009, "learning_rate": 9.40763639775027e-06, "loss": 0.0022, "step": 13260 }, { "epoch": 0.05928103319648513, "grad_norm": 1.8307785842885949e-10, "learning_rate": 9.40718966803515e-06, "loss": 0.0, "step": 13270 }, { "epoch": 0.05932570616799718, "grad_norm": 6.931577572122025e-14, "learning_rate": 9.40674293832003e-06, "loss": 0.2016, "step": 13280 }, { "epoch": 0.059370379139509225, "grad_norm": 6.213984188319088e-11, "learning_rate": 9.406296208604908e-06, "loss": 0.0064, "step": 13290 }, { "epoch": 0.05941505211102127, "grad_norm": 9.8732723385897e-12, "learning_rate": 9.405849478889789e-06, "loss": 0.0, "step": 13300 }, { "epoch": 0.05945972508253331, "grad_norm": 3.321137635736715e-12, "learning_rate": 9.405402749174669e-06, "loss": 0.0, "step": 13310 }, { "epoch": 0.05950439805404536, "grad_norm": 1.8688808278710667e-09, "learning_rate": 9.404956019459547e-06, "loss": 0.0055, "step": 13320 }, { "epoch": 0.059549071025557405, "grad_norm": 2.0776104065589607e-05, "learning_rate": 9.404509289744427e-06, "loss": 0.2016, "step": 13330 }, { "epoch": 0.05959374399706945, "grad_norm": 6.520615799663254e-17, "learning_rate": 9.404062560029307e-06, "loss": 0.0014, "step": 13340 }, { "epoch": 0.0596384169685815, "grad_norm": 428.4654235839844, "learning_rate": 9.403615830314186e-06, "loss": 0.1042, "step": 13350 }, { "epoch": 0.059683089940093546, "grad_norm": 2.1810656107845716e-05, "learning_rate": 9.403169100599066e-06, "loss": 0.0038, "step": 13360 }, { "epoch": 0.05972776291160559, "grad_norm": 8.029319431557269e-09, "learning_rate": 9.402722370883944e-06, "loss": 0.0, "step": 13370 }, { "epoch": 0.05977243588311764, "grad_norm": 6.180247851261811e-07, "learning_rate": 9.402275641168824e-06, "loss": 0.0, "step": 13380 }, { "epoch": 0.05981710885462968, "grad_norm": 0.000312251562718302, "learning_rate": 9.401828911453705e-06, "loss": 0.0, "step": 13390 }, { "epoch": 0.05986178182614173, "grad_norm": 1.1670133062580135e-05, "learning_rate": 9.401382181738583e-06, "loss": 0.0013, "step": 13400 }, { "epoch": 0.059906454797653774, "grad_norm": 1.9894434710732156e-10, "learning_rate": 9.400935452023463e-06, "loss": 0.0006, "step": 13410 }, { "epoch": 0.05995112776916582, "grad_norm": 7.895046110206749e-07, "learning_rate": 9.400488722308343e-06, "loss": 0.6375, "step": 13420 }, { "epoch": 0.05999580074067787, "grad_norm": 7.807607858845156e-12, "learning_rate": 9.400041992593222e-06, "loss": 0.7125, "step": 13430 }, { "epoch": 0.060040473712189915, "grad_norm": 6.620969088544371e-07, "learning_rate": 9.399595262878102e-06, "loss": 0.0, "step": 13440 }, { "epoch": 0.06008514668370196, "grad_norm": 6.074368741337821e-08, "learning_rate": 9.399148533162982e-06, "loss": 0.0, "step": 13450 }, { "epoch": 0.06012981965521401, "grad_norm": 1.0262946742445322e-10, "learning_rate": 9.39870180344786e-06, "loss": 0.65, "step": 13460 }, { "epoch": 0.06017449262672605, "grad_norm": 0.0005402317037805915, "learning_rate": 9.39825507373274e-06, "loss": 0.0091, "step": 13470 }, { "epoch": 0.060219165598238096, "grad_norm": 1.2266228033297466e-08, "learning_rate": 9.397808344017619e-06, "loss": 0.0, "step": 13480 }, { "epoch": 0.06026383856975014, "grad_norm": 5.947930206579721e-17, "learning_rate": 9.397361614302499e-06, "loss": 0.0004, "step": 13490 }, { "epoch": 0.06030851154126219, "grad_norm": 1.7619703474025528e-09, "learning_rate": 9.39691488458738e-06, "loss": 0.0, "step": 13500 }, { "epoch": 0.06035318451277424, "grad_norm": 4.006636515896389e-07, "learning_rate": 9.396468154872258e-06, "loss": 0.0002, "step": 13510 }, { "epoch": 0.060397857484286284, "grad_norm": 1.7506009391664995e-13, "learning_rate": 9.396021425157138e-06, "loss": 0.0, "step": 13520 }, { "epoch": 0.06044253045579833, "grad_norm": 0.8221254944801331, "learning_rate": 9.395574695442018e-06, "loss": 0.5138, "step": 13530 }, { "epoch": 0.06048720342731038, "grad_norm": 5.2254020554975966e-11, "learning_rate": 9.395127965726896e-06, "loss": 0.0002, "step": 13540 }, { "epoch": 0.06053187639882242, "grad_norm": 0.01927713118493557, "learning_rate": 9.394681236011776e-06, "loss": 0.0063, "step": 13550 }, { "epoch": 0.060576549370334465, "grad_norm": 8.598310508034501e-09, "learning_rate": 9.394234506296655e-06, "loss": 0.0001, "step": 13560 }, { "epoch": 0.06062122234184651, "grad_norm": 4.621643256541574e-06, "learning_rate": 9.393787776581535e-06, "loss": 0.6852, "step": 13570 }, { "epoch": 0.06066589531335856, "grad_norm": 0.004439809825271368, "learning_rate": 9.393341046866415e-06, "loss": 0.6785, "step": 13580 }, { "epoch": 0.060710568284870606, "grad_norm": 0.00013783443137072027, "learning_rate": 9.392894317151294e-06, "loss": 0.0001, "step": 13590 }, { "epoch": 0.06075524125638265, "grad_norm": 2.3134138584136963, "learning_rate": 9.392447587436174e-06, "loss": 0.0132, "step": 13600 }, { "epoch": 0.0607999142278947, "grad_norm": 6.790860061300918e-05, "learning_rate": 9.392000857721054e-06, "loss": 0.0186, "step": 13610 }, { "epoch": 0.06084458719940674, "grad_norm": 0.0005562048172578216, "learning_rate": 9.391554128005932e-06, "loss": 0.4033, "step": 13620 }, { "epoch": 0.06088926017091879, "grad_norm": 9.475531442149077e-06, "learning_rate": 9.391107398290812e-06, "loss": 0.0, "step": 13630 }, { "epoch": 0.060933933142430834, "grad_norm": 4.904528395854868e-06, "learning_rate": 9.390660668575692e-06, "loss": 0.6563, "step": 13640 }, { "epoch": 0.06097860611394288, "grad_norm": 0.0010299455607309937, "learning_rate": 9.390213938860571e-06, "loss": 0.0, "step": 13650 }, { "epoch": 0.06102327908545493, "grad_norm": 1.1471456673461944e-05, "learning_rate": 9.389767209145451e-06, "loss": 0.0002, "step": 13660 }, { "epoch": 0.061067952056966975, "grad_norm": 2.3598842064515013e-13, "learning_rate": 9.389320479430331e-06, "loss": 0.0, "step": 13670 }, { "epoch": 0.06111262502847902, "grad_norm": 2.994657961608027e-07, "learning_rate": 9.388873749715211e-06, "loss": 0.575, "step": 13680 }, { "epoch": 0.06115729799999107, "grad_norm": 2.1437656414491357e-06, "learning_rate": 9.38842702000009e-06, "loss": 0.0021, "step": 13690 }, { "epoch": 0.06120197097150311, "grad_norm": 1.3860614672012161e-05, "learning_rate": 9.38798029028497e-06, "loss": 0.0497, "step": 13700 }, { "epoch": 0.061246643943015155, "grad_norm": 3.358853126189665e-11, "learning_rate": 9.38753356056985e-06, "loss": 0.01, "step": 13710 }, { "epoch": 0.0612913169145272, "grad_norm": 1.0310730935714219e-08, "learning_rate": 9.387086830854728e-06, "loss": 0.0, "step": 13720 }, { "epoch": 0.06133598988603925, "grad_norm": 2.0384447602106448e-11, "learning_rate": 9.386640101139609e-06, "loss": 0.0017, "step": 13730 }, { "epoch": 0.061380662857551296, "grad_norm": 0.6072666049003601, "learning_rate": 9.386193371424489e-06, "loss": 0.0008, "step": 13740 }, { "epoch": 0.06142533582906334, "grad_norm": 6.129848043201491e-05, "learning_rate": 9.385746641709367e-06, "loss": 0.0001, "step": 13750 }, { "epoch": 0.06147000880057539, "grad_norm": 3.1852081720179376e-17, "learning_rate": 9.385299911994247e-06, "loss": 0.0001, "step": 13760 }, { "epoch": 0.06151468177208744, "grad_norm": 0.0002205081982538104, "learning_rate": 9.384853182279127e-06, "loss": 0.0, "step": 13770 }, { "epoch": 0.06155935474359948, "grad_norm": 2.394686705875504e-11, "learning_rate": 9.384406452564006e-06, "loss": 0.0002, "step": 13780 }, { "epoch": 0.061604027715111524, "grad_norm": 0.11703392118215561, "learning_rate": 9.383959722848886e-06, "loss": 0.0, "step": 13790 }, { "epoch": 0.06164870068662357, "grad_norm": 4.587246138498813e-10, "learning_rate": 9.383512993133766e-06, "loss": 0.2469, "step": 13800 }, { "epoch": 0.06169337365813562, "grad_norm": 9.678911737864837e-06, "learning_rate": 9.383066263418644e-06, "loss": 0.0749, "step": 13810 }, { "epoch": 0.061738046629647665, "grad_norm": 0.015324323438107967, "learning_rate": 9.382619533703525e-06, "loss": 0.0014, "step": 13820 }, { "epoch": 0.06178271960115971, "grad_norm": 4.483310857977818e-12, "learning_rate": 9.382172803988405e-06, "loss": 0.0001, "step": 13830 }, { "epoch": 0.06182739257267176, "grad_norm": 7.979075733466345e-10, "learning_rate": 9.381726074273283e-06, "loss": 0.525, "step": 13840 }, { "epoch": 0.0618720655441838, "grad_norm": 2.2147013112316927e-09, "learning_rate": 9.381279344558163e-06, "loss": 0.0003, "step": 13850 }, { "epoch": 0.061916738515695846, "grad_norm": 1.945849462459056e-19, "learning_rate": 9.380832614843042e-06, "loss": 0.0, "step": 13860 }, { "epoch": 0.06196141148720789, "grad_norm": 1.3114672583469655e-05, "learning_rate": 9.380385885127922e-06, "loss": 0.0003, "step": 13870 }, { "epoch": 0.06200608445871994, "grad_norm": 8.562447278981367e-11, "learning_rate": 9.379939155412802e-06, "loss": 0.0, "step": 13880 }, { "epoch": 0.06205075743023199, "grad_norm": 1.6484534848260068e-09, "learning_rate": 9.37949242569768e-06, "loss": 0.0001, "step": 13890 }, { "epoch": 0.062095430401744034, "grad_norm": 313.6671447753906, "learning_rate": 9.37904569598256e-06, "loss": 0.2005, "step": 13900 }, { "epoch": 0.06214010337325608, "grad_norm": 2.4716316368795024e-14, "learning_rate": 9.37859896626744e-06, "loss": 0.0002, "step": 13910 }, { "epoch": 0.06218477634476813, "grad_norm": 2.2001998445375648e-08, "learning_rate": 9.378152236552319e-06, "loss": 0.0001, "step": 13920 }, { "epoch": 0.06222944931628017, "grad_norm": 0.10420388728380203, "learning_rate": 9.3777055068372e-06, "loss": 0.0227, "step": 13930 }, { "epoch": 0.062274122287792215, "grad_norm": 1.254663348197937, "learning_rate": 9.37725877712208e-06, "loss": 0.0001, "step": 13940 }, { "epoch": 0.06231879525930426, "grad_norm": 3.909913825404744e-15, "learning_rate": 9.376812047406958e-06, "loss": 0.0002, "step": 13950 }, { "epoch": 0.06236346823081631, "grad_norm": 4.169524697294341e-13, "learning_rate": 9.376365317691838e-06, "loss": 0.016, "step": 13960 }, { "epoch": 0.062408141202328356, "grad_norm": 6.563727339425895e-10, "learning_rate": 9.375918587976716e-06, "loss": 0.0, "step": 13970 }, { "epoch": 0.0624528141738404, "grad_norm": 20.916521072387695, "learning_rate": 9.375471858261596e-06, "loss": 0.0026, "step": 13980 }, { "epoch": 0.06249748714535245, "grad_norm": 1.0113201248662662e-10, "learning_rate": 9.375025128546477e-06, "loss": 0.0, "step": 13990 }, { "epoch": 0.06254216011686449, "grad_norm": 1.5791814606197843e-14, "learning_rate": 9.374578398831355e-06, "loss": 0.0049, "step": 14000 }, { "epoch": 0.06258683308837654, "grad_norm": 0.00038137752562761307, "learning_rate": 9.374131669116235e-06, "loss": 0.3566, "step": 14010 }, { "epoch": 0.06263150605988858, "grad_norm": 3.751644683802624e-08, "learning_rate": 9.373684939401115e-06, "loss": 0.0, "step": 14020 }, { "epoch": 0.06267617903140063, "grad_norm": 2.194812297821045, "learning_rate": 9.373238209685994e-06, "loss": 0.224, "step": 14030 }, { "epoch": 0.06272085200291268, "grad_norm": 5.022498683615595e-08, "learning_rate": 9.372791479970874e-06, "loss": 0.0, "step": 14040 }, { "epoch": 0.06276552497442472, "grad_norm": 7.967174034772718e-17, "learning_rate": 9.372344750255754e-06, "loss": 0.0057, "step": 14050 }, { "epoch": 0.06281019794593677, "grad_norm": 5.298577785491943, "learning_rate": 9.371898020540632e-06, "loss": 0.0024, "step": 14060 }, { "epoch": 0.06285487091744882, "grad_norm": 1212.22509765625, "learning_rate": 9.371451290825512e-06, "loss": 1.5299, "step": 14070 }, { "epoch": 0.06289954388896087, "grad_norm": 9.872307832337057e-12, "learning_rate": 9.371004561110391e-06, "loss": 0.0, "step": 14080 }, { "epoch": 0.06294421686047291, "grad_norm": 4.333783332199914e-12, "learning_rate": 9.370557831395271e-06, "loss": 0.0043, "step": 14090 }, { "epoch": 0.06298888983198496, "grad_norm": 1.8763324282982754e-19, "learning_rate": 9.370111101680151e-06, "loss": 0.0002, "step": 14100 }, { "epoch": 0.063033562803497, "grad_norm": 2.3205307897455896e-08, "learning_rate": 9.36966437196503e-06, "loss": 0.0001, "step": 14110 }, { "epoch": 0.06307823577500904, "grad_norm": 1.4380998170377416e-17, "learning_rate": 9.36921764224991e-06, "loss": 0.0691, "step": 14120 }, { "epoch": 0.06312290874652109, "grad_norm": 5.406634595730249e-16, "learning_rate": 9.36877091253479e-06, "loss": 0.0, "step": 14130 }, { "epoch": 0.06316758171803313, "grad_norm": 1.297780335893667e-19, "learning_rate": 9.36832418281967e-06, "loss": 1.8627, "step": 14140 }, { "epoch": 0.06321225468954518, "grad_norm": 0.047105517238378525, "learning_rate": 9.367877453104548e-06, "loss": 0.2813, "step": 14150 }, { "epoch": 0.06325692766105723, "grad_norm": 1.0787633073050529e-05, "learning_rate": 9.367430723389429e-06, "loss": 0.0049, "step": 14160 }, { "epoch": 0.06330160063256927, "grad_norm": 7.771322647898149e-12, "learning_rate": 9.366983993674309e-06, "loss": 0.0, "step": 14170 }, { "epoch": 0.06334627360408132, "grad_norm": 0.0006671182345598936, "learning_rate": 9.366537263959187e-06, "loss": 0.0, "step": 14180 }, { "epoch": 0.06339094657559337, "grad_norm": 5.741674375200849e-17, "learning_rate": 9.366090534244067e-06, "loss": 0.0001, "step": 14190 }, { "epoch": 0.06343561954710542, "grad_norm": 3.3889224759775516e-10, "learning_rate": 9.365643804528947e-06, "loss": 0.0475, "step": 14200 }, { "epoch": 0.06348029251861746, "grad_norm": 0.0008080815896391869, "learning_rate": 9.365197074813826e-06, "loss": 0.1359, "step": 14210 }, { "epoch": 0.06352496549012951, "grad_norm": 0.0009778772946447134, "learning_rate": 9.364750345098706e-06, "loss": 0.0049, "step": 14220 }, { "epoch": 0.06356963846164156, "grad_norm": 1.316239450765977e-09, "learning_rate": 9.364303615383586e-06, "loss": 0.0, "step": 14230 }, { "epoch": 0.0636143114331536, "grad_norm": 5.679064088326413e-06, "learning_rate": 9.363856885668464e-06, "loss": 0.0, "step": 14240 }, { "epoch": 0.06365898440466565, "grad_norm": 0.00017402069352101535, "learning_rate": 9.363410155953345e-06, "loss": 0.0128, "step": 14250 }, { "epoch": 0.0637036573761777, "grad_norm": 0.00018130049284081906, "learning_rate": 9.362963426238225e-06, "loss": 0.0, "step": 14260 }, { "epoch": 0.06374833034768974, "grad_norm": 1.457603655830031e-11, "learning_rate": 9.362516696523103e-06, "loss": 0.0, "step": 14270 }, { "epoch": 0.06379300331920178, "grad_norm": 4.875524609815329e-06, "learning_rate": 9.362069966807983e-06, "loss": 0.0, "step": 14280 }, { "epoch": 0.06383767629071382, "grad_norm": 2.04024197536512e-09, "learning_rate": 9.361623237092863e-06, "loss": 0.0, "step": 14290 }, { "epoch": 0.06388234926222587, "grad_norm": 0.00029834118322469294, "learning_rate": 9.361176507377742e-06, "loss": 0.0, "step": 14300 }, { "epoch": 0.06392702223373792, "grad_norm": 0.028371868655085564, "learning_rate": 9.360729777662622e-06, "loss": 0.0, "step": 14310 }, { "epoch": 0.06397169520524996, "grad_norm": 1.0530008163824277e-18, "learning_rate": 9.360283047947502e-06, "loss": 0.0011, "step": 14320 }, { "epoch": 0.06401636817676201, "grad_norm": 1.1820983303323374e-07, "learning_rate": 9.35983631823238e-06, "loss": 0.0, "step": 14330 }, { "epoch": 0.06406104114827406, "grad_norm": 443.0798034667969, "learning_rate": 9.35938958851726e-06, "loss": 0.2126, "step": 14340 }, { "epoch": 0.0641057141197861, "grad_norm": 2.8089353183659207e-15, "learning_rate": 9.358942858802139e-06, "loss": 0.0, "step": 14350 }, { "epoch": 0.06415038709129815, "grad_norm": 4.0729130290628746e-08, "learning_rate": 9.35849612908702e-06, "loss": 1.5, "step": 14360 }, { "epoch": 0.0641950600628102, "grad_norm": 1.8806908687952114e-18, "learning_rate": 9.3580493993719e-06, "loss": 0.0001, "step": 14370 }, { "epoch": 0.06423973303432225, "grad_norm": 4.6179598339746164e-11, "learning_rate": 9.357602669656778e-06, "loss": 0.0, "step": 14380 }, { "epoch": 0.0642844060058343, "grad_norm": 2.830821017596463e-07, "learning_rate": 9.357155939941658e-06, "loss": 0.0006, "step": 14390 }, { "epoch": 0.06432907897734634, "grad_norm": 1.0931796801116889e-08, "learning_rate": 9.356709210226538e-06, "loss": 0.0001, "step": 14400 }, { "epoch": 0.06437375194885839, "grad_norm": 1.5363036709459266e-07, "learning_rate": 9.356262480511416e-06, "loss": 0.0, "step": 14410 }, { "epoch": 0.06441842492037043, "grad_norm": 4.7219604226711454e-08, "learning_rate": 9.355815750796297e-06, "loss": 0.0, "step": 14420 }, { "epoch": 0.06446309789188247, "grad_norm": 1.4899464995110634e-09, "learning_rate": 9.355369021081177e-06, "loss": 0.0089, "step": 14430 }, { "epoch": 0.06450777086339451, "grad_norm": 0.0672210082411766, "learning_rate": 9.354922291366055e-06, "loss": 0.3282, "step": 14440 }, { "epoch": 0.06455244383490656, "grad_norm": 2.0680950001406018e-06, "learning_rate": 9.354475561650935e-06, "loss": 0.0, "step": 14450 }, { "epoch": 0.06459711680641861, "grad_norm": 591.7171020507812, "learning_rate": 9.354028831935814e-06, "loss": 0.2345, "step": 14460 }, { "epoch": 0.06464178977793066, "grad_norm": 2.4443409518681847e-09, "learning_rate": 9.353582102220694e-06, "loss": 0.0068, "step": 14470 }, { "epoch": 0.0646864627494427, "grad_norm": 0.0029351816046983004, "learning_rate": 9.353135372505574e-06, "loss": 0.0005, "step": 14480 }, { "epoch": 0.06473113572095475, "grad_norm": 2.7630872523332073e-07, "learning_rate": 9.352688642790452e-06, "loss": 0.0031, "step": 14490 }, { "epoch": 0.0647758086924668, "grad_norm": 0.0005741612403653562, "learning_rate": 9.352241913075332e-06, "loss": 0.0, "step": 14500 }, { "epoch": 0.06482048166397884, "grad_norm": 2.8474836377689883e-10, "learning_rate": 9.351795183360213e-06, "loss": 0.0001, "step": 14510 }, { "epoch": 0.06486515463549089, "grad_norm": 1.0916278370132204e-07, "learning_rate": 9.351348453645091e-06, "loss": 0.0, "step": 14520 }, { "epoch": 0.06490982760700294, "grad_norm": 1.2938996662298363e-12, "learning_rate": 9.350901723929971e-06, "loss": 0.0, "step": 14530 }, { "epoch": 0.06495450057851498, "grad_norm": 1.8603714055376308e-11, "learning_rate": 9.350454994214851e-06, "loss": 0.0022, "step": 14540 }, { "epoch": 0.06499917355002703, "grad_norm": 5.892105026106265e-09, "learning_rate": 9.35000826449973e-06, "loss": 0.775, "step": 14550 }, { "epoch": 0.06504384652153908, "grad_norm": 0.0011221965542063117, "learning_rate": 9.34956153478461e-06, "loss": 0.0, "step": 14560 }, { "epoch": 0.06508851949305113, "grad_norm": 4.0386902355749044e-07, "learning_rate": 9.349114805069488e-06, "loss": 0.0, "step": 14570 }, { "epoch": 0.06513319246456317, "grad_norm": 2.900226170332587e-10, "learning_rate": 9.348668075354368e-06, "loss": 0.0, "step": 14580 }, { "epoch": 0.0651778654360752, "grad_norm": 1.719579268033729e-15, "learning_rate": 9.348221345639249e-06, "loss": 0.0001, "step": 14590 }, { "epoch": 0.06522253840758725, "grad_norm": 2.6878053602241714e-11, "learning_rate": 9.347774615924129e-06, "loss": 0.0009, "step": 14600 }, { "epoch": 0.0652672113790993, "grad_norm": 4.025258316768532e-13, "learning_rate": 9.347327886209007e-06, "loss": 0.0, "step": 14610 }, { "epoch": 0.06531188435061135, "grad_norm": 2.202270508178117e-07, "learning_rate": 9.346881156493887e-06, "loss": 0.0045, "step": 14620 }, { "epoch": 0.0653565573221234, "grad_norm": 11.413064956665039, "learning_rate": 9.346434426778767e-06, "loss": 0.0016, "step": 14630 }, { "epoch": 0.06540123029363544, "grad_norm": 0.0013619032688438892, "learning_rate": 9.345987697063646e-06, "loss": 0.0, "step": 14640 }, { "epoch": 0.06544590326514749, "grad_norm": 1.1321811679365981e-13, "learning_rate": 9.345540967348526e-06, "loss": 0.1317, "step": 14650 }, { "epoch": 0.06549057623665953, "grad_norm": 0.0016499441117048264, "learning_rate": 9.345094237633406e-06, "loss": 0.0002, "step": 14660 }, { "epoch": 0.06553524920817158, "grad_norm": 2.5986149538616277e-13, "learning_rate": 9.344647507918284e-06, "loss": 0.166, "step": 14670 }, { "epoch": 0.06557992217968363, "grad_norm": 0.00024417508393526077, "learning_rate": 9.344200778203165e-06, "loss": 0.0004, "step": 14680 }, { "epoch": 0.06562459515119567, "grad_norm": 0.21002331376075745, "learning_rate": 9.343754048488045e-06, "loss": 0.0001, "step": 14690 }, { "epoch": 0.06566926812270772, "grad_norm": 30.844282150268555, "learning_rate": 9.343307318772923e-06, "loss": 0.2183, "step": 14700 }, { "epoch": 0.06571394109421977, "grad_norm": 2.5580968099347956e-07, "learning_rate": 9.342860589057803e-06, "loss": 0.0, "step": 14710 }, { "epoch": 0.06575861406573182, "grad_norm": 5.260941406959319e-07, "learning_rate": 9.342413859342683e-06, "loss": 0.0, "step": 14720 }, { "epoch": 0.06580328703724386, "grad_norm": 0.0003637924382928759, "learning_rate": 9.341967129627563e-06, "loss": 0.0, "step": 14730 }, { "epoch": 0.0658479600087559, "grad_norm": 2.3141837296285217e-13, "learning_rate": 9.341520399912442e-06, "loss": 0.0, "step": 14740 }, { "epoch": 0.06589263298026794, "grad_norm": 4.705751166511618e-07, "learning_rate": 9.341073670197322e-06, "loss": 0.0041, "step": 14750 }, { "epoch": 0.06593730595177999, "grad_norm": 9.326619476723863e-09, "learning_rate": 9.3406269404822e-06, "loss": 0.0004, "step": 14760 }, { "epoch": 0.06598197892329204, "grad_norm": 2.6402527386615304e-10, "learning_rate": 9.34018021076708e-06, "loss": 0.0, "step": 14770 }, { "epoch": 0.06602665189480408, "grad_norm": 7.31518491614569e-12, "learning_rate": 9.33973348105196e-06, "loss": 0.0, "step": 14780 }, { "epoch": 0.06607132486631613, "grad_norm": 0.0001492876763222739, "learning_rate": 9.33928675133684e-06, "loss": 0.0018, "step": 14790 }, { "epoch": 0.06611599783782818, "grad_norm": 7.935686153359711e-05, "learning_rate": 9.33884002162172e-06, "loss": 0.0034, "step": 14800 }, { "epoch": 0.06616067080934022, "grad_norm": 4.462126668158817e-09, "learning_rate": 9.3383932919066e-06, "loss": 0.0, "step": 14810 }, { "epoch": 0.06620534378085227, "grad_norm": 9.117017611970368e-07, "learning_rate": 9.337946562191478e-06, "loss": 0.0001, "step": 14820 }, { "epoch": 0.06625001675236432, "grad_norm": 1.0151539981961832e-06, "learning_rate": 9.337499832476358e-06, "loss": 0.0387, "step": 14830 }, { "epoch": 0.06629468972387637, "grad_norm": 1.5695963156758808e-05, "learning_rate": 9.337053102761238e-06, "loss": 0.0, "step": 14840 }, { "epoch": 0.06633936269538841, "grad_norm": 0.00035690312506631017, "learning_rate": 9.336606373046117e-06, "loss": 0.0043, "step": 14850 }, { "epoch": 0.06638403566690046, "grad_norm": 0.0006880881846882403, "learning_rate": 9.336159643330997e-06, "loss": 0.0002, "step": 14860 }, { "epoch": 0.0664287086384125, "grad_norm": 4.308242080279393e-15, "learning_rate": 9.335712913615875e-06, "loss": 0.0, "step": 14870 }, { "epoch": 0.06647338160992455, "grad_norm": 0.030537355691194534, "learning_rate": 9.335266183900755e-06, "loss": 0.0, "step": 14880 }, { "epoch": 0.0665180545814366, "grad_norm": 0.016046520322561264, "learning_rate": 9.334819454185635e-06, "loss": 0.0, "step": 14890 }, { "epoch": 0.06656272755294863, "grad_norm": 8.736069139558822e-05, "learning_rate": 9.334372724470514e-06, "loss": 0.0001, "step": 14900 }, { "epoch": 0.06660740052446068, "grad_norm": 9.695746659045596e-21, "learning_rate": 9.333925994755394e-06, "loss": 0.0357, "step": 14910 }, { "epoch": 0.06665207349597273, "grad_norm": 1.6318119833158562e-07, "learning_rate": 9.333479265040274e-06, "loss": 0.0, "step": 14920 }, { "epoch": 0.06669674646748477, "grad_norm": 6.515248287541908e-07, "learning_rate": 9.333032535325152e-06, "loss": 0.0, "step": 14930 }, { "epoch": 0.06674141943899682, "grad_norm": 3.7281594700289133e-07, "learning_rate": 9.332585805610033e-06, "loss": 0.0021, "step": 14940 }, { "epoch": 0.06678609241050887, "grad_norm": 1.641012048470003e-10, "learning_rate": 9.332139075894911e-06, "loss": 0.0202, "step": 14950 }, { "epoch": 0.06683076538202092, "grad_norm": 4.301578428567154e-06, "learning_rate": 9.331692346179791e-06, "loss": 0.0002, "step": 14960 }, { "epoch": 0.06687543835353296, "grad_norm": 1.3561146563340065e-13, "learning_rate": 9.331245616464671e-06, "loss": 0.0002, "step": 14970 }, { "epoch": 0.06692011132504501, "grad_norm": 0.03523876518011093, "learning_rate": 9.33079888674955e-06, "loss": 0.0089, "step": 14980 }, { "epoch": 0.06696478429655706, "grad_norm": 8.709083801350381e-13, "learning_rate": 9.33035215703443e-06, "loss": 0.0001, "step": 14990 }, { "epoch": 0.0670094572680691, "grad_norm": 1.5945241571557744e-08, "learning_rate": 9.32990542731931e-06, "loss": 0.0, "step": 15000 }, { "epoch": 0.06705413023958115, "grad_norm": 2.682080391114283e-10, "learning_rate": 9.329458697604188e-06, "loss": 0.0002, "step": 15010 }, { "epoch": 0.0670988032110932, "grad_norm": 8.974782722548369e-11, "learning_rate": 9.329011967889069e-06, "loss": 0.0001, "step": 15020 }, { "epoch": 0.06714347618260524, "grad_norm": 7.33713756417842e-10, "learning_rate": 9.328565238173949e-06, "loss": 0.0, "step": 15030 }, { "epoch": 0.06718814915411729, "grad_norm": 1.4199095232925174e-08, "learning_rate": 9.328118508458827e-06, "loss": 0.0, "step": 15040 }, { "epoch": 0.06723282212562932, "grad_norm": 2.654218178577139e-09, "learning_rate": 9.327671778743707e-06, "loss": 0.0016, "step": 15050 }, { "epoch": 0.06727749509714137, "grad_norm": 1.912253111391836e-15, "learning_rate": 9.327225049028587e-06, "loss": 0.0, "step": 15060 }, { "epoch": 0.06732216806865342, "grad_norm": 1.1236858554752871e-08, "learning_rate": 9.326778319313466e-06, "loss": 0.0, "step": 15070 }, { "epoch": 0.06736684104016546, "grad_norm": 0.002570349955931306, "learning_rate": 9.326331589598346e-06, "loss": 0.0, "step": 15080 }, { "epoch": 0.06741151401167751, "grad_norm": 0.00039951750659383833, "learning_rate": 9.325884859883226e-06, "loss": 0.0, "step": 15090 }, { "epoch": 0.06745618698318956, "grad_norm": 1.2615786008161986e-15, "learning_rate": 9.325438130168104e-06, "loss": 0.0002, "step": 15100 }, { "epoch": 0.0675008599547016, "grad_norm": 2.0209645299473777e-05, "learning_rate": 9.324991400452985e-06, "loss": 0.0, "step": 15110 }, { "epoch": 0.06754553292621365, "grad_norm": 1.8193735513705178e-06, "learning_rate": 9.324544670737865e-06, "loss": 0.0, "step": 15120 }, { "epoch": 0.0675902058977257, "grad_norm": 1.6376893667728246e-16, "learning_rate": 9.324097941022743e-06, "loss": 0.0, "step": 15130 }, { "epoch": 0.06763487886923775, "grad_norm": 2.2886815071105957, "learning_rate": 9.323651211307623e-06, "loss": 0.0207, "step": 15140 }, { "epoch": 0.0676795518407498, "grad_norm": 0.00047048478154465556, "learning_rate": 9.323204481592503e-06, "loss": 0.0001, "step": 15150 }, { "epoch": 0.06772422481226184, "grad_norm": 3.897758688253816e-06, "learning_rate": 9.322757751877382e-06, "loss": 0.0055, "step": 15160 }, { "epoch": 0.06776889778377389, "grad_norm": 6.203567863849457e-07, "learning_rate": 9.322311022162262e-06, "loss": 0.0, "step": 15170 }, { "epoch": 0.06781357075528593, "grad_norm": 2.630331891850801e-06, "learning_rate": 9.321864292447142e-06, "loss": 0.0, "step": 15180 }, { "epoch": 0.06785824372679798, "grad_norm": 1.54407257228987e-17, "learning_rate": 9.321417562732022e-06, "loss": 0.0, "step": 15190 }, { "epoch": 0.06790291669831001, "grad_norm": 1.8115958386188957e-11, "learning_rate": 9.3209708330169e-06, "loss": 0.0635, "step": 15200 }, { "epoch": 0.06794758966982206, "grad_norm": 3.644802558724902e-13, "learning_rate": 9.32052410330178e-06, "loss": 0.0003, "step": 15210 }, { "epoch": 0.06799226264133411, "grad_norm": 0.0009677475318312645, "learning_rate": 9.320077373586661e-06, "loss": 0.0, "step": 15220 }, { "epoch": 0.06803693561284616, "grad_norm": 6.228935951091234e-18, "learning_rate": 9.31963064387154e-06, "loss": 0.0, "step": 15230 }, { "epoch": 0.0680816085843582, "grad_norm": 4.024088411824778e-05, "learning_rate": 9.31918391415642e-06, "loss": 0.1055, "step": 15240 }, { "epoch": 0.06812628155587025, "grad_norm": 0.04487450420856476, "learning_rate": 9.318737184441298e-06, "loss": 0.0, "step": 15250 }, { "epoch": 0.0681709545273823, "grad_norm": 4.879202426068046e-15, "learning_rate": 9.318290454726178e-06, "loss": 0.0452, "step": 15260 }, { "epoch": 0.06821562749889434, "grad_norm": 0.0355444997549057, "learning_rate": 9.317843725011058e-06, "loss": 0.0007, "step": 15270 }, { "epoch": 0.06826030047040639, "grad_norm": 1.9042292742632583e-10, "learning_rate": 9.317396995295937e-06, "loss": 0.0004, "step": 15280 }, { "epoch": 0.06830497344191844, "grad_norm": 0.001804482308216393, "learning_rate": 9.316950265580817e-06, "loss": 0.0006, "step": 15290 }, { "epoch": 0.06834964641343048, "grad_norm": 7.79038145992672e-06, "learning_rate": 9.316503535865697e-06, "loss": 0.0, "step": 15300 }, { "epoch": 0.06839431938494253, "grad_norm": 0.006531926803290844, "learning_rate": 9.316056806150575e-06, "loss": 0.0, "step": 15310 }, { "epoch": 0.06843899235645458, "grad_norm": 0.00145976641215384, "learning_rate": 9.315610076435455e-06, "loss": 0.0, "step": 15320 }, { "epoch": 0.06848366532796663, "grad_norm": 1.1706009672707296e-06, "learning_rate": 9.315163346720335e-06, "loss": 0.0026, "step": 15330 }, { "epoch": 0.06852833829947867, "grad_norm": 5.742387454432674e-09, "learning_rate": 9.314716617005214e-06, "loss": 0.0002, "step": 15340 }, { "epoch": 0.06857301127099072, "grad_norm": 2.1157748699188232, "learning_rate": 9.314269887290094e-06, "loss": 0.0007, "step": 15350 }, { "epoch": 0.06861768424250275, "grad_norm": 8.668970394865028e-07, "learning_rate": 9.313823157574972e-06, "loss": 0.0, "step": 15360 }, { "epoch": 0.0686623572140148, "grad_norm": 0.00040037737926468253, "learning_rate": 9.313376427859853e-06, "loss": 0.0011, "step": 15370 }, { "epoch": 0.06870703018552685, "grad_norm": 0.06943363696336746, "learning_rate": 9.312929698144733e-06, "loss": 0.0127, "step": 15380 }, { "epoch": 0.0687517031570389, "grad_norm": 0.0003452890960033983, "learning_rate": 9.312482968429611e-06, "loss": 0.0, "step": 15390 }, { "epoch": 0.06879637612855094, "grad_norm": 324.0003967285156, "learning_rate": 9.312036238714491e-06, "loss": 0.1562, "step": 15400 }, { "epoch": 0.06884104910006299, "grad_norm": 0.0001255038077943027, "learning_rate": 9.311589508999371e-06, "loss": 0.008, "step": 15410 }, { "epoch": 0.06888572207157503, "grad_norm": 0.10439004004001617, "learning_rate": 9.31114277928425e-06, "loss": 0.0001, "step": 15420 }, { "epoch": 0.06893039504308708, "grad_norm": 0.010804546996951103, "learning_rate": 9.31069604956913e-06, "loss": 0.0008, "step": 15430 }, { "epoch": 0.06897506801459913, "grad_norm": 1.23363960824463e-07, "learning_rate": 9.31024931985401e-06, "loss": 0.0, "step": 15440 }, { "epoch": 0.06901974098611117, "grad_norm": 1.2098164809369594e-14, "learning_rate": 9.309802590138888e-06, "loss": 0.0, "step": 15450 }, { "epoch": 0.06906441395762322, "grad_norm": 3.7172107297638035e-13, "learning_rate": 9.309355860423769e-06, "loss": 0.305, "step": 15460 }, { "epoch": 0.06910908692913527, "grad_norm": 1.8141294266501087e-15, "learning_rate": 9.308909130708647e-06, "loss": 0.0, "step": 15470 }, { "epoch": 0.06915375990064732, "grad_norm": 4.109392166137695, "learning_rate": 9.308462400993527e-06, "loss": 0.0006, "step": 15480 }, { "epoch": 0.06919843287215936, "grad_norm": 634.8197631835938, "learning_rate": 9.308015671278407e-06, "loss": 0.3976, "step": 15490 }, { "epoch": 0.06924310584367141, "grad_norm": 7.697084426879883, "learning_rate": 9.307568941563286e-06, "loss": 0.0018, "step": 15500 }, { "epoch": 0.06928777881518344, "grad_norm": 0.9893437027931213, "learning_rate": 9.307122211848166e-06, "loss": 0.0978, "step": 15510 }, { "epoch": 0.06933245178669549, "grad_norm": 3.965033101849258e-06, "learning_rate": 9.306675482133046e-06, "loss": 0.0, "step": 15520 }, { "epoch": 0.06937712475820754, "grad_norm": 481.1986083984375, "learning_rate": 9.306228752417924e-06, "loss": 0.6469, "step": 15530 }, { "epoch": 0.06942179772971958, "grad_norm": 0.00033715751487761736, "learning_rate": 9.305782022702805e-06, "loss": 0.0, "step": 15540 }, { "epoch": 0.06946647070123163, "grad_norm": 740.4984741210938, "learning_rate": 9.305335292987685e-06, "loss": 0.6, "step": 15550 }, { "epoch": 0.06951114367274368, "grad_norm": 1.4198823919286951e-05, "learning_rate": 9.304888563272563e-06, "loss": 0.3234, "step": 15560 }, { "epoch": 0.06955581664425572, "grad_norm": 7.220014595077373e-06, "learning_rate": 9.304441833557443e-06, "loss": 0.0, "step": 15570 }, { "epoch": 0.06960048961576777, "grad_norm": 3.1226806640625, "learning_rate": 9.303995103842323e-06, "loss": 0.0007, "step": 15580 }, { "epoch": 0.06964516258727982, "grad_norm": 7.694500823163253e-07, "learning_rate": 9.303548374127202e-06, "loss": 0.016, "step": 15590 }, { "epoch": 0.06968983555879187, "grad_norm": 1.0497649327589897e-06, "learning_rate": 9.303101644412082e-06, "loss": 0.0, "step": 15600 }, { "epoch": 0.06973450853030391, "grad_norm": 2.502213760635641e-07, "learning_rate": 9.302654914696962e-06, "loss": 0.0, "step": 15610 }, { "epoch": 0.06977918150181596, "grad_norm": 9.276220199957996e-13, "learning_rate": 9.30220818498184e-06, "loss": 0.0008, "step": 15620 }, { "epoch": 0.069823854473328, "grad_norm": 5.567364902425709e-10, "learning_rate": 9.30176145526672e-06, "loss": 0.0001, "step": 15630 }, { "epoch": 0.06986852744484005, "grad_norm": 4.1637374437186736e-08, "learning_rate": 9.3013147255516e-06, "loss": 0.0, "step": 15640 }, { "epoch": 0.0699132004163521, "grad_norm": 8.487996546943655e-14, "learning_rate": 9.30086799583648e-06, "loss": 0.0, "step": 15650 }, { "epoch": 0.06995787338786415, "grad_norm": 4.741735839695546e-14, "learning_rate": 9.30042126612136e-06, "loss": 0.0, "step": 15660 }, { "epoch": 0.07000254635937618, "grad_norm": 2.599381900836306e-07, "learning_rate": 9.29997453640624e-06, "loss": 0.0, "step": 15670 }, { "epoch": 0.07004721933088823, "grad_norm": 8.456419777758128e-07, "learning_rate": 9.29952780669112e-06, "loss": 0.0, "step": 15680 }, { "epoch": 0.07009189230240027, "grad_norm": 4.6256240947708277e-17, "learning_rate": 9.299081076975998e-06, "loss": 0.0114, "step": 15690 }, { "epoch": 0.07013656527391232, "grad_norm": 3.173031715847374e-12, "learning_rate": 9.298634347260878e-06, "loss": 0.0001, "step": 15700 }, { "epoch": 0.07018123824542437, "grad_norm": 0.0004218385729473084, "learning_rate": 9.298187617545758e-06, "loss": 0.0003, "step": 15710 }, { "epoch": 0.07022591121693642, "grad_norm": 1.6945949028013274e-05, "learning_rate": 9.297740887830637e-06, "loss": 0.0079, "step": 15720 }, { "epoch": 0.07027058418844846, "grad_norm": 6.766648439487932e-12, "learning_rate": 9.297294158115517e-06, "loss": 0.0, "step": 15730 }, { "epoch": 0.07031525715996051, "grad_norm": 1.574522101783528e-14, "learning_rate": 9.296847428400397e-06, "loss": 0.0024, "step": 15740 }, { "epoch": 0.07035993013147256, "grad_norm": 0.001323819742538035, "learning_rate": 9.296400698685275e-06, "loss": 0.0, "step": 15750 }, { "epoch": 0.0704046031029846, "grad_norm": 6.251634658838157e-06, "learning_rate": 9.295953968970155e-06, "loss": 0.6818, "step": 15760 }, { "epoch": 0.07044927607449665, "grad_norm": 44.87289047241211, "learning_rate": 9.295507239255034e-06, "loss": 0.0062, "step": 15770 }, { "epoch": 0.0704939490460087, "grad_norm": 2.0296934163770153e-13, "learning_rate": 9.295060509539914e-06, "loss": 0.0011, "step": 15780 }, { "epoch": 0.07053862201752074, "grad_norm": 2.8807021408283617e-06, "learning_rate": 9.294613779824794e-06, "loss": 0.0, "step": 15790 }, { "epoch": 0.07058329498903279, "grad_norm": 4.878136678598821e-06, "learning_rate": 9.294167050109673e-06, "loss": 0.0, "step": 15800 }, { "epoch": 0.07062796796054484, "grad_norm": 2.258621067952049e-10, "learning_rate": 9.293720320394553e-06, "loss": 0.1805, "step": 15810 }, { "epoch": 0.07067264093205687, "grad_norm": 5.581754521699622e-05, "learning_rate": 9.293273590679433e-06, "loss": 0.0, "step": 15820 }, { "epoch": 0.07071731390356892, "grad_norm": 1.2751265785482246e-05, "learning_rate": 9.292826860964311e-06, "loss": 0.0001, "step": 15830 }, { "epoch": 0.07076198687508096, "grad_norm": 0.00011787868425017223, "learning_rate": 9.292380131249191e-06, "loss": 0.0, "step": 15840 }, { "epoch": 0.07080665984659301, "grad_norm": 0.0021484734024852514, "learning_rate": 9.29193340153407e-06, "loss": 0.0824, "step": 15850 }, { "epoch": 0.07085133281810506, "grad_norm": 0.1802006959915161, "learning_rate": 9.29148667181895e-06, "loss": 0.0, "step": 15860 }, { "epoch": 0.0708960057896171, "grad_norm": 0.08214273303747177, "learning_rate": 9.29103994210383e-06, "loss": 0.0019, "step": 15870 }, { "epoch": 0.07094067876112915, "grad_norm": 1.5906814336776733, "learning_rate": 9.290593212388708e-06, "loss": 0.0015, "step": 15880 }, { "epoch": 0.0709853517326412, "grad_norm": 2.6559273926530223e-16, "learning_rate": 9.290146482673589e-06, "loss": 0.0004, "step": 15890 }, { "epoch": 0.07103002470415325, "grad_norm": 4.014354999526404e-06, "learning_rate": 9.289699752958469e-06, "loss": 0.0, "step": 15900 }, { "epoch": 0.0710746976756653, "grad_norm": 0.007187653798609972, "learning_rate": 9.289253023243347e-06, "loss": 0.0, "step": 15910 }, { "epoch": 0.07111937064717734, "grad_norm": 1.90598314908641e-09, "learning_rate": 9.288806293528227e-06, "loss": 0.0005, "step": 15920 }, { "epoch": 0.07116404361868939, "grad_norm": 7.094026429656231e-13, "learning_rate": 9.288359563813107e-06, "loss": 0.0825, "step": 15930 }, { "epoch": 0.07120871659020143, "grad_norm": 2.1279684479069667e-16, "learning_rate": 9.287912834097986e-06, "loss": 0.0038, "step": 15940 }, { "epoch": 0.07125338956171348, "grad_norm": 1.595363841033759e-10, "learning_rate": 9.287466104382866e-06, "loss": 0.0, "step": 15950 }, { "epoch": 0.07129806253322553, "grad_norm": 5.384310280075709e-14, "learning_rate": 9.287019374667744e-06, "loss": 0.0, "step": 15960 }, { "epoch": 0.07134273550473758, "grad_norm": 0.05375685170292854, "learning_rate": 9.286572644952625e-06, "loss": 0.0843, "step": 15970 }, { "epoch": 0.07138740847624961, "grad_norm": 0.05173249915242195, "learning_rate": 9.286125915237505e-06, "loss": 0.0005, "step": 15980 }, { "epoch": 0.07143208144776166, "grad_norm": 1140.3134765625, "learning_rate": 9.285679185522383e-06, "loss": 0.5876, "step": 15990 }, { "epoch": 0.0714767544192737, "grad_norm": 7.1331573963107076e-06, "learning_rate": 9.285232455807263e-06, "loss": 0.0428, "step": 16000 }, { "epoch": 0.07152142739078575, "grad_norm": 3.3845615234895376e-06, "learning_rate": 9.284785726092143e-06, "loss": 0.0336, "step": 16010 }, { "epoch": 0.0715661003622978, "grad_norm": 0.0025598302017897367, "learning_rate": 9.284338996377022e-06, "loss": 0.008, "step": 16020 }, { "epoch": 0.07161077333380984, "grad_norm": 1.5907980198595205e-16, "learning_rate": 9.283892266661902e-06, "loss": 0.0, "step": 16030 }, { "epoch": 0.07165544630532189, "grad_norm": 13.82027530670166, "learning_rate": 9.283445536946782e-06, "loss": 0.006, "step": 16040 }, { "epoch": 0.07170011927683394, "grad_norm": 3.120620428731513e-11, "learning_rate": 9.28299880723166e-06, "loss": 0.0, "step": 16050 }, { "epoch": 0.07174479224834598, "grad_norm": 6.461245760647216e-09, "learning_rate": 9.28255207751654e-06, "loss": 0.0, "step": 16060 }, { "epoch": 0.07178946521985803, "grad_norm": 4.7788863144493376e-11, "learning_rate": 9.28210534780142e-06, "loss": 0.0003, "step": 16070 }, { "epoch": 0.07183413819137008, "grad_norm": 467.54833984375, "learning_rate": 9.281658618086299e-06, "loss": 0.2234, "step": 16080 }, { "epoch": 0.07187881116288213, "grad_norm": 1.4692007376318483e-13, "learning_rate": 9.28121188837118e-06, "loss": 0.0919, "step": 16090 }, { "epoch": 0.07192348413439417, "grad_norm": 2.0292225677565057e-09, "learning_rate": 9.28076515865606e-06, "loss": 0.0, "step": 16100 }, { "epoch": 0.07196815710590622, "grad_norm": 6.15861320274469e-11, "learning_rate": 9.28031842894094e-06, "loss": 0.1408, "step": 16110 }, { "epoch": 0.07201283007741827, "grad_norm": 5.3670348165724135e-09, "learning_rate": 9.279871699225818e-06, "loss": 0.45, "step": 16120 }, { "epoch": 0.0720575030489303, "grad_norm": 7.453482275820988e-13, "learning_rate": 9.279424969510698e-06, "loss": 0.0, "step": 16130 }, { "epoch": 0.07210217602044235, "grad_norm": 0.0009137971210293472, "learning_rate": 9.278978239795578e-06, "loss": 0.0002, "step": 16140 }, { "epoch": 0.0721468489919544, "grad_norm": 0.03269213065505028, "learning_rate": 9.278531510080457e-06, "loss": 0.0005, "step": 16150 }, { "epoch": 0.07219152196346644, "grad_norm": 9.894575825342145e-09, "learning_rate": 9.278084780365337e-06, "loss": 0.0, "step": 16160 }, { "epoch": 0.07223619493497849, "grad_norm": 6.140439836599398e-07, "learning_rate": 9.277638050650217e-06, "loss": 0.0, "step": 16170 }, { "epoch": 0.07228086790649053, "grad_norm": 2.58226134919326e-10, "learning_rate": 9.277191320935095e-06, "loss": 0.0, "step": 16180 }, { "epoch": 0.07232554087800258, "grad_norm": 4.679202338098154e-12, "learning_rate": 9.276744591219975e-06, "loss": 0.0313, "step": 16190 }, { "epoch": 0.07237021384951463, "grad_norm": 2.7806524371953856e-07, "learning_rate": 9.276297861504856e-06, "loss": 0.0, "step": 16200 }, { "epoch": 0.07241488682102667, "grad_norm": 3.463020668836858e-12, "learning_rate": 9.275851131789734e-06, "loss": 0.0, "step": 16210 }, { "epoch": 0.07245955979253872, "grad_norm": 7.632701415349885e-11, "learning_rate": 9.275404402074614e-06, "loss": 0.0313, "step": 16220 }, { "epoch": 0.07250423276405077, "grad_norm": 7.043704812123575e-13, "learning_rate": 9.274957672359494e-06, "loss": 0.0004, "step": 16230 }, { "epoch": 0.07254890573556282, "grad_norm": 0.0031528829131275415, "learning_rate": 9.274510942644373e-06, "loss": 0.0, "step": 16240 }, { "epoch": 0.07259357870707486, "grad_norm": 2.61696027337166e-06, "learning_rate": 9.274064212929253e-06, "loss": 0.0, "step": 16250 }, { "epoch": 0.07263825167858691, "grad_norm": 9.001651763916016, "learning_rate": 9.273617483214131e-06, "loss": 0.0009, "step": 16260 }, { "epoch": 0.07268292465009896, "grad_norm": 8.616438407216265e-08, "learning_rate": 9.273170753499011e-06, "loss": 0.0, "step": 16270 }, { "epoch": 0.07272759762161099, "grad_norm": 1.6792819912403445e-10, "learning_rate": 9.272724023783891e-06, "loss": 0.0, "step": 16280 }, { "epoch": 0.07277227059312304, "grad_norm": 1.614965228213805e-08, "learning_rate": 9.27227729406877e-06, "loss": 0.0, "step": 16290 }, { "epoch": 0.07281694356463508, "grad_norm": 0.0012801244156435132, "learning_rate": 9.27183056435365e-06, "loss": 0.0, "step": 16300 }, { "epoch": 0.07286161653614713, "grad_norm": 1.368288451430999e-07, "learning_rate": 9.27138383463853e-06, "loss": 0.0145, "step": 16310 }, { "epoch": 0.07290628950765918, "grad_norm": 0.010511506348848343, "learning_rate": 9.270937104923409e-06, "loss": 0.0, "step": 16320 }, { "epoch": 0.07295096247917122, "grad_norm": 0.007803723216056824, "learning_rate": 9.270490375208289e-06, "loss": 0.1813, "step": 16330 }, { "epoch": 0.07299563545068327, "grad_norm": 3.816868696104869e-12, "learning_rate": 9.270043645493167e-06, "loss": 0.0904, "step": 16340 }, { "epoch": 0.07304030842219532, "grad_norm": 0.005351858213543892, "learning_rate": 9.269596915778047e-06, "loss": 0.0014, "step": 16350 }, { "epoch": 0.07308498139370737, "grad_norm": 1.0312246889743193e-10, "learning_rate": 9.269150186062927e-06, "loss": 0.0, "step": 16360 }, { "epoch": 0.07312965436521941, "grad_norm": 8.958149326498965e-12, "learning_rate": 9.268703456347806e-06, "loss": 0.0, "step": 16370 }, { "epoch": 0.07317432733673146, "grad_norm": 1.3682298566700113e-12, "learning_rate": 9.268256726632686e-06, "loss": 0.0, "step": 16380 }, { "epoch": 0.0732190003082435, "grad_norm": 5.427913674083129e-10, "learning_rate": 9.267809996917566e-06, "loss": 0.0, "step": 16390 }, { "epoch": 0.07326367327975555, "grad_norm": 4.0604925288789673e-07, "learning_rate": 9.267363267202445e-06, "loss": 0.0092, "step": 16400 }, { "epoch": 0.0733083462512676, "grad_norm": 2.0408910786490604e-16, "learning_rate": 9.266916537487325e-06, "loss": 0.0, "step": 16410 }, { "epoch": 0.07335301922277965, "grad_norm": 9.766434959601611e-05, "learning_rate": 9.266469807772205e-06, "loss": 0.0348, "step": 16420 }, { "epoch": 0.0733976921942917, "grad_norm": 0.0839284136891365, "learning_rate": 9.266023078057083e-06, "loss": 0.0, "step": 16430 }, { "epoch": 0.07344236516580373, "grad_norm": 3.1561714877170743e-06, "learning_rate": 9.265576348341963e-06, "loss": 0.0002, "step": 16440 }, { "epoch": 0.07348703813731577, "grad_norm": 0.004904979839920998, "learning_rate": 9.265129618626842e-06, "loss": 0.0, "step": 16450 }, { "epoch": 0.07353171110882782, "grad_norm": 4.0802338075403196e-19, "learning_rate": 9.264682888911722e-06, "loss": 0.0, "step": 16460 }, { "epoch": 0.07357638408033987, "grad_norm": 9.05775454640434e-12, "learning_rate": 9.264236159196602e-06, "loss": 0.0, "step": 16470 }, { "epoch": 0.07362105705185192, "grad_norm": 1.586842289513779e-11, "learning_rate": 9.26378942948148e-06, "loss": 0.0, "step": 16480 }, { "epoch": 0.07366573002336396, "grad_norm": 5.9178513765800744e-05, "learning_rate": 9.26334269976636e-06, "loss": 0.0016, "step": 16490 }, { "epoch": 0.07371040299487601, "grad_norm": 625.9370727539062, "learning_rate": 9.26289597005124e-06, "loss": 1.0798, "step": 16500 }, { "epoch": 0.07375507596638806, "grad_norm": 14.680355072021484, "learning_rate": 9.262449240336119e-06, "loss": 0.0029, "step": 16510 }, { "epoch": 0.0737997489379001, "grad_norm": 9.009173717103563e-16, "learning_rate": 9.262002510621e-06, "loss": 3.1769, "step": 16520 }, { "epoch": 0.07384442190941215, "grad_norm": 4.8403515393147245e-05, "learning_rate": 9.26155578090588e-06, "loss": 0.0004, "step": 16530 }, { "epoch": 0.0738890948809242, "grad_norm": 2.278566716995556e-05, "learning_rate": 9.261109051190758e-06, "loss": 0.0, "step": 16540 }, { "epoch": 0.07393376785243624, "grad_norm": 3.8744247393696085e-11, "learning_rate": 9.260662321475638e-06, "loss": 0.0, "step": 16550 }, { "epoch": 0.07397844082394829, "grad_norm": 4.385966936859553e-11, "learning_rate": 9.260215591760518e-06, "loss": 0.2777, "step": 16560 }, { "epoch": 0.07402311379546034, "grad_norm": 1.677570062030842e-11, "learning_rate": 9.259768862045398e-06, "loss": 0.0027, "step": 16570 }, { "epoch": 0.07406778676697238, "grad_norm": 3.435206633040089e-11, "learning_rate": 9.259322132330277e-06, "loss": 0.0025, "step": 16580 }, { "epoch": 0.07411245973848442, "grad_norm": 2.0525411231764323e-11, "learning_rate": 9.258875402615157e-06, "loss": 0.0, "step": 16590 }, { "epoch": 0.07415713270999647, "grad_norm": 3.8538229318874073e-07, "learning_rate": 9.258428672900037e-06, "loss": 0.0, "step": 16600 }, { "epoch": 0.07420180568150851, "grad_norm": 6.099716565177715e-11, "learning_rate": 9.257981943184915e-06, "loss": 0.0045, "step": 16610 }, { "epoch": 0.07424647865302056, "grad_norm": 1.0236337819047314e-11, "learning_rate": 9.257535213469795e-06, "loss": 0.0001, "step": 16620 }, { "epoch": 0.0742911516245326, "grad_norm": 0.0001892660220619291, "learning_rate": 9.257088483754676e-06, "loss": 0.0001, "step": 16630 }, { "epoch": 0.07433582459604465, "grad_norm": 0.30247747898101807, "learning_rate": 9.256641754039554e-06, "loss": 0.0252, "step": 16640 }, { "epoch": 0.0743804975675567, "grad_norm": 6.412907531394563e-12, "learning_rate": 9.256195024324434e-06, "loss": 0.016, "step": 16650 }, { "epoch": 0.07442517053906875, "grad_norm": 5.720272611142718e-07, "learning_rate": 9.255748294609314e-06, "loss": 0.0, "step": 16660 }, { "epoch": 0.0744698435105808, "grad_norm": 4.307298695493955e-07, "learning_rate": 9.255301564894193e-06, "loss": 0.0003, "step": 16670 }, { "epoch": 0.07451451648209284, "grad_norm": 1.700377338931247e-11, "learning_rate": 9.254854835179073e-06, "loss": 0.0, "step": 16680 }, { "epoch": 0.07455918945360489, "grad_norm": 0.0009698119829408824, "learning_rate": 9.254408105463953e-06, "loss": 0.0001, "step": 16690 }, { "epoch": 0.07460386242511693, "grad_norm": 750.8846435546875, "learning_rate": 9.253961375748831e-06, "loss": 0.3531, "step": 16700 }, { "epoch": 0.07464853539662898, "grad_norm": 0.014949750155210495, "learning_rate": 9.253514646033711e-06, "loss": 0.0001, "step": 16710 }, { "epoch": 0.07469320836814103, "grad_norm": 1.4294463426267612e-06, "learning_rate": 9.253067916318592e-06, "loss": 0.0226, "step": 16720 }, { "epoch": 0.07473788133965308, "grad_norm": 1.61623481525347e-12, "learning_rate": 9.25262118660347e-06, "loss": 0.0127, "step": 16730 }, { "epoch": 0.07478255431116512, "grad_norm": 2.6460383883986083e-10, "learning_rate": 9.25217445688835e-06, "loss": 0.0, "step": 16740 }, { "epoch": 0.07482722728267716, "grad_norm": 6.732713245583e-06, "learning_rate": 9.251727727173229e-06, "loss": 0.0, "step": 16750 }, { "epoch": 0.0748719002541892, "grad_norm": 1.6906284372453229e-07, "learning_rate": 9.251280997458109e-06, "loss": 0.016, "step": 16760 }, { "epoch": 0.07491657322570125, "grad_norm": 1.4504463408115953e-08, "learning_rate": 9.250834267742989e-06, "loss": 0.0061, "step": 16770 }, { "epoch": 0.0749612461972133, "grad_norm": 5.551413551074802e-08, "learning_rate": 9.250387538027867e-06, "loss": 0.0, "step": 16780 }, { "epoch": 0.07500591916872534, "grad_norm": 1.2507527132055163e-11, "learning_rate": 9.249940808312747e-06, "loss": 0.0, "step": 16790 }, { "epoch": 0.07505059214023739, "grad_norm": 6.933629885519377e-12, "learning_rate": 9.249494078597628e-06, "loss": 0.0, "step": 16800 }, { "epoch": 0.07509526511174944, "grad_norm": 2.2586274894820235e-07, "learning_rate": 9.249047348882506e-06, "loss": 0.0, "step": 16810 }, { "epoch": 0.07513993808326148, "grad_norm": 0.005923112854361534, "learning_rate": 9.248600619167386e-06, "loss": 0.1061, "step": 16820 }, { "epoch": 0.07518461105477353, "grad_norm": 1.0352895270273658e-16, "learning_rate": 9.248153889452266e-06, "loss": 0.0, "step": 16830 }, { "epoch": 0.07522928402628558, "grad_norm": 1.1537285426202687e-15, "learning_rate": 9.247707159737145e-06, "loss": 0.0007, "step": 16840 }, { "epoch": 0.07527395699779763, "grad_norm": 2.0707506337203085e-05, "learning_rate": 9.247260430022025e-06, "loss": 0.0, "step": 16850 }, { "epoch": 0.07531862996930967, "grad_norm": 3.5879608203774616e-13, "learning_rate": 9.246813700306903e-06, "loss": 0.0, "step": 16860 }, { "epoch": 0.07536330294082172, "grad_norm": 0.00023001543013378978, "learning_rate": 9.246366970591783e-06, "loss": 0.0018, "step": 16870 }, { "epoch": 0.07540797591233377, "grad_norm": 0.00014396563346963376, "learning_rate": 9.245920240876663e-06, "loss": 0.0348, "step": 16880 }, { "epoch": 0.07545264888384581, "grad_norm": 9.09528971533291e-05, "learning_rate": 9.245473511161542e-06, "loss": 0.0043, "step": 16890 }, { "epoch": 0.07549732185535785, "grad_norm": 9.14583757035195e-14, "learning_rate": 9.245026781446422e-06, "loss": 0.0, "step": 16900 }, { "epoch": 0.0755419948268699, "grad_norm": 1.2336866461737372e-07, "learning_rate": 9.244580051731302e-06, "loss": 0.0001, "step": 16910 }, { "epoch": 0.07558666779838194, "grad_norm": 3.8933134050012086e-08, "learning_rate": 9.24413332201618e-06, "loss": 0.0043, "step": 16920 }, { "epoch": 0.07563134076989399, "grad_norm": 1.0874463214349817e-06, "learning_rate": 9.24368659230106e-06, "loss": 0.0, "step": 16930 }, { "epoch": 0.07567601374140603, "grad_norm": 0.007349396590143442, "learning_rate": 9.243239862585939e-06, "loss": 0.0001, "step": 16940 }, { "epoch": 0.07572068671291808, "grad_norm": 1.8731738009591936e-06, "learning_rate": 9.24279313287082e-06, "loss": 0.0, "step": 16950 }, { "epoch": 0.07576535968443013, "grad_norm": 2.11032347579021e-05, "learning_rate": 9.2423464031557e-06, "loss": 0.0003, "step": 16960 }, { "epoch": 0.07581003265594217, "grad_norm": 5.606299929503122e-13, "learning_rate": 9.241899673440578e-06, "loss": 0.2553, "step": 16970 }, { "epoch": 0.07585470562745422, "grad_norm": 2.055375739093803e-16, "learning_rate": 9.241452943725458e-06, "loss": 0.0, "step": 16980 }, { "epoch": 0.07589937859896627, "grad_norm": 534.0773315429688, "learning_rate": 9.241006214010338e-06, "loss": 0.6, "step": 16990 }, { "epoch": 0.07594405157047832, "grad_norm": 2.370661000838936e-09, "learning_rate": 9.240559484295216e-06, "loss": 0.0, "step": 17000 }, { "epoch": 0.07598872454199036, "grad_norm": 0.05900305137038231, "learning_rate": 9.240112754580097e-06, "loss": 0.0001, "step": 17010 }, { "epoch": 0.07603339751350241, "grad_norm": 3.1733261851574213e-12, "learning_rate": 9.239666024864977e-06, "loss": 0.0, "step": 17020 }, { "epoch": 0.07607807048501446, "grad_norm": 3.1729326926136425e-15, "learning_rate": 9.239219295149857e-06, "loss": 0.4924, "step": 17030 }, { "epoch": 0.0761227434565265, "grad_norm": 5.282239403270931e-17, "learning_rate": 9.238772565434735e-06, "loss": 0.016, "step": 17040 }, { "epoch": 0.07616741642803855, "grad_norm": 4.513401847816567e-07, "learning_rate": 9.238325835719615e-06, "loss": 0.0, "step": 17050 }, { "epoch": 0.07621208939955058, "grad_norm": 4.919018920190865e-06, "learning_rate": 9.237879106004496e-06, "loss": 0.0003, "step": 17060 }, { "epoch": 0.07625676237106263, "grad_norm": 2.0352650675903305e-09, "learning_rate": 9.237432376289374e-06, "loss": 1.3501, "step": 17070 }, { "epoch": 0.07630143534257468, "grad_norm": 3.793320841358021e-14, "learning_rate": 9.236985646574254e-06, "loss": 0.0824, "step": 17080 }, { "epoch": 0.07634610831408672, "grad_norm": 0.03230379521846771, "learning_rate": 9.236538916859134e-06, "loss": 0.0, "step": 17090 }, { "epoch": 0.07639078128559877, "grad_norm": 0.014470861293375492, "learning_rate": 9.236092187144013e-06, "loss": 0.0, "step": 17100 }, { "epoch": 0.07643545425711082, "grad_norm": 8.92955795279704e-05, "learning_rate": 9.235645457428893e-06, "loss": 0.0, "step": 17110 }, { "epoch": 0.07648012722862287, "grad_norm": 0.0006282811518758535, "learning_rate": 9.235198727713773e-06, "loss": 0.0, "step": 17120 }, { "epoch": 0.07652480020013491, "grad_norm": 1.946099531524581e-13, "learning_rate": 9.234751997998653e-06, "loss": 0.093, "step": 17130 }, { "epoch": 0.07656947317164696, "grad_norm": 0.43705910444259644, "learning_rate": 9.234305268283531e-06, "loss": 0.0001, "step": 17140 }, { "epoch": 0.076614146143159, "grad_norm": 1.526163639908873e-10, "learning_rate": 9.233858538568412e-06, "loss": 0.009, "step": 17150 }, { "epoch": 0.07665881911467105, "grad_norm": 1.7637514702073531e-06, "learning_rate": 9.23341180885329e-06, "loss": 0.0, "step": 17160 }, { "epoch": 0.0767034920861831, "grad_norm": 0.27547845244407654, "learning_rate": 9.23296507913817e-06, "loss": 0.0007, "step": 17170 }, { "epoch": 0.07674816505769515, "grad_norm": 3.1078446482979416e-09, "learning_rate": 9.23251834942305e-06, "loss": 0.2688, "step": 17180 }, { "epoch": 0.0767928380292072, "grad_norm": 2.0889000165030813e-12, "learning_rate": 9.232071619707929e-06, "loss": 0.0043, "step": 17190 }, { "epoch": 0.07683751100071924, "grad_norm": 2.2112160422693705e-06, "learning_rate": 9.231624889992809e-06, "loss": 0.0, "step": 17200 }, { "epoch": 0.07688218397223127, "grad_norm": 5.533650968003734e-11, "learning_rate": 9.231178160277689e-06, "loss": 0.0, "step": 17210 }, { "epoch": 0.07692685694374332, "grad_norm": 8.352378546451066e-16, "learning_rate": 9.230731430562567e-06, "loss": 0.0, "step": 17220 }, { "epoch": 0.07697152991525537, "grad_norm": 2.0515146388788708e-05, "learning_rate": 9.230284700847448e-06, "loss": 0.0575, "step": 17230 }, { "epoch": 0.07701620288676742, "grad_norm": 1.9726203025097026e-14, "learning_rate": 9.229837971132326e-06, "loss": 0.0, "step": 17240 }, { "epoch": 0.07706087585827946, "grad_norm": 1.1162385035387579e-12, "learning_rate": 9.229391241417206e-06, "loss": 0.0006, "step": 17250 }, { "epoch": 0.07710554882979151, "grad_norm": 1.603686783369085e-08, "learning_rate": 9.228944511702086e-06, "loss": 0.0973, "step": 17260 }, { "epoch": 0.07715022180130356, "grad_norm": 0.012515800073742867, "learning_rate": 9.228497781986965e-06, "loss": 0.0, "step": 17270 }, { "epoch": 0.0771948947728156, "grad_norm": 1.2555916660658868e-14, "learning_rate": 9.228051052271845e-06, "loss": 0.0, "step": 17280 }, { "epoch": 0.07723956774432765, "grad_norm": 1.5260538798717167e-19, "learning_rate": 9.227604322556725e-06, "loss": 0.0002, "step": 17290 }, { "epoch": 0.0772842407158397, "grad_norm": 4.507444089085766e-07, "learning_rate": 9.227157592841603e-06, "loss": 0.0001, "step": 17300 }, { "epoch": 0.07732891368735174, "grad_norm": 0.0018883688608184457, "learning_rate": 9.226710863126483e-06, "loss": 0.0, "step": 17310 }, { "epoch": 0.07737358665886379, "grad_norm": 8.304020582272642e-09, "learning_rate": 9.226264133411364e-06, "loss": 0.0, "step": 17320 }, { "epoch": 0.07741825963037584, "grad_norm": 3.2100276539192876e-19, "learning_rate": 9.225817403696242e-06, "loss": 0.0, "step": 17330 }, { "epoch": 0.07746293260188788, "grad_norm": 3.4246664171178054e-08, "learning_rate": 9.225370673981122e-06, "loss": 0.001, "step": 17340 }, { "epoch": 0.07750760557339993, "grad_norm": 9.922782945792283e-14, "learning_rate": 9.224923944266e-06, "loss": 0.0, "step": 17350 }, { "epoch": 0.07755227854491197, "grad_norm": 2.466302717607505e-10, "learning_rate": 9.22447721455088e-06, "loss": 0.0038, "step": 17360 }, { "epoch": 0.07759695151642401, "grad_norm": 1.2259521176005705e-10, "learning_rate": 9.22403048483576e-06, "loss": 0.1602, "step": 17370 }, { "epoch": 0.07764162448793606, "grad_norm": 1.6446602760233908e-11, "learning_rate": 9.22358375512064e-06, "loss": 0.0003, "step": 17380 }, { "epoch": 0.0776862974594481, "grad_norm": 4.474438810575521e-06, "learning_rate": 9.22313702540552e-06, "loss": 0.0, "step": 17390 }, { "epoch": 0.07773097043096015, "grad_norm": 4.021822874733516e-08, "learning_rate": 9.2226902956904e-06, "loss": 0.0004, "step": 17400 }, { "epoch": 0.0777756434024722, "grad_norm": 1.5872502956426615e-07, "learning_rate": 9.222243565975278e-06, "loss": 0.0003, "step": 17410 }, { "epoch": 0.07782031637398425, "grad_norm": 1.657014456202255e-11, "learning_rate": 9.221796836260158e-06, "loss": 0.0, "step": 17420 }, { "epoch": 0.0778649893454963, "grad_norm": 7.961917702914434e-08, "learning_rate": 9.221350106545036e-06, "loss": 0.0016, "step": 17430 }, { "epoch": 0.07790966231700834, "grad_norm": 2.4046839172675583e-11, "learning_rate": 9.220903376829917e-06, "loss": 0.0006, "step": 17440 }, { "epoch": 0.07795433528852039, "grad_norm": 1.4532316461313144e-09, "learning_rate": 9.220456647114797e-06, "loss": 0.0, "step": 17450 }, { "epoch": 0.07799900826003243, "grad_norm": 3.730805886448252e-09, "learning_rate": 9.220009917399675e-06, "loss": 0.0, "step": 17460 }, { "epoch": 0.07804368123154448, "grad_norm": 3.827557235459682e-14, "learning_rate": 9.219563187684555e-06, "loss": 0.0003, "step": 17470 }, { "epoch": 0.07808835420305653, "grad_norm": 2.06796145754992e-18, "learning_rate": 9.219116457969435e-06, "loss": 0.0002, "step": 17480 }, { "epoch": 0.07813302717456858, "grad_norm": 9.762475627894673e-08, "learning_rate": 9.218669728254316e-06, "loss": 0.0001, "step": 17490 }, { "epoch": 0.07817770014608062, "grad_norm": 3.0506463342603023e-15, "learning_rate": 9.218222998539194e-06, "loss": 0.1359, "step": 17500 }, { "epoch": 0.07822237311759267, "grad_norm": 5.758641119513186e-09, "learning_rate": 9.217776268824074e-06, "loss": 0.0, "step": 17510 }, { "epoch": 0.0782670460891047, "grad_norm": 3.089372979658833e-09, "learning_rate": 9.217329539108954e-06, "loss": 0.0, "step": 17520 }, { "epoch": 0.07831171906061675, "grad_norm": 5.25945498090763e-13, "learning_rate": 9.216882809393833e-06, "loss": 0.001, "step": 17530 }, { "epoch": 0.0783563920321288, "grad_norm": 0.00019875851285178214, "learning_rate": 9.216436079678713e-06, "loss": 0.0, "step": 17540 }, { "epoch": 0.07840106500364084, "grad_norm": 3.922442601833609e-07, "learning_rate": 9.215989349963593e-06, "loss": 0.0, "step": 17550 }, { "epoch": 0.07844573797515289, "grad_norm": 2.3536176479987958e-11, "learning_rate": 9.215542620248471e-06, "loss": 0.0021, "step": 17560 }, { "epoch": 0.07849041094666494, "grad_norm": 2.7818711600957613e-07, "learning_rate": 9.215095890533351e-06, "loss": 0.0252, "step": 17570 }, { "epoch": 0.07853508391817698, "grad_norm": 1.2804473925598359e-08, "learning_rate": 9.214649160818232e-06, "loss": 0.0012, "step": 17580 }, { "epoch": 0.07857975688968903, "grad_norm": 4.7054363015535046e-20, "learning_rate": 9.214202431103112e-06, "loss": 0.0, "step": 17590 }, { "epoch": 0.07862442986120108, "grad_norm": 1.086241665192167e-12, "learning_rate": 9.21375570138799e-06, "loss": 0.0, "step": 17600 }, { "epoch": 0.07866910283271313, "grad_norm": 9.0274799519783e-17, "learning_rate": 9.21330897167287e-06, "loss": 0.1071, "step": 17610 }, { "epoch": 0.07871377580422517, "grad_norm": 1.529821425094724e-08, "learning_rate": 9.21286224195775e-06, "loss": 0.0, "step": 17620 }, { "epoch": 0.07875844877573722, "grad_norm": 9.494939234855337e-08, "learning_rate": 9.212415512242629e-06, "loss": 0.0, "step": 17630 }, { "epoch": 0.07880312174724927, "grad_norm": 0.008173040114343166, "learning_rate": 9.211968782527509e-06, "loss": 0.0, "step": 17640 }, { "epoch": 0.07884779471876131, "grad_norm": 9.335294041116228e-18, "learning_rate": 9.211522052812387e-06, "loss": 0.0, "step": 17650 }, { "epoch": 0.07889246769027336, "grad_norm": 0.002261559944599867, "learning_rate": 9.211075323097267e-06, "loss": 0.0, "step": 17660 }, { "epoch": 0.0789371406617854, "grad_norm": 6.691013550153002e-05, "learning_rate": 9.210628593382148e-06, "loss": 0.0, "step": 17670 }, { "epoch": 0.07898181363329744, "grad_norm": 6.267378616087171e-09, "learning_rate": 9.210181863667026e-06, "loss": 0.0, "step": 17680 }, { "epoch": 0.07902648660480949, "grad_norm": 0.007965099066495895, "learning_rate": 9.209735133951906e-06, "loss": 0.0015, "step": 17690 }, { "epoch": 0.07907115957632153, "grad_norm": 5.188430861704542e-15, "learning_rate": 9.209288404236786e-06, "loss": 0.0, "step": 17700 }, { "epoch": 0.07911583254783358, "grad_norm": 0.05706815794110298, "learning_rate": 9.208841674521665e-06, "loss": 0.0, "step": 17710 }, { "epoch": 0.07916050551934563, "grad_norm": 1.819989471629757e-11, "learning_rate": 9.208394944806545e-06, "loss": 0.0, "step": 17720 }, { "epoch": 0.07920517849085768, "grad_norm": 7.620450494585929e-13, "learning_rate": 9.207948215091423e-06, "loss": 0.333, "step": 17730 }, { "epoch": 0.07924985146236972, "grad_norm": 6.586374183825683e-06, "learning_rate": 9.207501485376303e-06, "loss": 0.0, "step": 17740 }, { "epoch": 0.07929452443388177, "grad_norm": 1.1758358731128737e-08, "learning_rate": 9.207054755661184e-06, "loss": 0.1407, "step": 17750 }, { "epoch": 0.07933919740539382, "grad_norm": 3.1918389797210693, "learning_rate": 9.206608025946062e-06, "loss": 0.2583, "step": 17760 }, { "epoch": 0.07938387037690586, "grad_norm": 9.242794897501572e-12, "learning_rate": 9.206161296230942e-06, "loss": 0.0001, "step": 17770 }, { "epoch": 0.07942854334841791, "grad_norm": 1.7713607002641657e-13, "learning_rate": 9.205714566515822e-06, "loss": 0.0004, "step": 17780 }, { "epoch": 0.07947321631992996, "grad_norm": 5.914055023481524e-09, "learning_rate": 9.2052678368007e-06, "loss": 0.0, "step": 17790 }, { "epoch": 0.079517889291442, "grad_norm": 6.293767048325127e-13, "learning_rate": 9.20482110708558e-06, "loss": 0.0, "step": 17800 }, { "epoch": 0.07956256226295405, "grad_norm": 2.157514131795324e-07, "learning_rate": 9.204374377370461e-06, "loss": 0.0, "step": 17810 }, { "epoch": 0.0796072352344661, "grad_norm": 7.039646778917233e-14, "learning_rate": 9.20392764765534e-06, "loss": 0.2479, "step": 17820 }, { "epoch": 0.07965190820597813, "grad_norm": 6.4774088859558105, "learning_rate": 9.20348091794022e-06, "loss": 0.0013, "step": 17830 }, { "epoch": 0.07969658117749018, "grad_norm": 5.9319362755161364e-09, "learning_rate": 9.203034188225098e-06, "loss": 0.0, "step": 17840 }, { "epoch": 0.07974125414900222, "grad_norm": 0.004283531103283167, "learning_rate": 9.202587458509978e-06, "loss": 0.0062, "step": 17850 }, { "epoch": 0.07978592712051427, "grad_norm": 2.560173833483527e-18, "learning_rate": 9.202140728794858e-06, "loss": 0.0001, "step": 17860 }, { "epoch": 0.07983060009202632, "grad_norm": 4.4415496280869604e-10, "learning_rate": 9.201693999079737e-06, "loss": 0.0062, "step": 17870 }, { "epoch": 0.07987527306353837, "grad_norm": 0.0, "learning_rate": 9.201247269364617e-06, "loss": 0.0, "step": 17880 }, { "epoch": 0.07991994603505041, "grad_norm": 0.00010508752893656492, "learning_rate": 9.200800539649497e-06, "loss": 0.0012, "step": 17890 }, { "epoch": 0.07996461900656246, "grad_norm": 0.000583523535169661, "learning_rate": 9.200353809934375e-06, "loss": 0.0006, "step": 17900 }, { "epoch": 0.0800092919780745, "grad_norm": 1.874498778420275e-08, "learning_rate": 9.199907080219255e-06, "loss": 0.4538, "step": 17910 }, { "epoch": 0.08005396494958655, "grad_norm": 5.004399099561851e-07, "learning_rate": 9.199460350504136e-06, "loss": 0.0001, "step": 17920 }, { "epoch": 0.0800986379210986, "grad_norm": 37.80466842651367, "learning_rate": 9.199013620789014e-06, "loss": 0.4853, "step": 17930 }, { "epoch": 0.08014331089261065, "grad_norm": 2.3460300369038123e-10, "learning_rate": 9.198566891073894e-06, "loss": 0.0002, "step": 17940 }, { "epoch": 0.0801879838641227, "grad_norm": 1.444460939345851e-18, "learning_rate": 9.198120161358774e-06, "loss": 0.0001, "step": 17950 }, { "epoch": 0.08023265683563474, "grad_norm": 4.5384392738342285, "learning_rate": 9.197673431643653e-06, "loss": 0.0008, "step": 17960 }, { "epoch": 0.08027732980714679, "grad_norm": 1.2695361206738198e-08, "learning_rate": 9.197226701928533e-06, "loss": 0.0014, "step": 17970 }, { "epoch": 0.08032200277865882, "grad_norm": 0.0022479791659861803, "learning_rate": 9.196779972213413e-06, "loss": 0.0, "step": 17980 }, { "epoch": 0.08036667575017087, "grad_norm": 0.6332511901855469, "learning_rate": 9.196333242498291e-06, "loss": 0.0388, "step": 17990 }, { "epoch": 0.08041134872168292, "grad_norm": 0.0012692047748714685, "learning_rate": 9.195886512783171e-06, "loss": 0.0038, "step": 18000 } ], "logging_steps": 10, "max_steps": 223849, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }