diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.08041134872168292, + "eval_steps": 500, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.4672971512046065e-05, + "grad_norm": 11.733072280883789, + "learning_rate": 9.999553270284881e-06, + "loss": 0.164, + "step": 10 + }, + { + "epoch": 8.934594302409213e-05, + "grad_norm": 0.08407776802778244, + "learning_rate": 9.99910654056976e-06, + "loss": 0.0147, + "step": 20 + }, + { + "epoch": 0.0001340189145361382, + "grad_norm": 0.00012560053437482566, + "learning_rate": 9.99865981085464e-06, + "loss": 0.0002, + "step": 30 + }, + { + "epoch": 0.00017869188604818426, + "grad_norm": 3.125946022919379e-05, + "learning_rate": 9.99821308113952e-06, + "loss": 0.0004, + "step": 40 + }, + { + "epoch": 0.00022336485756023033, + "grad_norm": 0.0017562335124239326, + "learning_rate": 9.997766351424398e-06, + "loss": 0.0, + "step": 50 + }, + { + "epoch": 0.0002680378290722764, + "grad_norm": 10.499456405639648, + "learning_rate": 9.997319621709278e-06, + "loss": 0.0697, + "step": 60 + }, + { + "epoch": 0.0003127108005843225, + "grad_norm": 3.6197413919580868e-06, + "learning_rate": 9.996872891994157e-06, + "loss": 0.0082, + "step": 70 + }, + { + "epoch": 0.0003573837720963685, + "grad_norm": 0.0009690591250546277, + "learning_rate": 9.996426162279037e-06, + "loss": 0.0379, + "step": 80 + }, + { + "epoch": 0.0004020567436084146, + "grad_norm": 0.011041563004255295, + "learning_rate": 9.995979432563917e-06, + "loss": 0.0014, + "step": 90 + }, + { + "epoch": 0.00044672971512046066, + "grad_norm": 1.1426632227085065e-05, + "learning_rate": 9.995532702848795e-06, + "loss": 0.2133, + "step": 100 + }, + { + "epoch": 0.0004914026866325068, + "grad_norm": 4.7969253500923514e-05, + "learning_rate": 9.995085973133675e-06, + "loss": 0.0001, + "step": 110 + }, + { + "epoch": 0.0005360756581445528, + "grad_norm": 0.019292959943413734, + "learning_rate": 9.994639243418556e-06, + "loss": 0.0002, + "step": 120 + }, + { + "epoch": 0.0005807486296565989, + "grad_norm": 135.5595245361328, + "learning_rate": 9.994192513703434e-06, + "loss": 0.0579, + "step": 130 + }, + { + "epoch": 0.000625421601168645, + "grad_norm": 2.2290277001957293e-07, + "learning_rate": 9.993745783988314e-06, + "loss": 0.2915, + "step": 140 + }, + { + "epoch": 0.000670094572680691, + "grad_norm": 1.4152267802103324e-08, + "learning_rate": 9.993299054273193e-06, + "loss": 0.0096, + "step": 150 + }, + { + "epoch": 0.000714767544192737, + "grad_norm": 0.0003992164565715939, + "learning_rate": 9.992852324558073e-06, + "loss": 0.0015, + "step": 160 + }, + { + "epoch": 0.0007594405157047831, + "grad_norm": 4.117830485483864e-06, + "learning_rate": 9.992405594842953e-06, + "loss": 0.0128, + "step": 170 + }, + { + "epoch": 0.0008041134872168292, + "grad_norm": 0.0052748871967196465, + "learning_rate": 9.991958865127833e-06, + "loss": 0.0005, + "step": 180 + }, + { + "epoch": 0.0008487864587288752, + "grad_norm": 0.4053354859352112, + "learning_rate": 9.991512135412711e-06, + "loss": 0.0575, + "step": 190 + }, + { + "epoch": 0.0008934594302409213, + "grad_norm": 1.9831970348604955e-05, + "learning_rate": 9.991065405697591e-06, + "loss": 0.0001, + "step": 200 + }, + { + "epoch": 0.0009381324017529674, + "grad_norm": 2.868847381876094e-08, + "learning_rate": 9.990618675982472e-06, + "loss": 0.0102, + "step": 210 + }, + { + "epoch": 0.0009828053732650135, + "grad_norm": 0.007543394807726145, + "learning_rate": 9.99017194626735e-06, + "loss": 0.0001, + "step": 220 + }, + { + "epoch": 0.0010274783447770596, + "grad_norm": 0.04161032661795616, + "learning_rate": 9.98972521655223e-06, + "loss": 0.0, + "step": 230 + }, + { + "epoch": 0.0010721513162891055, + "grad_norm": 0.07510162144899368, + "learning_rate": 9.98927848683711e-06, + "loss": 0.0002, + "step": 240 + }, + { + "epoch": 0.0011168242878011516, + "grad_norm": 0.00014790371642448008, + "learning_rate": 9.988831757121989e-06, + "loss": 0.005, + "step": 250 + }, + { + "epoch": 0.0011614972593131977, + "grad_norm": 2.0253996808605734e-06, + "learning_rate": 9.988385027406869e-06, + "loss": 0.0166, + "step": 260 + }, + { + "epoch": 0.0012061702308252438, + "grad_norm": 0.8337207436561584, + "learning_rate": 9.987938297691749e-06, + "loss": 0.1077, + "step": 270 + }, + { + "epoch": 0.00125084320233729, + "grad_norm": 1.2096836599084781e-06, + "learning_rate": 9.987491567976627e-06, + "loss": 0.0005, + "step": 280 + }, + { + "epoch": 0.001295516173849336, + "grad_norm": 6.229875725694001e-05, + "learning_rate": 9.987044838261508e-06, + "loss": 0.0021, + "step": 290 + }, + { + "epoch": 0.001340189145361382, + "grad_norm": 52.171180725097656, + "learning_rate": 9.986598108546388e-06, + "loss": 0.014, + "step": 300 + }, + { + "epoch": 0.001384862116873428, + "grad_norm": 0.01654650643467903, + "learning_rate": 9.986151378831268e-06, + "loss": 0.0013, + "step": 310 + }, + { + "epoch": 0.001429535088385474, + "grad_norm": 0.00010301181464456022, + "learning_rate": 9.985704649116146e-06, + "loss": 0.0003, + "step": 320 + }, + { + "epoch": 0.0014742080598975202, + "grad_norm": 0.10489977151155472, + "learning_rate": 9.985257919401026e-06, + "loss": 0.0079, + "step": 330 + }, + { + "epoch": 0.0015188810314095663, + "grad_norm": 1.3773581031273352e-06, + "learning_rate": 9.984811189685906e-06, + "loss": 0.0329, + "step": 340 + }, + { + "epoch": 0.0015635540029216124, + "grad_norm": 0.01485914271324873, + "learning_rate": 9.984364459970785e-06, + "loss": 0.0001, + "step": 350 + }, + { + "epoch": 0.0016082269744336585, + "grad_norm": 30.461793899536133, + "learning_rate": 9.983917730255665e-06, + "loss": 0.0949, + "step": 360 + }, + { + "epoch": 0.0016528999459457046, + "grad_norm": 6.307673454284668, + "learning_rate": 9.983471000540543e-06, + "loss": 0.0017, + "step": 370 + }, + { + "epoch": 0.0016975729174577505, + "grad_norm": 0.014393389225006104, + "learning_rate": 9.983024270825424e-06, + "loss": 0.0147, + "step": 380 + }, + { + "epoch": 0.0017422458889697966, + "grad_norm": 0.019219743087887764, + "learning_rate": 9.982577541110304e-06, + "loss": 0.0021, + "step": 390 + }, + { + "epoch": 0.0017869188604818427, + "grad_norm": 0.05191327631473541, + "learning_rate": 9.982130811395182e-06, + "loss": 0.0, + "step": 400 + }, + { + "epoch": 0.0018315918319938887, + "grad_norm": 5.0406357331667095e-05, + "learning_rate": 9.981684081680062e-06, + "loss": 0.0037, + "step": 410 + }, + { + "epoch": 0.0018762648035059348, + "grad_norm": 5.040341420681216e-05, + "learning_rate": 9.981237351964942e-06, + "loss": 0.0066, + "step": 420 + }, + { + "epoch": 0.001920937775017981, + "grad_norm": 0.006360863335430622, + "learning_rate": 9.98079062224982e-06, + "loss": 0.0002, + "step": 430 + }, + { + "epoch": 0.001965610746530027, + "grad_norm": 4.391232266698353e-08, + "learning_rate": 9.980343892534701e-06, + "loss": 0.1804, + "step": 440 + }, + { + "epoch": 0.002010283718042073, + "grad_norm": 2.598305854917271e-06, + "learning_rate": 9.97989716281958e-06, + "loss": 0.0063, + "step": 450 + }, + { + "epoch": 0.0020549566895541192, + "grad_norm": 2.415512678766163e-09, + "learning_rate": 9.97945043310446e-06, + "loss": 0.0001, + "step": 460 + }, + { + "epoch": 0.0020996296610661653, + "grad_norm": 0.00015337311197072268, + "learning_rate": 9.97900370338934e-06, + "loss": 0.0, + "step": 470 + }, + { + "epoch": 0.002144302632578211, + "grad_norm": 0.00045923213474452496, + "learning_rate": 9.978556973674218e-06, + "loss": 0.0478, + "step": 480 + }, + { + "epoch": 0.002188975604090257, + "grad_norm": 6.670333174696452e-09, + "learning_rate": 9.978110243959098e-06, + "loss": 0.0002, + "step": 490 + }, + { + "epoch": 0.002233648575602303, + "grad_norm": 5.148401396581903e-05, + "learning_rate": 9.977663514243978e-06, + "loss": 0.0009, + "step": 500 + }, + { + "epoch": 0.0022783215471143493, + "grad_norm": 3.0508829240716295e-06, + "learning_rate": 9.977216784528857e-06, + "loss": 0.0001, + "step": 510 + }, + { + "epoch": 0.0023229945186263954, + "grad_norm": 3.840111457975581e-05, + "learning_rate": 9.976770054813737e-06, + "loss": 0.0006, + "step": 520 + }, + { + "epoch": 0.0023676674901384415, + "grad_norm": 0.0005538268014788628, + "learning_rate": 9.976323325098617e-06, + "loss": 0.0, + "step": 530 + }, + { + "epoch": 0.0024123404616504876, + "grad_norm": 2.515283483717212e-07, + "learning_rate": 9.975876595383495e-06, + "loss": 0.0003, + "step": 540 + }, + { + "epoch": 0.0024570134331625337, + "grad_norm": 4.5527631300501525e-05, + "learning_rate": 9.975429865668376e-06, + "loss": 0.5001, + "step": 550 + }, + { + "epoch": 0.00250168640467458, + "grad_norm": 0.0001504126121290028, + "learning_rate": 9.974983135953254e-06, + "loss": 0.003, + "step": 560 + }, + { + "epoch": 0.002546359376186626, + "grad_norm": 0.06473321467638016, + "learning_rate": 9.974536406238134e-06, + "loss": 0.1604, + "step": 570 + }, + { + "epoch": 0.002591032347698672, + "grad_norm": 0.06711556017398834, + "learning_rate": 9.974089676523014e-06, + "loss": 0.0, + "step": 580 + }, + { + "epoch": 0.002635705319210718, + "grad_norm": 0.2837385833263397, + "learning_rate": 9.973642946807893e-06, + "loss": 0.0, + "step": 590 + }, + { + "epoch": 0.002680378290722764, + "grad_norm": 0.00018509538494981825, + "learning_rate": 9.973196217092773e-06, + "loss": 0.0005, + "step": 600 + }, + { + "epoch": 0.00272505126223481, + "grad_norm": 0.49453040957450867, + "learning_rate": 9.972749487377653e-06, + "loss": 0.0145, + "step": 610 + }, + { + "epoch": 0.002769724233746856, + "grad_norm": 0.7000535130500793, + "learning_rate": 9.972302757662531e-06, + "loss": 0.0025, + "step": 620 + }, + { + "epoch": 0.002814397205258902, + "grad_norm": 1.611105382437472e-08, + "learning_rate": 9.971856027947411e-06, + "loss": 0.0, + "step": 630 + }, + { + "epoch": 0.002859070176770948, + "grad_norm": 2.75453260201175e-07, + "learning_rate": 9.971409298232292e-06, + "loss": 0.0081, + "step": 640 + }, + { + "epoch": 0.0029037431482829943, + "grad_norm": 1.9412634344462276e-07, + "learning_rate": 9.97096256851717e-06, + "loss": 0.0693, + "step": 650 + }, + { + "epoch": 0.0029484161197950404, + "grad_norm": 2.8128763762680364e-09, + "learning_rate": 9.97051583880205e-06, + "loss": 0.1002, + "step": 660 + }, + { + "epoch": 0.0029930890913070865, + "grad_norm": 0.0005962385912425816, + "learning_rate": 9.97006910908693e-06, + "loss": 0.0, + "step": 670 + }, + { + "epoch": 0.0030377620628191325, + "grad_norm": 2.213113307952881, + "learning_rate": 9.969622379371809e-06, + "loss": 0.0093, + "step": 680 + }, + { + "epoch": 0.0030824350343311786, + "grad_norm": 1.1586568149368759e-07, + "learning_rate": 9.969175649656689e-06, + "loss": 0.0, + "step": 690 + }, + { + "epoch": 0.0031271080058432247, + "grad_norm": 2.6724624633789062, + "learning_rate": 9.968728919941569e-06, + "loss": 0.0286, + "step": 700 + }, + { + "epoch": 0.003171780977355271, + "grad_norm": 1.0748630074886023e-06, + "learning_rate": 9.968282190226447e-06, + "loss": 0.0226, + "step": 710 + }, + { + "epoch": 0.003216453948867317, + "grad_norm": 0.5267998576164246, + "learning_rate": 9.967835460511328e-06, + "loss": 0.2235, + "step": 720 + }, + { + "epoch": 0.003261126920379363, + "grad_norm": 9.127699485134144e-09, + "learning_rate": 9.967388730796208e-06, + "loss": 0.2336, + "step": 730 + }, + { + "epoch": 0.003305799891891409, + "grad_norm": 6.590168482034642e-07, + "learning_rate": 9.966942001081086e-06, + "loss": 0.0006, + "step": 740 + }, + { + "epoch": 0.003350472863403455, + "grad_norm": 6.971930588406394e-09, + "learning_rate": 9.966495271365966e-06, + "loss": 0.0232, + "step": 750 + }, + { + "epoch": 0.003395145834915501, + "grad_norm": 0.01525203138589859, + "learning_rate": 9.966048541650846e-06, + "loss": 0.1315, + "step": 760 + }, + { + "epoch": 0.003439818806427547, + "grad_norm": 1.4364684375323122e-07, + "learning_rate": 9.965601811935726e-06, + "loss": 0.0452, + "step": 770 + }, + { + "epoch": 0.003484491777939593, + "grad_norm": 0.013514714315533638, + "learning_rate": 9.965155082220605e-06, + "loss": 0.0001, + "step": 780 + }, + { + "epoch": 0.003529164749451639, + "grad_norm": 6.487604724270124e-12, + "learning_rate": 9.964708352505485e-06, + "loss": 0.0, + "step": 790 + }, + { + "epoch": 0.0035738377209636853, + "grad_norm": 0.0036428929306566715, + "learning_rate": 9.964261622790365e-06, + "loss": 0.0, + "step": 800 + }, + { + "epoch": 0.0036185106924757314, + "grad_norm": 0.00026434988831169903, + "learning_rate": 9.963814893075244e-06, + "loss": 0.0079, + "step": 810 + }, + { + "epoch": 0.0036631836639877775, + "grad_norm": 5.467930532176979e-05, + "learning_rate": 9.963368163360124e-06, + "loss": 0.0002, + "step": 820 + }, + { + "epoch": 0.0037078566354998236, + "grad_norm": 1.1106082098422121e-07, + "learning_rate": 9.962921433645004e-06, + "loss": 0.0001, + "step": 830 + }, + { + "epoch": 0.0037525296070118697, + "grad_norm": 31.38213348388672, + "learning_rate": 9.962474703929882e-06, + "loss": 0.0064, + "step": 840 + }, + { + "epoch": 0.003797202578523916, + "grad_norm": 0.00040082065970636904, + "learning_rate": 9.962027974214762e-06, + "loss": 0.0001, + "step": 850 + }, + { + "epoch": 0.003841875550035962, + "grad_norm": 2.0785208276752343e-10, + "learning_rate": 9.96158124449964e-06, + "loss": 0.0, + "step": 860 + }, + { + "epoch": 0.003886548521548008, + "grad_norm": 21.16458511352539, + "learning_rate": 9.961134514784521e-06, + "loss": 0.011, + "step": 870 + }, + { + "epoch": 0.003931221493060054, + "grad_norm": 5.773109212903194e-10, + "learning_rate": 9.960687785069401e-06, + "loss": 0.0001, + "step": 880 + }, + { + "epoch": 0.0039758944645721, + "grad_norm": 8.707175993549754e-07, + "learning_rate": 9.96024105535428e-06, + "loss": 0.0001, + "step": 890 + }, + { + "epoch": 0.004020567436084146, + "grad_norm": 6.301308867057154e-12, + "learning_rate": 9.95979432563916e-06, + "loss": 0.0388, + "step": 900 + }, + { + "epoch": 0.004065240407596192, + "grad_norm": 2.1161547514303436e-10, + "learning_rate": 9.95934759592404e-06, + "loss": 0.0004, + "step": 910 + }, + { + "epoch": 0.0041099133791082385, + "grad_norm": 0.0002068611647700891, + "learning_rate": 9.958900866208918e-06, + "loss": 0.0001, + "step": 920 + }, + { + "epoch": 0.004154586350620285, + "grad_norm": 13.312015533447266, + "learning_rate": 9.958454136493798e-06, + "loss": 0.1857, + "step": 930 + }, + { + "epoch": 0.004199259322132331, + "grad_norm": 2.75353984058313e-09, + "learning_rate": 9.958007406778678e-06, + "loss": 0.0001, + "step": 940 + }, + { + "epoch": 0.004243932293644376, + "grad_norm": 243.2834014892578, + "learning_rate": 9.957560677063557e-06, + "loss": 0.1703, + "step": 950 + }, + { + "epoch": 0.004288605265156422, + "grad_norm": 5.9964871956808e-10, + "learning_rate": 9.957113947348437e-06, + "loss": 0.0, + "step": 960 + }, + { + "epoch": 0.004333278236668468, + "grad_norm": 3.727791181518114e-06, + "learning_rate": 9.956667217633315e-06, + "loss": 0.0201, + "step": 970 + }, + { + "epoch": 0.004377951208180514, + "grad_norm": 2.396371030499722e-07, + "learning_rate": 9.956220487918196e-06, + "loss": 0.4625, + "step": 980 + }, + { + "epoch": 0.00442262417969256, + "grad_norm": 7.116894948921981e-07, + "learning_rate": 9.955773758203076e-06, + "loss": 0.0, + "step": 990 + }, + { + "epoch": 0.004467297151204606, + "grad_norm": 0.00417350372299552, + "learning_rate": 9.955327028487954e-06, + "loss": 0.0003, + "step": 1000 + }, + { + "epoch": 0.0045119701227166525, + "grad_norm": 1.3091346583171681e-11, + "learning_rate": 9.954880298772834e-06, + "loss": 0.0002, + "step": 1010 + }, + { + "epoch": 0.004556643094228699, + "grad_norm": 8.930554629138499e-10, + "learning_rate": 9.954433569057714e-06, + "loss": 0.8625, + "step": 1020 + }, + { + "epoch": 0.004601316065740745, + "grad_norm": 4.606064067047555e-06, + "learning_rate": 9.953986839342593e-06, + "loss": 0.0114, + "step": 1030 + }, + { + "epoch": 0.004645989037252791, + "grad_norm": 0.09143206477165222, + "learning_rate": 9.953540109627473e-06, + "loss": 0.0002, + "step": 1040 + }, + { + "epoch": 0.004690662008764837, + "grad_norm": 1.3204090595245361, + "learning_rate": 9.953093379912351e-06, + "loss": 0.0082, + "step": 1050 + }, + { + "epoch": 0.004735334980276883, + "grad_norm": 0.011992010287940502, + "learning_rate": 9.952646650197231e-06, + "loss": 0.0, + "step": 1060 + }, + { + "epoch": 0.004780007951788929, + "grad_norm": 2.1359237223350647e-07, + "learning_rate": 9.952199920482112e-06, + "loss": 0.0, + "step": 1070 + }, + { + "epoch": 0.004824680923300975, + "grad_norm": 2.1936364191788016e-06, + "learning_rate": 9.95175319076699e-06, + "loss": 0.0018, + "step": 1080 + }, + { + "epoch": 0.004869353894813021, + "grad_norm": 6.708241961916883e-09, + "learning_rate": 9.95130646105187e-06, + "loss": 0.0, + "step": 1090 + }, + { + "epoch": 0.004914026866325067, + "grad_norm": 1.3402251313951452e-12, + "learning_rate": 9.95085973133675e-06, + "loss": 0.0, + "step": 1100 + }, + { + "epoch": 0.0049586998378371135, + "grad_norm": 0.0038719906006008387, + "learning_rate": 9.950413001621629e-06, + "loss": 0.0003, + "step": 1110 + }, + { + "epoch": 0.00500337280934916, + "grad_norm": 2.7270777991361683e-07, + "learning_rate": 9.949966271906509e-06, + "loss": 0.3289, + "step": 1120 + }, + { + "epoch": 0.005048045780861206, + "grad_norm": 5.544622421264648, + "learning_rate": 9.949519542191389e-06, + "loss": 0.0357, + "step": 1130 + }, + { + "epoch": 0.005092718752373252, + "grad_norm": 1.9362450984772295e-06, + "learning_rate": 9.949072812476267e-06, + "loss": 0.0, + "step": 1140 + }, + { + "epoch": 0.005137391723885298, + "grad_norm": 3.2831758145501766e-11, + "learning_rate": 9.948626082761147e-06, + "loss": 0.1835, + "step": 1150 + }, + { + "epoch": 0.005182064695397344, + "grad_norm": 3.0839785836178635e-08, + "learning_rate": 9.948179353046028e-06, + "loss": 0.0, + "step": 1160 + }, + { + "epoch": 0.00522673766690939, + "grad_norm": 4.25630517497666e-08, + "learning_rate": 9.947732623330906e-06, + "loss": 0.0765, + "step": 1170 + }, + { + "epoch": 0.005271410638421436, + "grad_norm": 0.04964405670762062, + "learning_rate": 9.947285893615786e-06, + "loss": 0.0049, + "step": 1180 + }, + { + "epoch": 0.005316083609933482, + "grad_norm": 4.989205081074033e-06, + "learning_rate": 9.946839163900666e-06, + "loss": 0.0089, + "step": 1190 + }, + { + "epoch": 0.005360756581445528, + "grad_norm": 9.702245951093147e-11, + "learning_rate": 9.946392434185545e-06, + "loss": 0.0, + "step": 1200 + }, + { + "epoch": 0.0054054295529575745, + "grad_norm": 13.150774955749512, + "learning_rate": 9.945945704470425e-06, + "loss": 0.0056, + "step": 1210 + }, + { + "epoch": 0.00545010252446962, + "grad_norm": 9.329108252131846e-06, + "learning_rate": 9.945498974755305e-06, + "loss": 0.018, + "step": 1220 + }, + { + "epoch": 0.005494775495981666, + "grad_norm": 9.888663043966517e-06, + "learning_rate": 9.945052245040185e-06, + "loss": 0.4156, + "step": 1230 + }, + { + "epoch": 0.005539448467493712, + "grad_norm": 1.0036829006798698e-11, + "learning_rate": 9.944605515325064e-06, + "loss": 0.0, + "step": 1240 + }, + { + "epoch": 0.005584121439005758, + "grad_norm": 5.702725047740387e-06, + "learning_rate": 9.944158785609944e-06, + "loss": 0.0, + "step": 1250 + }, + { + "epoch": 0.005628794410517804, + "grad_norm": 0.0011835768818855286, + "learning_rate": 9.943712055894824e-06, + "loss": 0.1227, + "step": 1260 + }, + { + "epoch": 0.00567346738202985, + "grad_norm": 4.868632572652132e-07, + "learning_rate": 9.943265326179702e-06, + "loss": 0.0002, + "step": 1270 + }, + { + "epoch": 0.005718140353541896, + "grad_norm": 0.0006190579733811319, + "learning_rate": 9.942818596464582e-06, + "loss": 0.0014, + "step": 1280 + }, + { + "epoch": 0.005762813325053942, + "grad_norm": 9.045915589922515e-07, + "learning_rate": 9.942371866749462e-06, + "loss": 0.0143, + "step": 1290 + }, + { + "epoch": 0.0058074862965659885, + "grad_norm": 0.0010632362682372332, + "learning_rate": 9.941925137034341e-06, + "loss": 0.0001, + "step": 1300 + }, + { + "epoch": 0.005852159268078035, + "grad_norm": 1.4616702515013458e-08, + "learning_rate": 9.941478407319221e-06, + "loss": 0.0003, + "step": 1310 + }, + { + "epoch": 0.005896832239590081, + "grad_norm": 0.003920422866940498, + "learning_rate": 9.941031677604101e-06, + "loss": 0.0, + "step": 1320 + }, + { + "epoch": 0.005941505211102127, + "grad_norm": 1.2216974987211415e-08, + "learning_rate": 9.94058494788898e-06, + "loss": 0.1611, + "step": 1330 + }, + { + "epoch": 0.005986178182614173, + "grad_norm": 2.7995277207537583e-08, + "learning_rate": 9.94013821817386e-06, + "loss": 0.0002, + "step": 1340 + }, + { + "epoch": 0.006030851154126219, + "grad_norm": 5.323033333559657e-11, + "learning_rate": 9.939691488458738e-06, + "loss": 0.0, + "step": 1350 + }, + { + "epoch": 0.006075524125638265, + "grad_norm": 8.650657079556368e-09, + "learning_rate": 9.939244758743618e-06, + "loss": 0.1313, + "step": 1360 + }, + { + "epoch": 0.006120197097150311, + "grad_norm": 7.414421816065442e-06, + "learning_rate": 9.938798029028498e-06, + "loss": 0.0001, + "step": 1370 + }, + { + "epoch": 0.006164870068662357, + "grad_norm": 0.0003329328028485179, + "learning_rate": 9.938351299313377e-06, + "loss": 0.0018, + "step": 1380 + }, + { + "epoch": 0.006209543040174403, + "grad_norm": 0.00023930691531859338, + "learning_rate": 9.937904569598257e-06, + "loss": 0.0013, + "step": 1390 + }, + { + "epoch": 0.0062542160116864495, + "grad_norm": 166.027099609375, + "learning_rate": 9.937457839883137e-06, + "loss": 0.0341, + "step": 1400 + }, + { + "epoch": 0.006298888983198496, + "grad_norm": 2.805340500344755e-06, + "learning_rate": 9.937011110168016e-06, + "loss": 0.0001, + "step": 1410 + }, + { + "epoch": 0.006343561954710542, + "grad_norm": 8.33417709844575e-12, + "learning_rate": 9.936564380452896e-06, + "loss": 0.0312, + "step": 1420 + }, + { + "epoch": 0.006388234926222588, + "grad_norm": 1.616840124130249, + "learning_rate": 9.936117650737776e-06, + "loss": 0.0003, + "step": 1430 + }, + { + "epoch": 0.006432907897734634, + "grad_norm": 7.233583346533123e-06, + "learning_rate": 9.935670921022654e-06, + "loss": 0.0016, + "step": 1440 + }, + { + "epoch": 0.00647758086924668, + "grad_norm": 0.0003906420897692442, + "learning_rate": 9.935224191307534e-06, + "loss": 0.0015, + "step": 1450 + }, + { + "epoch": 0.006522253840758726, + "grad_norm": 4.490878200158477e-05, + "learning_rate": 9.934777461592413e-06, + "loss": 0.0067, + "step": 1460 + }, + { + "epoch": 0.006566926812270772, + "grad_norm": 0.0002814727777149528, + "learning_rate": 9.934330731877293e-06, + "loss": 0.0003, + "step": 1470 + }, + { + "epoch": 0.006611599783782818, + "grad_norm": 0.12526680529117584, + "learning_rate": 9.933884002162173e-06, + "loss": 0.0985, + "step": 1480 + }, + { + "epoch": 0.0066562727552948635, + "grad_norm": 0.0004512220621109009, + "learning_rate": 9.933437272447051e-06, + "loss": 0.0432, + "step": 1490 + }, + { + "epoch": 0.00670094572680691, + "grad_norm": 0.0018296745838597417, + "learning_rate": 9.932990542731932e-06, + "loss": 0.0001, + "step": 1500 + }, + { + "epoch": 0.006745618698318956, + "grad_norm": 0.008827965706586838, + "learning_rate": 9.932543813016812e-06, + "loss": 0.0001, + "step": 1510 + }, + { + "epoch": 0.006790291669831002, + "grad_norm": 0.643787682056427, + "learning_rate": 9.93209708330169e-06, + "loss": 0.0001, + "step": 1520 + }, + { + "epoch": 0.006834964641343048, + "grad_norm": 1.456542668165639e-05, + "learning_rate": 9.93165035358657e-06, + "loss": 0.0, + "step": 1530 + }, + { + "epoch": 0.006879637612855094, + "grad_norm": 0.004160408861935139, + "learning_rate": 9.931203623871449e-06, + "loss": 0.0129, + "step": 1540 + }, + { + "epoch": 0.00692431058436714, + "grad_norm": 1.7304720878601074, + "learning_rate": 9.930756894156329e-06, + "loss": 0.0004, + "step": 1550 + }, + { + "epoch": 0.006968983555879186, + "grad_norm": 3.7183988094329834, + "learning_rate": 9.930310164441209e-06, + "loss": 0.0009, + "step": 1560 + }, + { + "epoch": 0.007013656527391232, + "grad_norm": 8.051643817452714e-05, + "learning_rate": 9.929863434726087e-06, + "loss": 0.0, + "step": 1570 + }, + { + "epoch": 0.007058329498903278, + "grad_norm": 0.0009096893481910229, + "learning_rate": 9.929416705010967e-06, + "loss": 0.0, + "step": 1580 + }, + { + "epoch": 0.0071030024704153245, + "grad_norm": 8.303599088321789e-07, + "learning_rate": 9.928969975295848e-06, + "loss": 0.0101, + "step": 1590 + }, + { + "epoch": 0.007147675441927371, + "grad_norm": 5.950789017106217e-08, + "learning_rate": 9.928523245580726e-06, + "loss": 0.076, + "step": 1600 + }, + { + "epoch": 0.007192348413439417, + "grad_norm": 6.889599717396777e-07, + "learning_rate": 9.928076515865606e-06, + "loss": 0.0125, + "step": 1610 + }, + { + "epoch": 0.007237021384951463, + "grad_norm": 1.637503856954936e-07, + "learning_rate": 9.927629786150486e-06, + "loss": 0.1141, + "step": 1620 + }, + { + "epoch": 0.007281694356463509, + "grad_norm": 0.27094534039497375, + "learning_rate": 9.927183056435365e-06, + "loss": 0.0001, + "step": 1630 + }, + { + "epoch": 0.007326367327975555, + "grad_norm": 4.483622433326673e-06, + "learning_rate": 9.926736326720245e-06, + "loss": 0.0, + "step": 1640 + }, + { + "epoch": 0.007371040299487601, + "grad_norm": 121.53789520263672, + "learning_rate": 9.926289597005125e-06, + "loss": 0.0389, + "step": 1650 + }, + { + "epoch": 0.007415713270999647, + "grad_norm": 0.02433842420578003, + "learning_rate": 9.925842867290003e-06, + "loss": 0.0003, + "step": 1660 + }, + { + "epoch": 0.007460386242511693, + "grad_norm": 0.00012709507427643985, + "learning_rate": 9.925396137574884e-06, + "loss": 0.0001, + "step": 1670 + }, + { + "epoch": 0.007505059214023739, + "grad_norm": 0.0830000564455986, + "learning_rate": 9.924949407859764e-06, + "loss": 0.0001, + "step": 1680 + }, + { + "epoch": 0.0075497321855357855, + "grad_norm": 4.0808568330108486e-11, + "learning_rate": 9.924502678144644e-06, + "loss": 0.0692, + "step": 1690 + }, + { + "epoch": 0.007594405157047832, + "grad_norm": 4.6427681809291244e-05, + "learning_rate": 9.924055948429522e-06, + "loss": 0.0691, + "step": 1700 + }, + { + "epoch": 0.007639078128559878, + "grad_norm": 1.8825395107269287, + "learning_rate": 9.923609218714402e-06, + "loss": 0.0005, + "step": 1710 + }, + { + "epoch": 0.007683751100071924, + "grad_norm": 0.0066458964720368385, + "learning_rate": 9.923162488999282e-06, + "loss": 0.0, + "step": 1720 + }, + { + "epoch": 0.00772842407158397, + "grad_norm": 5.8484480279508144e-12, + "learning_rate": 9.922715759284161e-06, + "loss": 0.0001, + "step": 1730 + }, + { + "epoch": 0.007773097043096016, + "grad_norm": 3.3582630157470703, + "learning_rate": 9.922269029569041e-06, + "loss": 0.0009, + "step": 1740 + }, + { + "epoch": 0.007817770014608061, + "grad_norm": 9.292478895736589e-13, + "learning_rate": 9.921822299853921e-06, + "loss": 0.0, + "step": 1750 + }, + { + "epoch": 0.007862442986120108, + "grad_norm": 6.707903139613336e-06, + "learning_rate": 9.9213755701388e-06, + "loss": 0.0002, + "step": 1760 + }, + { + "epoch": 0.007907115957632153, + "grad_norm": 3.6095958036788667e-13, + "learning_rate": 9.92092884042368e-06, + "loss": 0.0, + "step": 1770 + }, + { + "epoch": 0.0079517889291442, + "grad_norm": 4.903984063275857e-08, + "learning_rate": 9.92048211070856e-06, + "loss": 0.0001, + "step": 1780 + }, + { + "epoch": 0.007996461900656246, + "grad_norm": 8.984227008620148e-17, + "learning_rate": 9.920035380993438e-06, + "loss": 0.0031, + "step": 1790 + }, + { + "epoch": 0.008041134872168293, + "grad_norm": 5.623330253001768e-06, + "learning_rate": 9.919588651278318e-06, + "loss": 0.0006, + "step": 1800 + }, + { + "epoch": 0.008085807843680338, + "grad_norm": 14.42357063293457, + "learning_rate": 9.919141921563199e-06, + "loss": 0.0009, + "step": 1810 + }, + { + "epoch": 0.008130480815192385, + "grad_norm": 2.3612351223584183e-12, + "learning_rate": 9.918695191848077e-06, + "loss": 0.0001, + "step": 1820 + }, + { + "epoch": 0.00817515378670443, + "grad_norm": 9.99601297735353e-07, + "learning_rate": 9.918248462132957e-06, + "loss": 0.0266, + "step": 1830 + }, + { + "epoch": 0.008219826758216477, + "grad_norm": 0.016000457108020782, + "learning_rate": 9.917801732417836e-06, + "loss": 0.0006, + "step": 1840 + }, + { + "epoch": 0.008264499729728522, + "grad_norm": 2.5022452859735367e-11, + "learning_rate": 9.917355002702716e-06, + "loss": 0.0022, + "step": 1850 + }, + { + "epoch": 0.00830917270124057, + "grad_norm": 1.409413086456493e-13, + "learning_rate": 9.916908272987596e-06, + "loss": 0.0018, + "step": 1860 + }, + { + "epoch": 0.008353845672752614, + "grad_norm": 0.0018817445961758494, + "learning_rate": 9.916461543272474e-06, + "loss": 0.2938, + "step": 1870 + }, + { + "epoch": 0.008398518644264661, + "grad_norm": 1.8390163631920586e-06, + "learning_rate": 9.916014813557354e-06, + "loss": 0.0, + "step": 1880 + }, + { + "epoch": 0.008443191615776707, + "grad_norm": 2.352197325805605e-09, + "learning_rate": 9.915568083842234e-06, + "loss": 0.0, + "step": 1890 + }, + { + "epoch": 0.008487864587288752, + "grad_norm": 2.6651733264770883e-08, + "learning_rate": 9.915121354127113e-06, + "loss": 0.0, + "step": 1900 + }, + { + "epoch": 0.008532537558800799, + "grad_norm": 1.7570675817091264e-11, + "learning_rate": 9.914674624411993e-06, + "loss": 0.012, + "step": 1910 + }, + { + "epoch": 0.008577210530312844, + "grad_norm": 5.639159553538775e-06, + "learning_rate": 9.914227894696873e-06, + "loss": 0.0008, + "step": 1920 + }, + { + "epoch": 0.008621883501824891, + "grad_norm": 1.478918534303375e-06, + "learning_rate": 9.913781164981752e-06, + "loss": 0.0, + "step": 1930 + }, + { + "epoch": 0.008666556473336936, + "grad_norm": 34.57212829589844, + "learning_rate": 9.913334435266632e-06, + "loss": 0.005, + "step": 1940 + }, + { + "epoch": 0.008711229444848983, + "grad_norm": 9.411427527084015e-06, + "learning_rate": 9.91288770555151e-06, + "loss": 0.0, + "step": 1950 + }, + { + "epoch": 0.008755902416361028, + "grad_norm": 2.57602250641753e-09, + "learning_rate": 9.91244097583639e-06, + "loss": 0.0008, + "step": 1960 + }, + { + "epoch": 0.008800575387873075, + "grad_norm": 0.00012505015183705837, + "learning_rate": 9.91199424612127e-06, + "loss": 0.0001, + "step": 1970 + }, + { + "epoch": 0.00884524835938512, + "grad_norm": 1.6882954696484376e-06, + "learning_rate": 9.911547516406149e-06, + "loss": 0.0041, + "step": 1980 + }, + { + "epoch": 0.008889921330897168, + "grad_norm": 85.14512634277344, + "learning_rate": 9.911100786691029e-06, + "loss": 0.5186, + "step": 1990 + }, + { + "epoch": 0.008934594302409213, + "grad_norm": 5.102280283608707e-06, + "learning_rate": 9.910654056975909e-06, + "loss": 0.0001, + "step": 2000 + }, + { + "epoch": 0.00897926727392126, + "grad_norm": 2.827220337182912e-13, + "learning_rate": 9.910207327260787e-06, + "loss": 0.0003, + "step": 2010 + }, + { + "epoch": 0.009023940245433305, + "grad_norm": 0.0008906282018870115, + "learning_rate": 9.909760597545668e-06, + "loss": 0.0022, + "step": 2020 + }, + { + "epoch": 0.009068613216945352, + "grad_norm": 0.013927340507507324, + "learning_rate": 9.909313867830548e-06, + "loss": 0.0, + "step": 2030 + }, + { + "epoch": 0.009113286188457397, + "grad_norm": 1.2370523272409173e-08, + "learning_rate": 9.908867138115426e-06, + "loss": 0.0, + "step": 2040 + }, + { + "epoch": 0.009157959159969444, + "grad_norm": 5.075939043308608e-06, + "learning_rate": 9.908420408400306e-06, + "loss": 0.0167, + "step": 2050 + }, + { + "epoch": 0.00920263213148149, + "grad_norm": 1.1170839115948183e-06, + "learning_rate": 9.907973678685185e-06, + "loss": 0.0001, + "step": 2060 + }, + { + "epoch": 0.009247305102993536, + "grad_norm": 1.5858339565966162e-06, + "learning_rate": 9.907526948970065e-06, + "loss": 0.0, + "step": 2070 + }, + { + "epoch": 0.009291978074505582, + "grad_norm": 5.236922788753873e-06, + "learning_rate": 9.907080219254945e-06, + "loss": 0.0281, + "step": 2080 + }, + { + "epoch": 0.009336651046017629, + "grad_norm": 0.10250507295131683, + "learning_rate": 9.906633489539823e-06, + "loss": 0.0001, + "step": 2090 + }, + { + "epoch": 0.009381324017529674, + "grad_norm": 3.122114383431046e-13, + "learning_rate": 9.906186759824704e-06, + "loss": 0.0252, + "step": 2100 + }, + { + "epoch": 0.00942599698904172, + "grad_norm": 1.3368267381963506e-08, + "learning_rate": 9.905740030109584e-06, + "loss": 0.0002, + "step": 2110 + }, + { + "epoch": 0.009470669960553766, + "grad_norm": 1.0595232238222105e-13, + "learning_rate": 9.905293300394462e-06, + "loss": 0.0141, + "step": 2120 + }, + { + "epoch": 0.009515342932065813, + "grad_norm": 0.006708197295665741, + "learning_rate": 9.904846570679342e-06, + "loss": 0.0005, + "step": 2130 + }, + { + "epoch": 0.009560015903577858, + "grad_norm": 2.7285839678370394e-05, + "learning_rate": 9.904399840964222e-06, + "loss": 0.0, + "step": 2140 + }, + { + "epoch": 0.009604688875089905, + "grad_norm": 1.811926007270813, + "learning_rate": 9.903953111249102e-06, + "loss": 0.0004, + "step": 2150 + }, + { + "epoch": 0.00964936184660195, + "grad_norm": 1.2543156782426706e-20, + "learning_rate": 9.903506381533981e-06, + "loss": 0.0001, + "step": 2160 + }, + { + "epoch": 0.009694034818113996, + "grad_norm": 9.28156551616155e-10, + "learning_rate": 9.903059651818861e-06, + "loss": 0.001, + "step": 2170 + }, + { + "epoch": 0.009738707789626043, + "grad_norm": 7.503916992104438e-18, + "learning_rate": 9.902612922103741e-06, + "loss": 0.0, + "step": 2180 + }, + { + "epoch": 0.009783380761138088, + "grad_norm": 2.061022496491205e-05, + "learning_rate": 9.90216619238862e-06, + "loss": 0.0, + "step": 2190 + }, + { + "epoch": 0.009828053732650135, + "grad_norm": 1.511832594871521, + "learning_rate": 9.9017194626735e-06, + "loss": 0.0003, + "step": 2200 + }, + { + "epoch": 0.00987272670416218, + "grad_norm": 4.14693022321444e-06, + "learning_rate": 9.90127273295838e-06, + "loss": 0.0252, + "step": 2210 + }, + { + "epoch": 0.009917399675674227, + "grad_norm": 2.3049189223911526e-07, + "learning_rate": 9.900826003243258e-06, + "loss": 0.0101, + "step": 2220 + }, + { + "epoch": 0.009962072647186272, + "grad_norm": 2.308055400135345e-06, + "learning_rate": 9.900379273528138e-06, + "loss": 0.0002, + "step": 2230 + }, + { + "epoch": 0.01000674561869832, + "grad_norm": 2.468515626283252e-12, + "learning_rate": 9.899932543813019e-06, + "loss": 0.0015, + "step": 2240 + }, + { + "epoch": 0.010051418590210364, + "grad_norm": 1.3838086926343252e-11, + "learning_rate": 9.899485814097897e-06, + "loss": 0.0008, + "step": 2250 + }, + { + "epoch": 0.010096091561722411, + "grad_norm": 0.0163019560277462, + "learning_rate": 9.899039084382777e-06, + "loss": 0.1336, + "step": 2260 + }, + { + "epoch": 0.010140764533234457, + "grad_norm": 1.2105209634682979e-07, + "learning_rate": 9.898592354667657e-06, + "loss": 0.0002, + "step": 2270 + }, + { + "epoch": 0.010185437504746504, + "grad_norm": 0.01281829085201025, + "learning_rate": 9.898145624952536e-06, + "loss": 0.0021, + "step": 2280 + }, + { + "epoch": 0.010230110476258549, + "grad_norm": 8.710338050832434e-08, + "learning_rate": 9.897698895237416e-06, + "loss": 0.0, + "step": 2290 + }, + { + "epoch": 0.010274783447770596, + "grad_norm": 49.676658630371094, + "learning_rate": 9.897252165522296e-06, + "loss": 0.0219, + "step": 2300 + }, + { + "epoch": 0.010319456419282641, + "grad_norm": 2.708988589006367e-11, + "learning_rate": 9.896805435807174e-06, + "loss": 0.0, + "step": 2310 + }, + { + "epoch": 0.010364129390794688, + "grad_norm": 1.1903916075084453e-08, + "learning_rate": 9.896358706092054e-06, + "loss": 0.0252, + "step": 2320 + }, + { + "epoch": 0.010408802362306733, + "grad_norm": 5.5912947573233396e-05, + "learning_rate": 9.895911976376935e-06, + "loss": 0.0, + "step": 2330 + }, + { + "epoch": 0.01045347533381878, + "grad_norm": 6.648967487699053e-11, + "learning_rate": 9.895465246661813e-06, + "loss": 0.0582, + "step": 2340 + }, + { + "epoch": 0.010498148305330825, + "grad_norm": 1.705614749880624e-06, + "learning_rate": 9.895018516946693e-06, + "loss": 0.0008, + "step": 2350 + }, + { + "epoch": 0.010542821276842872, + "grad_norm": 0.09579768031835556, + "learning_rate": 9.894571787231572e-06, + "loss": 0.0, + "step": 2360 + }, + { + "epoch": 0.010587494248354918, + "grad_norm": 1.360779151582392e-07, + "learning_rate": 9.894125057516452e-06, + "loss": 0.0001, + "step": 2370 + }, + { + "epoch": 0.010632167219866965, + "grad_norm": 2.5863171160267484e-08, + "learning_rate": 9.893678327801332e-06, + "loss": 0.0, + "step": 2380 + }, + { + "epoch": 0.01067684019137901, + "grad_norm": 1.5213099718093872, + "learning_rate": 9.89323159808621e-06, + "loss": 0.0006, + "step": 2390 + }, + { + "epoch": 0.010721513162891057, + "grad_norm": 1.3281054207781029e-11, + "learning_rate": 9.89278486837109e-06, + "loss": 0.0004, + "step": 2400 + }, + { + "epoch": 0.010766186134403102, + "grad_norm": 0.18338826298713684, + "learning_rate": 9.89233813865597e-06, + "loss": 0.0, + "step": 2410 + }, + { + "epoch": 0.010810859105915149, + "grad_norm": 0.8633919954299927, + "learning_rate": 9.891891408940849e-06, + "loss": 0.0006, + "step": 2420 + }, + { + "epoch": 0.010855532077427194, + "grad_norm": 0.0002038137463387102, + "learning_rate": 9.891444679225729e-06, + "loss": 0.0016, + "step": 2430 + }, + { + "epoch": 0.01090020504893924, + "grad_norm": 2.7525827590579866e-06, + "learning_rate": 9.890997949510607e-06, + "loss": 0.0252, + "step": 2440 + }, + { + "epoch": 0.010944878020451286, + "grad_norm": 2.0097404296315347e-10, + "learning_rate": 9.890551219795488e-06, + "loss": 0.0007, + "step": 2450 + }, + { + "epoch": 0.010989550991963332, + "grad_norm": 6.054831657754944e-13, + "learning_rate": 9.890104490080368e-06, + "loss": 0.0002, + "step": 2460 + }, + { + "epoch": 0.011034223963475379, + "grad_norm": 1.084228706815793e-08, + "learning_rate": 9.889657760365246e-06, + "loss": 0.0001, + "step": 2470 + }, + { + "epoch": 0.011078896934987424, + "grad_norm": 0.005000013392418623, + "learning_rate": 9.889211030650126e-06, + "loss": 0.0036, + "step": 2480 + }, + { + "epoch": 0.01112356990649947, + "grad_norm": 1.300892371602913e-08, + "learning_rate": 9.888764300935006e-06, + "loss": 0.0604, + "step": 2490 + }, + { + "epoch": 0.011168242878011516, + "grad_norm": 2.1114540100097656, + "learning_rate": 9.888317571219885e-06, + "loss": 0.0057, + "step": 2500 + }, + { + "epoch": 0.011212915849523563, + "grad_norm": 0.0629802942276001, + "learning_rate": 9.887870841504765e-06, + "loss": 0.0027, + "step": 2510 + }, + { + "epoch": 0.011257588821035608, + "grad_norm": 108.7916488647461, + "learning_rate": 9.887424111789645e-06, + "loss": 0.038, + "step": 2520 + }, + { + "epoch": 0.011302261792547655, + "grad_norm": 1.681610115156218e-06, + "learning_rate": 9.886977382074524e-06, + "loss": 0.0996, + "step": 2530 + }, + { + "epoch": 0.0113469347640597, + "grad_norm": 3.4575478036202867e-09, + "learning_rate": 9.886530652359404e-06, + "loss": 0.0004, + "step": 2540 + }, + { + "epoch": 0.011391607735571747, + "grad_norm": 0.0012267986312508583, + "learning_rate": 9.886083922644282e-06, + "loss": 0.0062, + "step": 2550 + }, + { + "epoch": 0.011436280707083793, + "grad_norm": 1.681084826898882e-09, + "learning_rate": 9.885637192929162e-06, + "loss": 0.0004, + "step": 2560 + }, + { + "epoch": 0.01148095367859584, + "grad_norm": 5.801022995299832e-10, + "learning_rate": 9.885190463214042e-06, + "loss": 0.0, + "step": 2570 + }, + { + "epoch": 0.011525626650107885, + "grad_norm": 2.1600253603537567e-05, + "learning_rate": 9.88474373349892e-06, + "loss": 0.0, + "step": 2580 + }, + { + "epoch": 0.011570299621619932, + "grad_norm": 1.1640495678477691e-08, + "learning_rate": 9.884297003783801e-06, + "loss": 0.0, + "step": 2590 + }, + { + "epoch": 0.011614972593131977, + "grad_norm": 0.010300342924892902, + "learning_rate": 9.883850274068681e-06, + "loss": 0.0, + "step": 2600 + }, + { + "epoch": 0.011659645564644024, + "grad_norm": 0.0024663745425641537, + "learning_rate": 9.883403544353561e-06, + "loss": 0.0002, + "step": 2610 + }, + { + "epoch": 0.01170431853615607, + "grad_norm": 0.00015290798910427839, + "learning_rate": 9.88295681463844e-06, + "loss": 0.305, + "step": 2620 + }, + { + "epoch": 0.011748991507668116, + "grad_norm": 4.006568815384526e-06, + "learning_rate": 9.88251008492332e-06, + "loss": 0.4032, + "step": 2630 + }, + { + "epoch": 0.011793664479180161, + "grad_norm": 7.545190783275757e-08, + "learning_rate": 9.8820633552082e-06, + "loss": 0.0001, + "step": 2640 + }, + { + "epoch": 0.011838337450692208, + "grad_norm": 0.004153342917561531, + "learning_rate": 9.881616625493078e-06, + "loss": 0.001, + "step": 2650 + }, + { + "epoch": 0.011883010422204254, + "grad_norm": 0.3302571177482605, + "learning_rate": 9.881169895777958e-06, + "loss": 0.0003, + "step": 2660 + }, + { + "epoch": 0.0119276833937163, + "grad_norm": 7.340275609557523e-10, + "learning_rate": 9.880723166062838e-06, + "loss": 0.0001, + "step": 2670 + }, + { + "epoch": 0.011972356365228346, + "grad_norm": 0.0007693750667385757, + "learning_rate": 9.880276436347717e-06, + "loss": 0.0, + "step": 2680 + }, + { + "epoch": 0.012017029336740393, + "grad_norm": 9.321666355477376e-11, + "learning_rate": 9.879829706632597e-06, + "loss": 0.0024, + "step": 2690 + }, + { + "epoch": 0.012061702308252438, + "grad_norm": 4.1744015572507936e-15, + "learning_rate": 9.879382976917477e-06, + "loss": 0.0, + "step": 2700 + }, + { + "epoch": 0.012106375279764483, + "grad_norm": 0.05857926979660988, + "learning_rate": 9.878936247202356e-06, + "loss": 0.0014, + "step": 2710 + }, + { + "epoch": 0.01215104825127653, + "grad_norm": 1.411705312648337e-07, + "learning_rate": 9.878489517487236e-06, + "loss": 0.0, + "step": 2720 + }, + { + "epoch": 0.012195721222788575, + "grad_norm": 0.00032469897996634245, + "learning_rate": 9.878042787772116e-06, + "loss": 0.0, + "step": 2730 + }, + { + "epoch": 0.012240394194300622, + "grad_norm": 2.2813771920482395e-06, + "learning_rate": 9.877596058056994e-06, + "loss": 0.0, + "step": 2740 + }, + { + "epoch": 0.012285067165812668, + "grad_norm": 1.0996114241379473e-08, + "learning_rate": 9.877149328341874e-06, + "loss": 0.0, + "step": 2750 + }, + { + "epoch": 0.012329740137324715, + "grad_norm": 1.8043495586539393e-09, + "learning_rate": 9.876702598626755e-06, + "loss": 0.003, + "step": 2760 + }, + { + "epoch": 0.01237441310883676, + "grad_norm": 5.025076461606659e-07, + "learning_rate": 9.876255868911633e-06, + "loss": 0.0, + "step": 2770 + }, + { + "epoch": 0.012419086080348807, + "grad_norm": 4.728721414437587e-09, + "learning_rate": 9.875809139196513e-06, + "loss": 0.0002, + "step": 2780 + }, + { + "epoch": 0.012463759051860852, + "grad_norm": 5.2914412663085386e-05, + "learning_rate": 9.875362409481393e-06, + "loss": 0.0038, + "step": 2790 + }, + { + "epoch": 0.012508432023372899, + "grad_norm": 9.6832536655711e-07, + "learning_rate": 9.874915679766272e-06, + "loss": 0.0113, + "step": 2800 + }, + { + "epoch": 0.012553104994884944, + "grad_norm": 7.181570981629193e-05, + "learning_rate": 9.874468950051152e-06, + "loss": 0.0, + "step": 2810 + }, + { + "epoch": 0.012597777966396991, + "grad_norm": 3.6548785828927066e-06, + "learning_rate": 9.874022220336032e-06, + "loss": 0.0, + "step": 2820 + }, + { + "epoch": 0.012642450937909036, + "grad_norm": 1.1662586044472456e-12, + "learning_rate": 9.87357549062091e-06, + "loss": 0.0, + "step": 2830 + }, + { + "epoch": 0.012687123909421083, + "grad_norm": 8.896884537534788e-05, + "learning_rate": 9.87312876090579e-06, + "loss": 0.0, + "step": 2840 + }, + { + "epoch": 0.012731796880933129, + "grad_norm": 3.573189487349193e-12, + "learning_rate": 9.872682031190669e-06, + "loss": 0.0002, + "step": 2850 + }, + { + "epoch": 0.012776469852445176, + "grad_norm": 0.005769051611423492, + "learning_rate": 9.872235301475549e-06, + "loss": 0.0, + "step": 2860 + }, + { + "epoch": 0.01282114282395722, + "grad_norm": 1.0245492648719434e-10, + "learning_rate": 9.871788571760429e-06, + "loss": 0.0004, + "step": 2870 + }, + { + "epoch": 0.012865815795469268, + "grad_norm": 8.58198109199293e-05, + "learning_rate": 9.871341842045308e-06, + "loss": 0.0019, + "step": 2880 + }, + { + "epoch": 0.012910488766981313, + "grad_norm": 7.715765418074561e-13, + "learning_rate": 9.870895112330188e-06, + "loss": 0.0, + "step": 2890 + }, + { + "epoch": 0.01295516173849336, + "grad_norm": 3.119744405921665e-06, + "learning_rate": 9.870448382615068e-06, + "loss": 0.0, + "step": 2900 + }, + { + "epoch": 0.012999834710005405, + "grad_norm": 5.282707604692405e-10, + "learning_rate": 9.870001652899946e-06, + "loss": 0.0057, + "step": 2910 + }, + { + "epoch": 0.013044507681517452, + "grad_norm": 3.762822085168205e-12, + "learning_rate": 9.869554923184826e-06, + "loss": 0.01, + "step": 2920 + }, + { + "epoch": 0.013089180653029497, + "grad_norm": 0.226803719997406, + "learning_rate": 9.869108193469705e-06, + "loss": 0.007, + "step": 2930 + }, + { + "epoch": 0.013133853624541544, + "grad_norm": 6.065501434449061e-09, + "learning_rate": 9.868661463754585e-06, + "loss": 0.0, + "step": 2940 + }, + { + "epoch": 0.01317852659605359, + "grad_norm": 0.18184098601341248, + "learning_rate": 9.868214734039465e-06, + "loss": 0.0348, + "step": 2950 + }, + { + "epoch": 0.013223199567565637, + "grad_norm": 0.1139160618185997, + "learning_rate": 9.867768004324343e-06, + "loss": 0.1315, + "step": 2960 + }, + { + "epoch": 0.013267872539077682, + "grad_norm": 0.00014653371181339025, + "learning_rate": 9.867321274609224e-06, + "loss": 0.0001, + "step": 2970 + }, + { + "epoch": 0.013312545510589727, + "grad_norm": 1.2791061543282467e-09, + "learning_rate": 9.866874544894104e-06, + "loss": 0.0041, + "step": 2980 + }, + { + "epoch": 0.013357218482101774, + "grad_norm": 5.588849383286743e-10, + "learning_rate": 9.866427815178982e-06, + "loss": 0.0016, + "step": 2990 + }, + { + "epoch": 0.01340189145361382, + "grad_norm": 551.7684936523438, + "learning_rate": 9.865981085463862e-06, + "loss": 0.1721, + "step": 3000 + }, + { + "epoch": 0.013446564425125866, + "grad_norm": 1.8122448799864216e-13, + "learning_rate": 9.865534355748742e-06, + "loss": 0.0, + "step": 3010 + }, + { + "epoch": 0.013491237396637911, + "grad_norm": 0.0015737615758553147, + "learning_rate": 9.865087626033621e-06, + "loss": 0.0001, + "step": 3020 + }, + { + "epoch": 0.013535910368149958, + "grad_norm": 1.6226993982354856e-16, + "learning_rate": 9.864640896318501e-06, + "loss": 0.0, + "step": 3030 + }, + { + "epoch": 0.013580583339662004, + "grad_norm": 0.00010395060235168785, + "learning_rate": 9.86419416660338e-06, + "loss": 0.0, + "step": 3040 + }, + { + "epoch": 0.01362525631117405, + "grad_norm": 1.976358632305164e-08, + "learning_rate": 9.86374743688826e-06, + "loss": 0.0, + "step": 3050 + }, + { + "epoch": 0.013669929282686096, + "grad_norm": 3.838881923456938e-08, + "learning_rate": 9.86330070717314e-06, + "loss": 0.0, + "step": 3060 + }, + { + "epoch": 0.013714602254198143, + "grad_norm": 8.503144499627524e-07, + "learning_rate": 9.86285397745802e-06, + "loss": 0.0005, + "step": 3070 + }, + { + "epoch": 0.013759275225710188, + "grad_norm": 2.1550865259012397e-10, + "learning_rate": 9.862407247742898e-06, + "loss": 0.0, + "step": 3080 + }, + { + "epoch": 0.013803948197222235, + "grad_norm": 1.3286899358533333e-11, + "learning_rate": 9.861960518027778e-06, + "loss": 0.0037, + "step": 3090 + }, + { + "epoch": 0.01384862116873428, + "grad_norm": 9.956566645996645e-05, + "learning_rate": 9.861513788312658e-06, + "loss": 0.0, + "step": 3100 + }, + { + "epoch": 0.013893294140246327, + "grad_norm": 1.4003201931345188e-09, + "learning_rate": 9.861067058597537e-06, + "loss": 0.0, + "step": 3110 + }, + { + "epoch": 0.013937967111758372, + "grad_norm": 3.2533531424405737e-09, + "learning_rate": 9.860620328882417e-06, + "loss": 0.0, + "step": 3120 + }, + { + "epoch": 0.01398264008327042, + "grad_norm": 7.113734046271247e-10, + "learning_rate": 9.860173599167297e-06, + "loss": 0.1255, + "step": 3130 + }, + { + "epoch": 0.014027313054782465, + "grad_norm": 3.964849426552064e-08, + "learning_rate": 9.859726869452176e-06, + "loss": 0.0, + "step": 3140 + }, + { + "epoch": 0.014071986026294512, + "grad_norm": 6.418162001864403e-07, + "learning_rate": 9.859280139737056e-06, + "loss": 0.0, + "step": 3150 + }, + { + "epoch": 0.014116658997806557, + "grad_norm": 3.987723928844389e-09, + "learning_rate": 9.858833410021936e-06, + "loss": 0.0005, + "step": 3160 + }, + { + "epoch": 0.014161331969318604, + "grad_norm": 0.5856716632843018, + "learning_rate": 9.858386680306814e-06, + "loss": 0.0002, + "step": 3170 + }, + { + "epoch": 0.014206004940830649, + "grad_norm": 8.413451185740328e-11, + "learning_rate": 9.857939950591694e-06, + "loss": 0.0005, + "step": 3180 + }, + { + "epoch": 0.014250677912342696, + "grad_norm": 1.8648626046169348e-11, + "learning_rate": 9.857493220876575e-06, + "loss": 0.0001, + "step": 3190 + }, + { + "epoch": 0.014295350883854741, + "grad_norm": 5.774042910466903e-10, + "learning_rate": 9.857046491161455e-06, + "loss": 0.0, + "step": 3200 + }, + { + "epoch": 0.014340023855366788, + "grad_norm": 0.00016052935097832233, + "learning_rate": 9.856599761446333e-06, + "loss": 0.0, + "step": 3210 + }, + { + "epoch": 0.014384696826878833, + "grad_norm": 1.0157315018645505e-14, + "learning_rate": 9.856153031731213e-06, + "loss": 0.0034, + "step": 3220 + }, + { + "epoch": 0.01442936979839088, + "grad_norm": 0.029599646106362343, + "learning_rate": 9.855706302016092e-06, + "loss": 0.0267, + "step": 3230 + }, + { + "epoch": 0.014474042769902926, + "grad_norm": 9.77450180053711, + "learning_rate": 9.855259572300972e-06, + "loss": 0.0019, + "step": 3240 + }, + { + "epoch": 0.014518715741414973, + "grad_norm": 1.0505848990760569e-07, + "learning_rate": 9.854812842585852e-06, + "loss": 0.0006, + "step": 3250 + }, + { + "epoch": 0.014563388712927018, + "grad_norm": 0.002367907902225852, + "learning_rate": 9.85436611287073e-06, + "loss": 0.0, + "step": 3260 + }, + { + "epoch": 0.014608061684439063, + "grad_norm": 1.1953168232992084e-08, + "learning_rate": 9.85391938315561e-06, + "loss": 0.0005, + "step": 3270 + }, + { + "epoch": 0.01465273465595111, + "grad_norm": 4.748066089604874e-12, + "learning_rate": 9.85347265344049e-06, + "loss": 0.0001, + "step": 3280 + }, + { + "epoch": 0.014697407627463155, + "grad_norm": 0.01604483276605606, + "learning_rate": 9.853025923725369e-06, + "loss": 0.0, + "step": 3290 + }, + { + "epoch": 0.014742080598975202, + "grad_norm": 2.076117743854411e-05, + "learning_rate": 9.852579194010249e-06, + "loss": 0.0002, + "step": 3300 + }, + { + "epoch": 0.014786753570487247, + "grad_norm": 0.000603312742896378, + "learning_rate": 9.85213246429513e-06, + "loss": 0.425, + "step": 3310 + }, + { + "epoch": 0.014831426541999294, + "grad_norm": 0.004995096940547228, + "learning_rate": 9.851685734580008e-06, + "loss": 0.5148, + "step": 3320 + }, + { + "epoch": 0.01487609951351134, + "grad_norm": 0.03883814066648483, + "learning_rate": 9.851239004864888e-06, + "loss": 0.9627, + "step": 3330 + }, + { + "epoch": 0.014920772485023387, + "grad_norm": 8.102724677883089e-06, + "learning_rate": 9.850792275149766e-06, + "loss": 0.0048, + "step": 3340 + }, + { + "epoch": 0.014965445456535432, + "grad_norm": 0.31644752621650696, + "learning_rate": 9.850345545434646e-06, + "loss": 0.0001, + "step": 3350 + }, + { + "epoch": 0.015010118428047479, + "grad_norm": 3.0332233905792236, + "learning_rate": 9.849898815719526e-06, + "loss": 0.0058, + "step": 3360 + }, + { + "epoch": 0.015054791399559524, + "grad_norm": 6.062638391313158e-09, + "learning_rate": 9.849452086004405e-06, + "loss": 0.0013, + "step": 3370 + }, + { + "epoch": 0.015099464371071571, + "grad_norm": 0.0011684580240398645, + "learning_rate": 9.849005356289285e-06, + "loss": 0.0002, + "step": 3380 + }, + { + "epoch": 0.015144137342583616, + "grad_norm": 5.77909929688758e-07, + "learning_rate": 9.848558626574165e-06, + "loss": 0.0, + "step": 3390 + }, + { + "epoch": 0.015188810314095663, + "grad_norm": 4.860052058575093e-07, + "learning_rate": 9.848111896859044e-06, + "loss": 0.0, + "step": 3400 + }, + { + "epoch": 0.015233483285607708, + "grad_norm": 1.0624749847920612e-05, + "learning_rate": 9.847665167143924e-06, + "loss": 0.0, + "step": 3410 + }, + { + "epoch": 0.015278156257119755, + "grad_norm": 3.9735247969567566e-13, + "learning_rate": 9.847218437428804e-06, + "loss": 0.0005, + "step": 3420 + }, + { + "epoch": 0.0153228292286318, + "grad_norm": 0.0007657803362235427, + "learning_rate": 9.846771707713682e-06, + "loss": 0.0003, + "step": 3430 + }, + { + "epoch": 0.015367502200143848, + "grad_norm": 9.241873377696663e-10, + "learning_rate": 9.846324977998562e-06, + "loss": 0.0004, + "step": 3440 + }, + { + "epoch": 0.015412175171655893, + "grad_norm": 1.5009467233539908e-06, + "learning_rate": 9.845878248283441e-06, + "loss": 0.0001, + "step": 3450 + }, + { + "epoch": 0.01545684814316794, + "grad_norm": 1.1387187193179216e-08, + "learning_rate": 9.845431518568321e-06, + "loss": 0.0, + "step": 3460 + }, + { + "epoch": 0.015501521114679985, + "grad_norm": 6.789093731640605e-09, + "learning_rate": 9.844984788853201e-06, + "loss": 0.0001, + "step": 3470 + }, + { + "epoch": 0.015546194086192032, + "grad_norm": 1.0673472772282366e-08, + "learning_rate": 9.84453805913808e-06, + "loss": 0.0, + "step": 3480 + }, + { + "epoch": 0.015590867057704077, + "grad_norm": 0.0025754093658179045, + "learning_rate": 9.84409132942296e-06, + "loss": 0.0, + "step": 3490 + }, + { + "epoch": 0.015635540029216122, + "grad_norm": 1.62450643215184e-11, + "learning_rate": 9.84364459970784e-06, + "loss": 0.0, + "step": 3500 + }, + { + "epoch": 0.01568021300072817, + "grad_norm": 7.053716277738431e-09, + "learning_rate": 9.843197869992718e-06, + "loss": 0.0, + "step": 3510 + }, + { + "epoch": 0.015724885972240216, + "grad_norm": 0.03826324641704559, + "learning_rate": 9.842751140277598e-06, + "loss": 0.0, + "step": 3520 + }, + { + "epoch": 0.01576955894375226, + "grad_norm": 4.0738819961916306e-07, + "learning_rate": 9.842304410562478e-06, + "loss": 0.0, + "step": 3530 + }, + { + "epoch": 0.015814231915264307, + "grad_norm": 1.7018284351189017e-11, + "learning_rate": 9.841857680847357e-06, + "loss": 0.0, + "step": 3540 + }, + { + "epoch": 0.015858904886776354, + "grad_norm": 0.0078599127009511, + "learning_rate": 9.841410951132237e-06, + "loss": 0.0001, + "step": 3550 + }, + { + "epoch": 0.0159035778582884, + "grad_norm": 4.419483182194206e-11, + "learning_rate": 9.840964221417117e-06, + "loss": 0.4157, + "step": 3560 + }, + { + "epoch": 0.015948250829800444, + "grad_norm": 6.113156268838793e-05, + "learning_rate": 9.840517491701996e-06, + "loss": 0.018, + "step": 3570 + }, + { + "epoch": 0.01599292380131249, + "grad_norm": 1.637298852052993e-09, + "learning_rate": 9.840070761986876e-06, + "loss": 0.0633, + "step": 3580 + }, + { + "epoch": 0.016037596772824538, + "grad_norm": 1.0059465012091096e-06, + "learning_rate": 9.839624032271756e-06, + "loss": 0.0297, + "step": 3590 + }, + { + "epoch": 0.016082269744336585, + "grad_norm": 1.3263518283679332e-08, + "learning_rate": 9.839177302556634e-06, + "loss": 0.0, + "step": 3600 + }, + { + "epoch": 0.01612694271584863, + "grad_norm": 3.2096600932618458e-09, + "learning_rate": 9.838730572841514e-06, + "loss": 0.0593, + "step": 3610 + }, + { + "epoch": 0.016171615687360676, + "grad_norm": 0.5415199995040894, + "learning_rate": 9.838283843126395e-06, + "loss": 0.0341, + "step": 3620 + }, + { + "epoch": 0.016216288658872723, + "grad_norm": 5.8153847959374616e-08, + "learning_rate": 9.837837113411273e-06, + "loss": 0.778, + "step": 3630 + }, + { + "epoch": 0.01626096163038477, + "grad_norm": 0.0023709749802947044, + "learning_rate": 9.837390383696153e-06, + "loss": 0.0, + "step": 3640 + }, + { + "epoch": 0.016305634601896813, + "grad_norm": 2.2431919603077555e-12, + "learning_rate": 9.836943653981033e-06, + "loss": 0.0002, + "step": 3650 + }, + { + "epoch": 0.01635030757340886, + "grad_norm": 1.8902798891067505, + "learning_rate": 9.836496924265913e-06, + "loss": 0.3081, + "step": 3660 + }, + { + "epoch": 0.016394980544920907, + "grad_norm": 7.196881979254499e-10, + "learning_rate": 9.836050194550792e-06, + "loss": 0.0, + "step": 3670 + }, + { + "epoch": 0.016439653516432954, + "grad_norm": 1.9319632053375244, + "learning_rate": 9.835603464835672e-06, + "loss": 0.0006, + "step": 3680 + }, + { + "epoch": 0.016484326487944997, + "grad_norm": 6.603991550946375e-06, + "learning_rate": 9.835156735120552e-06, + "loss": 0.0001, + "step": 3690 + }, + { + "epoch": 0.016528999459457044, + "grad_norm": 1.0397147552632102e-12, + "learning_rate": 9.83471000540543e-06, + "loss": 0.0016, + "step": 3700 + }, + { + "epoch": 0.01657367243096909, + "grad_norm": 6.637133651565819e-07, + "learning_rate": 9.83426327569031e-06, + "loss": 0.0, + "step": 3710 + }, + { + "epoch": 0.01661834540248114, + "grad_norm": 2.6404313757666387e-07, + "learning_rate": 9.83381654597519e-06, + "loss": 0.0023, + "step": 3720 + }, + { + "epoch": 0.016663018373993182, + "grad_norm": 5.428249642136507e-05, + "learning_rate": 9.833369816260069e-06, + "loss": 0.0003, + "step": 3730 + }, + { + "epoch": 0.01670769134550523, + "grad_norm": 1.8128656620319816e-07, + "learning_rate": 9.83292308654495e-06, + "loss": 0.9128, + "step": 3740 + }, + { + "epoch": 0.016752364317017276, + "grad_norm": 1.0186456165683921e-05, + "learning_rate": 9.832476356829828e-06, + "loss": 0.0, + "step": 3750 + }, + { + "epoch": 0.016797037288529323, + "grad_norm": 1.559259033001581e-07, + "learning_rate": 9.832029627114708e-06, + "loss": 0.0231, + "step": 3760 + }, + { + "epoch": 0.016841710260041366, + "grad_norm": 7.261510745593114e-06, + "learning_rate": 9.831582897399588e-06, + "loss": 0.0, + "step": 3770 + }, + { + "epoch": 0.016886383231553413, + "grad_norm": 4.436628557868971e-07, + "learning_rate": 9.831136167684466e-06, + "loss": 0.0013, + "step": 3780 + }, + { + "epoch": 0.01693105620306546, + "grad_norm": 1.7237377166748047, + "learning_rate": 9.830689437969346e-06, + "loss": 0.0007, + "step": 3790 + }, + { + "epoch": 0.016975729174577504, + "grad_norm": 0.7090981006622314, + "learning_rate": 9.830242708254227e-06, + "loss": 0.0003, + "step": 3800 + }, + { + "epoch": 0.01702040214608955, + "grad_norm": 2.859711685232469e-07, + "learning_rate": 9.829795978539105e-06, + "loss": 0.0, + "step": 3810 + }, + { + "epoch": 0.017065075117601598, + "grad_norm": 4.5942306314827874e-05, + "learning_rate": 9.829349248823985e-06, + "loss": 0.2813, + "step": 3820 + }, + { + "epoch": 0.017109748089113645, + "grad_norm": 0.026525896042585373, + "learning_rate": 9.828902519108864e-06, + "loss": 0.0, + "step": 3830 + }, + { + "epoch": 0.017154421060625688, + "grad_norm": 173.7867889404297, + "learning_rate": 9.828455789393744e-06, + "loss": 0.0828, + "step": 3840 + }, + { + "epoch": 0.017199094032137735, + "grad_norm": 0.0008094139629974961, + "learning_rate": 9.828009059678624e-06, + "loss": 0.0015, + "step": 3850 + }, + { + "epoch": 0.017243767003649782, + "grad_norm": 2.362374473818818e-08, + "learning_rate": 9.827562329963502e-06, + "loss": 0.0202, + "step": 3860 + }, + { + "epoch": 0.01728843997516183, + "grad_norm": 1.1910659090952347e-11, + "learning_rate": 9.827115600248382e-06, + "loss": 0.0011, + "step": 3870 + }, + { + "epoch": 0.017333112946673872, + "grad_norm": 0.009401158429682255, + "learning_rate": 9.826668870533263e-06, + "loss": 0.0, + "step": 3880 + }, + { + "epoch": 0.01737778591818592, + "grad_norm": 8.214187552368912e-09, + "learning_rate": 9.826222140818141e-06, + "loss": 0.0, + "step": 3890 + }, + { + "epoch": 0.017422458889697966, + "grad_norm": 5.333662556950003e-05, + "learning_rate": 9.825775411103021e-06, + "loss": 0.0004, + "step": 3900 + }, + { + "epoch": 0.017467131861210013, + "grad_norm": 0.00015062151942402124, + "learning_rate": 9.825328681387901e-06, + "loss": 0.0001, + "step": 3910 + }, + { + "epoch": 0.017511804832722057, + "grad_norm": 0.4469970762729645, + "learning_rate": 9.82488195167278e-06, + "loss": 0.0002, + "step": 3920 + }, + { + "epoch": 0.017556477804234104, + "grad_norm": 0.12761425971984863, + "learning_rate": 9.82443522195766e-06, + "loss": 0.0, + "step": 3930 + }, + { + "epoch": 0.01760115077574615, + "grad_norm": 3.445596308271348e-10, + "learning_rate": 9.823988492242538e-06, + "loss": 0.0021, + "step": 3940 + }, + { + "epoch": 0.017645823747258198, + "grad_norm": 3.4900170930995955e-07, + "learning_rate": 9.823541762527418e-06, + "loss": 0.0, + "step": 3950 + }, + { + "epoch": 0.01769049671877024, + "grad_norm": 526.72021484375, + "learning_rate": 9.823095032812298e-06, + "loss": 0.1069, + "step": 3960 + }, + { + "epoch": 0.017735169690282288, + "grad_norm": 5.073938369750977, + "learning_rate": 9.822648303097177e-06, + "loss": 0.0014, + "step": 3970 + }, + { + "epoch": 0.017779842661794335, + "grad_norm": 23.13395118713379, + "learning_rate": 9.822201573382057e-06, + "loss": 0.0081, + "step": 3980 + }, + { + "epoch": 0.017824515633306382, + "grad_norm": 1.453740151191596e-05, + "learning_rate": 9.821754843666937e-06, + "loss": 0.0007, + "step": 3990 + }, + { + "epoch": 0.017869188604818426, + "grad_norm": 4.718764781951904, + "learning_rate": 9.821308113951816e-06, + "loss": 0.0015, + "step": 4000 + }, + { + "epoch": 0.017913861576330473, + "grad_norm": 7.012652611382286e-12, + "learning_rate": 9.820861384236696e-06, + "loss": 0.0005, + "step": 4010 + }, + { + "epoch": 0.01795853454784252, + "grad_norm": 9.016378409946318e-14, + "learning_rate": 9.820414654521576e-06, + "loss": 0.3409, + "step": 4020 + }, + { + "epoch": 0.018003207519354567, + "grad_norm": 0.34797075390815735, + "learning_rate": 9.819967924806454e-06, + "loss": 0.0013, + "step": 4030 + }, + { + "epoch": 0.01804788049086661, + "grad_norm": 2.998768522388673e-08, + "learning_rate": 9.819521195091334e-06, + "loss": 0.0006, + "step": 4040 + }, + { + "epoch": 0.018092553462378657, + "grad_norm": 0.0008473226334899664, + "learning_rate": 9.819074465376215e-06, + "loss": 0.0, + "step": 4050 + }, + { + "epoch": 0.018137226433890704, + "grad_norm": 6.374268474163003e-11, + "learning_rate": 9.818627735661093e-06, + "loss": 0.0313, + "step": 4060 + }, + { + "epoch": 0.018181899405402747, + "grad_norm": 3.1651177323510638e-06, + "learning_rate": 9.818181005945973e-06, + "loss": 0.0, + "step": 4070 + }, + { + "epoch": 0.018226572376914794, + "grad_norm": 0.00011180033470736817, + "learning_rate": 9.817734276230853e-06, + "loss": 0.0004, + "step": 4080 + }, + { + "epoch": 0.01827124534842684, + "grad_norm": 3.584693502034497e-07, + "learning_rate": 9.817287546515732e-06, + "loss": 0.0, + "step": 4090 + }, + { + "epoch": 0.01831591831993889, + "grad_norm": 752.3006591796875, + "learning_rate": 9.816840816800612e-06, + "loss": 0.3661, + "step": 4100 + }, + { + "epoch": 0.018360591291450932, + "grad_norm": 26.972646713256836, + "learning_rate": 9.816394087085492e-06, + "loss": 0.0867, + "step": 4110 + }, + { + "epoch": 0.01840526426296298, + "grad_norm": 3.087233810608603e-12, + "learning_rate": 9.815947357370372e-06, + "loss": 0.0, + "step": 4120 + }, + { + "epoch": 0.018449937234475026, + "grad_norm": 3.907925025248282e-11, + "learning_rate": 9.81550062765525e-06, + "loss": 0.004, + "step": 4130 + }, + { + "epoch": 0.018494610205987073, + "grad_norm": 0.0007196891237981617, + "learning_rate": 9.81505389794013e-06, + "loss": 0.0, + "step": 4140 + }, + { + "epoch": 0.018539283177499116, + "grad_norm": 2.1690328139811754e-05, + "learning_rate": 9.81460716822501e-06, + "loss": 0.0, + "step": 4150 + }, + { + "epoch": 0.018583956149011163, + "grad_norm": 4.64138536632186e-12, + "learning_rate": 9.814160438509889e-06, + "loss": 0.0002, + "step": 4160 + }, + { + "epoch": 0.01862862912052321, + "grad_norm": 3.0848708152770996, + "learning_rate": 9.81371370879477e-06, + "loss": 0.0008, + "step": 4170 + }, + { + "epoch": 0.018673302092035257, + "grad_norm": 376.42767333984375, + "learning_rate": 9.81326697907965e-06, + "loss": 0.0408, + "step": 4180 + }, + { + "epoch": 0.0187179750635473, + "grad_norm": 1.2392396950191426e-13, + "learning_rate": 9.812820249364528e-06, + "loss": 0.0003, + "step": 4190 + }, + { + "epoch": 0.018762648035059348, + "grad_norm": 1.0111125448020175e-06, + "learning_rate": 9.812373519649408e-06, + "loss": 0.0001, + "step": 4200 + }, + { + "epoch": 0.018807321006571395, + "grad_norm": 1.8822409231233905e-07, + "learning_rate": 9.811926789934288e-06, + "loss": 0.0, + "step": 4210 + }, + { + "epoch": 0.01885199397808344, + "grad_norm": 0.003501696279272437, + "learning_rate": 9.811480060219166e-06, + "loss": 0.0041, + "step": 4220 + }, + { + "epoch": 0.018896666949595485, + "grad_norm": 5.014882844989188e-06, + "learning_rate": 9.811033330504047e-06, + "loss": 0.0025, + "step": 4230 + }, + { + "epoch": 0.018941339921107532, + "grad_norm": 98.7721939086914, + "learning_rate": 9.810586600788925e-06, + "loss": 0.0182, + "step": 4240 + }, + { + "epoch": 0.01898601289261958, + "grad_norm": 1.2174330549896695e-05, + "learning_rate": 9.810139871073805e-06, + "loss": 0.0, + "step": 4250 + }, + { + "epoch": 0.019030685864131626, + "grad_norm": 0.00010244682925986126, + "learning_rate": 9.809693141358685e-06, + "loss": 0.0, + "step": 4260 + }, + { + "epoch": 0.01907535883564367, + "grad_norm": 21.687419891357422, + "learning_rate": 9.809246411643564e-06, + "loss": 0.0187, + "step": 4270 + }, + { + "epoch": 0.019120031807155716, + "grad_norm": 1.5209188075626656e-10, + "learning_rate": 9.808799681928444e-06, + "loss": 0.0134, + "step": 4280 + }, + { + "epoch": 0.019164704778667763, + "grad_norm": 8.382775029680545e-10, + "learning_rate": 9.808352952213324e-06, + "loss": 0.0252, + "step": 4290 + }, + { + "epoch": 0.01920937775017981, + "grad_norm": 1.1792429961124049e-14, + "learning_rate": 9.807906222498202e-06, + "loss": 0.0004, + "step": 4300 + }, + { + "epoch": 0.019254050721691854, + "grad_norm": 0.051694102585315704, + "learning_rate": 9.807459492783083e-06, + "loss": 0.0, + "step": 4310 + }, + { + "epoch": 0.0192987236932039, + "grad_norm": 7.386233291661881e-11, + "learning_rate": 9.807012763067961e-06, + "loss": 0.0001, + "step": 4320 + }, + { + "epoch": 0.019343396664715948, + "grad_norm": 2.1222596596759402e-11, + "learning_rate": 9.806566033352841e-06, + "loss": 0.0001, + "step": 4330 + }, + { + "epoch": 0.01938806963622799, + "grad_norm": 1.083875877938567e-09, + "learning_rate": 9.806119303637721e-06, + "loss": 0.0428, + "step": 4340 + }, + { + "epoch": 0.019432742607740038, + "grad_norm": 2.3458659459407727e-09, + "learning_rate": 9.8056725739226e-06, + "loss": 0.0, + "step": 4350 + }, + { + "epoch": 0.019477415579252085, + "grad_norm": 9.970278454837084e-14, + "learning_rate": 9.80522584420748e-06, + "loss": 0.0, + "step": 4360 + }, + { + "epoch": 0.019522088550764132, + "grad_norm": 3.586633767760361e-10, + "learning_rate": 9.80477911449236e-06, + "loss": 0.6023, + "step": 4370 + }, + { + "epoch": 0.019566761522276176, + "grad_norm": 0.00017252798716071993, + "learning_rate": 9.804332384777238e-06, + "loss": 0.0, + "step": 4380 + }, + { + "epoch": 0.019611434493788223, + "grad_norm": 684.9390258789062, + "learning_rate": 9.803885655062118e-06, + "loss": 1.2474, + "step": 4390 + }, + { + "epoch": 0.01965610746530027, + "grad_norm": 0.024622151628136635, + "learning_rate": 9.803438925346999e-06, + "loss": 0.0758, + "step": 4400 + }, + { + "epoch": 0.019700780436812317, + "grad_norm": 52.290061950683594, + "learning_rate": 9.802992195631877e-06, + "loss": 0.0207, + "step": 4410 + }, + { + "epoch": 0.01974545340832436, + "grad_norm": 0.05896748974919319, + "learning_rate": 9.802545465916757e-06, + "loss": 0.1313, + "step": 4420 + }, + { + "epoch": 0.019790126379836407, + "grad_norm": 1.7239247540601355e-07, + "learning_rate": 9.802098736201636e-06, + "loss": 0.0758, + "step": 4430 + }, + { + "epoch": 0.019834799351348454, + "grad_norm": 8.05067427801525e-18, + "learning_rate": 9.801652006486516e-06, + "loss": 0.0, + "step": 4440 + }, + { + "epoch": 0.0198794723228605, + "grad_norm": 4.193351079424373e-20, + "learning_rate": 9.801205276771396e-06, + "loss": 0.0079, + "step": 4450 + }, + { + "epoch": 0.019924145294372544, + "grad_norm": 8.296374654978567e-13, + "learning_rate": 9.800758547056274e-06, + "loss": 0.2641, + "step": 4460 + }, + { + "epoch": 0.01996881826588459, + "grad_norm": 1.3925680377724348e-06, + "learning_rate": 9.800311817341154e-06, + "loss": 0.0, + "step": 4470 + }, + { + "epoch": 0.02001349123739664, + "grad_norm": 86.02528381347656, + "learning_rate": 9.799865087626034e-06, + "loss": 0.0143, + "step": 4480 + }, + { + "epoch": 0.020058164208908685, + "grad_norm": 51.72323989868164, + "learning_rate": 9.799418357910913e-06, + "loss": 0.0343, + "step": 4490 + }, + { + "epoch": 0.02010283718042073, + "grad_norm": 1.158203190243512e-06, + "learning_rate": 9.798971628195793e-06, + "loss": 0.0001, + "step": 4500 + }, + { + "epoch": 0.020147510151932776, + "grad_norm": 9.528105301276346e-13, + "learning_rate": 9.798524898480673e-06, + "loss": 0.0011, + "step": 4510 + }, + { + "epoch": 0.020192183123444823, + "grad_norm": 4.934356638841564e-07, + "learning_rate": 9.798078168765552e-06, + "loss": 0.0003, + "step": 4520 + }, + { + "epoch": 0.02023685609495687, + "grad_norm": 2.8366539478302, + "learning_rate": 9.797631439050432e-06, + "loss": 0.0005, + "step": 4530 + }, + { + "epoch": 0.020281529066468913, + "grad_norm": 0.00037676902138628066, + "learning_rate": 9.797184709335312e-06, + "loss": 0.0031, + "step": 4540 + }, + { + "epoch": 0.02032620203798096, + "grad_norm": 4.719042379086602e-10, + "learning_rate": 9.79673797962019e-06, + "loss": 0.008, + "step": 4550 + }, + { + "epoch": 0.020370875009493007, + "grad_norm": 3.678450599675642e-11, + "learning_rate": 9.79629124990507e-06, + "loss": 0.0, + "step": 4560 + }, + { + "epoch": 0.020415547981005054, + "grad_norm": 5.407119260780746e-06, + "learning_rate": 9.79584452018995e-06, + "loss": 0.0049, + "step": 4570 + }, + { + "epoch": 0.020460220952517098, + "grad_norm": 1.160919282483519e-06, + "learning_rate": 9.79539779047483e-06, + "loss": 0.1253, + "step": 4580 + }, + { + "epoch": 0.020504893924029145, + "grad_norm": 6.910381489433348e-05, + "learning_rate": 9.794951060759709e-06, + "loss": 0.0004, + "step": 4590 + }, + { + "epoch": 0.02054956689554119, + "grad_norm": 0.00012307892029639333, + "learning_rate": 9.79450433104459e-06, + "loss": 0.0001, + "step": 4600 + }, + { + "epoch": 0.020594239867053235, + "grad_norm": 1.16939194438892e-10, + "learning_rate": 9.79405760132947e-06, + "loss": 0.0483, + "step": 4610 + }, + { + "epoch": 0.020638912838565282, + "grad_norm": 0.00010817075235536322, + "learning_rate": 9.793610871614348e-06, + "loss": 0.0134, + "step": 4620 + }, + { + "epoch": 0.02068358581007733, + "grad_norm": 12.48432445526123, + "learning_rate": 9.793164141899228e-06, + "loss": 0.0017, + "step": 4630 + }, + { + "epoch": 0.020728258781589376, + "grad_norm": 1.0494361646351535e-07, + "learning_rate": 9.792717412184108e-06, + "loss": 0.0, + "step": 4640 + }, + { + "epoch": 0.02077293175310142, + "grad_norm": 4.475537718207079e-09, + "learning_rate": 9.792270682468986e-06, + "loss": 0.0, + "step": 4650 + }, + { + "epoch": 0.020817604724613466, + "grad_norm": 0.003947664052248001, + "learning_rate": 9.791823952753867e-06, + "loss": 0.0007, + "step": 4660 + }, + { + "epoch": 0.020862277696125513, + "grad_norm": 3.053561570265373e-10, + "learning_rate": 9.791377223038747e-06, + "loss": 0.0574, + "step": 4670 + }, + { + "epoch": 0.02090695066763756, + "grad_norm": 0.3782574236392975, + "learning_rate": 9.790930493323625e-06, + "loss": 0.0002, + "step": 4680 + }, + { + "epoch": 0.020951623639149604, + "grad_norm": 0.04846750944852829, + "learning_rate": 9.790483763608505e-06, + "loss": 0.6375, + "step": 4690 + }, + { + "epoch": 0.02099629661066165, + "grad_norm": 0.0004883208894170821, + "learning_rate": 9.790037033893385e-06, + "loss": 0.0, + "step": 4700 + }, + { + "epoch": 0.021040969582173698, + "grad_norm": 7.283672363161964e-13, + "learning_rate": 9.789590304178264e-06, + "loss": 0.0029, + "step": 4710 + }, + { + "epoch": 0.021085642553685745, + "grad_norm": 5.044914264118461e-09, + "learning_rate": 9.789143574463144e-06, + "loss": 0.0357, + "step": 4720 + }, + { + "epoch": 0.021130315525197788, + "grad_norm": 2.224873202338884e-10, + "learning_rate": 9.788696844748022e-06, + "loss": 0.003, + "step": 4730 + }, + { + "epoch": 0.021174988496709835, + "grad_norm": 1.3932583216205285e-14, + "learning_rate": 9.788250115032903e-06, + "loss": 0.0004, + "step": 4740 + }, + { + "epoch": 0.021219661468221882, + "grad_norm": 9.30095952512977e-13, + "learning_rate": 9.787803385317783e-06, + "loss": 0.0123, + "step": 4750 + }, + { + "epoch": 0.02126433443973393, + "grad_norm": 0.00033034654916264117, + "learning_rate": 9.787356655602661e-06, + "loss": 0.0, + "step": 4760 + }, + { + "epoch": 0.021309007411245973, + "grad_norm": 4.3573920513462205e-12, + "learning_rate": 9.786909925887541e-06, + "loss": 0.0, + "step": 4770 + }, + { + "epoch": 0.02135368038275802, + "grad_norm": 0.025035222992300987, + "learning_rate": 9.786463196172421e-06, + "loss": 0.0, + "step": 4780 + }, + { + "epoch": 0.021398353354270067, + "grad_norm": 7.293956105769439e-09, + "learning_rate": 9.7860164664573e-06, + "loss": 0.0001, + "step": 4790 + }, + { + "epoch": 0.021443026325782114, + "grad_norm": 1.0322760035002057e-13, + "learning_rate": 9.78556973674218e-06, + "loss": 0.0, + "step": 4800 + }, + { + "epoch": 0.021487699297294157, + "grad_norm": 7.134031321416601e-13, + "learning_rate": 9.78512300702706e-06, + "loss": 0.0018, + "step": 4810 + }, + { + "epoch": 0.021532372268806204, + "grad_norm": 2.9583046853076667e-05, + "learning_rate": 9.784676277311938e-06, + "loss": 0.0, + "step": 4820 + }, + { + "epoch": 0.02157704524031825, + "grad_norm": 1.6715544226553192e-10, + "learning_rate": 9.784229547596819e-06, + "loss": 0.1215, + "step": 4830 + }, + { + "epoch": 0.021621718211830298, + "grad_norm": 0.0013644276186823845, + "learning_rate": 9.783782817881697e-06, + "loss": 0.0001, + "step": 4840 + }, + { + "epoch": 0.02166639118334234, + "grad_norm": 1.2130669801990734e-09, + "learning_rate": 9.783336088166577e-06, + "loss": 0.0018, + "step": 4850 + }, + { + "epoch": 0.02171106415485439, + "grad_norm": 1.6507818445532507e-09, + "learning_rate": 9.782889358451457e-06, + "loss": 0.0116, + "step": 4860 + }, + { + "epoch": 0.021755737126366435, + "grad_norm": 8.015909602754334e-10, + "learning_rate": 9.782442628736336e-06, + "loss": 0.0001, + "step": 4870 + }, + { + "epoch": 0.02180041009787848, + "grad_norm": 0.0021370230242609978, + "learning_rate": 9.781995899021216e-06, + "loss": 0.0002, + "step": 4880 + }, + { + "epoch": 0.021845083069390526, + "grad_norm": 1.9856920590588658e-16, + "learning_rate": 9.781549169306096e-06, + "loss": 0.0, + "step": 4890 + }, + { + "epoch": 0.021889756040902573, + "grad_norm": 7.834253807747354e-14, + "learning_rate": 9.781102439590974e-06, + "loss": 0.0, + "step": 4900 + }, + { + "epoch": 0.02193442901241462, + "grad_norm": 1.1786222842147254e-07, + "learning_rate": 9.780655709875854e-06, + "loss": 0.0005, + "step": 4910 + }, + { + "epoch": 0.021979101983926663, + "grad_norm": 3.896390199661255, + "learning_rate": 9.780208980160733e-06, + "loss": 0.0829, + "step": 4920 + }, + { + "epoch": 0.02202377495543871, + "grad_norm": 1.0240219339152645e-08, + "learning_rate": 9.779762250445613e-06, + "loss": 0.0, + "step": 4930 + }, + { + "epoch": 0.022068447926950757, + "grad_norm": 3.104096979456017e-09, + "learning_rate": 9.779315520730493e-06, + "loss": 0.0, + "step": 4940 + }, + { + "epoch": 0.022113120898462804, + "grad_norm": 1.9251933736086357e-06, + "learning_rate": 9.778868791015372e-06, + "loss": 0.0002, + "step": 4950 + }, + { + "epoch": 0.022157793869974848, + "grad_norm": 5.579614025919e-06, + "learning_rate": 9.778422061300252e-06, + "loss": 0.0003, + "step": 4960 + }, + { + "epoch": 0.022202466841486895, + "grad_norm": 0.006300140172243118, + "learning_rate": 9.777975331585132e-06, + "loss": 0.0127, + "step": 4970 + }, + { + "epoch": 0.02224713981299894, + "grad_norm": 1.7765872328823207e-08, + "learning_rate": 9.77752860187001e-06, + "loss": 0.0938, + "step": 4980 + }, + { + "epoch": 0.02229181278451099, + "grad_norm": 7.047427061479539e-06, + "learning_rate": 9.77708187215489e-06, + "loss": 0.0, + "step": 4990 + }, + { + "epoch": 0.022336485756023032, + "grad_norm": 0.0007222782005555928, + "learning_rate": 9.77663514243977e-06, + "loss": 0.0016, + "step": 5000 + }, + { + "epoch": 0.02238115872753508, + "grad_norm": 2.581775269714184e-15, + "learning_rate": 9.776188412724649e-06, + "loss": 0.0226, + "step": 5010 + }, + { + "epoch": 0.022425831699047126, + "grad_norm": 9.7462100256962e-07, + "learning_rate": 9.775741683009529e-06, + "loss": 0.0019, + "step": 5020 + }, + { + "epoch": 0.022470504670559173, + "grad_norm": 0.04423893988132477, + "learning_rate": 9.77529495329441e-06, + "loss": 0.0, + "step": 5030 + }, + { + "epoch": 0.022515177642071216, + "grad_norm": 6.549327736138366e-06, + "learning_rate": 9.77484822357929e-06, + "loss": 0.0, + "step": 5040 + }, + { + "epoch": 0.022559850613583263, + "grad_norm": 4.1592920774213837e-10, + "learning_rate": 9.774401493864168e-06, + "loss": 0.0, + "step": 5050 + }, + { + "epoch": 0.02260452358509531, + "grad_norm": 1.6939921687608717e-10, + "learning_rate": 9.773954764149048e-06, + "loss": 0.0009, + "step": 5060 + }, + { + "epoch": 0.022649196556607357, + "grad_norm": 1.3087286561130895e-06, + "learning_rate": 9.773508034433928e-06, + "loss": 0.0451, + "step": 5070 + }, + { + "epoch": 0.0226938695281194, + "grad_norm": 1.7216597938962686e-11, + "learning_rate": 9.773061304718806e-06, + "loss": 0.0005, + "step": 5080 + }, + { + "epoch": 0.022738542499631448, + "grad_norm": 1.9706143383757535e-09, + "learning_rate": 9.772614575003687e-06, + "loss": 0.0003, + "step": 5090 + }, + { + "epoch": 0.022783215471143495, + "grad_norm": 0.001979758031666279, + "learning_rate": 9.772167845288567e-06, + "loss": 0.0, + "step": 5100 + }, + { + "epoch": 0.02282788844265554, + "grad_norm": 2.8083006782253506e-06, + "learning_rate": 9.771721115573445e-06, + "loss": 0.0003, + "step": 5110 + }, + { + "epoch": 0.022872561414167585, + "grad_norm": 3.5032118717026606e-07, + "learning_rate": 9.771274385858325e-06, + "loss": 0.0005, + "step": 5120 + }, + { + "epoch": 0.022917234385679632, + "grad_norm": 8.553373564978983e-10, + "learning_rate": 9.770827656143205e-06, + "loss": 0.0, + "step": 5130 + }, + { + "epoch": 0.02296190735719168, + "grad_norm": 4.153927328312669e-17, + "learning_rate": 9.770380926428084e-06, + "loss": 0.0007, + "step": 5140 + }, + { + "epoch": 0.023006580328703723, + "grad_norm": 6.216424944249788e-14, + "learning_rate": 9.769934196712964e-06, + "loss": 0.0003, + "step": 5150 + }, + { + "epoch": 0.02305125330021577, + "grad_norm": 0.0009006602340377867, + "learning_rate": 9.769487466997844e-06, + "loss": 0.0, + "step": 5160 + }, + { + "epoch": 0.023095926271727817, + "grad_norm": 0.04137864708900452, + "learning_rate": 9.769040737282722e-06, + "loss": 0.2688, + "step": 5170 + }, + { + "epoch": 0.023140599243239864, + "grad_norm": 4.6671142699215157e-14, + "learning_rate": 9.768594007567603e-06, + "loss": 0.0005, + "step": 5180 + }, + { + "epoch": 0.023185272214751907, + "grad_norm": 0.001031166291795671, + "learning_rate": 9.768147277852483e-06, + "loss": 0.0255, + "step": 5190 + }, + { + "epoch": 0.023229945186263954, + "grad_norm": 5.5269151271249313e-17, + "learning_rate": 9.767700548137361e-06, + "loss": 0.0, + "step": 5200 + }, + { + "epoch": 0.023274618157776, + "grad_norm": 4.2891007585865726e-12, + "learning_rate": 9.767253818422241e-06, + "loss": 0.1061, + "step": 5210 + }, + { + "epoch": 0.023319291129288048, + "grad_norm": 3.626120985700254e-07, + "learning_rate": 9.76680708870712e-06, + "loss": 0.0, + "step": 5220 + }, + { + "epoch": 0.02336396410080009, + "grad_norm": 7.203292540225448e-08, + "learning_rate": 9.766360358992e-06, + "loss": 0.6625, + "step": 5230 + }, + { + "epoch": 0.02340863707231214, + "grad_norm": 4.1581765808373916e-10, + "learning_rate": 9.76591362927688e-06, + "loss": 0.0, + "step": 5240 + }, + { + "epoch": 0.023453310043824185, + "grad_norm": 0.008564828895032406, + "learning_rate": 9.765466899561758e-06, + "loss": 0.0062, + "step": 5250 + }, + { + "epoch": 0.023497983015336232, + "grad_norm": 0.0008313873549923301, + "learning_rate": 9.765020169846639e-06, + "loss": 0.0014, + "step": 5260 + }, + { + "epoch": 0.023542655986848276, + "grad_norm": 2.6287691071047448e-05, + "learning_rate": 9.764573440131519e-06, + "loss": 0.0, + "step": 5270 + }, + { + "epoch": 0.023587328958360323, + "grad_norm": 5.458162366522856e-09, + "learning_rate": 9.764126710416397e-06, + "loss": 0.0034, + "step": 5280 + }, + { + "epoch": 0.02363200192987237, + "grad_norm": 2.566042810094732e-13, + "learning_rate": 9.763679980701277e-06, + "loss": 0.0, + "step": 5290 + }, + { + "epoch": 0.023676674901384417, + "grad_norm": 1.3405983168013336e-07, + "learning_rate": 9.763233250986157e-06, + "loss": 0.0014, + "step": 5300 + }, + { + "epoch": 0.02372134787289646, + "grad_norm": 2.0920136734048356e-09, + "learning_rate": 9.762786521271036e-06, + "loss": 0.0079, + "step": 5310 + }, + { + "epoch": 0.023766020844408507, + "grad_norm": 2.4717706992305466e-08, + "learning_rate": 9.762339791555916e-06, + "loss": 0.0, + "step": 5320 + }, + { + "epoch": 0.023810693815920554, + "grad_norm": 1.62759391741929e-07, + "learning_rate": 9.761893061840794e-06, + "loss": 0.0, + "step": 5330 + }, + { + "epoch": 0.0238553667874326, + "grad_norm": 0.00256610126234591, + "learning_rate": 9.761446332125674e-06, + "loss": 0.0, + "step": 5340 + }, + { + "epoch": 0.023900039758944645, + "grad_norm": 8.087974129011855e-05, + "learning_rate": 9.760999602410555e-06, + "loss": 0.0, + "step": 5350 + }, + { + "epoch": 0.02394471273045669, + "grad_norm": 1.1636046672643813e-13, + "learning_rate": 9.760552872695433e-06, + "loss": 0.0002, + "step": 5360 + }, + { + "epoch": 0.02398938570196874, + "grad_norm": 4.7322808865990496e-11, + "learning_rate": 9.760106142980313e-06, + "loss": 0.0003, + "step": 5370 + }, + { + "epoch": 0.024034058673480786, + "grad_norm": 3.8606845009780955e-07, + "learning_rate": 9.759659413265193e-06, + "loss": 0.0016, + "step": 5380 + }, + { + "epoch": 0.02407873164499283, + "grad_norm": 0.017653707414865494, + "learning_rate": 9.759212683550072e-06, + "loss": 0.0, + "step": 5390 + }, + { + "epoch": 0.024123404616504876, + "grad_norm": 1.5549872841802426e-05, + "learning_rate": 9.758765953834952e-06, + "loss": 0.0, + "step": 5400 + }, + { + "epoch": 0.024168077588016923, + "grad_norm": 0.2490101158618927, + "learning_rate": 9.758319224119832e-06, + "loss": 0.0615, + "step": 5410 + }, + { + "epoch": 0.024212750559528966, + "grad_norm": 9.94989818536851e-07, + "learning_rate": 9.75787249440471e-06, + "loss": 0.0, + "step": 5420 + }, + { + "epoch": 0.024257423531041013, + "grad_norm": 0.259600430727005, + "learning_rate": 9.75742576468959e-06, + "loss": 0.0575, + "step": 5430 + }, + { + "epoch": 0.02430209650255306, + "grad_norm": 5.348607667876593e-10, + "learning_rate": 9.756979034974469e-06, + "loss": 0.0, + "step": 5440 + }, + { + "epoch": 0.024346769474065107, + "grad_norm": 0.05933636799454689, + "learning_rate": 9.756532305259349e-06, + "loss": 0.0, + "step": 5450 + }, + { + "epoch": 0.02439144244557715, + "grad_norm": 0.00012589334801305085, + "learning_rate": 9.75608557554423e-06, + "loss": 0.0002, + "step": 5460 + }, + { + "epoch": 0.024436115417089198, + "grad_norm": 1.8503410609271853e-12, + "learning_rate": 9.755638845829108e-06, + "loss": 0.0, + "step": 5470 + }, + { + "epoch": 0.024480788388601245, + "grad_norm": 0.014644503593444824, + "learning_rate": 9.755192116113988e-06, + "loss": 0.0014, + "step": 5480 + }, + { + "epoch": 0.024525461360113292, + "grad_norm": 0.04739692062139511, + "learning_rate": 9.754745386398868e-06, + "loss": 0.0, + "step": 5490 + }, + { + "epoch": 0.024570134331625335, + "grad_norm": 1.7644185845711036e-06, + "learning_rate": 9.754298656683748e-06, + "loss": 0.0, + "step": 5500 + }, + { + "epoch": 0.024614807303137382, + "grad_norm": 0.0011657369323074818, + "learning_rate": 9.753851926968626e-06, + "loss": 0.0, + "step": 5510 + }, + { + "epoch": 0.02465948027464943, + "grad_norm": 7.385870874067608e-18, + "learning_rate": 9.753405197253507e-06, + "loss": 0.0, + "step": 5520 + }, + { + "epoch": 0.024704153246161476, + "grad_norm": 9.445318305267847e-09, + "learning_rate": 9.752958467538387e-06, + "loss": 0.4907, + "step": 5530 + }, + { + "epoch": 0.02474882621767352, + "grad_norm": 2.0039067627486418e-11, + "learning_rate": 9.752511737823265e-06, + "loss": 0.0001, + "step": 5540 + }, + { + "epoch": 0.024793499189185567, + "grad_norm": 1.243283948015872e-10, + "learning_rate": 9.752065008108145e-06, + "loss": 0.0948, + "step": 5550 + }, + { + "epoch": 0.024838172160697614, + "grad_norm": 0.0030552446842193604, + "learning_rate": 9.751618278393025e-06, + "loss": 0.0, + "step": 5560 + }, + { + "epoch": 0.02488284513220966, + "grad_norm": 21.20594596862793, + "learning_rate": 9.751171548677904e-06, + "loss": 0.0193, + "step": 5570 + }, + { + "epoch": 0.024927518103721704, + "grad_norm": 7.996323221626245e-12, + "learning_rate": 9.750724818962784e-06, + "loss": 0.0002, + "step": 5580 + }, + { + "epoch": 0.02497219107523375, + "grad_norm": 2.2440810099055852e-08, + "learning_rate": 9.750278089247664e-06, + "loss": 0.0, + "step": 5590 + }, + { + "epoch": 0.025016864046745798, + "grad_norm": 8.148406777763739e-05, + "learning_rate": 9.749831359532544e-06, + "loss": 0.0119, + "step": 5600 + }, + { + "epoch": 0.025061537018257845, + "grad_norm": 4.539280843568472e-12, + "learning_rate": 9.749384629817423e-06, + "loss": 0.0241, + "step": 5610 + }, + { + "epoch": 0.02510620998976989, + "grad_norm": 2.1162943397712297e-08, + "learning_rate": 9.748937900102303e-06, + "loss": 0.0, + "step": 5620 + }, + { + "epoch": 0.025150882961281935, + "grad_norm": 0.049990683794021606, + "learning_rate": 9.748491170387181e-06, + "loss": 0.0973, + "step": 5630 + }, + { + "epoch": 0.025195555932793982, + "grad_norm": 1.8373774723912106e-11, + "learning_rate": 9.748044440672061e-06, + "loss": 0.0001, + "step": 5640 + }, + { + "epoch": 0.02524022890430603, + "grad_norm": 6.4786890749812075e-18, + "learning_rate": 9.747597710956941e-06, + "loss": 0.0, + "step": 5650 + }, + { + "epoch": 0.025284901875818073, + "grad_norm": 6.861407086944382e-08, + "learning_rate": 9.74715098124182e-06, + "loss": 0.0, + "step": 5660 + }, + { + "epoch": 0.02532957484733012, + "grad_norm": 0.2536761164665222, + "learning_rate": 9.7467042515267e-06, + "loss": 0.0001, + "step": 5670 + }, + { + "epoch": 0.025374247818842167, + "grad_norm": 6.602550983428955, + "learning_rate": 9.74625752181158e-06, + "loss": 0.0078, + "step": 5680 + }, + { + "epoch": 0.02541892079035421, + "grad_norm": 1.3618071115217845e-09, + "learning_rate": 9.745810792096459e-06, + "loss": 0.0001, + "step": 5690 + }, + { + "epoch": 0.025463593761866257, + "grad_norm": 3.1000786293589044e-07, + "learning_rate": 9.745364062381339e-06, + "loss": 0.0, + "step": 5700 + }, + { + "epoch": 0.025508266733378304, + "grad_norm": 1.2080630540367565e-06, + "learning_rate": 9.744917332666219e-06, + "loss": 0.5865, + "step": 5710 + }, + { + "epoch": 0.02555293970489035, + "grad_norm": 1.0851125807675999e-05, + "learning_rate": 9.744470602951097e-06, + "loss": 0.0005, + "step": 5720 + }, + { + "epoch": 0.025597612676402395, + "grad_norm": 1.670174788159784e-05, + "learning_rate": 9.744023873235977e-06, + "loss": 0.0001, + "step": 5730 + }, + { + "epoch": 0.02564228564791444, + "grad_norm": 1.4780018098292658e-08, + "learning_rate": 9.743577143520856e-06, + "loss": 0.0, + "step": 5740 + }, + { + "epoch": 0.02568695861942649, + "grad_norm": 2.2874927646521215e-11, + "learning_rate": 9.743130413805736e-06, + "loss": 0.0089, + "step": 5750 + }, + { + "epoch": 0.025731631590938536, + "grad_norm": 1.2863636129623046e-06, + "learning_rate": 9.742683684090616e-06, + "loss": 0.0, + "step": 5760 + }, + { + "epoch": 0.02577630456245058, + "grad_norm": 9.311942267231643e-05, + "learning_rate": 9.742236954375494e-06, + "loss": 0.0001, + "step": 5770 + }, + { + "epoch": 0.025820977533962626, + "grad_norm": 1.664363367126498e-07, + "learning_rate": 9.741790224660375e-06, + "loss": 0.2125, + "step": 5780 + }, + { + "epoch": 0.025865650505474673, + "grad_norm": 1.3566239798201707e-10, + "learning_rate": 9.741343494945255e-06, + "loss": 0.7625, + "step": 5790 + }, + { + "epoch": 0.02591032347698672, + "grad_norm": 1.458537894905021e-07, + "learning_rate": 9.740896765230133e-06, + "loss": 0.0, + "step": 5800 + }, + { + "epoch": 0.025954996448498763, + "grad_norm": 5.571019937633537e-05, + "learning_rate": 9.740450035515013e-06, + "loss": 0.0002, + "step": 5810 + }, + { + "epoch": 0.02599966942001081, + "grad_norm": 5.8732325669552665e-09, + "learning_rate": 9.740003305799892e-06, + "loss": 0.0523, + "step": 5820 + }, + { + "epoch": 0.026044342391522857, + "grad_norm": 7.986272976268083e-07, + "learning_rate": 9.739556576084772e-06, + "loss": 0.0134, + "step": 5830 + }, + { + "epoch": 0.026089015363034904, + "grad_norm": 2.1820655504711794e-08, + "learning_rate": 9.739109846369652e-06, + "loss": 0.0011, + "step": 5840 + }, + { + "epoch": 0.026133688334546948, + "grad_norm": 0.3102538585662842, + "learning_rate": 9.73866311665453e-06, + "loss": 0.0001, + "step": 5850 + }, + { + "epoch": 0.026178361306058995, + "grad_norm": 0.00025683510466478765, + "learning_rate": 9.73821638693941e-06, + "loss": 0.15, + "step": 5860 + }, + { + "epoch": 0.026223034277571042, + "grad_norm": 1.50867836055113e-05, + "learning_rate": 9.73776965722429e-06, + "loss": 0.0, + "step": 5870 + }, + { + "epoch": 0.02626770724908309, + "grad_norm": 0.005404121708124876, + "learning_rate": 9.737322927509169e-06, + "loss": 0.0002, + "step": 5880 + }, + { + "epoch": 0.026312380220595132, + "grad_norm": 0.002749478444457054, + "learning_rate": 9.73687619779405e-06, + "loss": 0.0, + "step": 5890 + }, + { + "epoch": 0.02635705319210718, + "grad_norm": 6.164871592773125e-05, + "learning_rate": 9.73642946807893e-06, + "loss": 0.0, + "step": 5900 + }, + { + "epoch": 0.026401726163619226, + "grad_norm": 0.1678089052438736, + "learning_rate": 9.735982738363808e-06, + "loss": 0.0, + "step": 5910 + }, + { + "epoch": 0.026446399135131273, + "grad_norm": 3.7444728206992295e-08, + "learning_rate": 9.735536008648688e-06, + "loss": 0.0143, + "step": 5920 + }, + { + "epoch": 0.026491072106643317, + "grad_norm": 0.00017223106988240033, + "learning_rate": 9.735089278933566e-06, + "loss": 0.0, + "step": 5930 + }, + { + "epoch": 0.026535745078155364, + "grad_norm": 1.9701846820652236e-09, + "learning_rate": 9.734642549218446e-06, + "loss": 0.0, + "step": 5940 + }, + { + "epoch": 0.02658041804966741, + "grad_norm": 0.010152035392820835, + "learning_rate": 9.734195819503327e-06, + "loss": 0.2016, + "step": 5950 + }, + { + "epoch": 0.026625091021179454, + "grad_norm": 3.5874224977305857e-06, + "learning_rate": 9.733749089788207e-06, + "loss": 0.035, + "step": 5960 + }, + { + "epoch": 0.0266697639926915, + "grad_norm": 4.681384233983909e-15, + "learning_rate": 9.733302360073085e-06, + "loss": 0.0, + "step": 5970 + }, + { + "epoch": 0.026714436964203548, + "grad_norm": 1.9966948912042426e-06, + "learning_rate": 9.732855630357965e-06, + "loss": 0.0002, + "step": 5980 + }, + { + "epoch": 0.026759109935715595, + "grad_norm": 4.600369265972404e-06, + "learning_rate": 9.732408900642845e-06, + "loss": 0.0, + "step": 5990 + }, + { + "epoch": 0.02680378290722764, + "grad_norm": 4.245321179041639e-06, + "learning_rate": 9.731962170927724e-06, + "loss": 0.0693, + "step": 6000 + }, + { + "epoch": 0.026848455878739685, + "grad_norm": 6.09661583439447e-06, + "learning_rate": 9.731515441212604e-06, + "loss": 0.0049, + "step": 6010 + }, + { + "epoch": 0.026893128850251732, + "grad_norm": 1.7047414075932465e-06, + "learning_rate": 9.731068711497484e-06, + "loss": 0.0004, + "step": 6020 + }, + { + "epoch": 0.02693780182176378, + "grad_norm": 4.4253320452547484e-10, + "learning_rate": 9.730621981782362e-06, + "loss": 0.9938, + "step": 6030 + }, + { + "epoch": 0.026982474793275823, + "grad_norm": 0.0007237186655402184, + "learning_rate": 9.730175252067243e-06, + "loss": 0.0, + "step": 6040 + }, + { + "epoch": 0.02702714776478787, + "grad_norm": 0.001562108751386404, + "learning_rate": 9.729728522352123e-06, + "loss": 0.0004, + "step": 6050 + }, + { + "epoch": 0.027071820736299917, + "grad_norm": 1.0216683769676796e-15, + "learning_rate": 9.729281792637003e-06, + "loss": 0.0574, + "step": 6060 + }, + { + "epoch": 0.027116493707811964, + "grad_norm": 1.391899651093098e-10, + "learning_rate": 9.728835062921881e-06, + "loss": 0.0026, + "step": 6070 + }, + { + "epoch": 0.027161166679324007, + "grad_norm": 0.054726582020521164, + "learning_rate": 9.728388333206761e-06, + "loss": 0.0014, + "step": 6080 + }, + { + "epoch": 0.027205839650836054, + "grad_norm": 433.8678283691406, + "learning_rate": 9.727941603491642e-06, + "loss": 0.2813, + "step": 6090 + }, + { + "epoch": 0.0272505126223481, + "grad_norm": 0.007730548270046711, + "learning_rate": 9.72749487377652e-06, + "loss": 0.0, + "step": 6100 + }, + { + "epoch": 0.027295185593860148, + "grad_norm": 2.500319016499747e-13, + "learning_rate": 9.7270481440614e-06, + "loss": 0.0, + "step": 6110 + }, + { + "epoch": 0.02733985856537219, + "grad_norm": 3.923983967618616e-12, + "learning_rate": 9.726601414346279e-06, + "loss": 0.0982, + "step": 6120 + }, + { + "epoch": 0.02738453153688424, + "grad_norm": 3.476937490631826e-05, + "learning_rate": 9.726154684631159e-06, + "loss": 0.0114, + "step": 6130 + }, + { + "epoch": 0.027429204508396286, + "grad_norm": 426.7615051269531, + "learning_rate": 9.725707954916039e-06, + "loss": 0.1055, + "step": 6140 + }, + { + "epoch": 0.027473877479908333, + "grad_norm": 1.0694459056322689e-15, + "learning_rate": 9.725261225200917e-06, + "loss": 0.0691, + "step": 6150 + }, + { + "epoch": 0.027518550451420376, + "grad_norm": 2.8464754997514774e-09, + "learning_rate": 9.724814495485797e-06, + "loss": 0.0004, + "step": 6160 + }, + { + "epoch": 0.027563223422932423, + "grad_norm": 1.008872207997058e-09, + "learning_rate": 9.724367765770677e-06, + "loss": 0.0006, + "step": 6170 + }, + { + "epoch": 0.02760789639444447, + "grad_norm": 3.2404907750194223e-12, + "learning_rate": 9.723921036055556e-06, + "loss": 0.0003, + "step": 6180 + }, + { + "epoch": 0.027652569365956517, + "grad_norm": 4.715409052807873e-12, + "learning_rate": 9.723474306340436e-06, + "loss": 0.0222, + "step": 6190 + }, + { + "epoch": 0.02769724233746856, + "grad_norm": 6.288278816928006e-13, + "learning_rate": 9.723027576625316e-06, + "loss": 0.0003, + "step": 6200 + }, + { + "epoch": 0.027741915308980607, + "grad_norm": 1.877140675787814e-06, + "learning_rate": 9.722580846910195e-06, + "loss": 0.0001, + "step": 6210 + }, + { + "epoch": 0.027786588280492654, + "grad_norm": 190.27633666992188, + "learning_rate": 9.722134117195075e-06, + "loss": 0.0348, + "step": 6220 + }, + { + "epoch": 0.0278312612520047, + "grad_norm": 2.001493015768574e-07, + "learning_rate": 9.721687387479953e-06, + "loss": 0.0, + "step": 6230 + }, + { + "epoch": 0.027875934223516745, + "grad_norm": 5.368600355937225e-11, + "learning_rate": 9.721240657764833e-06, + "loss": 0.0209, + "step": 6240 + }, + { + "epoch": 0.027920607195028792, + "grad_norm": 0.00173946691211313, + "learning_rate": 9.720793928049713e-06, + "loss": 0.0003, + "step": 6250 + }, + { + "epoch": 0.02796528016654084, + "grad_norm": 0.006585344672203064, + "learning_rate": 9.720347198334592e-06, + "loss": 0.5363, + "step": 6260 + }, + { + "epoch": 0.028009953138052882, + "grad_norm": 0.013094873167574406, + "learning_rate": 9.719900468619472e-06, + "loss": 0.0, + "step": 6270 + }, + { + "epoch": 0.02805462610956493, + "grad_norm": 0.002456572838127613, + "learning_rate": 9.719453738904352e-06, + "loss": 0.0, + "step": 6280 + }, + { + "epoch": 0.028099299081076976, + "grad_norm": 1.4949612658909928e-09, + "learning_rate": 9.71900700918923e-06, + "loss": 0.615, + "step": 6290 + }, + { + "epoch": 0.028143972052589023, + "grad_norm": 6.688063791671084e-08, + "learning_rate": 9.71856027947411e-06, + "loss": 0.0006, + "step": 6300 + }, + { + "epoch": 0.028188645024101067, + "grad_norm": 0.0004353579424787313, + "learning_rate": 9.718113549758989e-06, + "loss": 0.0001, + "step": 6310 + }, + { + "epoch": 0.028233317995613114, + "grad_norm": 0.00210509798489511, + "learning_rate": 9.71766682004387e-06, + "loss": 0.0, + "step": 6320 + }, + { + "epoch": 0.02827799096712516, + "grad_norm": 9.322425285063218e-06, + "learning_rate": 9.71722009032875e-06, + "loss": 0.0, + "step": 6330 + }, + { + "epoch": 0.028322663938637208, + "grad_norm": 5.350936044123955e-05, + "learning_rate": 9.716773360613628e-06, + "loss": 0.0, + "step": 6340 + }, + { + "epoch": 0.02836733691014925, + "grad_norm": 3.720874630630533e-08, + "learning_rate": 9.716326630898508e-06, + "loss": 0.0, + "step": 6350 + }, + { + "epoch": 0.028412009881661298, + "grad_norm": 33.51473617553711, + "learning_rate": 9.715879901183388e-06, + "loss": 0.0089, + "step": 6360 + }, + { + "epoch": 0.028456682853173345, + "grad_norm": 7.0806942531476125e-09, + "learning_rate": 9.715433171468266e-06, + "loss": 0.0003, + "step": 6370 + }, + { + "epoch": 0.028501355824685392, + "grad_norm": 0.007248702924698591, + "learning_rate": 9.714986441753147e-06, + "loss": 0.0, + "step": 6380 + }, + { + "epoch": 0.028546028796197435, + "grad_norm": 1.2337777910431669e-08, + "learning_rate": 9.714539712038027e-06, + "loss": 0.0, + "step": 6390 + }, + { + "epoch": 0.028590701767709482, + "grad_norm": 1.512154388250836e-13, + "learning_rate": 9.714092982322905e-06, + "loss": 0.0, + "step": 6400 + }, + { + "epoch": 0.02863537473922153, + "grad_norm": 4.472500801086426, + "learning_rate": 9.713646252607785e-06, + "loss": 0.2022, + "step": 6410 + }, + { + "epoch": 0.028680047710733576, + "grad_norm": 34.18728256225586, + "learning_rate": 9.713199522892665e-06, + "loss": 0.0081, + "step": 6420 + }, + { + "epoch": 0.02872472068224562, + "grad_norm": 0.18630315363407135, + "learning_rate": 9.712752793177544e-06, + "loss": 0.0565, + "step": 6430 + }, + { + "epoch": 0.028769393653757667, + "grad_norm": 2.7868030071258545, + "learning_rate": 9.712306063462424e-06, + "loss": 0.0005, + "step": 6440 + }, + { + "epoch": 0.028814066625269714, + "grad_norm": 6.858974899159875e-08, + "learning_rate": 9.711859333747304e-06, + "loss": 0.0, + "step": 6450 + }, + { + "epoch": 0.02885873959678176, + "grad_norm": 1.4235383434101773e-13, + "learning_rate": 9.711412604032182e-06, + "loss": 0.0575, + "step": 6460 + }, + { + "epoch": 0.028903412568293804, + "grad_norm": 1.4090331124094746e-09, + "learning_rate": 9.710965874317063e-06, + "loss": 0.0, + "step": 6470 + }, + { + "epoch": 0.02894808553980585, + "grad_norm": 2.0118551269376894e-10, + "learning_rate": 9.710519144601943e-06, + "loss": 1.243, + "step": 6480 + }, + { + "epoch": 0.028992758511317898, + "grad_norm": 5.944572091520772e-13, + "learning_rate": 9.710072414886821e-06, + "loss": 0.0012, + "step": 6490 + }, + { + "epoch": 0.029037431482829945, + "grad_norm": 1.8894002681846267e-11, + "learning_rate": 9.709625685171701e-06, + "loss": 0.0, + "step": 6500 + }, + { + "epoch": 0.02908210445434199, + "grad_norm": 3.621100294615154e-11, + "learning_rate": 9.709178955456581e-06, + "loss": 0.0004, + "step": 6510 + }, + { + "epoch": 0.029126777425854036, + "grad_norm": 1.755638117379751e-19, + "learning_rate": 9.708732225741462e-06, + "loss": 0.044, + "step": 6520 + }, + { + "epoch": 0.029171450397366083, + "grad_norm": 2.4194950043465724e-08, + "learning_rate": 9.70828549602634e-06, + "loss": 0.0, + "step": 6530 + }, + { + "epoch": 0.029216123368878126, + "grad_norm": 1.1459812299108307e-07, + "learning_rate": 9.70783876631122e-06, + "loss": 0.0001, + "step": 6540 + }, + { + "epoch": 0.029260796340390173, + "grad_norm": 0.02885868027806282, + "learning_rate": 9.7073920365961e-06, + "loss": 0.0023, + "step": 6550 + }, + { + "epoch": 0.02930546931190222, + "grad_norm": 3.1013897006104685e-10, + "learning_rate": 9.706945306880979e-06, + "loss": 0.0, + "step": 6560 + }, + { + "epoch": 0.029350142283414267, + "grad_norm": 1.3184380254216421e-09, + "learning_rate": 9.706498577165859e-06, + "loss": 0.2098, + "step": 6570 + }, + { + "epoch": 0.02939481525492631, + "grad_norm": 0.0006379721453413367, + "learning_rate": 9.706051847450739e-06, + "loss": 0.2688, + "step": 6580 + }, + { + "epoch": 0.029439488226438357, + "grad_norm": 2.612810801050358e-10, + "learning_rate": 9.705605117735617e-06, + "loss": 0.0523, + "step": 6590 + }, + { + "epoch": 0.029484161197950404, + "grad_norm": 0.006585241761058569, + "learning_rate": 9.705158388020497e-06, + "loss": 0.0, + "step": 6600 + }, + { + "epoch": 0.02952883416946245, + "grad_norm": 3.1172658054856583e-05, + "learning_rate": 9.704711658305376e-06, + "loss": 0.0001, + "step": 6610 + }, + { + "epoch": 0.029573507140974495, + "grad_norm": 1.5668823877268423e-11, + "learning_rate": 9.704264928590256e-06, + "loss": 0.0014, + "step": 6620 + }, + { + "epoch": 0.029618180112486542, + "grad_norm": 9.767539449967444e-05, + "learning_rate": 9.703818198875136e-06, + "loss": 0.0113, + "step": 6630 + }, + { + "epoch": 0.02966285308399859, + "grad_norm": 1.4886247299727984e-05, + "learning_rate": 9.703371469160015e-06, + "loss": 0.0001, + "step": 6640 + }, + { + "epoch": 0.029707526055510636, + "grad_norm": 8.713771349443122e-13, + "learning_rate": 9.702924739444895e-06, + "loss": 0.0049, + "step": 6650 + }, + { + "epoch": 0.02975219902702268, + "grad_norm": 0.00010709473281167448, + "learning_rate": 9.702478009729775e-06, + "loss": 0.071, + "step": 6660 + }, + { + "epoch": 0.029796871998534726, + "grad_norm": 3.314019281219771e-08, + "learning_rate": 9.702031280014653e-06, + "loss": 0.0, + "step": 6670 + }, + { + "epoch": 0.029841544970046773, + "grad_norm": 3.9084474545703074e-10, + "learning_rate": 9.701584550299533e-06, + "loss": 0.0062, + "step": 6680 + }, + { + "epoch": 0.02988621794155882, + "grad_norm": 0.011260163970291615, + "learning_rate": 9.701137820584413e-06, + "loss": 0.0026, + "step": 6690 + }, + { + "epoch": 0.029930890913070864, + "grad_norm": 5.194561936150421e-07, + "learning_rate": 9.700691090869292e-06, + "loss": 0.0, + "step": 6700 + }, + { + "epoch": 0.02997556388458291, + "grad_norm": 2.75262749482863e-07, + "learning_rate": 9.700244361154172e-06, + "loss": 0.0, + "step": 6710 + }, + { + "epoch": 0.030020236856094958, + "grad_norm": 3.266507701482624e-05, + "learning_rate": 9.69979763143905e-06, + "loss": 0.0, + "step": 6720 + }, + { + "epoch": 0.030064909827607005, + "grad_norm": 0.0007317392737604678, + "learning_rate": 9.69935090172393e-06, + "loss": 0.0, + "step": 6730 + }, + { + "epoch": 0.030109582799119048, + "grad_norm": 7.135286494985849e-08, + "learning_rate": 9.69890417200881e-06, + "loss": 0.0, + "step": 6740 + }, + { + "epoch": 0.030154255770631095, + "grad_norm": 0.0005465031717903912, + "learning_rate": 9.69845744229369e-06, + "loss": 0.0128, + "step": 6750 + }, + { + "epoch": 0.030198928742143142, + "grad_norm": 0.10797858983278275, + "learning_rate": 9.69801071257857e-06, + "loss": 0.45, + "step": 6760 + }, + { + "epoch": 0.03024360171365519, + "grad_norm": 3.4820845127105713, + "learning_rate": 9.69756398286345e-06, + "loss": 0.002, + "step": 6770 + }, + { + "epoch": 0.030288274685167232, + "grad_norm": 2.2667615481763337e-19, + "learning_rate": 9.697117253148328e-06, + "loss": 0.0005, + "step": 6780 + }, + { + "epoch": 0.03033294765667928, + "grad_norm": 0.0035033426247537136, + "learning_rate": 9.696670523433208e-06, + "loss": 0.0049, + "step": 6790 + }, + { + "epoch": 0.030377620628191326, + "grad_norm": 2.2869908633538216e-08, + "learning_rate": 9.696223793718088e-06, + "loss": 0.0018, + "step": 6800 + }, + { + "epoch": 0.03042229359970337, + "grad_norm": 2.1959427613182925e-05, + "learning_rate": 9.695777064002967e-06, + "loss": 0.001, + "step": 6810 + }, + { + "epoch": 0.030466966571215417, + "grad_norm": 3.3809324229139293e-09, + "learning_rate": 9.695330334287847e-06, + "loss": 0.7331, + "step": 6820 + }, + { + "epoch": 0.030511639542727464, + "grad_norm": 0.0002111146313836798, + "learning_rate": 9.694883604572725e-06, + "loss": 0.0, + "step": 6830 + }, + { + "epoch": 0.03055631251423951, + "grad_norm": 6.2108989826170635e-12, + "learning_rate": 9.694436874857605e-06, + "loss": 0.0038, + "step": 6840 + }, + { + "epoch": 0.030600985485751554, + "grad_norm": 1.232419729232788, + "learning_rate": 9.693990145142485e-06, + "loss": 0.4186, + "step": 6850 + }, + { + "epoch": 0.0306456584572636, + "grad_norm": 0.03285527974367142, + "learning_rate": 9.693543415427364e-06, + "loss": 0.0, + "step": 6860 + }, + { + "epoch": 0.030690331428775648, + "grad_norm": 1.7257926858416095e-12, + "learning_rate": 9.693096685712244e-06, + "loss": 0.0, + "step": 6870 + }, + { + "epoch": 0.030735004400287695, + "grad_norm": 3.952288596766919e-14, + "learning_rate": 9.692649955997124e-06, + "loss": 0.0, + "step": 6880 + }, + { + "epoch": 0.03077967737179974, + "grad_norm": 1.4386819372543685e-13, + "learning_rate": 9.692203226282002e-06, + "loss": 0.0009, + "step": 6890 + }, + { + "epoch": 0.030824350343311786, + "grad_norm": 2.87882137298584, + "learning_rate": 9.691756496566883e-06, + "loss": 0.1736, + "step": 6900 + }, + { + "epoch": 0.030869023314823833, + "grad_norm": 2.567782653173367e-10, + "learning_rate": 9.691309766851763e-06, + "loss": 0.0, + "step": 6910 + }, + { + "epoch": 0.03091369628633588, + "grad_norm": 8.343608470795516e-09, + "learning_rate": 9.690863037136641e-06, + "loss": 0.0018, + "step": 6920 + }, + { + "epoch": 0.030958369257847923, + "grad_norm": 4.9662394523620605, + "learning_rate": 9.690416307421521e-06, + "loss": 0.0011, + "step": 6930 + }, + { + "epoch": 0.03100304222935997, + "grad_norm": 0.02405603602528572, + "learning_rate": 9.689969577706401e-06, + "loss": 0.0001, + "step": 6940 + }, + { + "epoch": 0.031047715200872017, + "grad_norm": 9.02886768017197e-07, + "learning_rate": 9.68952284799128e-06, + "loss": 0.0, + "step": 6950 + }, + { + "epoch": 0.031092388172384064, + "grad_norm": 655.1307983398438, + "learning_rate": 9.68907611827616e-06, + "loss": 0.1703, + "step": 6960 + }, + { + "epoch": 0.031137061143896107, + "grad_norm": 7.539566259661431e-16, + "learning_rate": 9.68862938856104e-06, + "loss": 0.0001, + "step": 6970 + }, + { + "epoch": 0.031181734115408154, + "grad_norm": 1.45284036537352e-11, + "learning_rate": 9.68818265884592e-06, + "loss": 0.0113, + "step": 6980 + }, + { + "epoch": 0.0312264070869202, + "grad_norm": 0.020145660266280174, + "learning_rate": 9.687735929130799e-06, + "loss": 1.7126, + "step": 6990 + }, + { + "epoch": 0.031271080058432245, + "grad_norm": 4.409622000878244e-09, + "learning_rate": 9.687289199415679e-06, + "loss": 0.0, + "step": 7000 + }, + { + "epoch": 0.03131575302994429, + "grad_norm": 2.2381243525160244e-06, + "learning_rate": 9.686842469700559e-06, + "loss": 0.0, + "step": 7010 + }, + { + "epoch": 0.03136042600145634, + "grad_norm": 8.434130336354428e-07, + "learning_rate": 9.686395739985437e-06, + "loss": 0.0027, + "step": 7020 + }, + { + "epoch": 0.031405098972968386, + "grad_norm": 6.848535627490548e-11, + "learning_rate": 9.685949010270317e-06, + "loss": 0.0023, + "step": 7030 + }, + { + "epoch": 0.03144977194448043, + "grad_norm": 0.00046889035729691386, + "learning_rate": 9.685502280555198e-06, + "loss": 0.0, + "step": 7040 + }, + { + "epoch": 0.03149444491599248, + "grad_norm": 4.915896655672755e-14, + "learning_rate": 9.685055550840076e-06, + "loss": 0.0, + "step": 7050 + }, + { + "epoch": 0.03153911788750452, + "grad_norm": 0.000905799912288785, + "learning_rate": 9.684608821124956e-06, + "loss": 0.1114, + "step": 7060 + }, + { + "epoch": 0.03158379085901657, + "grad_norm": 8.679131507873535, + "learning_rate": 9.684162091409836e-06, + "loss": 0.0016, + "step": 7070 + }, + { + "epoch": 0.031628463830528614, + "grad_norm": 0.00016720079293008894, + "learning_rate": 9.683715361694715e-06, + "loss": 0.0001, + "step": 7080 + }, + { + "epoch": 0.03167313680204066, + "grad_norm": 9.629166015656665e-06, + "learning_rate": 9.683268631979595e-06, + "loss": 0.0313, + "step": 7090 + }, + { + "epoch": 0.03171780977355271, + "grad_norm": 1.7250875316676684e-05, + "learning_rate": 9.682821902264475e-06, + "loss": 0.0281, + "step": 7100 + }, + { + "epoch": 0.031762482745064755, + "grad_norm": 4.952021370741022e-08, + "learning_rate": 9.682375172549353e-06, + "loss": 0.0023, + "step": 7110 + }, + { + "epoch": 0.0318071557165768, + "grad_norm": 2.3499190149323113e-08, + "learning_rate": 9.681928442834233e-06, + "loss": 0.4156, + "step": 7120 + }, + { + "epoch": 0.03185182868808885, + "grad_norm": 9.509724563372401e-09, + "learning_rate": 9.681481713119112e-06, + "loss": 0.0, + "step": 7130 + }, + { + "epoch": 0.03189650165960089, + "grad_norm": 6.2214312492869794e-06, + "learning_rate": 9.681034983403992e-06, + "loss": 0.0, + "step": 7140 + }, + { + "epoch": 0.031941174631112935, + "grad_norm": 0.1611732840538025, + "learning_rate": 9.680588253688872e-06, + "loss": 0.0, + "step": 7150 + }, + { + "epoch": 0.03198584760262498, + "grad_norm": 1.358881404752918e-14, + "learning_rate": 9.68014152397375e-06, + "loss": 0.0001, + "step": 7160 + }, + { + "epoch": 0.03203052057413703, + "grad_norm": 2.607118965158861e-09, + "learning_rate": 9.67969479425863e-06, + "loss": 0.0, + "step": 7170 + }, + { + "epoch": 0.032075193545649076, + "grad_norm": 7.82560187441092e-14, + "learning_rate": 9.67924806454351e-06, + "loss": 0.0, + "step": 7180 + }, + { + "epoch": 0.03211986651716112, + "grad_norm": 3.4912116007035365e-06, + "learning_rate": 9.67880133482839e-06, + "loss": 0.0009, + "step": 7190 + }, + { + "epoch": 0.03216453948867317, + "grad_norm": 4.0497349939640215e-13, + "learning_rate": 9.67835460511327e-06, + "loss": 0.0011, + "step": 7200 + }, + { + "epoch": 0.03220921246018522, + "grad_norm": 3.911545254364768e-10, + "learning_rate": 9.677907875398148e-06, + "loss": 0.0001, + "step": 7210 + }, + { + "epoch": 0.03225388543169726, + "grad_norm": 6.038759181592468e-08, + "learning_rate": 9.677461145683028e-06, + "loss": 0.0, + "step": 7220 + }, + { + "epoch": 0.032298558403209304, + "grad_norm": 2.0243858500634815e-07, + "learning_rate": 9.677014415967908e-06, + "loss": 0.0, + "step": 7230 + }, + { + "epoch": 0.03234323137472135, + "grad_norm": 501.6820373535156, + "learning_rate": 9.676567686252787e-06, + "loss": 0.3256, + "step": 7240 + }, + { + "epoch": 0.0323879043462334, + "grad_norm": 9.651954542277963e-07, + "learning_rate": 9.676120956537667e-06, + "loss": 0.0001, + "step": 7250 + }, + { + "epoch": 0.032432577317745445, + "grad_norm": 5.384000604830193e-14, + "learning_rate": 9.675674226822547e-06, + "loss": 0.0002, + "step": 7260 + }, + { + "epoch": 0.03247725028925749, + "grad_norm": 1.508151416251824e-11, + "learning_rate": 9.675227497107425e-06, + "loss": 0.0012, + "step": 7270 + }, + { + "epoch": 0.03252192326076954, + "grad_norm": 7.18380704315541e-14, + "learning_rate": 9.674780767392305e-06, + "loss": 0.0, + "step": 7280 + }, + { + "epoch": 0.032566596232281586, + "grad_norm": 4.494653239817126e-06, + "learning_rate": 9.674334037677185e-06, + "loss": 0.0, + "step": 7290 + }, + { + "epoch": 0.032611269203793626, + "grad_norm": 1.3249834864836885e-06, + "learning_rate": 9.673887307962064e-06, + "loss": 0.0466, + "step": 7300 + }, + { + "epoch": 0.03265594217530567, + "grad_norm": 2.338749647140503, + "learning_rate": 9.673440578246944e-06, + "loss": 0.0005, + "step": 7310 + }, + { + "epoch": 0.03270061514681772, + "grad_norm": 4.7304103315643983e-14, + "learning_rate": 9.672993848531822e-06, + "loss": 0.0006, + "step": 7320 + }, + { + "epoch": 0.03274528811832977, + "grad_norm": 1.2597056210650326e-11, + "learning_rate": 9.672547118816703e-06, + "loss": 0.3047, + "step": 7330 + }, + { + "epoch": 0.032789961089841814, + "grad_norm": 8.864343178753564e-13, + "learning_rate": 9.672100389101583e-06, + "loss": 0.0, + "step": 7340 + }, + { + "epoch": 0.03283463406135386, + "grad_norm": 4.640969683208929e-13, + "learning_rate": 9.671653659386461e-06, + "loss": 0.0, + "step": 7350 + }, + { + "epoch": 0.03287930703286591, + "grad_norm": 1.1632348730084008e-11, + "learning_rate": 9.671206929671341e-06, + "loss": 0.0566, + "step": 7360 + }, + { + "epoch": 0.03292398000437795, + "grad_norm": 1.163630145128991e-06, + "learning_rate": 9.670760199956221e-06, + "loss": 0.0387, + "step": 7370 + }, + { + "epoch": 0.032968652975889995, + "grad_norm": 2.2575783020784002e-07, + "learning_rate": 9.6703134702411e-06, + "loss": 0.0006, + "step": 7380 + }, + { + "epoch": 0.03301332594740204, + "grad_norm": 0.007182363886386156, + "learning_rate": 9.66986674052598e-06, + "loss": 1.0463, + "step": 7390 + }, + { + "epoch": 0.03305799891891409, + "grad_norm": 20.120473861694336, + "learning_rate": 9.66942001081086e-06, + "loss": 0.0036, + "step": 7400 + }, + { + "epoch": 0.033102671890426136, + "grad_norm": 1.0165902786241077e-08, + "learning_rate": 9.668973281095738e-06, + "loss": 0.0, + "step": 7410 + }, + { + "epoch": 0.03314734486193818, + "grad_norm": 2.1038341424875484e-13, + "learning_rate": 9.668526551380619e-06, + "loss": 0.0, + "step": 7420 + }, + { + "epoch": 0.03319201783345023, + "grad_norm": 1.064471462086658e-06, + "learning_rate": 9.668079821665499e-06, + "loss": 0.0, + "step": 7430 + }, + { + "epoch": 0.03323669080496228, + "grad_norm": 5.4925614295479874e-14, + "learning_rate": 9.667633091950379e-06, + "loss": 0.0, + "step": 7440 + }, + { + "epoch": 0.03328136377647432, + "grad_norm": 1.8044505776734776e-12, + "learning_rate": 9.667186362235257e-06, + "loss": 0.0006, + "step": 7450 + }, + { + "epoch": 0.033326036747986364, + "grad_norm": 7.88182262567716e-07, + "learning_rate": 9.666739632520137e-06, + "loss": 0.0, + "step": 7460 + }, + { + "epoch": 0.03337070971949841, + "grad_norm": 7.243825734803977e-07, + "learning_rate": 9.666292902805018e-06, + "loss": 0.0001, + "step": 7470 + }, + { + "epoch": 0.03341538269101046, + "grad_norm": 7.187257779150968e-06, + "learning_rate": 9.665846173089896e-06, + "loss": 0.2984, + "step": 7480 + }, + { + "epoch": 0.033460055662522505, + "grad_norm": 0.23065359890460968, + "learning_rate": 9.665399443374776e-06, + "loss": 0.0002, + "step": 7490 + }, + { + "epoch": 0.03350472863403455, + "grad_norm": 0.00021304815891198814, + "learning_rate": 9.664952713659656e-06, + "loss": 0.01, + "step": 7500 + }, + { + "epoch": 0.0335494016055466, + "grad_norm": 1.3281003900800226e-12, + "learning_rate": 9.664505983944535e-06, + "loss": 0.0824, + "step": 7510 + }, + { + "epoch": 0.033594074577058645, + "grad_norm": 3.0334553770793027e-14, + "learning_rate": 9.664059254229415e-06, + "loss": 0.0016, + "step": 7520 + }, + { + "epoch": 0.033638747548570685, + "grad_norm": 4.896755490335636e-05, + "learning_rate": 9.663612524514295e-06, + "loss": 0.0, + "step": 7530 + }, + { + "epoch": 0.03368342052008273, + "grad_norm": 0.00037503300700336695, + "learning_rate": 9.663165794799173e-06, + "loss": 0.0001, + "step": 7540 + }, + { + "epoch": 0.03372809349159478, + "grad_norm": 8.310901778243746e-15, + "learning_rate": 9.662719065084053e-06, + "loss": 0.0049, + "step": 7550 + }, + { + "epoch": 0.033772766463106826, + "grad_norm": 2.1257954358588904e-05, + "learning_rate": 9.662272335368934e-06, + "loss": 0.0001, + "step": 7560 + }, + { + "epoch": 0.03381743943461887, + "grad_norm": 0.00030382740078493953, + "learning_rate": 9.661825605653812e-06, + "loss": 0.0, + "step": 7570 + }, + { + "epoch": 0.03386211240613092, + "grad_norm": 2.665811393853801e-08, + "learning_rate": 9.661378875938692e-06, + "loss": 0.0003, + "step": 7580 + }, + { + "epoch": 0.03390678537764297, + "grad_norm": 17.635639190673828, + "learning_rate": 9.660932146223572e-06, + "loss": 0.0023, + "step": 7590 + }, + { + "epoch": 0.03395145834915501, + "grad_norm": 2.347639194155704e-09, + "learning_rate": 9.66048541650845e-06, + "loss": 0.0001, + "step": 7600 + }, + { + "epoch": 0.033996131320667054, + "grad_norm": 25.22496223449707, + "learning_rate": 9.66003868679333e-06, + "loss": 0.0054, + "step": 7610 + }, + { + "epoch": 0.0340408042921791, + "grad_norm": 0.02057386375963688, + "learning_rate": 9.65959195707821e-06, + "loss": 0.0113, + "step": 7620 + }, + { + "epoch": 0.03408547726369115, + "grad_norm": 1123.7857666015625, + "learning_rate": 9.65914522736309e-06, + "loss": 0.8286, + "step": 7630 + }, + { + "epoch": 0.034130150235203195, + "grad_norm": 3.706858005091259e-15, + "learning_rate": 9.65869849764797e-06, + "loss": 0.018, + "step": 7640 + }, + { + "epoch": 0.03417482320671524, + "grad_norm": 2.1704324806599146e-14, + "learning_rate": 9.658251767932848e-06, + "loss": 0.0, + "step": 7650 + }, + { + "epoch": 0.03421949617822729, + "grad_norm": 7.746194023638964e-05, + "learning_rate": 9.657805038217728e-06, + "loss": 0.0, + "step": 7660 + }, + { + "epoch": 0.034264169149739336, + "grad_norm": 3.4090260214725276e-07, + "learning_rate": 9.657358308502608e-06, + "loss": 0.0, + "step": 7670 + }, + { + "epoch": 0.034308842121251376, + "grad_norm": 85.19580841064453, + "learning_rate": 9.656911578787487e-06, + "loss": 0.0253, + "step": 7680 + }, + { + "epoch": 0.03435351509276342, + "grad_norm": 0.00017949531320482492, + "learning_rate": 9.656464849072367e-06, + "loss": 0.0, + "step": 7690 + }, + { + "epoch": 0.03439818806427547, + "grad_norm": 2.522860813769512e-05, + "learning_rate": 9.656018119357245e-06, + "loss": 0.0001, + "step": 7700 + }, + { + "epoch": 0.03444286103578752, + "grad_norm": 0.0251141469925642, + "learning_rate": 9.655571389642125e-06, + "loss": 0.6876, + "step": 7710 + }, + { + "epoch": 0.034487534007299564, + "grad_norm": 0.000446300080511719, + "learning_rate": 9.655124659927005e-06, + "loss": 0.0, + "step": 7720 + }, + { + "epoch": 0.03453220697881161, + "grad_norm": 3.106126467145032e-11, + "learning_rate": 9.654677930211884e-06, + "loss": 0.0, + "step": 7730 + }, + { + "epoch": 0.03457687995032366, + "grad_norm": 7.441053458023816e-05, + "learning_rate": 9.654231200496764e-06, + "loss": 0.0016, + "step": 7740 + }, + { + "epoch": 0.034621552921835705, + "grad_norm": 4.7033314742872534e-11, + "learning_rate": 9.653784470781644e-06, + "loss": 0.0001, + "step": 7750 + }, + { + "epoch": 0.034666225893347745, + "grad_norm": 1.9171542589901946e-05, + "learning_rate": 9.653337741066523e-06, + "loss": 0.0001, + "step": 7760 + }, + { + "epoch": 0.03471089886485979, + "grad_norm": 11.328216552734375, + "learning_rate": 9.652891011351403e-06, + "loss": 0.022, + "step": 7770 + }, + { + "epoch": 0.03475557183637184, + "grad_norm": 1.524915471673438e-10, + "learning_rate": 9.652444281636283e-06, + "loss": 0.0, + "step": 7780 + }, + { + "epoch": 0.034800244807883886, + "grad_norm": 8.888397830253414e-12, + "learning_rate": 9.651997551921161e-06, + "loss": 0.0, + "step": 7790 + }, + { + "epoch": 0.03484491777939593, + "grad_norm": 2.0420384316821583e-06, + "learning_rate": 9.651550822206041e-06, + "loss": 0.0009, + "step": 7800 + }, + { + "epoch": 0.03488959075090798, + "grad_norm": 0.04410456120967865, + "learning_rate": 9.65110409249092e-06, + "loss": 0.0035, + "step": 7810 + }, + { + "epoch": 0.03493426372242003, + "grad_norm": 2.595260378129849e-12, + "learning_rate": 9.6506573627758e-06, + "loss": 0.0, + "step": 7820 + }, + { + "epoch": 0.034978936693932074, + "grad_norm": 1.2043649133609596e-15, + "learning_rate": 9.65021063306068e-06, + "loss": 0.0, + "step": 7830 + }, + { + "epoch": 0.035023609665444114, + "grad_norm": 2.8324453893446844e-09, + "learning_rate": 9.649763903345558e-06, + "loss": 0.0, + "step": 7840 + }, + { + "epoch": 0.03506828263695616, + "grad_norm": 6.378841703735816e-07, + "learning_rate": 9.649317173630439e-06, + "loss": 0.0, + "step": 7850 + }, + { + "epoch": 0.03511295560846821, + "grad_norm": 1.2266521132175967e-10, + "learning_rate": 9.648870443915319e-06, + "loss": 0.0041, + "step": 7860 + }, + { + "epoch": 0.035157628579980255, + "grad_norm": 2.6962340780173344e-12, + "learning_rate": 9.648423714200197e-06, + "loss": 0.0, + "step": 7870 + }, + { + "epoch": 0.0352023015514923, + "grad_norm": 6.922159373479175e-11, + "learning_rate": 9.647976984485077e-06, + "loss": 0.0, + "step": 7880 + }, + { + "epoch": 0.03524697452300435, + "grad_norm": 1.943918137570222e-09, + "learning_rate": 9.647530254769957e-06, + "loss": 0.1805, + "step": 7890 + }, + { + "epoch": 0.035291647494516395, + "grad_norm": 8.879051165422425e-06, + "learning_rate": 9.647083525054838e-06, + "loss": 0.0001, + "step": 7900 + }, + { + "epoch": 0.035336320466028436, + "grad_norm": 2.9081201091685216e-07, + "learning_rate": 9.646636795339716e-06, + "loss": 0.0076, + "step": 7910 + }, + { + "epoch": 0.03538099343754048, + "grad_norm": 1.1710764169692993, + "learning_rate": 9.646190065624596e-06, + "loss": 0.0002, + "step": 7920 + }, + { + "epoch": 0.03542566640905253, + "grad_norm": 6.1495647983578205e-12, + "learning_rate": 9.645743335909476e-06, + "loss": 0.0001, + "step": 7930 + }, + { + "epoch": 0.035470339380564576, + "grad_norm": 1.2547023288789205e-05, + "learning_rate": 9.645296606194355e-06, + "loss": 0.0026, + "step": 7940 + }, + { + "epoch": 0.03551501235207662, + "grad_norm": 5.72323062982108e-16, + "learning_rate": 9.644849876479235e-06, + "loss": 0.0005, + "step": 7950 + }, + { + "epoch": 0.03555968532358867, + "grad_norm": 1.514777814293211e-08, + "learning_rate": 9.644403146764115e-06, + "loss": 0.1914, + "step": 7960 + }, + { + "epoch": 0.03560435829510072, + "grad_norm": 5.735700359110751e-09, + "learning_rate": 9.643956417048993e-06, + "loss": 0.0753, + "step": 7970 + }, + { + "epoch": 0.035649031266612764, + "grad_norm": 1.7102655647249776e-07, + "learning_rate": 9.643509687333873e-06, + "loss": 0.5125, + "step": 7980 + }, + { + "epoch": 0.035693704238124804, + "grad_norm": 1.2987326769575702e-09, + "learning_rate": 9.643062957618754e-06, + "loss": 0.0012, + "step": 7990 + }, + { + "epoch": 0.03573837720963685, + "grad_norm": 9.948726287234422e-09, + "learning_rate": 9.642616227903632e-06, + "loss": 0.3656, + "step": 8000 + }, + { + "epoch": 0.0357830501811489, + "grad_norm": 5.586425118053739e-07, + "learning_rate": 9.642169498188512e-06, + "loss": 0.0, + "step": 8010 + }, + { + "epoch": 0.035827723152660945, + "grad_norm": 0.6900156140327454, + "learning_rate": 9.641722768473392e-06, + "loss": 0.0001, + "step": 8020 + }, + { + "epoch": 0.03587239612417299, + "grad_norm": 1.6524243307003417e-08, + "learning_rate": 9.64127603875827e-06, + "loss": 0.0049, + "step": 8030 + }, + { + "epoch": 0.03591706909568504, + "grad_norm": 0.005095589440315962, + "learning_rate": 9.64082930904315e-06, + "loss": 0.0, + "step": 8040 + }, + { + "epoch": 0.035961742067197086, + "grad_norm": 0.005782588385045528, + "learning_rate": 9.640382579328031e-06, + "loss": 0.6813, + "step": 8050 + }, + { + "epoch": 0.03600641503870913, + "grad_norm": 1.5047929286956787, + "learning_rate": 9.63993584961291e-06, + "loss": 0.0025, + "step": 8060 + }, + { + "epoch": 0.03605108801022117, + "grad_norm": 6.845878885997081e-08, + "learning_rate": 9.63948911989779e-06, + "loss": 0.01, + "step": 8070 + }, + { + "epoch": 0.03609576098173322, + "grad_norm": 6.9732931824446464e-15, + "learning_rate": 9.63904239018267e-06, + "loss": 0.0005, + "step": 8080 + }, + { + "epoch": 0.03614043395324527, + "grad_norm": 0.029109634459018707, + "learning_rate": 9.638595660467548e-06, + "loss": 0.2125, + "step": 8090 + }, + { + "epoch": 0.036185106924757314, + "grad_norm": 148.90367126464844, + "learning_rate": 9.638148930752428e-06, + "loss": 0.0226, + "step": 8100 + }, + { + "epoch": 0.03622977989626936, + "grad_norm": 1.1183489898036747e-11, + "learning_rate": 9.637702201037307e-06, + "loss": 0.0, + "step": 8110 + }, + { + "epoch": 0.03627445286778141, + "grad_norm": 1.0056037353933789e-06, + "learning_rate": 9.637255471322187e-06, + "loss": 0.4766, + "step": 8120 + }, + { + "epoch": 0.036319125839293455, + "grad_norm": 1.895713808153232e-09, + "learning_rate": 9.636808741607067e-06, + "loss": 0.0023, + "step": 8130 + }, + { + "epoch": 0.036363798810805495, + "grad_norm": 8.699790043920075e-08, + "learning_rate": 9.636362011891945e-06, + "loss": 0.0011, + "step": 8140 + }, + { + "epoch": 0.03640847178231754, + "grad_norm": 6.252026755462423e-14, + "learning_rate": 9.635915282176825e-06, + "loss": 0.0001, + "step": 8150 + }, + { + "epoch": 0.03645314475382959, + "grad_norm": 0.008534946478903294, + "learning_rate": 9.635468552461706e-06, + "loss": 0.0, + "step": 8160 + }, + { + "epoch": 0.036497817725341636, + "grad_norm": 3.932518211513525e-06, + "learning_rate": 9.635021822746584e-06, + "loss": 0.0009, + "step": 8170 + }, + { + "epoch": 0.03654249069685368, + "grad_norm": 8.627313218312338e-05, + "learning_rate": 9.634575093031464e-06, + "loss": 0.0, + "step": 8180 + }, + { + "epoch": 0.03658716366836573, + "grad_norm": 1.372724489634436e-09, + "learning_rate": 9.634128363316344e-06, + "loss": 0.0, + "step": 8190 + }, + { + "epoch": 0.03663183663987778, + "grad_norm": 5.55379529032507e-07, + "learning_rate": 9.633681633601223e-06, + "loss": 0.0578, + "step": 8200 + }, + { + "epoch": 0.036676509611389824, + "grad_norm": 756.7450561523438, + "learning_rate": 9.633234903886103e-06, + "loss": 0.925, + "step": 8210 + }, + { + "epoch": 0.036721182582901864, + "grad_norm": 3.7226468574402816e-09, + "learning_rate": 9.632788174170981e-06, + "loss": 0.0002, + "step": 8220 + }, + { + "epoch": 0.03676585555441391, + "grad_norm": 2.771380781113608e-12, + "learning_rate": 9.632341444455861e-06, + "loss": 0.0, + "step": 8230 + }, + { + "epoch": 0.03681052852592596, + "grad_norm": 3.6899330162556976e-13, + "learning_rate": 9.631894714740741e-06, + "loss": 0.0001, + "step": 8240 + }, + { + "epoch": 0.036855201497438005, + "grad_norm": 5.4532566906573265e-08, + "learning_rate": 9.63144798502562e-06, + "loss": 0.0151, + "step": 8250 + }, + { + "epoch": 0.03689987446895005, + "grad_norm": 8.034226630115882e-05, + "learning_rate": 9.6310012553105e-06, + "loss": 0.0, + "step": 8260 + }, + { + "epoch": 0.0369445474404621, + "grad_norm": 2.354401112825144e-05, + "learning_rate": 9.63055452559538e-06, + "loss": 0.0, + "step": 8270 + }, + { + "epoch": 0.036989220411974146, + "grad_norm": 592.7618408203125, + "learning_rate": 9.630107795880259e-06, + "loss": 0.1501, + "step": 8280 + }, + { + "epoch": 0.03703389338348619, + "grad_norm": 1.201908162329346e-05, + "learning_rate": 9.629661066165139e-06, + "loss": 0.0043, + "step": 8290 + }, + { + "epoch": 0.03707856635499823, + "grad_norm": 5.1015629742824035e-15, + "learning_rate": 9.629214336450017e-06, + "loss": 0.0, + "step": 8300 + }, + { + "epoch": 0.03712323932651028, + "grad_norm": 3.6857750274066348e-06, + "learning_rate": 9.628767606734897e-06, + "loss": 0.0, + "step": 8310 + }, + { + "epoch": 0.037167912298022326, + "grad_norm": 0.0037877659779042006, + "learning_rate": 9.628320877019777e-06, + "loss": 0.0424, + "step": 8320 + }, + { + "epoch": 0.03721258526953437, + "grad_norm": 1.0160490883977941e-13, + "learning_rate": 9.627874147304656e-06, + "loss": 0.0007, + "step": 8330 + }, + { + "epoch": 0.03725725824104642, + "grad_norm": 1.4340297178894179e-11, + "learning_rate": 9.627427417589536e-06, + "loss": 0.0, + "step": 8340 + }, + { + "epoch": 0.03730193121255847, + "grad_norm": 1.898951662582249e-09, + "learning_rate": 9.626980687874416e-06, + "loss": 0.575, + "step": 8350 + }, + { + "epoch": 0.037346604184070514, + "grad_norm": 2.65234375547152e-05, + "learning_rate": 9.626533958159296e-06, + "loss": 0.0005, + "step": 8360 + }, + { + "epoch": 0.03739127715558256, + "grad_norm": 0.002871223958209157, + "learning_rate": 9.626087228444175e-06, + "loss": 0.0252, + "step": 8370 + }, + { + "epoch": 0.0374359501270946, + "grad_norm": 2.3592594891597152e-11, + "learning_rate": 9.625640498729055e-06, + "loss": 0.0252, + "step": 8380 + }, + { + "epoch": 0.03748062309860665, + "grad_norm": 10.255550384521484, + "learning_rate": 9.625193769013935e-06, + "loss": 0.1819, + "step": 8390 + }, + { + "epoch": 0.037525296070118695, + "grad_norm": 5.704014691132464e-11, + "learning_rate": 9.624747039298813e-06, + "loss": 0.0, + "step": 8400 + }, + { + "epoch": 0.03756996904163074, + "grad_norm": 3.851150864875308e-08, + "learning_rate": 9.624300309583693e-06, + "loss": 0.012, + "step": 8410 + }, + { + "epoch": 0.03761464201314279, + "grad_norm": 1.6326265495081316e-06, + "learning_rate": 9.623853579868574e-06, + "loss": 0.0008, + "step": 8420 + }, + { + "epoch": 0.037659314984654836, + "grad_norm": 1.937662703899168e-12, + "learning_rate": 9.623406850153452e-06, + "loss": 0.0016, + "step": 8430 + }, + { + "epoch": 0.03770398795616688, + "grad_norm": 156.47518920898438, + "learning_rate": 9.622960120438332e-06, + "loss": 0.0313, + "step": 8440 + }, + { + "epoch": 0.03774866092767892, + "grad_norm": 1.1682730161055588e-07, + "learning_rate": 9.622513390723212e-06, + "loss": 0.0006, + "step": 8450 + }, + { + "epoch": 0.03779333389919097, + "grad_norm": 0.0061811706982553005, + "learning_rate": 9.62206666100809e-06, + "loss": 0.0, + "step": 8460 + }, + { + "epoch": 0.03783800687070302, + "grad_norm": 6.299590626923646e-09, + "learning_rate": 9.62161993129297e-06, + "loss": 0.0018, + "step": 8470 + }, + { + "epoch": 0.037882679842215064, + "grad_norm": 1.5485067706322297e-05, + "learning_rate": 9.621173201577851e-06, + "loss": 0.0, + "step": 8480 + }, + { + "epoch": 0.03792735281372711, + "grad_norm": 5.004122627383367e-08, + "learning_rate": 9.620726471862731e-06, + "loss": 0.001, + "step": 8490 + }, + { + "epoch": 0.03797202578523916, + "grad_norm": 1.0859906723892365e-10, + "learning_rate": 9.62027974214761e-06, + "loss": 0.0001, + "step": 8500 + }, + { + "epoch": 0.038016698756751205, + "grad_norm": 9.935144576388248e-17, + "learning_rate": 9.61983301243249e-06, + "loss": 0.0001, + "step": 8510 + }, + { + "epoch": 0.03806137172826325, + "grad_norm": 8.756306324381776e-17, + "learning_rate": 9.619386282717368e-06, + "loss": 0.0, + "step": 8520 + }, + { + "epoch": 0.03810604469977529, + "grad_norm": 1.8787010048981756e-05, + "learning_rate": 9.618939553002248e-06, + "loss": 0.0001, + "step": 8530 + }, + { + "epoch": 0.03815071767128734, + "grad_norm": 1.7396580781692738e-15, + "learning_rate": 9.618492823287128e-06, + "loss": 0.0, + "step": 8540 + }, + { + "epoch": 0.038195390642799386, + "grad_norm": 8.588968945788533e-12, + "learning_rate": 9.618046093572007e-06, + "loss": 0.0898, + "step": 8550 + }, + { + "epoch": 0.03824006361431143, + "grad_norm": 7.278631031226723e-10, + "learning_rate": 9.617599363856887e-06, + "loss": 0.0, + "step": 8560 + }, + { + "epoch": 0.03828473658582348, + "grad_norm": 7.282357273652451e-06, + "learning_rate": 9.617152634141767e-06, + "loss": 0.0, + "step": 8570 + }, + { + "epoch": 0.03832940955733553, + "grad_norm": 4.293304201041792e-08, + "learning_rate": 9.616705904426645e-06, + "loss": 0.0081, + "step": 8580 + }, + { + "epoch": 0.038374082528847574, + "grad_norm": 5.928718564973678e-07, + "learning_rate": 9.616259174711526e-06, + "loss": 0.2127, + "step": 8590 + }, + { + "epoch": 0.03841875550035962, + "grad_norm": 9.632236414730544e-14, + "learning_rate": 9.615812444996404e-06, + "loss": 0.0019, + "step": 8600 + }, + { + "epoch": 0.03846342847187166, + "grad_norm": 1.7470816260356514e-07, + "learning_rate": 9.615365715281284e-06, + "loss": 0.0, + "step": 8610 + }, + { + "epoch": 0.03850810144338371, + "grad_norm": 4.201944781706546e-12, + "learning_rate": 9.614918985566164e-06, + "loss": 0.0, + "step": 8620 + }, + { + "epoch": 0.038552774414895755, + "grad_norm": 3.792194366455078, + "learning_rate": 9.614472255851043e-06, + "loss": 0.0005, + "step": 8630 + }, + { + "epoch": 0.0385974473864078, + "grad_norm": 1.5693518706783038e-10, + "learning_rate": 9.614025526135923e-06, + "loss": 0.0, + "step": 8640 + }, + { + "epoch": 0.03864212035791985, + "grad_norm": 6.817324482710774e-13, + "learning_rate": 9.613578796420803e-06, + "loss": 0.2689, + "step": 8650 + }, + { + "epoch": 0.038686793329431896, + "grad_norm": 3.3513822828012962e-09, + "learning_rate": 9.613132066705681e-06, + "loss": 0.0, + "step": 8660 + }, + { + "epoch": 0.03873146630094394, + "grad_norm": 1.621324187306059e-15, + "learning_rate": 9.612685336990561e-06, + "loss": 0.0, + "step": 8670 + }, + { + "epoch": 0.03877613927245598, + "grad_norm": 0.06873009353876114, + "learning_rate": 9.612238607275442e-06, + "loss": 0.2078, + "step": 8680 + }, + { + "epoch": 0.03882081224396803, + "grad_norm": 6.228573852240515e-07, + "learning_rate": 9.61179187756032e-06, + "loss": 0.0, + "step": 8690 + }, + { + "epoch": 0.038865485215480076, + "grad_norm": 1.1779390661104117e-05, + "learning_rate": 9.6113451478452e-06, + "loss": 0.0, + "step": 8700 + }, + { + "epoch": 0.03891015818699212, + "grad_norm": 1.770038920767547e-06, + "learning_rate": 9.610898418130079e-06, + "loss": 0.0001, + "step": 8710 + }, + { + "epoch": 0.03895483115850417, + "grad_norm": 0.00015133483975660056, + "learning_rate": 9.610451688414959e-06, + "loss": 0.0297, + "step": 8720 + }, + { + "epoch": 0.03899950413001622, + "grad_norm": 2.2596263079321943e-05, + "learning_rate": 9.610004958699839e-06, + "loss": 0.4875, + "step": 8730 + }, + { + "epoch": 0.039044177101528264, + "grad_norm": 1.4790319255553186e-05, + "learning_rate": 9.609558228984717e-06, + "loss": 0.0, + "step": 8740 + }, + { + "epoch": 0.03908885007304031, + "grad_norm": 0.060237348079681396, + "learning_rate": 9.609111499269597e-06, + "loss": 0.0, + "step": 8750 + }, + { + "epoch": 0.03913352304455235, + "grad_norm": 0.00015527455252595246, + "learning_rate": 9.608664769554478e-06, + "loss": 0.0062, + "step": 8760 + }, + { + "epoch": 0.0391781960160644, + "grad_norm": 1.2009442051930108e-10, + "learning_rate": 9.608218039839356e-06, + "loss": 0.0348, + "step": 8770 + }, + { + "epoch": 0.039222868987576445, + "grad_norm": 7.036464158497043e-16, + "learning_rate": 9.607771310124236e-06, + "loss": 0.0001, + "step": 8780 + }, + { + "epoch": 0.03926754195908849, + "grad_norm": 5.08871373858244e-12, + "learning_rate": 9.607324580409115e-06, + "loss": 0.0003, + "step": 8790 + }, + { + "epoch": 0.03931221493060054, + "grad_norm": 1.72422603972322e-10, + "learning_rate": 9.606877850693995e-06, + "loss": 0.0403, + "step": 8800 + }, + { + "epoch": 0.039356887902112586, + "grad_norm": 1.8882680141718566e-11, + "learning_rate": 9.606431120978875e-06, + "loss": 0.2578, + "step": 8810 + }, + { + "epoch": 0.03940156087362463, + "grad_norm": 4.5700039753978836e-14, + "learning_rate": 9.605984391263755e-06, + "loss": 0.0, + "step": 8820 + }, + { + "epoch": 0.03944623384513668, + "grad_norm": 6.765368701453639e-14, + "learning_rate": 9.605537661548633e-06, + "loss": 0.0226, + "step": 8830 + }, + { + "epoch": 0.03949090681664872, + "grad_norm": 1.6490670574274792e-15, + "learning_rate": 9.605090931833513e-06, + "loss": 0.0007, + "step": 8840 + }, + { + "epoch": 0.03953557978816077, + "grad_norm": 6.464263152539118e-12, + "learning_rate": 9.604644202118394e-06, + "loss": 0.0006, + "step": 8850 + }, + { + "epoch": 0.039580252759672814, + "grad_norm": 7.716137950808388e-09, + "learning_rate": 9.604197472403272e-06, + "loss": 0.0, + "step": 8860 + }, + { + "epoch": 0.03962492573118486, + "grad_norm": 4.2918849030293416e-10, + "learning_rate": 9.603750742688152e-06, + "loss": 0.675, + "step": 8870 + }, + { + "epoch": 0.03966959870269691, + "grad_norm": 0.0005095364176668227, + "learning_rate": 9.603304012973032e-06, + "loss": 0.0014, + "step": 8880 + }, + { + "epoch": 0.039714271674208955, + "grad_norm": 6.71338548988154e-12, + "learning_rate": 9.60285728325791e-06, + "loss": 0.0, + "step": 8890 + }, + { + "epoch": 0.039758944645721, + "grad_norm": 1.603486089152284e-05, + "learning_rate": 9.60241055354279e-06, + "loss": 0.0006, + "step": 8900 + }, + { + "epoch": 0.03980361761723305, + "grad_norm": 2.7107730602438096e-06, + "learning_rate": 9.601963823827671e-06, + "loss": 0.0, + "step": 8910 + }, + { + "epoch": 0.03984829058874509, + "grad_norm": 2.2744011118902563e-07, + "learning_rate": 9.60151709411255e-06, + "loss": 0.0, + "step": 8920 + }, + { + "epoch": 0.039892963560257136, + "grad_norm": 0.007210536394268274, + "learning_rate": 9.60107036439743e-06, + "loss": 0.3172, + "step": 8930 + }, + { + "epoch": 0.03993763653176918, + "grad_norm": 7.9241566989228e-15, + "learning_rate": 9.60062363468231e-06, + "loss": 0.0046, + "step": 8940 + }, + { + "epoch": 0.03998230950328123, + "grad_norm": 9.576701164245605, + "learning_rate": 9.60017690496719e-06, + "loss": 0.0036, + "step": 8950 + }, + { + "epoch": 0.04002698247479328, + "grad_norm": 0.0009234817116521299, + "learning_rate": 9.599730175252068e-06, + "loss": 0.0014, + "step": 8960 + }, + { + "epoch": 0.040071655446305324, + "grad_norm": 2.6690221588410168e-08, + "learning_rate": 9.599283445536948e-06, + "loss": 0.0049, + "step": 8970 + }, + { + "epoch": 0.04011632841781737, + "grad_norm": 7.448632914019981e-06, + "learning_rate": 9.598836715821828e-06, + "loss": 0.0, + "step": 8980 + }, + { + "epoch": 0.04016100138932941, + "grad_norm": 5.278381898232709e-13, + "learning_rate": 9.598389986106707e-06, + "loss": 0.0, + "step": 8990 + }, + { + "epoch": 0.04020567436084146, + "grad_norm": 1.8674436996768407e-11, + "learning_rate": 9.597943256391587e-06, + "loss": 0.1055, + "step": 9000 + }, + { + "epoch": 0.040250347332353505, + "grad_norm": 6.622378426884845e-13, + "learning_rate": 9.597496526676465e-06, + "loss": 0.0, + "step": 9010 + }, + { + "epoch": 0.04029502030386555, + "grad_norm": 1.352628342665696e-09, + "learning_rate": 9.597049796961346e-06, + "loss": 0.0, + "step": 9020 + }, + { + "epoch": 0.0403396932753776, + "grad_norm": 4.5851612640035455e-07, + "learning_rate": 9.596603067246226e-06, + "loss": 0.2813, + "step": 9030 + }, + { + "epoch": 0.040384366246889646, + "grad_norm": 7.833050119643303e-08, + "learning_rate": 9.596156337531104e-06, + "loss": 0.0, + "step": 9040 + }, + { + "epoch": 0.04042903921840169, + "grad_norm": 8.340951467289415e-07, + "learning_rate": 9.595709607815984e-06, + "loss": 0.0001, + "step": 9050 + }, + { + "epoch": 0.04047371218991374, + "grad_norm": 4.396644506798886e-11, + "learning_rate": 9.595262878100864e-06, + "loss": 0.0, + "step": 9060 + }, + { + "epoch": 0.04051838516142578, + "grad_norm": 1.6098958004060223e-17, + "learning_rate": 9.594816148385743e-06, + "loss": 0.0, + "step": 9070 + }, + { + "epoch": 0.040563058132937826, + "grad_norm": 9.86034774541622e-06, + "learning_rate": 9.594369418670623e-06, + "loss": 0.0, + "step": 9080 + }, + { + "epoch": 0.04060773110444987, + "grad_norm": 8.242366789090738e-09, + "learning_rate": 9.593922688955501e-06, + "loss": 0.0002, + "step": 9090 + }, + { + "epoch": 0.04065240407596192, + "grad_norm": 5.484909333902492e-15, + "learning_rate": 9.593475959240381e-06, + "loss": 0.0, + "step": 9100 + }, + { + "epoch": 0.04069707704747397, + "grad_norm": 2.234793322619666e-10, + "learning_rate": 9.593029229525262e-06, + "loss": 0.0001, + "step": 9110 + }, + { + "epoch": 0.040741750018986014, + "grad_norm": 0.0003598829498514533, + "learning_rate": 9.59258249981014e-06, + "loss": 0.0, + "step": 9120 + }, + { + "epoch": 0.04078642299049806, + "grad_norm": 1.4396250573079327e-21, + "learning_rate": 9.59213577009502e-06, + "loss": 0.0, + "step": 9130 + }, + { + "epoch": 0.04083109596201011, + "grad_norm": 1.944572795764543e-05, + "learning_rate": 9.5916890403799e-06, + "loss": 0.0001, + "step": 9140 + }, + { + "epoch": 0.04087576893352215, + "grad_norm": 6.553003040372929e-13, + "learning_rate": 9.591242310664779e-06, + "loss": 0.0, + "step": 9150 + }, + { + "epoch": 0.040920441905034195, + "grad_norm": 1.1000043542708227e-08, + "learning_rate": 9.590795580949659e-06, + "loss": 0.0007, + "step": 9160 + }, + { + "epoch": 0.04096511487654624, + "grad_norm": 1.991377332130617e-12, + "learning_rate": 9.590348851234539e-06, + "loss": 0.0062, + "step": 9170 + }, + { + "epoch": 0.04100978784805829, + "grad_norm": 8.767239931544053e-16, + "learning_rate": 9.589902121519417e-06, + "loss": 0.0, + "step": 9180 + }, + { + "epoch": 0.041054460819570336, + "grad_norm": 2.192102783737937e-06, + "learning_rate": 9.589455391804297e-06, + "loss": 0.0001, + "step": 9190 + }, + { + "epoch": 0.04109913379108238, + "grad_norm": 3.0813654579298078e-12, + "learning_rate": 9.589008662089176e-06, + "loss": 0.0007, + "step": 9200 + }, + { + "epoch": 0.04114380676259443, + "grad_norm": 0.022311272099614143, + "learning_rate": 9.588561932374056e-06, + "loss": 0.0001, + "step": 9210 + }, + { + "epoch": 0.04118847973410647, + "grad_norm": 1.5432090284470124e-12, + "learning_rate": 9.588115202658936e-06, + "loss": 0.0, + "step": 9220 + }, + { + "epoch": 0.04123315270561852, + "grad_norm": 1.624277388145856e-07, + "learning_rate": 9.587668472943815e-06, + "loss": 0.0, + "step": 9230 + }, + { + "epoch": 0.041277825677130564, + "grad_norm": 7.818975689133367e-09, + "learning_rate": 9.587221743228695e-06, + "loss": 0.0244, + "step": 9240 + }, + { + "epoch": 0.04132249864864261, + "grad_norm": 1.1187073809633061e-13, + "learning_rate": 9.586775013513575e-06, + "loss": 0.0, + "step": 9250 + }, + { + "epoch": 0.04136717162015466, + "grad_norm": 1.5692712906911765e-07, + "learning_rate": 9.586328283798453e-06, + "loss": 0.4656, + "step": 9260 + }, + { + "epoch": 0.041411844591666705, + "grad_norm": 447.6181640625, + "learning_rate": 9.585881554083333e-06, + "loss": 0.1313, + "step": 9270 + }, + { + "epoch": 0.04145651756317875, + "grad_norm": 14.45638656616211, + "learning_rate": 9.585434824368214e-06, + "loss": 0.0018, + "step": 9280 + }, + { + "epoch": 0.0415011905346908, + "grad_norm": 2.5550141429658835e-12, + "learning_rate": 9.584988094653092e-06, + "loss": 0.0153, + "step": 9290 + }, + { + "epoch": 0.04154586350620284, + "grad_norm": 4.1291680335998535, + "learning_rate": 9.584541364937972e-06, + "loss": 0.0525, + "step": 9300 + }, + { + "epoch": 0.041590536477714886, + "grad_norm": 68.95028686523438, + "learning_rate": 9.584094635222852e-06, + "loss": 0.0089, + "step": 9310 + }, + { + "epoch": 0.04163520944922693, + "grad_norm": 9.987977023229178e-08, + "learning_rate": 9.58364790550773e-06, + "loss": 0.0, + "step": 9320 + }, + { + "epoch": 0.04167988242073898, + "grad_norm": 2.3775720975338643e-11, + "learning_rate": 9.58320117579261e-06, + "loss": 0.0, + "step": 9330 + }, + { + "epoch": 0.04172455539225103, + "grad_norm": 0.20666083693504333, + "learning_rate": 9.582754446077491e-06, + "loss": 0.0, + "step": 9340 + }, + { + "epoch": 0.041769228363763074, + "grad_norm": 2.3314614736591466e-05, + "learning_rate": 9.58230771636237e-06, + "loss": 0.0, + "step": 9350 + }, + { + "epoch": 0.04181390133527512, + "grad_norm": 1.173892390628553e-13, + "learning_rate": 9.58186098664725e-06, + "loss": 0.0049, + "step": 9360 + }, + { + "epoch": 0.04185857430678717, + "grad_norm": 2.4579649107181467e-05, + "learning_rate": 9.58141425693213e-06, + "loss": 0.0005, + "step": 9370 + }, + { + "epoch": 0.04190324727829921, + "grad_norm": 0.0003566561790648848, + "learning_rate": 9.580967527217008e-06, + "loss": 0.0049, + "step": 9380 + }, + { + "epoch": 0.041947920249811255, + "grad_norm": 0.08571092039346695, + "learning_rate": 9.580520797501888e-06, + "loss": 0.0064, + "step": 9390 + }, + { + "epoch": 0.0419925932213233, + "grad_norm": 0.00036160662421025336, + "learning_rate": 9.580074067786768e-06, + "loss": 0.0, + "step": 9400 + }, + { + "epoch": 0.04203726619283535, + "grad_norm": 57.7559700012207, + "learning_rate": 9.579627338071648e-06, + "loss": 0.0127, + "step": 9410 + }, + { + "epoch": 0.042081939164347396, + "grad_norm": 2.41027595831219e-14, + "learning_rate": 9.579180608356527e-06, + "loss": 0.0201, + "step": 9420 + }, + { + "epoch": 0.04212661213585944, + "grad_norm": 1.1963922962130585e-13, + "learning_rate": 9.578733878641407e-06, + "loss": 0.0008, + "step": 9430 + }, + { + "epoch": 0.04217128510737149, + "grad_norm": 14.161581039428711, + "learning_rate": 9.578287148926287e-06, + "loss": 0.2502, + "step": 9440 + }, + { + "epoch": 0.042215958078883536, + "grad_norm": 1.318870568312036e-09, + "learning_rate": 9.577840419211166e-06, + "loss": 0.012, + "step": 9450 + }, + { + "epoch": 0.042260631050395576, + "grad_norm": 1.5369117259979248, + "learning_rate": 9.577393689496046e-06, + "loss": 0.0004, + "step": 9460 + }, + { + "epoch": 0.04230530402190762, + "grad_norm": 2.638370208071783e-12, + "learning_rate": 9.576946959780926e-06, + "loss": 0.0, + "step": 9470 + }, + { + "epoch": 0.04234997699341967, + "grad_norm": 9.181162698252364e-16, + "learning_rate": 9.576500230065804e-06, + "loss": 0.0, + "step": 9480 + }, + { + "epoch": 0.04239464996493172, + "grad_norm": 1.4507448895528796e-06, + "learning_rate": 9.576053500350684e-06, + "loss": 0.0, + "step": 9490 + }, + { + "epoch": 0.042439322936443764, + "grad_norm": 1.1731444704565774e-09, + "learning_rate": 9.575606770635563e-06, + "loss": 0.0, + "step": 9500 + }, + { + "epoch": 0.04248399590795581, + "grad_norm": 8.030391395805144e-19, + "learning_rate": 9.575160040920443e-06, + "loss": 0.0001, + "step": 9510 + }, + { + "epoch": 0.04252866887946786, + "grad_norm": 5.895772457122803, + "learning_rate": 9.574713311205323e-06, + "loss": 0.0017, + "step": 9520 + }, + { + "epoch": 0.0425733418509799, + "grad_norm": 2.221716463145923e-13, + "learning_rate": 9.574266581490201e-06, + "loss": 0.0, + "step": 9530 + }, + { + "epoch": 0.042618014822491945, + "grad_norm": 0.000884619599673897, + "learning_rate": 9.573819851775082e-06, + "loss": 0.0018, + "step": 9540 + }, + { + "epoch": 0.04266268779400399, + "grad_norm": 3.657691422631615e-06, + "learning_rate": 9.573373122059962e-06, + "loss": 0.0002, + "step": 9550 + }, + { + "epoch": 0.04270736076551604, + "grad_norm": 4.2471978711766667e-10, + "learning_rate": 9.57292639234484e-06, + "loss": 0.0312, + "step": 9560 + }, + { + "epoch": 0.042752033737028086, + "grad_norm": 2.2941121002872933e-08, + "learning_rate": 9.57247966262972e-06, + "loss": 0.0, + "step": 9570 + }, + { + "epoch": 0.04279670670854013, + "grad_norm": 0.043591901659965515, + "learning_rate": 9.5720329329146e-06, + "loss": 0.0, + "step": 9580 + }, + { + "epoch": 0.04284137968005218, + "grad_norm": 4.469314163202398e-08, + "learning_rate": 9.571586203199479e-06, + "loss": 0.0075, + "step": 9590 + }, + { + "epoch": 0.04288605265156423, + "grad_norm": 3.9559712948512556e-16, + "learning_rate": 9.571139473484359e-06, + "loss": 0.001, + "step": 9600 + }, + { + "epoch": 0.04293072562307627, + "grad_norm": 1.8759371833709793e-09, + "learning_rate": 9.570692743769237e-06, + "loss": 0.0, + "step": 9610 + }, + { + "epoch": 0.042975398594588314, + "grad_norm": 8.860857880765849e-11, + "learning_rate": 9.570246014054117e-06, + "loss": 0.0, + "step": 9620 + }, + { + "epoch": 0.04302007156610036, + "grad_norm": 3.235507195498144e-09, + "learning_rate": 9.569799284338998e-06, + "loss": 0.0, + "step": 9630 + }, + { + "epoch": 0.04306474453761241, + "grad_norm": 1.045658893673862e-12, + "learning_rate": 9.569352554623876e-06, + "loss": 0.0001, + "step": 9640 + }, + { + "epoch": 0.043109417509124455, + "grad_norm": 5.70101528919037e-13, + "learning_rate": 9.568905824908756e-06, + "loss": 0.0, + "step": 9650 + }, + { + "epoch": 0.0431540904806365, + "grad_norm": 2.711625337600708, + "learning_rate": 9.568459095193636e-06, + "loss": 0.0005, + "step": 9660 + }, + { + "epoch": 0.04319876345214855, + "grad_norm": 1.9815430277958512e-05, + "learning_rate": 9.568012365478515e-06, + "loss": 0.0, + "step": 9670 + }, + { + "epoch": 0.043243436423660596, + "grad_norm": 2.702470137862928e-13, + "learning_rate": 9.567565635763395e-06, + "loss": 0.1424, + "step": 9680 + }, + { + "epoch": 0.043288109395172636, + "grad_norm": 2.9452898786530568e-08, + "learning_rate": 9.567118906048273e-06, + "loss": 0.0, + "step": 9690 + }, + { + "epoch": 0.04333278236668468, + "grad_norm": 1.0396380275778938e-05, + "learning_rate": 9.566672176333153e-06, + "loss": 0.0, + "step": 9700 + }, + { + "epoch": 0.04337745533819673, + "grad_norm": 0.10802660882472992, + "learning_rate": 9.566225446618034e-06, + "loss": 0.0, + "step": 9710 + }, + { + "epoch": 0.04342212830970878, + "grad_norm": 0.00029546706355176866, + "learning_rate": 9.565778716902912e-06, + "loss": 0.0007, + "step": 9720 + }, + { + "epoch": 0.043466801281220824, + "grad_norm": 2.035315219575963e-16, + "learning_rate": 9.565331987187792e-06, + "loss": 0.1061, + "step": 9730 + }, + { + "epoch": 0.04351147425273287, + "grad_norm": 1.4710154379587049e-15, + "learning_rate": 9.564885257472672e-06, + "loss": 0.0001, + "step": 9740 + }, + { + "epoch": 0.04355614722424492, + "grad_norm": 3.584343325618855e-12, + "learning_rate": 9.56443852775755e-06, + "loss": 0.0008, + "step": 9750 + }, + { + "epoch": 0.04360082019575696, + "grad_norm": 8.707693837095576e-07, + "learning_rate": 9.56399179804243e-06, + "loss": 0.0003, + "step": 9760 + }, + { + "epoch": 0.043645493167269005, + "grad_norm": 95.9183578491211, + "learning_rate": 9.563545068327311e-06, + "loss": 0.4336, + "step": 9770 + }, + { + "epoch": 0.04369016613878105, + "grad_norm": 5.015752813086749e-10, + "learning_rate": 9.56309833861219e-06, + "loss": 0.0001, + "step": 9780 + }, + { + "epoch": 0.0437348391102931, + "grad_norm": 9.280040831072256e-05, + "learning_rate": 9.56265160889707e-06, + "loss": 0.0016, + "step": 9790 + }, + { + "epoch": 0.043779512081805146, + "grad_norm": 2.6840293685381766e-06, + "learning_rate": 9.56220487918195e-06, + "loss": 0.0, + "step": 9800 + }, + { + "epoch": 0.04382418505331719, + "grad_norm": 4.250260190019617e-06, + "learning_rate": 9.561758149466828e-06, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.04386885802482924, + "grad_norm": 4.036125392303802e-05, + "learning_rate": 9.561311419751708e-06, + "loss": 0.0016, + "step": 9820 + }, + { + "epoch": 0.043913530996341286, + "grad_norm": 7.597957960570056e-07, + "learning_rate": 9.560864690036588e-06, + "loss": 0.0001, + "step": 9830 + }, + { + "epoch": 0.043958203967853327, + "grad_norm": 4.901462014039737e-10, + "learning_rate": 9.560417960321467e-06, + "loss": 0.0002, + "step": 9840 + }, + { + "epoch": 0.044002876939365373, + "grad_norm": 1.544968666888913e-11, + "learning_rate": 9.559971230606347e-06, + "loss": 0.0479, + "step": 9850 + }, + { + "epoch": 0.04404754991087742, + "grad_norm": 6.470037006156559e-11, + "learning_rate": 9.559524500891227e-06, + "loss": 0.0, + "step": 9860 + }, + { + "epoch": 0.04409222288238947, + "grad_norm": 4.1798831063033504e-10, + "learning_rate": 9.559077771176107e-06, + "loss": 0.0, + "step": 9870 + }, + { + "epoch": 0.044136895853901514, + "grad_norm": 3.472337972709738e-10, + "learning_rate": 9.558631041460986e-06, + "loss": 0.0387, + "step": 9880 + }, + { + "epoch": 0.04418156882541356, + "grad_norm": 1.1115545604182403e-15, + "learning_rate": 9.558184311745866e-06, + "loss": 0.0, + "step": 9890 + }, + { + "epoch": 0.04422624179692561, + "grad_norm": 0.0023215063847601414, + "learning_rate": 9.557737582030746e-06, + "loss": 0.0072, + "step": 9900 + }, + { + "epoch": 0.044270914768437655, + "grad_norm": 0.0007233454380184412, + "learning_rate": 9.557290852315624e-06, + "loss": 0.0, + "step": 9910 + }, + { + "epoch": 0.044315587739949695, + "grad_norm": 373.0455017089844, + "learning_rate": 9.556844122600504e-06, + "loss": 0.1313, + "step": 9920 + }, + { + "epoch": 0.04436026071146174, + "grad_norm": 4.1237175452124575e-08, + "learning_rate": 9.556397392885384e-06, + "loss": 0.0, + "step": 9930 + }, + { + "epoch": 0.04440493368297379, + "grad_norm": 3.873950119026581e-11, + "learning_rate": 9.555950663170263e-06, + "loss": 0.0, + "step": 9940 + }, + { + "epoch": 0.044449606654485836, + "grad_norm": 1.5232348442077637, + "learning_rate": 9.555503933455143e-06, + "loss": 0.0004, + "step": 9950 + }, + { + "epoch": 0.04449427962599788, + "grad_norm": 0.0005546921165660024, + "learning_rate": 9.555057203740023e-06, + "loss": 0.0003, + "step": 9960 + }, + { + "epoch": 0.04453895259750993, + "grad_norm": 2.327199405405267e-10, + "learning_rate": 9.554610474024902e-06, + "loss": 0.012, + "step": 9970 + }, + { + "epoch": 0.04458362556902198, + "grad_norm": 2.1845164566158815e-11, + "learning_rate": 9.554163744309782e-06, + "loss": 0.0, + "step": 9980 + }, + { + "epoch": 0.044628298540534024, + "grad_norm": 8.4572713276998e-09, + "learning_rate": 9.55371701459466e-06, + "loss": 0.002, + "step": 9990 + }, + { + "epoch": 0.044672971512046064, + "grad_norm": 1.1127020528925446e-12, + "learning_rate": 9.55327028487954e-06, + "loss": 0.0011, + "step": 10000 + }, + { + "epoch": 0.04471764448355811, + "grad_norm": 4.5678115156988497e-07, + "learning_rate": 9.55282355516442e-06, + "loss": 0.0031, + "step": 10010 + }, + { + "epoch": 0.04476231745507016, + "grad_norm": 9.302107810974121, + "learning_rate": 9.552376825449299e-06, + "loss": 0.002, + "step": 10020 + }, + { + "epoch": 0.044806990426582205, + "grad_norm": 1.2560059498803889e-14, + "learning_rate": 9.551930095734179e-06, + "loss": 0.0, + "step": 10030 + }, + { + "epoch": 0.04485166339809425, + "grad_norm": 0.05222535878419876, + "learning_rate": 9.551483366019059e-06, + "loss": 0.0004, + "step": 10040 + }, + { + "epoch": 0.0448963363696063, + "grad_norm": 0.3058798015117645, + "learning_rate": 9.551036636303937e-06, + "loss": 0.3173, + "step": 10050 + }, + { + "epoch": 0.044941009341118346, + "grad_norm": 3.920735707652057e-06, + "learning_rate": 9.550589906588818e-06, + "loss": 0.0, + "step": 10060 + }, + { + "epoch": 0.044985682312630386, + "grad_norm": 9.553231166137266e-07, + "learning_rate": 9.550143176873698e-06, + "loss": 0.0, + "step": 10070 + }, + { + "epoch": 0.04503035528414243, + "grad_norm": 0.06729566305875778, + "learning_rate": 9.549696447158576e-06, + "loss": 0.001, + "step": 10080 + }, + { + "epoch": 0.04507502825565448, + "grad_norm": 4.8741655156447905e-09, + "learning_rate": 9.549249717443456e-06, + "loss": 0.0, + "step": 10090 + }, + { + "epoch": 0.04511970122716653, + "grad_norm": 5.887299980855687e-09, + "learning_rate": 9.548802987728335e-06, + "loss": 0.7375, + "step": 10100 + }, + { + "epoch": 0.045164374198678574, + "grad_norm": 2.405948407613323e-06, + "learning_rate": 9.548356258013215e-06, + "loss": 0.0, + "step": 10110 + }, + { + "epoch": 0.04520904717019062, + "grad_norm": 7.378541950608247e-14, + "learning_rate": 9.547909528298095e-06, + "loss": 0.0, + "step": 10120 + }, + { + "epoch": 0.04525372014170267, + "grad_norm": 3.9999552281511086e-11, + "learning_rate": 9.547462798582973e-06, + "loss": 0.0, + "step": 10130 + }, + { + "epoch": 0.045298393113214715, + "grad_norm": 1.4985769780651026e-07, + "learning_rate": 9.547016068867854e-06, + "loss": 0.0006, + "step": 10140 + }, + { + "epoch": 0.045343066084726755, + "grad_norm": 5.634565353393555, + "learning_rate": 9.546569339152734e-06, + "loss": 0.0018, + "step": 10150 + }, + { + "epoch": 0.0453877390562388, + "grad_norm": 9.927728861214291e-09, + "learning_rate": 9.546122609437612e-06, + "loss": 0.0079, + "step": 10160 + }, + { + "epoch": 0.04543241202775085, + "grad_norm": 9.97686100401296e-11, + "learning_rate": 9.545675879722492e-06, + "loss": 0.0001, + "step": 10170 + }, + { + "epoch": 0.045477084999262896, + "grad_norm": 2.851215263888207e-09, + "learning_rate": 9.54522915000737e-06, + "loss": 0.3344, + "step": 10180 + }, + { + "epoch": 0.04552175797077494, + "grad_norm": 5.79595762246754e-05, + "learning_rate": 9.54478242029225e-06, + "loss": 0.0, + "step": 10190 + }, + { + "epoch": 0.04556643094228699, + "grad_norm": 1.3934897680201175e-10, + "learning_rate": 9.544335690577131e-06, + "loss": 0.0049, + "step": 10200 + }, + { + "epoch": 0.045611103913799037, + "grad_norm": 8.2845375004581e-10, + "learning_rate": 9.54388896086201e-06, + "loss": 0.0017, + "step": 10210 + }, + { + "epoch": 0.04565577688531108, + "grad_norm": 6.951763165119473e-09, + "learning_rate": 9.54344223114689e-06, + "loss": 0.0, + "step": 10220 + }, + { + "epoch": 0.045700449856823123, + "grad_norm": 5.074342638786145e-13, + "learning_rate": 9.54299550143177e-06, + "loss": 0.0011, + "step": 10230 + }, + { + "epoch": 0.04574512282833517, + "grad_norm": 1.7860834589100705e-07, + "learning_rate": 9.542548771716648e-06, + "loss": 0.2344, + "step": 10240 + }, + { + "epoch": 0.04578979579984722, + "grad_norm": 4.644238651962951e-05, + "learning_rate": 9.542102042001528e-06, + "loss": 0.0128, + "step": 10250 + }, + { + "epoch": 0.045834468771359264, + "grad_norm": 2.7153768655807653e-07, + "learning_rate": 9.541655312286408e-06, + "loss": 0.0, + "step": 10260 + }, + { + "epoch": 0.04587914174287131, + "grad_norm": 3.4437507565598935e-06, + "learning_rate": 9.541208582571287e-06, + "loss": 0.0, + "step": 10270 + }, + { + "epoch": 0.04592381471438336, + "grad_norm": 0.2568793296813965, + "learning_rate": 9.540761852856167e-06, + "loss": 0.003, + "step": 10280 + }, + { + "epoch": 0.045968487685895405, + "grad_norm": 1.8173045290836853e-08, + "learning_rate": 9.540315123141047e-06, + "loss": 0.0, + "step": 10290 + }, + { + "epoch": 0.046013160657407445, + "grad_norm": 2.1706417316792043e-13, + "learning_rate": 9.539868393425925e-06, + "loss": 0.0, + "step": 10300 + }, + { + "epoch": 0.04605783362891949, + "grad_norm": 1.2277396981008604e-13, + "learning_rate": 9.539421663710805e-06, + "loss": 0.0001, + "step": 10310 + }, + { + "epoch": 0.04610250660043154, + "grad_norm": 1.0452093635038295e-09, + "learning_rate": 9.538974933995686e-06, + "loss": 0.0, + "step": 10320 + }, + { + "epoch": 0.046147179571943586, + "grad_norm": 1.810579511568166e-14, + "learning_rate": 9.538528204280566e-06, + "loss": 0.0579, + "step": 10330 + }, + { + "epoch": 0.04619185254345563, + "grad_norm": 1.3487329308947693e-10, + "learning_rate": 9.538081474565444e-06, + "loss": 0.0011, + "step": 10340 + }, + { + "epoch": 0.04623652551496768, + "grad_norm": 1.3384500618940365e-07, + "learning_rate": 9.537634744850324e-06, + "loss": 0.0038, + "step": 10350 + }, + { + "epoch": 0.04628119848647973, + "grad_norm": 0.007952453568577766, + "learning_rate": 9.537188015135204e-06, + "loss": 0.0, + "step": 10360 + }, + { + "epoch": 0.046325871457991774, + "grad_norm": 1.5257786353806146e-11, + "learning_rate": 9.536741285420083e-06, + "loss": 0.0026, + "step": 10370 + }, + { + "epoch": 0.046370544429503814, + "grad_norm": 1.2612064357149988e-15, + "learning_rate": 9.536294555704963e-06, + "loss": 0.0, + "step": 10380 + }, + { + "epoch": 0.04641521740101586, + "grad_norm": 5.068566660132445e-12, + "learning_rate": 9.535847825989843e-06, + "loss": 0.0001, + "step": 10390 + }, + { + "epoch": 0.04645989037252791, + "grad_norm": 1.833808141782356e-06, + "learning_rate": 9.535401096274722e-06, + "loss": 0.0, + "step": 10400 + }, + { + "epoch": 0.046504563344039955, + "grad_norm": 1.5389470297069718e-12, + "learning_rate": 9.534954366559602e-06, + "loss": 0.0003, + "step": 10410 + }, + { + "epoch": 0.046549236315552, + "grad_norm": 1.1092303742188392e-09, + "learning_rate": 9.534507636844482e-06, + "loss": 0.0, + "step": 10420 + }, + { + "epoch": 0.04659390928706405, + "grad_norm": 2.9077733643134707e-07, + "learning_rate": 9.53406090712936e-06, + "loss": 1.6, + "step": 10430 + }, + { + "epoch": 0.046638582258576096, + "grad_norm": 0.9567630887031555, + "learning_rate": 9.53361417741424e-06, + "loss": 0.0018, + "step": 10440 + }, + { + "epoch": 0.04668325523008814, + "grad_norm": 2.428366697417234e-17, + "learning_rate": 9.53316744769912e-06, + "loss": 0.0011, + "step": 10450 + }, + { + "epoch": 0.04672792820160018, + "grad_norm": 1.1588655368655054e-08, + "learning_rate": 9.532720717983999e-06, + "loss": 0.0, + "step": 10460 + }, + { + "epoch": 0.04677260117311223, + "grad_norm": 2.9958778191655497e-15, + "learning_rate": 9.532273988268879e-06, + "loss": 0.0, + "step": 10470 + }, + { + "epoch": 0.04681727414462428, + "grad_norm": 1.0342333933710535e-10, + "learning_rate": 9.531827258553757e-06, + "loss": 0.0, + "step": 10480 + }, + { + "epoch": 0.046861947116136324, + "grad_norm": 1.4794973024478963e-09, + "learning_rate": 9.531380528838638e-06, + "loss": 0.1307, + "step": 10490 + }, + { + "epoch": 0.04690662008764837, + "grad_norm": 7.921020135919117e-16, + "learning_rate": 9.530933799123518e-06, + "loss": 0.0537, + "step": 10500 + }, + { + "epoch": 0.04695129305916042, + "grad_norm": 2.399570348643465e-06, + "learning_rate": 9.530487069408396e-06, + "loss": 0.0, + "step": 10510 + }, + { + "epoch": 0.046995966030672465, + "grad_norm": 0.003332250751554966, + "learning_rate": 9.530040339693276e-06, + "loss": 0.1141, + "step": 10520 + }, + { + "epoch": 0.04704063900218451, + "grad_norm": 0.001612229272723198, + "learning_rate": 9.529593609978156e-06, + "loss": 0.0, + "step": 10530 + }, + { + "epoch": 0.04708531197369655, + "grad_norm": 0.017735157161951065, + "learning_rate": 9.529146880263035e-06, + "loss": 0.0006, + "step": 10540 + }, + { + "epoch": 0.0471299849452086, + "grad_norm": 0.0006999396719038486, + "learning_rate": 9.528700150547915e-06, + "loss": 0.0, + "step": 10550 + }, + { + "epoch": 0.047174657916720646, + "grad_norm": 0.00042073967051692307, + "learning_rate": 9.528253420832795e-06, + "loss": 0.0113, + "step": 10560 + }, + { + "epoch": 0.04721933088823269, + "grad_norm": 2.5330312070082073e-08, + "learning_rate": 9.527806691117674e-06, + "loss": 0.0001, + "step": 10570 + }, + { + "epoch": 0.04726400385974474, + "grad_norm": 0.04455679655075073, + "learning_rate": 9.527359961402554e-06, + "loss": 0.0201, + "step": 10580 + }, + { + "epoch": 0.047308676831256787, + "grad_norm": 2.771469165274709e-10, + "learning_rate": 9.526913231687432e-06, + "loss": 0.0834, + "step": 10590 + }, + { + "epoch": 0.047353349802768833, + "grad_norm": 3.395351555468551e-09, + "learning_rate": 9.526466501972312e-06, + "loss": 0.0908, + "step": 10600 + }, + { + "epoch": 0.047398022774280874, + "grad_norm": 2.7118030629935674e-06, + "learning_rate": 9.526019772257192e-06, + "loss": 0.0008, + "step": 10610 + }, + { + "epoch": 0.04744269574579292, + "grad_norm": 4.103829860687256, + "learning_rate": 9.52557304254207e-06, + "loss": 0.0011, + "step": 10620 + }, + { + "epoch": 0.04748736871730497, + "grad_norm": 6.712851609336212e-05, + "learning_rate": 9.525126312826951e-06, + "loss": 0.0, + "step": 10630 + }, + { + "epoch": 0.047532041688817014, + "grad_norm": 6.580361514352262e-06, + "learning_rate": 9.524679583111831e-06, + "loss": 0.0001, + "step": 10640 + }, + { + "epoch": 0.04757671466032906, + "grad_norm": 0.016762414947152138, + "learning_rate": 9.52423285339671e-06, + "loss": 0.2344, + "step": 10650 + }, + { + "epoch": 0.04762138763184111, + "grad_norm": 2.72741681670019e-12, + "learning_rate": 9.52378612368159e-06, + "loss": 0.0, + "step": 10660 + }, + { + "epoch": 0.047666060603353155, + "grad_norm": 1.9086164940174122e-11, + "learning_rate": 9.52333939396647e-06, + "loss": 0.0, + "step": 10670 + }, + { + "epoch": 0.0477107335748652, + "grad_norm": 2.0200214834176222e-09, + "learning_rate": 9.522892664251348e-06, + "loss": 0.0, + "step": 10680 + }, + { + "epoch": 0.04775540654637724, + "grad_norm": 3.076793291256763e-06, + "learning_rate": 9.522445934536228e-06, + "loss": 0.2235, + "step": 10690 + }, + { + "epoch": 0.04780007951788929, + "grad_norm": 1.306749939918518, + "learning_rate": 9.521999204821107e-06, + "loss": 0.0005, + "step": 10700 + }, + { + "epoch": 0.047844752489401336, + "grad_norm": 0.004337243270128965, + "learning_rate": 9.521552475105987e-06, + "loss": 0.055, + "step": 10710 + }, + { + "epoch": 0.04788942546091338, + "grad_norm": 1.5169650424695647e-08, + "learning_rate": 9.521105745390867e-06, + "loss": 0.5752, + "step": 10720 + }, + { + "epoch": 0.04793409843242543, + "grad_norm": 1.403236971064814e-10, + "learning_rate": 9.520659015675745e-06, + "loss": 0.0062, + "step": 10730 + }, + { + "epoch": 0.04797877140393748, + "grad_norm": 9.81935127826461e-13, + "learning_rate": 9.520212285960625e-06, + "loss": 0.0, + "step": 10740 + }, + { + "epoch": 0.048023444375449524, + "grad_norm": 8.163741767930333e-06, + "learning_rate": 9.519765556245506e-06, + "loss": 0.0, + "step": 10750 + }, + { + "epoch": 0.04806811734696157, + "grad_norm": 7.663325529705389e-10, + "learning_rate": 9.519318826530384e-06, + "loss": 0.0, + "step": 10760 + }, + { + "epoch": 0.04811279031847361, + "grad_norm": 5.807218551635742, + "learning_rate": 9.518872096815264e-06, + "loss": 0.0007, + "step": 10770 + }, + { + "epoch": 0.04815746328998566, + "grad_norm": 9.717787555554858e-13, + "learning_rate": 9.518425367100144e-06, + "loss": 0.0001, + "step": 10780 + }, + { + "epoch": 0.048202136261497705, + "grad_norm": 2.0461568055907264e-05, + "learning_rate": 9.517978637385024e-06, + "loss": 0.0003, + "step": 10790 + }, + { + "epoch": 0.04824680923300975, + "grad_norm": 0.04437157139182091, + "learning_rate": 9.517531907669903e-06, + "loss": 0.0003, + "step": 10800 + }, + { + "epoch": 0.0482914822045218, + "grad_norm": 1.7951228618621826, + "learning_rate": 9.517085177954783e-06, + "loss": 0.2804, + "step": 10810 + }, + { + "epoch": 0.048336155176033846, + "grad_norm": 3.078026722391769e-10, + "learning_rate": 9.516638448239663e-06, + "loss": 0.0, + "step": 10820 + }, + { + "epoch": 0.04838082814754589, + "grad_norm": 0.0025664654094725847, + "learning_rate": 9.516191718524542e-06, + "loss": 0.0, + "step": 10830 + }, + { + "epoch": 0.04842550111905793, + "grad_norm": 2.2002347860317073e-12, + "learning_rate": 9.515744988809422e-06, + "loss": 0.0, + "step": 10840 + }, + { + "epoch": 0.04847017409056998, + "grad_norm": 0.11011355370283127, + "learning_rate": 9.515298259094302e-06, + "loss": 0.0011, + "step": 10850 + }, + { + "epoch": 0.04851484706208203, + "grad_norm": 1.5746836652397178e-05, + "learning_rate": 9.51485152937918e-06, + "loss": 0.0, + "step": 10860 + }, + { + "epoch": 0.048559520033594074, + "grad_norm": 4.677225717841793e-09, + "learning_rate": 9.51440479966406e-06, + "loss": 0.0, + "step": 10870 + }, + { + "epoch": 0.04860419300510612, + "grad_norm": 5.160199931198914e-14, + "learning_rate": 9.51395806994894e-06, + "loss": 0.0, + "step": 10880 + }, + { + "epoch": 0.04864886597661817, + "grad_norm": 8.101441184571456e-10, + "learning_rate": 9.513511340233819e-06, + "loss": 0.0036, + "step": 10890 + }, + { + "epoch": 0.048693538948130215, + "grad_norm": 1.0413211626314478e-08, + "learning_rate": 9.513064610518699e-06, + "loss": 0.0001, + "step": 10900 + }, + { + "epoch": 0.04873821191964226, + "grad_norm": 15.043107986450195, + "learning_rate": 9.512617880803579e-06, + "loss": 0.0028, + "step": 10910 + }, + { + "epoch": 0.0487828848911543, + "grad_norm": 8.022602256184484e-15, + "learning_rate": 9.512171151088458e-06, + "loss": 0.0, + "step": 10920 + }, + { + "epoch": 0.04882755786266635, + "grad_norm": 11.803953170776367, + "learning_rate": 9.511724421373338e-06, + "loss": 0.0745, + "step": 10930 + }, + { + "epoch": 0.048872230834178396, + "grad_norm": 4.265514608192689e-09, + "learning_rate": 9.511277691658218e-06, + "loss": 0.0034, + "step": 10940 + }, + { + "epoch": 0.04891690380569044, + "grad_norm": 0.0013865844812244177, + "learning_rate": 9.510830961943096e-06, + "loss": 0.0004, + "step": 10950 + }, + { + "epoch": 0.04896157677720249, + "grad_norm": 2.2187156278885567e-10, + "learning_rate": 9.510384232227976e-06, + "loss": 0.0002, + "step": 10960 + }, + { + "epoch": 0.04900624974871454, + "grad_norm": 4.963759181464411e-09, + "learning_rate": 9.509937502512857e-06, + "loss": 0.0202, + "step": 10970 + }, + { + "epoch": 0.049050922720226584, + "grad_norm": 0.00019618085934780538, + "learning_rate": 9.509490772797735e-06, + "loss": 0.2945, + "step": 10980 + }, + { + "epoch": 0.04909559569173863, + "grad_norm": 0.0008185982587747276, + "learning_rate": 9.509044043082615e-06, + "loss": 0.0, + "step": 10990 + }, + { + "epoch": 0.04914026866325067, + "grad_norm": 0.4256277084350586, + "learning_rate": 9.508597313367494e-06, + "loss": 0.0, + "step": 11000 + }, + { + "epoch": 0.04918494163476272, + "grad_norm": 5.677510305957355e-17, + "learning_rate": 9.508150583652374e-06, + "loss": 0.0, + "step": 11010 + }, + { + "epoch": 0.049229614606274764, + "grad_norm": 1.41667380951227e-11, + "learning_rate": 9.507703853937254e-06, + "loss": 0.0039, + "step": 11020 + }, + { + "epoch": 0.04927428757778681, + "grad_norm": 6.928135287687098e-10, + "learning_rate": 9.507257124222132e-06, + "loss": 0.0005, + "step": 11030 + }, + { + "epoch": 0.04931896054929886, + "grad_norm": 5.0077971536666155e-05, + "learning_rate": 9.506810394507012e-06, + "loss": 0.0003, + "step": 11040 + }, + { + "epoch": 0.049363633520810905, + "grad_norm": 1.8108100088842782e-11, + "learning_rate": 9.506363664791892e-06, + "loss": 0.0, + "step": 11050 + }, + { + "epoch": 0.04940830649232295, + "grad_norm": 0.032544512301683426, + "learning_rate": 9.505916935076771e-06, + "loss": 0.0, + "step": 11060 + }, + { + "epoch": 0.049452979463835, + "grad_norm": 2.02817362904284e-09, + "learning_rate": 9.505470205361651e-06, + "loss": 0.0, + "step": 11070 + }, + { + "epoch": 0.04949765243534704, + "grad_norm": 7.679974820158642e-13, + "learning_rate": 9.50502347564653e-06, + "loss": 0.039, + "step": 11080 + }, + { + "epoch": 0.049542325406859086, + "grad_norm": 1.672820673093156e-07, + "learning_rate": 9.50457674593141e-06, + "loss": 0.0, + "step": 11090 + }, + { + "epoch": 0.04958699837837113, + "grad_norm": 3.4813847804571196e-08, + "learning_rate": 9.50413001621629e-06, + "loss": 0.0, + "step": 11100 + }, + { + "epoch": 0.04963167134988318, + "grad_norm": 5.6135419229663697e-11, + "learning_rate": 9.503683286501168e-06, + "loss": 0.003, + "step": 11110 + }, + { + "epoch": 0.04967634432139523, + "grad_norm": 1.9980174329248257e-05, + "learning_rate": 9.503236556786048e-06, + "loss": 0.0, + "step": 11120 + }, + { + "epoch": 0.049721017292907274, + "grad_norm": 0.5705450177192688, + "learning_rate": 9.502789827070928e-06, + "loss": 0.0001, + "step": 11130 + }, + { + "epoch": 0.04976569026441932, + "grad_norm": 0.0001016234455164522, + "learning_rate": 9.502343097355807e-06, + "loss": 0.0, + "step": 11140 + }, + { + "epoch": 0.04981036323593136, + "grad_norm": 386.708740234375, + "learning_rate": 9.501896367640687e-06, + "loss": 0.1914, + "step": 11150 + }, + { + "epoch": 0.04985503620744341, + "grad_norm": 0.0004175099020358175, + "learning_rate": 9.501449637925567e-06, + "loss": 0.106, + "step": 11160 + }, + { + "epoch": 0.049899709178955455, + "grad_norm": 6.118619921835489e-07, + "learning_rate": 9.501002908210445e-06, + "loss": 0.001, + "step": 11170 + }, + { + "epoch": 0.0499443821504675, + "grad_norm": 8.625532510464495e-10, + "learning_rate": 9.500556178495326e-06, + "loss": 0.001, + "step": 11180 + }, + { + "epoch": 0.04998905512197955, + "grad_norm": 0.8630591630935669, + "learning_rate": 9.500109448780204e-06, + "loss": 0.0001, + "step": 11190 + }, + { + "epoch": 0.050033728093491596, + "grad_norm": 1.435045252117309e-14, + "learning_rate": 9.499662719065084e-06, + "loss": 0.1915, + "step": 11200 + }, + { + "epoch": 0.05007840106500364, + "grad_norm": 6.082538444272811e-13, + "learning_rate": 9.499215989349964e-06, + "loss": 0.0006, + "step": 11210 + }, + { + "epoch": 0.05012307403651569, + "grad_norm": 1.1062415462565877e-14, + "learning_rate": 9.498769259634843e-06, + "loss": 0.0009, + "step": 11220 + }, + { + "epoch": 0.05016774700802773, + "grad_norm": 5.843751864631486e-07, + "learning_rate": 9.498322529919723e-06, + "loss": 0.0, + "step": 11230 + }, + { + "epoch": 0.05021241997953978, + "grad_norm": 3.578229046397041e-09, + "learning_rate": 9.497875800204603e-06, + "loss": 0.0143, + "step": 11240 + }, + { + "epoch": 0.050257092951051824, + "grad_norm": 9.63085056149282e-10, + "learning_rate": 9.497429070489483e-06, + "loss": 0.0001, + "step": 11250 + }, + { + "epoch": 0.05030176592256387, + "grad_norm": 2.2219956008484587e-06, + "learning_rate": 9.496982340774362e-06, + "loss": 0.0079, + "step": 11260 + }, + { + "epoch": 0.05034643889407592, + "grad_norm": 0.00028614644543267787, + "learning_rate": 9.496535611059242e-06, + "loss": 0.0, + "step": 11270 + }, + { + "epoch": 0.050391111865587965, + "grad_norm": 1.2245513200759888, + "learning_rate": 9.496088881344122e-06, + "loss": 0.0977, + "step": 11280 + }, + { + "epoch": 0.05043578483710001, + "grad_norm": 1.8125715636535186e-14, + "learning_rate": 9.495642151629e-06, + "loss": 0.033, + "step": 11290 + }, + { + "epoch": 0.05048045780861206, + "grad_norm": 2.919892637519514e-17, + "learning_rate": 9.49519542191388e-06, + "loss": 0.1024, + "step": 11300 + }, + { + "epoch": 0.0505251307801241, + "grad_norm": 1.0463354798752062e-14, + "learning_rate": 9.49474869219876e-06, + "loss": 0.0049, + "step": 11310 + }, + { + "epoch": 0.050569803751636146, + "grad_norm": 34.62099838256836, + "learning_rate": 9.494301962483639e-06, + "loss": 0.0073, + "step": 11320 + }, + { + "epoch": 0.05061447672314819, + "grad_norm": 0.00027098608552478254, + "learning_rate": 9.493855232768519e-06, + "loss": 0.0001, + "step": 11330 + }, + { + "epoch": 0.05065914969466024, + "grad_norm": 1.0252830806578572e-09, + "learning_rate": 9.493408503053399e-06, + "loss": 0.0002, + "step": 11340 + }, + { + "epoch": 0.05070382266617229, + "grad_norm": 2.8664729959234844e-14, + "learning_rate": 9.492961773338278e-06, + "loss": 0.0002, + "step": 11350 + }, + { + "epoch": 0.050748495637684334, + "grad_norm": 9.993117419071496e-05, + "learning_rate": 9.492515043623158e-06, + "loss": 0.0001, + "step": 11360 + }, + { + "epoch": 0.05079316860919638, + "grad_norm": 5.520075515082907e-21, + "learning_rate": 9.492068313908038e-06, + "loss": 0.0, + "step": 11370 + }, + { + "epoch": 0.05083784158070842, + "grad_norm": 5.1444808377709705e-06, + "learning_rate": 9.491621584192916e-06, + "loss": 0.0913, + "step": 11380 + }, + { + "epoch": 0.05088251455222047, + "grad_norm": 1.1695376223030962e-08, + "learning_rate": 9.491174854477796e-06, + "loss": 0.0, + "step": 11390 + }, + { + "epoch": 0.050927187523732514, + "grad_norm": 0.02048194594681263, + "learning_rate": 9.490728124762677e-06, + "loss": 0.0, + "step": 11400 + }, + { + "epoch": 0.05097186049524456, + "grad_norm": 5.23670157530267e-14, + "learning_rate": 9.490281395047555e-06, + "loss": 0.0038, + "step": 11410 + }, + { + "epoch": 0.05101653346675661, + "grad_norm": 0.026476258412003517, + "learning_rate": 9.489834665332435e-06, + "loss": 0.0001, + "step": 11420 + }, + { + "epoch": 0.051061206438268655, + "grad_norm": 0.005338937975466251, + "learning_rate": 9.489387935617315e-06, + "loss": 0.0001, + "step": 11430 + }, + { + "epoch": 0.0511058794097807, + "grad_norm": 4.433853519406739e-16, + "learning_rate": 9.488941205902194e-06, + "loss": 0.0, + "step": 11440 + }, + { + "epoch": 0.05115055238129275, + "grad_norm": 7.238606358651143e-19, + "learning_rate": 9.488494476187074e-06, + "loss": 0.0001, + "step": 11450 + }, + { + "epoch": 0.05119522535280479, + "grad_norm": 9.190580222318175e-15, + "learning_rate": 9.488047746471954e-06, + "loss": 0.0006, + "step": 11460 + }, + { + "epoch": 0.051239898324316836, + "grad_norm": 3.4059677279062173e-13, + "learning_rate": 9.487601016756832e-06, + "loss": 0.0824, + "step": 11470 + }, + { + "epoch": 0.05128457129582888, + "grad_norm": 2.744744051597081e-05, + "learning_rate": 9.487154287041712e-06, + "loss": 0.003, + "step": 11480 + }, + { + "epoch": 0.05132924426734093, + "grad_norm": 7.354658548841542e-12, + "learning_rate": 9.486707557326591e-06, + "loss": 0.1917, + "step": 11490 + }, + { + "epoch": 0.05137391723885298, + "grad_norm": 1.0190446924135594e-16, + "learning_rate": 9.486260827611471e-06, + "loss": 0.0, + "step": 11500 + }, + { + "epoch": 0.051418590210365024, + "grad_norm": 1.3966475975735193e-08, + "learning_rate": 9.485814097896351e-06, + "loss": 0.0, + "step": 11510 + }, + { + "epoch": 0.05146326318187707, + "grad_norm": 0.02896651066839695, + "learning_rate": 9.48536736818123e-06, + "loss": 0.0018, + "step": 11520 + }, + { + "epoch": 0.05150793615338912, + "grad_norm": 4.734069852709716e-14, + "learning_rate": 9.48492063846611e-06, + "loss": 0.0004, + "step": 11530 + }, + { + "epoch": 0.05155260912490116, + "grad_norm": 0.000538403692189604, + "learning_rate": 9.48447390875099e-06, + "loss": 0.0, + "step": 11540 + }, + { + "epoch": 0.051597282096413205, + "grad_norm": 1.768347804070782e-14, + "learning_rate": 9.484027179035868e-06, + "loss": 0.0009, + "step": 11550 + }, + { + "epoch": 0.05164195506792525, + "grad_norm": 1.6111727507706064e-09, + "learning_rate": 9.483580449320748e-06, + "loss": 0.0005, + "step": 11560 + }, + { + "epoch": 0.0516866280394373, + "grad_norm": 5.3446917031019225e-11, + "learning_rate": 9.483133719605627e-06, + "loss": 0.0003, + "step": 11570 + }, + { + "epoch": 0.051731301010949346, + "grad_norm": 2.6434767264049697e-08, + "learning_rate": 9.482686989890507e-06, + "loss": 0.1141, + "step": 11580 + }, + { + "epoch": 0.05177597398246139, + "grad_norm": 1.753207024002279e-10, + "learning_rate": 9.482240260175387e-06, + "loss": 0.0062, + "step": 11590 + }, + { + "epoch": 0.05182064695397344, + "grad_norm": 3.0608987566793644e-11, + "learning_rate": 9.481793530460265e-06, + "loss": 0.0002, + "step": 11600 + }, + { + "epoch": 0.05186531992548549, + "grad_norm": 1.7744504754446666e-09, + "learning_rate": 9.481346800745146e-06, + "loss": 0.0022, + "step": 11610 + }, + { + "epoch": 0.05190999289699753, + "grad_norm": 4.3283048967168725e-07, + "learning_rate": 9.480900071030026e-06, + "loss": 0.0, + "step": 11620 + }, + { + "epoch": 0.051954665868509574, + "grad_norm": 3.8451598811661825e-05, + "learning_rate": 9.480453341314904e-06, + "loss": 0.0, + "step": 11630 + }, + { + "epoch": 0.05199933884002162, + "grad_norm": 4.421446889169012e-11, + "learning_rate": 9.480006611599784e-06, + "loss": 0.0, + "step": 11640 + }, + { + "epoch": 0.05204401181153367, + "grad_norm": 0.0026478637009859085, + "learning_rate": 9.479559881884664e-06, + "loss": 0.0899, + "step": 11650 + }, + { + "epoch": 0.052088684783045715, + "grad_norm": 3.592525899875909e-05, + "learning_rate": 9.479113152169543e-06, + "loss": 0.0, + "step": 11660 + }, + { + "epoch": 0.05213335775455776, + "grad_norm": 1.283037465850837e-16, + "learning_rate": 9.478666422454423e-06, + "loss": 0.0, + "step": 11670 + }, + { + "epoch": 0.05217803072606981, + "grad_norm": 0.014142933301627636, + "learning_rate": 9.478219692739301e-06, + "loss": 0.0, + "step": 11680 + }, + { + "epoch": 0.05222270369758185, + "grad_norm": 9.994188076234423e-06, + "learning_rate": 9.477772963024182e-06, + "loss": 0.0, + "step": 11690 + }, + { + "epoch": 0.052267376669093896, + "grad_norm": 0.0016839229501783848, + "learning_rate": 9.477326233309062e-06, + "loss": 0.0, + "step": 11700 + }, + { + "epoch": 0.05231204964060594, + "grad_norm": 0.00028825466870330274, + "learning_rate": 9.476879503593942e-06, + "loss": 0.0387, + "step": 11710 + }, + { + "epoch": 0.05235672261211799, + "grad_norm": 2.1650969905806505e-10, + "learning_rate": 9.47643277387882e-06, + "loss": 0.0001, + "step": 11720 + }, + { + "epoch": 0.05240139558363004, + "grad_norm": 8.670284842082765e-06, + "learning_rate": 9.4759860441637e-06, + "loss": 0.0, + "step": 11730 + }, + { + "epoch": 0.052446068555142084, + "grad_norm": 1.1954272167713498e-06, + "learning_rate": 9.47553931444858e-06, + "loss": 0.0006, + "step": 11740 + }, + { + "epoch": 0.05249074152665413, + "grad_norm": 4.82285622638301e-06, + "learning_rate": 9.475092584733459e-06, + "loss": 0.0826, + "step": 11750 + }, + { + "epoch": 0.05253541449816618, + "grad_norm": 4.6414779575343346e-08, + "learning_rate": 9.474645855018339e-06, + "loss": 0.0035, + "step": 11760 + }, + { + "epoch": 0.05258008746967822, + "grad_norm": 1.3316235891736028e-09, + "learning_rate": 9.474199125303219e-06, + "loss": 0.0001, + "step": 11770 + }, + { + "epoch": 0.052624760441190264, + "grad_norm": 1.7648435601813617e-08, + "learning_rate": 9.473752395588098e-06, + "loss": 0.0005, + "step": 11780 + }, + { + "epoch": 0.05266943341270231, + "grad_norm": 4.0121092559353144e-10, + "learning_rate": 9.473305665872978e-06, + "loss": 0.0, + "step": 11790 + }, + { + "epoch": 0.05271410638421436, + "grad_norm": 1.308599273386335e-08, + "learning_rate": 9.472858936157858e-06, + "loss": 0.0, + "step": 11800 + }, + { + "epoch": 0.052758779355726405, + "grad_norm": 800.6392211914062, + "learning_rate": 9.472412206442736e-06, + "loss": 0.8, + "step": 11810 + }, + { + "epoch": 0.05280345232723845, + "grad_norm": 1.0625454187393188, + "learning_rate": 9.471965476727616e-06, + "loss": 0.0002, + "step": 11820 + }, + { + "epoch": 0.0528481252987505, + "grad_norm": 0.2922259271144867, + "learning_rate": 9.471518747012496e-06, + "loss": 0.0537, + "step": 11830 + }, + { + "epoch": 0.052892798270262546, + "grad_norm": 2.0052473814757832e-07, + "learning_rate": 9.471072017297377e-06, + "loss": 0.0, + "step": 11840 + }, + { + "epoch": 0.052937471241774586, + "grad_norm": 0.00018748472211882472, + "learning_rate": 9.470625287582255e-06, + "loss": 0.0, + "step": 11850 + }, + { + "epoch": 0.05298214421328663, + "grad_norm": 3.574021079089107e-09, + "learning_rate": 9.470178557867135e-06, + "loss": 0.0003, + "step": 11860 + }, + { + "epoch": 0.05302681718479868, + "grad_norm": 3.325407726961771e-10, + "learning_rate": 9.469731828152014e-06, + "loss": 0.0117, + "step": 11870 + }, + { + "epoch": 0.05307149015631073, + "grad_norm": 5.634159242617898e-05, + "learning_rate": 9.469285098436894e-06, + "loss": 0.0011, + "step": 11880 + }, + { + "epoch": 0.053116163127822774, + "grad_norm": 6.641070649493486e-05, + "learning_rate": 9.468838368721774e-06, + "loss": 0.0, + "step": 11890 + }, + { + "epoch": 0.05316083609933482, + "grad_norm": 3.0292048904811963e-05, + "learning_rate": 9.468391639006652e-06, + "loss": 0.0001, + "step": 11900 + }, + { + "epoch": 0.05320550907084687, + "grad_norm": 0.0033375348430126905, + "learning_rate": 9.467944909291532e-06, + "loss": 0.0, + "step": 11910 + }, + { + "epoch": 0.05325018204235891, + "grad_norm": 1.4492547961708624e-06, + "learning_rate": 9.467498179576413e-06, + "loss": 0.0217, + "step": 11920 + }, + { + "epoch": 0.053294855013870955, + "grad_norm": 4.281428822583422e-10, + "learning_rate": 9.467051449861291e-06, + "loss": 0.0034, + "step": 11930 + }, + { + "epoch": 0.053339527985383, + "grad_norm": 9.358368697576225e-05, + "learning_rate": 9.466604720146171e-06, + "loss": 0.0017, + "step": 11940 + }, + { + "epoch": 0.05338420095689505, + "grad_norm": 1.4051488271549228e-11, + "learning_rate": 9.466157990431051e-06, + "loss": 0.0009, + "step": 11950 + }, + { + "epoch": 0.053428873928407096, + "grad_norm": 8.350542479718115e-10, + "learning_rate": 9.46571126071593e-06, + "loss": 0.0, + "step": 11960 + }, + { + "epoch": 0.05347354689991914, + "grad_norm": 3.462303332923966e-09, + "learning_rate": 9.46526453100081e-06, + "loss": 0.0, + "step": 11970 + }, + { + "epoch": 0.05351821987143119, + "grad_norm": 0.0007217188831418753, + "learning_rate": 9.464817801285688e-06, + "loss": 0.0002, + "step": 11980 + }, + { + "epoch": 0.05356289284294324, + "grad_norm": 3.165804285387708e-10, + "learning_rate": 9.464371071570568e-06, + "loss": 0.0, + "step": 11990 + }, + { + "epoch": 0.05360756581445528, + "grad_norm": 4.4119716841478704e-20, + "learning_rate": 9.463924341855448e-06, + "loss": 0.0201, + "step": 12000 + }, + { + "epoch": 0.053652238785967324, + "grad_norm": 7.734079002230665e-12, + "learning_rate": 9.463477612140327e-06, + "loss": 0.1211, + "step": 12010 + }, + { + "epoch": 0.05369691175747937, + "grad_norm": 5.054994964281967e-11, + "learning_rate": 9.463030882425207e-06, + "loss": 0.0523, + "step": 12020 + }, + { + "epoch": 0.05374158472899142, + "grad_norm": 3.923462596477911e-11, + "learning_rate": 9.462584152710087e-06, + "loss": 0.0141, + "step": 12030 + }, + { + "epoch": 0.053786257700503465, + "grad_norm": 2.907965812593005e-10, + "learning_rate": 9.462137422994966e-06, + "loss": 0.0, + "step": 12040 + }, + { + "epoch": 0.05383093067201551, + "grad_norm": 0.014994989149272442, + "learning_rate": 9.461690693279846e-06, + "loss": 0.0001, + "step": 12050 + }, + { + "epoch": 0.05387560364352756, + "grad_norm": 8.538211915265492e-08, + "learning_rate": 9.461243963564726e-06, + "loss": 0.003, + "step": 12060 + }, + { + "epoch": 0.053920276615039606, + "grad_norm": 1.8153332192868347e-16, + "learning_rate": 9.460797233849604e-06, + "loss": 0.0, + "step": 12070 + }, + { + "epoch": 0.053964949586551646, + "grad_norm": 4.282461700793466e-13, + "learning_rate": 9.460350504134484e-06, + "loss": 0.0, + "step": 12080 + }, + { + "epoch": 0.05400962255806369, + "grad_norm": 1.88760368473595e-05, + "learning_rate": 9.459903774419363e-06, + "loss": 0.7031, + "step": 12090 + }, + { + "epoch": 0.05405429552957574, + "grad_norm": 1.0948496307423752e-14, + "learning_rate": 9.459457044704243e-06, + "loss": 0.0253, + "step": 12100 + }, + { + "epoch": 0.05409896850108779, + "grad_norm": 1.64843222592026e-05, + "learning_rate": 9.459010314989123e-06, + "loss": 0.153, + "step": 12110 + }, + { + "epoch": 0.054143641472599834, + "grad_norm": 6.418565630167058e-14, + "learning_rate": 9.458563585274001e-06, + "loss": 0.0, + "step": 12120 + }, + { + "epoch": 0.05418831444411188, + "grad_norm": 1.8156572093891595e-16, + "learning_rate": 9.458116855558882e-06, + "loss": 0.0084, + "step": 12130 + }, + { + "epoch": 0.05423298741562393, + "grad_norm": 2.4297784761984076e-07, + "learning_rate": 9.457670125843762e-06, + "loss": 0.01, + "step": 12140 + }, + { + "epoch": 0.054277660387135974, + "grad_norm": 2.5206797999999253e-06, + "learning_rate": 9.45722339612864e-06, + "loss": 0.0002, + "step": 12150 + }, + { + "epoch": 0.054322333358648014, + "grad_norm": 5.1032744002865726e-11, + "learning_rate": 9.45677666641352e-06, + "loss": 0.0001, + "step": 12160 + }, + { + "epoch": 0.05436700633016006, + "grad_norm": 1.2425232398527442e-08, + "learning_rate": 9.4563299366984e-06, + "loss": 0.0, + "step": 12170 + }, + { + "epoch": 0.05441167930167211, + "grad_norm": 9.334379491576783e-09, + "learning_rate": 9.455883206983279e-06, + "loss": 0.0038, + "step": 12180 + }, + { + "epoch": 0.054456352273184155, + "grad_norm": 2.4313667381647974e-06, + "learning_rate": 9.455436477268159e-06, + "loss": 0.0, + "step": 12190 + }, + { + "epoch": 0.0545010252446962, + "grad_norm": 2.281985176625155e-17, + "learning_rate": 9.454989747553039e-06, + "loss": 0.0, + "step": 12200 + }, + { + "epoch": 0.05454569821620825, + "grad_norm": 1.9201582581303235e-15, + "learning_rate": 9.454543017837918e-06, + "loss": 0.0, + "step": 12210 + }, + { + "epoch": 0.054590371187720296, + "grad_norm": 2.42331623517833e-11, + "learning_rate": 9.454096288122798e-06, + "loss": 0.0004, + "step": 12220 + }, + { + "epoch": 0.054635044159232336, + "grad_norm": 1.804826723628139e-07, + "learning_rate": 9.453649558407678e-06, + "loss": 0.0, + "step": 12230 + }, + { + "epoch": 0.05467971713074438, + "grad_norm": 0.00022683448332827538, + "learning_rate": 9.453202828692556e-06, + "loss": 0.0, + "step": 12240 + }, + { + "epoch": 0.05472439010225643, + "grad_norm": 7.438219995492545e-07, + "learning_rate": 9.452756098977436e-06, + "loss": 0.4503, + "step": 12250 + }, + { + "epoch": 0.05476906307376848, + "grad_norm": 2.3686347105922323e-08, + "learning_rate": 9.452309369262316e-06, + "loss": 0.8375, + "step": 12260 + }, + { + "epoch": 0.054813736045280524, + "grad_norm": 2.794357897073496e-06, + "learning_rate": 9.451862639547195e-06, + "loss": 1.3188, + "step": 12270 + }, + { + "epoch": 0.05485840901679257, + "grad_norm": 7.669413089752197, + "learning_rate": 9.451415909832075e-06, + "loss": 0.0016, + "step": 12280 + }, + { + "epoch": 0.05490308198830462, + "grad_norm": 0.001205424894578755, + "learning_rate": 9.450969180116955e-06, + "loss": 0.004, + "step": 12290 + }, + { + "epoch": 0.054947754959816665, + "grad_norm": 6.918762210261775e-06, + "learning_rate": 9.450522450401835e-06, + "loss": 0.0002, + "step": 12300 + }, + { + "epoch": 0.054992427931328705, + "grad_norm": 2.9161958958745515e-10, + "learning_rate": 9.450075720686714e-06, + "loss": 0.525, + "step": 12310 + }, + { + "epoch": 0.05503710090284075, + "grad_norm": 2.0024484911118634e-05, + "learning_rate": 9.449628990971594e-06, + "loss": 0.0007, + "step": 12320 + }, + { + "epoch": 0.0550817738743528, + "grad_norm": 4.540103873296175e-06, + "learning_rate": 9.449182261256474e-06, + "loss": 0.0, + "step": 12330 + }, + { + "epoch": 0.055126446845864846, + "grad_norm": 1.0630525348886977e-09, + "learning_rate": 9.448735531541352e-06, + "loss": 0.475, + "step": 12340 + }, + { + "epoch": 0.05517111981737689, + "grad_norm": 618.4067993164062, + "learning_rate": 9.448288801826233e-06, + "loss": 0.3183, + "step": 12350 + }, + { + "epoch": 0.05521579278888894, + "grad_norm": 7.532779733310235e-09, + "learning_rate": 9.447842072111113e-06, + "loss": 0.0213, + "step": 12360 + }, + { + "epoch": 0.05526046576040099, + "grad_norm": 3.242120460988929e-11, + "learning_rate": 9.447395342395991e-06, + "loss": 0.0428, + "step": 12370 + }, + { + "epoch": 0.055305138731913034, + "grad_norm": 2.8545087300824168e-12, + "learning_rate": 9.446948612680871e-06, + "loss": 0.0, + "step": 12380 + }, + { + "epoch": 0.055349811703425074, + "grad_norm": 2.1261237179714954e-06, + "learning_rate": 9.44650188296575e-06, + "loss": 0.0, + "step": 12390 + }, + { + "epoch": 0.05539448467493712, + "grad_norm": 1.7692869391794375e-10, + "learning_rate": 9.44605515325063e-06, + "loss": 0.001, + "step": 12400 + }, + { + "epoch": 0.05543915764644917, + "grad_norm": 0.0017321183113381267, + "learning_rate": 9.44560842353551e-06, + "loss": 0.001, + "step": 12410 + }, + { + "epoch": 0.055483830617961215, + "grad_norm": 1.5489741045371375e-08, + "learning_rate": 9.445161693820388e-06, + "loss": 0.0005, + "step": 12420 + }, + { + "epoch": 0.05552850358947326, + "grad_norm": 1.0361794267055302e-07, + "learning_rate": 9.444714964105268e-06, + "loss": 0.0691, + "step": 12430 + }, + { + "epoch": 0.05557317656098531, + "grad_norm": 2.675851619275693e-14, + "learning_rate": 9.444268234390149e-06, + "loss": 0.0, + "step": 12440 + }, + { + "epoch": 0.055617849532497356, + "grad_norm": 3.5206210776544955e-12, + "learning_rate": 9.443821504675027e-06, + "loss": 0.0226, + "step": 12450 + }, + { + "epoch": 0.0556625225040094, + "grad_norm": 0.00045068253530189395, + "learning_rate": 9.443374774959907e-06, + "loss": 0.0, + "step": 12460 + }, + { + "epoch": 0.05570719547552144, + "grad_norm": 0.010806038975715637, + "learning_rate": 9.442928045244786e-06, + "loss": 0.1406, + "step": 12470 + }, + { + "epoch": 0.05575186844703349, + "grad_norm": 0.07221142202615738, + "learning_rate": 9.442481315529666e-06, + "loss": 0.0049, + "step": 12480 + }, + { + "epoch": 0.05579654141854554, + "grad_norm": 4.355013311779787e-12, + "learning_rate": 9.442034585814546e-06, + "loss": 0.003, + "step": 12490 + }, + { + "epoch": 0.055841214390057584, + "grad_norm": 1.2977407095604576e-05, + "learning_rate": 9.441587856099424e-06, + "loss": 0.0, + "step": 12500 + }, + { + "epoch": 0.05588588736156963, + "grad_norm": 0.007427062373608351, + "learning_rate": 9.441141126384304e-06, + "loss": 0.0, + "step": 12510 + }, + { + "epoch": 0.05593056033308168, + "grad_norm": 2.1748248855146812e-06, + "learning_rate": 9.440694396669184e-06, + "loss": 0.0387, + "step": 12520 + }, + { + "epoch": 0.055975233304593724, + "grad_norm": 1.2029813993910676e-13, + "learning_rate": 9.440247666954063e-06, + "loss": 0.0, + "step": 12530 + }, + { + "epoch": 0.056019906276105765, + "grad_norm": 0.31753480434417725, + "learning_rate": 9.439800937238943e-06, + "loss": 0.0, + "step": 12540 + }, + { + "epoch": 0.05606457924761781, + "grad_norm": 1.325088396697538e-05, + "learning_rate": 9.439354207523823e-06, + "loss": 0.0692, + "step": 12550 + }, + { + "epoch": 0.05610925221912986, + "grad_norm": 2.330439174969001e-10, + "learning_rate": 9.438907477808702e-06, + "loss": 0.0003, + "step": 12560 + }, + { + "epoch": 0.056153925190641905, + "grad_norm": 2.4827478939913605e-13, + "learning_rate": 9.438460748093582e-06, + "loss": 0.1055, + "step": 12570 + }, + { + "epoch": 0.05619859816215395, + "grad_norm": 7.189043045043945, + "learning_rate": 9.43801401837846e-06, + "loss": 0.0013, + "step": 12580 + }, + { + "epoch": 0.056243271133666, + "grad_norm": 5.677891490307729e-10, + "learning_rate": 9.43756728866334e-06, + "loss": 0.0, + "step": 12590 + }, + { + "epoch": 0.056287944105178046, + "grad_norm": 0.00373545428737998, + "learning_rate": 9.43712055894822e-06, + "loss": 0.0, + "step": 12600 + }, + { + "epoch": 0.05633261707669009, + "grad_norm": 8.608561984146945e-06, + "learning_rate": 9.436673829233099e-06, + "loss": 0.0, + "step": 12610 + }, + { + "epoch": 0.05637729004820213, + "grad_norm": 3.579793322124221e-15, + "learning_rate": 9.436227099517979e-06, + "loss": 0.0, + "step": 12620 + }, + { + "epoch": 0.05642196301971418, + "grad_norm": 0.127075657248497, + "learning_rate": 9.435780369802859e-06, + "loss": 0.0001, + "step": 12630 + }, + { + "epoch": 0.05646663599122623, + "grad_norm": 2.5742030729247745e-08, + "learning_rate": 9.435333640087738e-06, + "loss": 0.0, + "step": 12640 + }, + { + "epoch": 0.056511308962738274, + "grad_norm": 5.2089426390011795e-06, + "learning_rate": 9.434886910372618e-06, + "loss": 0.0, + "step": 12650 + }, + { + "epoch": 0.05655598193425032, + "grad_norm": 6.528809421979531e-07, + "learning_rate": 9.434440180657498e-06, + "loss": 0.0, + "step": 12660 + }, + { + "epoch": 0.05660065490576237, + "grad_norm": 36.38373947143555, + "learning_rate": 9.433993450942376e-06, + "loss": 0.013, + "step": 12670 + }, + { + "epoch": 0.056645327877274415, + "grad_norm": 1.2341820612910871e-15, + "learning_rate": 9.433546721227256e-06, + "loss": 0.0023, + "step": 12680 + }, + { + "epoch": 0.05669000084878646, + "grad_norm": 0.0046431561931967735, + "learning_rate": 9.433099991512136e-06, + "loss": 0.625, + "step": 12690 + }, + { + "epoch": 0.0567346738202985, + "grad_norm": 0.017402734607458115, + "learning_rate": 9.432653261797015e-06, + "loss": 0.0001, + "step": 12700 + }, + { + "epoch": 0.05677934679181055, + "grad_norm": 0.17794175446033478, + "learning_rate": 9.432206532081895e-06, + "loss": 0.0758, + "step": 12710 + }, + { + "epoch": 0.056824019763322596, + "grad_norm": 2.2747856576188497e-07, + "learning_rate": 9.431759802366775e-06, + "loss": 0.0035, + "step": 12720 + }, + { + "epoch": 0.05686869273483464, + "grad_norm": 0.004323522560298443, + "learning_rate": 9.431313072651654e-06, + "loss": 0.0, + "step": 12730 + }, + { + "epoch": 0.05691336570634669, + "grad_norm": 2.525700892874738e-07, + "learning_rate": 9.430866342936534e-06, + "loss": 0.0001, + "step": 12740 + }, + { + "epoch": 0.05695803867785874, + "grad_norm": 4.306988898861164e-08, + "learning_rate": 9.430419613221414e-06, + "loss": 0.0013, + "step": 12750 + }, + { + "epoch": 0.057002711649370784, + "grad_norm": 2.449407121574154e-09, + "learning_rate": 9.429972883506294e-06, + "loss": 0.0001, + "step": 12760 + }, + { + "epoch": 0.057047384620882824, + "grad_norm": 0.0009273152681998909, + "learning_rate": 9.429526153791172e-06, + "loss": 0.0079, + "step": 12770 + }, + { + "epoch": 0.05709205759239487, + "grad_norm": 1.339035687886181e-10, + "learning_rate": 9.429079424076053e-06, + "loss": 0.0038, + "step": 12780 + }, + { + "epoch": 0.05713673056390692, + "grad_norm": 3.203273150204078e-12, + "learning_rate": 9.428632694360933e-06, + "loss": 0.0002, + "step": 12790 + }, + { + "epoch": 0.057181403535418965, + "grad_norm": 5.628530974632895e-09, + "learning_rate": 9.428185964645811e-06, + "loss": 0.0348, + "step": 12800 + }, + { + "epoch": 0.05722607650693101, + "grad_norm": 2.965325620607473e-07, + "learning_rate": 9.427739234930691e-06, + "loss": 0.007, + "step": 12810 + }, + { + "epoch": 0.05727074947844306, + "grad_norm": 6.876974545377834e-09, + "learning_rate": 9.427292505215571e-06, + "loss": 0.0127, + "step": 12820 + }, + { + "epoch": 0.057315422449955106, + "grad_norm": 4.7581835360688274e-07, + "learning_rate": 9.42684577550045e-06, + "loss": 0.0055, + "step": 12830 + }, + { + "epoch": 0.05736009542146715, + "grad_norm": 4.716447074315511e-05, + "learning_rate": 9.42639904578533e-06, + "loss": 0.0007, + "step": 12840 + }, + { + "epoch": 0.05740476839297919, + "grad_norm": 9.436319281519445e-09, + "learning_rate": 9.42595231607021e-06, + "loss": 0.0022, + "step": 12850 + }, + { + "epoch": 0.05744944136449124, + "grad_norm": 6.438230570893211e-07, + "learning_rate": 9.425505586355088e-06, + "loss": 0.0759, + "step": 12860 + }, + { + "epoch": 0.05749411433600329, + "grad_norm": 6.83662833012022e-08, + "learning_rate": 9.425058856639969e-06, + "loss": 0.0049, + "step": 12870 + }, + { + "epoch": 0.057538787307515334, + "grad_norm": 3.728951469383901e-06, + "learning_rate": 9.424612126924847e-06, + "loss": 0.0, + "step": 12880 + }, + { + "epoch": 0.05758346027902738, + "grad_norm": 5.688684592830562e-11, + "learning_rate": 9.424165397209727e-06, + "loss": 0.0, + "step": 12890 + }, + { + "epoch": 0.05762813325053943, + "grad_norm": 5.956259790940166e-14, + "learning_rate": 9.423718667494607e-06, + "loss": 0.0, + "step": 12900 + }, + { + "epoch": 0.057672806222051475, + "grad_norm": 1.761552154144752e-10, + "learning_rate": 9.423271937779486e-06, + "loss": 0.0, + "step": 12910 + }, + { + "epoch": 0.05771747919356352, + "grad_norm": 0.0009397334652021527, + "learning_rate": 9.422825208064366e-06, + "loss": 0.0014, + "step": 12920 + }, + { + "epoch": 0.05776215216507556, + "grad_norm": 2.9478458785092698e-08, + "learning_rate": 9.422378478349246e-06, + "loss": 0.0001, + "step": 12930 + }, + { + "epoch": 0.05780682513658761, + "grad_norm": 0.0010126801207661629, + "learning_rate": 9.421931748634124e-06, + "loss": 0.0, + "step": 12940 + }, + { + "epoch": 0.057851498108099655, + "grad_norm": 0.05465909093618393, + "learning_rate": 9.421485018919004e-06, + "loss": 0.0079, + "step": 12950 + }, + { + "epoch": 0.0578961710796117, + "grad_norm": 2.74059730465126e-09, + "learning_rate": 9.421038289203883e-06, + "loss": 0.0, + "step": 12960 + }, + { + "epoch": 0.05794084405112375, + "grad_norm": 0.0038352487608790398, + "learning_rate": 9.420591559488763e-06, + "loss": 0.0, + "step": 12970 + }, + { + "epoch": 0.057985517022635796, + "grad_norm": 1.9260965096619918e-14, + "learning_rate": 9.420144829773643e-06, + "loss": 0.0013, + "step": 12980 + }, + { + "epoch": 0.05803018999414784, + "grad_norm": 6.832203205964227e-12, + "learning_rate": 9.419698100058522e-06, + "loss": 0.0, + "step": 12990 + }, + { + "epoch": 0.05807486296565989, + "grad_norm": 2.557584699580673e-10, + "learning_rate": 9.419251370343402e-06, + "loss": 0.0, + "step": 13000 + }, + { + "epoch": 0.05811953593717193, + "grad_norm": 5.910940070741333e-10, + "learning_rate": 9.418804640628282e-06, + "loss": 0.0001, + "step": 13010 + }, + { + "epoch": 0.05816420890868398, + "grad_norm": 0.048840224742889404, + "learning_rate": 9.41835791091316e-06, + "loss": 0.0006, + "step": 13020 + }, + { + "epoch": 0.058208881880196024, + "grad_norm": 3.80511566699937e-10, + "learning_rate": 9.41791118119804e-06, + "loss": 0.0, + "step": 13030 + }, + { + "epoch": 0.05825355485170807, + "grad_norm": 1.9007886375987937e-09, + "learning_rate": 9.41746445148292e-06, + "loss": 0.0, + "step": 13040 + }, + { + "epoch": 0.05829822782322012, + "grad_norm": 2.1054762378014402e-09, + "learning_rate": 9.417017721767799e-06, + "loss": 0.0, + "step": 13050 + }, + { + "epoch": 0.058342900794732165, + "grad_norm": 6.012247411035787e-08, + "learning_rate": 9.416570992052679e-06, + "loss": 0.0362, + "step": 13060 + }, + { + "epoch": 0.05838757376624421, + "grad_norm": 0.007417942397296429, + "learning_rate": 9.416124262337558e-06, + "loss": 0.0001, + "step": 13070 + }, + { + "epoch": 0.05843224673775625, + "grad_norm": 9.17901132879706e-09, + "learning_rate": 9.415677532622438e-06, + "loss": 0.2469, + "step": 13080 + }, + { + "epoch": 0.0584769197092683, + "grad_norm": 1.002669267058387e-10, + "learning_rate": 9.415230802907318e-06, + "loss": 0.0034, + "step": 13090 + }, + { + "epoch": 0.058521592680780346, + "grad_norm": 2.3442614516255844e-10, + "learning_rate": 9.414784073192196e-06, + "loss": 0.0, + "step": 13100 + }, + { + "epoch": 0.05856626565229239, + "grad_norm": 7.194295903900638e-05, + "learning_rate": 9.414337343477076e-06, + "loss": 0.0, + "step": 13110 + }, + { + "epoch": 0.05861093862380444, + "grad_norm": 5.2120436438984896e-11, + "learning_rate": 9.413890613761956e-06, + "loss": 0.0038, + "step": 13120 + }, + { + "epoch": 0.05865561159531649, + "grad_norm": 4.7373262646033254e-08, + "learning_rate": 9.413443884046835e-06, + "loss": 0.0043, + "step": 13130 + }, + { + "epoch": 0.058700284566828534, + "grad_norm": 6.5466607829245405e-12, + "learning_rate": 9.412997154331715e-06, + "loss": 0.0047, + "step": 13140 + }, + { + "epoch": 0.05874495753834058, + "grad_norm": 0.06896981596946716, + "learning_rate": 9.412550424616595e-06, + "loss": 0.0, + "step": 13150 + }, + { + "epoch": 0.05878963050985262, + "grad_norm": 9.386536686806224e-11, + "learning_rate": 9.412103694901474e-06, + "loss": 0.0, + "step": 13160 + }, + { + "epoch": 0.05883430348136467, + "grad_norm": 7.611852481514032e-18, + "learning_rate": 9.411656965186354e-06, + "loss": 0.0146, + "step": 13170 + }, + { + "epoch": 0.058878976452876715, + "grad_norm": 1.1134966559911419e-10, + "learning_rate": 9.411210235471234e-06, + "loss": 0.0026, + "step": 13180 + }, + { + "epoch": 0.05892364942438876, + "grad_norm": 0.00012055486877216026, + "learning_rate": 9.410763505756112e-06, + "loss": 0.0, + "step": 13190 + }, + { + "epoch": 0.05896832239590081, + "grad_norm": 6.284484651425393e-13, + "learning_rate": 9.410316776040992e-06, + "loss": 0.0, + "step": 13200 + }, + { + "epoch": 0.059012995367412856, + "grad_norm": 1.1680693079441085e-09, + "learning_rate": 9.409870046325873e-06, + "loss": 0.0, + "step": 13210 + }, + { + "epoch": 0.0590576683389249, + "grad_norm": 3.1868826226855163e-06, + "learning_rate": 9.409423316610753e-06, + "loss": 0.0, + "step": 13220 + }, + { + "epoch": 0.05910234131043695, + "grad_norm": 6.71885274366069e-16, + "learning_rate": 9.408976586895631e-06, + "loss": 0.0001, + "step": 13230 + }, + { + "epoch": 0.05914701428194899, + "grad_norm": 24.487958908081055, + "learning_rate": 9.408529857180511e-06, + "loss": 0.6562, + "step": 13240 + }, + { + "epoch": 0.05919168725346104, + "grad_norm": 56.855098724365234, + "learning_rate": 9.408083127465391e-06, + "loss": 0.0113, + "step": 13250 + }, + { + "epoch": 0.059236360224973084, + "grad_norm": 0.00012145333312219009, + "learning_rate": 9.40763639775027e-06, + "loss": 0.0022, + "step": 13260 + }, + { + "epoch": 0.05928103319648513, + "grad_norm": 1.8307785842885949e-10, + "learning_rate": 9.40718966803515e-06, + "loss": 0.0, + "step": 13270 + }, + { + "epoch": 0.05932570616799718, + "grad_norm": 6.931577572122025e-14, + "learning_rate": 9.40674293832003e-06, + "loss": 0.2016, + "step": 13280 + }, + { + "epoch": 0.059370379139509225, + "grad_norm": 6.213984188319088e-11, + "learning_rate": 9.406296208604908e-06, + "loss": 0.0064, + "step": 13290 + }, + { + "epoch": 0.05941505211102127, + "grad_norm": 9.8732723385897e-12, + "learning_rate": 9.405849478889789e-06, + "loss": 0.0, + "step": 13300 + }, + { + "epoch": 0.05945972508253331, + "grad_norm": 3.321137635736715e-12, + "learning_rate": 9.405402749174669e-06, + "loss": 0.0, + "step": 13310 + }, + { + "epoch": 0.05950439805404536, + "grad_norm": 1.8688808278710667e-09, + "learning_rate": 9.404956019459547e-06, + "loss": 0.0055, + "step": 13320 + }, + { + "epoch": 0.059549071025557405, + "grad_norm": 2.0776104065589607e-05, + "learning_rate": 9.404509289744427e-06, + "loss": 0.2016, + "step": 13330 + }, + { + "epoch": 0.05959374399706945, + "grad_norm": 6.520615799663254e-17, + "learning_rate": 9.404062560029307e-06, + "loss": 0.0014, + "step": 13340 + }, + { + "epoch": 0.0596384169685815, + "grad_norm": 428.4654235839844, + "learning_rate": 9.403615830314186e-06, + "loss": 0.1042, + "step": 13350 + }, + { + "epoch": 0.059683089940093546, + "grad_norm": 2.1810656107845716e-05, + "learning_rate": 9.403169100599066e-06, + "loss": 0.0038, + "step": 13360 + }, + { + "epoch": 0.05972776291160559, + "grad_norm": 8.029319431557269e-09, + "learning_rate": 9.402722370883944e-06, + "loss": 0.0, + "step": 13370 + }, + { + "epoch": 0.05977243588311764, + "grad_norm": 6.180247851261811e-07, + "learning_rate": 9.402275641168824e-06, + "loss": 0.0, + "step": 13380 + }, + { + "epoch": 0.05981710885462968, + "grad_norm": 0.000312251562718302, + "learning_rate": 9.401828911453705e-06, + "loss": 0.0, + "step": 13390 + }, + { + "epoch": 0.05986178182614173, + "grad_norm": 1.1670133062580135e-05, + "learning_rate": 9.401382181738583e-06, + "loss": 0.0013, + "step": 13400 + }, + { + "epoch": 0.059906454797653774, + "grad_norm": 1.9894434710732156e-10, + "learning_rate": 9.400935452023463e-06, + "loss": 0.0006, + "step": 13410 + }, + { + "epoch": 0.05995112776916582, + "grad_norm": 7.895046110206749e-07, + "learning_rate": 9.400488722308343e-06, + "loss": 0.6375, + "step": 13420 + }, + { + "epoch": 0.05999580074067787, + "grad_norm": 7.807607858845156e-12, + "learning_rate": 9.400041992593222e-06, + "loss": 0.7125, + "step": 13430 + }, + { + "epoch": 0.060040473712189915, + "grad_norm": 6.620969088544371e-07, + "learning_rate": 9.399595262878102e-06, + "loss": 0.0, + "step": 13440 + }, + { + "epoch": 0.06008514668370196, + "grad_norm": 6.074368741337821e-08, + "learning_rate": 9.399148533162982e-06, + "loss": 0.0, + "step": 13450 + }, + { + "epoch": 0.06012981965521401, + "grad_norm": 1.0262946742445322e-10, + "learning_rate": 9.39870180344786e-06, + "loss": 0.65, + "step": 13460 + }, + { + "epoch": 0.06017449262672605, + "grad_norm": 0.0005402317037805915, + "learning_rate": 9.39825507373274e-06, + "loss": 0.0091, + "step": 13470 + }, + { + "epoch": 0.060219165598238096, + "grad_norm": 1.2266228033297466e-08, + "learning_rate": 9.397808344017619e-06, + "loss": 0.0, + "step": 13480 + }, + { + "epoch": 0.06026383856975014, + "grad_norm": 5.947930206579721e-17, + "learning_rate": 9.397361614302499e-06, + "loss": 0.0004, + "step": 13490 + }, + { + "epoch": 0.06030851154126219, + "grad_norm": 1.7619703474025528e-09, + "learning_rate": 9.39691488458738e-06, + "loss": 0.0, + "step": 13500 + }, + { + "epoch": 0.06035318451277424, + "grad_norm": 4.006636515896389e-07, + "learning_rate": 9.396468154872258e-06, + "loss": 0.0002, + "step": 13510 + }, + { + "epoch": 0.060397857484286284, + "grad_norm": 1.7506009391664995e-13, + "learning_rate": 9.396021425157138e-06, + "loss": 0.0, + "step": 13520 + }, + { + "epoch": 0.06044253045579833, + "grad_norm": 0.8221254944801331, + "learning_rate": 9.395574695442018e-06, + "loss": 0.5138, + "step": 13530 + }, + { + "epoch": 0.06048720342731038, + "grad_norm": 5.2254020554975966e-11, + "learning_rate": 9.395127965726896e-06, + "loss": 0.0002, + "step": 13540 + }, + { + "epoch": 0.06053187639882242, + "grad_norm": 0.01927713118493557, + "learning_rate": 9.394681236011776e-06, + "loss": 0.0063, + "step": 13550 + }, + { + "epoch": 0.060576549370334465, + "grad_norm": 8.598310508034501e-09, + "learning_rate": 9.394234506296655e-06, + "loss": 0.0001, + "step": 13560 + }, + { + "epoch": 0.06062122234184651, + "grad_norm": 4.621643256541574e-06, + "learning_rate": 9.393787776581535e-06, + "loss": 0.6852, + "step": 13570 + }, + { + "epoch": 0.06066589531335856, + "grad_norm": 0.004439809825271368, + "learning_rate": 9.393341046866415e-06, + "loss": 0.6785, + "step": 13580 + }, + { + "epoch": 0.060710568284870606, + "grad_norm": 0.00013783443137072027, + "learning_rate": 9.392894317151294e-06, + "loss": 0.0001, + "step": 13590 + }, + { + "epoch": 0.06075524125638265, + "grad_norm": 2.3134138584136963, + "learning_rate": 9.392447587436174e-06, + "loss": 0.0132, + "step": 13600 + }, + { + "epoch": 0.0607999142278947, + "grad_norm": 6.790860061300918e-05, + "learning_rate": 9.392000857721054e-06, + "loss": 0.0186, + "step": 13610 + }, + { + "epoch": 0.06084458719940674, + "grad_norm": 0.0005562048172578216, + "learning_rate": 9.391554128005932e-06, + "loss": 0.4033, + "step": 13620 + }, + { + "epoch": 0.06088926017091879, + "grad_norm": 9.475531442149077e-06, + "learning_rate": 9.391107398290812e-06, + "loss": 0.0, + "step": 13630 + }, + { + "epoch": 0.060933933142430834, + "grad_norm": 4.904528395854868e-06, + "learning_rate": 9.390660668575692e-06, + "loss": 0.6563, + "step": 13640 + }, + { + "epoch": 0.06097860611394288, + "grad_norm": 0.0010299455607309937, + "learning_rate": 9.390213938860571e-06, + "loss": 0.0, + "step": 13650 + }, + { + "epoch": 0.06102327908545493, + "grad_norm": 1.1471456673461944e-05, + "learning_rate": 9.389767209145451e-06, + "loss": 0.0002, + "step": 13660 + }, + { + "epoch": 0.061067952056966975, + "grad_norm": 2.3598842064515013e-13, + "learning_rate": 9.389320479430331e-06, + "loss": 0.0, + "step": 13670 + }, + { + "epoch": 0.06111262502847902, + "grad_norm": 2.994657961608027e-07, + "learning_rate": 9.388873749715211e-06, + "loss": 0.575, + "step": 13680 + }, + { + "epoch": 0.06115729799999107, + "grad_norm": 2.1437656414491357e-06, + "learning_rate": 9.38842702000009e-06, + "loss": 0.0021, + "step": 13690 + }, + { + "epoch": 0.06120197097150311, + "grad_norm": 1.3860614672012161e-05, + "learning_rate": 9.38798029028497e-06, + "loss": 0.0497, + "step": 13700 + }, + { + "epoch": 0.061246643943015155, + "grad_norm": 3.358853126189665e-11, + "learning_rate": 9.38753356056985e-06, + "loss": 0.01, + "step": 13710 + }, + { + "epoch": 0.0612913169145272, + "grad_norm": 1.0310730935714219e-08, + "learning_rate": 9.387086830854728e-06, + "loss": 0.0, + "step": 13720 + }, + { + "epoch": 0.06133598988603925, + "grad_norm": 2.0384447602106448e-11, + "learning_rate": 9.386640101139609e-06, + "loss": 0.0017, + "step": 13730 + }, + { + "epoch": 0.061380662857551296, + "grad_norm": 0.6072666049003601, + "learning_rate": 9.386193371424489e-06, + "loss": 0.0008, + "step": 13740 + }, + { + "epoch": 0.06142533582906334, + "grad_norm": 6.129848043201491e-05, + "learning_rate": 9.385746641709367e-06, + "loss": 0.0001, + "step": 13750 + }, + { + "epoch": 0.06147000880057539, + "grad_norm": 3.1852081720179376e-17, + "learning_rate": 9.385299911994247e-06, + "loss": 0.0001, + "step": 13760 + }, + { + "epoch": 0.06151468177208744, + "grad_norm": 0.0002205081982538104, + "learning_rate": 9.384853182279127e-06, + "loss": 0.0, + "step": 13770 + }, + { + "epoch": 0.06155935474359948, + "grad_norm": 2.394686705875504e-11, + "learning_rate": 9.384406452564006e-06, + "loss": 0.0002, + "step": 13780 + }, + { + "epoch": 0.061604027715111524, + "grad_norm": 0.11703392118215561, + "learning_rate": 9.383959722848886e-06, + "loss": 0.0, + "step": 13790 + }, + { + "epoch": 0.06164870068662357, + "grad_norm": 4.587246138498813e-10, + "learning_rate": 9.383512993133766e-06, + "loss": 0.2469, + "step": 13800 + }, + { + "epoch": 0.06169337365813562, + "grad_norm": 9.678911737864837e-06, + "learning_rate": 9.383066263418644e-06, + "loss": 0.0749, + "step": 13810 + }, + { + "epoch": 0.061738046629647665, + "grad_norm": 0.015324323438107967, + "learning_rate": 9.382619533703525e-06, + "loss": 0.0014, + "step": 13820 + }, + { + "epoch": 0.06178271960115971, + "grad_norm": 4.483310857977818e-12, + "learning_rate": 9.382172803988405e-06, + "loss": 0.0001, + "step": 13830 + }, + { + "epoch": 0.06182739257267176, + "grad_norm": 7.979075733466345e-10, + "learning_rate": 9.381726074273283e-06, + "loss": 0.525, + "step": 13840 + }, + { + "epoch": 0.0618720655441838, + "grad_norm": 2.2147013112316927e-09, + "learning_rate": 9.381279344558163e-06, + "loss": 0.0003, + "step": 13850 + }, + { + "epoch": 0.061916738515695846, + "grad_norm": 1.945849462459056e-19, + "learning_rate": 9.380832614843042e-06, + "loss": 0.0, + "step": 13860 + }, + { + "epoch": 0.06196141148720789, + "grad_norm": 1.3114672583469655e-05, + "learning_rate": 9.380385885127922e-06, + "loss": 0.0003, + "step": 13870 + }, + { + "epoch": 0.06200608445871994, + "grad_norm": 8.562447278981367e-11, + "learning_rate": 9.379939155412802e-06, + "loss": 0.0, + "step": 13880 + }, + { + "epoch": 0.06205075743023199, + "grad_norm": 1.6484534848260068e-09, + "learning_rate": 9.37949242569768e-06, + "loss": 0.0001, + "step": 13890 + }, + { + "epoch": 0.062095430401744034, + "grad_norm": 313.6671447753906, + "learning_rate": 9.37904569598256e-06, + "loss": 0.2005, + "step": 13900 + }, + { + "epoch": 0.06214010337325608, + "grad_norm": 2.4716316368795024e-14, + "learning_rate": 9.37859896626744e-06, + "loss": 0.0002, + "step": 13910 + }, + { + "epoch": 0.06218477634476813, + "grad_norm": 2.2001998445375648e-08, + "learning_rate": 9.378152236552319e-06, + "loss": 0.0001, + "step": 13920 + }, + { + "epoch": 0.06222944931628017, + "grad_norm": 0.10420388728380203, + "learning_rate": 9.3777055068372e-06, + "loss": 0.0227, + "step": 13930 + }, + { + "epoch": 0.062274122287792215, + "grad_norm": 1.254663348197937, + "learning_rate": 9.37725877712208e-06, + "loss": 0.0001, + "step": 13940 + }, + { + "epoch": 0.06231879525930426, + "grad_norm": 3.909913825404744e-15, + "learning_rate": 9.376812047406958e-06, + "loss": 0.0002, + "step": 13950 + }, + { + "epoch": 0.06236346823081631, + "grad_norm": 4.169524697294341e-13, + "learning_rate": 9.376365317691838e-06, + "loss": 0.016, + "step": 13960 + }, + { + "epoch": 0.062408141202328356, + "grad_norm": 6.563727339425895e-10, + "learning_rate": 9.375918587976716e-06, + "loss": 0.0, + "step": 13970 + }, + { + "epoch": 0.0624528141738404, + "grad_norm": 20.916521072387695, + "learning_rate": 9.375471858261596e-06, + "loss": 0.0026, + "step": 13980 + }, + { + "epoch": 0.06249748714535245, + "grad_norm": 1.0113201248662662e-10, + "learning_rate": 9.375025128546477e-06, + "loss": 0.0, + "step": 13990 + }, + { + "epoch": 0.06254216011686449, + "grad_norm": 1.5791814606197843e-14, + "learning_rate": 9.374578398831355e-06, + "loss": 0.0049, + "step": 14000 + }, + { + "epoch": 0.06258683308837654, + "grad_norm": 0.00038137752562761307, + "learning_rate": 9.374131669116235e-06, + "loss": 0.3566, + "step": 14010 + }, + { + "epoch": 0.06263150605988858, + "grad_norm": 3.751644683802624e-08, + "learning_rate": 9.373684939401115e-06, + "loss": 0.0, + "step": 14020 + }, + { + "epoch": 0.06267617903140063, + "grad_norm": 2.194812297821045, + "learning_rate": 9.373238209685994e-06, + "loss": 0.224, + "step": 14030 + }, + { + "epoch": 0.06272085200291268, + "grad_norm": 5.022498683615595e-08, + "learning_rate": 9.372791479970874e-06, + "loss": 0.0, + "step": 14040 + }, + { + "epoch": 0.06276552497442472, + "grad_norm": 7.967174034772718e-17, + "learning_rate": 9.372344750255754e-06, + "loss": 0.0057, + "step": 14050 + }, + { + "epoch": 0.06281019794593677, + "grad_norm": 5.298577785491943, + "learning_rate": 9.371898020540632e-06, + "loss": 0.0024, + "step": 14060 + }, + { + "epoch": 0.06285487091744882, + "grad_norm": 1212.22509765625, + "learning_rate": 9.371451290825512e-06, + "loss": 1.5299, + "step": 14070 + }, + { + "epoch": 0.06289954388896087, + "grad_norm": 9.872307832337057e-12, + "learning_rate": 9.371004561110391e-06, + "loss": 0.0, + "step": 14080 + }, + { + "epoch": 0.06294421686047291, + "grad_norm": 4.333783332199914e-12, + "learning_rate": 9.370557831395271e-06, + "loss": 0.0043, + "step": 14090 + }, + { + "epoch": 0.06298888983198496, + "grad_norm": 1.8763324282982754e-19, + "learning_rate": 9.370111101680151e-06, + "loss": 0.0002, + "step": 14100 + }, + { + "epoch": 0.063033562803497, + "grad_norm": 2.3205307897455896e-08, + "learning_rate": 9.36966437196503e-06, + "loss": 0.0001, + "step": 14110 + }, + { + "epoch": 0.06307823577500904, + "grad_norm": 1.4380998170377416e-17, + "learning_rate": 9.36921764224991e-06, + "loss": 0.0691, + "step": 14120 + }, + { + "epoch": 0.06312290874652109, + "grad_norm": 5.406634595730249e-16, + "learning_rate": 9.36877091253479e-06, + "loss": 0.0, + "step": 14130 + }, + { + "epoch": 0.06316758171803313, + "grad_norm": 1.297780335893667e-19, + "learning_rate": 9.36832418281967e-06, + "loss": 1.8627, + "step": 14140 + }, + { + "epoch": 0.06321225468954518, + "grad_norm": 0.047105517238378525, + "learning_rate": 9.367877453104548e-06, + "loss": 0.2813, + "step": 14150 + }, + { + "epoch": 0.06325692766105723, + "grad_norm": 1.0787633073050529e-05, + "learning_rate": 9.367430723389429e-06, + "loss": 0.0049, + "step": 14160 + }, + { + "epoch": 0.06330160063256927, + "grad_norm": 7.771322647898149e-12, + "learning_rate": 9.366983993674309e-06, + "loss": 0.0, + "step": 14170 + }, + { + "epoch": 0.06334627360408132, + "grad_norm": 0.0006671182345598936, + "learning_rate": 9.366537263959187e-06, + "loss": 0.0, + "step": 14180 + }, + { + "epoch": 0.06339094657559337, + "grad_norm": 5.741674375200849e-17, + "learning_rate": 9.366090534244067e-06, + "loss": 0.0001, + "step": 14190 + }, + { + "epoch": 0.06343561954710542, + "grad_norm": 3.3889224759775516e-10, + "learning_rate": 9.365643804528947e-06, + "loss": 0.0475, + "step": 14200 + }, + { + "epoch": 0.06348029251861746, + "grad_norm": 0.0008080815896391869, + "learning_rate": 9.365197074813826e-06, + "loss": 0.1359, + "step": 14210 + }, + { + "epoch": 0.06352496549012951, + "grad_norm": 0.0009778772946447134, + "learning_rate": 9.364750345098706e-06, + "loss": 0.0049, + "step": 14220 + }, + { + "epoch": 0.06356963846164156, + "grad_norm": 1.316239450765977e-09, + "learning_rate": 9.364303615383586e-06, + "loss": 0.0, + "step": 14230 + }, + { + "epoch": 0.0636143114331536, + "grad_norm": 5.679064088326413e-06, + "learning_rate": 9.363856885668464e-06, + "loss": 0.0, + "step": 14240 + }, + { + "epoch": 0.06365898440466565, + "grad_norm": 0.00017402069352101535, + "learning_rate": 9.363410155953345e-06, + "loss": 0.0128, + "step": 14250 + }, + { + "epoch": 0.0637036573761777, + "grad_norm": 0.00018130049284081906, + "learning_rate": 9.362963426238225e-06, + "loss": 0.0, + "step": 14260 + }, + { + "epoch": 0.06374833034768974, + "grad_norm": 1.457603655830031e-11, + "learning_rate": 9.362516696523103e-06, + "loss": 0.0, + "step": 14270 + }, + { + "epoch": 0.06379300331920178, + "grad_norm": 4.875524609815329e-06, + "learning_rate": 9.362069966807983e-06, + "loss": 0.0, + "step": 14280 + }, + { + "epoch": 0.06383767629071382, + "grad_norm": 2.04024197536512e-09, + "learning_rate": 9.361623237092863e-06, + "loss": 0.0, + "step": 14290 + }, + { + "epoch": 0.06388234926222587, + "grad_norm": 0.00029834118322469294, + "learning_rate": 9.361176507377742e-06, + "loss": 0.0, + "step": 14300 + }, + { + "epoch": 0.06392702223373792, + "grad_norm": 0.028371868655085564, + "learning_rate": 9.360729777662622e-06, + "loss": 0.0, + "step": 14310 + }, + { + "epoch": 0.06397169520524996, + "grad_norm": 1.0530008163824277e-18, + "learning_rate": 9.360283047947502e-06, + "loss": 0.0011, + "step": 14320 + }, + { + "epoch": 0.06401636817676201, + "grad_norm": 1.1820983303323374e-07, + "learning_rate": 9.35983631823238e-06, + "loss": 0.0, + "step": 14330 + }, + { + "epoch": 0.06406104114827406, + "grad_norm": 443.0798034667969, + "learning_rate": 9.35938958851726e-06, + "loss": 0.2126, + "step": 14340 + }, + { + "epoch": 0.0641057141197861, + "grad_norm": 2.8089353183659207e-15, + "learning_rate": 9.358942858802139e-06, + "loss": 0.0, + "step": 14350 + }, + { + "epoch": 0.06415038709129815, + "grad_norm": 4.0729130290628746e-08, + "learning_rate": 9.35849612908702e-06, + "loss": 1.5, + "step": 14360 + }, + { + "epoch": 0.0641950600628102, + "grad_norm": 1.8806908687952114e-18, + "learning_rate": 9.3580493993719e-06, + "loss": 0.0001, + "step": 14370 + }, + { + "epoch": 0.06423973303432225, + "grad_norm": 4.6179598339746164e-11, + "learning_rate": 9.357602669656778e-06, + "loss": 0.0, + "step": 14380 + }, + { + "epoch": 0.0642844060058343, + "grad_norm": 2.830821017596463e-07, + "learning_rate": 9.357155939941658e-06, + "loss": 0.0006, + "step": 14390 + }, + { + "epoch": 0.06432907897734634, + "grad_norm": 1.0931796801116889e-08, + "learning_rate": 9.356709210226538e-06, + "loss": 0.0001, + "step": 14400 + }, + { + "epoch": 0.06437375194885839, + "grad_norm": 1.5363036709459266e-07, + "learning_rate": 9.356262480511416e-06, + "loss": 0.0, + "step": 14410 + }, + { + "epoch": 0.06441842492037043, + "grad_norm": 4.7219604226711454e-08, + "learning_rate": 9.355815750796297e-06, + "loss": 0.0, + "step": 14420 + }, + { + "epoch": 0.06446309789188247, + "grad_norm": 1.4899464995110634e-09, + "learning_rate": 9.355369021081177e-06, + "loss": 0.0089, + "step": 14430 + }, + { + "epoch": 0.06450777086339451, + "grad_norm": 0.0672210082411766, + "learning_rate": 9.354922291366055e-06, + "loss": 0.3282, + "step": 14440 + }, + { + "epoch": 0.06455244383490656, + "grad_norm": 2.0680950001406018e-06, + "learning_rate": 9.354475561650935e-06, + "loss": 0.0, + "step": 14450 + }, + { + "epoch": 0.06459711680641861, + "grad_norm": 591.7171020507812, + "learning_rate": 9.354028831935814e-06, + "loss": 0.2345, + "step": 14460 + }, + { + "epoch": 0.06464178977793066, + "grad_norm": 2.4443409518681847e-09, + "learning_rate": 9.353582102220694e-06, + "loss": 0.0068, + "step": 14470 + }, + { + "epoch": 0.0646864627494427, + "grad_norm": 0.0029351816046983004, + "learning_rate": 9.353135372505574e-06, + "loss": 0.0005, + "step": 14480 + }, + { + "epoch": 0.06473113572095475, + "grad_norm": 2.7630872523332073e-07, + "learning_rate": 9.352688642790452e-06, + "loss": 0.0031, + "step": 14490 + }, + { + "epoch": 0.0647758086924668, + "grad_norm": 0.0005741612403653562, + "learning_rate": 9.352241913075332e-06, + "loss": 0.0, + "step": 14500 + }, + { + "epoch": 0.06482048166397884, + "grad_norm": 2.8474836377689883e-10, + "learning_rate": 9.351795183360213e-06, + "loss": 0.0001, + "step": 14510 + }, + { + "epoch": 0.06486515463549089, + "grad_norm": 1.0916278370132204e-07, + "learning_rate": 9.351348453645091e-06, + "loss": 0.0, + "step": 14520 + }, + { + "epoch": 0.06490982760700294, + "grad_norm": 1.2938996662298363e-12, + "learning_rate": 9.350901723929971e-06, + "loss": 0.0, + "step": 14530 + }, + { + "epoch": 0.06495450057851498, + "grad_norm": 1.8603714055376308e-11, + "learning_rate": 9.350454994214851e-06, + "loss": 0.0022, + "step": 14540 + }, + { + "epoch": 0.06499917355002703, + "grad_norm": 5.892105026106265e-09, + "learning_rate": 9.35000826449973e-06, + "loss": 0.775, + "step": 14550 + }, + { + "epoch": 0.06504384652153908, + "grad_norm": 0.0011221965542063117, + "learning_rate": 9.34956153478461e-06, + "loss": 0.0, + "step": 14560 + }, + { + "epoch": 0.06508851949305113, + "grad_norm": 4.0386902355749044e-07, + "learning_rate": 9.349114805069488e-06, + "loss": 0.0, + "step": 14570 + }, + { + "epoch": 0.06513319246456317, + "grad_norm": 2.900226170332587e-10, + "learning_rate": 9.348668075354368e-06, + "loss": 0.0, + "step": 14580 + }, + { + "epoch": 0.0651778654360752, + "grad_norm": 1.719579268033729e-15, + "learning_rate": 9.348221345639249e-06, + "loss": 0.0001, + "step": 14590 + }, + { + "epoch": 0.06522253840758725, + "grad_norm": 2.6878053602241714e-11, + "learning_rate": 9.347774615924129e-06, + "loss": 0.0009, + "step": 14600 + }, + { + "epoch": 0.0652672113790993, + "grad_norm": 4.025258316768532e-13, + "learning_rate": 9.347327886209007e-06, + "loss": 0.0, + "step": 14610 + }, + { + "epoch": 0.06531188435061135, + "grad_norm": 2.202270508178117e-07, + "learning_rate": 9.346881156493887e-06, + "loss": 0.0045, + "step": 14620 + }, + { + "epoch": 0.0653565573221234, + "grad_norm": 11.413064956665039, + "learning_rate": 9.346434426778767e-06, + "loss": 0.0016, + "step": 14630 + }, + { + "epoch": 0.06540123029363544, + "grad_norm": 0.0013619032688438892, + "learning_rate": 9.345987697063646e-06, + "loss": 0.0, + "step": 14640 + }, + { + "epoch": 0.06544590326514749, + "grad_norm": 1.1321811679365981e-13, + "learning_rate": 9.345540967348526e-06, + "loss": 0.1317, + "step": 14650 + }, + { + "epoch": 0.06549057623665953, + "grad_norm": 0.0016499441117048264, + "learning_rate": 9.345094237633406e-06, + "loss": 0.0002, + "step": 14660 + }, + { + "epoch": 0.06553524920817158, + "grad_norm": 2.5986149538616277e-13, + "learning_rate": 9.344647507918284e-06, + "loss": 0.166, + "step": 14670 + }, + { + "epoch": 0.06557992217968363, + "grad_norm": 0.00024417508393526077, + "learning_rate": 9.344200778203165e-06, + "loss": 0.0004, + "step": 14680 + }, + { + "epoch": 0.06562459515119567, + "grad_norm": 0.21002331376075745, + "learning_rate": 9.343754048488045e-06, + "loss": 0.0001, + "step": 14690 + }, + { + "epoch": 0.06566926812270772, + "grad_norm": 30.844282150268555, + "learning_rate": 9.343307318772923e-06, + "loss": 0.2183, + "step": 14700 + }, + { + "epoch": 0.06571394109421977, + "grad_norm": 2.5580968099347956e-07, + "learning_rate": 9.342860589057803e-06, + "loss": 0.0, + "step": 14710 + }, + { + "epoch": 0.06575861406573182, + "grad_norm": 5.260941406959319e-07, + "learning_rate": 9.342413859342683e-06, + "loss": 0.0, + "step": 14720 + }, + { + "epoch": 0.06580328703724386, + "grad_norm": 0.0003637924382928759, + "learning_rate": 9.341967129627563e-06, + "loss": 0.0, + "step": 14730 + }, + { + "epoch": 0.0658479600087559, + "grad_norm": 2.3141837296285217e-13, + "learning_rate": 9.341520399912442e-06, + "loss": 0.0, + "step": 14740 + }, + { + "epoch": 0.06589263298026794, + "grad_norm": 4.705751166511618e-07, + "learning_rate": 9.341073670197322e-06, + "loss": 0.0041, + "step": 14750 + }, + { + "epoch": 0.06593730595177999, + "grad_norm": 9.326619476723863e-09, + "learning_rate": 9.3406269404822e-06, + "loss": 0.0004, + "step": 14760 + }, + { + "epoch": 0.06598197892329204, + "grad_norm": 2.6402527386615304e-10, + "learning_rate": 9.34018021076708e-06, + "loss": 0.0, + "step": 14770 + }, + { + "epoch": 0.06602665189480408, + "grad_norm": 7.31518491614569e-12, + "learning_rate": 9.33973348105196e-06, + "loss": 0.0, + "step": 14780 + }, + { + "epoch": 0.06607132486631613, + "grad_norm": 0.0001492876763222739, + "learning_rate": 9.33928675133684e-06, + "loss": 0.0018, + "step": 14790 + }, + { + "epoch": 0.06611599783782818, + "grad_norm": 7.935686153359711e-05, + "learning_rate": 9.33884002162172e-06, + "loss": 0.0034, + "step": 14800 + }, + { + "epoch": 0.06616067080934022, + "grad_norm": 4.462126668158817e-09, + "learning_rate": 9.3383932919066e-06, + "loss": 0.0, + "step": 14810 + }, + { + "epoch": 0.06620534378085227, + "grad_norm": 9.117017611970368e-07, + "learning_rate": 9.337946562191478e-06, + "loss": 0.0001, + "step": 14820 + }, + { + "epoch": 0.06625001675236432, + "grad_norm": 1.0151539981961832e-06, + "learning_rate": 9.337499832476358e-06, + "loss": 0.0387, + "step": 14830 + }, + { + "epoch": 0.06629468972387637, + "grad_norm": 1.5695963156758808e-05, + "learning_rate": 9.337053102761238e-06, + "loss": 0.0, + "step": 14840 + }, + { + "epoch": 0.06633936269538841, + "grad_norm": 0.00035690312506631017, + "learning_rate": 9.336606373046117e-06, + "loss": 0.0043, + "step": 14850 + }, + { + "epoch": 0.06638403566690046, + "grad_norm": 0.0006880881846882403, + "learning_rate": 9.336159643330997e-06, + "loss": 0.0002, + "step": 14860 + }, + { + "epoch": 0.0664287086384125, + "grad_norm": 4.308242080279393e-15, + "learning_rate": 9.335712913615875e-06, + "loss": 0.0, + "step": 14870 + }, + { + "epoch": 0.06647338160992455, + "grad_norm": 0.030537355691194534, + "learning_rate": 9.335266183900755e-06, + "loss": 0.0, + "step": 14880 + }, + { + "epoch": 0.0665180545814366, + "grad_norm": 0.016046520322561264, + "learning_rate": 9.334819454185635e-06, + "loss": 0.0, + "step": 14890 + }, + { + "epoch": 0.06656272755294863, + "grad_norm": 8.736069139558822e-05, + "learning_rate": 9.334372724470514e-06, + "loss": 0.0001, + "step": 14900 + }, + { + "epoch": 0.06660740052446068, + "grad_norm": 9.695746659045596e-21, + "learning_rate": 9.333925994755394e-06, + "loss": 0.0357, + "step": 14910 + }, + { + "epoch": 0.06665207349597273, + "grad_norm": 1.6318119833158562e-07, + "learning_rate": 9.333479265040274e-06, + "loss": 0.0, + "step": 14920 + }, + { + "epoch": 0.06669674646748477, + "grad_norm": 6.515248287541908e-07, + "learning_rate": 9.333032535325152e-06, + "loss": 0.0, + "step": 14930 + }, + { + "epoch": 0.06674141943899682, + "grad_norm": 3.7281594700289133e-07, + "learning_rate": 9.332585805610033e-06, + "loss": 0.0021, + "step": 14940 + }, + { + "epoch": 0.06678609241050887, + "grad_norm": 1.641012048470003e-10, + "learning_rate": 9.332139075894911e-06, + "loss": 0.0202, + "step": 14950 + }, + { + "epoch": 0.06683076538202092, + "grad_norm": 4.301578428567154e-06, + "learning_rate": 9.331692346179791e-06, + "loss": 0.0002, + "step": 14960 + }, + { + "epoch": 0.06687543835353296, + "grad_norm": 1.3561146563340065e-13, + "learning_rate": 9.331245616464671e-06, + "loss": 0.0002, + "step": 14970 + }, + { + "epoch": 0.06692011132504501, + "grad_norm": 0.03523876518011093, + "learning_rate": 9.33079888674955e-06, + "loss": 0.0089, + "step": 14980 + }, + { + "epoch": 0.06696478429655706, + "grad_norm": 8.709083801350381e-13, + "learning_rate": 9.33035215703443e-06, + "loss": 0.0001, + "step": 14990 + }, + { + "epoch": 0.0670094572680691, + "grad_norm": 1.5945241571557744e-08, + "learning_rate": 9.32990542731931e-06, + "loss": 0.0, + "step": 15000 + }, + { + "epoch": 0.06705413023958115, + "grad_norm": 2.682080391114283e-10, + "learning_rate": 9.329458697604188e-06, + "loss": 0.0002, + "step": 15010 + }, + { + "epoch": 0.0670988032110932, + "grad_norm": 8.974782722548369e-11, + "learning_rate": 9.329011967889069e-06, + "loss": 0.0001, + "step": 15020 + }, + { + "epoch": 0.06714347618260524, + "grad_norm": 7.33713756417842e-10, + "learning_rate": 9.328565238173949e-06, + "loss": 0.0, + "step": 15030 + }, + { + "epoch": 0.06718814915411729, + "grad_norm": 1.4199095232925174e-08, + "learning_rate": 9.328118508458827e-06, + "loss": 0.0, + "step": 15040 + }, + { + "epoch": 0.06723282212562932, + "grad_norm": 2.654218178577139e-09, + "learning_rate": 9.327671778743707e-06, + "loss": 0.0016, + "step": 15050 + }, + { + "epoch": 0.06727749509714137, + "grad_norm": 1.912253111391836e-15, + "learning_rate": 9.327225049028587e-06, + "loss": 0.0, + "step": 15060 + }, + { + "epoch": 0.06732216806865342, + "grad_norm": 1.1236858554752871e-08, + "learning_rate": 9.326778319313466e-06, + "loss": 0.0, + "step": 15070 + }, + { + "epoch": 0.06736684104016546, + "grad_norm": 0.002570349955931306, + "learning_rate": 9.326331589598346e-06, + "loss": 0.0, + "step": 15080 + }, + { + "epoch": 0.06741151401167751, + "grad_norm": 0.00039951750659383833, + "learning_rate": 9.325884859883226e-06, + "loss": 0.0, + "step": 15090 + }, + { + "epoch": 0.06745618698318956, + "grad_norm": 1.2615786008161986e-15, + "learning_rate": 9.325438130168104e-06, + "loss": 0.0002, + "step": 15100 + }, + { + "epoch": 0.0675008599547016, + "grad_norm": 2.0209645299473777e-05, + "learning_rate": 9.324991400452985e-06, + "loss": 0.0, + "step": 15110 + }, + { + "epoch": 0.06754553292621365, + "grad_norm": 1.8193735513705178e-06, + "learning_rate": 9.324544670737865e-06, + "loss": 0.0, + "step": 15120 + }, + { + "epoch": 0.0675902058977257, + "grad_norm": 1.6376893667728246e-16, + "learning_rate": 9.324097941022743e-06, + "loss": 0.0, + "step": 15130 + }, + { + "epoch": 0.06763487886923775, + "grad_norm": 2.2886815071105957, + "learning_rate": 9.323651211307623e-06, + "loss": 0.0207, + "step": 15140 + }, + { + "epoch": 0.0676795518407498, + "grad_norm": 0.00047048478154465556, + "learning_rate": 9.323204481592503e-06, + "loss": 0.0001, + "step": 15150 + }, + { + "epoch": 0.06772422481226184, + "grad_norm": 3.897758688253816e-06, + "learning_rate": 9.322757751877382e-06, + "loss": 0.0055, + "step": 15160 + }, + { + "epoch": 0.06776889778377389, + "grad_norm": 6.203567863849457e-07, + "learning_rate": 9.322311022162262e-06, + "loss": 0.0, + "step": 15170 + }, + { + "epoch": 0.06781357075528593, + "grad_norm": 2.630331891850801e-06, + "learning_rate": 9.321864292447142e-06, + "loss": 0.0, + "step": 15180 + }, + { + "epoch": 0.06785824372679798, + "grad_norm": 1.54407257228987e-17, + "learning_rate": 9.321417562732022e-06, + "loss": 0.0, + "step": 15190 + }, + { + "epoch": 0.06790291669831001, + "grad_norm": 1.8115958386188957e-11, + "learning_rate": 9.3209708330169e-06, + "loss": 0.0635, + "step": 15200 + }, + { + "epoch": 0.06794758966982206, + "grad_norm": 3.644802558724902e-13, + "learning_rate": 9.32052410330178e-06, + "loss": 0.0003, + "step": 15210 + }, + { + "epoch": 0.06799226264133411, + "grad_norm": 0.0009677475318312645, + "learning_rate": 9.320077373586661e-06, + "loss": 0.0, + "step": 15220 + }, + { + "epoch": 0.06803693561284616, + "grad_norm": 6.228935951091234e-18, + "learning_rate": 9.31963064387154e-06, + "loss": 0.0, + "step": 15230 + }, + { + "epoch": 0.0680816085843582, + "grad_norm": 4.024088411824778e-05, + "learning_rate": 9.31918391415642e-06, + "loss": 0.1055, + "step": 15240 + }, + { + "epoch": 0.06812628155587025, + "grad_norm": 0.04487450420856476, + "learning_rate": 9.318737184441298e-06, + "loss": 0.0, + "step": 15250 + }, + { + "epoch": 0.0681709545273823, + "grad_norm": 4.879202426068046e-15, + "learning_rate": 9.318290454726178e-06, + "loss": 0.0452, + "step": 15260 + }, + { + "epoch": 0.06821562749889434, + "grad_norm": 0.0355444997549057, + "learning_rate": 9.317843725011058e-06, + "loss": 0.0007, + "step": 15270 + }, + { + "epoch": 0.06826030047040639, + "grad_norm": 1.9042292742632583e-10, + "learning_rate": 9.317396995295937e-06, + "loss": 0.0004, + "step": 15280 + }, + { + "epoch": 0.06830497344191844, + "grad_norm": 0.001804482308216393, + "learning_rate": 9.316950265580817e-06, + "loss": 0.0006, + "step": 15290 + }, + { + "epoch": 0.06834964641343048, + "grad_norm": 7.79038145992672e-06, + "learning_rate": 9.316503535865697e-06, + "loss": 0.0, + "step": 15300 + }, + { + "epoch": 0.06839431938494253, + "grad_norm": 0.006531926803290844, + "learning_rate": 9.316056806150575e-06, + "loss": 0.0, + "step": 15310 + }, + { + "epoch": 0.06843899235645458, + "grad_norm": 0.00145976641215384, + "learning_rate": 9.315610076435455e-06, + "loss": 0.0, + "step": 15320 + }, + { + "epoch": 0.06848366532796663, + "grad_norm": 1.1706009672707296e-06, + "learning_rate": 9.315163346720335e-06, + "loss": 0.0026, + "step": 15330 + }, + { + "epoch": 0.06852833829947867, + "grad_norm": 5.742387454432674e-09, + "learning_rate": 9.314716617005214e-06, + "loss": 0.0002, + "step": 15340 + }, + { + "epoch": 0.06857301127099072, + "grad_norm": 2.1157748699188232, + "learning_rate": 9.314269887290094e-06, + "loss": 0.0007, + "step": 15350 + }, + { + "epoch": 0.06861768424250275, + "grad_norm": 8.668970394865028e-07, + "learning_rate": 9.313823157574972e-06, + "loss": 0.0, + "step": 15360 + }, + { + "epoch": 0.0686623572140148, + "grad_norm": 0.00040037737926468253, + "learning_rate": 9.313376427859853e-06, + "loss": 0.0011, + "step": 15370 + }, + { + "epoch": 0.06870703018552685, + "grad_norm": 0.06943363696336746, + "learning_rate": 9.312929698144733e-06, + "loss": 0.0127, + "step": 15380 + }, + { + "epoch": 0.0687517031570389, + "grad_norm": 0.0003452890960033983, + "learning_rate": 9.312482968429611e-06, + "loss": 0.0, + "step": 15390 + }, + { + "epoch": 0.06879637612855094, + "grad_norm": 324.0003967285156, + "learning_rate": 9.312036238714491e-06, + "loss": 0.1562, + "step": 15400 + }, + { + "epoch": 0.06884104910006299, + "grad_norm": 0.0001255038077943027, + "learning_rate": 9.311589508999371e-06, + "loss": 0.008, + "step": 15410 + }, + { + "epoch": 0.06888572207157503, + "grad_norm": 0.10439004004001617, + "learning_rate": 9.31114277928425e-06, + "loss": 0.0001, + "step": 15420 + }, + { + "epoch": 0.06893039504308708, + "grad_norm": 0.010804546996951103, + "learning_rate": 9.31069604956913e-06, + "loss": 0.0008, + "step": 15430 + }, + { + "epoch": 0.06897506801459913, + "grad_norm": 1.23363960824463e-07, + "learning_rate": 9.31024931985401e-06, + "loss": 0.0, + "step": 15440 + }, + { + "epoch": 0.06901974098611117, + "grad_norm": 1.2098164809369594e-14, + "learning_rate": 9.309802590138888e-06, + "loss": 0.0, + "step": 15450 + }, + { + "epoch": 0.06906441395762322, + "grad_norm": 3.7172107297638035e-13, + "learning_rate": 9.309355860423769e-06, + "loss": 0.305, + "step": 15460 + }, + { + "epoch": 0.06910908692913527, + "grad_norm": 1.8141294266501087e-15, + "learning_rate": 9.308909130708647e-06, + "loss": 0.0, + "step": 15470 + }, + { + "epoch": 0.06915375990064732, + "grad_norm": 4.109392166137695, + "learning_rate": 9.308462400993527e-06, + "loss": 0.0006, + "step": 15480 + }, + { + "epoch": 0.06919843287215936, + "grad_norm": 634.8197631835938, + "learning_rate": 9.308015671278407e-06, + "loss": 0.3976, + "step": 15490 + }, + { + "epoch": 0.06924310584367141, + "grad_norm": 7.697084426879883, + "learning_rate": 9.307568941563286e-06, + "loss": 0.0018, + "step": 15500 + }, + { + "epoch": 0.06928777881518344, + "grad_norm": 0.9893437027931213, + "learning_rate": 9.307122211848166e-06, + "loss": 0.0978, + "step": 15510 + }, + { + "epoch": 0.06933245178669549, + "grad_norm": 3.965033101849258e-06, + "learning_rate": 9.306675482133046e-06, + "loss": 0.0, + "step": 15520 + }, + { + "epoch": 0.06937712475820754, + "grad_norm": 481.1986083984375, + "learning_rate": 9.306228752417924e-06, + "loss": 0.6469, + "step": 15530 + }, + { + "epoch": 0.06942179772971958, + "grad_norm": 0.00033715751487761736, + "learning_rate": 9.305782022702805e-06, + "loss": 0.0, + "step": 15540 + }, + { + "epoch": 0.06946647070123163, + "grad_norm": 740.4984741210938, + "learning_rate": 9.305335292987685e-06, + "loss": 0.6, + "step": 15550 + }, + { + "epoch": 0.06951114367274368, + "grad_norm": 1.4198823919286951e-05, + "learning_rate": 9.304888563272563e-06, + "loss": 0.3234, + "step": 15560 + }, + { + "epoch": 0.06955581664425572, + "grad_norm": 7.220014595077373e-06, + "learning_rate": 9.304441833557443e-06, + "loss": 0.0, + "step": 15570 + }, + { + "epoch": 0.06960048961576777, + "grad_norm": 3.1226806640625, + "learning_rate": 9.303995103842323e-06, + "loss": 0.0007, + "step": 15580 + }, + { + "epoch": 0.06964516258727982, + "grad_norm": 7.694500823163253e-07, + "learning_rate": 9.303548374127202e-06, + "loss": 0.016, + "step": 15590 + }, + { + "epoch": 0.06968983555879187, + "grad_norm": 1.0497649327589897e-06, + "learning_rate": 9.303101644412082e-06, + "loss": 0.0, + "step": 15600 + }, + { + "epoch": 0.06973450853030391, + "grad_norm": 2.502213760635641e-07, + "learning_rate": 9.302654914696962e-06, + "loss": 0.0, + "step": 15610 + }, + { + "epoch": 0.06977918150181596, + "grad_norm": 9.276220199957996e-13, + "learning_rate": 9.30220818498184e-06, + "loss": 0.0008, + "step": 15620 + }, + { + "epoch": 0.069823854473328, + "grad_norm": 5.567364902425709e-10, + "learning_rate": 9.30176145526672e-06, + "loss": 0.0001, + "step": 15630 + }, + { + "epoch": 0.06986852744484005, + "grad_norm": 4.1637374437186736e-08, + "learning_rate": 9.3013147255516e-06, + "loss": 0.0, + "step": 15640 + }, + { + "epoch": 0.0699132004163521, + "grad_norm": 8.487996546943655e-14, + "learning_rate": 9.30086799583648e-06, + "loss": 0.0, + "step": 15650 + }, + { + "epoch": 0.06995787338786415, + "grad_norm": 4.741735839695546e-14, + "learning_rate": 9.30042126612136e-06, + "loss": 0.0, + "step": 15660 + }, + { + "epoch": 0.07000254635937618, + "grad_norm": 2.599381900836306e-07, + "learning_rate": 9.29997453640624e-06, + "loss": 0.0, + "step": 15670 + }, + { + "epoch": 0.07004721933088823, + "grad_norm": 8.456419777758128e-07, + "learning_rate": 9.29952780669112e-06, + "loss": 0.0, + "step": 15680 + }, + { + "epoch": 0.07009189230240027, + "grad_norm": 4.6256240947708277e-17, + "learning_rate": 9.299081076975998e-06, + "loss": 0.0114, + "step": 15690 + }, + { + "epoch": 0.07013656527391232, + "grad_norm": 3.173031715847374e-12, + "learning_rate": 9.298634347260878e-06, + "loss": 0.0001, + "step": 15700 + }, + { + "epoch": 0.07018123824542437, + "grad_norm": 0.0004218385729473084, + "learning_rate": 9.298187617545758e-06, + "loss": 0.0003, + "step": 15710 + }, + { + "epoch": 0.07022591121693642, + "grad_norm": 1.6945949028013274e-05, + "learning_rate": 9.297740887830637e-06, + "loss": 0.0079, + "step": 15720 + }, + { + "epoch": 0.07027058418844846, + "grad_norm": 6.766648439487932e-12, + "learning_rate": 9.297294158115517e-06, + "loss": 0.0, + "step": 15730 + }, + { + "epoch": 0.07031525715996051, + "grad_norm": 1.574522101783528e-14, + "learning_rate": 9.296847428400397e-06, + "loss": 0.0024, + "step": 15740 + }, + { + "epoch": 0.07035993013147256, + "grad_norm": 0.001323819742538035, + "learning_rate": 9.296400698685275e-06, + "loss": 0.0, + "step": 15750 + }, + { + "epoch": 0.0704046031029846, + "grad_norm": 6.251634658838157e-06, + "learning_rate": 9.295953968970155e-06, + "loss": 0.6818, + "step": 15760 + }, + { + "epoch": 0.07044927607449665, + "grad_norm": 44.87289047241211, + "learning_rate": 9.295507239255034e-06, + "loss": 0.0062, + "step": 15770 + }, + { + "epoch": 0.0704939490460087, + "grad_norm": 2.0296934163770153e-13, + "learning_rate": 9.295060509539914e-06, + "loss": 0.0011, + "step": 15780 + }, + { + "epoch": 0.07053862201752074, + "grad_norm": 2.8807021408283617e-06, + "learning_rate": 9.294613779824794e-06, + "loss": 0.0, + "step": 15790 + }, + { + "epoch": 0.07058329498903279, + "grad_norm": 4.878136678598821e-06, + "learning_rate": 9.294167050109673e-06, + "loss": 0.0, + "step": 15800 + }, + { + "epoch": 0.07062796796054484, + "grad_norm": 2.258621067952049e-10, + "learning_rate": 9.293720320394553e-06, + "loss": 0.1805, + "step": 15810 + }, + { + "epoch": 0.07067264093205687, + "grad_norm": 5.581754521699622e-05, + "learning_rate": 9.293273590679433e-06, + "loss": 0.0, + "step": 15820 + }, + { + "epoch": 0.07071731390356892, + "grad_norm": 1.2751265785482246e-05, + "learning_rate": 9.292826860964311e-06, + "loss": 0.0001, + "step": 15830 + }, + { + "epoch": 0.07076198687508096, + "grad_norm": 0.00011787868425017223, + "learning_rate": 9.292380131249191e-06, + "loss": 0.0, + "step": 15840 + }, + { + "epoch": 0.07080665984659301, + "grad_norm": 0.0021484734024852514, + "learning_rate": 9.29193340153407e-06, + "loss": 0.0824, + "step": 15850 + }, + { + "epoch": 0.07085133281810506, + "grad_norm": 0.1802006959915161, + "learning_rate": 9.29148667181895e-06, + "loss": 0.0, + "step": 15860 + }, + { + "epoch": 0.0708960057896171, + "grad_norm": 0.08214273303747177, + "learning_rate": 9.29103994210383e-06, + "loss": 0.0019, + "step": 15870 + }, + { + "epoch": 0.07094067876112915, + "grad_norm": 1.5906814336776733, + "learning_rate": 9.290593212388708e-06, + "loss": 0.0015, + "step": 15880 + }, + { + "epoch": 0.0709853517326412, + "grad_norm": 2.6559273926530223e-16, + "learning_rate": 9.290146482673589e-06, + "loss": 0.0004, + "step": 15890 + }, + { + "epoch": 0.07103002470415325, + "grad_norm": 4.014354999526404e-06, + "learning_rate": 9.289699752958469e-06, + "loss": 0.0, + "step": 15900 + }, + { + "epoch": 0.0710746976756653, + "grad_norm": 0.007187653798609972, + "learning_rate": 9.289253023243347e-06, + "loss": 0.0, + "step": 15910 + }, + { + "epoch": 0.07111937064717734, + "grad_norm": 1.90598314908641e-09, + "learning_rate": 9.288806293528227e-06, + "loss": 0.0005, + "step": 15920 + }, + { + "epoch": 0.07116404361868939, + "grad_norm": 7.094026429656231e-13, + "learning_rate": 9.288359563813107e-06, + "loss": 0.0825, + "step": 15930 + }, + { + "epoch": 0.07120871659020143, + "grad_norm": 2.1279684479069667e-16, + "learning_rate": 9.287912834097986e-06, + "loss": 0.0038, + "step": 15940 + }, + { + "epoch": 0.07125338956171348, + "grad_norm": 1.595363841033759e-10, + "learning_rate": 9.287466104382866e-06, + "loss": 0.0, + "step": 15950 + }, + { + "epoch": 0.07129806253322553, + "grad_norm": 5.384310280075709e-14, + "learning_rate": 9.287019374667744e-06, + "loss": 0.0, + "step": 15960 + }, + { + "epoch": 0.07134273550473758, + "grad_norm": 0.05375685170292854, + "learning_rate": 9.286572644952625e-06, + "loss": 0.0843, + "step": 15970 + }, + { + "epoch": 0.07138740847624961, + "grad_norm": 0.05173249915242195, + "learning_rate": 9.286125915237505e-06, + "loss": 0.0005, + "step": 15980 + }, + { + "epoch": 0.07143208144776166, + "grad_norm": 1140.3134765625, + "learning_rate": 9.285679185522383e-06, + "loss": 0.5876, + "step": 15990 + }, + { + "epoch": 0.0714767544192737, + "grad_norm": 7.1331573963107076e-06, + "learning_rate": 9.285232455807263e-06, + "loss": 0.0428, + "step": 16000 + }, + { + "epoch": 0.07152142739078575, + "grad_norm": 3.3845615234895376e-06, + "learning_rate": 9.284785726092143e-06, + "loss": 0.0336, + "step": 16010 + }, + { + "epoch": 0.0715661003622978, + "grad_norm": 0.0025598302017897367, + "learning_rate": 9.284338996377022e-06, + "loss": 0.008, + "step": 16020 + }, + { + "epoch": 0.07161077333380984, + "grad_norm": 1.5907980198595205e-16, + "learning_rate": 9.283892266661902e-06, + "loss": 0.0, + "step": 16030 + }, + { + "epoch": 0.07165544630532189, + "grad_norm": 13.82027530670166, + "learning_rate": 9.283445536946782e-06, + "loss": 0.006, + "step": 16040 + }, + { + "epoch": 0.07170011927683394, + "grad_norm": 3.120620428731513e-11, + "learning_rate": 9.28299880723166e-06, + "loss": 0.0, + "step": 16050 + }, + { + "epoch": 0.07174479224834598, + "grad_norm": 6.461245760647216e-09, + "learning_rate": 9.28255207751654e-06, + "loss": 0.0, + "step": 16060 + }, + { + "epoch": 0.07178946521985803, + "grad_norm": 4.7788863144493376e-11, + "learning_rate": 9.28210534780142e-06, + "loss": 0.0003, + "step": 16070 + }, + { + "epoch": 0.07183413819137008, + "grad_norm": 467.54833984375, + "learning_rate": 9.281658618086299e-06, + "loss": 0.2234, + "step": 16080 + }, + { + "epoch": 0.07187881116288213, + "grad_norm": 1.4692007376318483e-13, + "learning_rate": 9.28121188837118e-06, + "loss": 0.0919, + "step": 16090 + }, + { + "epoch": 0.07192348413439417, + "grad_norm": 2.0292225677565057e-09, + "learning_rate": 9.28076515865606e-06, + "loss": 0.0, + "step": 16100 + }, + { + "epoch": 0.07196815710590622, + "grad_norm": 6.15861320274469e-11, + "learning_rate": 9.28031842894094e-06, + "loss": 0.1408, + "step": 16110 + }, + { + "epoch": 0.07201283007741827, + "grad_norm": 5.3670348165724135e-09, + "learning_rate": 9.279871699225818e-06, + "loss": 0.45, + "step": 16120 + }, + { + "epoch": 0.0720575030489303, + "grad_norm": 7.453482275820988e-13, + "learning_rate": 9.279424969510698e-06, + "loss": 0.0, + "step": 16130 + }, + { + "epoch": 0.07210217602044235, + "grad_norm": 0.0009137971210293472, + "learning_rate": 9.278978239795578e-06, + "loss": 0.0002, + "step": 16140 + }, + { + "epoch": 0.0721468489919544, + "grad_norm": 0.03269213065505028, + "learning_rate": 9.278531510080457e-06, + "loss": 0.0005, + "step": 16150 + }, + { + "epoch": 0.07219152196346644, + "grad_norm": 9.894575825342145e-09, + "learning_rate": 9.278084780365337e-06, + "loss": 0.0, + "step": 16160 + }, + { + "epoch": 0.07223619493497849, + "grad_norm": 6.140439836599398e-07, + "learning_rate": 9.277638050650217e-06, + "loss": 0.0, + "step": 16170 + }, + { + "epoch": 0.07228086790649053, + "grad_norm": 2.58226134919326e-10, + "learning_rate": 9.277191320935095e-06, + "loss": 0.0, + "step": 16180 + }, + { + "epoch": 0.07232554087800258, + "grad_norm": 4.679202338098154e-12, + "learning_rate": 9.276744591219975e-06, + "loss": 0.0313, + "step": 16190 + }, + { + "epoch": 0.07237021384951463, + "grad_norm": 2.7806524371953856e-07, + "learning_rate": 9.276297861504856e-06, + "loss": 0.0, + "step": 16200 + }, + { + "epoch": 0.07241488682102667, + "grad_norm": 3.463020668836858e-12, + "learning_rate": 9.275851131789734e-06, + "loss": 0.0, + "step": 16210 + }, + { + "epoch": 0.07245955979253872, + "grad_norm": 7.632701415349885e-11, + "learning_rate": 9.275404402074614e-06, + "loss": 0.0313, + "step": 16220 + }, + { + "epoch": 0.07250423276405077, + "grad_norm": 7.043704812123575e-13, + "learning_rate": 9.274957672359494e-06, + "loss": 0.0004, + "step": 16230 + }, + { + "epoch": 0.07254890573556282, + "grad_norm": 0.0031528829131275415, + "learning_rate": 9.274510942644373e-06, + "loss": 0.0, + "step": 16240 + }, + { + "epoch": 0.07259357870707486, + "grad_norm": 2.61696027337166e-06, + "learning_rate": 9.274064212929253e-06, + "loss": 0.0, + "step": 16250 + }, + { + "epoch": 0.07263825167858691, + "grad_norm": 9.001651763916016, + "learning_rate": 9.273617483214131e-06, + "loss": 0.0009, + "step": 16260 + }, + { + "epoch": 0.07268292465009896, + "grad_norm": 8.616438407216265e-08, + "learning_rate": 9.273170753499011e-06, + "loss": 0.0, + "step": 16270 + }, + { + "epoch": 0.07272759762161099, + "grad_norm": 1.6792819912403445e-10, + "learning_rate": 9.272724023783891e-06, + "loss": 0.0, + "step": 16280 + }, + { + "epoch": 0.07277227059312304, + "grad_norm": 1.614965228213805e-08, + "learning_rate": 9.27227729406877e-06, + "loss": 0.0, + "step": 16290 + }, + { + "epoch": 0.07281694356463508, + "grad_norm": 0.0012801244156435132, + "learning_rate": 9.27183056435365e-06, + "loss": 0.0, + "step": 16300 + }, + { + "epoch": 0.07286161653614713, + "grad_norm": 1.368288451430999e-07, + "learning_rate": 9.27138383463853e-06, + "loss": 0.0145, + "step": 16310 + }, + { + "epoch": 0.07290628950765918, + "grad_norm": 0.010511506348848343, + "learning_rate": 9.270937104923409e-06, + "loss": 0.0, + "step": 16320 + }, + { + "epoch": 0.07295096247917122, + "grad_norm": 0.007803723216056824, + "learning_rate": 9.270490375208289e-06, + "loss": 0.1813, + "step": 16330 + }, + { + "epoch": 0.07299563545068327, + "grad_norm": 3.816868696104869e-12, + "learning_rate": 9.270043645493167e-06, + "loss": 0.0904, + "step": 16340 + }, + { + "epoch": 0.07304030842219532, + "grad_norm": 0.005351858213543892, + "learning_rate": 9.269596915778047e-06, + "loss": 0.0014, + "step": 16350 + }, + { + "epoch": 0.07308498139370737, + "grad_norm": 1.0312246889743193e-10, + "learning_rate": 9.269150186062927e-06, + "loss": 0.0, + "step": 16360 + }, + { + "epoch": 0.07312965436521941, + "grad_norm": 8.958149326498965e-12, + "learning_rate": 9.268703456347806e-06, + "loss": 0.0, + "step": 16370 + }, + { + "epoch": 0.07317432733673146, + "grad_norm": 1.3682298566700113e-12, + "learning_rate": 9.268256726632686e-06, + "loss": 0.0, + "step": 16380 + }, + { + "epoch": 0.0732190003082435, + "grad_norm": 5.427913674083129e-10, + "learning_rate": 9.267809996917566e-06, + "loss": 0.0, + "step": 16390 + }, + { + "epoch": 0.07326367327975555, + "grad_norm": 4.0604925288789673e-07, + "learning_rate": 9.267363267202445e-06, + "loss": 0.0092, + "step": 16400 + }, + { + "epoch": 0.0733083462512676, + "grad_norm": 2.0408910786490604e-16, + "learning_rate": 9.266916537487325e-06, + "loss": 0.0, + "step": 16410 + }, + { + "epoch": 0.07335301922277965, + "grad_norm": 9.766434959601611e-05, + "learning_rate": 9.266469807772205e-06, + "loss": 0.0348, + "step": 16420 + }, + { + "epoch": 0.0733976921942917, + "grad_norm": 0.0839284136891365, + "learning_rate": 9.266023078057083e-06, + "loss": 0.0, + "step": 16430 + }, + { + "epoch": 0.07344236516580373, + "grad_norm": 3.1561714877170743e-06, + "learning_rate": 9.265576348341963e-06, + "loss": 0.0002, + "step": 16440 + }, + { + "epoch": 0.07348703813731577, + "grad_norm": 0.004904979839920998, + "learning_rate": 9.265129618626842e-06, + "loss": 0.0, + "step": 16450 + }, + { + "epoch": 0.07353171110882782, + "grad_norm": 4.0802338075403196e-19, + "learning_rate": 9.264682888911722e-06, + "loss": 0.0, + "step": 16460 + }, + { + "epoch": 0.07357638408033987, + "grad_norm": 9.05775454640434e-12, + "learning_rate": 9.264236159196602e-06, + "loss": 0.0, + "step": 16470 + }, + { + "epoch": 0.07362105705185192, + "grad_norm": 1.586842289513779e-11, + "learning_rate": 9.26378942948148e-06, + "loss": 0.0, + "step": 16480 + }, + { + "epoch": 0.07366573002336396, + "grad_norm": 5.9178513765800744e-05, + "learning_rate": 9.26334269976636e-06, + "loss": 0.0016, + "step": 16490 + }, + { + "epoch": 0.07371040299487601, + "grad_norm": 625.9370727539062, + "learning_rate": 9.26289597005124e-06, + "loss": 1.0798, + "step": 16500 + }, + { + "epoch": 0.07375507596638806, + "grad_norm": 14.680355072021484, + "learning_rate": 9.262449240336119e-06, + "loss": 0.0029, + "step": 16510 + }, + { + "epoch": 0.0737997489379001, + "grad_norm": 9.009173717103563e-16, + "learning_rate": 9.262002510621e-06, + "loss": 3.1769, + "step": 16520 + }, + { + "epoch": 0.07384442190941215, + "grad_norm": 4.8403515393147245e-05, + "learning_rate": 9.26155578090588e-06, + "loss": 0.0004, + "step": 16530 + }, + { + "epoch": 0.0738890948809242, + "grad_norm": 2.278566716995556e-05, + "learning_rate": 9.261109051190758e-06, + "loss": 0.0, + "step": 16540 + }, + { + "epoch": 0.07393376785243624, + "grad_norm": 3.8744247393696085e-11, + "learning_rate": 9.260662321475638e-06, + "loss": 0.0, + "step": 16550 + }, + { + "epoch": 0.07397844082394829, + "grad_norm": 4.385966936859553e-11, + "learning_rate": 9.260215591760518e-06, + "loss": 0.2777, + "step": 16560 + }, + { + "epoch": 0.07402311379546034, + "grad_norm": 1.677570062030842e-11, + "learning_rate": 9.259768862045398e-06, + "loss": 0.0027, + "step": 16570 + }, + { + "epoch": 0.07406778676697238, + "grad_norm": 3.435206633040089e-11, + "learning_rate": 9.259322132330277e-06, + "loss": 0.0025, + "step": 16580 + }, + { + "epoch": 0.07411245973848442, + "grad_norm": 2.0525411231764323e-11, + "learning_rate": 9.258875402615157e-06, + "loss": 0.0, + "step": 16590 + }, + { + "epoch": 0.07415713270999647, + "grad_norm": 3.8538229318874073e-07, + "learning_rate": 9.258428672900037e-06, + "loss": 0.0, + "step": 16600 + }, + { + "epoch": 0.07420180568150851, + "grad_norm": 6.099716565177715e-11, + "learning_rate": 9.257981943184915e-06, + "loss": 0.0045, + "step": 16610 + }, + { + "epoch": 0.07424647865302056, + "grad_norm": 1.0236337819047314e-11, + "learning_rate": 9.257535213469795e-06, + "loss": 0.0001, + "step": 16620 + }, + { + "epoch": 0.0742911516245326, + "grad_norm": 0.0001892660220619291, + "learning_rate": 9.257088483754676e-06, + "loss": 0.0001, + "step": 16630 + }, + { + "epoch": 0.07433582459604465, + "grad_norm": 0.30247747898101807, + "learning_rate": 9.256641754039554e-06, + "loss": 0.0252, + "step": 16640 + }, + { + "epoch": 0.0743804975675567, + "grad_norm": 6.412907531394563e-12, + "learning_rate": 9.256195024324434e-06, + "loss": 0.016, + "step": 16650 + }, + { + "epoch": 0.07442517053906875, + "grad_norm": 5.720272611142718e-07, + "learning_rate": 9.255748294609314e-06, + "loss": 0.0, + "step": 16660 + }, + { + "epoch": 0.0744698435105808, + "grad_norm": 4.307298695493955e-07, + "learning_rate": 9.255301564894193e-06, + "loss": 0.0003, + "step": 16670 + }, + { + "epoch": 0.07451451648209284, + "grad_norm": 1.700377338931247e-11, + "learning_rate": 9.254854835179073e-06, + "loss": 0.0, + "step": 16680 + }, + { + "epoch": 0.07455918945360489, + "grad_norm": 0.0009698119829408824, + "learning_rate": 9.254408105463953e-06, + "loss": 0.0001, + "step": 16690 + }, + { + "epoch": 0.07460386242511693, + "grad_norm": 750.8846435546875, + "learning_rate": 9.253961375748831e-06, + "loss": 0.3531, + "step": 16700 + }, + { + "epoch": 0.07464853539662898, + "grad_norm": 0.014949750155210495, + "learning_rate": 9.253514646033711e-06, + "loss": 0.0001, + "step": 16710 + }, + { + "epoch": 0.07469320836814103, + "grad_norm": 1.4294463426267612e-06, + "learning_rate": 9.253067916318592e-06, + "loss": 0.0226, + "step": 16720 + }, + { + "epoch": 0.07473788133965308, + "grad_norm": 1.61623481525347e-12, + "learning_rate": 9.25262118660347e-06, + "loss": 0.0127, + "step": 16730 + }, + { + "epoch": 0.07478255431116512, + "grad_norm": 2.6460383883986083e-10, + "learning_rate": 9.25217445688835e-06, + "loss": 0.0, + "step": 16740 + }, + { + "epoch": 0.07482722728267716, + "grad_norm": 6.732713245583e-06, + "learning_rate": 9.251727727173229e-06, + "loss": 0.0, + "step": 16750 + }, + { + "epoch": 0.0748719002541892, + "grad_norm": 1.6906284372453229e-07, + "learning_rate": 9.251280997458109e-06, + "loss": 0.016, + "step": 16760 + }, + { + "epoch": 0.07491657322570125, + "grad_norm": 1.4504463408115953e-08, + "learning_rate": 9.250834267742989e-06, + "loss": 0.0061, + "step": 16770 + }, + { + "epoch": 0.0749612461972133, + "grad_norm": 5.551413551074802e-08, + "learning_rate": 9.250387538027867e-06, + "loss": 0.0, + "step": 16780 + }, + { + "epoch": 0.07500591916872534, + "grad_norm": 1.2507527132055163e-11, + "learning_rate": 9.249940808312747e-06, + "loss": 0.0, + "step": 16790 + }, + { + "epoch": 0.07505059214023739, + "grad_norm": 6.933629885519377e-12, + "learning_rate": 9.249494078597628e-06, + "loss": 0.0, + "step": 16800 + }, + { + "epoch": 0.07509526511174944, + "grad_norm": 2.2586274894820235e-07, + "learning_rate": 9.249047348882506e-06, + "loss": 0.0, + "step": 16810 + }, + { + "epoch": 0.07513993808326148, + "grad_norm": 0.005923112854361534, + "learning_rate": 9.248600619167386e-06, + "loss": 0.1061, + "step": 16820 + }, + { + "epoch": 0.07518461105477353, + "grad_norm": 1.0352895270273658e-16, + "learning_rate": 9.248153889452266e-06, + "loss": 0.0, + "step": 16830 + }, + { + "epoch": 0.07522928402628558, + "grad_norm": 1.1537285426202687e-15, + "learning_rate": 9.247707159737145e-06, + "loss": 0.0007, + "step": 16840 + }, + { + "epoch": 0.07527395699779763, + "grad_norm": 2.0707506337203085e-05, + "learning_rate": 9.247260430022025e-06, + "loss": 0.0, + "step": 16850 + }, + { + "epoch": 0.07531862996930967, + "grad_norm": 3.5879608203774616e-13, + "learning_rate": 9.246813700306903e-06, + "loss": 0.0, + "step": 16860 + }, + { + "epoch": 0.07536330294082172, + "grad_norm": 0.00023001543013378978, + "learning_rate": 9.246366970591783e-06, + "loss": 0.0018, + "step": 16870 + }, + { + "epoch": 0.07540797591233377, + "grad_norm": 0.00014396563346963376, + "learning_rate": 9.245920240876663e-06, + "loss": 0.0348, + "step": 16880 + }, + { + "epoch": 0.07545264888384581, + "grad_norm": 9.09528971533291e-05, + "learning_rate": 9.245473511161542e-06, + "loss": 0.0043, + "step": 16890 + }, + { + "epoch": 0.07549732185535785, + "grad_norm": 9.14583757035195e-14, + "learning_rate": 9.245026781446422e-06, + "loss": 0.0, + "step": 16900 + }, + { + "epoch": 0.0755419948268699, + "grad_norm": 1.2336866461737372e-07, + "learning_rate": 9.244580051731302e-06, + "loss": 0.0001, + "step": 16910 + }, + { + "epoch": 0.07558666779838194, + "grad_norm": 3.8933134050012086e-08, + "learning_rate": 9.24413332201618e-06, + "loss": 0.0043, + "step": 16920 + }, + { + "epoch": 0.07563134076989399, + "grad_norm": 1.0874463214349817e-06, + "learning_rate": 9.24368659230106e-06, + "loss": 0.0, + "step": 16930 + }, + { + "epoch": 0.07567601374140603, + "grad_norm": 0.007349396590143442, + "learning_rate": 9.243239862585939e-06, + "loss": 0.0001, + "step": 16940 + }, + { + "epoch": 0.07572068671291808, + "grad_norm": 1.8731738009591936e-06, + "learning_rate": 9.24279313287082e-06, + "loss": 0.0, + "step": 16950 + }, + { + "epoch": 0.07576535968443013, + "grad_norm": 2.11032347579021e-05, + "learning_rate": 9.2423464031557e-06, + "loss": 0.0003, + "step": 16960 + }, + { + "epoch": 0.07581003265594217, + "grad_norm": 5.606299929503122e-13, + "learning_rate": 9.241899673440578e-06, + "loss": 0.2553, + "step": 16970 + }, + { + "epoch": 0.07585470562745422, + "grad_norm": 2.055375739093803e-16, + "learning_rate": 9.241452943725458e-06, + "loss": 0.0, + "step": 16980 + }, + { + "epoch": 0.07589937859896627, + "grad_norm": 534.0773315429688, + "learning_rate": 9.241006214010338e-06, + "loss": 0.6, + "step": 16990 + }, + { + "epoch": 0.07594405157047832, + "grad_norm": 2.370661000838936e-09, + "learning_rate": 9.240559484295216e-06, + "loss": 0.0, + "step": 17000 + }, + { + "epoch": 0.07598872454199036, + "grad_norm": 0.05900305137038231, + "learning_rate": 9.240112754580097e-06, + "loss": 0.0001, + "step": 17010 + }, + { + "epoch": 0.07603339751350241, + "grad_norm": 3.1733261851574213e-12, + "learning_rate": 9.239666024864977e-06, + "loss": 0.0, + "step": 17020 + }, + { + "epoch": 0.07607807048501446, + "grad_norm": 3.1729326926136425e-15, + "learning_rate": 9.239219295149857e-06, + "loss": 0.4924, + "step": 17030 + }, + { + "epoch": 0.0761227434565265, + "grad_norm": 5.282239403270931e-17, + "learning_rate": 9.238772565434735e-06, + "loss": 0.016, + "step": 17040 + }, + { + "epoch": 0.07616741642803855, + "grad_norm": 4.513401847816567e-07, + "learning_rate": 9.238325835719615e-06, + "loss": 0.0, + "step": 17050 + }, + { + "epoch": 0.07621208939955058, + "grad_norm": 4.919018920190865e-06, + "learning_rate": 9.237879106004496e-06, + "loss": 0.0003, + "step": 17060 + }, + { + "epoch": 0.07625676237106263, + "grad_norm": 2.0352650675903305e-09, + "learning_rate": 9.237432376289374e-06, + "loss": 1.3501, + "step": 17070 + }, + { + "epoch": 0.07630143534257468, + "grad_norm": 3.793320841358021e-14, + "learning_rate": 9.236985646574254e-06, + "loss": 0.0824, + "step": 17080 + }, + { + "epoch": 0.07634610831408672, + "grad_norm": 0.03230379521846771, + "learning_rate": 9.236538916859134e-06, + "loss": 0.0, + "step": 17090 + }, + { + "epoch": 0.07639078128559877, + "grad_norm": 0.014470861293375492, + "learning_rate": 9.236092187144013e-06, + "loss": 0.0, + "step": 17100 + }, + { + "epoch": 0.07643545425711082, + "grad_norm": 8.92955795279704e-05, + "learning_rate": 9.235645457428893e-06, + "loss": 0.0, + "step": 17110 + }, + { + "epoch": 0.07648012722862287, + "grad_norm": 0.0006282811518758535, + "learning_rate": 9.235198727713773e-06, + "loss": 0.0, + "step": 17120 + }, + { + "epoch": 0.07652480020013491, + "grad_norm": 1.946099531524581e-13, + "learning_rate": 9.234751997998653e-06, + "loss": 0.093, + "step": 17130 + }, + { + "epoch": 0.07656947317164696, + "grad_norm": 0.43705910444259644, + "learning_rate": 9.234305268283531e-06, + "loss": 0.0001, + "step": 17140 + }, + { + "epoch": 0.076614146143159, + "grad_norm": 1.526163639908873e-10, + "learning_rate": 9.233858538568412e-06, + "loss": 0.009, + "step": 17150 + }, + { + "epoch": 0.07665881911467105, + "grad_norm": 1.7637514702073531e-06, + "learning_rate": 9.23341180885329e-06, + "loss": 0.0, + "step": 17160 + }, + { + "epoch": 0.0767034920861831, + "grad_norm": 0.27547845244407654, + "learning_rate": 9.23296507913817e-06, + "loss": 0.0007, + "step": 17170 + }, + { + "epoch": 0.07674816505769515, + "grad_norm": 3.1078446482979416e-09, + "learning_rate": 9.23251834942305e-06, + "loss": 0.2688, + "step": 17180 + }, + { + "epoch": 0.0767928380292072, + "grad_norm": 2.0889000165030813e-12, + "learning_rate": 9.232071619707929e-06, + "loss": 0.0043, + "step": 17190 + }, + { + "epoch": 0.07683751100071924, + "grad_norm": 2.2112160422693705e-06, + "learning_rate": 9.231624889992809e-06, + "loss": 0.0, + "step": 17200 + }, + { + "epoch": 0.07688218397223127, + "grad_norm": 5.533650968003734e-11, + "learning_rate": 9.231178160277689e-06, + "loss": 0.0, + "step": 17210 + }, + { + "epoch": 0.07692685694374332, + "grad_norm": 8.352378546451066e-16, + "learning_rate": 9.230731430562567e-06, + "loss": 0.0, + "step": 17220 + }, + { + "epoch": 0.07697152991525537, + "grad_norm": 2.0515146388788708e-05, + "learning_rate": 9.230284700847448e-06, + "loss": 0.0575, + "step": 17230 + }, + { + "epoch": 0.07701620288676742, + "grad_norm": 1.9726203025097026e-14, + "learning_rate": 9.229837971132326e-06, + "loss": 0.0, + "step": 17240 + }, + { + "epoch": 0.07706087585827946, + "grad_norm": 1.1162385035387579e-12, + "learning_rate": 9.229391241417206e-06, + "loss": 0.0006, + "step": 17250 + }, + { + "epoch": 0.07710554882979151, + "grad_norm": 1.603686783369085e-08, + "learning_rate": 9.228944511702086e-06, + "loss": 0.0973, + "step": 17260 + }, + { + "epoch": 0.07715022180130356, + "grad_norm": 0.012515800073742867, + "learning_rate": 9.228497781986965e-06, + "loss": 0.0, + "step": 17270 + }, + { + "epoch": 0.0771948947728156, + "grad_norm": 1.2555916660658868e-14, + "learning_rate": 9.228051052271845e-06, + "loss": 0.0, + "step": 17280 + }, + { + "epoch": 0.07723956774432765, + "grad_norm": 1.5260538798717167e-19, + "learning_rate": 9.227604322556725e-06, + "loss": 0.0002, + "step": 17290 + }, + { + "epoch": 0.0772842407158397, + "grad_norm": 4.507444089085766e-07, + "learning_rate": 9.227157592841603e-06, + "loss": 0.0001, + "step": 17300 + }, + { + "epoch": 0.07732891368735174, + "grad_norm": 0.0018883688608184457, + "learning_rate": 9.226710863126483e-06, + "loss": 0.0, + "step": 17310 + }, + { + "epoch": 0.07737358665886379, + "grad_norm": 8.304020582272642e-09, + "learning_rate": 9.226264133411364e-06, + "loss": 0.0, + "step": 17320 + }, + { + "epoch": 0.07741825963037584, + "grad_norm": 3.2100276539192876e-19, + "learning_rate": 9.225817403696242e-06, + "loss": 0.0, + "step": 17330 + }, + { + "epoch": 0.07746293260188788, + "grad_norm": 3.4246664171178054e-08, + "learning_rate": 9.225370673981122e-06, + "loss": 0.001, + "step": 17340 + }, + { + "epoch": 0.07750760557339993, + "grad_norm": 9.922782945792283e-14, + "learning_rate": 9.224923944266e-06, + "loss": 0.0, + "step": 17350 + }, + { + "epoch": 0.07755227854491197, + "grad_norm": 2.466302717607505e-10, + "learning_rate": 9.22447721455088e-06, + "loss": 0.0038, + "step": 17360 + }, + { + "epoch": 0.07759695151642401, + "grad_norm": 1.2259521176005705e-10, + "learning_rate": 9.22403048483576e-06, + "loss": 0.1602, + "step": 17370 + }, + { + "epoch": 0.07764162448793606, + "grad_norm": 1.6446602760233908e-11, + "learning_rate": 9.22358375512064e-06, + "loss": 0.0003, + "step": 17380 + }, + { + "epoch": 0.0776862974594481, + "grad_norm": 4.474438810575521e-06, + "learning_rate": 9.22313702540552e-06, + "loss": 0.0, + "step": 17390 + }, + { + "epoch": 0.07773097043096015, + "grad_norm": 4.021822874733516e-08, + "learning_rate": 9.2226902956904e-06, + "loss": 0.0004, + "step": 17400 + }, + { + "epoch": 0.0777756434024722, + "grad_norm": 1.5872502956426615e-07, + "learning_rate": 9.222243565975278e-06, + "loss": 0.0003, + "step": 17410 + }, + { + "epoch": 0.07782031637398425, + "grad_norm": 1.657014456202255e-11, + "learning_rate": 9.221796836260158e-06, + "loss": 0.0, + "step": 17420 + }, + { + "epoch": 0.0778649893454963, + "grad_norm": 7.961917702914434e-08, + "learning_rate": 9.221350106545036e-06, + "loss": 0.0016, + "step": 17430 + }, + { + "epoch": 0.07790966231700834, + "grad_norm": 2.4046839172675583e-11, + "learning_rate": 9.220903376829917e-06, + "loss": 0.0006, + "step": 17440 + }, + { + "epoch": 0.07795433528852039, + "grad_norm": 1.4532316461313144e-09, + "learning_rate": 9.220456647114797e-06, + "loss": 0.0, + "step": 17450 + }, + { + "epoch": 0.07799900826003243, + "grad_norm": 3.730805886448252e-09, + "learning_rate": 9.220009917399675e-06, + "loss": 0.0, + "step": 17460 + }, + { + "epoch": 0.07804368123154448, + "grad_norm": 3.827557235459682e-14, + "learning_rate": 9.219563187684555e-06, + "loss": 0.0003, + "step": 17470 + }, + { + "epoch": 0.07808835420305653, + "grad_norm": 2.06796145754992e-18, + "learning_rate": 9.219116457969435e-06, + "loss": 0.0002, + "step": 17480 + }, + { + "epoch": 0.07813302717456858, + "grad_norm": 9.762475627894673e-08, + "learning_rate": 9.218669728254316e-06, + "loss": 0.0001, + "step": 17490 + }, + { + "epoch": 0.07817770014608062, + "grad_norm": 3.0506463342603023e-15, + "learning_rate": 9.218222998539194e-06, + "loss": 0.1359, + "step": 17500 + }, + { + "epoch": 0.07822237311759267, + "grad_norm": 5.758641119513186e-09, + "learning_rate": 9.217776268824074e-06, + "loss": 0.0, + "step": 17510 + }, + { + "epoch": 0.0782670460891047, + "grad_norm": 3.089372979658833e-09, + "learning_rate": 9.217329539108954e-06, + "loss": 0.0, + "step": 17520 + }, + { + "epoch": 0.07831171906061675, + "grad_norm": 5.25945498090763e-13, + "learning_rate": 9.216882809393833e-06, + "loss": 0.001, + "step": 17530 + }, + { + "epoch": 0.0783563920321288, + "grad_norm": 0.00019875851285178214, + "learning_rate": 9.216436079678713e-06, + "loss": 0.0, + "step": 17540 + }, + { + "epoch": 0.07840106500364084, + "grad_norm": 3.922442601833609e-07, + "learning_rate": 9.215989349963593e-06, + "loss": 0.0, + "step": 17550 + }, + { + "epoch": 0.07844573797515289, + "grad_norm": 2.3536176479987958e-11, + "learning_rate": 9.215542620248471e-06, + "loss": 0.0021, + "step": 17560 + }, + { + "epoch": 0.07849041094666494, + "grad_norm": 2.7818711600957613e-07, + "learning_rate": 9.215095890533351e-06, + "loss": 0.0252, + "step": 17570 + }, + { + "epoch": 0.07853508391817698, + "grad_norm": 1.2804473925598359e-08, + "learning_rate": 9.214649160818232e-06, + "loss": 0.0012, + "step": 17580 + }, + { + "epoch": 0.07857975688968903, + "grad_norm": 4.7054363015535046e-20, + "learning_rate": 9.214202431103112e-06, + "loss": 0.0, + "step": 17590 + }, + { + "epoch": 0.07862442986120108, + "grad_norm": 1.086241665192167e-12, + "learning_rate": 9.21375570138799e-06, + "loss": 0.0, + "step": 17600 + }, + { + "epoch": 0.07866910283271313, + "grad_norm": 9.0274799519783e-17, + "learning_rate": 9.21330897167287e-06, + "loss": 0.1071, + "step": 17610 + }, + { + "epoch": 0.07871377580422517, + "grad_norm": 1.529821425094724e-08, + "learning_rate": 9.21286224195775e-06, + "loss": 0.0, + "step": 17620 + }, + { + "epoch": 0.07875844877573722, + "grad_norm": 9.494939234855337e-08, + "learning_rate": 9.212415512242629e-06, + "loss": 0.0, + "step": 17630 + }, + { + "epoch": 0.07880312174724927, + "grad_norm": 0.008173040114343166, + "learning_rate": 9.211968782527509e-06, + "loss": 0.0, + "step": 17640 + }, + { + "epoch": 0.07884779471876131, + "grad_norm": 9.335294041116228e-18, + "learning_rate": 9.211522052812387e-06, + "loss": 0.0, + "step": 17650 + }, + { + "epoch": 0.07889246769027336, + "grad_norm": 0.002261559944599867, + "learning_rate": 9.211075323097267e-06, + "loss": 0.0, + "step": 17660 + }, + { + "epoch": 0.0789371406617854, + "grad_norm": 6.691013550153002e-05, + "learning_rate": 9.210628593382148e-06, + "loss": 0.0, + "step": 17670 + }, + { + "epoch": 0.07898181363329744, + "grad_norm": 6.267378616087171e-09, + "learning_rate": 9.210181863667026e-06, + "loss": 0.0, + "step": 17680 + }, + { + "epoch": 0.07902648660480949, + "grad_norm": 0.007965099066495895, + "learning_rate": 9.209735133951906e-06, + "loss": 0.0015, + "step": 17690 + }, + { + "epoch": 0.07907115957632153, + "grad_norm": 5.188430861704542e-15, + "learning_rate": 9.209288404236786e-06, + "loss": 0.0, + "step": 17700 + }, + { + "epoch": 0.07911583254783358, + "grad_norm": 0.05706815794110298, + "learning_rate": 9.208841674521665e-06, + "loss": 0.0, + "step": 17710 + }, + { + "epoch": 0.07916050551934563, + "grad_norm": 1.819989471629757e-11, + "learning_rate": 9.208394944806545e-06, + "loss": 0.0, + "step": 17720 + }, + { + "epoch": 0.07920517849085768, + "grad_norm": 7.620450494585929e-13, + "learning_rate": 9.207948215091423e-06, + "loss": 0.333, + "step": 17730 + }, + { + "epoch": 0.07924985146236972, + "grad_norm": 6.586374183825683e-06, + "learning_rate": 9.207501485376303e-06, + "loss": 0.0, + "step": 17740 + }, + { + "epoch": 0.07929452443388177, + "grad_norm": 1.1758358731128737e-08, + "learning_rate": 9.207054755661184e-06, + "loss": 0.1407, + "step": 17750 + }, + { + "epoch": 0.07933919740539382, + "grad_norm": 3.1918389797210693, + "learning_rate": 9.206608025946062e-06, + "loss": 0.2583, + "step": 17760 + }, + { + "epoch": 0.07938387037690586, + "grad_norm": 9.242794897501572e-12, + "learning_rate": 9.206161296230942e-06, + "loss": 0.0001, + "step": 17770 + }, + { + "epoch": 0.07942854334841791, + "grad_norm": 1.7713607002641657e-13, + "learning_rate": 9.205714566515822e-06, + "loss": 0.0004, + "step": 17780 + }, + { + "epoch": 0.07947321631992996, + "grad_norm": 5.914055023481524e-09, + "learning_rate": 9.2052678368007e-06, + "loss": 0.0, + "step": 17790 + }, + { + "epoch": 0.079517889291442, + "grad_norm": 6.293767048325127e-13, + "learning_rate": 9.20482110708558e-06, + "loss": 0.0, + "step": 17800 + }, + { + "epoch": 0.07956256226295405, + "grad_norm": 2.157514131795324e-07, + "learning_rate": 9.204374377370461e-06, + "loss": 0.0, + "step": 17810 + }, + { + "epoch": 0.0796072352344661, + "grad_norm": 7.039646778917233e-14, + "learning_rate": 9.20392764765534e-06, + "loss": 0.2479, + "step": 17820 + }, + { + "epoch": 0.07965190820597813, + "grad_norm": 6.4774088859558105, + "learning_rate": 9.20348091794022e-06, + "loss": 0.0013, + "step": 17830 + }, + { + "epoch": 0.07969658117749018, + "grad_norm": 5.9319362755161364e-09, + "learning_rate": 9.203034188225098e-06, + "loss": 0.0, + "step": 17840 + }, + { + "epoch": 0.07974125414900222, + "grad_norm": 0.004283531103283167, + "learning_rate": 9.202587458509978e-06, + "loss": 0.0062, + "step": 17850 + }, + { + "epoch": 0.07978592712051427, + "grad_norm": 2.560173833483527e-18, + "learning_rate": 9.202140728794858e-06, + "loss": 0.0001, + "step": 17860 + }, + { + "epoch": 0.07983060009202632, + "grad_norm": 4.4415496280869604e-10, + "learning_rate": 9.201693999079737e-06, + "loss": 0.0062, + "step": 17870 + }, + { + "epoch": 0.07987527306353837, + "grad_norm": 0.0, + "learning_rate": 9.201247269364617e-06, + "loss": 0.0, + "step": 17880 + }, + { + "epoch": 0.07991994603505041, + "grad_norm": 0.00010508752893656492, + "learning_rate": 9.200800539649497e-06, + "loss": 0.0012, + "step": 17890 + }, + { + "epoch": 0.07996461900656246, + "grad_norm": 0.000583523535169661, + "learning_rate": 9.200353809934375e-06, + "loss": 0.0006, + "step": 17900 + }, + { + "epoch": 0.0800092919780745, + "grad_norm": 1.874498778420275e-08, + "learning_rate": 9.199907080219255e-06, + "loss": 0.4538, + "step": 17910 + }, + { + "epoch": 0.08005396494958655, + "grad_norm": 5.004399099561851e-07, + "learning_rate": 9.199460350504136e-06, + "loss": 0.0001, + "step": 17920 + }, + { + "epoch": 0.0800986379210986, + "grad_norm": 37.80466842651367, + "learning_rate": 9.199013620789014e-06, + "loss": 0.4853, + "step": 17930 + }, + { + "epoch": 0.08014331089261065, + "grad_norm": 2.3460300369038123e-10, + "learning_rate": 9.198566891073894e-06, + "loss": 0.0002, + "step": 17940 + }, + { + "epoch": 0.0801879838641227, + "grad_norm": 1.444460939345851e-18, + "learning_rate": 9.198120161358774e-06, + "loss": 0.0001, + "step": 17950 + }, + { + "epoch": 0.08023265683563474, + "grad_norm": 4.5384392738342285, + "learning_rate": 9.197673431643653e-06, + "loss": 0.0008, + "step": 17960 + }, + { + "epoch": 0.08027732980714679, + "grad_norm": 1.2695361206738198e-08, + "learning_rate": 9.197226701928533e-06, + "loss": 0.0014, + "step": 17970 + }, + { + "epoch": 0.08032200277865882, + "grad_norm": 0.0022479791659861803, + "learning_rate": 9.196779972213413e-06, + "loss": 0.0, + "step": 17980 + }, + { + "epoch": 0.08036667575017087, + "grad_norm": 0.6332511901855469, + "learning_rate": 9.196333242498291e-06, + "loss": 0.0388, + "step": 17990 + }, + { + "epoch": 0.08041134872168292, + "grad_norm": 0.0012692047748714685, + "learning_rate": 9.195886512783171e-06, + "loss": 0.0038, + "step": 18000 + } + ], + "logging_steps": 10, + "max_steps": 223849, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}