diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,33284 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.684270423671435, + "eval_steps": 500, + "global_step": 47500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00014405693129924946, + "grad_norm": 27.26666831970215, + "learning_rate": 8.641382621219396e-08, + "loss": 4.9, + "step": 10 + }, + { + "epoch": 0.0002881138625984989, + "grad_norm": 27.764198303222656, + "learning_rate": 1.8242918867018726e-07, + "loss": 4.9161, + "step": 20 + }, + { + "epoch": 0.0004321707938977484, + "grad_norm": 28.83149528503418, + "learning_rate": 2.7844455112818053e-07, + "loss": 4.9883, + "step": 30 + }, + { + "epoch": 0.0005762277251969978, + "grad_norm": 27.213775634765625, + "learning_rate": 3.744599135861738e-07, + "loss": 4.9025, + "step": 40 + }, + { + "epoch": 0.0007202846564962473, + "grad_norm": 24.665821075439453, + "learning_rate": 4.7047527604416713e-07, + "loss": 4.8487, + "step": 50 + }, + { + "epoch": 0.0008643415877954968, + "grad_norm": 21.37822914123535, + "learning_rate": 5.664906385021604e-07, + "loss": 4.728, + "step": 60 + }, + { + "epoch": 0.0010083985190947463, + "grad_norm": 16.232444763183594, + "learning_rate": 6.625060009601537e-07, + "loss": 4.5959, + "step": 70 + }, + { + "epoch": 0.0011524554503939957, + "grad_norm": 14.029967308044434, + "learning_rate": 7.585213634181469e-07, + "loss": 4.4123, + "step": 80 + }, + { + "epoch": 0.0012965123816932453, + "grad_norm": 10.092180252075195, + "learning_rate": 8.545367258761402e-07, + "loss": 4.2001, + "step": 90 + }, + { + "epoch": 0.0014405693129924946, + "grad_norm": 7.05848503112793, + "learning_rate": 9.505520883341334e-07, + "loss": 4.0065, + "step": 100 + }, + { + "epoch": 0.0015846262442917442, + "grad_norm": 5.146378517150879, + "learning_rate": 1.0465674507921267e-06, + "loss": 3.7921, + "step": 110 + }, + { + "epoch": 0.0017286831755909935, + "grad_norm": 4.485132217407227, + "learning_rate": 1.1425828132501202e-06, + "loss": 3.637, + "step": 120 + }, + { + "epoch": 0.001872740106890243, + "grad_norm": 3.7661523818969727, + "learning_rate": 1.2385981757081133e-06, + "loss": 3.5115, + "step": 130 + }, + { + "epoch": 0.0020167970381894927, + "grad_norm": 3.7206203937530518, + "learning_rate": 1.3346135381661066e-06, + "loss": 3.3966, + "step": 140 + }, + { + "epoch": 0.002160853969488742, + "grad_norm": 3.2748324871063232, + "learning_rate": 1.4306289006241e-06, + "loss": 3.1868, + "step": 150 + }, + { + "epoch": 0.0023049109007879914, + "grad_norm": 2.8945841789245605, + "learning_rate": 1.5266442630820932e-06, + "loss": 3.0535, + "step": 160 + }, + { + "epoch": 0.002448967832087241, + "grad_norm": 2.8431167602539062, + "learning_rate": 1.6226596255400865e-06, + "loss": 2.8823, + "step": 170 + }, + { + "epoch": 0.0025930247633864905, + "grad_norm": 2.856003522872925, + "learning_rate": 1.7186749879980798e-06, + "loss": 2.7049, + "step": 180 + }, + { + "epoch": 0.0027370816946857396, + "grad_norm": 2.8247039318084717, + "learning_rate": 1.8146903504560731e-06, + "loss": 2.6149, + "step": 190 + }, + { + "epoch": 0.002881138625984989, + "grad_norm": 2.9072368144989014, + "learning_rate": 1.9107057129140664e-06, + "loss": 2.4336, + "step": 200 + }, + { + "epoch": 0.0030251955572842388, + "grad_norm": 2.8501107692718506, + "learning_rate": 2.0067210753720597e-06, + "loss": 2.3022, + "step": 210 + }, + { + "epoch": 0.0031692524885834883, + "grad_norm": 3.5651540756225586, + "learning_rate": 2.102736437830053e-06, + "loss": 2.1635, + "step": 220 + }, + { + "epoch": 0.0033133094198827375, + "grad_norm": 3.2838971614837646, + "learning_rate": 2.1987518002880463e-06, + "loss": 2.0627, + "step": 230 + }, + { + "epoch": 0.003457366351181987, + "grad_norm": 4.433910369873047, + "learning_rate": 2.2947671627460396e-06, + "loss": 1.9501, + "step": 240 + }, + { + "epoch": 0.0036014232824812366, + "grad_norm": 3.1078832149505615, + "learning_rate": 2.390782525204033e-06, + "loss": 1.8411, + "step": 250 + }, + { + "epoch": 0.003745480213780486, + "grad_norm": 3.1979434490203857, + "learning_rate": 2.4867978876620263e-06, + "loss": 1.7389, + "step": 260 + }, + { + "epoch": 0.0038895371450797353, + "grad_norm": 3.8683359622955322, + "learning_rate": 2.582813250120019e-06, + "loss": 1.6616, + "step": 270 + }, + { + "epoch": 0.004033594076378985, + "grad_norm": 3.9330086708068848, + "learning_rate": 2.678828612578013e-06, + "loss": 1.6143, + "step": 280 + }, + { + "epoch": 0.004177651007678234, + "grad_norm": 5.806529521942139, + "learning_rate": 2.774843975036006e-06, + "loss": 1.5516, + "step": 290 + }, + { + "epoch": 0.004321707938977484, + "grad_norm": 3.495863676071167, + "learning_rate": 2.870859337493999e-06, + "loss": 1.5236, + "step": 300 + }, + { + "epoch": 0.004465764870276733, + "grad_norm": 2.784912347793579, + "learning_rate": 2.9668746999519928e-06, + "loss": 1.4801, + "step": 310 + }, + { + "epoch": 0.004609821801575983, + "grad_norm": 3.510164737701416, + "learning_rate": 3.062890062409986e-06, + "loss": 1.4932, + "step": 320 + }, + { + "epoch": 0.004753878732875232, + "grad_norm": 3.2866201400756836, + "learning_rate": 3.158905424867979e-06, + "loss": 1.4565, + "step": 330 + }, + { + "epoch": 0.004897935664174482, + "grad_norm": 4.239346981048584, + "learning_rate": 3.2549207873259727e-06, + "loss": 1.4323, + "step": 340 + }, + { + "epoch": 0.0050419925954737314, + "grad_norm": 3.097536563873291, + "learning_rate": 3.350936149783966e-06, + "loss": 1.4196, + "step": 350 + }, + { + "epoch": 0.005186049526772981, + "grad_norm": 4.322780132293701, + "learning_rate": 3.446951512241959e-06, + "loss": 1.3871, + "step": 360 + }, + { + "epoch": 0.005330106458072231, + "grad_norm": 3.513322591781616, + "learning_rate": 3.5429668746999526e-06, + "loss": 1.3872, + "step": 370 + }, + { + "epoch": 0.005474163389371479, + "grad_norm": 3.922034740447998, + "learning_rate": 3.6389822371579454e-06, + "loss": 1.3878, + "step": 380 + }, + { + "epoch": 0.005618220320670729, + "grad_norm": 2.89475154876709, + "learning_rate": 3.7349975996159387e-06, + "loss": 1.377, + "step": 390 + }, + { + "epoch": 0.005762277251969978, + "grad_norm": 2.8162641525268555, + "learning_rate": 3.831012962073932e-06, + "loss": 1.3718, + "step": 400 + }, + { + "epoch": 0.005906334183269228, + "grad_norm": 3.8730788230895996, + "learning_rate": 3.927028324531925e-06, + "loss": 1.3513, + "step": 410 + }, + { + "epoch": 0.0060503911145684776, + "grad_norm": 2.5291128158569336, + "learning_rate": 4.023043686989919e-06, + "loss": 1.3291, + "step": 420 + }, + { + "epoch": 0.006194448045867727, + "grad_norm": 2.768787384033203, + "learning_rate": 4.119059049447912e-06, + "loss": 1.3187, + "step": 430 + }, + { + "epoch": 0.006338504977166977, + "grad_norm": 3.5093281269073486, + "learning_rate": 4.215074411905905e-06, + "loss": 1.313, + "step": 440 + }, + { + "epoch": 0.006482561908466226, + "grad_norm": 3.6907739639282227, + "learning_rate": 4.3110897743638986e-06, + "loss": 1.2778, + "step": 450 + }, + { + "epoch": 0.006626618839765475, + "grad_norm": 3.0370869636535645, + "learning_rate": 4.407105136821892e-06, + "loss": 1.3173, + "step": 460 + }, + { + "epoch": 0.0067706757710647245, + "grad_norm": 3.6910996437072754, + "learning_rate": 4.503120499279885e-06, + "loss": 1.3183, + "step": 470 + }, + { + "epoch": 0.006914732702363974, + "grad_norm": 3.107311487197876, + "learning_rate": 4.5991358617378785e-06, + "loss": 1.2866, + "step": 480 + }, + { + "epoch": 0.007058789633663224, + "grad_norm": 2.9197614192962646, + "learning_rate": 4.695151224195872e-06, + "loss": 1.2928, + "step": 490 + }, + { + "epoch": 0.007202846564962473, + "grad_norm": 3.9091696739196777, + "learning_rate": 4.791166586653865e-06, + "loss": 1.2397, + "step": 500 + }, + { + "epoch": 0.007346903496261723, + "grad_norm": 3.2482357025146484, + "learning_rate": 4.887181949111858e-06, + "loss": 1.2641, + "step": 510 + }, + { + "epoch": 0.007490960427560972, + "grad_norm": 3.3543107509613037, + "learning_rate": 4.983197311569852e-06, + "loss": 1.2643, + "step": 520 + }, + { + "epoch": 0.007635017358860222, + "grad_norm": 2.6477134227752686, + "learning_rate": 5.079212674027845e-06, + "loss": 1.2904, + "step": 530 + }, + { + "epoch": 0.007779074290159471, + "grad_norm": 2.9984657764434814, + "learning_rate": 5.175228036485837e-06, + "loss": 1.2355, + "step": 540 + }, + { + "epoch": 0.007923131221458721, + "grad_norm": 2.8463642597198486, + "learning_rate": 5.2712433989438316e-06, + "loss": 1.2643, + "step": 550 + }, + { + "epoch": 0.00806718815275797, + "grad_norm": 3.5982484817504883, + "learning_rate": 5.367258761401825e-06, + "loss": 1.2626, + "step": 560 + }, + { + "epoch": 0.00821124508405722, + "grad_norm": 2.690351724624634, + "learning_rate": 5.463274123859817e-06, + "loss": 1.2533, + "step": 570 + }, + { + "epoch": 0.008355302015356468, + "grad_norm": 2.9995064735412598, + "learning_rate": 5.5592894863178115e-06, + "loss": 1.2195, + "step": 580 + }, + { + "epoch": 0.008499358946655718, + "grad_norm": 3.226581335067749, + "learning_rate": 5.655304848775805e-06, + "loss": 1.2339, + "step": 590 + }, + { + "epoch": 0.008643415877954967, + "grad_norm": 4.1260833740234375, + "learning_rate": 5.751320211233797e-06, + "loss": 1.2249, + "step": 600 + }, + { + "epoch": 0.008787472809254217, + "grad_norm": 2.9851315021514893, + "learning_rate": 5.847335573691791e-06, + "loss": 1.2367, + "step": 610 + }, + { + "epoch": 0.008931529740553466, + "grad_norm": 3.313911199569702, + "learning_rate": 5.943350936149785e-06, + "loss": 1.2336, + "step": 620 + }, + { + "epoch": 0.009075586671852716, + "grad_norm": 3.2980895042419434, + "learning_rate": 6.039366298607777e-06, + "loss": 1.2055, + "step": 630 + }, + { + "epoch": 0.009219643603151965, + "grad_norm": 2.9993834495544434, + "learning_rate": 6.135381661065771e-06, + "loss": 1.193, + "step": 640 + }, + { + "epoch": 0.009363700534451215, + "grad_norm": 3.1403629779815674, + "learning_rate": 6.231397023523765e-06, + "loss": 1.223, + "step": 650 + }, + { + "epoch": 0.009507757465750465, + "grad_norm": 2.9803543090820312, + "learning_rate": 6.327412385981757e-06, + "loss": 1.1861, + "step": 660 + }, + { + "epoch": 0.009651814397049714, + "grad_norm": 3.8311474323272705, + "learning_rate": 6.423427748439751e-06, + "loss": 1.1753, + "step": 670 + }, + { + "epoch": 0.009795871328348964, + "grad_norm": 3.520228147506714, + "learning_rate": 6.5194431108977445e-06, + "loss": 1.1899, + "step": 680 + }, + { + "epoch": 0.009939928259648213, + "grad_norm": 3.1205902099609375, + "learning_rate": 6.615458473355737e-06, + "loss": 1.1884, + "step": 690 + }, + { + "epoch": 0.010083985190947463, + "grad_norm": 3.5261614322662354, + "learning_rate": 6.711473835813731e-06, + "loss": 1.1966, + "step": 700 + }, + { + "epoch": 0.010228042122246712, + "grad_norm": 3.649890184402466, + "learning_rate": 6.8074891982717235e-06, + "loss": 1.1988, + "step": 710 + }, + { + "epoch": 0.010372099053545962, + "grad_norm": 2.4982945919036865, + "learning_rate": 6.903504560729717e-06, + "loss": 1.2032, + "step": 720 + }, + { + "epoch": 0.010516155984845212, + "grad_norm": 3.6677379608154297, + "learning_rate": 6.999519923187711e-06, + "loss": 1.2, + "step": 730 + }, + { + "epoch": 0.010660212916144461, + "grad_norm": 2.5797760486602783, + "learning_rate": 7.0955352856457035e-06, + "loss": 1.1846, + "step": 740 + }, + { + "epoch": 0.010804269847443709, + "grad_norm": 2.922659158706665, + "learning_rate": 7.191550648103697e-06, + "loss": 1.1787, + "step": 750 + }, + { + "epoch": 0.010948326778742959, + "grad_norm": 2.6787455081939697, + "learning_rate": 7.287566010561691e-06, + "loss": 1.167, + "step": 760 + }, + { + "epoch": 0.011092383710042208, + "grad_norm": 3.1619741916656494, + "learning_rate": 7.383581373019683e-06, + "loss": 1.1754, + "step": 770 + }, + { + "epoch": 0.011236440641341458, + "grad_norm": 3.1390509605407715, + "learning_rate": 7.479596735477677e-06, + "loss": 1.1876, + "step": 780 + }, + { + "epoch": 0.011380497572640707, + "grad_norm": 3.50462007522583, + "learning_rate": 7.575612097935671e-06, + "loss": 1.1585, + "step": 790 + }, + { + "epoch": 0.011524554503939957, + "grad_norm": 2.9821434020996094, + "learning_rate": 7.671627460393664e-06, + "loss": 1.1848, + "step": 800 + }, + { + "epoch": 0.011668611435239206, + "grad_norm": 3.2226595878601074, + "learning_rate": 7.767642822851656e-06, + "loss": 1.152, + "step": 810 + }, + { + "epoch": 0.011812668366538456, + "grad_norm": 2.48160719871521, + "learning_rate": 7.86365818530965e-06, + "loss": 1.1387, + "step": 820 + }, + { + "epoch": 0.011956725297837706, + "grad_norm": 2.464972496032715, + "learning_rate": 7.959673547767644e-06, + "loss": 1.1585, + "step": 830 + }, + { + "epoch": 0.012100782229136955, + "grad_norm": 2.104072093963623, + "learning_rate": 8.055688910225636e-06, + "loss": 1.1385, + "step": 840 + }, + { + "epoch": 0.012244839160436205, + "grad_norm": 2.9877710342407227, + "learning_rate": 8.15170427268363e-06, + "loss": 1.1372, + "step": 850 + }, + { + "epoch": 0.012388896091735454, + "grad_norm": 2.8173460960388184, + "learning_rate": 8.247719635141624e-06, + "loss": 1.152, + "step": 860 + }, + { + "epoch": 0.012532953023034704, + "grad_norm": 2.934769630432129, + "learning_rate": 8.343734997599616e-06, + "loss": 1.1609, + "step": 870 + }, + { + "epoch": 0.012677009954333953, + "grad_norm": 2.321155309677124, + "learning_rate": 8.43975036005761e-06, + "loss": 1.1276, + "step": 880 + }, + { + "epoch": 0.012821066885633203, + "grad_norm": 3.0589613914489746, + "learning_rate": 8.535765722515604e-06, + "loss": 1.1481, + "step": 890 + }, + { + "epoch": 0.012965123816932453, + "grad_norm": 3.533726215362549, + "learning_rate": 8.631781084973595e-06, + "loss": 1.1312, + "step": 900 + }, + { + "epoch": 0.0131091807482317, + "grad_norm": 2.3042759895324707, + "learning_rate": 8.72779644743159e-06, + "loss": 1.1595, + "step": 910 + }, + { + "epoch": 0.01325323767953095, + "grad_norm": 2.614858388900757, + "learning_rate": 8.823811809889584e-06, + "loss": 1.1179, + "step": 920 + }, + { + "epoch": 0.0133972946108302, + "grad_norm": 2.8544540405273438, + "learning_rate": 8.919827172347575e-06, + "loss": 1.1117, + "step": 930 + }, + { + "epoch": 0.013541351542129449, + "grad_norm": 2.6770823001861572, + "learning_rate": 9.01584253480557e-06, + "loss": 1.1287, + "step": 940 + }, + { + "epoch": 0.013685408473428699, + "grad_norm": 1.9468642473220825, + "learning_rate": 9.111857897263564e-06, + "loss": 1.1345, + "step": 950 + }, + { + "epoch": 0.013829465404727948, + "grad_norm": 2.8672447204589844, + "learning_rate": 9.207873259721555e-06, + "loss": 1.1124, + "step": 960 + }, + { + "epoch": 0.013973522336027198, + "grad_norm": 2.024662733078003, + "learning_rate": 9.30388862217955e-06, + "loss": 1.1246, + "step": 970 + }, + { + "epoch": 0.014117579267326447, + "grad_norm": 2.2646453380584717, + "learning_rate": 9.399903984637544e-06, + "loss": 1.1344, + "step": 980 + }, + { + "epoch": 0.014261636198625697, + "grad_norm": 3.0018582344055176, + "learning_rate": 9.495919347095535e-06, + "loss": 1.1436, + "step": 990 + }, + { + "epoch": 0.014405693129924946, + "grad_norm": 2.3975377082824707, + "learning_rate": 9.59193470955353e-06, + "loss": 1.1309, + "step": 1000 + }, + { + "epoch": 0.014549750061224196, + "grad_norm": 2.0984370708465576, + "learning_rate": 9.687950072011523e-06, + "loss": 1.1325, + "step": 1010 + }, + { + "epoch": 0.014693806992523446, + "grad_norm": 2.1143908500671387, + "learning_rate": 9.783965434469515e-06, + "loss": 1.1405, + "step": 1020 + }, + { + "epoch": 0.014837863923822695, + "grad_norm": 2.5381438732147217, + "learning_rate": 9.87998079692751e-06, + "loss": 1.1473, + "step": 1030 + }, + { + "epoch": 0.014981920855121945, + "grad_norm": 2.9158403873443604, + "learning_rate": 9.975996159385503e-06, + "loss": 1.1215, + "step": 1040 + }, + { + "epoch": 0.015125977786421194, + "grad_norm": 3.113672971725464, + "learning_rate": 1.0072011521843497e-05, + "loss": 1.1221, + "step": 1050 + }, + { + "epoch": 0.015270034717720444, + "grad_norm": 2.7869443893432617, + "learning_rate": 1.016802688430149e-05, + "loss": 1.1025, + "step": 1060 + }, + { + "epoch": 0.015414091649019693, + "grad_norm": 2.0424559116363525, + "learning_rate": 1.0264042246759483e-05, + "loss": 1.1074, + "step": 1070 + }, + { + "epoch": 0.015558148580318941, + "grad_norm": 2.2862319946289062, + "learning_rate": 1.0360057609217475e-05, + "loss": 1.1281, + "step": 1080 + }, + { + "epoch": 0.015702205511618193, + "grad_norm": 2.280027389526367, + "learning_rate": 1.0456072971675468e-05, + "loss": 1.1067, + "step": 1090 + }, + { + "epoch": 0.015846262442917442, + "grad_norm": 2.6573803424835205, + "learning_rate": 1.0552088334133461e-05, + "loss": 1.1035, + "step": 1100 + }, + { + "epoch": 0.01599031937421669, + "grad_norm": 1.6579245328903198, + "learning_rate": 1.0648103696591456e-05, + "loss": 1.1411, + "step": 1110 + }, + { + "epoch": 0.01613437630551594, + "grad_norm": 2.261164426803589, + "learning_rate": 1.074411905904945e-05, + "loss": 1.1015, + "step": 1120 + }, + { + "epoch": 0.01627843323681519, + "grad_norm": 2.3432836532592773, + "learning_rate": 1.0840134421507443e-05, + "loss": 1.1097, + "step": 1130 + }, + { + "epoch": 0.01642249016811444, + "grad_norm": 2.1592226028442383, + "learning_rate": 1.0936149783965435e-05, + "loss": 1.1143, + "step": 1140 + }, + { + "epoch": 0.01656654709941369, + "grad_norm": 2.1739590167999268, + "learning_rate": 1.1032165146423428e-05, + "loss": 1.1129, + "step": 1150 + }, + { + "epoch": 0.016710604030712936, + "grad_norm": 1.7060655355453491, + "learning_rate": 1.1128180508881421e-05, + "loss": 1.0962, + "step": 1160 + }, + { + "epoch": 0.016854660962012186, + "grad_norm": 3.007624387741089, + "learning_rate": 1.1224195871339416e-05, + "loss": 1.1294, + "step": 1170 + }, + { + "epoch": 0.016998717893311435, + "grad_norm": 2.0622236728668213, + "learning_rate": 1.132021123379741e-05, + "loss": 1.1212, + "step": 1180 + }, + { + "epoch": 0.017142774824610685, + "grad_norm": 2.5541915893554688, + "learning_rate": 1.1416226596255401e-05, + "loss": 1.1295, + "step": 1190 + }, + { + "epoch": 0.017286831755909934, + "grad_norm": 2.384983539581299, + "learning_rate": 1.1512241958713394e-05, + "loss": 1.1233, + "step": 1200 + }, + { + "epoch": 0.017430888687209184, + "grad_norm": 2.4423985481262207, + "learning_rate": 1.1608257321171388e-05, + "loss": 1.1184, + "step": 1210 + }, + { + "epoch": 0.017574945618508434, + "grad_norm": 2.1598997116088867, + "learning_rate": 1.1704272683629381e-05, + "loss": 1.1226, + "step": 1220 + }, + { + "epoch": 0.017719002549807683, + "grad_norm": 1.8650798797607422, + "learning_rate": 1.1800288046087376e-05, + "loss": 1.1236, + "step": 1230 + }, + { + "epoch": 0.017863059481106933, + "grad_norm": 2.189276695251465, + "learning_rate": 1.189630340854537e-05, + "loss": 1.1205, + "step": 1240 + }, + { + "epoch": 0.018007116412406182, + "grad_norm": 2.8257031440734863, + "learning_rate": 1.1992318771003361e-05, + "loss": 1.1289, + "step": 1250 + }, + { + "epoch": 0.018151173343705432, + "grad_norm": 2.3813562393188477, + "learning_rate": 1.2088334133461354e-05, + "loss": 1.1262, + "step": 1260 + }, + { + "epoch": 0.01829523027500468, + "grad_norm": 2.005934238433838, + "learning_rate": 1.2184349495919348e-05, + "loss": 1.1085, + "step": 1270 + }, + { + "epoch": 0.01843928720630393, + "grad_norm": 2.3918726444244385, + "learning_rate": 1.2280364858377341e-05, + "loss": 1.1314, + "step": 1280 + }, + { + "epoch": 0.01858334413760318, + "grad_norm": 1.9000368118286133, + "learning_rate": 1.2376380220835336e-05, + "loss": 1.1034, + "step": 1290 + }, + { + "epoch": 0.01872740106890243, + "grad_norm": 2.176527976989746, + "learning_rate": 1.247239558329333e-05, + "loss": 1.0931, + "step": 1300 + }, + { + "epoch": 0.01887145800020168, + "grad_norm": 2.615513563156128, + "learning_rate": 1.256841094575132e-05, + "loss": 1.1062, + "step": 1310 + }, + { + "epoch": 0.01901551493150093, + "grad_norm": 1.7527588605880737, + "learning_rate": 1.2664426308209314e-05, + "loss": 1.1152, + "step": 1320 + }, + { + "epoch": 0.01915957186280018, + "grad_norm": 2.300352096557617, + "learning_rate": 1.2760441670667307e-05, + "loss": 1.1291, + "step": 1330 + }, + { + "epoch": 0.01930362879409943, + "grad_norm": 2.247837543487549, + "learning_rate": 1.28564570331253e-05, + "loss": 1.1031, + "step": 1340 + }, + { + "epoch": 0.019447685725398678, + "grad_norm": 1.7947758436203003, + "learning_rate": 1.2952472395583296e-05, + "loss": 1.1037, + "step": 1350 + }, + { + "epoch": 0.019591742656697927, + "grad_norm": 1.8685263395309448, + "learning_rate": 1.3048487758041289e-05, + "loss": 1.1063, + "step": 1360 + }, + { + "epoch": 0.019735799587997177, + "grad_norm": 2.6336042881011963, + "learning_rate": 1.314450312049928e-05, + "loss": 1.1082, + "step": 1370 + }, + { + "epoch": 0.019879856519296427, + "grad_norm": 2.208981990814209, + "learning_rate": 1.3240518482957274e-05, + "loss": 1.1055, + "step": 1380 + }, + { + "epoch": 0.020023913450595676, + "grad_norm": 1.8364813327789307, + "learning_rate": 1.3336533845415267e-05, + "loss": 1.117, + "step": 1390 + }, + { + "epoch": 0.020167970381894926, + "grad_norm": 2.072045087814331, + "learning_rate": 1.343254920787326e-05, + "loss": 1.1358, + "step": 1400 + }, + { + "epoch": 0.020312027313194175, + "grad_norm": 1.5558465719223022, + "learning_rate": 1.3528564570331255e-05, + "loss": 1.1104, + "step": 1410 + }, + { + "epoch": 0.020456084244493425, + "grad_norm": 2.4063785076141357, + "learning_rate": 1.3624579932789249e-05, + "loss": 1.0778, + "step": 1420 + }, + { + "epoch": 0.020600141175792674, + "grad_norm": 1.9792841672897339, + "learning_rate": 1.372059529524724e-05, + "loss": 1.1243, + "step": 1430 + }, + { + "epoch": 0.020744198107091924, + "grad_norm": 1.7703602313995361, + "learning_rate": 1.3816610657705234e-05, + "loss": 1.1126, + "step": 1440 + }, + { + "epoch": 0.020888255038391174, + "grad_norm": 1.716643214225769, + "learning_rate": 1.3912626020163227e-05, + "loss": 1.0767, + "step": 1450 + }, + { + "epoch": 0.021032311969690423, + "grad_norm": 2.2212705612182617, + "learning_rate": 1.400864138262122e-05, + "loss": 1.1305, + "step": 1460 + }, + { + "epoch": 0.021176368900989673, + "grad_norm": 1.8742339611053467, + "learning_rate": 1.4104656745079215e-05, + "loss": 1.1, + "step": 1470 + }, + { + "epoch": 0.021320425832288922, + "grad_norm": 1.9609624147415161, + "learning_rate": 1.4200672107537209e-05, + "loss": 1.125, + "step": 1480 + }, + { + "epoch": 0.02146448276358817, + "grad_norm": 1.5962414741516113, + "learning_rate": 1.42966874699952e-05, + "loss": 1.1115, + "step": 1490 + }, + { + "epoch": 0.021608539694887418, + "grad_norm": 1.8134044408798218, + "learning_rate": 1.4392702832453194e-05, + "loss": 1.1039, + "step": 1500 + }, + { + "epoch": 0.021752596626186668, + "grad_norm": 2.0705580711364746, + "learning_rate": 1.4488718194911187e-05, + "loss": 1.0849, + "step": 1510 + }, + { + "epoch": 0.021896653557485917, + "grad_norm": 1.7956775426864624, + "learning_rate": 1.458473355736918e-05, + "loss": 1.1061, + "step": 1520 + }, + { + "epoch": 0.022040710488785167, + "grad_norm": 1.871857762336731, + "learning_rate": 1.4680748919827175e-05, + "loss": 1.0928, + "step": 1530 + }, + { + "epoch": 0.022184767420084416, + "grad_norm": 1.7649261951446533, + "learning_rate": 1.4776764282285168e-05, + "loss": 1.0978, + "step": 1540 + }, + { + "epoch": 0.022328824351383666, + "grad_norm": 1.5898739099502563, + "learning_rate": 1.487277964474316e-05, + "loss": 1.1193, + "step": 1550 + }, + { + "epoch": 0.022472881282682915, + "grad_norm": 1.7966949939727783, + "learning_rate": 1.4968795007201153e-05, + "loss": 1.1104, + "step": 1560 + }, + { + "epoch": 0.022616938213982165, + "grad_norm": 1.7936170101165771, + "learning_rate": 1.5064810369659147e-05, + "loss": 1.1049, + "step": 1570 + }, + { + "epoch": 0.022760995145281415, + "grad_norm": 2.4540610313415527, + "learning_rate": 1.516082573211714e-05, + "loss": 1.1066, + "step": 1580 + }, + { + "epoch": 0.022905052076580664, + "grad_norm": 2.32489013671875, + "learning_rate": 1.5256841094575132e-05, + "loss": 1.1095, + "step": 1590 + }, + { + "epoch": 0.023049109007879914, + "grad_norm": 1.5011610984802246, + "learning_rate": 1.5352856457033127e-05, + "loss": 1.12, + "step": 1600 + }, + { + "epoch": 0.023193165939179163, + "grad_norm": 1.724061369895935, + "learning_rate": 1.544887181949112e-05, + "loss": 1.0904, + "step": 1610 + }, + { + "epoch": 0.023337222870478413, + "grad_norm": 2.0318827629089355, + "learning_rate": 1.5544887181949113e-05, + "loss": 1.0939, + "step": 1620 + }, + { + "epoch": 0.023481279801777662, + "grad_norm": 1.449752926826477, + "learning_rate": 1.5640902544407106e-05, + "loss": 1.1056, + "step": 1630 + }, + { + "epoch": 0.023625336733076912, + "grad_norm": 1.4921005964279175, + "learning_rate": 1.57369179068651e-05, + "loss": 1.1037, + "step": 1640 + }, + { + "epoch": 0.02376939366437616, + "grad_norm": 1.6666138172149658, + "learning_rate": 1.5832933269323093e-05, + "loss": 1.1198, + "step": 1650 + }, + { + "epoch": 0.02391345059567541, + "grad_norm": 1.498950481414795, + "learning_rate": 1.5928948631781086e-05, + "loss": 1.1133, + "step": 1660 + }, + { + "epoch": 0.02405750752697466, + "grad_norm": 2.2030534744262695, + "learning_rate": 1.602496399423908e-05, + "loss": 1.1035, + "step": 1670 + }, + { + "epoch": 0.02420156445827391, + "grad_norm": 1.7890610694885254, + "learning_rate": 1.6120979356697073e-05, + "loss": 1.1097, + "step": 1680 + }, + { + "epoch": 0.02434562138957316, + "grad_norm": 1.7875065803527832, + "learning_rate": 1.6216994719155066e-05, + "loss": 1.1045, + "step": 1690 + }, + { + "epoch": 0.02448967832087241, + "grad_norm": 1.5518572330474854, + "learning_rate": 1.631301008161306e-05, + "loss": 1.1103, + "step": 1700 + }, + { + "epoch": 0.02463373525217166, + "grad_norm": 1.769522786140442, + "learning_rate": 1.6409025444071053e-05, + "loss": 1.0833, + "step": 1710 + }, + { + "epoch": 0.02477779218347091, + "grad_norm": 2.2341511249542236, + "learning_rate": 1.6505040806529046e-05, + "loss": 1.0817, + "step": 1720 + }, + { + "epoch": 0.024921849114770158, + "grad_norm": 1.615538239479065, + "learning_rate": 1.660105616898704e-05, + "loss": 1.1174, + "step": 1730 + }, + { + "epoch": 0.025065906046069408, + "grad_norm": 1.597995400428772, + "learning_rate": 1.6697071531445033e-05, + "loss": 1.0972, + "step": 1740 + }, + { + "epoch": 0.025209962977368657, + "grad_norm": 1.4337286949157715, + "learning_rate": 1.6793086893903026e-05, + "loss": 1.1342, + "step": 1750 + }, + { + "epoch": 0.025354019908667907, + "grad_norm": 1.727091908454895, + "learning_rate": 1.688910225636102e-05, + "loss": 1.1004, + "step": 1760 + }, + { + "epoch": 0.025498076839967156, + "grad_norm": 1.6202733516693115, + "learning_rate": 1.6985117618819013e-05, + "loss": 1.1225, + "step": 1770 + }, + { + "epoch": 0.025642133771266406, + "grad_norm": 2.443167209625244, + "learning_rate": 1.7081132981277006e-05, + "loss": 1.1007, + "step": 1780 + }, + { + "epoch": 0.025786190702565655, + "grad_norm": 1.505105972290039, + "learning_rate": 1.7177148343735e-05, + "loss": 1.1251, + "step": 1790 + }, + { + "epoch": 0.025930247633864905, + "grad_norm": 1.6193865537643433, + "learning_rate": 1.7273163706192993e-05, + "loss": 1.0994, + "step": 1800 + }, + { + "epoch": 0.026074304565164155, + "grad_norm": 2.123363733291626, + "learning_rate": 1.7369179068650986e-05, + "loss": 1.0792, + "step": 1810 + }, + { + "epoch": 0.0262183614964634, + "grad_norm": 1.483593225479126, + "learning_rate": 1.746519443110898e-05, + "loss": 1.1105, + "step": 1820 + }, + { + "epoch": 0.02636241842776265, + "grad_norm": 1.5574554204940796, + "learning_rate": 1.7561209793566972e-05, + "loss": 1.1109, + "step": 1830 + }, + { + "epoch": 0.0265064753590619, + "grad_norm": 1.426472544670105, + "learning_rate": 1.7657225156024966e-05, + "loss": 1.0843, + "step": 1840 + }, + { + "epoch": 0.02665053229036115, + "grad_norm": 1.7228853702545166, + "learning_rate": 1.775324051848296e-05, + "loss": 1.0837, + "step": 1850 + }, + { + "epoch": 0.0267945892216604, + "grad_norm": 1.8484386205673218, + "learning_rate": 1.7849255880940952e-05, + "loss": 1.116, + "step": 1860 + }, + { + "epoch": 0.02693864615295965, + "grad_norm": 1.4843940734863281, + "learning_rate": 1.7945271243398946e-05, + "loss": 1.0948, + "step": 1870 + }, + { + "epoch": 0.027082703084258898, + "grad_norm": 1.2204471826553345, + "learning_rate": 1.804128660585694e-05, + "loss": 1.0927, + "step": 1880 + }, + { + "epoch": 0.027226760015558148, + "grad_norm": 1.3783999681472778, + "learning_rate": 1.8137301968314932e-05, + "loss": 1.099, + "step": 1890 + }, + { + "epoch": 0.027370816946857397, + "grad_norm": 1.6829112768173218, + "learning_rate": 1.8233317330772926e-05, + "loss": 1.1028, + "step": 1900 + }, + { + "epoch": 0.027514873878156647, + "grad_norm": 1.10300612449646, + "learning_rate": 1.832933269323092e-05, + "loss": 1.0999, + "step": 1910 + }, + { + "epoch": 0.027658930809455896, + "grad_norm": 1.9120315313339233, + "learning_rate": 1.8425348055688912e-05, + "loss": 1.1231, + "step": 1920 + }, + { + "epoch": 0.027802987740755146, + "grad_norm": 1.7644262313842773, + "learning_rate": 1.8521363418146905e-05, + "loss": 1.1207, + "step": 1930 + }, + { + "epoch": 0.027947044672054396, + "grad_norm": 1.7623440027236938, + "learning_rate": 1.86173787806049e-05, + "loss": 1.0908, + "step": 1940 + }, + { + "epoch": 0.028091101603353645, + "grad_norm": 1.6724064350128174, + "learning_rate": 1.8713394143062892e-05, + "loss": 1.0977, + "step": 1950 + }, + { + "epoch": 0.028235158534652895, + "grad_norm": 1.154085397720337, + "learning_rate": 1.8809409505520885e-05, + "loss": 1.0658, + "step": 1960 + }, + { + "epoch": 0.028379215465952144, + "grad_norm": 1.228071689605713, + "learning_rate": 1.890542486797888e-05, + "loss": 1.1198, + "step": 1970 + }, + { + "epoch": 0.028523272397251394, + "grad_norm": 1.41117525100708, + "learning_rate": 1.9001440230436872e-05, + "loss": 1.0918, + "step": 1980 + }, + { + "epoch": 0.028667329328550643, + "grad_norm": 1.3081333637237549, + "learning_rate": 1.9097455592894865e-05, + "loss": 1.1058, + "step": 1990 + }, + { + "epoch": 0.028811386259849893, + "grad_norm": 1.481508731842041, + "learning_rate": 1.919347095535286e-05, + "loss": 1.0802, + "step": 2000 + }, + { + "epoch": 0.028955443191149143, + "grad_norm": 1.1865828037261963, + "learning_rate": 1.9289486317810852e-05, + "loss": 1.0995, + "step": 2010 + }, + { + "epoch": 0.029099500122448392, + "grad_norm": 1.5778357982635498, + "learning_rate": 1.9385501680268845e-05, + "loss": 1.0992, + "step": 2020 + }, + { + "epoch": 0.02924355705374764, + "grad_norm": 1.2929495573043823, + "learning_rate": 1.948151704272684e-05, + "loss": 1.1211, + "step": 2030 + }, + { + "epoch": 0.02938761398504689, + "grad_norm": 1.2938451766967773, + "learning_rate": 1.9577532405184832e-05, + "loss": 1.0747, + "step": 2040 + }, + { + "epoch": 0.02953167091634614, + "grad_norm": 1.5689811706542969, + "learning_rate": 1.9673547767642825e-05, + "loss": 1.1022, + "step": 2050 + }, + { + "epoch": 0.02967572784764539, + "grad_norm": 1.4578216075897217, + "learning_rate": 1.976956313010082e-05, + "loss": 1.1003, + "step": 2060 + }, + { + "epoch": 0.02981978477894464, + "grad_norm": 0.896359384059906, + "learning_rate": 1.986557849255881e-05, + "loss": 1.118, + "step": 2070 + }, + { + "epoch": 0.02996384171024389, + "grad_norm": 1.6154524087905884, + "learning_rate": 1.9961593855016805e-05, + "loss": 1.088, + "step": 2080 + }, + { + "epoch": 0.03010789864154314, + "grad_norm": 1.3850620985031128, + "learning_rate": 1.9999999608164827e-05, + "loss": 1.0875, + "step": 2090 + }, + { + "epoch": 0.03025195557284239, + "grad_norm": 1.1587663888931274, + "learning_rate": 1.9999997213616654e-05, + "loss": 1.1036, + "step": 2100 + }, + { + "epoch": 0.030396012504141638, + "grad_norm": 1.6226013898849487, + "learning_rate": 1.999999264220704e-05, + "loss": 1.1045, + "step": 2110 + }, + { + "epoch": 0.030540069435440888, + "grad_norm": 1.5852738618850708, + "learning_rate": 1.9999985893936977e-05, + "loss": 1.1033, + "step": 2120 + }, + { + "epoch": 0.030684126366740137, + "grad_norm": 1.262050986289978, + "learning_rate": 1.9999976968807936e-05, + "loss": 1.102, + "step": 2130 + }, + { + "epoch": 0.030828183298039387, + "grad_norm": 1.405041217803955, + "learning_rate": 1.999996586682186e-05, + "loss": 1.1035, + "step": 2140 + }, + { + "epoch": 0.030972240229338633, + "grad_norm": 1.2300574779510498, + "learning_rate": 1.9999952587981158e-05, + "loss": 1.0992, + "step": 2150 + }, + { + "epoch": 0.031116297160637883, + "grad_norm": 0.9463424682617188, + "learning_rate": 1.999993713228873e-05, + "loss": 1.0986, + "step": 2160 + }, + { + "epoch": 0.031260354091937136, + "grad_norm": 1.373226284980774, + "learning_rate": 1.9999919499747938e-05, + "loss": 1.1031, + "step": 2170 + }, + { + "epoch": 0.031404411023236385, + "grad_norm": 1.2275688648223877, + "learning_rate": 1.999989969036262e-05, + "loss": 1.1126, + "step": 2180 + }, + { + "epoch": 0.031548467954535635, + "grad_norm": 0.9869627356529236, + "learning_rate": 1.9999877704137088e-05, + "loss": 1.0908, + "step": 2190 + }, + { + "epoch": 0.031692524885834884, + "grad_norm": 1.3384692668914795, + "learning_rate": 1.9999853541076132e-05, + "loss": 1.1195, + "step": 2200 + }, + { + "epoch": 0.031836581817134134, + "grad_norm": 1.0576239824295044, + "learning_rate": 1.9999827201185004e-05, + "loss": 1.1007, + "step": 2210 + }, + { + "epoch": 0.03198063874843338, + "grad_norm": 0.9131837487220764, + "learning_rate": 1.999979868446944e-05, + "loss": 1.0917, + "step": 2220 + }, + { + "epoch": 0.03212469567973263, + "grad_norm": 0.8731379508972168, + "learning_rate": 1.9999767990935653e-05, + "loss": 1.1014, + "step": 2230 + }, + { + "epoch": 0.03226875261103188, + "grad_norm": 1.5275228023529053, + "learning_rate": 1.999973512059032e-05, + "loss": 1.0953, + "step": 2240 + }, + { + "epoch": 0.03241280954233113, + "grad_norm": 1.290822982788086, + "learning_rate": 1.99997000734406e-05, + "loss": 1.0837, + "step": 2250 + }, + { + "epoch": 0.03255686647363038, + "grad_norm": 1.1078766584396362, + "learning_rate": 1.999966284949412e-05, + "loss": 1.0982, + "step": 2260 + }, + { + "epoch": 0.03270092340492963, + "grad_norm": 1.3421151638031006, + "learning_rate": 1.9999623448758977e-05, + "loss": 1.0839, + "step": 2270 + }, + { + "epoch": 0.03284498033622888, + "grad_norm": 1.0649149417877197, + "learning_rate": 1.999958187124376e-05, + "loss": 1.0729, + "step": 2280 + }, + { + "epoch": 0.03298903726752813, + "grad_norm": 1.1525821685791016, + "learning_rate": 1.9999538116957514e-05, + "loss": 1.1201, + "step": 2290 + }, + { + "epoch": 0.03313309419882738, + "grad_norm": 1.1011182069778442, + "learning_rate": 1.999949218590976e-05, + "loss": 1.0785, + "step": 2300 + }, + { + "epoch": 0.03327715113012662, + "grad_norm": 1.1103689670562744, + "learning_rate": 1.9999444078110503e-05, + "loss": 1.0915, + "step": 2310 + }, + { + "epoch": 0.03342120806142587, + "grad_norm": 1.0673400163650513, + "learning_rate": 1.999939379357021e-05, + "loss": 1.0906, + "step": 2320 + }, + { + "epoch": 0.03356526499272512, + "grad_norm": 1.1162818670272827, + "learning_rate": 1.9999341332299833e-05, + "loss": 1.0902, + "step": 2330 + }, + { + "epoch": 0.03370932192402437, + "grad_norm": 0.9606404304504395, + "learning_rate": 1.9999286694310788e-05, + "loss": 1.0721, + "step": 2340 + }, + { + "epoch": 0.03385337885532362, + "grad_norm": 0.931309163570404, + "learning_rate": 1.9999229879614967e-05, + "loss": 1.1225, + "step": 2350 + }, + { + "epoch": 0.03399743578662287, + "grad_norm": 1.082837462425232, + "learning_rate": 1.9999170888224743e-05, + "loss": 1.0968, + "step": 2360 + }, + { + "epoch": 0.03414149271792212, + "grad_norm": 1.1223711967468262, + "learning_rate": 1.9999109720152956e-05, + "loss": 1.104, + "step": 2370 + }, + { + "epoch": 0.03428554964922137, + "grad_norm": 0.9647048115730286, + "learning_rate": 1.999904637541292e-05, + "loss": 1.0784, + "step": 2380 + }, + { + "epoch": 0.03442960658052062, + "grad_norm": 1.4086133241653442, + "learning_rate": 1.9998980854018426e-05, + "loss": 1.105, + "step": 2390 + }, + { + "epoch": 0.03457366351181987, + "grad_norm": 1.022999882698059, + "learning_rate": 1.999891315598373e-05, + "loss": 1.0812, + "step": 2400 + }, + { + "epoch": 0.03471772044311912, + "grad_norm": 1.5496896505355835, + "learning_rate": 1.9998843281323586e-05, + "loss": 1.0823, + "step": 2410 + }, + { + "epoch": 0.03486177737441837, + "grad_norm": 1.1944360733032227, + "learning_rate": 1.9998771230053186e-05, + "loss": 1.0809, + "step": 2420 + }, + { + "epoch": 0.03500583430571762, + "grad_norm": 1.2143431901931763, + "learning_rate": 1.9998697002188226e-05, + "loss": 1.0923, + "step": 2430 + }, + { + "epoch": 0.03514989123701687, + "grad_norm": 1.3194953203201294, + "learning_rate": 1.999862059774486e-05, + "loss": 1.0672, + "step": 2440 + }, + { + "epoch": 0.03529394816831612, + "grad_norm": 1.2011752128601074, + "learning_rate": 1.9998542016739716e-05, + "loss": 1.0682, + "step": 2450 + }, + { + "epoch": 0.035438005099615366, + "grad_norm": 1.190067172050476, + "learning_rate": 1.999846125918991e-05, + "loss": 1.0769, + "step": 2460 + }, + { + "epoch": 0.035582062030914616, + "grad_norm": 1.3442232608795166, + "learning_rate": 1.9998378325113015e-05, + "loss": 1.0927, + "step": 2470 + }, + { + "epoch": 0.035726118962213865, + "grad_norm": 1.5200982093811035, + "learning_rate": 1.9998293214527088e-05, + "loss": 1.0985, + "step": 2480 + }, + { + "epoch": 0.035870175893513115, + "grad_norm": 0.9945452809333801, + "learning_rate": 1.9998205927450653e-05, + "loss": 1.0696, + "step": 2490 + }, + { + "epoch": 0.036014232824812364, + "grad_norm": 1.065801739692688, + "learning_rate": 1.999811646390271e-05, + "loss": 1.0953, + "step": 2500 + }, + { + "epoch": 0.036158289756111614, + "grad_norm": 1.2544124126434326, + "learning_rate": 1.9998024823902744e-05, + "loss": 1.0668, + "step": 2510 + }, + { + "epoch": 0.036302346687410864, + "grad_norm": 0.8284255862236023, + "learning_rate": 1.999793100747069e-05, + "loss": 1.1062, + "step": 2520 + }, + { + "epoch": 0.03644640361871011, + "grad_norm": 1.414483666419983, + "learning_rate": 1.999783501462698e-05, + "loss": 1.0956, + "step": 2530 + }, + { + "epoch": 0.03659046055000936, + "grad_norm": 1.02438485622406, + "learning_rate": 1.999773684539251e-05, + "loss": 1.0935, + "step": 2540 + }, + { + "epoch": 0.03673451748130861, + "grad_norm": 1.4005661010742188, + "learning_rate": 1.9997636499788645e-05, + "loss": 1.0925, + "step": 2550 + }, + { + "epoch": 0.03687857441260786, + "grad_norm": 1.299225091934204, + "learning_rate": 1.999753397783723e-05, + "loss": 1.0794, + "step": 2560 + }, + { + "epoch": 0.03702263134390711, + "grad_norm": 1.0879071950912476, + "learning_rate": 1.9997429279560587e-05, + "loss": 1.1003, + "step": 2570 + }, + { + "epoch": 0.03716668827520636, + "grad_norm": 1.0114030838012695, + "learning_rate": 1.99973224049815e-05, + "loss": 1.0797, + "step": 2580 + }, + { + "epoch": 0.03731074520650561, + "grad_norm": 0.9800271987915039, + "learning_rate": 1.9997213354123246e-05, + "loss": 1.0991, + "step": 2590 + }, + { + "epoch": 0.03745480213780486, + "grad_norm": 1.0941202640533447, + "learning_rate": 1.9997102127009552e-05, + "loss": 1.0637, + "step": 2600 + }, + { + "epoch": 0.03759885906910411, + "grad_norm": 1.0339268445968628, + "learning_rate": 1.9996988723664637e-05, + "loss": 1.0646, + "step": 2610 + }, + { + "epoch": 0.03774291600040336, + "grad_norm": 1.1409887075424194, + "learning_rate": 1.9996873144113184e-05, + "loss": 1.0968, + "step": 2620 + }, + { + "epoch": 0.03788697293170261, + "grad_norm": 1.3173432350158691, + "learning_rate": 1.9996755388380353e-05, + "loss": 1.074, + "step": 2630 + }, + { + "epoch": 0.03803102986300186, + "grad_norm": 1.7910503149032593, + "learning_rate": 1.999663545649178e-05, + "loss": 1.0889, + "step": 2640 + }, + { + "epoch": 0.03817508679430111, + "grad_norm": 1.238196849822998, + "learning_rate": 1.9996513348473572e-05, + "loss": 1.0823, + "step": 2650 + }, + { + "epoch": 0.03831914372560036, + "grad_norm": 1.1959824562072754, + "learning_rate": 1.9996389064352312e-05, + "loss": 1.0768, + "step": 2660 + }, + { + "epoch": 0.03846320065689961, + "grad_norm": 1.093868613243103, + "learning_rate": 1.999626260415505e-05, + "loss": 1.0639, + "step": 2670 + }, + { + "epoch": 0.03860725758819886, + "grad_norm": 1.0971081256866455, + "learning_rate": 1.999613396790932e-05, + "loss": 1.0773, + "step": 2680 + }, + { + "epoch": 0.038751314519498106, + "grad_norm": 0.9527555108070374, + "learning_rate": 1.999600315564312e-05, + "loss": 1.0591, + "step": 2690 + }, + { + "epoch": 0.038895371450797356, + "grad_norm": 1.4275157451629639, + "learning_rate": 1.999587016738493e-05, + "loss": 1.0783, + "step": 2700 + }, + { + "epoch": 0.039039428382096605, + "grad_norm": 0.8318230509757996, + "learning_rate": 1.9995735003163693e-05, + "loss": 1.0682, + "step": 2710 + }, + { + "epoch": 0.039183485313395855, + "grad_norm": 0.9934574365615845, + "learning_rate": 1.999559766300884e-05, + "loss": 1.0946, + "step": 2720 + }, + { + "epoch": 0.039327542244695105, + "grad_norm": 1.059827446937561, + "learning_rate": 1.9995458146950266e-05, + "loss": 1.0873, + "step": 2730 + }, + { + "epoch": 0.039471599175994354, + "grad_norm": 0.9701938033103943, + "learning_rate": 1.9995316455018342e-05, + "loss": 1.0807, + "step": 2740 + }, + { + "epoch": 0.039615656107293604, + "grad_norm": 0.9101185202598572, + "learning_rate": 1.999517258724391e-05, + "loss": 1.0832, + "step": 2750 + }, + { + "epoch": 0.03975971303859285, + "grad_norm": 1.1547293663024902, + "learning_rate": 1.999502654365829e-05, + "loss": 1.0885, + "step": 2760 + }, + { + "epoch": 0.0399037699698921, + "grad_norm": 0.9609155058860779, + "learning_rate": 1.9994878324293266e-05, + "loss": 1.1198, + "step": 2770 + }, + { + "epoch": 0.04004782690119135, + "grad_norm": 1.024674415588379, + "learning_rate": 1.999472792918112e-05, + "loss": 1.0841, + "step": 2780 + }, + { + "epoch": 0.0401918838324906, + "grad_norm": 1.2069023847579956, + "learning_rate": 1.9994575358354576e-05, + "loss": 1.0905, + "step": 2790 + }, + { + "epoch": 0.04033594076378985, + "grad_norm": 1.0754157304763794, + "learning_rate": 1.999442061184685e-05, + "loss": 1.0699, + "step": 2800 + }, + { + "epoch": 0.0404799976950891, + "grad_norm": 1.038758635520935, + "learning_rate": 1.9994263689691635e-05, + "loss": 1.0587, + "step": 2810 + }, + { + "epoch": 0.04062405462638835, + "grad_norm": 0.8334084749221802, + "learning_rate": 1.9994104591923086e-05, + "loss": 1.0726, + "step": 2820 + }, + { + "epoch": 0.0407681115576876, + "grad_norm": 0.8922152519226074, + "learning_rate": 1.999394331857583e-05, + "loss": 1.0784, + "step": 2830 + }, + { + "epoch": 0.04091216848898685, + "grad_norm": 1.0895780324935913, + "learning_rate": 1.9993779869684988e-05, + "loss": 1.1066, + "step": 2840 + }, + { + "epoch": 0.0410562254202861, + "grad_norm": 0.9029639363288879, + "learning_rate": 1.9993614245286125e-05, + "loss": 1.0741, + "step": 2850 + }, + { + "epoch": 0.04120028235158535, + "grad_norm": 0.9833770990371704, + "learning_rate": 1.9993446445415308e-05, + "loss": 1.1026, + "step": 2860 + }, + { + "epoch": 0.0413443392828846, + "grad_norm": 1.0057151317596436, + "learning_rate": 1.9993276470109054e-05, + "loss": 1.065, + "step": 2870 + }, + { + "epoch": 0.04148839621418385, + "grad_norm": 0.981825590133667, + "learning_rate": 1.9993104319404375e-05, + "loss": 1.0984, + "step": 2880 + }, + { + "epoch": 0.0416324531454831, + "grad_norm": 1.2358293533325195, + "learning_rate": 1.9992929993338738e-05, + "loss": 1.0701, + "step": 2890 + }, + { + "epoch": 0.04177651007678235, + "grad_norm": 1.2199301719665527, + "learning_rate": 1.9992753491950096e-05, + "loss": 1.0765, + "step": 2900 + }, + { + "epoch": 0.0419205670080816, + "grad_norm": 0.9787526726722717, + "learning_rate": 1.9992574815276867e-05, + "loss": 1.0774, + "step": 2910 + }, + { + "epoch": 0.042064623939380846, + "grad_norm": 0.8597845435142517, + "learning_rate": 1.999239396335795e-05, + "loss": 1.0801, + "step": 2920 + }, + { + "epoch": 0.042208680870680096, + "grad_norm": 1.1765086650848389, + "learning_rate": 1.999221093623271e-05, + "loss": 1.082, + "step": 2930 + }, + { + "epoch": 0.042352737801979345, + "grad_norm": 1.0328409671783447, + "learning_rate": 1.9992025733940994e-05, + "loss": 1.0755, + "step": 2940 + }, + { + "epoch": 0.042496794733278595, + "grad_norm": 1.0411959886550903, + "learning_rate": 1.9991838356523114e-05, + "loss": 1.0912, + "step": 2950 + }, + { + "epoch": 0.042640851664577845, + "grad_norm": 1.1128629446029663, + "learning_rate": 1.9991648804019867e-05, + "loss": 1.0777, + "step": 2960 + }, + { + "epoch": 0.04278490859587709, + "grad_norm": 1.0264414548873901, + "learning_rate": 1.9991457076472502e-05, + "loss": 1.0742, + "step": 2970 + }, + { + "epoch": 0.04292896552717634, + "grad_norm": 0.8396328687667847, + "learning_rate": 1.9991263173922767e-05, + "loss": 1.0939, + "step": 2980 + }, + { + "epoch": 0.043073022458475586, + "grad_norm": 1.1300128698349, + "learning_rate": 1.999106709641287e-05, + "loss": 1.0691, + "step": 2990 + }, + { + "epoch": 0.043217079389774836, + "grad_norm": 0.9339667558670044, + "learning_rate": 1.999086884398549e-05, + "loss": 1.0568, + "step": 3000 + }, + { + "epoch": 0.043361136321074085, + "grad_norm": 1.1959909200668335, + "learning_rate": 1.9990668416683788e-05, + "loss": 1.0842, + "step": 3010 + }, + { + "epoch": 0.043505193252373335, + "grad_norm": 0.9370341897010803, + "learning_rate": 1.9990465814551397e-05, + "loss": 1.0924, + "step": 3020 + }, + { + "epoch": 0.043649250183672585, + "grad_norm": 0.9694163203239441, + "learning_rate": 1.999026103763241e-05, + "loss": 1.0971, + "step": 3030 + }, + { + "epoch": 0.043793307114971834, + "grad_norm": 0.8307744264602661, + "learning_rate": 1.9990054085971416e-05, + "loss": 1.089, + "step": 3040 + }, + { + "epoch": 0.043937364046271084, + "grad_norm": 1.1763337850570679, + "learning_rate": 1.9989844959613458e-05, + "loss": 1.0734, + "step": 3050 + }, + { + "epoch": 0.04408142097757033, + "grad_norm": 0.9161782264709473, + "learning_rate": 1.9989633658604064e-05, + "loss": 1.0719, + "step": 3060 + }, + { + "epoch": 0.04422547790886958, + "grad_norm": 0.9989141821861267, + "learning_rate": 1.9989420182989227e-05, + "loss": 1.0672, + "step": 3070 + }, + { + "epoch": 0.04436953484016883, + "grad_norm": 0.9473924040794373, + "learning_rate": 1.9989204532815422e-05, + "loss": 1.0845, + "step": 3080 + }, + { + "epoch": 0.04451359177146808, + "grad_norm": 0.7662175297737122, + "learning_rate": 1.9988986708129593e-05, + "loss": 1.0826, + "step": 3090 + }, + { + "epoch": 0.04465764870276733, + "grad_norm": 0.8995963335037231, + "learning_rate": 1.998876670897915e-05, + "loss": 1.0918, + "step": 3100 + }, + { + "epoch": 0.04480170563406658, + "grad_norm": 1.0424665212631226, + "learning_rate": 1.9988544535411996e-05, + "loss": 1.075, + "step": 3110 + }, + { + "epoch": 0.04494576256536583, + "grad_norm": 1.1500542163848877, + "learning_rate": 1.9988320187476483e-05, + "loss": 1.0634, + "step": 3120 + }, + { + "epoch": 0.04508981949666508, + "grad_norm": 1.0494294166564941, + "learning_rate": 1.998809366522146e-05, + "loss": 1.0811, + "step": 3130 + }, + { + "epoch": 0.04523387642796433, + "grad_norm": 0.932778000831604, + "learning_rate": 1.998786496869623e-05, + "loss": 1.0786, + "step": 3140 + }, + { + "epoch": 0.04537793335926358, + "grad_norm": 1.0902131795883179, + "learning_rate": 1.9987634097950576e-05, + "loss": 1.0769, + "step": 3150 + }, + { + "epoch": 0.04552199029056283, + "grad_norm": 1.001160979270935, + "learning_rate": 1.998740105303476e-05, + "loss": 1.0606, + "step": 3160 + }, + { + "epoch": 0.04566604722186208, + "grad_norm": 1.0201925039291382, + "learning_rate": 1.998716583399951e-05, + "loss": 1.0732, + "step": 3170 + }, + { + "epoch": 0.04581010415316133, + "grad_norm": 0.8125157952308655, + "learning_rate": 1.9986928440896034e-05, + "loss": 1.0875, + "step": 3180 + }, + { + "epoch": 0.04595416108446058, + "grad_norm": 0.8411962985992432, + "learning_rate": 1.9986688873776003e-05, + "loss": 1.0784, + "step": 3190 + }, + { + "epoch": 0.04609821801575983, + "grad_norm": 1.008968472480774, + "learning_rate": 1.9986447132691575e-05, + "loss": 1.061, + "step": 3200 + }, + { + "epoch": 0.04624227494705908, + "grad_norm": 0.8611114025115967, + "learning_rate": 1.9986203217695365e-05, + "loss": 1.0628, + "step": 3210 + }, + { + "epoch": 0.046386331878358326, + "grad_norm": 1.017021656036377, + "learning_rate": 1.9985957128840478e-05, + "loss": 1.0681, + "step": 3220 + }, + { + "epoch": 0.046530388809657576, + "grad_norm": 0.9527968168258667, + "learning_rate": 1.9985708866180475e-05, + "loss": 1.0733, + "step": 3230 + }, + { + "epoch": 0.046674445740956826, + "grad_norm": 1.0302634239196777, + "learning_rate": 1.998545842976941e-05, + "loss": 1.0769, + "step": 3240 + }, + { + "epoch": 0.046818502672256075, + "grad_norm": 1.3760242462158203, + "learning_rate": 1.998520581966179e-05, + "loss": 1.0727, + "step": 3250 + }, + { + "epoch": 0.046962559603555325, + "grad_norm": 1.0351104736328125, + "learning_rate": 1.9984951035912613e-05, + "loss": 1.0802, + "step": 3260 + }, + { + "epoch": 0.047106616534854574, + "grad_norm": 0.9689775109291077, + "learning_rate": 1.9984694078577334e-05, + "loss": 1.0666, + "step": 3270 + }, + { + "epoch": 0.047250673466153824, + "grad_norm": 0.8249936699867249, + "learning_rate": 1.99844349477119e-05, + "loss": 1.0926, + "step": 3280 + }, + { + "epoch": 0.04739473039745307, + "grad_norm": 0.8944165110588074, + "learning_rate": 1.9984173643372705e-05, + "loss": 1.0789, + "step": 3290 + }, + { + "epoch": 0.04753878732875232, + "grad_norm": 0.8791295289993286, + "learning_rate": 1.9983910165616643e-05, + "loss": 1.0879, + "step": 3300 + }, + { + "epoch": 0.04768284426005157, + "grad_norm": 0.7624006867408752, + "learning_rate": 1.998364451450107e-05, + "loss": 1.0806, + "step": 3310 + }, + { + "epoch": 0.04782690119135082, + "grad_norm": 0.9690932035446167, + "learning_rate": 1.9983376690083804e-05, + "loss": 1.0838, + "step": 3320 + }, + { + "epoch": 0.04797095812265007, + "grad_norm": 0.7400982975959778, + "learning_rate": 1.9983106692423154e-05, + "loss": 1.0778, + "step": 3330 + }, + { + "epoch": 0.04811501505394932, + "grad_norm": 1.065669298171997, + "learning_rate": 1.99828345215779e-05, + "loss": 1.0902, + "step": 3340 + }, + { + "epoch": 0.04825907198524857, + "grad_norm": 0.9505540132522583, + "learning_rate": 1.998256017760728e-05, + "loss": 1.099, + "step": 3350 + }, + { + "epoch": 0.04840312891654782, + "grad_norm": 0.7896109223365784, + "learning_rate": 1.9982283660571018e-05, + "loss": 1.0589, + "step": 3360 + }, + { + "epoch": 0.04854718584784707, + "grad_norm": 0.8883846998214722, + "learning_rate": 1.998200497052931e-05, + "loss": 1.0563, + "step": 3370 + }, + { + "epoch": 0.04869124277914632, + "grad_norm": 1.1594328880310059, + "learning_rate": 1.998172410754282e-05, + "loss": 1.0897, + "step": 3380 + }, + { + "epoch": 0.04883529971044557, + "grad_norm": 0.9646573066711426, + "learning_rate": 1.9981441071672693e-05, + "loss": 1.1009, + "step": 3390 + }, + { + "epoch": 0.04897935664174482, + "grad_norm": 0.7927548885345459, + "learning_rate": 1.9981155862980536e-05, + "loss": 1.1053, + "step": 3400 + }, + { + "epoch": 0.04912341357304407, + "grad_norm": 0.8204102516174316, + "learning_rate": 1.998086848152844e-05, + "loss": 1.0492, + "step": 3410 + }, + { + "epoch": 0.04926747050434332, + "grad_norm": 0.8601950407028198, + "learning_rate": 1.998057892737896e-05, + "loss": 1.0984, + "step": 3420 + }, + { + "epoch": 0.04941152743564257, + "grad_norm": 0.9040478467941284, + "learning_rate": 1.998028720059513e-05, + "loss": 1.0773, + "step": 3430 + }, + { + "epoch": 0.04955558436694182, + "grad_norm": 0.9031913876533508, + "learning_rate": 1.9979993301240453e-05, + "loss": 1.0797, + "step": 3440 + }, + { + "epoch": 0.049699641298241067, + "grad_norm": 0.8928804993629456, + "learning_rate": 1.997969722937891e-05, + "loss": 1.0806, + "step": 3450 + }, + { + "epoch": 0.049843698229540316, + "grad_norm": 1.0597736835479736, + "learning_rate": 1.9979398985074952e-05, + "loss": 1.0711, + "step": 3460 + }, + { + "epoch": 0.049987755160839566, + "grad_norm": 0.7926428914070129, + "learning_rate": 1.9979098568393498e-05, + "loss": 1.0664, + "step": 3470 + }, + { + "epoch": 0.050131812092138815, + "grad_norm": 0.9089264869689941, + "learning_rate": 1.9978795979399947e-05, + "loss": 1.081, + "step": 3480 + }, + { + "epoch": 0.050275869023438065, + "grad_norm": 0.8083300590515137, + "learning_rate": 1.9978491218160173e-05, + "loss": 1.0699, + "step": 3490 + }, + { + "epoch": 0.050419925954737314, + "grad_norm": 0.854659914970398, + "learning_rate": 1.997818428474051e-05, + "loss": 1.0734, + "step": 3500 + }, + { + "epoch": 0.050563982886036564, + "grad_norm": 0.8330906629562378, + "learning_rate": 1.997787517920778e-05, + "loss": 1.0931, + "step": 3510 + }, + { + "epoch": 0.050708039817335814, + "grad_norm": 0.7009166479110718, + "learning_rate": 1.9977563901629268e-05, + "loss": 1.0709, + "step": 3520 + }, + { + "epoch": 0.05085209674863506, + "grad_norm": 1.5810503959655762, + "learning_rate": 1.9977250452072732e-05, + "loss": 1.0725, + "step": 3530 + }, + { + "epoch": 0.05099615367993431, + "grad_norm": 0.7954139709472656, + "learning_rate": 1.9976934830606415e-05, + "loss": 1.0927, + "step": 3540 + }, + { + "epoch": 0.05114021061123356, + "grad_norm": 0.6684684753417969, + "learning_rate": 1.9976617037299012e-05, + "loss": 1.0656, + "step": 3550 + }, + { + "epoch": 0.05128426754253281, + "grad_norm": 0.9031051993370056, + "learning_rate": 1.997629707221971e-05, + "loss": 1.0786, + "step": 3560 + }, + { + "epoch": 0.05142832447383206, + "grad_norm": 0.8937859535217285, + "learning_rate": 1.9975974935438158e-05, + "loss": 1.0773, + "step": 3570 + }, + { + "epoch": 0.05157238140513131, + "grad_norm": 0.8457878232002258, + "learning_rate": 1.9975650627024476e-05, + "loss": 1.0728, + "step": 3580 + }, + { + "epoch": 0.05171643833643056, + "grad_norm": 0.938657820224762, + "learning_rate": 1.9975324147049274e-05, + "loss": 1.0856, + "step": 3590 + }, + { + "epoch": 0.05186049526772981, + "grad_norm": 0.7948675155639648, + "learning_rate": 1.997499549558361e-05, + "loss": 1.0388, + "step": 3600 + }, + { + "epoch": 0.05200455219902906, + "grad_norm": 0.8567399382591248, + "learning_rate": 1.9974664672699037e-05, + "loss": 1.0829, + "step": 3610 + }, + { + "epoch": 0.05214860913032831, + "grad_norm": 0.8215539455413818, + "learning_rate": 1.997433167846756e-05, + "loss": 1.0899, + "step": 3620 + }, + { + "epoch": 0.05229266606162755, + "grad_norm": 0.7473692893981934, + "learning_rate": 1.9973996512961677e-05, + "loss": 1.0815, + "step": 3630 + }, + { + "epoch": 0.0524367229929268, + "grad_norm": 0.7624854445457458, + "learning_rate": 1.9973659176254342e-05, + "loss": 1.0752, + "step": 3640 + }, + { + "epoch": 0.05258077992422605, + "grad_norm": 0.755251944065094, + "learning_rate": 1.997331966841899e-05, + "loss": 1.0708, + "step": 3650 + }, + { + "epoch": 0.0527248368555253, + "grad_norm": 1.3446722030639648, + "learning_rate": 1.997297798952953e-05, + "loss": 1.0519, + "step": 3660 + }, + { + "epoch": 0.05286889378682455, + "grad_norm": 1.1751680374145508, + "learning_rate": 1.997263413966034e-05, + "loss": 1.0863, + "step": 3670 + }, + { + "epoch": 0.0530129507181238, + "grad_norm": 0.9749715924263, + "learning_rate": 1.9972288118886264e-05, + "loss": 1.0808, + "step": 3680 + }, + { + "epoch": 0.05315700764942305, + "grad_norm": 0.6592453122138977, + "learning_rate": 1.997193992728264e-05, + "loss": 1.0786, + "step": 3690 + }, + { + "epoch": 0.0533010645807223, + "grad_norm": 0.891697347164154, + "learning_rate": 1.9971589564925253e-05, + "loss": 1.06, + "step": 3700 + }, + { + "epoch": 0.05344512151202155, + "grad_norm": 0.7699964642524719, + "learning_rate": 1.9971237031890374e-05, + "loss": 1.06, + "step": 3710 + }, + { + "epoch": 0.0535891784433208, + "grad_norm": 1.3091051578521729, + "learning_rate": 1.9970882328254754e-05, + "loss": 1.0784, + "step": 3720 + }, + { + "epoch": 0.05373323537462005, + "grad_norm": 0.7992281317710876, + "learning_rate": 1.9970525454095596e-05, + "loss": 1.0922, + "step": 3730 + }, + { + "epoch": 0.0538772923059193, + "grad_norm": 1.0404248237609863, + "learning_rate": 1.9970166409490588e-05, + "loss": 1.0586, + "step": 3740 + }, + { + "epoch": 0.05402134923721855, + "grad_norm": 0.7184622883796692, + "learning_rate": 1.996980519451789e-05, + "loss": 1.0805, + "step": 3750 + }, + { + "epoch": 0.054165406168517796, + "grad_norm": 0.7246914505958557, + "learning_rate": 1.996944180925614e-05, + "loss": 1.0846, + "step": 3760 + }, + { + "epoch": 0.054309463099817046, + "grad_norm": 0.9941462278366089, + "learning_rate": 1.9969076253784433e-05, + "loss": 1.0596, + "step": 3770 + }, + { + "epoch": 0.054453520031116295, + "grad_norm": 1.0598793029785156, + "learning_rate": 1.996870852818235e-05, + "loss": 1.0835, + "step": 3780 + }, + { + "epoch": 0.054597576962415545, + "grad_norm": 0.9485270380973816, + "learning_rate": 1.9968338632529935e-05, + "loss": 1.0558, + "step": 3790 + }, + { + "epoch": 0.054741633893714794, + "grad_norm": 0.6576977968215942, + "learning_rate": 1.9967966566907716e-05, + "loss": 1.0767, + "step": 3800 + }, + { + "epoch": 0.054885690825014044, + "grad_norm": 0.8786500096321106, + "learning_rate": 1.9967592331396687e-05, + "loss": 1.0547, + "step": 3810 + }, + { + "epoch": 0.055029747756313294, + "grad_norm": 1.036960244178772, + "learning_rate": 1.9967215926078304e-05, + "loss": 1.0436, + "step": 3820 + }, + { + "epoch": 0.05517380468761254, + "grad_norm": 0.8549398183822632, + "learning_rate": 1.9966837351034517e-05, + "loss": 1.0723, + "step": 3830 + }, + { + "epoch": 0.05531786161891179, + "grad_norm": 0.8665590286254883, + "learning_rate": 1.9966456606347728e-05, + "loss": 1.0872, + "step": 3840 + }, + { + "epoch": 0.05546191855021104, + "grad_norm": 0.7687617540359497, + "learning_rate": 1.9966073692100824e-05, + "loss": 1.0543, + "step": 3850 + }, + { + "epoch": 0.05560597548151029, + "grad_norm": 0.831766664981842, + "learning_rate": 1.996568860837716e-05, + "loss": 1.0872, + "step": 3860 + }, + { + "epoch": 0.05575003241280954, + "grad_norm": 0.9214545488357544, + "learning_rate": 1.9965301355260563e-05, + "loss": 1.0597, + "step": 3870 + }, + { + "epoch": 0.05589408934410879, + "grad_norm": 0.68256014585495, + "learning_rate": 1.996491193283533e-05, + "loss": 1.0837, + "step": 3880 + }, + { + "epoch": 0.05603814627540804, + "grad_norm": 0.9160231351852417, + "learning_rate": 1.9964520341186236e-05, + "loss": 1.0835, + "step": 3890 + }, + { + "epoch": 0.05618220320670729, + "grad_norm": 0.7571538686752319, + "learning_rate": 1.9964126580398528e-05, + "loss": 1.0703, + "step": 3900 + }, + { + "epoch": 0.05632626013800654, + "grad_norm": 0.8873952031135559, + "learning_rate": 1.9963730650557918e-05, + "loss": 1.0631, + "step": 3910 + }, + { + "epoch": 0.05647031706930579, + "grad_norm": 0.8825143575668335, + "learning_rate": 1.996333255175059e-05, + "loss": 1.0535, + "step": 3920 + }, + { + "epoch": 0.05661437400060504, + "grad_norm": 0.8717725872993469, + "learning_rate": 1.9962932284063215e-05, + "loss": 1.0596, + "step": 3930 + }, + { + "epoch": 0.05675843093190429, + "grad_norm": 0.8045423626899719, + "learning_rate": 1.9962529847582916e-05, + "loss": 1.0463, + "step": 3940 + }, + { + "epoch": 0.05690248786320354, + "grad_norm": 0.9340243339538574, + "learning_rate": 1.9962125242397307e-05, + "loss": 1.0637, + "step": 3950 + }, + { + "epoch": 0.05704654479450279, + "grad_norm": 0.7398734092712402, + "learning_rate": 1.9961718468594457e-05, + "loss": 1.0579, + "step": 3960 + }, + { + "epoch": 0.05719060172580204, + "grad_norm": 0.8126267790794373, + "learning_rate": 1.996130952626292e-05, + "loss": 1.0624, + "step": 3970 + }, + { + "epoch": 0.05733465865710129, + "grad_norm": 1.2689749002456665, + "learning_rate": 1.9960898415491712e-05, + "loss": 1.0704, + "step": 3980 + }, + { + "epoch": 0.057478715588400536, + "grad_norm": 1.1170166730880737, + "learning_rate": 1.9960485136370335e-05, + "loss": 1.0836, + "step": 3990 + }, + { + "epoch": 0.057622772519699786, + "grad_norm": 0.98178631067276, + "learning_rate": 1.9960069688988746e-05, + "loss": 1.0561, + "step": 4000 + }, + { + "epoch": 0.057766829450999035, + "grad_norm": 1.0303664207458496, + "learning_rate": 1.9959652073437384e-05, + "loss": 1.0828, + "step": 4010 + }, + { + "epoch": 0.057910886382298285, + "grad_norm": 0.8844668865203857, + "learning_rate": 1.995923228980716e-05, + "loss": 1.0663, + "step": 4020 + }, + { + "epoch": 0.058054943313597535, + "grad_norm": 0.8205824494361877, + "learning_rate": 1.9958810338189458e-05, + "loss": 1.0771, + "step": 4030 + }, + { + "epoch": 0.058199000244896784, + "grad_norm": 0.6665905714035034, + "learning_rate": 1.995838621867612e-05, + "loss": 1.0525, + "step": 4040 + }, + { + "epoch": 0.058343057176196034, + "grad_norm": 0.9651935696601868, + "learning_rate": 1.9957959931359484e-05, + "loss": 1.054, + "step": 4050 + }, + { + "epoch": 0.05848711410749528, + "grad_norm": 0.7814925909042358, + "learning_rate": 1.995753147633234e-05, + "loss": 1.0635, + "step": 4060 + }, + { + "epoch": 0.05863117103879453, + "grad_norm": 0.9259180426597595, + "learning_rate": 1.9957100853687955e-05, + "loss": 1.0841, + "step": 4070 + }, + { + "epoch": 0.05877522797009378, + "grad_norm": 0.988832414150238, + "learning_rate": 1.9956668063520075e-05, + "loss": 1.0687, + "step": 4080 + }, + { + "epoch": 0.05891928490139303, + "grad_norm": 0.8639126420021057, + "learning_rate": 1.9956233105922907e-05, + "loss": 1.0352, + "step": 4090 + }, + { + "epoch": 0.05906334183269228, + "grad_norm": 0.8149707913398743, + "learning_rate": 1.9955795980991143e-05, + "loss": 1.065, + "step": 4100 + }, + { + "epoch": 0.05920739876399153, + "grad_norm": 0.8967310786247253, + "learning_rate": 1.995535668881993e-05, + "loss": 1.0655, + "step": 4110 + }, + { + "epoch": 0.05935145569529078, + "grad_norm": 0.7974560856819153, + "learning_rate": 1.9954915229504904e-05, + "loss": 1.0549, + "step": 4120 + }, + { + "epoch": 0.05949551262659003, + "grad_norm": 0.8725034594535828, + "learning_rate": 1.9954471603142157e-05, + "loss": 1.0668, + "step": 4130 + }, + { + "epoch": 0.05963956955788928, + "grad_norm": 0.8084957003593445, + "learning_rate": 1.9954025809828266e-05, + "loss": 1.0524, + "step": 4140 + }, + { + "epoch": 0.05978362648918853, + "grad_norm": 0.8285936713218689, + "learning_rate": 1.995357784966027e-05, + "loss": 1.08, + "step": 4150 + }, + { + "epoch": 0.05992768342048778, + "grad_norm": 0.8037678003311157, + "learning_rate": 1.9953127722735688e-05, + "loss": 1.0634, + "step": 4160 + }, + { + "epoch": 0.06007174035178703, + "grad_norm": 0.7721096277236938, + "learning_rate": 1.9952675429152503e-05, + "loss": 1.0701, + "step": 4170 + }, + { + "epoch": 0.06021579728308628, + "grad_norm": 0.6830259561538696, + "learning_rate": 1.9952220969009175e-05, + "loss": 1.0651, + "step": 4180 + }, + { + "epoch": 0.06035985421438553, + "grad_norm": 1.0105340480804443, + "learning_rate": 1.995176434240463e-05, + "loss": 1.073, + "step": 4190 + }, + { + "epoch": 0.06050391114568478, + "grad_norm": 1.1359127759933472, + "learning_rate": 1.995130554943828e-05, + "loss": 1.0713, + "step": 4200 + }, + { + "epoch": 0.06064796807698403, + "grad_norm": 0.9534348249435425, + "learning_rate": 1.9950844590209982e-05, + "loss": 1.0618, + "step": 4210 + }, + { + "epoch": 0.060792025008283276, + "grad_norm": 0.7248373031616211, + "learning_rate": 1.9950381464820093e-05, + "loss": 1.0526, + "step": 4220 + }, + { + "epoch": 0.060936081939582526, + "grad_norm": 0.7521722912788391, + "learning_rate": 1.9949916173369424e-05, + "loss": 1.0809, + "step": 4230 + }, + { + "epoch": 0.061080138870881776, + "grad_norm": 0.7272984385490417, + "learning_rate": 1.994944871595926e-05, + "loss": 1.0616, + "step": 4240 + }, + { + "epoch": 0.061224195802181025, + "grad_norm": 0.9190437197685242, + "learning_rate": 1.994897909269137e-05, + "loss": 1.0691, + "step": 4250 + }, + { + "epoch": 0.061368252733480275, + "grad_norm": 0.8137868046760559, + "learning_rate": 1.9948507303667972e-05, + "loss": 1.0591, + "step": 4260 + }, + { + "epoch": 0.061512309664779524, + "grad_norm": 0.8312646746635437, + "learning_rate": 1.9948033348991777e-05, + "loss": 1.0482, + "step": 4270 + }, + { + "epoch": 0.061656366596078774, + "grad_norm": 0.9318916201591492, + "learning_rate": 1.994755722876595e-05, + "loss": 1.0838, + "step": 4280 + }, + { + "epoch": 0.061800423527378016, + "grad_norm": 0.7599692344665527, + "learning_rate": 1.9947078943094148e-05, + "loss": 1.0376, + "step": 4290 + }, + { + "epoch": 0.061944480458677266, + "grad_norm": 0.8444345593452454, + "learning_rate": 1.9946598492080475e-05, + "loss": 1.092, + "step": 4300 + }, + { + "epoch": 0.062088537389976516, + "grad_norm": 0.8249942064285278, + "learning_rate": 1.9946115875829526e-05, + "loss": 1.0936, + "step": 4310 + }, + { + "epoch": 0.062232594321275765, + "grad_norm": 0.7922937870025635, + "learning_rate": 1.9945631094446355e-05, + "loss": 1.0707, + "step": 4320 + }, + { + "epoch": 0.062376651252575015, + "grad_norm": 1.1079844236373901, + "learning_rate": 1.99451441480365e-05, + "loss": 1.0788, + "step": 4330 + }, + { + "epoch": 0.06252070818387427, + "grad_norm": 0.7890607118606567, + "learning_rate": 1.9944655036705952e-05, + "loss": 1.0795, + "step": 4340 + }, + { + "epoch": 0.06266476511517352, + "grad_norm": 0.7317211031913757, + "learning_rate": 1.994416376056119e-05, + "loss": 1.0614, + "step": 4350 + }, + { + "epoch": 0.06280882204647277, + "grad_norm": 0.8626124262809753, + "learning_rate": 1.9943670319709162e-05, + "loss": 1.0846, + "step": 4360 + }, + { + "epoch": 0.06295287897777202, + "grad_norm": 0.8770148158073425, + "learning_rate": 1.994317471425727e-05, + "loss": 1.0782, + "step": 4370 + }, + { + "epoch": 0.06309693590907127, + "grad_norm": 0.9757588505744934, + "learning_rate": 1.9942676944313416e-05, + "loss": 1.0663, + "step": 4380 + }, + { + "epoch": 0.06324099284037052, + "grad_norm": 0.8641564249992371, + "learning_rate": 1.9942177009985947e-05, + "loss": 1.0705, + "step": 4390 + }, + { + "epoch": 0.06338504977166977, + "grad_norm": 0.9755296111106873, + "learning_rate": 1.99416749113837e-05, + "loss": 1.0659, + "step": 4400 + }, + { + "epoch": 0.06352910670296902, + "grad_norm": 0.9701070785522461, + "learning_rate": 1.9941170648615963e-05, + "loss": 1.0398, + "step": 4410 + }, + { + "epoch": 0.06367316363426827, + "grad_norm": 0.8417096138000488, + "learning_rate": 1.994066422179252e-05, + "loss": 1.0635, + "step": 4420 + }, + { + "epoch": 0.06381722056556752, + "grad_norm": 0.7876541018486023, + "learning_rate": 1.9940155631023604e-05, + "loss": 1.0863, + "step": 4430 + }, + { + "epoch": 0.06396127749686677, + "grad_norm": 0.8045748472213745, + "learning_rate": 1.9939644876419934e-05, + "loss": 1.0734, + "step": 4440 + }, + { + "epoch": 0.06410533442816602, + "grad_norm": 0.7743977904319763, + "learning_rate": 1.993913195809269e-05, + "loss": 1.0511, + "step": 4450 + }, + { + "epoch": 0.06424939135946527, + "grad_norm": 0.6236658692359924, + "learning_rate": 1.9938616876153533e-05, + "loss": 1.044, + "step": 4460 + }, + { + "epoch": 0.06439344829076452, + "grad_norm": 0.8350549936294556, + "learning_rate": 1.9938099630714582e-05, + "loss": 1.0797, + "step": 4470 + }, + { + "epoch": 0.06453750522206377, + "grad_norm": 0.8814065456390381, + "learning_rate": 1.9937580221888438e-05, + "loss": 1.0769, + "step": 4480 + }, + { + "epoch": 0.06468156215336301, + "grad_norm": 0.8198302984237671, + "learning_rate": 1.9937058649788167e-05, + "loss": 1.0507, + "step": 4490 + }, + { + "epoch": 0.06482561908466226, + "grad_norm": 0.8173579573631287, + "learning_rate": 1.9936534914527312e-05, + "loss": 1.0905, + "step": 4500 + }, + { + "epoch": 0.06496967601596151, + "grad_norm": 0.9264322519302368, + "learning_rate": 1.9936009016219883e-05, + "loss": 1.041, + "step": 4510 + }, + { + "epoch": 0.06511373294726076, + "grad_norm": 0.8179509043693542, + "learning_rate": 1.9935480954980352e-05, + "loss": 1.0739, + "step": 4520 + }, + { + "epoch": 0.06525778987856001, + "grad_norm": 0.8666450381278992, + "learning_rate": 1.993495073092368e-05, + "loss": 1.0975, + "step": 4530 + }, + { + "epoch": 0.06540184680985926, + "grad_norm": 0.7168770432472229, + "learning_rate": 1.9934418344165288e-05, + "loss": 1.0763, + "step": 4540 + }, + { + "epoch": 0.06554590374115851, + "grad_norm": 0.6926397681236267, + "learning_rate": 1.9933883794821066e-05, + "loss": 1.0664, + "step": 4550 + }, + { + "epoch": 0.06568996067245776, + "grad_norm": 0.70762038230896, + "learning_rate": 1.9933347083007382e-05, + "loss": 1.0484, + "step": 4560 + }, + { + "epoch": 0.06583401760375701, + "grad_norm": 1.0127503871917725, + "learning_rate": 1.993280820884107e-05, + "loss": 1.0619, + "step": 4570 + }, + { + "epoch": 0.06597807453505626, + "grad_norm": 0.7766719460487366, + "learning_rate": 1.993226717243943e-05, + "loss": 1.0684, + "step": 4580 + }, + { + "epoch": 0.06612213146635551, + "grad_norm": 0.7988908886909485, + "learning_rate": 1.9931723973920245e-05, + "loss": 1.0403, + "step": 4590 + }, + { + "epoch": 0.06626618839765476, + "grad_norm": 0.7570084929466248, + "learning_rate": 1.9931178613401758e-05, + "loss": 1.0252, + "step": 4600 + }, + { + "epoch": 0.06641024532895401, + "grad_norm": 0.7890056371688843, + "learning_rate": 1.993063109100269e-05, + "loss": 1.0672, + "step": 4610 + }, + { + "epoch": 0.06655430226025325, + "grad_norm": 0.8622692823410034, + "learning_rate": 1.9930081406842222e-05, + "loss": 1.0702, + "step": 4620 + }, + { + "epoch": 0.0666983591915525, + "grad_norm": 0.7100027203559875, + "learning_rate": 1.992952956104002e-05, + "loss": 1.0532, + "step": 4630 + }, + { + "epoch": 0.06684241612285174, + "grad_norm": 0.6402877569198608, + "learning_rate": 1.992897555371621e-05, + "loss": 1.0754, + "step": 4640 + }, + { + "epoch": 0.066986473054151, + "grad_norm": 0.9325397610664368, + "learning_rate": 1.992841938499139e-05, + "loss": 1.0463, + "step": 4650 + }, + { + "epoch": 0.06713052998545024, + "grad_norm": 0.816155731678009, + "learning_rate": 1.9927861054986634e-05, + "loss": 1.0635, + "step": 4660 + }, + { + "epoch": 0.0672745869167495, + "grad_norm": 0.6558541059494019, + "learning_rate": 1.9927300563823485e-05, + "loss": 1.066, + "step": 4670 + }, + { + "epoch": 0.06741864384804874, + "grad_norm": 0.7550584077835083, + "learning_rate": 1.9926737911623947e-05, + "loss": 1.0356, + "step": 4680 + }, + { + "epoch": 0.06756270077934799, + "grad_norm": 0.8332971334457397, + "learning_rate": 1.9926173098510503e-05, + "loss": 1.0542, + "step": 4690 + }, + { + "epoch": 0.06770675771064724, + "grad_norm": 0.7391961812973022, + "learning_rate": 1.9925606124606113e-05, + "loss": 1.0795, + "step": 4700 + }, + { + "epoch": 0.06785081464194649, + "grad_norm": 0.8095934391021729, + "learning_rate": 1.992503699003419e-05, + "loss": 1.0627, + "step": 4710 + }, + { + "epoch": 0.06799487157324574, + "grad_norm": 0.8879664540290833, + "learning_rate": 1.992446569491863e-05, + "loss": 1.0445, + "step": 4720 + }, + { + "epoch": 0.06813892850454499, + "grad_norm": 0.8452285528182983, + "learning_rate": 1.99238922393838e-05, + "loss": 1.0752, + "step": 4730 + }, + { + "epoch": 0.06828298543584424, + "grad_norm": 0.8504966497421265, + "learning_rate": 1.9923316623554522e-05, + "loss": 1.0659, + "step": 4740 + }, + { + "epoch": 0.06842704236714349, + "grad_norm": 0.8374481201171875, + "learning_rate": 1.992273884755611e-05, + "loss": 1.0657, + "step": 4750 + }, + { + "epoch": 0.06857109929844274, + "grad_norm": 0.7134926319122314, + "learning_rate": 1.992215891151434e-05, + "loss": 1.0682, + "step": 4760 + }, + { + "epoch": 0.06871515622974199, + "grad_norm": 0.7555819749832153, + "learning_rate": 1.9921576815555448e-05, + "loss": 1.0763, + "step": 4770 + }, + { + "epoch": 0.06885921316104124, + "grad_norm": 0.6944034695625305, + "learning_rate": 1.992099255980615e-05, + "loss": 1.0558, + "step": 4780 + }, + { + "epoch": 0.06900327009234049, + "grad_norm": 0.7401833534240723, + "learning_rate": 1.9920406144393635e-05, + "loss": 1.0475, + "step": 4790 + }, + { + "epoch": 0.06914732702363974, + "grad_norm": 1.1073486804962158, + "learning_rate": 1.991981756944555e-05, + "loss": 1.0552, + "step": 4800 + }, + { + "epoch": 0.06929138395493899, + "grad_norm": 0.7874646186828613, + "learning_rate": 1.9919226835090025e-05, + "loss": 1.0692, + "step": 4810 + }, + { + "epoch": 0.06943544088623824, + "grad_norm": 0.7678020596504211, + "learning_rate": 1.991863394145566e-05, + "loss": 1.0719, + "step": 4820 + }, + { + "epoch": 0.06957949781753749, + "grad_norm": 0.7626875638961792, + "learning_rate": 1.9918038888671507e-05, + "loss": 1.054, + "step": 4830 + }, + { + "epoch": 0.06972355474883674, + "grad_norm": 0.8412059545516968, + "learning_rate": 1.991744167686711e-05, + "loss": 1.0566, + "step": 4840 + }, + { + "epoch": 0.06986761168013599, + "grad_norm": 0.8255806565284729, + "learning_rate": 1.991684230617247e-05, + "loss": 1.0711, + "step": 4850 + }, + { + "epoch": 0.07001166861143523, + "grad_norm": 0.7971453070640564, + "learning_rate": 1.9916240776718063e-05, + "loss": 1.0303, + "step": 4860 + }, + { + "epoch": 0.07015572554273448, + "grad_norm": 0.7085686326026917, + "learning_rate": 1.9915637088634836e-05, + "loss": 1.0476, + "step": 4870 + }, + { + "epoch": 0.07029978247403373, + "grad_norm": 0.7923370599746704, + "learning_rate": 1.99150312420542e-05, + "loss": 1.0587, + "step": 4880 + }, + { + "epoch": 0.07044383940533298, + "grad_norm": 0.827341616153717, + "learning_rate": 1.991442323710804e-05, + "loss": 1.064, + "step": 4890 + }, + { + "epoch": 0.07058789633663223, + "grad_norm": 0.7372671961784363, + "learning_rate": 1.9913813073928708e-05, + "loss": 1.0889, + "step": 4900 + }, + { + "epoch": 0.07073195326793148, + "grad_norm": 0.7749466300010681, + "learning_rate": 1.9913200752649033e-05, + "loss": 1.0859, + "step": 4910 + }, + { + "epoch": 0.07087601019923073, + "grad_norm": 0.9195302724838257, + "learning_rate": 1.9912586273402308e-05, + "loss": 1.0787, + "step": 4920 + }, + { + "epoch": 0.07102006713052998, + "grad_norm": 0.7387667894363403, + "learning_rate": 1.9911969636322297e-05, + "loss": 1.0395, + "step": 4930 + }, + { + "epoch": 0.07116412406182923, + "grad_norm": 0.7745783925056458, + "learning_rate": 1.9911350841543225e-05, + "loss": 1.0518, + "step": 4940 + }, + { + "epoch": 0.07130818099312848, + "grad_norm": 0.9240629076957703, + "learning_rate": 1.991072988919981e-05, + "loss": 1.0497, + "step": 4950 + }, + { + "epoch": 0.07145223792442773, + "grad_norm": 0.7768181562423706, + "learning_rate": 1.9910106779427213e-05, + "loss": 1.0539, + "step": 4960 + }, + { + "epoch": 0.07159629485572698, + "grad_norm": 0.6498637795448303, + "learning_rate": 1.9909481512361083e-05, + "loss": 1.0643, + "step": 4970 + }, + { + "epoch": 0.07174035178702623, + "grad_norm": 0.8959274291992188, + "learning_rate": 1.9908854088137525e-05, + "loss": 1.0548, + "step": 4980 + }, + { + "epoch": 0.07188440871832548, + "grad_norm": 0.9232971668243408, + "learning_rate": 1.9908224506893124e-05, + "loss": 1.0458, + "step": 4990 + }, + { + "epoch": 0.07202846564962473, + "grad_norm": 0.599284827709198, + "learning_rate": 1.9907592768764935e-05, + "loss": 1.0692, + "step": 5000 + }, + { + "epoch": 0.07217252258092398, + "grad_norm": 0.7615419626235962, + "learning_rate": 1.9906958873890474e-05, + "loss": 1.0357, + "step": 5010 + }, + { + "epoch": 0.07231657951222323, + "grad_norm": 0.9292043447494507, + "learning_rate": 1.9906322822407734e-05, + "loss": 1.0751, + "step": 5020 + }, + { + "epoch": 0.07246063644352248, + "grad_norm": 0.6925828456878662, + "learning_rate": 1.990568461445517e-05, + "loss": 1.0652, + "step": 5030 + }, + { + "epoch": 0.07260469337482173, + "grad_norm": 0.8829536437988281, + "learning_rate": 1.990504425017172e-05, + "loss": 1.0627, + "step": 5040 + }, + { + "epoch": 0.07274875030612098, + "grad_norm": 0.967322826385498, + "learning_rate": 1.9904401729696775e-05, + "loss": 1.05, + "step": 5050 + }, + { + "epoch": 0.07289280723742023, + "grad_norm": 0.8069252371788025, + "learning_rate": 1.9903757053170203e-05, + "loss": 1.06, + "step": 5060 + }, + { + "epoch": 0.07303686416871948, + "grad_norm": 0.9855362772941589, + "learning_rate": 1.9903110220732344e-05, + "loss": 1.0571, + "step": 5070 + }, + { + "epoch": 0.07318092110001873, + "grad_norm": 0.764899492263794, + "learning_rate": 1.9902461232524003e-05, + "loss": 1.0853, + "step": 5080 + }, + { + "epoch": 0.07332497803131797, + "grad_norm": 0.7716745734214783, + "learning_rate": 1.9901810088686457e-05, + "loss": 1.0688, + "step": 5090 + }, + { + "epoch": 0.07346903496261722, + "grad_norm": 0.768663227558136, + "learning_rate": 1.990115678936145e-05, + "loss": 1.0718, + "step": 5100 + }, + { + "epoch": 0.07361309189391647, + "grad_norm": 0.859196126461029, + "learning_rate": 1.9900501334691194e-05, + "loss": 1.0867, + "step": 5110 + }, + { + "epoch": 0.07375714882521572, + "grad_norm": 0.7254055142402649, + "learning_rate": 1.9899843724818378e-05, + "loss": 1.059, + "step": 5120 + }, + { + "epoch": 0.07390120575651497, + "grad_norm": 0.7900877594947815, + "learning_rate": 1.9899183959886153e-05, + "loss": 1.0737, + "step": 5130 + }, + { + "epoch": 0.07404526268781422, + "grad_norm": 0.7576618790626526, + "learning_rate": 1.9898522040038136e-05, + "loss": 1.0433, + "step": 5140 + }, + { + "epoch": 0.07418931961911347, + "grad_norm": 0.7771943807601929, + "learning_rate": 1.9897857965418423e-05, + "loss": 1.0622, + "step": 5150 + }, + { + "epoch": 0.07433337655041272, + "grad_norm": 0.6736606955528259, + "learning_rate": 1.989719173617157e-05, + "loss": 1.0559, + "step": 5160 + }, + { + "epoch": 0.07447743348171197, + "grad_norm": 0.8990259170532227, + "learning_rate": 1.9896523352442608e-05, + "loss": 1.0605, + "step": 5170 + }, + { + "epoch": 0.07462149041301122, + "grad_norm": 0.7779847979545593, + "learning_rate": 1.9895852814377034e-05, + "loss": 1.0855, + "step": 5180 + }, + { + "epoch": 0.07476554734431047, + "grad_norm": 0.8152965903282166, + "learning_rate": 1.989518012212082e-05, + "loss": 1.0682, + "step": 5190 + }, + { + "epoch": 0.07490960427560972, + "grad_norm": 0.7711450457572937, + "learning_rate": 1.9894505275820393e-05, + "loss": 1.0408, + "step": 5200 + }, + { + "epoch": 0.07505366120690897, + "grad_norm": 0.7221505641937256, + "learning_rate": 1.9893828275622663e-05, + "loss": 1.0547, + "step": 5210 + }, + { + "epoch": 0.07519771813820822, + "grad_norm": 1.1068379878997803, + "learning_rate": 1.9893149121675004e-05, + "loss": 1.0609, + "step": 5220 + }, + { + "epoch": 0.07534177506950747, + "grad_norm": 0.8413456678390503, + "learning_rate": 1.9892467814125256e-05, + "loss": 1.0622, + "step": 5230 + }, + { + "epoch": 0.07548583200080672, + "grad_norm": 0.8818542957305908, + "learning_rate": 1.9891784353121735e-05, + "loss": 1.0667, + "step": 5240 + }, + { + "epoch": 0.07562988893210597, + "grad_norm": 0.7753092646598816, + "learning_rate": 1.9891098738813214e-05, + "loss": 1.0343, + "step": 5250 + }, + { + "epoch": 0.07577394586340522, + "grad_norm": 0.6563664078712463, + "learning_rate": 1.9890410971348947e-05, + "loss": 1.0512, + "step": 5260 + }, + { + "epoch": 0.07591800279470447, + "grad_norm": 0.7327760457992554, + "learning_rate": 1.9889721050878648e-05, + "loss": 1.0648, + "step": 5270 + }, + { + "epoch": 0.07606205972600372, + "grad_norm": 0.7993945479393005, + "learning_rate": 1.9889028977552507e-05, + "loss": 1.0633, + "step": 5280 + }, + { + "epoch": 0.07620611665730297, + "grad_norm": 0.7560327649116516, + "learning_rate": 1.9888334751521174e-05, + "loss": 1.0747, + "step": 5290 + }, + { + "epoch": 0.07635017358860222, + "grad_norm": 0.7332075834274292, + "learning_rate": 1.988763837293578e-05, + "loss": 1.034, + "step": 5300 + }, + { + "epoch": 0.07649423051990147, + "grad_norm": 0.7934123873710632, + "learning_rate": 1.9886939841947908e-05, + "loss": 1.064, + "step": 5310 + }, + { + "epoch": 0.07663828745120072, + "grad_norm": 0.7081010341644287, + "learning_rate": 1.9886239158709623e-05, + "loss": 1.056, + "step": 5320 + }, + { + "epoch": 0.07678234438249996, + "grad_norm": 0.7036541700363159, + "learning_rate": 1.988553632337346e-05, + "loss": 1.0861, + "step": 5330 + }, + { + "epoch": 0.07692640131379921, + "grad_norm": 0.7646002769470215, + "learning_rate": 1.9884831336092402e-05, + "loss": 1.0845, + "step": 5340 + }, + { + "epoch": 0.07707045824509846, + "grad_norm": 0.7128923535346985, + "learning_rate": 1.988412419701993e-05, + "loss": 1.0437, + "step": 5350 + }, + { + "epoch": 0.07721451517639771, + "grad_norm": 0.8597097396850586, + "learning_rate": 1.9883414906309968e-05, + "loss": 1.043, + "step": 5360 + }, + { + "epoch": 0.07735857210769696, + "grad_norm": 0.8186230063438416, + "learning_rate": 1.9882703464116925e-05, + "loss": 1.0641, + "step": 5370 + }, + { + "epoch": 0.07750262903899621, + "grad_norm": 0.6041313409805298, + "learning_rate": 1.988198987059567e-05, + "loss": 1.0499, + "step": 5380 + }, + { + "epoch": 0.07764668597029546, + "grad_norm": 0.6971914172172546, + "learning_rate": 1.988127412590154e-05, + "loss": 1.0587, + "step": 5390 + }, + { + "epoch": 0.07779074290159471, + "grad_norm": 0.7470895648002625, + "learning_rate": 1.9880556230190345e-05, + "loss": 1.0452, + "step": 5400 + }, + { + "epoch": 0.07793479983289396, + "grad_norm": 0.9479622840881348, + "learning_rate": 1.987983618361836e-05, + "loss": 1.0646, + "step": 5410 + }, + { + "epoch": 0.07807885676419321, + "grad_norm": 0.7283429503440857, + "learning_rate": 1.9879113986342335e-05, + "loss": 1.0643, + "step": 5420 + }, + { + "epoch": 0.07822291369549246, + "grad_norm": 0.7103187441825867, + "learning_rate": 1.9878389638519473e-05, + "loss": 1.0577, + "step": 5430 + }, + { + "epoch": 0.07836697062679171, + "grad_norm": 0.7464463114738464, + "learning_rate": 1.987766314030746e-05, + "loss": 1.044, + "step": 5440 + }, + { + "epoch": 0.07851102755809096, + "grad_norm": 0.7824941277503967, + "learning_rate": 1.9876934491864442e-05, + "loss": 1.0483, + "step": 5450 + }, + { + "epoch": 0.07865508448939021, + "grad_norm": 0.6849684119224548, + "learning_rate": 1.9876203693349044e-05, + "loss": 1.0661, + "step": 5460 + }, + { + "epoch": 0.07879914142068946, + "grad_norm": 0.7476369142532349, + "learning_rate": 1.987547074492034e-05, + "loss": 1.0597, + "step": 5470 + }, + { + "epoch": 0.07894319835198871, + "grad_norm": 0.818276584148407, + "learning_rate": 1.987473564673789e-05, + "loss": 1.0407, + "step": 5480 + }, + { + "epoch": 0.07908725528328796, + "grad_norm": 0.7284795641899109, + "learning_rate": 1.9873998398961708e-05, + "loss": 1.06, + "step": 5490 + }, + { + "epoch": 0.07923131221458721, + "grad_norm": 0.7441176176071167, + "learning_rate": 1.9873259001752284e-05, + "loss": 1.0547, + "step": 5500 + }, + { + "epoch": 0.07937536914588646, + "grad_norm": 0.922211766242981, + "learning_rate": 1.9872517455270583e-05, + "loss": 1.0634, + "step": 5510 + }, + { + "epoch": 0.0795194260771857, + "grad_norm": 0.7370014190673828, + "learning_rate": 1.987177375967802e-05, + "loss": 1.0553, + "step": 5520 + }, + { + "epoch": 0.07966348300848496, + "grad_norm": 0.7166247963905334, + "learning_rate": 1.987102791513649e-05, + "loss": 1.0467, + "step": 5530 + }, + { + "epoch": 0.0798075399397842, + "grad_norm": 0.7569652199745178, + "learning_rate": 1.9870279921808356e-05, + "loss": 1.0489, + "step": 5540 + }, + { + "epoch": 0.07995159687108346, + "grad_norm": 0.7443186044692993, + "learning_rate": 1.9869529779856448e-05, + "loss": 1.062, + "step": 5550 + }, + { + "epoch": 0.0800956538023827, + "grad_norm": 0.7470227479934692, + "learning_rate": 1.9868777489444052e-05, + "loss": 1.055, + "step": 5560 + }, + { + "epoch": 0.08023971073368195, + "grad_norm": 1.0605957508087158, + "learning_rate": 1.9868023050734935e-05, + "loss": 1.0612, + "step": 5570 + }, + { + "epoch": 0.0803837676649812, + "grad_norm": 0.7970819473266602, + "learning_rate": 1.986726646389333e-05, + "loss": 1.0296, + "step": 5580 + }, + { + "epoch": 0.08052782459628045, + "grad_norm": 0.7080398201942444, + "learning_rate": 1.9866507729083937e-05, + "loss": 1.0654, + "step": 5590 + }, + { + "epoch": 0.0806718815275797, + "grad_norm": 0.7669221758842468, + "learning_rate": 1.986574684647192e-05, + "loss": 1.0545, + "step": 5600 + }, + { + "epoch": 0.08081593845887895, + "grad_norm": 0.9112254977226257, + "learning_rate": 1.986498381622291e-05, + "loss": 1.0693, + "step": 5610 + }, + { + "epoch": 0.0809599953901782, + "grad_norm": 0.851102888584137, + "learning_rate": 1.9864218638503012e-05, + "loss": 1.0597, + "step": 5620 + }, + { + "epoch": 0.08110405232147745, + "grad_norm": 0.764011561870575, + "learning_rate": 1.9863451313478796e-05, + "loss": 1.0503, + "step": 5630 + }, + { + "epoch": 0.0812481092527767, + "grad_norm": 0.6748458743095398, + "learning_rate": 1.986268184131729e-05, + "loss": 1.0549, + "step": 5640 + }, + { + "epoch": 0.08139216618407595, + "grad_norm": 0.6882784366607666, + "learning_rate": 1.9861910222186006e-05, + "loss": 1.0698, + "step": 5650 + }, + { + "epoch": 0.0815362231153752, + "grad_norm": 0.7472461462020874, + "learning_rate": 1.9861136456252912e-05, + "loss": 1.0662, + "step": 5660 + }, + { + "epoch": 0.08168028004667445, + "grad_norm": 0.7899004817008972, + "learning_rate": 1.986036054368645e-05, + "loss": 1.0491, + "step": 5670 + }, + { + "epoch": 0.0818243369779737, + "grad_norm": 0.8129688501358032, + "learning_rate": 1.9859582484655515e-05, + "loss": 1.0488, + "step": 5680 + }, + { + "epoch": 0.08196839390927295, + "grad_norm": 0.8368383049964905, + "learning_rate": 1.9858802279329486e-05, + "loss": 1.0577, + "step": 5690 + }, + { + "epoch": 0.0821124508405722, + "grad_norm": 0.7655535936355591, + "learning_rate": 1.9858019927878204e-05, + "loss": 1.0549, + "step": 5700 + }, + { + "epoch": 0.08225650777187145, + "grad_norm": 0.8787673711776733, + "learning_rate": 1.985723543047198e-05, + "loss": 1.0851, + "step": 5710 + }, + { + "epoch": 0.0824005647031707, + "grad_norm": 0.6964054703712463, + "learning_rate": 1.9856448787281576e-05, + "loss": 1.0309, + "step": 5720 + }, + { + "epoch": 0.08254462163446995, + "grad_norm": 0.7971175909042358, + "learning_rate": 1.9855659998478245e-05, + "loss": 1.0492, + "step": 5730 + }, + { + "epoch": 0.0826886785657692, + "grad_norm": 0.8056570887565613, + "learning_rate": 1.9854869064233694e-05, + "loss": 1.0761, + "step": 5740 + }, + { + "epoch": 0.08283273549706845, + "grad_norm": 0.8324787020683289, + "learning_rate": 1.985407598472009e-05, + "loss": 1.0696, + "step": 5750 + }, + { + "epoch": 0.0829767924283677, + "grad_norm": 0.863656222820282, + "learning_rate": 1.9853280760110084e-05, + "loss": 1.0613, + "step": 5760 + }, + { + "epoch": 0.08312084935966695, + "grad_norm": 0.7228241562843323, + "learning_rate": 1.985248339057678e-05, + "loss": 1.0698, + "step": 5770 + }, + { + "epoch": 0.0832649062909662, + "grad_norm": 0.7174868583679199, + "learning_rate": 1.985168387629376e-05, + "loss": 1.0546, + "step": 5780 + }, + { + "epoch": 0.08340896322226544, + "grad_norm": 0.9294284582138062, + "learning_rate": 1.9850882217435063e-05, + "loss": 1.0556, + "step": 5790 + }, + { + "epoch": 0.0835530201535647, + "grad_norm": 0.8399975299835205, + "learning_rate": 1.9850078414175203e-05, + "loss": 1.0577, + "step": 5800 + }, + { + "epoch": 0.08369707708486394, + "grad_norm": 0.6481358408927917, + "learning_rate": 1.984927246668915e-05, + "loss": 1.0627, + "step": 5810 + }, + { + "epoch": 0.0838411340161632, + "grad_norm": 0.7580750584602356, + "learning_rate": 1.9848464375152356e-05, + "loss": 1.0733, + "step": 5820 + }, + { + "epoch": 0.08398519094746244, + "grad_norm": 0.5967349410057068, + "learning_rate": 1.984765413974072e-05, + "loss": 1.0292, + "step": 5830 + }, + { + "epoch": 0.08412924787876169, + "grad_norm": 0.7164834141731262, + "learning_rate": 1.9846841760630633e-05, + "loss": 1.0483, + "step": 5840 + }, + { + "epoch": 0.08427330481006094, + "grad_norm": 0.755739688873291, + "learning_rate": 1.984602723799893e-05, + "loss": 1.0507, + "step": 5850 + }, + { + "epoch": 0.08441736174136019, + "grad_norm": 1.1750564575195312, + "learning_rate": 1.9845210572022923e-05, + "loss": 1.0873, + "step": 5860 + }, + { + "epoch": 0.08456141867265944, + "grad_norm": 0.7422271966934204, + "learning_rate": 1.984439176288039e-05, + "loss": 1.0529, + "step": 5870 + }, + { + "epoch": 0.08470547560395869, + "grad_norm": 0.8092758059501648, + "learning_rate": 1.984357081074957e-05, + "loss": 1.0675, + "step": 5880 + }, + { + "epoch": 0.08484953253525794, + "grad_norm": 0.8689402341842651, + "learning_rate": 1.984274771580918e-05, + "loss": 1.0583, + "step": 5890 + }, + { + "epoch": 0.08499358946655719, + "grad_norm": 0.7116698622703552, + "learning_rate": 1.9841922478238394e-05, + "loss": 1.0674, + "step": 5900 + }, + { + "epoch": 0.08513764639785644, + "grad_norm": 0.7035679817199707, + "learning_rate": 1.9841095098216853e-05, + "loss": 1.0587, + "step": 5910 + }, + { + "epoch": 0.08528170332915569, + "grad_norm": 0.6325827240943909, + "learning_rate": 1.9840265575924665e-05, + "loss": 1.0675, + "step": 5920 + }, + { + "epoch": 0.08542576026045492, + "grad_norm": 0.7157877087593079, + "learning_rate": 1.9839433911542413e-05, + "loss": 1.0396, + "step": 5930 + }, + { + "epoch": 0.08556981719175417, + "grad_norm": 0.7110846042633057, + "learning_rate": 1.9838600105251128e-05, + "loss": 1.0472, + "step": 5940 + }, + { + "epoch": 0.08571387412305342, + "grad_norm": 0.7248791456222534, + "learning_rate": 1.9837764157232327e-05, + "loss": 1.072, + "step": 5950 + }, + { + "epoch": 0.08585793105435267, + "grad_norm": 0.7378261685371399, + "learning_rate": 1.983692606766798e-05, + "loss": 1.0503, + "step": 5960 + }, + { + "epoch": 0.08600198798565192, + "grad_norm": 0.9149982333183289, + "learning_rate": 1.9836085836740525e-05, + "loss": 1.0747, + "step": 5970 + }, + { + "epoch": 0.08614604491695117, + "grad_norm": 0.6886637806892395, + "learning_rate": 1.9835243464632876e-05, + "loss": 1.0571, + "step": 5980 + }, + { + "epoch": 0.08629010184825042, + "grad_norm": 0.7848793268203735, + "learning_rate": 1.9834398951528402e-05, + "loss": 1.084, + "step": 5990 + }, + { + "epoch": 0.08643415877954967, + "grad_norm": 0.7105270624160767, + "learning_rate": 1.9833552297610943e-05, + "loss": 1.0671, + "step": 6000 + }, + { + "epoch": 0.08657821571084892, + "grad_norm": 0.7114749550819397, + "learning_rate": 1.98327035030648e-05, + "loss": 1.0806, + "step": 6010 + }, + { + "epoch": 0.08672227264214817, + "grad_norm": 0.8656984567642212, + "learning_rate": 1.9831852568074746e-05, + "loss": 1.0563, + "step": 6020 + }, + { + "epoch": 0.08686632957344742, + "grad_norm": 0.8152972459793091, + "learning_rate": 1.9830999492826023e-05, + "loss": 1.069, + "step": 6030 + }, + { + "epoch": 0.08701038650474667, + "grad_norm": 0.6695161461830139, + "learning_rate": 1.9830144277504323e-05, + "loss": 1.0601, + "step": 6040 + }, + { + "epoch": 0.08715444343604592, + "grad_norm": 0.6562400460243225, + "learning_rate": 1.9829286922295824e-05, + "loss": 1.0528, + "step": 6050 + }, + { + "epoch": 0.08729850036734517, + "grad_norm": 0.7877181172370911, + "learning_rate": 1.982842742738716e-05, + "loss": 1.0629, + "step": 6060 + }, + { + "epoch": 0.08744255729864442, + "grad_norm": 0.6838402152061462, + "learning_rate": 1.9827565792965422e-05, + "loss": 1.0629, + "step": 6070 + }, + { + "epoch": 0.08758661422994367, + "grad_norm": 0.8361284136772156, + "learning_rate": 1.9826702019218188e-05, + "loss": 1.054, + "step": 6080 + }, + { + "epoch": 0.08773067116124292, + "grad_norm": 0.6967858076095581, + "learning_rate": 1.982583610633348e-05, + "loss": 1.0565, + "step": 6090 + }, + { + "epoch": 0.08787472809254217, + "grad_norm": 0.8265675902366638, + "learning_rate": 1.98249680544998e-05, + "loss": 1.054, + "step": 6100 + }, + { + "epoch": 0.08801878502384142, + "grad_norm": 0.7994257211685181, + "learning_rate": 1.9824097863906112e-05, + "loss": 1.0718, + "step": 6110 + }, + { + "epoch": 0.08816284195514067, + "grad_norm": 0.6750326752662659, + "learning_rate": 1.982322553474184e-05, + "loss": 1.0732, + "step": 6120 + }, + { + "epoch": 0.08830689888643992, + "grad_norm": 0.6326549649238586, + "learning_rate": 1.982235106719688e-05, + "loss": 1.0518, + "step": 6130 + }, + { + "epoch": 0.08845095581773917, + "grad_norm": 0.7757358551025391, + "learning_rate": 1.9821474461461593e-05, + "loss": 1.0456, + "step": 6140 + }, + { + "epoch": 0.08859501274903842, + "grad_norm": 0.7172616124153137, + "learning_rate": 1.9820595717726806e-05, + "loss": 1.0514, + "step": 6150 + }, + { + "epoch": 0.08873906968033766, + "grad_norm": 0.7145668864250183, + "learning_rate": 1.9819714836183805e-05, + "loss": 1.0576, + "step": 6160 + }, + { + "epoch": 0.08888312661163691, + "grad_norm": 0.601446270942688, + "learning_rate": 1.9818831817024344e-05, + "loss": 1.0352, + "step": 6170 + }, + { + "epoch": 0.08902718354293616, + "grad_norm": 0.9036988019943237, + "learning_rate": 1.9817946660440647e-05, + "loss": 1.0571, + "step": 6180 + }, + { + "epoch": 0.08917124047423541, + "grad_norm": 0.7691981792449951, + "learning_rate": 1.9817059366625406e-05, + "loss": 1.0452, + "step": 6190 + }, + { + "epoch": 0.08931529740553466, + "grad_norm": 0.8221045732498169, + "learning_rate": 1.9816169935771765e-05, + "loss": 1.0567, + "step": 6200 + }, + { + "epoch": 0.08945935433683391, + "grad_norm": 0.8755362629890442, + "learning_rate": 1.9815278368073342e-05, + "loss": 1.056, + "step": 6210 + }, + { + "epoch": 0.08960341126813316, + "grad_norm": 0.8427380323410034, + "learning_rate": 1.981438466372422e-05, + "loss": 1.0557, + "step": 6220 + }, + { + "epoch": 0.08974746819943241, + "grad_norm": 0.7444628477096558, + "learning_rate": 1.9813488822918946e-05, + "loss": 1.0886, + "step": 6230 + }, + { + "epoch": 0.08989152513073166, + "grad_norm": 0.6707512736320496, + "learning_rate": 1.9812590845852532e-05, + "loss": 1.059, + "step": 6240 + }, + { + "epoch": 0.09003558206203091, + "grad_norm": 0.7399901151657104, + "learning_rate": 1.981169073272046e-05, + "loss": 1.0475, + "step": 6250 + }, + { + "epoch": 0.09017963899333016, + "grad_norm": 0.7267374396324158, + "learning_rate": 1.9810788483718663e-05, + "loss": 1.0647, + "step": 6260 + }, + { + "epoch": 0.09032369592462941, + "grad_norm": 0.7055925130844116, + "learning_rate": 1.9809884099043553e-05, + "loss": 1.053, + "step": 6270 + }, + { + "epoch": 0.09046775285592866, + "grad_norm": 0.7201550602912903, + "learning_rate": 1.9808977578892006e-05, + "loss": 1.0635, + "step": 6280 + }, + { + "epoch": 0.09061180978722791, + "grad_norm": 0.9184608459472656, + "learning_rate": 1.9808068923461354e-05, + "loss": 1.082, + "step": 6290 + }, + { + "epoch": 0.09075586671852716, + "grad_norm": 0.6973926424980164, + "learning_rate": 1.9807158132949398e-05, + "loss": 1.07, + "step": 6300 + }, + { + "epoch": 0.09089992364982641, + "grad_norm": 0.6459714770317078, + "learning_rate": 1.9806245207554405e-05, + "loss": 1.0435, + "step": 6310 + }, + { + "epoch": 0.09104398058112566, + "grad_norm": 0.8179060220718384, + "learning_rate": 1.980533014747511e-05, + "loss": 1.0539, + "step": 6320 + }, + { + "epoch": 0.09118803751242491, + "grad_norm": 0.7989359498023987, + "learning_rate": 1.9804412952910706e-05, + "loss": 1.0561, + "step": 6330 + }, + { + "epoch": 0.09133209444372416, + "grad_norm": 0.6431265473365784, + "learning_rate": 1.9803493624060857e-05, + "loss": 1.0448, + "step": 6340 + }, + { + "epoch": 0.0914761513750234, + "grad_norm": 0.7607747316360474, + "learning_rate": 1.980257216112568e-05, + "loss": 1.0649, + "step": 6350 + }, + { + "epoch": 0.09162020830632266, + "grad_norm": 0.6712044477462769, + "learning_rate": 1.9801648564305772e-05, + "loss": 1.0598, + "step": 6360 + }, + { + "epoch": 0.0917642652376219, + "grad_norm": 0.8743002414703369, + "learning_rate": 1.9800722833802186e-05, + "loss": 1.0421, + "step": 6370 + }, + { + "epoch": 0.09190832216892116, + "grad_norm": 0.7426406145095825, + "learning_rate": 1.979979496981644e-05, + "loss": 1.0655, + "step": 6380 + }, + { + "epoch": 0.0920523791002204, + "grad_norm": 0.7609308958053589, + "learning_rate": 1.9798864972550517e-05, + "loss": 1.0717, + "step": 6390 + }, + { + "epoch": 0.09219643603151965, + "grad_norm": 0.7155079245567322, + "learning_rate": 1.9797932842206865e-05, + "loss": 1.0535, + "step": 6400 + }, + { + "epoch": 0.0923404929628189, + "grad_norm": 0.6351951956748962, + "learning_rate": 1.9796998578988397e-05, + "loss": 1.0336, + "step": 6410 + }, + { + "epoch": 0.09248454989411815, + "grad_norm": 0.6947046518325806, + "learning_rate": 1.9796062183098485e-05, + "loss": 1.0268, + "step": 6420 + }, + { + "epoch": 0.0926286068254174, + "grad_norm": 0.7363710999488831, + "learning_rate": 1.9795123654740977e-05, + "loss": 1.0436, + "step": 6430 + }, + { + "epoch": 0.09277266375671665, + "grad_norm": 1.0118937492370605, + "learning_rate": 1.979418299412017e-05, + "loss": 1.0365, + "step": 6440 + }, + { + "epoch": 0.0929167206880159, + "grad_norm": 0.7479383945465088, + "learning_rate": 1.9793240201440835e-05, + "loss": 1.0552, + "step": 6450 + }, + { + "epoch": 0.09306077761931515, + "grad_norm": 0.6502164602279663, + "learning_rate": 1.9792295276908206e-05, + "loss": 1.0555, + "step": 6460 + }, + { + "epoch": 0.0932048345506144, + "grad_norm": 0.6710230708122253, + "learning_rate": 1.9791348220727983e-05, + "loss": 1.0578, + "step": 6470 + }, + { + "epoch": 0.09334889148191365, + "grad_norm": 0.7776168584823608, + "learning_rate": 1.9790399033106323e-05, + "loss": 1.0504, + "step": 6480 + }, + { + "epoch": 0.0934929484132129, + "grad_norm": 0.7499729990959167, + "learning_rate": 1.9789447714249853e-05, + "loss": 1.0592, + "step": 6490 + }, + { + "epoch": 0.09363700534451215, + "grad_norm": 0.6763216853141785, + "learning_rate": 1.978849426436566e-05, + "loss": 1.0794, + "step": 6500 + }, + { + "epoch": 0.0937810622758114, + "grad_norm": 0.700145959854126, + "learning_rate": 1.9787538683661295e-05, + "loss": 1.0542, + "step": 6510 + }, + { + "epoch": 0.09392511920711065, + "grad_norm": 0.632504403591156, + "learning_rate": 1.9786580972344784e-05, + "loss": 1.0477, + "step": 6520 + }, + { + "epoch": 0.0940691761384099, + "grad_norm": 0.7052755951881409, + "learning_rate": 1.9785621130624596e-05, + "loss": 1.0513, + "step": 6530 + }, + { + "epoch": 0.09421323306970915, + "grad_norm": 0.7767089009284973, + "learning_rate": 1.9784659158709683e-05, + "loss": 1.0499, + "step": 6540 + }, + { + "epoch": 0.0943572900010084, + "grad_norm": 0.7191714644432068, + "learning_rate": 1.9783695056809447e-05, + "loss": 1.0624, + "step": 6550 + }, + { + "epoch": 0.09450134693230765, + "grad_norm": 0.890355110168457, + "learning_rate": 1.9782728825133767e-05, + "loss": 1.0658, + "step": 6560 + }, + { + "epoch": 0.0946454038636069, + "grad_norm": 0.6689414978027344, + "learning_rate": 1.9781760463892974e-05, + "loss": 1.0637, + "step": 6570 + }, + { + "epoch": 0.09478946079490615, + "grad_norm": 0.7338350415229797, + "learning_rate": 1.978078997329787e-05, + "loss": 1.0445, + "step": 6580 + }, + { + "epoch": 0.0949335177262054, + "grad_norm": 0.6850501298904419, + "learning_rate": 1.977981735355971e-05, + "loss": 1.064, + "step": 6590 + }, + { + "epoch": 0.09507757465750465, + "grad_norm": 0.7931393980979919, + "learning_rate": 1.977884260489023e-05, + "loss": 1.0421, + "step": 6600 + }, + { + "epoch": 0.0952216315888039, + "grad_norm": 0.727503776550293, + "learning_rate": 1.977786572750161e-05, + "loss": 1.0398, + "step": 6610 + }, + { + "epoch": 0.09536568852010315, + "grad_norm": 0.7935442328453064, + "learning_rate": 1.9776886721606506e-05, + "loss": 1.0414, + "step": 6620 + }, + { + "epoch": 0.0955097454514024, + "grad_norm": 0.8451200127601624, + "learning_rate": 1.977590558741804e-05, + "loss": 1.0521, + "step": 6630 + }, + { + "epoch": 0.09565380238270164, + "grad_norm": 0.6927206516265869, + "learning_rate": 1.977492232514978e-05, + "loss": 1.0659, + "step": 6640 + }, + { + "epoch": 0.0957978593140009, + "grad_norm": 0.6532142162322998, + "learning_rate": 1.977393693501578e-05, + "loss": 1.0525, + "step": 6650 + }, + { + "epoch": 0.09594191624530014, + "grad_norm": 0.8253028988838196, + "learning_rate": 1.977294941723054e-05, + "loss": 1.0569, + "step": 6660 + }, + { + "epoch": 0.09608597317659939, + "grad_norm": 0.7183331847190857, + "learning_rate": 1.9771959772009028e-05, + "loss": 1.0755, + "step": 6670 + }, + { + "epoch": 0.09623003010789864, + "grad_norm": 0.6303145885467529, + "learning_rate": 1.977096799956668e-05, + "loss": 1.0589, + "step": 6680 + }, + { + "epoch": 0.09637408703919789, + "grad_norm": 1.0233185291290283, + "learning_rate": 1.9769974100119383e-05, + "loss": 1.0494, + "step": 6690 + }, + { + "epoch": 0.09651814397049714, + "grad_norm": 0.7218311429023743, + "learning_rate": 1.9768978073883506e-05, + "loss": 1.0553, + "step": 6700 + }, + { + "epoch": 0.09666220090179639, + "grad_norm": 0.7105904817581177, + "learning_rate": 1.976797992107587e-05, + "loss": 1.0477, + "step": 6710 + }, + { + "epoch": 0.09680625783309564, + "grad_norm": 0.8148890733718872, + "learning_rate": 1.9766979641913746e-05, + "loss": 1.0741, + "step": 6720 + }, + { + "epoch": 0.09695031476439489, + "grad_norm": 0.766864538192749, + "learning_rate": 1.9765977236614894e-05, + "loss": 1.0546, + "step": 6730 + }, + { + "epoch": 0.09709437169569414, + "grad_norm": 0.6800231337547302, + "learning_rate": 1.9764972705397517e-05, + "loss": 1.033, + "step": 6740 + }, + { + "epoch": 0.09723842862699339, + "grad_norm": 0.6132718324661255, + "learning_rate": 1.9763966048480293e-05, + "loss": 1.0655, + "step": 6750 + }, + { + "epoch": 0.09738248555829264, + "grad_norm": 0.7320756912231445, + "learning_rate": 1.9762957266082355e-05, + "loss": 1.0537, + "step": 6760 + }, + { + "epoch": 0.09752654248959189, + "grad_norm": 0.7121520638465881, + "learning_rate": 1.9761946358423297e-05, + "loss": 1.0559, + "step": 6770 + }, + { + "epoch": 0.09767059942089114, + "grad_norm": 0.7753499150276184, + "learning_rate": 1.9760933325723185e-05, + "loss": 1.0666, + "step": 6780 + }, + { + "epoch": 0.09781465635219039, + "grad_norm": 0.7425855994224548, + "learning_rate": 1.975991816820254e-05, + "loss": 1.0108, + "step": 6790 + }, + { + "epoch": 0.09795871328348964, + "grad_norm": 0.7980513572692871, + "learning_rate": 1.9758900886082343e-05, + "loss": 1.0617, + "step": 6800 + }, + { + "epoch": 0.09810277021478889, + "grad_norm": 0.7671613097190857, + "learning_rate": 1.9757881479584055e-05, + "loss": 1.0852, + "step": 6810 + }, + { + "epoch": 0.09824682714608814, + "grad_norm": 0.7799088358879089, + "learning_rate": 1.9756859948929573e-05, + "loss": 1.0588, + "step": 6820 + }, + { + "epoch": 0.09839088407738739, + "grad_norm": 0.7418830394744873, + "learning_rate": 1.975583629434128e-05, + "loss": 1.0643, + "step": 6830 + }, + { + "epoch": 0.09853494100868664, + "grad_norm": 0.711216151714325, + "learning_rate": 1.9754810516042006e-05, + "loss": 1.0411, + "step": 6840 + }, + { + "epoch": 0.09867899793998589, + "grad_norm": 0.8446986675262451, + "learning_rate": 1.9753782614255047e-05, + "loss": 1.0508, + "step": 6850 + }, + { + "epoch": 0.09882305487128513, + "grad_norm": 0.6371778845787048, + "learning_rate": 1.9752752589204172e-05, + "loss": 1.0711, + "step": 6860 + }, + { + "epoch": 0.09896711180258438, + "grad_norm": 0.8106171488761902, + "learning_rate": 1.9751720441113594e-05, + "loss": 1.0396, + "step": 6870 + }, + { + "epoch": 0.09911116873388363, + "grad_norm": 0.7657573223114014, + "learning_rate": 1.9750686170208003e-05, + "loss": 1.0928, + "step": 6880 + }, + { + "epoch": 0.09925522566518288, + "grad_norm": 0.6844088435173035, + "learning_rate": 1.9749649776712542e-05, + "loss": 1.0425, + "step": 6890 + }, + { + "epoch": 0.09939928259648213, + "grad_norm": 0.8555119633674622, + "learning_rate": 1.974861126085282e-05, + "loss": 1.0587, + "step": 6900 + }, + { + "epoch": 0.09954333952778138, + "grad_norm": 0.5692336559295654, + "learning_rate": 1.9747570622854912e-05, + "loss": 1.0452, + "step": 6910 + }, + { + "epoch": 0.09968739645908063, + "grad_norm": 0.7701729536056519, + "learning_rate": 1.9746527862945347e-05, + "loss": 1.0363, + "step": 6920 + }, + { + "epoch": 0.09983145339037988, + "grad_norm": 0.7081140279769897, + "learning_rate": 1.974548298135112e-05, + "loss": 1.0609, + "step": 6930 + }, + { + "epoch": 0.09997551032167913, + "grad_norm": 0.7233031988143921, + "learning_rate": 1.9744435978299682e-05, + "loss": 1.0412, + "step": 6940 + }, + { + "epoch": 0.10011956725297838, + "grad_norm": 0.6908624768257141, + "learning_rate": 1.9743386854018963e-05, + "loss": 1.0434, + "step": 6950 + }, + { + "epoch": 0.10026362418427763, + "grad_norm": 0.6765705943107605, + "learning_rate": 1.974233560873733e-05, + "loss": 1.0525, + "step": 6960 + }, + { + "epoch": 0.10040768111557688, + "grad_norm": 0.670443058013916, + "learning_rate": 1.974128224268363e-05, + "loss": 1.0455, + "step": 6970 + }, + { + "epoch": 0.10055173804687613, + "grad_norm": 0.5861614346504211, + "learning_rate": 1.9740226756087174e-05, + "loss": 1.0495, + "step": 6980 + }, + { + "epoch": 0.10069579497817538, + "grad_norm": 0.6091853976249695, + "learning_rate": 1.9739169149177716e-05, + "loss": 1.0728, + "step": 6990 + }, + { + "epoch": 0.10083985190947463, + "grad_norm": 0.6173252463340759, + "learning_rate": 1.9738109422185485e-05, + "loss": 1.0552, + "step": 7000 + }, + { + "epoch": 0.10098390884077388, + "grad_norm": 0.741486132144928, + "learning_rate": 1.973704757534117e-05, + "loss": 1.0369, + "step": 7010 + }, + { + "epoch": 0.10112796577207313, + "grad_norm": 0.7461585402488708, + "learning_rate": 1.9735983608875918e-05, + "loss": 1.0615, + "step": 7020 + }, + { + "epoch": 0.10127202270337238, + "grad_norm": 0.637152910232544, + "learning_rate": 1.973491752302135e-05, + "loss": 1.0546, + "step": 7030 + }, + { + "epoch": 0.10141607963467163, + "grad_norm": 0.5802614092826843, + "learning_rate": 1.9733849318009523e-05, + "loss": 1.0381, + "step": 7040 + }, + { + "epoch": 0.10156013656597088, + "grad_norm": 0.7531036138534546, + "learning_rate": 1.973277899407298e-05, + "loss": 1.0521, + "step": 7050 + }, + { + "epoch": 0.10170419349727013, + "grad_norm": 0.8710116744041443, + "learning_rate": 1.973170655144472e-05, + "loss": 1.0503, + "step": 7060 + }, + { + "epoch": 0.10184825042856938, + "grad_norm": 0.8190781474113464, + "learning_rate": 1.9730631990358185e-05, + "loss": 1.0351, + "step": 7070 + }, + { + "epoch": 0.10199230735986863, + "grad_norm": 0.8092232942581177, + "learning_rate": 1.9729555311047305e-05, + "loss": 1.0359, + "step": 7080 + }, + { + "epoch": 0.10213636429116787, + "grad_norm": 0.8566160798072815, + "learning_rate": 1.972847651374645e-05, + "loss": 1.0538, + "step": 7090 + }, + { + "epoch": 0.10228042122246712, + "grad_norm": 0.7003466486930847, + "learning_rate": 1.972739559869046e-05, + "loss": 1.0398, + "step": 7100 + }, + { + "epoch": 0.10242447815376637, + "grad_norm": 0.6902493834495544, + "learning_rate": 1.972631256611464e-05, + "loss": 1.0338, + "step": 7110 + }, + { + "epoch": 0.10256853508506562, + "grad_norm": 0.5801345109939575, + "learning_rate": 1.972522741625475e-05, + "loss": 1.0658, + "step": 7120 + }, + { + "epoch": 0.10271259201636487, + "grad_norm": 0.6949053406715393, + "learning_rate": 1.972414014934701e-05, + "loss": 1.0514, + "step": 7130 + }, + { + "epoch": 0.10285664894766412, + "grad_norm": 0.6205598711967468, + "learning_rate": 1.9723050765628105e-05, + "loss": 1.0365, + "step": 7140 + }, + { + "epoch": 0.10300070587896337, + "grad_norm": 0.5499454140663147, + "learning_rate": 1.9721959265335178e-05, + "loss": 1.0578, + "step": 7150 + }, + { + "epoch": 0.10314476281026262, + "grad_norm": 0.7337759137153625, + "learning_rate": 1.972086564870583e-05, + "loss": 1.0707, + "step": 7160 + }, + { + "epoch": 0.10328881974156187, + "grad_norm": 0.8025751113891602, + "learning_rate": 1.9719769915978135e-05, + "loss": 1.0573, + "step": 7170 + }, + { + "epoch": 0.10343287667286112, + "grad_norm": 0.6096850037574768, + "learning_rate": 1.971867206739061e-05, + "loss": 1.063, + "step": 7180 + }, + { + "epoch": 0.10357693360416037, + "grad_norm": 0.7098557353019714, + "learning_rate": 1.9717572103182246e-05, + "loss": 1.0648, + "step": 7190 + }, + { + "epoch": 0.10372099053545962, + "grad_norm": 0.567997932434082, + "learning_rate": 1.9716470023592488e-05, + "loss": 1.065, + "step": 7200 + }, + { + "epoch": 0.10386504746675887, + "grad_norm": 0.6367721557617188, + "learning_rate": 1.971536582886125e-05, + "loss": 1.0476, + "step": 7210 + }, + { + "epoch": 0.10400910439805812, + "grad_norm": 0.7719098925590515, + "learning_rate": 1.9714259519228886e-05, + "loss": 1.0614, + "step": 7220 + }, + { + "epoch": 0.10415316132935737, + "grad_norm": 0.7801731824874878, + "learning_rate": 1.9713151094936238e-05, + "loss": 1.0761, + "step": 7230 + }, + { + "epoch": 0.10429721826065662, + "grad_norm": 0.6191163063049316, + "learning_rate": 1.971204055622459e-05, + "loss": 1.0425, + "step": 7240 + }, + { + "epoch": 0.10444127519195585, + "grad_norm": 0.6448741555213928, + "learning_rate": 1.971092790333569e-05, + "loss": 1.0406, + "step": 7250 + }, + { + "epoch": 0.1045853321232551, + "grad_norm": 0.7942621111869812, + "learning_rate": 1.9709813136511748e-05, + "loss": 1.0693, + "step": 7260 + }, + { + "epoch": 0.10472938905455435, + "grad_norm": 0.6539813280105591, + "learning_rate": 1.9708696255995433e-05, + "loss": 1.0699, + "step": 7270 + }, + { + "epoch": 0.1048734459858536, + "grad_norm": 0.7940925359725952, + "learning_rate": 1.9707577262029874e-05, + "loss": 1.0881, + "step": 7280 + }, + { + "epoch": 0.10501750291715285, + "grad_norm": 0.7164614200592041, + "learning_rate": 1.970645615485866e-05, + "loss": 1.0444, + "step": 7290 + }, + { + "epoch": 0.1051615598484521, + "grad_norm": 0.6418722867965698, + "learning_rate": 1.970533293472584e-05, + "loss": 1.0592, + "step": 7300 + }, + { + "epoch": 0.10530561677975135, + "grad_norm": 0.6691604256629944, + "learning_rate": 1.9704207601875933e-05, + "loss": 1.0534, + "step": 7310 + }, + { + "epoch": 0.1054496737110506, + "grad_norm": 0.6756848096847534, + "learning_rate": 1.9703080156553894e-05, + "loss": 1.0439, + "step": 7320 + }, + { + "epoch": 0.10559373064234985, + "grad_norm": 0.7866693735122681, + "learning_rate": 1.970195059900516e-05, + "loss": 1.045, + "step": 7330 + }, + { + "epoch": 0.1057377875736491, + "grad_norm": 0.6200616955757141, + "learning_rate": 1.970081892947562e-05, + "loss": 1.0357, + "step": 7340 + }, + { + "epoch": 0.10588184450494835, + "grad_norm": 0.733914315700531, + "learning_rate": 1.9699685148211626e-05, + "loss": 1.0638, + "step": 7350 + }, + { + "epoch": 0.1060259014362476, + "grad_norm": 0.6396364569664001, + "learning_rate": 1.9698549255459975e-05, + "loss": 1.0608, + "step": 7360 + }, + { + "epoch": 0.10616995836754685, + "grad_norm": 0.6390619874000549, + "learning_rate": 1.9697411251467947e-05, + "loss": 1.0624, + "step": 7370 + }, + { + "epoch": 0.1063140152988461, + "grad_norm": 0.6570200324058533, + "learning_rate": 1.9696271136483266e-05, + "loss": 1.0468, + "step": 7380 + }, + { + "epoch": 0.10645807223014535, + "grad_norm": 0.5907531380653381, + "learning_rate": 1.9695128910754115e-05, + "loss": 1.0317, + "step": 7390 + }, + { + "epoch": 0.1066021291614446, + "grad_norm": 0.8321852087974548, + "learning_rate": 1.9693984574529147e-05, + "loss": 1.0775, + "step": 7400 + }, + { + "epoch": 0.10674618609274385, + "grad_norm": 0.7287123203277588, + "learning_rate": 1.9692838128057466e-05, + "loss": 1.0651, + "step": 7410 + }, + { + "epoch": 0.1068902430240431, + "grad_norm": 0.7644389271736145, + "learning_rate": 1.9691689571588637e-05, + "loss": 1.0807, + "step": 7420 + }, + { + "epoch": 0.10703429995534235, + "grad_norm": 0.6191002130508423, + "learning_rate": 1.9690538905372685e-05, + "loss": 1.055, + "step": 7430 + }, + { + "epoch": 0.1071783568866416, + "grad_norm": 0.6928269863128662, + "learning_rate": 1.9689386129660093e-05, + "loss": 1.0293, + "step": 7440 + }, + { + "epoch": 0.10732241381794085, + "grad_norm": 0.6972824931144714, + "learning_rate": 1.968823124470181e-05, + "loss": 1.0602, + "step": 7450 + }, + { + "epoch": 0.1074664707492401, + "grad_norm": 0.6372748017311096, + "learning_rate": 1.968707425074923e-05, + "loss": 1.0496, + "step": 7460 + }, + { + "epoch": 0.10761052768053934, + "grad_norm": 0.7519180774688721, + "learning_rate": 1.968591514805422e-05, + "loss": 1.0782, + "step": 7470 + }, + { + "epoch": 0.1077545846118386, + "grad_norm": 0.7396631240844727, + "learning_rate": 1.96847539368691e-05, + "loss": 1.0467, + "step": 7480 + }, + { + "epoch": 0.10789864154313784, + "grad_norm": 0.8233817219734192, + "learning_rate": 1.9683590617446647e-05, + "loss": 1.033, + "step": 7490 + }, + { + "epoch": 0.1080426984744371, + "grad_norm": 0.6281062364578247, + "learning_rate": 1.9682425190040104e-05, + "loss": 1.0628, + "step": 7500 + }, + { + "epoch": 0.10818675540573634, + "grad_norm": 0.8033367991447449, + "learning_rate": 1.9681257654903164e-05, + "loss": 1.0604, + "step": 7510 + }, + { + "epoch": 0.10833081233703559, + "grad_norm": 0.6199148893356323, + "learning_rate": 1.968008801228999e-05, + "loss": 1.039, + "step": 7520 + }, + { + "epoch": 0.10847486926833484, + "grad_norm": 0.8492701053619385, + "learning_rate": 1.967891626245519e-05, + "loss": 1.0424, + "step": 7530 + }, + { + "epoch": 0.10861892619963409, + "grad_norm": 0.82408207654953, + "learning_rate": 1.9677742405653837e-05, + "loss": 1.0436, + "step": 7540 + }, + { + "epoch": 0.10876298313093334, + "grad_norm": 0.7059163451194763, + "learning_rate": 1.9676566442141472e-05, + "loss": 1.0577, + "step": 7550 + }, + { + "epoch": 0.10890704006223259, + "grad_norm": 0.8421022295951843, + "learning_rate": 1.967538837217408e-05, + "loss": 1.0604, + "step": 7560 + }, + { + "epoch": 0.10905109699353184, + "grad_norm": 0.6940760612487793, + "learning_rate": 1.967420819600811e-05, + "loss": 1.0553, + "step": 7570 + }, + { + "epoch": 0.10919515392483109, + "grad_norm": 0.7205594182014465, + "learning_rate": 1.967302591390047e-05, + "loss": 1.0596, + "step": 7580 + }, + { + "epoch": 0.10933921085613034, + "grad_norm": 0.652677595615387, + "learning_rate": 1.9671841526108534e-05, + "loss": 1.0559, + "step": 7590 + }, + { + "epoch": 0.10948326778742959, + "grad_norm": 0.6200570464134216, + "learning_rate": 1.9670655032890115e-05, + "loss": 1.0615, + "step": 7600 + }, + { + "epoch": 0.10962732471872884, + "grad_norm": 0.6742100119590759, + "learning_rate": 1.9669466434503506e-05, + "loss": 1.051, + "step": 7610 + }, + { + "epoch": 0.10977138165002809, + "grad_norm": 0.6377570033073425, + "learning_rate": 1.9668275731207444e-05, + "loss": 1.0206, + "step": 7620 + }, + { + "epoch": 0.10991543858132734, + "grad_norm": 0.8838081359863281, + "learning_rate": 1.9667082923261126e-05, + "loss": 1.0426, + "step": 7630 + }, + { + "epoch": 0.11005949551262659, + "grad_norm": 0.849931001663208, + "learning_rate": 1.9665888010924212e-05, + "loss": 1.052, + "step": 7640 + }, + { + "epoch": 0.11020355244392584, + "grad_norm": 0.663451611995697, + "learning_rate": 1.9664690994456824e-05, + "loss": 1.0351, + "step": 7650 + }, + { + "epoch": 0.11034760937522509, + "grad_norm": 0.7051265239715576, + "learning_rate": 1.966349187411953e-05, + "loss": 1.0693, + "step": 7660 + }, + { + "epoch": 0.11049166630652434, + "grad_norm": 0.7019264698028564, + "learning_rate": 1.966229065017336e-05, + "loss": 1.0526, + "step": 7670 + }, + { + "epoch": 0.11063572323782359, + "grad_norm": 0.526744544506073, + "learning_rate": 1.966108732287981e-05, + "loss": 1.041, + "step": 7680 + }, + { + "epoch": 0.11077978016912284, + "grad_norm": 0.7118310928344727, + "learning_rate": 1.965988189250082e-05, + "loss": 1.058, + "step": 7690 + }, + { + "epoch": 0.11092383710042208, + "grad_norm": 0.7060966491699219, + "learning_rate": 1.9658674359298807e-05, + "loss": 1.0537, + "step": 7700 + }, + { + "epoch": 0.11106789403172133, + "grad_norm": 0.7887639999389648, + "learning_rate": 1.9657464723536622e-05, + "loss": 1.0521, + "step": 7710 + }, + { + "epoch": 0.11121195096302058, + "grad_norm": 0.7852970361709595, + "learning_rate": 1.9656252985477595e-05, + "loss": 1.0575, + "step": 7720 + }, + { + "epoch": 0.11135600789431983, + "grad_norm": 0.6167747378349304, + "learning_rate": 1.9655039145385498e-05, + "loss": 1.0586, + "step": 7730 + }, + { + "epoch": 0.11150006482561908, + "grad_norm": 0.6489522457122803, + "learning_rate": 1.9653823203524573e-05, + "loss": 1.0552, + "step": 7740 + }, + { + "epoch": 0.11164412175691833, + "grad_norm": 0.7230646014213562, + "learning_rate": 1.965260516015951e-05, + "loss": 1.0477, + "step": 7750 + }, + { + "epoch": 0.11178817868821758, + "grad_norm": 0.9800652265548706, + "learning_rate": 1.965138501555546e-05, + "loss": 1.0679, + "step": 7760 + }, + { + "epoch": 0.11193223561951683, + "grad_norm": 0.7883552312850952, + "learning_rate": 1.9650162769978034e-05, + "loss": 1.0583, + "step": 7770 + }, + { + "epoch": 0.11207629255081608, + "grad_norm": 0.7069298028945923, + "learning_rate": 1.9648938423693302e-05, + "loss": 1.0376, + "step": 7780 + }, + { + "epoch": 0.11222034948211533, + "grad_norm": 0.6600155234336853, + "learning_rate": 1.9647711976967776e-05, + "loss": 1.0452, + "step": 7790 + }, + { + "epoch": 0.11236440641341458, + "grad_norm": 0.7556878924369812, + "learning_rate": 1.9646483430068442e-05, + "loss": 1.0588, + "step": 7800 + }, + { + "epoch": 0.11250846334471383, + "grad_norm": 0.7326832413673401, + "learning_rate": 1.9645252783262742e-05, + "loss": 1.0325, + "step": 7810 + }, + { + "epoch": 0.11265252027601308, + "grad_norm": 0.7369070053100586, + "learning_rate": 1.964402003681857e-05, + "loss": 1.0459, + "step": 7820 + }, + { + "epoch": 0.11279657720731233, + "grad_norm": 0.554426372051239, + "learning_rate": 1.964278519100427e-05, + "loss": 1.0645, + "step": 7830 + }, + { + "epoch": 0.11294063413861158, + "grad_norm": 0.5718262791633606, + "learning_rate": 1.9641548246088658e-05, + "loss": 1.0426, + "step": 7840 + }, + { + "epoch": 0.11308469106991083, + "grad_norm": 0.5710174441337585, + "learning_rate": 1.9640309202341e-05, + "loss": 1.0486, + "step": 7850 + }, + { + "epoch": 0.11322874800121008, + "grad_norm": 0.7513896226882935, + "learning_rate": 1.9639068060031013e-05, + "loss": 1.0544, + "step": 7860 + }, + { + "epoch": 0.11337280493250933, + "grad_norm": 0.7050769925117493, + "learning_rate": 1.963782481942888e-05, + "loss": 1.0801, + "step": 7870 + }, + { + "epoch": 0.11351686186380858, + "grad_norm": 0.6037879586219788, + "learning_rate": 1.9636579480805242e-05, + "loss": 1.0359, + "step": 7880 + }, + { + "epoch": 0.11366091879510783, + "grad_norm": 0.5690057277679443, + "learning_rate": 1.9635332044431187e-05, + "loss": 1.0455, + "step": 7890 + }, + { + "epoch": 0.11380497572640708, + "grad_norm": 0.7461000084877014, + "learning_rate": 1.9634082510578264e-05, + "loss": 1.0555, + "step": 7900 + }, + { + "epoch": 0.11394903265770633, + "grad_norm": 0.7836570143699646, + "learning_rate": 1.9632830879518482e-05, + "loss": 1.0686, + "step": 7910 + }, + { + "epoch": 0.11409308958900558, + "grad_norm": 0.62332683801651, + "learning_rate": 1.9631577151524298e-05, + "loss": 1.059, + "step": 7920 + }, + { + "epoch": 0.11423714652030482, + "grad_norm": 0.803758978843689, + "learning_rate": 1.963032132686864e-05, + "loss": 1.0624, + "step": 7930 + }, + { + "epoch": 0.11438120345160407, + "grad_norm": 0.65031498670578, + "learning_rate": 1.962906340582488e-05, + "loss": 1.0323, + "step": 7940 + }, + { + "epoch": 0.11452526038290332, + "grad_norm": 0.6738755106925964, + "learning_rate": 1.9627803388666845e-05, + "loss": 1.077, + "step": 7950 + }, + { + "epoch": 0.11466931731420257, + "grad_norm": 0.6606619954109192, + "learning_rate": 1.9626541275668832e-05, + "loss": 1.0569, + "step": 7960 + }, + { + "epoch": 0.11481337424550182, + "grad_norm": 0.6373894810676575, + "learning_rate": 1.9625277067105583e-05, + "loss": 1.0637, + "step": 7970 + }, + { + "epoch": 0.11495743117680107, + "grad_norm": 0.6798399686813354, + "learning_rate": 1.9624010763252296e-05, + "loss": 1.0714, + "step": 7980 + }, + { + "epoch": 0.11510148810810032, + "grad_norm": 0.6052120923995972, + "learning_rate": 1.9622742364384625e-05, + "loss": 1.0532, + "step": 7990 + }, + { + "epoch": 0.11524554503939957, + "grad_norm": 0.8813684582710266, + "learning_rate": 1.9621471870778692e-05, + "loss": 1.0527, + "step": 8000 + }, + { + "epoch": 0.11538960197069882, + "grad_norm": 0.6334227919578552, + "learning_rate": 1.962019928271106e-05, + "loss": 1.0443, + "step": 8010 + }, + { + "epoch": 0.11553365890199807, + "grad_norm": 0.6585490107536316, + "learning_rate": 1.9618924600458755e-05, + "loss": 1.038, + "step": 8020 + }, + { + "epoch": 0.11567771583329732, + "grad_norm": 0.8443551659584045, + "learning_rate": 1.9617647824299256e-05, + "loss": 1.0418, + "step": 8030 + }, + { + "epoch": 0.11582177276459657, + "grad_norm": 0.6266554594039917, + "learning_rate": 1.9616368954510498e-05, + "loss": 1.0462, + "step": 8040 + }, + { + "epoch": 0.11596582969589582, + "grad_norm": 0.6462236046791077, + "learning_rate": 1.9615087991370884e-05, + "loss": 1.0703, + "step": 8050 + }, + { + "epoch": 0.11610988662719507, + "grad_norm": 0.8236831426620483, + "learning_rate": 1.9613804935159247e-05, + "loss": 1.055, + "step": 8060 + }, + { + "epoch": 0.11625394355849432, + "grad_norm": 0.7455435395240784, + "learning_rate": 1.9612519786154905e-05, + "loss": 1.059, + "step": 8070 + }, + { + "epoch": 0.11639800048979357, + "grad_norm": 0.8345641493797302, + "learning_rate": 1.9611232544637605e-05, + "loss": 1.0405, + "step": 8080 + }, + { + "epoch": 0.11654205742109282, + "grad_norm": 0.6024748682975769, + "learning_rate": 1.960994321088757e-05, + "loss": 1.034, + "step": 8090 + }, + { + "epoch": 0.11668611435239207, + "grad_norm": 0.752213180065155, + "learning_rate": 1.9608651785185468e-05, + "loss": 1.0471, + "step": 8100 + }, + { + "epoch": 0.11683017128369132, + "grad_norm": 0.6021220088005066, + "learning_rate": 1.960735826781242e-05, + "loss": 1.0413, + "step": 8110 + }, + { + "epoch": 0.11697422821499057, + "grad_norm": 0.8410196304321289, + "learning_rate": 1.9606062659050015e-05, + "loss": 1.0536, + "step": 8120 + }, + { + "epoch": 0.11711828514628982, + "grad_norm": 0.749474823474884, + "learning_rate": 1.9604764959180283e-05, + "loss": 1.062, + "step": 8130 + }, + { + "epoch": 0.11726234207758907, + "grad_norm": 0.5890676975250244, + "learning_rate": 1.9603465168485716e-05, + "loss": 1.0469, + "step": 8140 + }, + { + "epoch": 0.11740639900888832, + "grad_norm": 0.8925262093544006, + "learning_rate": 1.9602163287249264e-05, + "loss": 1.0546, + "step": 8150 + }, + { + "epoch": 0.11755045594018756, + "grad_norm": 0.7735393643379211, + "learning_rate": 1.9600859315754325e-05, + "loss": 1.0219, + "step": 8160 + }, + { + "epoch": 0.11769451287148681, + "grad_norm": 0.6977030038833618, + "learning_rate": 1.9599553254284755e-05, + "loss": 1.052, + "step": 8170 + }, + { + "epoch": 0.11783856980278606, + "grad_norm": 0.7124899625778198, + "learning_rate": 1.959824510312487e-05, + "loss": 1.0424, + "step": 8180 + }, + { + "epoch": 0.11798262673408531, + "grad_norm": 0.8691676259040833, + "learning_rate": 1.9596934862559432e-05, + "loss": 1.055, + "step": 8190 + }, + { + "epoch": 0.11812668366538456, + "grad_norm": 0.7811499834060669, + "learning_rate": 1.9595622532873665e-05, + "loss": 1.0518, + "step": 8200 + }, + { + "epoch": 0.11827074059668381, + "grad_norm": 0.7809152007102966, + "learning_rate": 1.9594308114353248e-05, + "loss": 1.0351, + "step": 8210 + }, + { + "epoch": 0.11841479752798306, + "grad_norm": 0.7715036869049072, + "learning_rate": 1.9592991607284302e-05, + "loss": 1.0373, + "step": 8220 + }, + { + "epoch": 0.11855885445928231, + "grad_norm": 0.9572784304618835, + "learning_rate": 1.959167301195342e-05, + "loss": 1.0555, + "step": 8230 + }, + { + "epoch": 0.11870291139058156, + "grad_norm": 0.7498046159744263, + "learning_rate": 1.9590352328647645e-05, + "loss": 1.0364, + "step": 8240 + }, + { + "epoch": 0.11884696832188081, + "grad_norm": 0.7903684377670288, + "learning_rate": 1.9589029557654462e-05, + "loss": 1.0493, + "step": 8250 + }, + { + "epoch": 0.11899102525318006, + "grad_norm": 0.7427484393119812, + "learning_rate": 1.9587704699261825e-05, + "loss": 1.0607, + "step": 8260 + }, + { + "epoch": 0.11913508218447931, + "grad_norm": 0.7210958003997803, + "learning_rate": 1.958637775375814e-05, + "loss": 1.0551, + "step": 8270 + }, + { + "epoch": 0.11927913911577856, + "grad_norm": 0.6811267733573914, + "learning_rate": 1.9585048721432263e-05, + "loss": 1.0474, + "step": 8280 + }, + { + "epoch": 0.11942319604707781, + "grad_norm": 0.6749264001846313, + "learning_rate": 1.9583717602573503e-05, + "loss": 1.0475, + "step": 8290 + }, + { + "epoch": 0.11956725297837706, + "grad_norm": 0.5424134135246277, + "learning_rate": 1.958238439747163e-05, + "loss": 1.0468, + "step": 8300 + }, + { + "epoch": 0.11971130990967631, + "grad_norm": 0.643915593624115, + "learning_rate": 1.958104910641686e-05, + "loss": 1.0617, + "step": 8310 + }, + { + "epoch": 0.11985536684097556, + "grad_norm": 0.8453126549720764, + "learning_rate": 1.957971172969987e-05, + "loss": 1.0658, + "step": 8320 + }, + { + "epoch": 0.11999942377227481, + "grad_norm": 0.8805552124977112, + "learning_rate": 1.9578372267611795e-05, + "loss": 1.0582, + "step": 8330 + }, + { + "epoch": 0.12014348070357406, + "grad_norm": 0.6731013655662537, + "learning_rate": 1.9577030720444203e-05, + "loss": 1.0604, + "step": 8340 + }, + { + "epoch": 0.1202875376348733, + "grad_norm": 0.7662180066108704, + "learning_rate": 1.9575687088489143e-05, + "loss": 1.0298, + "step": 8350 + }, + { + "epoch": 0.12043159456617256, + "grad_norm": 0.6646352410316467, + "learning_rate": 1.95743413720391e-05, + "loss": 1.0513, + "step": 8360 + }, + { + "epoch": 0.1205756514974718, + "grad_norm": 0.7564797401428223, + "learning_rate": 1.9572993571387018e-05, + "loss": 1.0308, + "step": 8370 + }, + { + "epoch": 0.12071970842877106, + "grad_norm": 0.7111014723777771, + "learning_rate": 1.9571643686826296e-05, + "loss": 1.0268, + "step": 8380 + }, + { + "epoch": 0.1208637653600703, + "grad_norm": 0.6569526791572571, + "learning_rate": 1.9570291718650782e-05, + "loss": 1.0386, + "step": 8390 + }, + { + "epoch": 0.12100782229136955, + "grad_norm": 0.6299819946289062, + "learning_rate": 1.9568937667154784e-05, + "loss": 1.0638, + "step": 8400 + }, + { + "epoch": 0.1211518792226688, + "grad_norm": 0.6129927039146423, + "learning_rate": 1.956758153263306e-05, + "loss": 1.0417, + "step": 8410 + }, + { + "epoch": 0.12129593615396805, + "grad_norm": 0.7334028482437134, + "learning_rate": 1.956622331538082e-05, + "loss": 1.0578, + "step": 8420 + }, + { + "epoch": 0.1214399930852673, + "grad_norm": 0.624396026134491, + "learning_rate": 1.956486301569373e-05, + "loss": 1.0387, + "step": 8430 + }, + { + "epoch": 0.12158405001656655, + "grad_norm": 0.9984021782875061, + "learning_rate": 1.956350063386791e-05, + "loss": 1.0378, + "step": 8440 + }, + { + "epoch": 0.1217281069478658, + "grad_norm": 0.8487315773963928, + "learning_rate": 1.9562136170199933e-05, + "loss": 1.0597, + "step": 8450 + }, + { + "epoch": 0.12187216387916505, + "grad_norm": 0.7279149889945984, + "learning_rate": 1.9560769624986817e-05, + "loss": 1.049, + "step": 8460 + }, + { + "epoch": 0.1220162208104643, + "grad_norm": 0.6539647579193115, + "learning_rate": 1.9559400998526048e-05, + "loss": 1.0724, + "step": 8470 + }, + { + "epoch": 0.12216027774176355, + "grad_norm": 0.6667382121086121, + "learning_rate": 1.955803029111555e-05, + "loss": 1.017, + "step": 8480 + }, + { + "epoch": 0.1223043346730628, + "grad_norm": 0.7969838976860046, + "learning_rate": 1.9556657503053715e-05, + "loss": 1.0552, + "step": 8490 + }, + { + "epoch": 0.12244839160436205, + "grad_norm": 0.6321619153022766, + "learning_rate": 1.9555282634639374e-05, + "loss": 1.0535, + "step": 8500 + }, + { + "epoch": 0.1225924485356613, + "grad_norm": 0.6364926695823669, + "learning_rate": 1.955390568617182e-05, + "loss": 1.0483, + "step": 8510 + }, + { + "epoch": 0.12273650546696055, + "grad_norm": 0.697006344795227, + "learning_rate": 1.955252665795079e-05, + "loss": 1.0741, + "step": 8520 + }, + { + "epoch": 0.1228805623982598, + "grad_norm": 0.5230076909065247, + "learning_rate": 1.955114555027649e-05, + "loss": 1.044, + "step": 8530 + }, + { + "epoch": 0.12302461932955905, + "grad_norm": 0.6966224312782288, + "learning_rate": 1.9549762363449555e-05, + "loss": 1.0734, + "step": 8540 + }, + { + "epoch": 0.1231686762608583, + "grad_norm": 0.6040503978729248, + "learning_rate": 1.9548377097771097e-05, + "loss": 1.0711, + "step": 8550 + }, + { + "epoch": 0.12331273319215755, + "grad_norm": 0.660383939743042, + "learning_rate": 1.9546989753542666e-05, + "loss": 1.0213, + "step": 8560 + }, + { + "epoch": 0.12345679012345678, + "grad_norm": 0.6777083873748779, + "learning_rate": 1.954560033106626e-05, + "loss": 1.0408, + "step": 8570 + }, + { + "epoch": 0.12360084705475603, + "grad_norm": 0.6483912467956543, + "learning_rate": 1.9544208830644347e-05, + "loss": 1.0244, + "step": 8580 + }, + { + "epoch": 0.12374490398605528, + "grad_norm": 0.6874343156814575, + "learning_rate": 1.9542815252579834e-05, + "loss": 1.0437, + "step": 8590 + }, + { + "epoch": 0.12388896091735453, + "grad_norm": 0.7024880647659302, + "learning_rate": 1.9541419597176084e-05, + "loss": 1.0362, + "step": 8600 + }, + { + "epoch": 0.12403301784865378, + "grad_norm": 0.8031400442123413, + "learning_rate": 1.954002186473691e-05, + "loss": 1.0571, + "step": 8610 + }, + { + "epoch": 0.12417707477995303, + "grad_norm": 0.7505627870559692, + "learning_rate": 1.9538622055566584e-05, + "loss": 1.0514, + "step": 8620 + }, + { + "epoch": 0.12432113171125228, + "grad_norm": 0.7040508985519409, + "learning_rate": 1.953722016996982e-05, + "loss": 1.0483, + "step": 8630 + }, + { + "epoch": 0.12446518864255153, + "grad_norm": 0.5585609078407288, + "learning_rate": 1.9535816208251796e-05, + "loss": 1.0283, + "step": 8640 + }, + { + "epoch": 0.12460924557385078, + "grad_norm": 0.6182272434234619, + "learning_rate": 1.9534410170718123e-05, + "loss": 1.0465, + "step": 8650 + }, + { + "epoch": 0.12475330250515003, + "grad_norm": 0.7459903955459595, + "learning_rate": 1.9533002057674886e-05, + "loss": 1.0645, + "step": 8660 + }, + { + "epoch": 0.12489735943644928, + "grad_norm": 0.8726353049278259, + "learning_rate": 1.9531591869428607e-05, + "loss": 1.0383, + "step": 8670 + }, + { + "epoch": 0.12504141636774854, + "grad_norm": 0.7270867824554443, + "learning_rate": 1.953017960628627e-05, + "loss": 1.0642, + "step": 8680 + }, + { + "epoch": 0.12518547329904778, + "grad_norm": 0.6112450361251831, + "learning_rate": 1.95287652685553e-05, + "loss": 1.0575, + "step": 8690 + }, + { + "epoch": 0.12532953023034704, + "grad_norm": 0.7989335656166077, + "learning_rate": 1.952734885654358e-05, + "loss": 1.0528, + "step": 8700 + }, + { + "epoch": 0.12547358716164628, + "grad_norm": 0.7024810910224915, + "learning_rate": 1.9525930370559446e-05, + "loss": 1.0279, + "step": 8710 + }, + { + "epoch": 0.12561764409294554, + "grad_norm": 0.67440265417099, + "learning_rate": 1.9524509810911675e-05, + "loss": 1.0368, + "step": 8720 + }, + { + "epoch": 0.12576170102424478, + "grad_norm": 0.7015420794487, + "learning_rate": 1.9523087177909513e-05, + "loss": 1.0496, + "step": 8730 + }, + { + "epoch": 0.12590575795554404, + "grad_norm": 0.717402458190918, + "learning_rate": 1.952166247186264e-05, + "loss": 1.0604, + "step": 8740 + }, + { + "epoch": 0.12604981488684328, + "grad_norm": 0.6328299045562744, + "learning_rate": 1.9520235693081205e-05, + "loss": 1.0441, + "step": 8750 + }, + { + "epoch": 0.12619387181814254, + "grad_norm": 0.7053465843200684, + "learning_rate": 1.9518806841875787e-05, + "loss": 1.0623, + "step": 8760 + }, + { + "epoch": 0.12633792874944177, + "grad_norm": 0.6671106219291687, + "learning_rate": 1.951737591855743e-05, + "loss": 1.0458, + "step": 8770 + }, + { + "epoch": 0.12648198568074104, + "grad_norm": 0.5957756042480469, + "learning_rate": 1.9515942923437632e-05, + "loss": 1.0615, + "step": 8780 + }, + { + "epoch": 0.12662604261204027, + "grad_norm": 0.6355538964271545, + "learning_rate": 1.9514507856828328e-05, + "loss": 1.0684, + "step": 8790 + }, + { + "epoch": 0.12677009954333954, + "grad_norm": 0.6601867079734802, + "learning_rate": 1.9513070719041918e-05, + "loss": 1.04, + "step": 8800 + }, + { + "epoch": 0.12691415647463877, + "grad_norm": 0.6541124582290649, + "learning_rate": 1.9511631510391245e-05, + "loss": 1.0608, + "step": 8810 + }, + { + "epoch": 0.12705821340593804, + "grad_norm": 0.6592003703117371, + "learning_rate": 1.9510190231189607e-05, + "loss": 1.0398, + "step": 8820 + }, + { + "epoch": 0.12720227033723727, + "grad_norm": 0.6421726942062378, + "learning_rate": 1.9508746881750745e-05, + "loss": 1.0588, + "step": 8830 + }, + { + "epoch": 0.12734632726853654, + "grad_norm": 0.6557596921920776, + "learning_rate": 1.9507301462388864e-05, + "loss": 1.0732, + "step": 8840 + }, + { + "epoch": 0.12749038419983577, + "grad_norm": 0.790360152721405, + "learning_rate": 1.950585397341861e-05, + "loss": 1.0473, + "step": 8850 + }, + { + "epoch": 0.12763444113113503, + "grad_norm": 0.5429847240447998, + "learning_rate": 1.9504404415155073e-05, + "loss": 1.0565, + "step": 8860 + }, + { + "epoch": 0.12777849806243427, + "grad_norm": 0.8884764313697815, + "learning_rate": 1.950295278791381e-05, + "loss": 1.0664, + "step": 8870 + }, + { + "epoch": 0.12792255499373353, + "grad_norm": 0.6776615977287292, + "learning_rate": 1.9501499092010818e-05, + "loss": 1.048, + "step": 8880 + }, + { + "epoch": 0.12806661192503277, + "grad_norm": 0.6573163270950317, + "learning_rate": 1.950004332776255e-05, + "loss": 1.0378, + "step": 8890 + }, + { + "epoch": 0.12821066885633203, + "grad_norm": 0.6077698469161987, + "learning_rate": 1.94985854954859e-05, + "loss": 1.0513, + "step": 8900 + }, + { + "epoch": 0.12835472578763127, + "grad_norm": 0.9807862043380737, + "learning_rate": 1.949712559549822e-05, + "loss": 1.0462, + "step": 8910 + }, + { + "epoch": 0.12849878271893053, + "grad_norm": 0.6129643321037292, + "learning_rate": 1.9495663628117315e-05, + "loss": 1.0552, + "step": 8920 + }, + { + "epoch": 0.12864283965022977, + "grad_norm": 0.7125595211982727, + "learning_rate": 1.9494199593661426e-05, + "loss": 1.0726, + "step": 8930 + }, + { + "epoch": 0.12878689658152903, + "grad_norm": 0.5645942091941833, + "learning_rate": 1.949273349244926e-05, + "loss": 1.0338, + "step": 8940 + }, + { + "epoch": 0.12893095351282827, + "grad_norm": 0.8663925528526306, + "learning_rate": 1.9491265324799966e-05, + "loss": 1.0529, + "step": 8950 + }, + { + "epoch": 0.12907501044412753, + "grad_norm": 0.7779350876808167, + "learning_rate": 1.9489795091033142e-05, + "loss": 1.0262, + "step": 8960 + }, + { + "epoch": 0.12921906737542677, + "grad_norm": 0.6418645977973938, + "learning_rate": 1.9488322791468837e-05, + "loss": 1.0375, + "step": 8970 + }, + { + "epoch": 0.12936312430672603, + "grad_norm": 0.7466892600059509, + "learning_rate": 1.9486848426427552e-05, + "loss": 1.053, + "step": 8980 + }, + { + "epoch": 0.12950718123802527, + "grad_norm": 0.5885640382766724, + "learning_rate": 1.948537199623024e-05, + "loss": 1.0315, + "step": 8990 + }, + { + "epoch": 0.12965123816932453, + "grad_norm": 0.716010570526123, + "learning_rate": 1.948389350119829e-05, + "loss": 1.0322, + "step": 9000 + }, + { + "epoch": 0.12979529510062376, + "grad_norm": 0.7401978969573975, + "learning_rate": 1.948241294165356e-05, + "loss": 1.0596, + "step": 9010 + }, + { + "epoch": 0.12993935203192303, + "grad_norm": 0.6437592506408691, + "learning_rate": 1.9480930317918337e-05, + "loss": 1.0647, + "step": 9020 + }, + { + "epoch": 0.13008340896322226, + "grad_norm": 0.7413766980171204, + "learning_rate": 1.9479445630315378e-05, + "loss": 1.0397, + "step": 9030 + }, + { + "epoch": 0.13022746589452153, + "grad_norm": 0.5429770946502686, + "learning_rate": 1.9477958879167874e-05, + "loss": 1.0569, + "step": 9040 + }, + { + "epoch": 0.13037152282582076, + "grad_norm": 0.7051211595535278, + "learning_rate": 1.9476470064799467e-05, + "loss": 1.0469, + "step": 9050 + }, + { + "epoch": 0.13051557975712003, + "grad_norm": 0.7300934791564941, + "learning_rate": 1.9474979187534258e-05, + "loss": 1.038, + "step": 9060 + }, + { + "epoch": 0.13065963668841926, + "grad_norm": 0.6431097984313965, + "learning_rate": 1.9473486247696785e-05, + "loss": 1.0461, + "step": 9070 + }, + { + "epoch": 0.13080369361971853, + "grad_norm": 0.6957307457923889, + "learning_rate": 1.9471991245612044e-05, + "loss": 1.0518, + "step": 9080 + }, + { + "epoch": 0.13094775055101776, + "grad_norm": 0.6514873504638672, + "learning_rate": 1.9470494181605476e-05, + "loss": 1.0388, + "step": 9090 + }, + { + "epoch": 0.13109180748231702, + "grad_norm": 0.6488949656486511, + "learning_rate": 1.946899505600297e-05, + "loss": 1.0551, + "step": 9100 + }, + { + "epoch": 0.13123586441361626, + "grad_norm": 0.6016454696655273, + "learning_rate": 1.946749386913086e-05, + "loss": 1.0441, + "step": 9110 + }, + { + "epoch": 0.13137992134491552, + "grad_norm": 0.7101909518241882, + "learning_rate": 1.9465990621315945e-05, + "loss": 1.0375, + "step": 9120 + }, + { + "epoch": 0.13152397827621476, + "grad_norm": 0.6185438632965088, + "learning_rate": 1.9464485312885455e-05, + "loss": 1.0401, + "step": 9130 + }, + { + "epoch": 0.13166803520751402, + "grad_norm": 0.6171557307243347, + "learning_rate": 1.9462977944167074e-05, + "loss": 1.0416, + "step": 9140 + }, + { + "epoch": 0.13181209213881326, + "grad_norm": 0.6617201566696167, + "learning_rate": 1.9461468515488936e-05, + "loss": 1.0503, + "step": 9150 + }, + { + "epoch": 0.13195614907011252, + "grad_norm": 0.5675362348556519, + "learning_rate": 1.9459957027179623e-05, + "loss": 1.031, + "step": 9160 + }, + { + "epoch": 0.13210020600141176, + "grad_norm": 0.6881184577941895, + "learning_rate": 1.9458443479568165e-05, + "loss": 1.0113, + "step": 9170 + }, + { + "epoch": 0.13224426293271102, + "grad_norm": 0.5947275757789612, + "learning_rate": 1.945692787298404e-05, + "loss": 1.0473, + "step": 9180 + }, + { + "epoch": 0.13238831986401026, + "grad_norm": 0.7350253462791443, + "learning_rate": 1.9455410207757176e-05, + "loss": 1.0461, + "step": 9190 + }, + { + "epoch": 0.13253237679530952, + "grad_norm": 0.605141282081604, + "learning_rate": 1.9453890484217947e-05, + "loss": 1.0323, + "step": 9200 + }, + { + "epoch": 0.13267643372660876, + "grad_norm": 0.661953330039978, + "learning_rate": 1.9452368702697177e-05, + "loss": 1.0632, + "step": 9210 + }, + { + "epoch": 0.13282049065790802, + "grad_norm": 0.5995630621910095, + "learning_rate": 1.9450844863526136e-05, + "loss": 1.0591, + "step": 9220 + }, + { + "epoch": 0.13296454758920725, + "grad_norm": 0.5376889705657959, + "learning_rate": 1.9449318967036543e-05, + "loss": 1.0434, + "step": 9230 + }, + { + "epoch": 0.1331086045205065, + "grad_norm": 0.7438921332359314, + "learning_rate": 1.9447791013560563e-05, + "loss": 1.056, + "step": 9240 + }, + { + "epoch": 0.13325266145180575, + "grad_norm": 0.5766037702560425, + "learning_rate": 1.9446261003430816e-05, + "loss": 1.0466, + "step": 9250 + }, + { + "epoch": 0.133396718383105, + "grad_norm": 0.6660729050636292, + "learning_rate": 1.9444728936980355e-05, + "loss": 1.0486, + "step": 9260 + }, + { + "epoch": 0.13354077531440425, + "grad_norm": 0.7104220986366272, + "learning_rate": 1.9443194814542695e-05, + "loss": 1.0581, + "step": 9270 + }, + { + "epoch": 0.1336848322457035, + "grad_norm": 0.6871854662895203, + "learning_rate": 1.9441658636451794e-05, + "loss": 1.0393, + "step": 9280 + }, + { + "epoch": 0.13382888917700275, + "grad_norm": 0.6358904242515564, + "learning_rate": 1.9440120403042056e-05, + "loss": 1.0224, + "step": 9290 + }, + { + "epoch": 0.133972946108302, + "grad_norm": 0.6637552380561829, + "learning_rate": 1.943858011464833e-05, + "loss": 1.0585, + "step": 9300 + }, + { + "epoch": 0.13411700303960125, + "grad_norm": 0.6891149282455444, + "learning_rate": 1.9437037771605922e-05, + "loss": 1.0488, + "step": 9310 + }, + { + "epoch": 0.1342610599709005, + "grad_norm": 0.615485429763794, + "learning_rate": 1.9435493374250572e-05, + "loss": 1.0545, + "step": 9320 + }, + { + "epoch": 0.13440511690219975, + "grad_norm": 0.6307300329208374, + "learning_rate": 1.943394692291848e-05, + "loss": 1.0512, + "step": 9330 + }, + { + "epoch": 0.134549173833499, + "grad_norm": 0.6017307043075562, + "learning_rate": 1.943239841794628e-05, + "loss": 1.0623, + "step": 9340 + }, + { + "epoch": 0.13469323076479825, + "grad_norm": 0.7184136509895325, + "learning_rate": 1.943084785967107e-05, + "loss": 1.0497, + "step": 9350 + }, + { + "epoch": 0.13483728769609749, + "grad_norm": 0.5148404836654663, + "learning_rate": 1.9429295248430376e-05, + "loss": 1.0158, + "step": 9360 + }, + { + "epoch": 0.13498134462739675, + "grad_norm": 0.6019318103790283, + "learning_rate": 1.9427740584562184e-05, + "loss": 1.0482, + "step": 9370 + }, + { + "epoch": 0.13512540155869598, + "grad_norm": 0.6289660930633545, + "learning_rate": 1.942618386840492e-05, + "loss": 1.0388, + "step": 9380 + }, + { + "epoch": 0.13526945848999525, + "grad_norm": 0.6551307439804077, + "learning_rate": 1.9424625100297468e-05, + "loss": 1.0366, + "step": 9390 + }, + { + "epoch": 0.13541351542129448, + "grad_norm": 0.7314261794090271, + "learning_rate": 1.942306428057914e-05, + "loss": 1.0653, + "step": 9400 + }, + { + "epoch": 0.13555757235259375, + "grad_norm": 0.6418008208274841, + "learning_rate": 1.942150140958971e-05, + "loss": 1.0536, + "step": 9410 + }, + { + "epoch": 0.13570162928389298, + "grad_norm": 0.7704811096191406, + "learning_rate": 1.9419936487669396e-05, + "loss": 1.0447, + "step": 9420 + }, + { + "epoch": 0.13584568621519225, + "grad_norm": 0.5979043841362, + "learning_rate": 1.9418369515158854e-05, + "loss": 1.0515, + "step": 9430 + }, + { + "epoch": 0.13598974314649148, + "grad_norm": 0.6340309977531433, + "learning_rate": 1.9416800492399195e-05, + "loss": 1.0646, + "step": 9440 + }, + { + "epoch": 0.13613380007779075, + "grad_norm": 0.5940436124801636, + "learning_rate": 1.9415229419731974e-05, + "loss": 1.0511, + "step": 9450 + }, + { + "epoch": 0.13627785700908998, + "grad_norm": 0.6780868768692017, + "learning_rate": 1.9413656297499194e-05, + "loss": 1.0749, + "step": 9460 + }, + { + "epoch": 0.13642191394038924, + "grad_norm": 0.6650206446647644, + "learning_rate": 1.9412081126043295e-05, + "loss": 1.0406, + "step": 9470 + }, + { + "epoch": 0.13656597087168848, + "grad_norm": 0.7658565044403076, + "learning_rate": 1.9410503905707175e-05, + "loss": 1.0391, + "step": 9480 + }, + { + "epoch": 0.13671002780298774, + "grad_norm": 0.6320905089378357, + "learning_rate": 1.9408924636834175e-05, + "loss": 1.0405, + "step": 9490 + }, + { + "epoch": 0.13685408473428698, + "grad_norm": 0.6337910890579224, + "learning_rate": 1.9407343319768077e-05, + "loss": 1.0159, + "step": 9500 + }, + { + "epoch": 0.13699814166558624, + "grad_norm": 0.7351735234260559, + "learning_rate": 1.940575995485311e-05, + "loss": 1.0511, + "step": 9510 + }, + { + "epoch": 0.13714219859688548, + "grad_norm": 0.610489547252655, + "learning_rate": 1.940417454243396e-05, + "loss": 1.0386, + "step": 9520 + }, + { + "epoch": 0.13728625552818474, + "grad_norm": 0.5717934370040894, + "learning_rate": 1.940258708285574e-05, + "loss": 1.0694, + "step": 9530 + }, + { + "epoch": 0.13743031245948398, + "grad_norm": 0.6930076479911804, + "learning_rate": 1.9400997576464018e-05, + "loss": 1.0618, + "step": 9540 + }, + { + "epoch": 0.13757436939078324, + "grad_norm": 0.6314212679862976, + "learning_rate": 1.939940602360481e-05, + "loss": 1.0526, + "step": 9550 + }, + { + "epoch": 0.13771842632208248, + "grad_norm": 0.6929250955581665, + "learning_rate": 1.9397812424624578e-05, + "loss": 1.0472, + "step": 9560 + }, + { + "epoch": 0.13786248325338174, + "grad_norm": 0.6031248569488525, + "learning_rate": 1.9396216779870224e-05, + "loss": 1.0235, + "step": 9570 + }, + { + "epoch": 0.13800654018468098, + "grad_norm": 0.67605060338974, + "learning_rate": 1.9394619089689098e-05, + "loss": 1.0479, + "step": 9580 + }, + { + "epoch": 0.13815059711598024, + "grad_norm": 0.6733813285827637, + "learning_rate": 1.9393019354428994e-05, + "loss": 1.0312, + "step": 9590 + }, + { + "epoch": 0.13829465404727947, + "grad_norm": 0.5362619161605835, + "learning_rate": 1.9391417574438157e-05, + "loss": 1.0393, + "step": 9600 + }, + { + "epoch": 0.13843871097857874, + "grad_norm": 0.724517822265625, + "learning_rate": 1.9389813750065263e-05, + "loss": 1.0562, + "step": 9610 + }, + { + "epoch": 0.13858276790987797, + "grad_norm": 0.7083704471588135, + "learning_rate": 1.9388207881659452e-05, + "loss": 1.0551, + "step": 9620 + }, + { + "epoch": 0.13872682484117724, + "grad_norm": 0.6152426600456238, + "learning_rate": 1.9386599969570296e-05, + "loss": 1.041, + "step": 9630 + }, + { + "epoch": 0.13887088177247647, + "grad_norm": 0.6370306015014648, + "learning_rate": 1.938499001414781e-05, + "loss": 1.0503, + "step": 9640 + }, + { + "epoch": 0.13901493870377574, + "grad_norm": 0.6503737568855286, + "learning_rate": 1.9383378015742468e-05, + "loss": 1.0491, + "step": 9650 + }, + { + "epoch": 0.13915899563507497, + "grad_norm": 0.7164560556411743, + "learning_rate": 1.9381763974705173e-05, + "loss": 1.0313, + "step": 9660 + }, + { + "epoch": 0.13930305256637424, + "grad_norm": 0.6730453968048096, + "learning_rate": 1.9380147891387285e-05, + "loss": 1.0454, + "step": 9670 + }, + { + "epoch": 0.13944710949767347, + "grad_norm": 0.6627982258796692, + "learning_rate": 1.93785297661406e-05, + "loss": 1.0701, + "step": 9680 + }, + { + "epoch": 0.13959116642897273, + "grad_norm": 0.5412722229957581, + "learning_rate": 1.937690959931736e-05, + "loss": 1.0595, + "step": 9690 + }, + { + "epoch": 0.13973522336027197, + "grad_norm": 0.5805692076683044, + "learning_rate": 1.9375287391270253e-05, + "loss": 1.0541, + "step": 9700 + }, + { + "epoch": 0.13987928029157123, + "grad_norm": 0.6356375813484192, + "learning_rate": 1.9373663142352417e-05, + "loss": 1.0406, + "step": 9710 + }, + { + "epoch": 0.14002333722287047, + "grad_norm": 0.6632521748542786, + "learning_rate": 1.9372036852917423e-05, + "loss": 1.0405, + "step": 9720 + }, + { + "epoch": 0.14016739415416973, + "grad_norm": 0.6239160895347595, + "learning_rate": 1.9370408523319298e-05, + "loss": 1.0385, + "step": 9730 + }, + { + "epoch": 0.14031145108546897, + "grad_norm": 0.7359287142753601, + "learning_rate": 1.9368778153912497e-05, + "loss": 1.0455, + "step": 9740 + }, + { + "epoch": 0.14045550801676823, + "grad_norm": 0.583785891532898, + "learning_rate": 1.9367145745051935e-05, + "loss": 1.0476, + "step": 9750 + }, + { + "epoch": 0.14059956494806747, + "grad_norm": 0.5760996341705322, + "learning_rate": 1.9365511297092964e-05, + "loss": 1.0303, + "step": 9760 + }, + { + "epoch": 0.14074362187936673, + "grad_norm": 0.5912659168243408, + "learning_rate": 1.9363874810391384e-05, + "loss": 1.0533, + "step": 9770 + }, + { + "epoch": 0.14088767881066597, + "grad_norm": 0.816480815410614, + "learning_rate": 1.936223628530343e-05, + "loss": 1.059, + "step": 9780 + }, + { + "epoch": 0.14103173574196523, + "grad_norm": 0.550395131111145, + "learning_rate": 1.9360595722185785e-05, + "loss": 1.0953, + "step": 9790 + }, + { + "epoch": 0.14117579267326447, + "grad_norm": 0.6576113104820251, + "learning_rate": 1.9358953121395587e-05, + "loss": 1.0441, + "step": 9800 + }, + { + "epoch": 0.14131984960456373, + "grad_norm": 0.651944637298584, + "learning_rate": 1.93573084832904e-05, + "loss": 1.0535, + "step": 9810 + }, + { + "epoch": 0.14146390653586297, + "grad_norm": 0.6657174229621887, + "learning_rate": 1.935566180822824e-05, + "loss": 1.0358, + "step": 9820 + }, + { + "epoch": 0.14160796346716223, + "grad_norm": 0.7346987724304199, + "learning_rate": 1.9354013096567567e-05, + "loss": 1.0442, + "step": 9830 + }, + { + "epoch": 0.14175202039846146, + "grad_norm": 0.6800191402435303, + "learning_rate": 1.9352362348667276e-05, + "loss": 1.0471, + "step": 9840 + }, + { + "epoch": 0.14189607732976073, + "grad_norm": 0.5743468403816223, + "learning_rate": 1.9350709564886722e-05, + "loss": 1.0337, + "step": 9850 + }, + { + "epoch": 0.14204013426105996, + "grad_norm": 0.6555769443511963, + "learning_rate": 1.9349054745585686e-05, + "loss": 1.0422, + "step": 9860 + }, + { + "epoch": 0.14218419119235923, + "grad_norm": 0.6481942534446716, + "learning_rate": 1.9347397891124406e-05, + "loss": 1.0607, + "step": 9870 + }, + { + "epoch": 0.14232824812365846, + "grad_norm": 0.6060455441474915, + "learning_rate": 1.934573900186355e-05, + "loss": 1.0556, + "step": 9880 + }, + { + "epoch": 0.14247230505495773, + "grad_norm": 0.5978740453720093, + "learning_rate": 1.9344078078164237e-05, + "loss": 1.034, + "step": 9890 + }, + { + "epoch": 0.14261636198625696, + "grad_norm": 0.5864404439926147, + "learning_rate": 1.934241512038803e-05, + "loss": 1.0557, + "step": 9900 + }, + { + "epoch": 0.14276041891755623, + "grad_norm": 0.6731454730033875, + "learning_rate": 1.9340750128896933e-05, + "loss": 1.0486, + "step": 9910 + }, + { + "epoch": 0.14290447584885546, + "grad_norm": 0.6897068619728088, + "learning_rate": 1.9339083104053386e-05, + "loss": 1.0499, + "step": 9920 + }, + { + "epoch": 0.14304853278015472, + "grad_norm": 0.6522696614265442, + "learning_rate": 1.9337414046220278e-05, + "loss": 1.0532, + "step": 9930 + }, + { + "epoch": 0.14319258971145396, + "grad_norm": 0.6743817925453186, + "learning_rate": 1.9335742955760943e-05, + "loss": 1.0644, + "step": 9940 + }, + { + "epoch": 0.14333664664275322, + "grad_norm": 0.6498114466667175, + "learning_rate": 1.9334069833039154e-05, + "loss": 1.0606, + "step": 9950 + }, + { + "epoch": 0.14348070357405246, + "grad_norm": 0.5908329486846924, + "learning_rate": 1.9332394678419125e-05, + "loss": 1.0363, + "step": 9960 + }, + { + "epoch": 0.14362476050535172, + "grad_norm": 0.7792730331420898, + "learning_rate": 1.9330717492265514e-05, + "loss": 1.0623, + "step": 9970 + }, + { + "epoch": 0.14376881743665096, + "grad_norm": 0.656216561794281, + "learning_rate": 1.9329038274943423e-05, + "loss": 1.056, + "step": 9980 + }, + { + "epoch": 0.14391287436795022, + "grad_norm": 0.6079681515693665, + "learning_rate": 1.9327357026818394e-05, + "loss": 1.0588, + "step": 9990 + }, + { + "epoch": 0.14405693129924946, + "grad_norm": 0.7425257563591003, + "learning_rate": 1.932567374825641e-05, + "loss": 1.0762, + "step": 10000 + }, + { + "epoch": 0.14420098823054872, + "grad_norm": 0.5706034898757935, + "learning_rate": 1.9323988439623903e-05, + "loss": 1.0358, + "step": 10010 + }, + { + "epoch": 0.14434504516184796, + "grad_norm": 0.62879478931427, + "learning_rate": 1.932230110128773e-05, + "loss": 1.0365, + "step": 10020 + }, + { + "epoch": 0.14448910209314722, + "grad_norm": 0.8347147703170776, + "learning_rate": 1.9320611733615215e-05, + "loss": 1.0526, + "step": 10030 + }, + { + "epoch": 0.14463315902444646, + "grad_norm": 0.5717446804046631, + "learning_rate": 1.93189203369741e-05, + "loss": 1.0577, + "step": 10040 + }, + { + "epoch": 0.14477721595574572, + "grad_norm": 0.633722722530365, + "learning_rate": 1.9317226911732586e-05, + "loss": 1.068, + "step": 10050 + }, + { + "epoch": 0.14492127288704496, + "grad_norm": 0.73835688829422, + "learning_rate": 1.9315531458259302e-05, + "loss": 1.055, + "step": 10060 + }, + { + "epoch": 0.14506532981834422, + "grad_norm": 0.6140055656433105, + "learning_rate": 1.9313833976923327e-05, + "loss": 1.0642, + "step": 10070 + }, + { + "epoch": 0.14520938674964345, + "grad_norm": 0.7030134797096252, + "learning_rate": 1.931213446809418e-05, + "loss": 1.0421, + "step": 10080 + }, + { + "epoch": 0.14535344368094272, + "grad_norm": 0.5750778913497925, + "learning_rate": 1.931043293214182e-05, + "loss": 1.0405, + "step": 10090 + }, + { + "epoch": 0.14549750061224195, + "grad_norm": 0.6405826210975647, + "learning_rate": 1.930872936943665e-05, + "loss": 1.0544, + "step": 10100 + }, + { + "epoch": 0.14564155754354122, + "grad_norm": 0.6836420893669128, + "learning_rate": 1.930702378034951e-05, + "loss": 1.0424, + "step": 10110 + }, + { + "epoch": 0.14578561447484045, + "grad_norm": 0.6426882743835449, + "learning_rate": 1.9305316165251676e-05, + "loss": 1.027, + "step": 10120 + }, + { + "epoch": 0.14592967140613972, + "grad_norm": 0.6070466041564941, + "learning_rate": 1.930360652451489e-05, + "loss": 1.0395, + "step": 10130 + }, + { + "epoch": 0.14607372833743895, + "grad_norm": 0.7554711699485779, + "learning_rate": 1.93018948585113e-05, + "loss": 1.0445, + "step": 10140 + }, + { + "epoch": 0.14621778526873822, + "grad_norm": 0.6213268041610718, + "learning_rate": 1.9300181167613522e-05, + "loss": 1.0542, + "step": 10150 + }, + { + "epoch": 0.14636184220003745, + "grad_norm": 0.6937504410743713, + "learning_rate": 1.92984654521946e-05, + "loss": 1.0606, + "step": 10160 + }, + { + "epoch": 0.14650589913133671, + "grad_norm": 0.5986953377723694, + "learning_rate": 1.9296747712628022e-05, + "loss": 1.0268, + "step": 10170 + }, + { + "epoch": 0.14664995606263595, + "grad_norm": 0.6458836197853088, + "learning_rate": 1.929502794928771e-05, + "loss": 1.0373, + "step": 10180 + }, + { + "epoch": 0.1467940129939352, + "grad_norm": 0.6366413831710815, + "learning_rate": 1.9293306162548045e-05, + "loss": 1.0525, + "step": 10190 + }, + { + "epoch": 0.14693806992523445, + "grad_norm": 0.6416839957237244, + "learning_rate": 1.929158235278383e-05, + "loss": 1.0415, + "step": 10200 + }, + { + "epoch": 0.1470821268565337, + "grad_norm": 0.6393887400627136, + "learning_rate": 1.9289856520370313e-05, + "loss": 1.0599, + "step": 10210 + }, + { + "epoch": 0.14722618378783295, + "grad_norm": 0.5655863285064697, + "learning_rate": 1.9288128665683182e-05, + "loss": 1.0683, + "step": 10220 + }, + { + "epoch": 0.1473702407191322, + "grad_norm": 0.5792210698127747, + "learning_rate": 1.9286398789098574e-05, + "loss": 1.0298, + "step": 10230 + }, + { + "epoch": 0.14751429765043145, + "grad_norm": 0.6570830345153809, + "learning_rate": 1.9284666890993055e-05, + "loss": 1.0458, + "step": 10240 + }, + { + "epoch": 0.1476583545817307, + "grad_norm": 0.5635051727294922, + "learning_rate": 1.9282932971743633e-05, + "loss": 1.0381, + "step": 10250 + }, + { + "epoch": 0.14780241151302995, + "grad_norm": 0.6540777683258057, + "learning_rate": 1.9281197031727763e-05, + "loss": 1.0634, + "step": 10260 + }, + { + "epoch": 0.1479464684443292, + "grad_norm": 0.7320727705955505, + "learning_rate": 1.9279459071323334e-05, + "loss": 1.0737, + "step": 10270 + }, + { + "epoch": 0.14809052537562845, + "grad_norm": 0.6890342831611633, + "learning_rate": 1.9277719090908674e-05, + "loss": 1.0184, + "step": 10280 + }, + { + "epoch": 0.1482345823069277, + "grad_norm": 0.6491656303405762, + "learning_rate": 1.9275977090862556e-05, + "loss": 1.0574, + "step": 10290 + }, + { + "epoch": 0.14837863923822694, + "grad_norm": 0.8918922543525696, + "learning_rate": 1.9274233071564187e-05, + "loss": 1.0522, + "step": 10300 + }, + { + "epoch": 0.1485226961695262, + "grad_norm": 0.7084387540817261, + "learning_rate": 1.9272487033393217e-05, + "loss": 1.0371, + "step": 10310 + }, + { + "epoch": 0.14866675310082544, + "grad_norm": 0.7133810520172119, + "learning_rate": 1.927073897672973e-05, + "loss": 1.0353, + "step": 10320 + }, + { + "epoch": 0.1488108100321247, + "grad_norm": 0.6229279041290283, + "learning_rate": 1.926898890195426e-05, + "loss": 1.0335, + "step": 10330 + }, + { + "epoch": 0.14895486696342394, + "grad_norm": 0.5358096957206726, + "learning_rate": 1.9267236809447774e-05, + "loss": 1.0058, + "step": 10340 + }, + { + "epoch": 0.1490989238947232, + "grad_norm": 0.751314103603363, + "learning_rate": 1.9265482699591675e-05, + "loss": 1.0572, + "step": 10350 + }, + { + "epoch": 0.14924298082602244, + "grad_norm": 0.7541031837463379, + "learning_rate": 1.926372657276781e-05, + "loss": 1.0529, + "step": 10360 + }, + { + "epoch": 0.1493870377573217, + "grad_norm": 0.5620636940002441, + "learning_rate": 1.9261968429358462e-05, + "loss": 1.0555, + "step": 10370 + }, + { + "epoch": 0.14953109468862094, + "grad_norm": 0.7588586807250977, + "learning_rate": 1.9260208269746354e-05, + "loss": 1.0538, + "step": 10380 + }, + { + "epoch": 0.1496751516199202, + "grad_norm": 0.608447790145874, + "learning_rate": 1.9258446094314653e-05, + "loss": 1.0565, + "step": 10390 + }, + { + "epoch": 0.14981920855121944, + "grad_norm": 0.6301183700561523, + "learning_rate": 1.9256681903446957e-05, + "loss": 1.0649, + "step": 10400 + }, + { + "epoch": 0.1499632654825187, + "grad_norm": 0.6962247490882874, + "learning_rate": 1.9254915697527307e-05, + "loss": 1.0667, + "step": 10410 + }, + { + "epoch": 0.15010732241381794, + "grad_norm": 0.7227652072906494, + "learning_rate": 1.9253147476940182e-05, + "loss": 1.0578, + "step": 10420 + }, + { + "epoch": 0.1502513793451172, + "grad_norm": 0.7597373723983765, + "learning_rate": 1.9251377242070497e-05, + "loss": 1.0496, + "step": 10430 + }, + { + "epoch": 0.15039543627641644, + "grad_norm": 0.6414986848831177, + "learning_rate": 1.924960499330361e-05, + "loss": 1.0501, + "step": 10440 + }, + { + "epoch": 0.1505394932077157, + "grad_norm": 0.7147359848022461, + "learning_rate": 1.9247830731025315e-05, + "loss": 1.0474, + "step": 10450 + }, + { + "epoch": 0.15068355013901494, + "grad_norm": 0.6228944659233093, + "learning_rate": 1.9246054455621844e-05, + "loss": 1.0527, + "step": 10460 + }, + { + "epoch": 0.1508276070703142, + "grad_norm": 0.5738548636436462, + "learning_rate": 1.9244276167479863e-05, + "loss": 1.0503, + "step": 10470 + }, + { + "epoch": 0.15097166400161344, + "grad_norm": 0.6146776676177979, + "learning_rate": 1.924249586698649e-05, + "loss": 1.0461, + "step": 10480 + }, + { + "epoch": 0.1511157209329127, + "grad_norm": 0.6455808281898499, + "learning_rate": 1.9240713554529264e-05, + "loss": 1.0357, + "step": 10490 + }, + { + "epoch": 0.15125977786421194, + "grad_norm": 0.6520363688468933, + "learning_rate": 1.9238929230496173e-05, + "loss": 1.065, + "step": 10500 + }, + { + "epoch": 0.1514038347955112, + "grad_norm": 0.7376220226287842, + "learning_rate": 1.9237142895275642e-05, + "loss": 1.036, + "step": 10510 + }, + { + "epoch": 0.15154789172681044, + "grad_norm": 0.6889696717262268, + "learning_rate": 1.9235354549256532e-05, + "loss": 1.0455, + "step": 10520 + }, + { + "epoch": 0.1516919486581097, + "grad_norm": 0.7327942848205566, + "learning_rate": 1.923356419282813e-05, + "loss": 1.0619, + "step": 10530 + }, + { + "epoch": 0.15183600558940893, + "grad_norm": 0.6709638237953186, + "learning_rate": 1.9231771826380185e-05, + "loss": 1.0572, + "step": 10540 + }, + { + "epoch": 0.15198006252070817, + "grad_norm": 0.6950246095657349, + "learning_rate": 1.922997745030287e-05, + "loss": 1.0418, + "step": 10550 + }, + { + "epoch": 0.15212411945200743, + "grad_norm": 0.6457738876342773, + "learning_rate": 1.9228181064986785e-05, + "loss": 1.0289, + "step": 10560 + }, + { + "epoch": 0.15226817638330667, + "grad_norm": 0.6225009560585022, + "learning_rate": 1.9226382670822986e-05, + "loss": 1.0484, + "step": 10570 + }, + { + "epoch": 0.15241223331460593, + "grad_norm": 0.6474888324737549, + "learning_rate": 1.922458226820296e-05, + "loss": 1.067, + "step": 10580 + }, + { + "epoch": 0.15255629024590517, + "grad_norm": 0.6236519813537598, + "learning_rate": 1.9222779857518627e-05, + "loss": 1.0439, + "step": 10590 + }, + { + "epoch": 0.15270034717720443, + "grad_norm": 0.6439448595046997, + "learning_rate": 1.9220975439162347e-05, + "loss": 1.0399, + "step": 10600 + }, + { + "epoch": 0.15284440410850367, + "grad_norm": 0.6349307298660278, + "learning_rate": 1.921916901352692e-05, + "loss": 1.0578, + "step": 10610 + }, + { + "epoch": 0.15298846103980293, + "grad_norm": 0.642374575138092, + "learning_rate": 1.921736058100557e-05, + "loss": 1.0714, + "step": 10620 + }, + { + "epoch": 0.15313251797110217, + "grad_norm": 0.8094242811203003, + "learning_rate": 1.921555014199198e-05, + "loss": 1.043, + "step": 10630 + }, + { + "epoch": 0.15327657490240143, + "grad_norm": 0.6853436827659607, + "learning_rate": 1.921373769688025e-05, + "loss": 1.075, + "step": 10640 + }, + { + "epoch": 0.15342063183370067, + "grad_norm": 0.5926113128662109, + "learning_rate": 1.921192324606493e-05, + "loss": 1.0214, + "step": 10650 + }, + { + "epoch": 0.15356468876499993, + "grad_norm": 0.563201367855072, + "learning_rate": 1.9210106789940996e-05, + "loss": 1.0729, + "step": 10660 + }, + { + "epoch": 0.15370874569629916, + "grad_norm": 0.5924573540687561, + "learning_rate": 1.9208288328903867e-05, + "loss": 1.0454, + "step": 10670 + }, + { + "epoch": 0.15385280262759843, + "grad_norm": 0.6228936910629272, + "learning_rate": 1.92064678633494e-05, + "loss": 1.0638, + "step": 10680 + }, + { + "epoch": 0.15399685955889766, + "grad_norm": 0.617900550365448, + "learning_rate": 1.9204645393673882e-05, + "loss": 1.0514, + "step": 10690 + }, + { + "epoch": 0.15414091649019693, + "grad_norm": 0.5415571331977844, + "learning_rate": 1.920282092027404e-05, + "loss": 1.0561, + "step": 10700 + }, + { + "epoch": 0.15428497342149616, + "grad_norm": 0.6749228835105896, + "learning_rate": 1.9200994443547036e-05, + "loss": 1.0568, + "step": 10710 + }, + { + "epoch": 0.15442903035279543, + "grad_norm": 0.6251257061958313, + "learning_rate": 1.9199165963890465e-05, + "loss": 1.0803, + "step": 10720 + }, + { + "epoch": 0.15457308728409466, + "grad_norm": 0.7738173604011536, + "learning_rate": 1.9197335481702374e-05, + "loss": 1.045, + "step": 10730 + }, + { + "epoch": 0.15471714421539393, + "grad_norm": 0.8898658156394958, + "learning_rate": 1.919550299738122e-05, + "loss": 1.027, + "step": 10740 + }, + { + "epoch": 0.15486120114669316, + "grad_norm": 0.7574092745780945, + "learning_rate": 1.9193668511325917e-05, + "loss": 1.0434, + "step": 10750 + }, + { + "epoch": 0.15500525807799242, + "grad_norm": 0.7242676615715027, + "learning_rate": 1.9191832023935805e-05, + "loss": 1.0437, + "step": 10760 + }, + { + "epoch": 0.15514931500929166, + "grad_norm": 0.653777539730072, + "learning_rate": 1.9189993535610664e-05, + "loss": 1.0647, + "step": 10770 + }, + { + "epoch": 0.15529337194059092, + "grad_norm": 0.7875447273254395, + "learning_rate": 1.9188153046750705e-05, + "loss": 1.0201, + "step": 10780 + }, + { + "epoch": 0.15543742887189016, + "grad_norm": 0.7534798979759216, + "learning_rate": 1.9186310557756583e-05, + "loss": 1.0434, + "step": 10790 + }, + { + "epoch": 0.15558148580318942, + "grad_norm": 0.7386358380317688, + "learning_rate": 1.9184466069029373e-05, + "loss": 1.0459, + "step": 10800 + }, + { + "epoch": 0.15572554273448866, + "grad_norm": 0.5817023515701294, + "learning_rate": 1.9182619580970602e-05, + "loss": 1.0513, + "step": 10810 + }, + { + "epoch": 0.15586959966578792, + "grad_norm": 0.7862343788146973, + "learning_rate": 1.918077109398222e-05, + "loss": 1.0559, + "step": 10820 + }, + { + "epoch": 0.15601365659708716, + "grad_norm": 0.7072425484657288, + "learning_rate": 1.917892060846662e-05, + "loss": 1.0439, + "step": 10830 + }, + { + "epoch": 0.15615771352838642, + "grad_norm": 0.6007922291755676, + "learning_rate": 1.917706812482663e-05, + "loss": 1.046, + "step": 10840 + }, + { + "epoch": 0.15630177045968566, + "grad_norm": 0.7150635719299316, + "learning_rate": 1.9175213643465503e-05, + "loss": 1.05, + "step": 10850 + }, + { + "epoch": 0.15644582739098492, + "grad_norm": 0.6381576657295227, + "learning_rate": 1.9173357164786944e-05, + "loss": 1.0467, + "step": 10860 + }, + { + "epoch": 0.15658988432228416, + "grad_norm": 0.5520989298820496, + "learning_rate": 1.9171498689195068e-05, + "loss": 1.0473, + "step": 10870 + }, + { + "epoch": 0.15673394125358342, + "grad_norm": 0.5694143176078796, + "learning_rate": 1.9169638217094454e-05, + "loss": 1.0542, + "step": 10880 + }, + { + "epoch": 0.15687799818488266, + "grad_norm": 0.7016416788101196, + "learning_rate": 1.9167775748890097e-05, + "loss": 1.027, + "step": 10890 + }, + { + "epoch": 0.15702205511618192, + "grad_norm": 0.6645250916481018, + "learning_rate": 1.9165911284987426e-05, + "loss": 1.0427, + "step": 10900 + }, + { + "epoch": 0.15716611204748115, + "grad_norm": 0.7823187708854675, + "learning_rate": 1.916404482579231e-05, + "loss": 1.0617, + "step": 10910 + }, + { + "epoch": 0.15731016897878042, + "grad_norm": 0.6369243264198303, + "learning_rate": 1.916217637171106e-05, + "loss": 1.0344, + "step": 10920 + }, + { + "epoch": 0.15745422591007965, + "grad_norm": 0.5803185105323792, + "learning_rate": 1.9160305923150398e-05, + "loss": 1.0341, + "step": 10930 + }, + { + "epoch": 0.15759828284137892, + "grad_norm": 0.6197068095207214, + "learning_rate": 1.9158433480517508e-05, + "loss": 1.0616, + "step": 10940 + }, + { + "epoch": 0.15774233977267815, + "grad_norm": 0.5247350931167603, + "learning_rate": 1.915655904421999e-05, + "loss": 1.0534, + "step": 10950 + }, + { + "epoch": 0.15788639670397742, + "grad_norm": 0.6269147396087646, + "learning_rate": 1.9154682614665883e-05, + "loss": 1.0323, + "step": 10960 + }, + { + "epoch": 0.15803045363527665, + "grad_norm": 0.640401303768158, + "learning_rate": 1.915280419226366e-05, + "loss": 1.056, + "step": 10970 + }, + { + "epoch": 0.15817451056657592, + "grad_norm": 0.607360303401947, + "learning_rate": 1.9150923777422224e-05, + "loss": 1.0513, + "step": 10980 + }, + { + "epoch": 0.15831856749787515, + "grad_norm": 0.6519664525985718, + "learning_rate": 1.9149041370550922e-05, + "loss": 1.053, + "step": 10990 + }, + { + "epoch": 0.15846262442917441, + "grad_norm": 0.579047441482544, + "learning_rate": 1.9147156972059518e-05, + "loss": 1.0417, + "step": 11000 + }, + { + "epoch": 0.15860668136047365, + "grad_norm": 0.9328927993774414, + "learning_rate": 1.9145270582358234e-05, + "loss": 1.0421, + "step": 11010 + }, + { + "epoch": 0.1587507382917729, + "grad_norm": 0.689149022102356, + "learning_rate": 1.91433822018577e-05, + "loss": 1.0659, + "step": 11020 + }, + { + "epoch": 0.15889479522307215, + "grad_norm": 0.6681456565856934, + "learning_rate": 1.9141491830968996e-05, + "loss": 1.0402, + "step": 11030 + }, + { + "epoch": 0.1590388521543714, + "grad_norm": 0.7554347515106201, + "learning_rate": 1.9139599470103624e-05, + "loss": 1.0461, + "step": 11040 + }, + { + "epoch": 0.15918290908567065, + "grad_norm": 0.5673980116844177, + "learning_rate": 1.913770511967353e-05, + "loss": 1.0614, + "step": 11050 + }, + { + "epoch": 0.1593269660169699, + "grad_norm": 0.5546961426734924, + "learning_rate": 1.913580878009109e-05, + "loss": 1.052, + "step": 11060 + }, + { + "epoch": 0.15947102294826915, + "grad_norm": 0.5435275435447693, + "learning_rate": 1.9133910451769102e-05, + "loss": 1.0521, + "step": 11070 + }, + { + "epoch": 0.1596150798795684, + "grad_norm": 0.6310040950775146, + "learning_rate": 1.9132010135120813e-05, + "loss": 1.0281, + "step": 11080 + }, + { + "epoch": 0.15975913681086765, + "grad_norm": 0.6460842490196228, + "learning_rate": 1.9130107830559895e-05, + "loss": 1.0518, + "step": 11090 + }, + { + "epoch": 0.1599031937421669, + "grad_norm": 0.6667739748954773, + "learning_rate": 1.912820353850045e-05, + "loss": 1.05, + "step": 11100 + }, + { + "epoch": 0.16004725067346615, + "grad_norm": 0.5846189856529236, + "learning_rate": 1.9126297259357018e-05, + "loss": 1.0537, + "step": 11110 + }, + { + "epoch": 0.1601913076047654, + "grad_norm": 0.710723340511322, + "learning_rate": 1.9124388993544568e-05, + "loss": 1.0587, + "step": 11120 + }, + { + "epoch": 0.16033536453606465, + "grad_norm": 0.7449038624763489, + "learning_rate": 1.912247874147851e-05, + "loss": 1.0503, + "step": 11130 + }, + { + "epoch": 0.1604794214673639, + "grad_norm": 0.611016571521759, + "learning_rate": 1.9120566503574675e-05, + "loss": 1.0271, + "step": 11140 + }, + { + "epoch": 0.16062347839866314, + "grad_norm": 0.6310858130455017, + "learning_rate": 1.9118652280249327e-05, + "loss": 1.0511, + "step": 11150 + }, + { + "epoch": 0.1607675353299624, + "grad_norm": 0.6700077056884766, + "learning_rate": 1.911673607191917e-05, + "loss": 1.0441, + "step": 11160 + }, + { + "epoch": 0.16091159226126164, + "grad_norm": 0.6764875054359436, + "learning_rate": 1.9114817879001335e-05, + "loss": 1.0239, + "step": 11170 + }, + { + "epoch": 0.1610556491925609, + "grad_norm": 0.8168401718139648, + "learning_rate": 1.9112897701913387e-05, + "loss": 1.0313, + "step": 11180 + }, + { + "epoch": 0.16119970612386014, + "grad_norm": 0.6589007377624512, + "learning_rate": 1.9110975541073322e-05, + "loss": 1.072, + "step": 11190 + }, + { + "epoch": 0.1613437630551594, + "grad_norm": 0.577829122543335, + "learning_rate": 1.9109051396899567e-05, + "loss": 1.0613, + "step": 11200 + }, + { + "epoch": 0.16148781998645864, + "grad_norm": 0.677116334438324, + "learning_rate": 1.9107125269810985e-05, + "loss": 1.0231, + "step": 11210 + }, + { + "epoch": 0.1616318769177579, + "grad_norm": 0.622451663017273, + "learning_rate": 1.9105197160226863e-05, + "loss": 1.0336, + "step": 11220 + }, + { + "epoch": 0.16177593384905714, + "grad_norm": 0.7007042169570923, + "learning_rate": 1.9103267068566924e-05, + "loss": 1.0192, + "step": 11230 + }, + { + "epoch": 0.1619199907803564, + "grad_norm": 0.5768957138061523, + "learning_rate": 1.9101334995251325e-05, + "loss": 1.0286, + "step": 11240 + }, + { + "epoch": 0.16206404771165564, + "grad_norm": 0.746347963809967, + "learning_rate": 1.909940094070065e-05, + "loss": 1.0645, + "step": 11250 + }, + { + "epoch": 0.1622081046429549, + "grad_norm": 0.6460885405540466, + "learning_rate": 1.909746490533592e-05, + "loss": 1.0527, + "step": 11260 + }, + { + "epoch": 0.16235216157425414, + "grad_norm": 0.7336540818214417, + "learning_rate": 1.9095526889578577e-05, + "loss": 1.0432, + "step": 11270 + }, + { + "epoch": 0.1624962185055534, + "grad_norm": 0.6185027360916138, + "learning_rate": 1.9093586893850503e-05, + "loss": 1.025, + "step": 11280 + }, + { + "epoch": 0.16264027543685264, + "grad_norm": 0.7867199182510376, + "learning_rate": 1.909164491857401e-05, + "loss": 1.046, + "step": 11290 + }, + { + "epoch": 0.1627843323681519, + "grad_norm": 0.6278853416442871, + "learning_rate": 1.9089700964171832e-05, + "loss": 1.0176, + "step": 11300 + }, + { + "epoch": 0.16292838929945114, + "grad_norm": 0.6531842350959778, + "learning_rate": 1.9087755031067153e-05, + "loss": 1.0257, + "step": 11310 + }, + { + "epoch": 0.1630724462307504, + "grad_norm": 0.6189087629318237, + "learning_rate": 1.9085807119683565e-05, + "loss": 1.0491, + "step": 11320 + }, + { + "epoch": 0.16321650316204964, + "grad_norm": 0.6218734383583069, + "learning_rate": 1.908385723044511e-05, + "loss": 1.0469, + "step": 11330 + }, + { + "epoch": 0.1633605600933489, + "grad_norm": 0.6597093343734741, + "learning_rate": 1.9081905363776244e-05, + "loss": 1.0381, + "step": 11340 + }, + { + "epoch": 0.16350461702464814, + "grad_norm": 0.5974107384681702, + "learning_rate": 1.9079951520101868e-05, + "loss": 1.0585, + "step": 11350 + }, + { + "epoch": 0.1636486739559474, + "grad_norm": 0.5562683939933777, + "learning_rate": 1.9077995699847304e-05, + "loss": 1.0231, + "step": 11360 + }, + { + "epoch": 0.16379273088724663, + "grad_norm": 0.6765367388725281, + "learning_rate": 1.9076037903438304e-05, + "loss": 1.0318, + "step": 11370 + }, + { + "epoch": 0.1639367878185459, + "grad_norm": 0.6025965809822083, + "learning_rate": 1.9074078131301058e-05, + "loss": 1.0349, + "step": 11380 + }, + { + "epoch": 0.16408084474984513, + "grad_norm": 0.7687731981277466, + "learning_rate": 1.907211638386218e-05, + "loss": 1.0488, + "step": 11390 + }, + { + "epoch": 0.1642249016811444, + "grad_norm": 0.6119621992111206, + "learning_rate": 1.9070152661548715e-05, + "loss": 1.0253, + "step": 11400 + }, + { + "epoch": 0.16436895861244363, + "grad_norm": 0.5947549939155579, + "learning_rate": 1.906818696478814e-05, + "loss": 1.0422, + "step": 11410 + }, + { + "epoch": 0.1645130155437429, + "grad_norm": 0.6977937817573547, + "learning_rate": 1.906621929400836e-05, + "loss": 1.0276, + "step": 11420 + }, + { + "epoch": 0.16465707247504213, + "grad_norm": 0.6394926309585571, + "learning_rate": 1.9064249649637703e-05, + "loss": 1.0176, + "step": 11430 + }, + { + "epoch": 0.1648011294063414, + "grad_norm": 0.7498409748077393, + "learning_rate": 1.9062278032104943e-05, + "loss": 1.0383, + "step": 11440 + }, + { + "epoch": 0.16494518633764063, + "grad_norm": 0.675934374332428, + "learning_rate": 1.9060304441839265e-05, + "loss": 1.031, + "step": 11450 + }, + { + "epoch": 0.1650892432689399, + "grad_norm": 0.7206798791885376, + "learning_rate": 1.90583288792703e-05, + "loss": 1.0612, + "step": 11460 + }, + { + "epoch": 0.16523330020023913, + "grad_norm": 0.6218888759613037, + "learning_rate": 1.9056351344828097e-05, + "loss": 1.0413, + "step": 11470 + }, + { + "epoch": 0.1653773571315384, + "grad_norm": 0.6903321146965027, + "learning_rate": 1.905437183894314e-05, + "loss": 1.0694, + "step": 11480 + }, + { + "epoch": 0.16552141406283763, + "grad_norm": 0.6396926045417786, + "learning_rate": 1.905239036204634e-05, + "loss": 1.0397, + "step": 11490 + }, + { + "epoch": 0.1656654709941369, + "grad_norm": 0.6658194065093994, + "learning_rate": 1.9050406914569032e-05, + "loss": 1.0303, + "step": 11500 + }, + { + "epoch": 0.16580952792543613, + "grad_norm": 0.537991464138031, + "learning_rate": 1.904842149694299e-05, + "loss": 1.0419, + "step": 11510 + }, + { + "epoch": 0.1659535848567354, + "grad_norm": 0.7544538378715515, + "learning_rate": 1.9046434109600413e-05, + "loss": 1.0099, + "step": 11520 + }, + { + "epoch": 0.16609764178803463, + "grad_norm": 0.6946489810943604, + "learning_rate": 1.9044444752973927e-05, + "loss": 1.0324, + "step": 11530 + }, + { + "epoch": 0.1662416987193339, + "grad_norm": 0.6735936999320984, + "learning_rate": 1.9042453427496584e-05, + "loss": 1.0402, + "step": 11540 + }, + { + "epoch": 0.16638575565063313, + "grad_norm": 0.6542741656303406, + "learning_rate": 1.904046013360187e-05, + "loss": 1.0545, + "step": 11550 + }, + { + "epoch": 0.1665298125819324, + "grad_norm": 0.5523136258125305, + "learning_rate": 1.90384648717237e-05, + "loss": 1.0371, + "step": 11560 + }, + { + "epoch": 0.16667386951323163, + "grad_norm": 0.7047058343887329, + "learning_rate": 1.9036467642296413e-05, + "loss": 1.0344, + "step": 11570 + }, + { + "epoch": 0.1668179264445309, + "grad_norm": 0.5968069434165955, + "learning_rate": 1.9034468445754776e-05, + "loss": 1.0649, + "step": 11580 + }, + { + "epoch": 0.16696198337583013, + "grad_norm": 0.648099422454834, + "learning_rate": 1.9032467282533993e-05, + "loss": 1.041, + "step": 11590 + }, + { + "epoch": 0.1671060403071294, + "grad_norm": 0.8043709397315979, + "learning_rate": 1.9030464153069684e-05, + "loss": 1.0628, + "step": 11600 + }, + { + "epoch": 0.16725009723842862, + "grad_norm": 0.678976833820343, + "learning_rate": 1.90284590577979e-05, + "loss": 1.0556, + "step": 11610 + }, + { + "epoch": 0.1673941541697279, + "grad_norm": 0.6578510999679565, + "learning_rate": 1.902645199715513e-05, + "loss": 1.0441, + "step": 11620 + }, + { + "epoch": 0.16753821110102712, + "grad_norm": 0.711756706237793, + "learning_rate": 1.9024442971578282e-05, + "loss": 1.0566, + "step": 11630 + }, + { + "epoch": 0.1676822680323264, + "grad_norm": 0.6507493853569031, + "learning_rate": 1.902243198150469e-05, + "loss": 1.0528, + "step": 11640 + }, + { + "epoch": 0.16782632496362562, + "grad_norm": 0.6045960783958435, + "learning_rate": 1.9020419027372117e-05, + "loss": 1.058, + "step": 11650 + }, + { + "epoch": 0.1679703818949249, + "grad_norm": 0.6027932167053223, + "learning_rate": 1.9018404109618764e-05, + "loss": 1.0447, + "step": 11660 + }, + { + "epoch": 0.16811443882622412, + "grad_norm": 0.6618443131446838, + "learning_rate": 1.9016387228683243e-05, + "loss": 1.0531, + "step": 11670 + }, + { + "epoch": 0.16825849575752339, + "grad_norm": 0.6192331314086914, + "learning_rate": 1.9014368385004604e-05, + "loss": 1.0429, + "step": 11680 + }, + { + "epoch": 0.16840255268882262, + "grad_norm": 0.6641239523887634, + "learning_rate": 1.901234757902232e-05, + "loss": 1.0478, + "step": 11690 + }, + { + "epoch": 0.16854660962012188, + "grad_norm": 0.8388392329216003, + "learning_rate": 1.9010324811176293e-05, + "loss": 1.029, + "step": 11700 + }, + { + "epoch": 0.16869066655142112, + "grad_norm": 0.6802681684494019, + "learning_rate": 1.900830008190685e-05, + "loss": 1.034, + "step": 11710 + }, + { + "epoch": 0.16883472348272038, + "grad_norm": 0.7114946246147156, + "learning_rate": 1.9006273391654754e-05, + "loss": 1.0461, + "step": 11720 + }, + { + "epoch": 0.16897878041401962, + "grad_norm": 0.6896271109580994, + "learning_rate": 1.9004244740861176e-05, + "loss": 1.0505, + "step": 11730 + }, + { + "epoch": 0.16912283734531888, + "grad_norm": 0.6172512173652649, + "learning_rate": 1.9002214129967737e-05, + "loss": 1.0593, + "step": 11740 + }, + { + "epoch": 0.16926689427661812, + "grad_norm": 0.5373944640159607, + "learning_rate": 1.9000181559416465e-05, + "loss": 1.0364, + "step": 11750 + }, + { + "epoch": 0.16941095120791738, + "grad_norm": 0.6389153003692627, + "learning_rate": 1.8998147029649828e-05, + "loss": 1.0255, + "step": 11760 + }, + { + "epoch": 0.16955500813921662, + "grad_norm": 0.5937640070915222, + "learning_rate": 1.8996110541110708e-05, + "loss": 1.0388, + "step": 11770 + }, + { + "epoch": 0.16969906507051588, + "grad_norm": 0.6417317986488342, + "learning_rate": 1.8994072094242426e-05, + "loss": 1.0438, + "step": 11780 + }, + { + "epoch": 0.16984312200181512, + "grad_norm": 0.751429557800293, + "learning_rate": 1.8992031689488724e-05, + "loss": 1.0434, + "step": 11790 + }, + { + "epoch": 0.16998717893311438, + "grad_norm": 0.6613959670066833, + "learning_rate": 1.8989989327293767e-05, + "loss": 1.0347, + "step": 11800 + }, + { + "epoch": 0.17013123586441362, + "grad_norm": 0.6063255667686462, + "learning_rate": 1.898794500810215e-05, + "loss": 1.0476, + "step": 11810 + }, + { + "epoch": 0.17027529279571288, + "grad_norm": 0.6742021441459656, + "learning_rate": 1.8985898732358894e-05, + "loss": 1.0263, + "step": 11820 + }, + { + "epoch": 0.17041934972701211, + "grad_norm": 0.6540886163711548, + "learning_rate": 1.8983850500509447e-05, + "loss": 1.0431, + "step": 11830 + }, + { + "epoch": 0.17056340665831138, + "grad_norm": 0.6701706647872925, + "learning_rate": 1.8981800312999675e-05, + "loss": 1.0553, + "step": 11840 + }, + { + "epoch": 0.17070746358961061, + "grad_norm": 0.5419948101043701, + "learning_rate": 1.897974817027588e-05, + "loss": 1.0466, + "step": 11850 + }, + { + "epoch": 0.17085152052090985, + "grad_norm": 0.6257351636886597, + "learning_rate": 1.8977694072784782e-05, + "loss": 1.0502, + "step": 11860 + }, + { + "epoch": 0.1709955774522091, + "grad_norm": 0.6071768999099731, + "learning_rate": 1.8975638020973534e-05, + "loss": 1.0512, + "step": 11870 + }, + { + "epoch": 0.17113963438350835, + "grad_norm": 0.7279354333877563, + "learning_rate": 1.8973580015289706e-05, + "loss": 1.054, + "step": 11880 + }, + { + "epoch": 0.1712836913148076, + "grad_norm": 0.6097837090492249, + "learning_rate": 1.8971520056181304e-05, + "loss": 1.0842, + "step": 11890 + }, + { + "epoch": 0.17142774824610685, + "grad_norm": 0.6748334169387817, + "learning_rate": 1.896945814409674e-05, + "loss": 1.0441, + "step": 11900 + }, + { + "epoch": 0.1715718051774061, + "grad_norm": 0.595467746257782, + "learning_rate": 1.8967394279484878e-05, + "loss": 1.0301, + "step": 11910 + }, + { + "epoch": 0.17171586210870535, + "grad_norm": 0.5484843254089355, + "learning_rate": 1.8965328462794987e-05, + "loss": 1.0407, + "step": 11920 + }, + { + "epoch": 0.1718599190400046, + "grad_norm": 0.7679455280303955, + "learning_rate": 1.8963260694476763e-05, + "loss": 1.0348, + "step": 11930 + }, + { + "epoch": 0.17200397597130385, + "grad_norm": 0.5871204137802124, + "learning_rate": 1.8961190974980333e-05, + "loss": 1.063, + "step": 11940 + }, + { + "epoch": 0.1721480329026031, + "grad_norm": 0.6413636207580566, + "learning_rate": 1.8959119304756252e-05, + "loss": 1.0446, + "step": 11950 + }, + { + "epoch": 0.17229208983390235, + "grad_norm": 0.5831906199455261, + "learning_rate": 1.895704568425549e-05, + "loss": 1.0469, + "step": 11960 + }, + { + "epoch": 0.1724361467652016, + "grad_norm": 0.5874209403991699, + "learning_rate": 1.895497011392944e-05, + "loss": 1.0436, + "step": 11970 + }, + { + "epoch": 0.17258020369650084, + "grad_norm": 0.7418249845504761, + "learning_rate": 1.895289259422993e-05, + "loss": 1.0553, + "step": 11980 + }, + { + "epoch": 0.1727242606278001, + "grad_norm": 0.46058523654937744, + "learning_rate": 1.895081312560921e-05, + "loss": 1.0286, + "step": 11990 + }, + { + "epoch": 0.17286831755909934, + "grad_norm": 0.6460348963737488, + "learning_rate": 1.8948731708519952e-05, + "loss": 1.0266, + "step": 12000 + }, + { + "epoch": 0.1730123744903986, + "grad_norm": 0.6433600187301636, + "learning_rate": 1.8946648343415245e-05, + "loss": 1.0391, + "step": 12010 + }, + { + "epoch": 0.17315643142169784, + "grad_norm": 0.693173348903656, + "learning_rate": 1.8944563030748614e-05, + "loss": 1.0244, + "step": 12020 + }, + { + "epoch": 0.1733004883529971, + "grad_norm": 0.7197710275650024, + "learning_rate": 1.8942475770974002e-05, + "loss": 1.0269, + "step": 12030 + }, + { + "epoch": 0.17344454528429634, + "grad_norm": 0.8151099681854248, + "learning_rate": 1.8940386564545773e-05, + "loss": 1.0449, + "step": 12040 + }, + { + "epoch": 0.1735886022155956, + "grad_norm": 0.6173912882804871, + "learning_rate": 1.8938295411918727e-05, + "loss": 1.0361, + "step": 12050 + }, + { + "epoch": 0.17373265914689484, + "grad_norm": 0.5067716240882874, + "learning_rate": 1.893620231354807e-05, + "loss": 1.0561, + "step": 12060 + }, + { + "epoch": 0.1738767160781941, + "grad_norm": 0.5978410840034485, + "learning_rate": 1.8934107269889442e-05, + "loss": 1.0561, + "step": 12070 + }, + { + "epoch": 0.17402077300949334, + "grad_norm": 0.567631721496582, + "learning_rate": 1.8932010281398912e-05, + "loss": 1.0361, + "step": 12080 + }, + { + "epoch": 0.1741648299407926, + "grad_norm": 0.5313737392425537, + "learning_rate": 1.892991134853296e-05, + "loss": 1.0588, + "step": 12090 + }, + { + "epoch": 0.17430888687209184, + "grad_norm": 0.580583930015564, + "learning_rate": 1.8927810471748494e-05, + "loss": 1.0368, + "step": 12100 + }, + { + "epoch": 0.1744529438033911, + "grad_norm": 0.589990496635437, + "learning_rate": 1.892570765150285e-05, + "loss": 1.058, + "step": 12110 + }, + { + "epoch": 0.17459700073469034, + "grad_norm": 0.6194111704826355, + "learning_rate": 1.8923602888253777e-05, + "loss": 1.0588, + "step": 12120 + }, + { + "epoch": 0.1747410576659896, + "grad_norm": 0.5884047746658325, + "learning_rate": 1.892149618245946e-05, + "loss": 1.047, + "step": 12130 + }, + { + "epoch": 0.17488511459728884, + "grad_norm": 0.7635648846626282, + "learning_rate": 1.8919387534578493e-05, + "loss": 1.0481, + "step": 12140 + }, + { + "epoch": 0.1750291715285881, + "grad_norm": 0.6717743277549744, + "learning_rate": 1.8917276945069903e-05, + "loss": 1.0641, + "step": 12150 + }, + { + "epoch": 0.17517322845988734, + "grad_norm": 0.6033238768577576, + "learning_rate": 1.891516441439314e-05, + "loss": 1.0463, + "step": 12160 + }, + { + "epoch": 0.1753172853911866, + "grad_norm": 0.633753776550293, + "learning_rate": 1.8913049943008063e-05, + "loss": 1.0293, + "step": 12170 + }, + { + "epoch": 0.17546134232248584, + "grad_norm": 0.5848496556282043, + "learning_rate": 1.891093353137497e-05, + "loss": 1.0271, + "step": 12180 + }, + { + "epoch": 0.1756053992537851, + "grad_norm": 0.6577739119529724, + "learning_rate": 1.8908815179954578e-05, + "loss": 1.0413, + "step": 12190 + }, + { + "epoch": 0.17574945618508434, + "grad_norm": 0.6133542060852051, + "learning_rate": 1.890669488920802e-05, + "loss": 1.0549, + "step": 12200 + }, + { + "epoch": 0.1758935131163836, + "grad_norm": 0.7499854564666748, + "learning_rate": 1.8904572659596843e-05, + "loss": 1.0407, + "step": 12210 + }, + { + "epoch": 0.17603757004768283, + "grad_norm": 0.5730758309364319, + "learning_rate": 1.8902448491583044e-05, + "loss": 1.023, + "step": 12220 + }, + { + "epoch": 0.1761816269789821, + "grad_norm": 0.6072816252708435, + "learning_rate": 1.8900322385629015e-05, + "loss": 1.0161, + "step": 12230 + }, + { + "epoch": 0.17632568391028133, + "grad_norm": 0.5822769999504089, + "learning_rate": 1.8898194342197582e-05, + "loss": 1.0215, + "step": 12240 + }, + { + "epoch": 0.1764697408415806, + "grad_norm": 0.6300902962684631, + "learning_rate": 1.8896064361751995e-05, + "loss": 1.0528, + "step": 12250 + }, + { + "epoch": 0.17661379777287983, + "grad_norm": 0.5611024498939514, + "learning_rate": 1.8893932444755916e-05, + "loss": 1.0572, + "step": 12260 + }, + { + "epoch": 0.1767578547041791, + "grad_norm": 0.6063017249107361, + "learning_rate": 1.8891798591673434e-05, + "loss": 1.0583, + "step": 12270 + }, + { + "epoch": 0.17690191163547833, + "grad_norm": 0.6732243895530701, + "learning_rate": 1.8889662802969063e-05, + "loss": 1.0525, + "step": 12280 + }, + { + "epoch": 0.1770459685667776, + "grad_norm": 0.7215601801872253, + "learning_rate": 1.888752507910773e-05, + "loss": 1.0349, + "step": 12290 + }, + { + "epoch": 0.17719002549807683, + "grad_norm": 0.6400814652442932, + "learning_rate": 1.8885385420554795e-05, + "loss": 1.0388, + "step": 12300 + }, + { + "epoch": 0.1773340824293761, + "grad_norm": 0.5283488631248474, + "learning_rate": 1.8883243827776026e-05, + "loss": 1.0505, + "step": 12310 + }, + { + "epoch": 0.17747813936067533, + "grad_norm": 0.5196974873542786, + "learning_rate": 1.888110030123762e-05, + "loss": 1.0404, + "step": 12320 + }, + { + "epoch": 0.1776221962919746, + "grad_norm": 0.6163458824157715, + "learning_rate": 1.8878954841406193e-05, + "loss": 1.0391, + "step": 12330 + }, + { + "epoch": 0.17776625322327383, + "grad_norm": 0.6230759024620056, + "learning_rate": 1.887680744874878e-05, + "loss": 1.0292, + "step": 12340 + }, + { + "epoch": 0.1779103101545731, + "grad_norm": 0.6491036415100098, + "learning_rate": 1.8874658123732844e-05, + "loss": 0.9997, + "step": 12350 + }, + { + "epoch": 0.17805436708587233, + "grad_norm": 0.5875597596168518, + "learning_rate": 1.887250686682626e-05, + "loss": 1.0458, + "step": 12360 + }, + { + "epoch": 0.1781984240171716, + "grad_norm": 0.6479558348655701, + "learning_rate": 1.8870353678497327e-05, + "loss": 1.0612, + "step": 12370 + }, + { + "epoch": 0.17834248094847083, + "grad_norm": 0.6008232235908508, + "learning_rate": 1.8868198559214765e-05, + "loss": 1.0379, + "step": 12380 + }, + { + "epoch": 0.1784865378797701, + "grad_norm": 0.660557746887207, + "learning_rate": 1.8866041509447713e-05, + "loss": 1.059, + "step": 12390 + }, + { + "epoch": 0.17863059481106933, + "grad_norm": 0.7118351459503174, + "learning_rate": 1.8863882529665733e-05, + "loss": 1.0586, + "step": 12400 + }, + { + "epoch": 0.1787746517423686, + "grad_norm": 0.5953468084335327, + "learning_rate": 1.88617216203388e-05, + "loss": 1.0099, + "step": 12410 + }, + { + "epoch": 0.17891870867366783, + "grad_norm": 0.7735770344734192, + "learning_rate": 1.885955878193732e-05, + "loss": 1.0581, + "step": 12420 + }, + { + "epoch": 0.1790627656049671, + "grad_norm": 0.5638709664344788, + "learning_rate": 1.8857394014932114e-05, + "loss": 1.0236, + "step": 12430 + }, + { + "epoch": 0.17920682253626632, + "grad_norm": 0.6094455122947693, + "learning_rate": 1.8855227319794415e-05, + "loss": 1.049, + "step": 12440 + }, + { + "epoch": 0.1793508794675656, + "grad_norm": 0.6717058420181274, + "learning_rate": 1.8853058696995886e-05, + "loss": 1.031, + "step": 12450 + }, + { + "epoch": 0.17949493639886482, + "grad_norm": 0.6776756644248962, + "learning_rate": 1.885088814700861e-05, + "loss": 1.0461, + "step": 12460 + }, + { + "epoch": 0.1796389933301641, + "grad_norm": 0.5989999175071716, + "learning_rate": 1.884871567030508e-05, + "loss": 1.057, + "step": 12470 + }, + { + "epoch": 0.17978305026146332, + "grad_norm": 0.502468466758728, + "learning_rate": 1.8846541267358217e-05, + "loss": 1.0272, + "step": 12480 + }, + { + "epoch": 0.1799271071927626, + "grad_norm": 0.5758592486381531, + "learning_rate": 1.884436493864136e-05, + "loss": 1.05, + "step": 12490 + }, + { + "epoch": 0.18007116412406182, + "grad_norm": 0.5701432824134827, + "learning_rate": 1.884218668462826e-05, + "loss": 1.0405, + "step": 12500 + }, + { + "epoch": 0.18021522105536109, + "grad_norm": 0.6102204322814941, + "learning_rate": 1.88400065057931e-05, + "loss": 1.0431, + "step": 12510 + }, + { + "epoch": 0.18035927798666032, + "grad_norm": 0.6020178198814392, + "learning_rate": 1.883782440261047e-05, + "loss": 1.0329, + "step": 12520 + }, + { + "epoch": 0.18050333491795958, + "grad_norm": 0.7687045931816101, + "learning_rate": 1.8835640375555387e-05, + "loss": 1.0373, + "step": 12530 + }, + { + "epoch": 0.18064739184925882, + "grad_norm": 0.5445989370346069, + "learning_rate": 1.883345442510328e-05, + "loss": 1.0317, + "step": 12540 + }, + { + "epoch": 0.18079144878055808, + "grad_norm": 0.5568065047264099, + "learning_rate": 1.8831266551730002e-05, + "loss": 1.0345, + "step": 12550 + }, + { + "epoch": 0.18093550571185732, + "grad_norm": 0.7024680376052856, + "learning_rate": 1.8829076755911826e-05, + "loss": 1.0545, + "step": 12560 + }, + { + "epoch": 0.18107956264315658, + "grad_norm": 0.608268678188324, + "learning_rate": 1.8826885038125437e-05, + "loss": 1.027, + "step": 12570 + }, + { + "epoch": 0.18122361957445582, + "grad_norm": 0.6229660511016846, + "learning_rate": 1.882469139884794e-05, + "loss": 1.0678, + "step": 12580 + }, + { + "epoch": 0.18136767650575508, + "grad_norm": 0.623124361038208, + "learning_rate": 1.8822495838556866e-05, + "loss": 1.0501, + "step": 12590 + }, + { + "epoch": 0.18151173343705432, + "grad_norm": 0.6334506869316101, + "learning_rate": 1.882029835773015e-05, + "loss": 1.0186, + "step": 12600 + }, + { + "epoch": 0.18165579036835358, + "grad_norm": 0.6877279877662659, + "learning_rate": 1.8818098956846157e-05, + "loss": 1.05, + "step": 12610 + }, + { + "epoch": 0.18179984729965282, + "grad_norm": 0.6072930693626404, + "learning_rate": 1.881589763638367e-05, + "loss": 1.0413, + "step": 12620 + }, + { + "epoch": 0.18194390423095208, + "grad_norm": 0.5373785495758057, + "learning_rate": 1.881369439682188e-05, + "loss": 1.034, + "step": 12630 + }, + { + "epoch": 0.18208796116225132, + "grad_norm": 0.5830142498016357, + "learning_rate": 1.8811489238640407e-05, + "loss": 1.0197, + "step": 12640 + }, + { + "epoch": 0.18223201809355058, + "grad_norm": 0.5567898154258728, + "learning_rate": 1.8809282162319282e-05, + "loss": 1.0145, + "step": 12650 + }, + { + "epoch": 0.18237607502484982, + "grad_norm": 0.6073896288871765, + "learning_rate": 1.8807073168338953e-05, + "loss": 1.012, + "step": 12660 + }, + { + "epoch": 0.18252013195614908, + "grad_norm": 0.5818758606910706, + "learning_rate": 1.8804862257180287e-05, + "loss": 1.0529, + "step": 12670 + }, + { + "epoch": 0.18266418888744831, + "grad_norm": 0.580693781375885, + "learning_rate": 1.8802649429324574e-05, + "loss": 1.0249, + "step": 12680 + }, + { + "epoch": 0.18280824581874758, + "grad_norm": 0.8039982318878174, + "learning_rate": 1.8800434685253514e-05, + "loss": 1.044, + "step": 12690 + }, + { + "epoch": 0.1829523027500468, + "grad_norm": 0.6613562703132629, + "learning_rate": 1.8798218025449222e-05, + "loss": 1.0332, + "step": 12700 + }, + { + "epoch": 0.18309635968134608, + "grad_norm": 0.5530600547790527, + "learning_rate": 1.8795999450394237e-05, + "loss": 1.0442, + "step": 12710 + }, + { + "epoch": 0.1832404166126453, + "grad_norm": 0.7691448926925659, + "learning_rate": 1.8793778960571516e-05, + "loss": 1.0533, + "step": 12720 + }, + { + "epoch": 0.18338447354394458, + "grad_norm": 0.6959360241889954, + "learning_rate": 1.879155655646442e-05, + "loss": 1.0444, + "step": 12730 + }, + { + "epoch": 0.1835285304752438, + "grad_norm": 0.7252940535545349, + "learning_rate": 1.8789332238556747e-05, + "loss": 1.0378, + "step": 12740 + }, + { + "epoch": 0.18367258740654308, + "grad_norm": 0.6496034264564514, + "learning_rate": 1.878710600733269e-05, + "loss": 1.0511, + "step": 12750 + }, + { + "epoch": 0.1838166443378423, + "grad_norm": 0.6047234535217285, + "learning_rate": 1.878487786327688e-05, + "loss": 1.0383, + "step": 12760 + }, + { + "epoch": 0.18396070126914157, + "grad_norm": 0.6182611584663391, + "learning_rate": 1.878264780687434e-05, + "loss": 1.0238, + "step": 12770 + }, + { + "epoch": 0.1841047582004408, + "grad_norm": 0.6061338186264038, + "learning_rate": 1.878041583861053e-05, + "loss": 1.0426, + "step": 12780 + }, + { + "epoch": 0.18424881513174007, + "grad_norm": 0.6683133840560913, + "learning_rate": 1.877818195897132e-05, + "loss": 1.0447, + "step": 12790 + }, + { + "epoch": 0.1843928720630393, + "grad_norm": 0.5210486054420471, + "learning_rate": 1.877594616844299e-05, + "loss": 1.0325, + "step": 12800 + }, + { + "epoch": 0.18453692899433857, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.8773708467512247e-05, + "loss": 1.0367, + "step": 12810 + }, + { + "epoch": 0.1846809859256378, + "grad_norm": 0.6415544748306274, + "learning_rate": 1.8771468856666203e-05, + "loss": 1.0507, + "step": 12820 + }, + { + "epoch": 0.18482504285693707, + "grad_norm": 0.5568451881408691, + "learning_rate": 1.876922733639239e-05, + "loss": 1.0534, + "step": 12830 + }, + { + "epoch": 0.1849690997882363, + "grad_norm": 0.6413741707801819, + "learning_rate": 1.8766983907178756e-05, + "loss": 1.0538, + "step": 12840 + }, + { + "epoch": 0.18511315671953557, + "grad_norm": 0.6150505542755127, + "learning_rate": 1.876473856951367e-05, + "loss": 1.0401, + "step": 12850 + }, + { + "epoch": 0.1852572136508348, + "grad_norm": 1.147980809211731, + "learning_rate": 1.8762491323885904e-05, + "loss": 1.0432, + "step": 12860 + }, + { + "epoch": 0.18540127058213407, + "grad_norm": 0.6297221779823303, + "learning_rate": 1.8760242170784656e-05, + "loss": 1.0491, + "step": 12870 + }, + { + "epoch": 0.1855453275134333, + "grad_norm": 0.6736859083175659, + "learning_rate": 1.8757991110699537e-05, + "loss": 1.0353, + "step": 12880 + }, + { + "epoch": 0.18568938444473257, + "grad_norm": 0.6270202994346619, + "learning_rate": 1.8755738144120567e-05, + "loss": 1.0411, + "step": 12890 + }, + { + "epoch": 0.1858334413760318, + "grad_norm": 0.5883143544197083, + "learning_rate": 1.875348327153819e-05, + "loss": 1.047, + "step": 12900 + }, + { + "epoch": 0.18597749830733107, + "grad_norm": 0.9572540521621704, + "learning_rate": 1.8751226493443257e-05, + "loss": 1.0395, + "step": 12910 + }, + { + "epoch": 0.1861215552386303, + "grad_norm": 0.6460018157958984, + "learning_rate": 1.874896781032704e-05, + "loss": 1.0468, + "step": 12920 + }, + { + "epoch": 0.18626561216992957, + "grad_norm": 0.6704892516136169, + "learning_rate": 1.8746707222681225e-05, + "loss": 1.0566, + "step": 12930 + }, + { + "epoch": 0.1864096691012288, + "grad_norm": 0.6742843389511108, + "learning_rate": 1.8744444730997907e-05, + "loss": 1.0416, + "step": 12940 + }, + { + "epoch": 0.18655372603252807, + "grad_norm": 0.6100138425827026, + "learning_rate": 1.8742180335769603e-05, + "loss": 1.0472, + "step": 12950 + }, + { + "epoch": 0.1866977829638273, + "grad_norm": 0.7344133853912354, + "learning_rate": 1.8739914037489236e-05, + "loss": 1.0576, + "step": 12960 + }, + { + "epoch": 0.18684183989512657, + "grad_norm": 0.8023427724838257, + "learning_rate": 1.873764583665015e-05, + "loss": 1.0604, + "step": 12970 + }, + { + "epoch": 0.1869858968264258, + "grad_norm": 0.5891733765602112, + "learning_rate": 1.8735375733746104e-05, + "loss": 1.0359, + "step": 12980 + }, + { + "epoch": 0.18712995375772506, + "grad_norm": 0.6603121161460876, + "learning_rate": 1.873310372927126e-05, + "loss": 1.0515, + "step": 12990 + }, + { + "epoch": 0.1872740106890243, + "grad_norm": 0.5632944703102112, + "learning_rate": 1.8730829823720215e-05, + "loss": 1.0353, + "step": 13000 + }, + { + "epoch": 0.18741806762032356, + "grad_norm": 0.548271894454956, + "learning_rate": 1.8728554017587954e-05, + "loss": 1.0416, + "step": 13010 + }, + { + "epoch": 0.1875621245516228, + "grad_norm": 0.6604188680648804, + "learning_rate": 1.8726276311369896e-05, + "loss": 1.0312, + "step": 13020 + }, + { + "epoch": 0.18770618148292206, + "grad_norm": 0.6293408870697021, + "learning_rate": 1.8723996705561865e-05, + "loss": 1.047, + "step": 13030 + }, + { + "epoch": 0.1878502384142213, + "grad_norm": 0.6757556796073914, + "learning_rate": 1.8721715200660094e-05, + "loss": 1.0323, + "step": 13040 + }, + { + "epoch": 0.18799429534552056, + "grad_norm": 0.5530831813812256, + "learning_rate": 1.8719431797161244e-05, + "loss": 1.0652, + "step": 13050 + }, + { + "epoch": 0.1881383522768198, + "grad_norm": 0.6663712859153748, + "learning_rate": 1.8717146495562372e-05, + "loss": 1.0395, + "step": 13060 + }, + { + "epoch": 0.18828240920811906, + "grad_norm": 0.5699810981750488, + "learning_rate": 1.8714859296360966e-05, + "loss": 1.0316, + "step": 13070 + }, + { + "epoch": 0.1884264661394183, + "grad_norm": 0.5460770726203918, + "learning_rate": 1.871257020005491e-05, + "loss": 1.0713, + "step": 13080 + }, + { + "epoch": 0.18857052307071756, + "grad_norm": 0.6627286076545715, + "learning_rate": 1.8710279207142516e-05, + "loss": 1.0467, + "step": 13090 + }, + { + "epoch": 0.1887145800020168, + "grad_norm": 0.760555624961853, + "learning_rate": 1.870798631812249e-05, + "loss": 1.0621, + "step": 13100 + }, + { + "epoch": 0.18885863693331606, + "grad_norm": 0.6630510687828064, + "learning_rate": 1.8705691533493976e-05, + "loss": 1.0214, + "step": 13110 + }, + { + "epoch": 0.1890026938646153, + "grad_norm": 0.6597281694412231, + "learning_rate": 1.8703394853756503e-05, + "loss": 1.0326, + "step": 13120 + }, + { + "epoch": 0.18914675079591456, + "grad_norm": 0.6269526481628418, + "learning_rate": 1.8701096279410042e-05, + "loss": 1.0398, + "step": 13130 + }, + { + "epoch": 0.1892908077272138, + "grad_norm": 0.6193029880523682, + "learning_rate": 1.8698795810954945e-05, + "loss": 1.0229, + "step": 13140 + }, + { + "epoch": 0.18943486465851306, + "grad_norm": 0.7493313550949097, + "learning_rate": 1.8696493448892002e-05, + "loss": 1.0617, + "step": 13150 + }, + { + "epoch": 0.1895789215898123, + "grad_norm": 0.6989824771881104, + "learning_rate": 1.8694189193722406e-05, + "loss": 1.0333, + "step": 13160 + }, + { + "epoch": 0.18972297852111156, + "grad_norm": 0.7208826541900635, + "learning_rate": 1.8691883045947756e-05, + "loss": 1.0427, + "step": 13170 + }, + { + "epoch": 0.1898670354524108, + "grad_norm": 0.5444157123565674, + "learning_rate": 1.8689575006070075e-05, + "loss": 1.036, + "step": 13180 + }, + { + "epoch": 0.19001109238371003, + "grad_norm": 0.6155458688735962, + "learning_rate": 1.8687265074591785e-05, + "loss": 1.0511, + "step": 13190 + }, + { + "epoch": 0.1901551493150093, + "grad_norm": 0.65130215883255, + "learning_rate": 1.868495325201573e-05, + "loss": 1.0507, + "step": 13200 + }, + { + "epoch": 0.19029920624630853, + "grad_norm": 0.5150203108787537, + "learning_rate": 1.8682639538845162e-05, + "loss": 1.0292, + "step": 13210 + }, + { + "epoch": 0.1904432631776078, + "grad_norm": 0.6285935044288635, + "learning_rate": 1.868032393558374e-05, + "loss": 1.052, + "step": 13220 + }, + { + "epoch": 0.19058732010890703, + "grad_norm": 0.6595324873924255, + "learning_rate": 1.8678006442735546e-05, + "loss": 1.0423, + "step": 13230 + }, + { + "epoch": 0.1907313770402063, + "grad_norm": 0.6310179233551025, + "learning_rate": 1.8675687060805062e-05, + "loss": 1.0604, + "step": 13240 + }, + { + "epoch": 0.19087543397150553, + "grad_norm": 0.5544841885566711, + "learning_rate": 1.8673365790297184e-05, + "loss": 1.0275, + "step": 13250 + }, + { + "epoch": 0.1910194909028048, + "grad_norm": 0.6470851898193359, + "learning_rate": 1.8671042631717226e-05, + "loss": 1.0454, + "step": 13260 + }, + { + "epoch": 0.19116354783410403, + "grad_norm": 0.6718747019767761, + "learning_rate": 1.86687175855709e-05, + "loss": 1.0478, + "step": 13270 + }, + { + "epoch": 0.1913076047654033, + "grad_norm": 0.8221065998077393, + "learning_rate": 1.8666390652364343e-05, + "loss": 1.0537, + "step": 13280 + }, + { + "epoch": 0.19145166169670252, + "grad_norm": 0.7532829642295837, + "learning_rate": 1.8664061832604093e-05, + "loss": 1.0496, + "step": 13290 + }, + { + "epoch": 0.1915957186280018, + "grad_norm": 0.5681278109550476, + "learning_rate": 1.8661731126797105e-05, + "loss": 1.0237, + "step": 13300 + }, + { + "epoch": 0.19173977555930102, + "grad_norm": 0.586743175983429, + "learning_rate": 1.865939853545074e-05, + "loss": 1.041, + "step": 13310 + }, + { + "epoch": 0.1918838324906003, + "grad_norm": 0.6237199902534485, + "learning_rate": 1.865706405907277e-05, + "loss": 1.0306, + "step": 13320 + }, + { + "epoch": 0.19202788942189952, + "grad_norm": 1.0356930494308472, + "learning_rate": 1.8654727698171372e-05, + "loss": 1.042, + "step": 13330 + }, + { + "epoch": 0.19217194635319879, + "grad_norm": 0.683358371257782, + "learning_rate": 1.8652389453255154e-05, + "loss": 1.0573, + "step": 13340 + }, + { + "epoch": 0.19231600328449802, + "grad_norm": 0.614015519618988, + "learning_rate": 1.8650049324833107e-05, + "loss": 1.0321, + "step": 13350 + }, + { + "epoch": 0.19246006021579729, + "grad_norm": 0.8009682893753052, + "learning_rate": 1.8647707313414652e-05, + "loss": 1.0486, + "step": 13360 + }, + { + "epoch": 0.19260411714709652, + "grad_norm": 0.832682192325592, + "learning_rate": 1.8645363419509606e-05, + "loss": 1.0345, + "step": 13370 + }, + { + "epoch": 0.19274817407839578, + "grad_norm": 0.625429630279541, + "learning_rate": 1.8643017643628214e-05, + "loss": 1.0202, + "step": 13380 + }, + { + "epoch": 0.19289223100969502, + "grad_norm": 0.6398469805717468, + "learning_rate": 1.8640669986281106e-05, + "loss": 1.0328, + "step": 13390 + }, + { + "epoch": 0.19303628794099428, + "grad_norm": 0.6227033734321594, + "learning_rate": 1.863832044797934e-05, + "loss": 1.0634, + "step": 13400 + }, + { + "epoch": 0.19318034487229352, + "grad_norm": 0.6822697520256042, + "learning_rate": 1.863596902923438e-05, + "loss": 1.0212, + "step": 13410 + }, + { + "epoch": 0.19332440180359278, + "grad_norm": 0.5786993503570557, + "learning_rate": 1.8633615730558095e-05, + "loss": 1.0208, + "step": 13420 + }, + { + "epoch": 0.19346845873489202, + "grad_norm": 0.6412922739982605, + "learning_rate": 1.863126055246277e-05, + "loss": 1.0315, + "step": 13430 + }, + { + "epoch": 0.19361251566619128, + "grad_norm": 0.660926342010498, + "learning_rate": 1.8628903495461083e-05, + "loss": 1.0576, + "step": 13440 + }, + { + "epoch": 0.19375657259749052, + "grad_norm": 0.6085519194602966, + "learning_rate": 1.8626544560066146e-05, + "loss": 1.0401, + "step": 13450 + }, + { + "epoch": 0.19390062952878978, + "grad_norm": 0.73714679479599, + "learning_rate": 1.862418374679146e-05, + "loss": 1.0363, + "step": 13460 + }, + { + "epoch": 0.19404468646008902, + "grad_norm": 0.635399341583252, + "learning_rate": 1.8621821056150945e-05, + "loss": 1.0373, + "step": 13470 + }, + { + "epoch": 0.19418874339138828, + "grad_norm": 0.5904781818389893, + "learning_rate": 1.861945648865892e-05, + "loss": 1.0189, + "step": 13480 + }, + { + "epoch": 0.19433280032268752, + "grad_norm": 0.589888334274292, + "learning_rate": 1.8617090044830128e-05, + "loss": 1.0447, + "step": 13490 + }, + { + "epoch": 0.19447685725398678, + "grad_norm": 0.6887579560279846, + "learning_rate": 1.8614721725179706e-05, + "loss": 1.0389, + "step": 13500 + }, + { + "epoch": 0.19462091418528601, + "grad_norm": 0.5638279318809509, + "learning_rate": 1.86123515302232e-05, + "loss": 1.0291, + "step": 13510 + }, + { + "epoch": 0.19476497111658528, + "grad_norm": 0.5188870429992676, + "learning_rate": 1.8609979460476576e-05, + "loss": 1.0323, + "step": 13520 + }, + { + "epoch": 0.1949090280478845, + "grad_norm": 0.6359505653381348, + "learning_rate": 1.8607605516456196e-05, + "loss": 1.0305, + "step": 13530 + }, + { + "epoch": 0.19505308497918378, + "grad_norm": 0.5990992784500122, + "learning_rate": 1.8605229698678842e-05, + "loss": 1.0319, + "step": 13540 + }, + { + "epoch": 0.195197141910483, + "grad_norm": 0.4979810118675232, + "learning_rate": 1.860285200766169e-05, + "loss": 1.0318, + "step": 13550 + }, + { + "epoch": 0.19534119884178228, + "grad_norm": 0.7600204944610596, + "learning_rate": 1.860047244392233e-05, + "loss": 1.0494, + "step": 13560 + }, + { + "epoch": 0.1954852557730815, + "grad_norm": 0.5677869319915771, + "learning_rate": 1.8598091007978762e-05, + "loss": 1.0409, + "step": 13570 + }, + { + "epoch": 0.19562931270438078, + "grad_norm": 0.7052412033081055, + "learning_rate": 1.8595707700349394e-05, + "loss": 1.0759, + "step": 13580 + }, + { + "epoch": 0.19577336963568, + "grad_norm": 0.5724908113479614, + "learning_rate": 1.8593322521553036e-05, + "loss": 1.0516, + "step": 13590 + }, + { + "epoch": 0.19591742656697927, + "grad_norm": 0.6373711824417114, + "learning_rate": 1.859093547210891e-05, + "loss": 1.0468, + "step": 13600 + }, + { + "epoch": 0.1960614834982785, + "grad_norm": 0.6027516722679138, + "learning_rate": 1.858854655253665e-05, + "loss": 1.0475, + "step": 13610 + }, + { + "epoch": 0.19620554042957777, + "grad_norm": 0.5821101069450378, + "learning_rate": 1.858615576335628e-05, + "loss": 1.0271, + "step": 13620 + }, + { + "epoch": 0.196349597360877, + "grad_norm": 0.6327353119850159, + "learning_rate": 1.8583763105088242e-05, + "loss": 1.0509, + "step": 13630 + }, + { + "epoch": 0.19649365429217627, + "grad_norm": 0.6976615786552429, + "learning_rate": 1.8581368578253395e-05, + "loss": 1.0328, + "step": 13640 + }, + { + "epoch": 0.1966377112234755, + "grad_norm": 0.6341861486434937, + "learning_rate": 1.8578972183372987e-05, + "loss": 1.054, + "step": 13650 + }, + { + "epoch": 0.19678176815477477, + "grad_norm": 0.626846194267273, + "learning_rate": 1.857657392096868e-05, + "loss": 1.0232, + "step": 13660 + }, + { + "epoch": 0.196925825086074, + "grad_norm": 0.7061647176742554, + "learning_rate": 1.8574173791562547e-05, + "loss": 1.0262, + "step": 13670 + }, + { + "epoch": 0.19706988201737327, + "grad_norm": 0.593008816242218, + "learning_rate": 1.857177179567706e-05, + "loss": 1.0289, + "step": 13680 + }, + { + "epoch": 0.1972139389486725, + "grad_norm": 0.7061359882354736, + "learning_rate": 1.85693679338351e-05, + "loss": 1.02, + "step": 13690 + }, + { + "epoch": 0.19735799587997177, + "grad_norm": 0.8743509650230408, + "learning_rate": 1.8566962206559955e-05, + "loss": 1.028, + "step": 13700 + }, + { + "epoch": 0.197502052811271, + "grad_norm": 0.5671470165252686, + "learning_rate": 1.8564554614375317e-05, + "loss": 1.0318, + "step": 13710 + }, + { + "epoch": 0.19764610974257027, + "grad_norm": 0.6084232926368713, + "learning_rate": 1.856214515780529e-05, + "loss": 1.0327, + "step": 13720 + }, + { + "epoch": 0.1977901666738695, + "grad_norm": 0.6257429718971252, + "learning_rate": 1.8559733837374378e-05, + "loss": 1.0492, + "step": 13730 + }, + { + "epoch": 0.19793422360516877, + "grad_norm": 0.5424414277076721, + "learning_rate": 1.8557320653607492e-05, + "loss": 1.0671, + "step": 13740 + }, + { + "epoch": 0.198078280536468, + "grad_norm": 0.5478965044021606, + "learning_rate": 1.8554905607029944e-05, + "loss": 1.0498, + "step": 13750 + }, + { + "epoch": 0.19822233746776727, + "grad_norm": 0.6122015714645386, + "learning_rate": 1.8552488698167462e-05, + "loss": 1.0428, + "step": 13760 + }, + { + "epoch": 0.1983663943990665, + "grad_norm": 0.580527126789093, + "learning_rate": 1.8550069927546172e-05, + "loss": 1.0351, + "step": 13770 + }, + { + "epoch": 0.19851045133036577, + "grad_norm": 0.5698811411857605, + "learning_rate": 1.8547649295692608e-05, + "loss": 1.0522, + "step": 13780 + }, + { + "epoch": 0.198654508261665, + "grad_norm": 0.6567976474761963, + "learning_rate": 1.8545226803133704e-05, + "loss": 1.018, + "step": 13790 + }, + { + "epoch": 0.19879856519296427, + "grad_norm": 0.6164844036102295, + "learning_rate": 1.854280245039681e-05, + "loss": 1.0182, + "step": 13800 + }, + { + "epoch": 0.1989426221242635, + "grad_norm": 0.7950931191444397, + "learning_rate": 1.854037623800967e-05, + "loss": 1.0535, + "step": 13810 + }, + { + "epoch": 0.19908667905556277, + "grad_norm": 0.5991912484169006, + "learning_rate": 1.853794816650044e-05, + "loss": 1.0357, + "step": 13820 + }, + { + "epoch": 0.199230735986862, + "grad_norm": 0.7581918835639954, + "learning_rate": 1.8535518236397668e-05, + "loss": 1.0462, + "step": 13830 + }, + { + "epoch": 0.19937479291816126, + "grad_norm": 0.6731258630752563, + "learning_rate": 1.853308644823033e-05, + "loss": 1.0167, + "step": 13840 + }, + { + "epoch": 0.1995188498494605, + "grad_norm": 0.6933732628822327, + "learning_rate": 1.8530652802527784e-05, + "loss": 1.0496, + "step": 13850 + }, + { + "epoch": 0.19966290678075976, + "grad_norm": 0.5585038065910339, + "learning_rate": 1.8528217299819803e-05, + "loss": 1.0173, + "step": 13860 + }, + { + "epoch": 0.199806963712059, + "grad_norm": 0.5706941485404968, + "learning_rate": 1.852577994063656e-05, + "loss": 1.0693, + "step": 13870 + }, + { + "epoch": 0.19995102064335826, + "grad_norm": 0.641236424446106, + "learning_rate": 1.852334072550864e-05, + "loss": 1.0327, + "step": 13880 + }, + { + "epoch": 0.2000950775746575, + "grad_norm": 0.5940988659858704, + "learning_rate": 1.8520899654967025e-05, + "loss": 1.0509, + "step": 13890 + }, + { + "epoch": 0.20023913450595676, + "grad_norm": 0.5936237573623657, + "learning_rate": 1.8518456729543103e-05, + "loss": 1.0459, + "step": 13900 + }, + { + "epoch": 0.200383191437256, + "grad_norm": 0.604388415813446, + "learning_rate": 1.851601194976866e-05, + "loss": 1.0371, + "step": 13910 + }, + { + "epoch": 0.20052724836855526, + "grad_norm": 0.49210256338119507, + "learning_rate": 1.851356531617589e-05, + "loss": 1.03, + "step": 13920 + }, + { + "epoch": 0.2006713052998545, + "grad_norm": 0.5752059817314148, + "learning_rate": 1.85111168292974e-05, + "loss": 1.0049, + "step": 13930 + }, + { + "epoch": 0.20081536223115376, + "grad_norm": 0.6246433854103088, + "learning_rate": 1.8508666489666187e-05, + "loss": 1.0154, + "step": 13940 + }, + { + "epoch": 0.200959419162453, + "grad_norm": 0.5601815581321716, + "learning_rate": 1.8506214297815653e-05, + "loss": 1.0429, + "step": 13950 + }, + { + "epoch": 0.20110347609375226, + "grad_norm": 0.6816391944885254, + "learning_rate": 1.8503760254279614e-05, + "loss": 1.033, + "step": 13960 + }, + { + "epoch": 0.2012475330250515, + "grad_norm": 0.6566990613937378, + "learning_rate": 1.8501304359592276e-05, + "loss": 1.0417, + "step": 13970 + }, + { + "epoch": 0.20139158995635076, + "grad_norm": 0.5980787873268127, + "learning_rate": 1.8498846614288252e-05, + "loss": 1.0308, + "step": 13980 + }, + { + "epoch": 0.20153564688765, + "grad_norm": 0.677169680595398, + "learning_rate": 1.8496387018902562e-05, + "loss": 1.0378, + "step": 13990 + }, + { + "epoch": 0.20167970381894926, + "grad_norm": 0.61399245262146, + "learning_rate": 1.8493925573970627e-05, + "loss": 1.0486, + "step": 14000 + }, + { + "epoch": 0.2018237607502485, + "grad_norm": 0.6262736320495605, + "learning_rate": 1.8491462280028273e-05, + "loss": 1.0625, + "step": 14010 + }, + { + "epoch": 0.20196781768154776, + "grad_norm": 0.5216875076293945, + "learning_rate": 1.8488997137611714e-05, + "loss": 1.0375, + "step": 14020 + }, + { + "epoch": 0.202111874612847, + "grad_norm": 0.5989294648170471, + "learning_rate": 1.8486530147257588e-05, + "loss": 1.05, + "step": 14030 + }, + { + "epoch": 0.20225593154414626, + "grad_norm": 0.6899014711380005, + "learning_rate": 1.8484061309502916e-05, + "loss": 1.0441, + "step": 14040 + }, + { + "epoch": 0.2023999884754455, + "grad_norm": 0.5948405861854553, + "learning_rate": 1.848159062488514e-05, + "loss": 1.0231, + "step": 14050 + }, + { + "epoch": 0.20254404540674475, + "grad_norm": 0.6017546653747559, + "learning_rate": 1.8479118093942083e-05, + "loss": 1.0351, + "step": 14060 + }, + { + "epoch": 0.202688102338044, + "grad_norm": 0.5726602673530579, + "learning_rate": 1.8476643717211988e-05, + "loss": 1.04, + "step": 14070 + }, + { + "epoch": 0.20283215926934325, + "grad_norm": 0.6256138682365417, + "learning_rate": 1.8474167495233493e-05, + "loss": 1.0376, + "step": 14080 + }, + { + "epoch": 0.2029762162006425, + "grad_norm": 0.5992451906204224, + "learning_rate": 1.8471689428545635e-05, + "loss": 1.0501, + "step": 14090 + }, + { + "epoch": 0.20312027313194175, + "grad_norm": 0.5503131747245789, + "learning_rate": 1.8469209517687856e-05, + "loss": 1.0287, + "step": 14100 + }, + { + "epoch": 0.203264330063241, + "grad_norm": 0.5980885028839111, + "learning_rate": 1.8466727763199997e-05, + "loss": 1.0316, + "step": 14110 + }, + { + "epoch": 0.20340838699454025, + "grad_norm": 0.5954563617706299, + "learning_rate": 1.8464244165622303e-05, + "loss": 1.0386, + "step": 14120 + }, + { + "epoch": 0.2035524439258395, + "grad_norm": 0.5852318406105042, + "learning_rate": 1.8461758725495417e-05, + "loss": 1.0321, + "step": 14130 + }, + { + "epoch": 0.20369650085713875, + "grad_norm": 0.7828177809715271, + "learning_rate": 1.845927144336039e-05, + "loss": 1.0409, + "step": 14140 + }, + { + "epoch": 0.203840557788438, + "grad_norm": 0.49479296803474426, + "learning_rate": 1.845678231975866e-05, + "loss": 1.0388, + "step": 14150 + }, + { + "epoch": 0.20398461471973725, + "grad_norm": 0.6297884583473206, + "learning_rate": 1.8454291355232085e-05, + "loss": 1.0491, + "step": 14160 + }, + { + "epoch": 0.2041286716510365, + "grad_norm": 0.5248891711235046, + "learning_rate": 1.8451798550322906e-05, + "loss": 1.0474, + "step": 14170 + }, + { + "epoch": 0.20427272858233575, + "grad_norm": 0.6096994280815125, + "learning_rate": 1.8449303905573777e-05, + "loss": 1.0565, + "step": 14180 + }, + { + "epoch": 0.20441678551363499, + "grad_norm": 0.71397864818573, + "learning_rate": 1.8446807421527745e-05, + "loss": 1.0376, + "step": 14190 + }, + { + "epoch": 0.20456084244493425, + "grad_norm": 0.623956024646759, + "learning_rate": 1.844430909872826e-05, + "loss": 1.0212, + "step": 14200 + }, + { + "epoch": 0.20470489937623348, + "grad_norm": 0.682212769985199, + "learning_rate": 1.8441808937719177e-05, + "loss": 1.0341, + "step": 14210 + }, + { + "epoch": 0.20484895630753275, + "grad_norm": 0.6393522620201111, + "learning_rate": 1.843930693904474e-05, + "loss": 1.0308, + "step": 14220 + }, + { + "epoch": 0.20499301323883198, + "grad_norm": 0.705729603767395, + "learning_rate": 1.8436803103249604e-05, + "loss": 1.0362, + "step": 14230 + }, + { + "epoch": 0.20513707017013125, + "grad_norm": 0.6077045798301697, + "learning_rate": 1.843429743087882e-05, + "loss": 1.0389, + "step": 14240 + }, + { + "epoch": 0.20528112710143048, + "grad_norm": 0.6043947339057922, + "learning_rate": 1.8431789922477835e-05, + "loss": 1.0303, + "step": 14250 + }, + { + "epoch": 0.20542518403272975, + "grad_norm": 0.6769972443580627, + "learning_rate": 1.8429280578592497e-05, + "loss": 1.0263, + "step": 14260 + }, + { + "epoch": 0.20556924096402898, + "grad_norm": 0.5661643147468567, + "learning_rate": 1.8426769399769064e-05, + "loss": 1.0458, + "step": 14270 + }, + { + "epoch": 0.20571329789532825, + "grad_norm": 0.742868185043335, + "learning_rate": 1.8424256386554177e-05, + "loss": 1.0554, + "step": 14280 + }, + { + "epoch": 0.20585735482662748, + "grad_norm": 0.6280497312545776, + "learning_rate": 1.8421741539494887e-05, + "loss": 1.0331, + "step": 14290 + }, + { + "epoch": 0.20600141175792674, + "grad_norm": 0.5431658625602722, + "learning_rate": 1.8419224859138648e-05, + "loss": 1.016, + "step": 14300 + }, + { + "epoch": 0.20614546868922598, + "grad_norm": 0.5135045051574707, + "learning_rate": 1.8416706346033294e-05, + "loss": 1.0596, + "step": 14310 + }, + { + "epoch": 0.20628952562052524, + "grad_norm": 0.589107096195221, + "learning_rate": 1.8414186000727075e-05, + "loss": 1.0417, + "step": 14320 + }, + { + "epoch": 0.20643358255182448, + "grad_norm": 0.5661746859550476, + "learning_rate": 1.8411663823768643e-05, + "loss": 1.0341, + "step": 14330 + }, + { + "epoch": 0.20657763948312374, + "grad_norm": 0.7416191101074219, + "learning_rate": 1.8409139815707036e-05, + "loss": 1.0356, + "step": 14340 + }, + { + "epoch": 0.20672169641442298, + "grad_norm": 0.6631795167922974, + "learning_rate": 1.840661397709169e-05, + "loss": 1.0494, + "step": 14350 + }, + { + "epoch": 0.20686575334572224, + "grad_norm": 0.6519991755485535, + "learning_rate": 1.8404086308472456e-05, + "loss": 1.018, + "step": 14360 + }, + { + "epoch": 0.20700981027702148, + "grad_norm": 0.6510144472122192, + "learning_rate": 1.8401556810399565e-05, + "loss": 1.0382, + "step": 14370 + }, + { + "epoch": 0.20715386720832074, + "grad_norm": 0.6138314604759216, + "learning_rate": 1.8399025483423654e-05, + "loss": 1.012, + "step": 14380 + }, + { + "epoch": 0.20729792413961998, + "grad_norm": 0.6701445579528809, + "learning_rate": 1.839649232809576e-05, + "loss": 1.043, + "step": 14390 + }, + { + "epoch": 0.20744198107091924, + "grad_norm": 0.6574751734733582, + "learning_rate": 1.8393957344967317e-05, + "loss": 1.0224, + "step": 14400 + }, + { + "epoch": 0.20758603800221848, + "grad_norm": 0.5700885057449341, + "learning_rate": 1.8391420534590153e-05, + "loss": 1.0383, + "step": 14410 + }, + { + "epoch": 0.20773009493351774, + "grad_norm": 0.590834379196167, + "learning_rate": 1.83888818975165e-05, + "loss": 1.046, + "step": 14420 + }, + { + "epoch": 0.20787415186481698, + "grad_norm": 0.5771095752716064, + "learning_rate": 1.8386341434298985e-05, + "loss": 1.0215, + "step": 14430 + }, + { + "epoch": 0.20801820879611624, + "grad_norm": 0.6535927653312683, + "learning_rate": 1.8383799145490627e-05, + "loss": 1.037, + "step": 14440 + }, + { + "epoch": 0.20816226572741547, + "grad_norm": 0.5353608131408691, + "learning_rate": 1.8381255031644853e-05, + "loss": 1.0457, + "step": 14450 + }, + { + "epoch": 0.20830632265871474, + "grad_norm": 0.5100358724594116, + "learning_rate": 1.8378709093315474e-05, + "loss": 1.0272, + "step": 14460 + }, + { + "epoch": 0.20845037959001397, + "grad_norm": 0.6513608694076538, + "learning_rate": 1.8376161331056707e-05, + "loss": 1.0404, + "step": 14470 + }, + { + "epoch": 0.20859443652131324, + "grad_norm": 0.8126131296157837, + "learning_rate": 1.8373611745423173e-05, + "loss": 1.0584, + "step": 14480 + }, + { + "epoch": 0.20873849345261247, + "grad_norm": 0.6187640428543091, + "learning_rate": 1.8371060336969874e-05, + "loss": 1.0506, + "step": 14490 + }, + { + "epoch": 0.2088825503839117, + "grad_norm": 0.6069515943527222, + "learning_rate": 1.8368507106252217e-05, + "loss": 1.0352, + "step": 14500 + }, + { + "epoch": 0.20902660731521097, + "grad_norm": 0.5389923453330994, + "learning_rate": 1.836595205382601e-05, + "loss": 1.039, + "step": 14510 + }, + { + "epoch": 0.2091706642465102, + "grad_norm": 0.6446439027786255, + "learning_rate": 1.8363395180247446e-05, + "loss": 1.0139, + "step": 14520 + }, + { + "epoch": 0.20931472117780947, + "grad_norm": 0.609357476234436, + "learning_rate": 1.8360836486073127e-05, + "loss": 1.0354, + "step": 14530 + }, + { + "epoch": 0.2094587781091087, + "grad_norm": 0.7137535810470581, + "learning_rate": 1.835827597186004e-05, + "loss": 1.0388, + "step": 14540 + }, + { + "epoch": 0.20960283504040797, + "grad_norm": 0.5982151627540588, + "learning_rate": 1.8355713638165576e-05, + "loss": 1.0352, + "step": 14550 + }, + { + "epoch": 0.2097468919717072, + "grad_norm": 0.5672206878662109, + "learning_rate": 1.8353149485547525e-05, + "loss": 1.0297, + "step": 14560 + }, + { + "epoch": 0.20989094890300647, + "grad_norm": 0.5580592751502991, + "learning_rate": 1.835058351456406e-05, + "loss": 1.0468, + "step": 14570 + }, + { + "epoch": 0.2100350058343057, + "grad_norm": 0.5752267241477966, + "learning_rate": 1.834801572577376e-05, + "loss": 1.032, + "step": 14580 + }, + { + "epoch": 0.21017906276560497, + "grad_norm": 0.5035666227340698, + "learning_rate": 1.8345446119735596e-05, + "loss": 1.048, + "step": 14590 + }, + { + "epoch": 0.2103231196969042, + "grad_norm": 0.5395452380180359, + "learning_rate": 1.8342874697008937e-05, + "loss": 1.0421, + "step": 14600 + }, + { + "epoch": 0.21046717662820347, + "grad_norm": 0.5929610729217529, + "learning_rate": 1.834030145815355e-05, + "loss": 1.031, + "step": 14610 + }, + { + "epoch": 0.2106112335595027, + "grad_norm": 0.5652207136154175, + "learning_rate": 1.8337726403729588e-05, + "loss": 1.0367, + "step": 14620 + }, + { + "epoch": 0.21075529049080197, + "grad_norm": 0.7196754217147827, + "learning_rate": 1.8335149534297606e-05, + "loss": 1.0086, + "step": 14630 + }, + { + "epoch": 0.2108993474221012, + "grad_norm": 0.6522864103317261, + "learning_rate": 1.8332570850418557e-05, + "loss": 1.0353, + "step": 14640 + }, + { + "epoch": 0.21104340435340047, + "grad_norm": 0.6994026303291321, + "learning_rate": 1.832999035265378e-05, + "loss": 1.0311, + "step": 14650 + }, + { + "epoch": 0.2111874612846997, + "grad_norm": 0.6242994070053101, + "learning_rate": 1.8327408041565013e-05, + "loss": 1.0671, + "step": 14660 + }, + { + "epoch": 0.21133151821599896, + "grad_norm": 0.7104344367980957, + "learning_rate": 1.8324823917714395e-05, + "loss": 1.049, + "step": 14670 + }, + { + "epoch": 0.2114755751472982, + "grad_norm": 0.55716472864151, + "learning_rate": 1.832223798166445e-05, + "loss": 1.0338, + "step": 14680 + }, + { + "epoch": 0.21161963207859746, + "grad_norm": 0.9544410109519958, + "learning_rate": 1.8319650233978103e-05, + "loss": 1.0276, + "step": 14690 + }, + { + "epoch": 0.2117636890098967, + "grad_norm": 0.6191002726554871, + "learning_rate": 1.831706067521867e-05, + "loss": 1.0287, + "step": 14700 + }, + { + "epoch": 0.21190774594119596, + "grad_norm": 0.5778596997261047, + "learning_rate": 1.8314469305949856e-05, + "loss": 1.0143, + "step": 14710 + }, + { + "epoch": 0.2120518028724952, + "grad_norm": 0.5484819412231445, + "learning_rate": 1.8311876126735778e-05, + "loss": 1.0174, + "step": 14720 + }, + { + "epoch": 0.21219585980379446, + "grad_norm": 0.6016744375228882, + "learning_rate": 1.8309281138140932e-05, + "loss": 1.0573, + "step": 14730 + }, + { + "epoch": 0.2123399167350937, + "grad_norm": 0.6980452537536621, + "learning_rate": 1.8306684340730202e-05, + "loss": 1.0425, + "step": 14740 + }, + { + "epoch": 0.21248397366639296, + "grad_norm": 0.5923468470573425, + "learning_rate": 1.8304085735068885e-05, + "loss": 1.0598, + "step": 14750 + }, + { + "epoch": 0.2126280305976922, + "grad_norm": 0.655491292476654, + "learning_rate": 1.830148532172266e-05, + "loss": 1.0563, + "step": 14760 + }, + { + "epoch": 0.21277208752899146, + "grad_norm": 0.6702054142951965, + "learning_rate": 1.8298883101257597e-05, + "loss": 1.0397, + "step": 14770 + }, + { + "epoch": 0.2129161444602907, + "grad_norm": 0.6654492616653442, + "learning_rate": 1.829627907424017e-05, + "loss": 1.0177, + "step": 14780 + }, + { + "epoch": 0.21306020139158996, + "grad_norm": 0.5811224579811096, + "learning_rate": 1.829367324123723e-05, + "loss": 1.0396, + "step": 14790 + }, + { + "epoch": 0.2132042583228892, + "grad_norm": 0.5703511238098145, + "learning_rate": 1.8291065602816038e-05, + "loss": 1.0348, + "step": 14800 + }, + { + "epoch": 0.21334831525418846, + "grad_norm": 0.5730449557304382, + "learning_rate": 1.8288456159544242e-05, + "loss": 1.0289, + "step": 14810 + }, + { + "epoch": 0.2134923721854877, + "grad_norm": 0.5400986075401306, + "learning_rate": 1.8285844911989882e-05, + "loss": 1.0476, + "step": 14820 + }, + { + "epoch": 0.21363642911678696, + "grad_norm": 0.5935134291648865, + "learning_rate": 1.828323186072138e-05, + "loss": 1.0435, + "step": 14830 + }, + { + "epoch": 0.2137804860480862, + "grad_norm": 0.5888200998306274, + "learning_rate": 1.8280617006307575e-05, + "loss": 1.0365, + "step": 14840 + }, + { + "epoch": 0.21392454297938546, + "grad_norm": 0.6139171123504639, + "learning_rate": 1.827800034931768e-05, + "loss": 1.0399, + "step": 14850 + }, + { + "epoch": 0.2140685999106847, + "grad_norm": 0.5612664818763733, + "learning_rate": 1.8275381890321297e-05, + "loss": 1.0416, + "step": 14860 + }, + { + "epoch": 0.21421265684198396, + "grad_norm": 0.5276797413825989, + "learning_rate": 1.8272761629888444e-05, + "loss": 1.0504, + "step": 14870 + }, + { + "epoch": 0.2143567137732832, + "grad_norm": 0.5620393753051758, + "learning_rate": 1.82701395685895e-05, + "loss": 1.0465, + "step": 14880 + }, + { + "epoch": 0.21450077070458246, + "grad_norm": 0.5414578318595886, + "learning_rate": 1.8267515706995262e-05, + "loss": 1.0447, + "step": 14890 + }, + { + "epoch": 0.2146448276358817, + "grad_norm": 0.636154055595398, + "learning_rate": 1.8264890045676903e-05, + "loss": 1.0263, + "step": 14900 + }, + { + "epoch": 0.21478888456718095, + "grad_norm": 0.5891751646995544, + "learning_rate": 1.8262262585206e-05, + "loss": 1.03, + "step": 14910 + }, + { + "epoch": 0.2149329414984802, + "grad_norm": 0.6565343737602234, + "learning_rate": 1.8259633326154506e-05, + "loss": 1.038, + "step": 14920 + }, + { + "epoch": 0.21507699842977945, + "grad_norm": 0.6619620323181152, + "learning_rate": 1.8257002269094778e-05, + "loss": 1.0412, + "step": 14930 + }, + { + "epoch": 0.2152210553610787, + "grad_norm": 0.5966458320617676, + "learning_rate": 1.8254369414599562e-05, + "loss": 1.0184, + "step": 14940 + }, + { + "epoch": 0.21536511229237795, + "grad_norm": 0.6068892478942871, + "learning_rate": 1.8251734763241998e-05, + "loss": 1.0259, + "step": 14950 + }, + { + "epoch": 0.2155091692236772, + "grad_norm": 0.6060580611228943, + "learning_rate": 1.8249098315595605e-05, + "loss": 1.0255, + "step": 14960 + }, + { + "epoch": 0.21565322615497645, + "grad_norm": 0.7341133952140808, + "learning_rate": 1.8246460072234305e-05, + "loss": 1.0336, + "step": 14970 + }, + { + "epoch": 0.2157972830862757, + "grad_norm": 0.6035784482955933, + "learning_rate": 1.8243820033732407e-05, + "loss": 1.0283, + "step": 14980 + }, + { + "epoch": 0.21594134001757495, + "grad_norm": 0.5042765736579895, + "learning_rate": 1.824117820066461e-05, + "loss": 1.042, + "step": 14990 + }, + { + "epoch": 0.2160853969488742, + "grad_norm": 0.6901978254318237, + "learning_rate": 1.8238534573606007e-05, + "loss": 1.0523, + "step": 15000 + }, + { + "epoch": 0.21622945388017345, + "grad_norm": 0.5616651773452759, + "learning_rate": 1.823588915313208e-05, + "loss": 1.0401, + "step": 15010 + }, + { + "epoch": 0.21637351081147269, + "grad_norm": 0.613223671913147, + "learning_rate": 1.8233241939818697e-05, + "loss": 1.0434, + "step": 15020 + }, + { + "epoch": 0.21651756774277195, + "grad_norm": 0.5958622097969055, + "learning_rate": 1.823059293424212e-05, + "loss": 1.0412, + "step": 15030 + }, + { + "epoch": 0.21666162467407118, + "grad_norm": 0.5168639421463013, + "learning_rate": 1.8227942136979004e-05, + "loss": 1.023, + "step": 15040 + }, + { + "epoch": 0.21680568160537045, + "grad_norm": 0.7793817520141602, + "learning_rate": 1.8225289548606387e-05, + "loss": 1.025, + "step": 15050 + }, + { + "epoch": 0.21694973853666968, + "grad_norm": 0.5913326740264893, + "learning_rate": 1.822263516970171e-05, + "loss": 1.0333, + "step": 15060 + }, + { + "epoch": 0.21709379546796895, + "grad_norm": 0.6637604236602783, + "learning_rate": 1.8219979000842782e-05, + "loss": 1.0613, + "step": 15070 + }, + { + "epoch": 0.21723785239926818, + "grad_norm": 0.5894302129745483, + "learning_rate": 1.821732104260782e-05, + "loss": 1.0533, + "step": 15080 + }, + { + "epoch": 0.21738190933056745, + "grad_norm": 0.6399484872817993, + "learning_rate": 1.8214661295575427e-05, + "loss": 1.0251, + "step": 15090 + }, + { + "epoch": 0.21752596626186668, + "grad_norm": 0.6171691417694092, + "learning_rate": 1.8211999760324592e-05, + "loss": 1.027, + "step": 15100 + }, + { + "epoch": 0.21767002319316595, + "grad_norm": 0.5771392583847046, + "learning_rate": 1.820933643743469e-05, + "loss": 1.0433, + "step": 15110 + }, + { + "epoch": 0.21781408012446518, + "grad_norm": 0.6488246917724609, + "learning_rate": 1.8206671327485496e-05, + "loss": 1.0251, + "step": 15120 + }, + { + "epoch": 0.21795813705576444, + "grad_norm": 0.6200474500656128, + "learning_rate": 1.8204004431057166e-05, + "loss": 1.0381, + "step": 15130 + }, + { + "epoch": 0.21810219398706368, + "grad_norm": 0.5430780053138733, + "learning_rate": 1.8201335748730244e-05, + "loss": 1.0392, + "step": 15140 + }, + { + "epoch": 0.21824625091836294, + "grad_norm": 0.6101915836334229, + "learning_rate": 1.8198665281085668e-05, + "loss": 1.0672, + "step": 15150 + }, + { + "epoch": 0.21839030784966218, + "grad_norm": 0.5767624378204346, + "learning_rate": 1.819599302870476e-05, + "loss": 1.0095, + "step": 15160 + }, + { + "epoch": 0.21853436478096144, + "grad_norm": 0.5709320306777954, + "learning_rate": 1.8193318992169234e-05, + "loss": 1.0373, + "step": 15170 + }, + { + "epoch": 0.21867842171226068, + "grad_norm": 0.5829881429672241, + "learning_rate": 1.819064317206119e-05, + "loss": 1.0247, + "step": 15180 + }, + { + "epoch": 0.21882247864355994, + "grad_norm": 0.5822725296020508, + "learning_rate": 1.8187965568963117e-05, + "loss": 1.0547, + "step": 15190 + }, + { + "epoch": 0.21896653557485918, + "grad_norm": 0.5555112361907959, + "learning_rate": 1.8185286183457896e-05, + "loss": 1.0442, + "step": 15200 + }, + { + "epoch": 0.21911059250615844, + "grad_norm": 0.5043044090270996, + "learning_rate": 1.8182605016128786e-05, + "loss": 1.0107, + "step": 15210 + }, + { + "epoch": 0.21925464943745768, + "grad_norm": 0.5960538387298584, + "learning_rate": 1.8179922067559445e-05, + "loss": 1.0812, + "step": 15220 + }, + { + "epoch": 0.21939870636875694, + "grad_norm": 0.5488318204879761, + "learning_rate": 1.8177237338333908e-05, + "loss": 1.041, + "step": 15230 + }, + { + "epoch": 0.21954276330005618, + "grad_norm": 0.5550768971443176, + "learning_rate": 1.8174550829036608e-05, + "loss": 1.0605, + "step": 15240 + }, + { + "epoch": 0.21968682023135544, + "grad_norm": 0.6654821634292603, + "learning_rate": 1.8171862540252363e-05, + "loss": 1.0322, + "step": 15250 + }, + { + "epoch": 0.21983087716265468, + "grad_norm": 0.67624831199646, + "learning_rate": 1.816917247256637e-05, + "loss": 1.0566, + "step": 15260 + }, + { + "epoch": 0.21997493409395394, + "grad_norm": 0.7128103375434875, + "learning_rate": 1.8166480626564232e-05, + "loss": 1.0366, + "step": 15270 + }, + { + "epoch": 0.22011899102525317, + "grad_norm": 0.5526544451713562, + "learning_rate": 1.816378700283191e-05, + "loss": 1.0476, + "step": 15280 + }, + { + "epoch": 0.22026304795655244, + "grad_norm": 0.5415549874305725, + "learning_rate": 1.8161091601955782e-05, + "loss": 1.028, + "step": 15290 + }, + { + "epoch": 0.22040710488785167, + "grad_norm": 0.5962819457054138, + "learning_rate": 1.8158394424522592e-05, + "loss": 1.0619, + "step": 15300 + }, + { + "epoch": 0.22055116181915094, + "grad_norm": 0.9033251404762268, + "learning_rate": 1.8155695471119484e-05, + "loss": 1.0442, + "step": 15310 + }, + { + "epoch": 0.22069521875045017, + "grad_norm": 0.6221054196357727, + "learning_rate": 1.8152994742333977e-05, + "loss": 1.0386, + "step": 15320 + }, + { + "epoch": 0.22083927568174944, + "grad_norm": 0.5393169522285461, + "learning_rate": 1.8150292238753983e-05, + "loss": 1.0347, + "step": 15330 + }, + { + "epoch": 0.22098333261304867, + "grad_norm": 0.5820224285125732, + "learning_rate": 1.8147587960967806e-05, + "loss": 1.0396, + "step": 15340 + }, + { + "epoch": 0.22112738954434794, + "grad_norm": 0.5936693549156189, + "learning_rate": 1.8144881909564122e-05, + "loss": 1.0574, + "step": 15350 + }, + { + "epoch": 0.22127144647564717, + "grad_norm": 0.7099258303642273, + "learning_rate": 1.8142174085132008e-05, + "loss": 1.0312, + "step": 15360 + }, + { + "epoch": 0.22141550340694643, + "grad_norm": 0.6385607719421387, + "learning_rate": 1.8139464488260917e-05, + "loss": 1.0564, + "step": 15370 + }, + { + "epoch": 0.22155956033824567, + "grad_norm": 0.5421634316444397, + "learning_rate": 1.813675311954069e-05, + "loss": 1.0252, + "step": 15380 + }, + { + "epoch": 0.22170361726954493, + "grad_norm": 0.7158612608909607, + "learning_rate": 1.8134039979561558e-05, + "loss": 1.0494, + "step": 15390 + }, + { + "epoch": 0.22184767420084417, + "grad_norm": 0.5323343873023987, + "learning_rate": 1.8131325068914125e-05, + "loss": 1.0184, + "step": 15400 + }, + { + "epoch": 0.22199173113214343, + "grad_norm": 0.6327012777328491, + "learning_rate": 1.8128608388189402e-05, + "loss": 1.0257, + "step": 15410 + }, + { + "epoch": 0.22213578806344267, + "grad_norm": 0.5406925082206726, + "learning_rate": 1.8125889937978765e-05, + "loss": 1.0336, + "step": 15420 + }, + { + "epoch": 0.22227984499474193, + "grad_norm": 0.504689633846283, + "learning_rate": 1.8123169718873987e-05, + "loss": 1.0346, + "step": 15430 + }, + { + "epoch": 0.22242390192604117, + "grad_norm": 0.5919815897941589, + "learning_rate": 1.812044773146722e-05, + "loss": 1.0203, + "step": 15440 + }, + { + "epoch": 0.22256795885734043, + "grad_norm": 0.6278406381607056, + "learning_rate": 1.8117723976351e-05, + "loss": 1.0313, + "step": 15450 + }, + { + "epoch": 0.22271201578863967, + "grad_norm": 0.6674849390983582, + "learning_rate": 1.811499845411826e-05, + "loss": 1.0413, + "step": 15460 + }, + { + "epoch": 0.22285607271993893, + "grad_norm": 0.7212050557136536, + "learning_rate": 1.81122711653623e-05, + "loss": 1.0494, + "step": 15470 + }, + { + "epoch": 0.22300012965123817, + "grad_norm": 0.5566467046737671, + "learning_rate": 1.8109542110676813e-05, + "loss": 1.0483, + "step": 15480 + }, + { + "epoch": 0.22314418658253743, + "grad_norm": 0.5412294268608093, + "learning_rate": 1.8106811290655884e-05, + "loss": 1.0288, + "step": 15490 + }, + { + "epoch": 0.22328824351383667, + "grad_norm": 0.6451317667961121, + "learning_rate": 1.810407870589397e-05, + "loss": 1.0463, + "step": 15500 + }, + { + "epoch": 0.22343230044513593, + "grad_norm": 0.5637590885162354, + "learning_rate": 1.8101344356985918e-05, + "loss": 1.0202, + "step": 15510 + }, + { + "epoch": 0.22357635737643516, + "grad_norm": 0.6563864350318909, + "learning_rate": 1.8098608244526955e-05, + "loss": 1.068, + "step": 15520 + }, + { + "epoch": 0.22372041430773443, + "grad_norm": 0.5868936777114868, + "learning_rate": 1.80958703691127e-05, + "loss": 1.0519, + "step": 15530 + }, + { + "epoch": 0.22386447123903366, + "grad_norm": 0.6614019870758057, + "learning_rate": 1.8093130731339146e-05, + "loss": 1.0186, + "step": 15540 + }, + { + "epoch": 0.22400852817033293, + "grad_norm": 0.5744838714599609, + "learning_rate": 1.809038933180268e-05, + "loss": 1.0282, + "step": 15550 + }, + { + "epoch": 0.22415258510163216, + "grad_norm": 0.6549461483955383, + "learning_rate": 1.808764617110006e-05, + "loss": 1.0591, + "step": 15560 + }, + { + "epoch": 0.22429664203293143, + "grad_norm": 0.629159152507782, + "learning_rate": 1.808490124982844e-05, + "loss": 1.0353, + "step": 15570 + }, + { + "epoch": 0.22444069896423066, + "grad_norm": 0.5964781045913696, + "learning_rate": 1.808215456858535e-05, + "loss": 1.0396, + "step": 15580 + }, + { + "epoch": 0.22458475589552993, + "grad_norm": 0.601401686668396, + "learning_rate": 1.8079406127968702e-05, + "loss": 1.0264, + "step": 15590 + }, + { + "epoch": 0.22472881282682916, + "grad_norm": 0.5822338461875916, + "learning_rate": 1.8076655928576795e-05, + "loss": 1.0278, + "step": 15600 + }, + { + "epoch": 0.22487286975812842, + "grad_norm": 0.6674925088882446, + "learning_rate": 1.807390397100831e-05, + "loss": 1.0466, + "step": 15610 + }, + { + "epoch": 0.22501692668942766, + "grad_norm": 0.5742388963699341, + "learning_rate": 1.807115025586231e-05, + "loss": 1.0326, + "step": 15620 + }, + { + "epoch": 0.22516098362072692, + "grad_norm": 0.657478928565979, + "learning_rate": 1.8068394783738242e-05, + "loss": 1.0251, + "step": 15630 + }, + { + "epoch": 0.22530504055202616, + "grad_norm": 0.6310794353485107, + "learning_rate": 1.8065637555235935e-05, + "loss": 1.0334, + "step": 15640 + }, + { + "epoch": 0.22544909748332542, + "grad_norm": 0.7108659744262695, + "learning_rate": 1.806287857095559e-05, + "loss": 1.03, + "step": 15650 + }, + { + "epoch": 0.22559315441462466, + "grad_norm": 0.6125839948654175, + "learning_rate": 1.8060117831497814e-05, + "loss": 1.0516, + "step": 15660 + }, + { + "epoch": 0.22573721134592392, + "grad_norm": 0.5679513216018677, + "learning_rate": 1.8057355337463572e-05, + "loss": 1.0308, + "step": 15670 + }, + { + "epoch": 0.22588126827722316, + "grad_norm": 0.566606342792511, + "learning_rate": 1.8054591089454228e-05, + "loss": 1.0594, + "step": 15680 + }, + { + "epoch": 0.22602532520852242, + "grad_norm": 0.5276409387588501, + "learning_rate": 1.805182508807152e-05, + "loss": 1.0178, + "step": 15690 + }, + { + "epoch": 0.22616938213982166, + "grad_norm": 0.5563755035400391, + "learning_rate": 1.8049057333917558e-05, + "loss": 1.0329, + "step": 15700 + }, + { + "epoch": 0.22631343907112092, + "grad_norm": 0.6629067063331604, + "learning_rate": 1.8046287827594854e-05, + "loss": 1.0243, + "step": 15710 + }, + { + "epoch": 0.22645749600242016, + "grad_norm": 0.5855606198310852, + "learning_rate": 1.8043516569706288e-05, + "loss": 1.0274, + "step": 15720 + }, + { + "epoch": 0.22660155293371942, + "grad_norm": 0.572472333908081, + "learning_rate": 1.8040743560855127e-05, + "loss": 1.0352, + "step": 15730 + }, + { + "epoch": 0.22674560986501865, + "grad_norm": 0.6293073892593384, + "learning_rate": 1.803796880164501e-05, + "loss": 1.0395, + "step": 15740 + }, + { + "epoch": 0.22688966679631792, + "grad_norm": 0.5595530867576599, + "learning_rate": 1.8035192292679974e-05, + "loss": 1.0303, + "step": 15750 + }, + { + "epoch": 0.22703372372761715, + "grad_norm": 0.6038975119590759, + "learning_rate": 1.803241403456442e-05, + "loss": 1.0436, + "step": 15760 + }, + { + "epoch": 0.22717778065891642, + "grad_norm": 0.6200108528137207, + "learning_rate": 1.802963402790314e-05, + "loss": 1.0428, + "step": 15770 + }, + { + "epoch": 0.22732183759021565, + "grad_norm": 0.5227588415145874, + "learning_rate": 1.80268522733013e-05, + "loss": 1.0411, + "step": 15780 + }, + { + "epoch": 0.22746589452151492, + "grad_norm": 0.6519966125488281, + "learning_rate": 1.8024068771364446e-05, + "loss": 1.0235, + "step": 15790 + }, + { + "epoch": 0.22760995145281415, + "grad_norm": 0.5424467325210571, + "learning_rate": 1.802128352269852e-05, + "loss": 1.0239, + "step": 15800 + }, + { + "epoch": 0.2277540083841134, + "grad_norm": 0.6248921751976013, + "learning_rate": 1.8018496527909818e-05, + "loss": 1.0474, + "step": 15810 + }, + { + "epoch": 0.22789806531541265, + "grad_norm": 0.5277923941612244, + "learning_rate": 1.8015707787605038e-05, + "loss": 1.0323, + "step": 15820 + }, + { + "epoch": 0.2280421222467119, + "grad_norm": 0.7009173035621643, + "learning_rate": 1.801291730239125e-05, + "loss": 1.0411, + "step": 15830 + }, + { + "epoch": 0.22818617917801115, + "grad_norm": 0.5484472513198853, + "learning_rate": 1.8010125072875907e-05, + "loss": 1.0214, + "step": 15840 + }, + { + "epoch": 0.22833023610931039, + "grad_norm": 0.5944812893867493, + "learning_rate": 1.800733109966683e-05, + "loss": 1.0365, + "step": 15850 + }, + { + "epoch": 0.22847429304060965, + "grad_norm": 0.7652137875556946, + "learning_rate": 1.800453538337224e-05, + "loss": 1.0434, + "step": 15860 + }, + { + "epoch": 0.22861834997190889, + "grad_norm": 0.5834174752235413, + "learning_rate": 1.8001737924600716e-05, + "loss": 1.0277, + "step": 15870 + }, + { + "epoch": 0.22876240690320815, + "grad_norm": 0.5894598960876465, + "learning_rate": 1.799893872396123e-05, + "loss": 1.0364, + "step": 15880 + }, + { + "epoch": 0.22890646383450738, + "grad_norm": 0.4981635808944702, + "learning_rate": 1.799613778206313e-05, + "loss": 1.0536, + "step": 15890 + }, + { + "epoch": 0.22905052076580665, + "grad_norm": 0.5469146370887756, + "learning_rate": 1.799333509951614e-05, + "loss": 1.0337, + "step": 15900 + }, + { + "epoch": 0.22919457769710588, + "grad_norm": 0.6103417277336121, + "learning_rate": 1.799053067693037e-05, + "loss": 1.0538, + "step": 15910 + }, + { + "epoch": 0.22933863462840515, + "grad_norm": 0.5183495879173279, + "learning_rate": 1.79877245149163e-05, + "loss": 1.0317, + "step": 15920 + }, + { + "epoch": 0.22948269155970438, + "grad_norm": 0.6987003087997437, + "learning_rate": 1.798491661408479e-05, + "loss": 1.0589, + "step": 15930 + }, + { + "epoch": 0.22962674849100365, + "grad_norm": 0.6228061318397522, + "learning_rate": 1.7982106975047088e-05, + "loss": 1.0446, + "step": 15940 + }, + { + "epoch": 0.22977080542230288, + "grad_norm": 0.5950088500976562, + "learning_rate": 1.7979295598414814e-05, + "loss": 1.0346, + "step": 15950 + }, + { + "epoch": 0.22991486235360215, + "grad_norm": 0.5522655844688416, + "learning_rate": 1.797648248479996e-05, + "loss": 1.0191, + "step": 15960 + }, + { + "epoch": 0.23005891928490138, + "grad_norm": 0.6294156312942505, + "learning_rate": 1.79736676348149e-05, + "loss": 1.041, + "step": 15970 + }, + { + "epoch": 0.23020297621620064, + "grad_norm": 0.9068011045455933, + "learning_rate": 1.7970851049072397e-05, + "loss": 1.0469, + "step": 15980 + }, + { + "epoch": 0.23034703314749988, + "grad_norm": 0.49483510851860046, + "learning_rate": 1.7968032728185577e-05, + "loss": 1.0358, + "step": 15990 + }, + { + "epoch": 0.23049109007879914, + "grad_norm": 0.5636140704154968, + "learning_rate": 1.7965212672767955e-05, + "loss": 1.0401, + "step": 16000 + }, + { + "epoch": 0.23063514701009838, + "grad_norm": 0.5015645027160645, + "learning_rate": 1.796239088343341e-05, + "loss": 1.0322, + "step": 16010 + }, + { + "epoch": 0.23077920394139764, + "grad_norm": 0.8636760711669922, + "learning_rate": 1.7959567360796215e-05, + "loss": 1.0413, + "step": 16020 + }, + { + "epoch": 0.23092326087269688, + "grad_norm": 0.6280728578567505, + "learning_rate": 1.7956742105471006e-05, + "loss": 1.0342, + "step": 16030 + }, + { + "epoch": 0.23106731780399614, + "grad_norm": 0.5852565169334412, + "learning_rate": 1.7953915118072803e-05, + "loss": 1.0593, + "step": 16040 + }, + { + "epoch": 0.23121137473529538, + "grad_norm": 0.5820422768592834, + "learning_rate": 1.7951086399217002e-05, + "loss": 1.054, + "step": 16050 + }, + { + "epoch": 0.23135543166659464, + "grad_norm": 0.5244117379188538, + "learning_rate": 1.794825594951938e-05, + "loss": 1.0408, + "step": 16060 + }, + { + "epoch": 0.23149948859789388, + "grad_norm": 0.6109441518783569, + "learning_rate": 1.7945423769596083e-05, + "loss": 1.046, + "step": 16070 + }, + { + "epoch": 0.23164354552919314, + "grad_norm": 0.7189362049102783, + "learning_rate": 1.794258986006364e-05, + "loss": 1.0278, + "step": 16080 + }, + { + "epoch": 0.23178760246049238, + "grad_norm": 0.6301892399787903, + "learning_rate": 1.793975422153895e-05, + "loss": 1.0443, + "step": 16090 + }, + { + "epoch": 0.23193165939179164, + "grad_norm": 0.615687906742096, + "learning_rate": 1.7936916854639298e-05, + "loss": 1.0381, + "step": 16100 + }, + { + "epoch": 0.23207571632309087, + "grad_norm": 0.5806190371513367, + "learning_rate": 1.793407775998233e-05, + "loss": 1.0284, + "step": 16110 + }, + { + "epoch": 0.23221977325439014, + "grad_norm": 0.5128661394119263, + "learning_rate": 1.7931236938186093e-05, + "loss": 1.0288, + "step": 16120 + }, + { + "epoch": 0.23236383018568937, + "grad_norm": 0.5688104033470154, + "learning_rate": 1.792839438986898e-05, + "loss": 1.0123, + "step": 16130 + }, + { + "epoch": 0.23250788711698864, + "grad_norm": 0.5432714819908142, + "learning_rate": 1.7925550115649782e-05, + "loss": 1.0502, + "step": 16140 + }, + { + "epoch": 0.23265194404828787, + "grad_norm": 0.5542018413543701, + "learning_rate": 1.792270411614766e-05, + "loss": 1.0323, + "step": 16150 + }, + { + "epoch": 0.23279600097958714, + "grad_norm": 0.6536816358566284, + "learning_rate": 1.791985639198214e-05, + "loss": 1.0353, + "step": 16160 + }, + { + "epoch": 0.23294005791088637, + "grad_norm": 0.6699745059013367, + "learning_rate": 1.791700694377314e-05, + "loss": 1.0264, + "step": 16170 + }, + { + "epoch": 0.23308411484218564, + "grad_norm": 0.582292914390564, + "learning_rate": 1.7914155772140946e-05, + "loss": 1.0236, + "step": 16180 + }, + { + "epoch": 0.23322817177348487, + "grad_norm": 0.5736205577850342, + "learning_rate": 1.7911302877706217e-05, + "loss": 1.0388, + "step": 16190 + }, + { + "epoch": 0.23337222870478413, + "grad_norm": 0.5483204126358032, + "learning_rate": 1.7908448261089985e-05, + "loss": 1.0476, + "step": 16200 + }, + { + "epoch": 0.23351628563608337, + "grad_norm": 0.5308883786201477, + "learning_rate": 1.7905591922913664e-05, + "loss": 1.047, + "step": 16210 + }, + { + "epoch": 0.23366034256738263, + "grad_norm": 0.7244343757629395, + "learning_rate": 1.7902733863799037e-05, + "loss": 1.0416, + "step": 16220 + }, + { + "epoch": 0.23380439949868187, + "grad_norm": 0.5258388519287109, + "learning_rate": 1.7899874084368273e-05, + "loss": 1.0434, + "step": 16230 + }, + { + "epoch": 0.23394845642998113, + "grad_norm": 0.6775047779083252, + "learning_rate": 1.789701258524389e-05, + "loss": 1.0229, + "step": 16240 + }, + { + "epoch": 0.23409251336128037, + "grad_norm": 0.5259122252464294, + "learning_rate": 1.7894149367048815e-05, + "loss": 1.0225, + "step": 16250 + }, + { + "epoch": 0.23423657029257963, + "grad_norm": 0.6943677663803101, + "learning_rate": 1.7891284430406316e-05, + "loss": 1.0332, + "step": 16260 + }, + { + "epoch": 0.23438062722387887, + "grad_norm": 0.6831935048103333, + "learning_rate": 1.7888417775940062e-05, + "loss": 1.0553, + "step": 16270 + }, + { + "epoch": 0.23452468415517813, + "grad_norm": 0.5620949864387512, + "learning_rate": 1.7885549404274075e-05, + "loss": 1.0259, + "step": 16280 + }, + { + "epoch": 0.23466874108647737, + "grad_norm": 0.5302318334579468, + "learning_rate": 1.7882679316032766e-05, + "loss": 1.0284, + "step": 16290 + }, + { + "epoch": 0.23481279801777663, + "grad_norm": 0.6939184069633484, + "learning_rate": 1.7879807511840916e-05, + "loss": 1.065, + "step": 16300 + }, + { + "epoch": 0.23495685494907587, + "grad_norm": 0.5826215744018555, + "learning_rate": 1.7876933992323667e-05, + "loss": 1.0345, + "step": 16310 + }, + { + "epoch": 0.23510091188037513, + "grad_norm": 0.5437131524085999, + "learning_rate": 1.787405875810655e-05, + "loss": 1.0167, + "step": 16320 + }, + { + "epoch": 0.23524496881167437, + "grad_norm": 0.6369717121124268, + "learning_rate": 1.787118180981547e-05, + "loss": 1.0243, + "step": 16330 + }, + { + "epoch": 0.23538902574297363, + "grad_norm": 0.5772448182106018, + "learning_rate": 1.786830314807669e-05, + "loss": 1.0265, + "step": 16340 + }, + { + "epoch": 0.23553308267427286, + "grad_norm": 0.6192107796669006, + "learning_rate": 1.786542277351685e-05, + "loss": 1.0324, + "step": 16350 + }, + { + "epoch": 0.23567713960557213, + "grad_norm": 0.5261062383651733, + "learning_rate": 1.7862540686762987e-05, + "loss": 1.0226, + "step": 16360 + }, + { + "epoch": 0.23582119653687136, + "grad_norm": 0.6214985251426697, + "learning_rate": 1.785965688844247e-05, + "loss": 1.0436, + "step": 16370 + }, + { + "epoch": 0.23596525346817063, + "grad_norm": 0.5913611650466919, + "learning_rate": 1.7856771379183083e-05, + "loss": 1.026, + "step": 16380 + }, + { + "epoch": 0.23610931039946986, + "grad_norm": 0.6042903661727905, + "learning_rate": 1.7853884159612945e-05, + "loss": 1.0267, + "step": 16390 + }, + { + "epoch": 0.23625336733076913, + "grad_norm": 0.5531982779502869, + "learning_rate": 1.785099523036057e-05, + "loss": 1.0376, + "step": 16400 + }, + { + "epoch": 0.23639742426206836, + "grad_norm": 0.6330571174621582, + "learning_rate": 1.7848104592054838e-05, + "loss": 1.0433, + "step": 16410 + }, + { + "epoch": 0.23654148119336763, + "grad_norm": 0.6041118502616882, + "learning_rate": 1.7845212245325e-05, + "loss": 1.0391, + "step": 16420 + }, + { + "epoch": 0.23668553812466686, + "grad_norm": 0.6083371639251709, + "learning_rate": 1.7842318190800686e-05, + "loss": 1.0476, + "step": 16430 + }, + { + "epoch": 0.23682959505596612, + "grad_norm": 0.5628354549407959, + "learning_rate": 1.7839422429111883e-05, + "loss": 1.0445, + "step": 16440 + }, + { + "epoch": 0.23697365198726536, + "grad_norm": 0.5915509462356567, + "learning_rate": 1.783652496088896e-05, + "loss": 1.0371, + "step": 16450 + }, + { + "epoch": 0.23711770891856462, + "grad_norm": 0.6667011976242065, + "learning_rate": 1.783362578676266e-05, + "loss": 1.0386, + "step": 16460 + }, + { + "epoch": 0.23726176584986386, + "grad_norm": 0.6184618473052979, + "learning_rate": 1.7830724907364093e-05, + "loss": 1.0266, + "step": 16470 + }, + { + "epoch": 0.23740582278116312, + "grad_norm": 0.5585595965385437, + "learning_rate": 1.7827822323324737e-05, + "loss": 1.0399, + "step": 16480 + }, + { + "epoch": 0.23754987971246236, + "grad_norm": 0.6253140568733215, + "learning_rate": 1.7824918035276448e-05, + "loss": 1.0329, + "step": 16490 + }, + { + "epoch": 0.23769393664376162, + "grad_norm": 0.5357626676559448, + "learning_rate": 1.7822012043851444e-05, + "loss": 1.0336, + "step": 16500 + }, + { + "epoch": 0.23783799357506086, + "grad_norm": 0.5444202423095703, + "learning_rate": 1.7819104349682327e-05, + "loss": 1.0453, + "step": 16510 + }, + { + "epoch": 0.23798205050636012, + "grad_norm": 0.6564856767654419, + "learning_rate": 1.781619495340205e-05, + "loss": 1.0308, + "step": 16520 + }, + { + "epoch": 0.23812610743765936, + "grad_norm": 0.615155041217804, + "learning_rate": 1.7813283855643963e-05, + "loss": 1.0289, + "step": 16530 + }, + { + "epoch": 0.23827016436895862, + "grad_norm": 0.5427000522613525, + "learning_rate": 1.7810371057041766e-05, + "loss": 1.0366, + "step": 16540 + }, + { + "epoch": 0.23841422130025786, + "grad_norm": 0.6768283843994141, + "learning_rate": 1.780745655822953e-05, + "loss": 1.0366, + "step": 16550 + }, + { + "epoch": 0.23855827823155712, + "grad_norm": 0.6584059596061707, + "learning_rate": 1.7804540359841705e-05, + "loss": 1.0277, + "step": 16560 + }, + { + "epoch": 0.23870233516285635, + "grad_norm": 0.5058512091636658, + "learning_rate": 1.7801622462513107e-05, + "loss": 1.0355, + "step": 16570 + }, + { + "epoch": 0.23884639209415562, + "grad_norm": 0.5551502704620361, + "learning_rate": 1.7798702866878924e-05, + "loss": 1.039, + "step": 16580 + }, + { + "epoch": 0.23899044902545485, + "grad_norm": 0.6150371432304382, + "learning_rate": 1.779578157357471e-05, + "loss": 1.0492, + "step": 16590 + }, + { + "epoch": 0.23913450595675412, + "grad_norm": 0.658364474773407, + "learning_rate": 1.7792858583236393e-05, + "loss": 1.048, + "step": 16600 + }, + { + "epoch": 0.23927856288805335, + "grad_norm": 0.5722893476486206, + "learning_rate": 1.7789933896500262e-05, + "loss": 1.0213, + "step": 16610 + }, + { + "epoch": 0.23942261981935262, + "grad_norm": 0.6082572937011719, + "learning_rate": 1.7787007514002984e-05, + "loss": 1.0405, + "step": 16620 + }, + { + "epoch": 0.23956667675065185, + "grad_norm": 0.5616563558578491, + "learning_rate": 1.7784079436381593e-05, + "loss": 1.0263, + "step": 16630 + }, + { + "epoch": 0.23971073368195112, + "grad_norm": 0.5879483222961426, + "learning_rate": 1.778114966427349e-05, + "loss": 1.0273, + "step": 16640 + }, + { + "epoch": 0.23985479061325035, + "grad_norm": 0.5526086091995239, + "learning_rate": 1.7778218198316445e-05, + "loss": 1.0605, + "step": 16650 + }, + { + "epoch": 0.23999884754454961, + "grad_norm": 0.5538169741630554, + "learning_rate": 1.77752850391486e-05, + "loss": 1.0334, + "step": 16660 + }, + { + "epoch": 0.24014290447584885, + "grad_norm": 0.6564728617668152, + "learning_rate": 1.777235018740846e-05, + "loss": 1.0355, + "step": 16670 + }, + { + "epoch": 0.24028696140714811, + "grad_norm": 0.7152305245399475, + "learning_rate": 1.7769413643734905e-05, + "loss": 1.0464, + "step": 16680 + }, + { + "epoch": 0.24043101833844735, + "grad_norm": 0.6362254619598389, + "learning_rate": 1.776647540876718e-05, + "loss": 1.0392, + "step": 16690 + }, + { + "epoch": 0.2405750752697466, + "grad_norm": 0.6082077026367188, + "learning_rate": 1.7763535483144895e-05, + "loss": 1.0574, + "step": 16700 + }, + { + "epoch": 0.24071913220104585, + "grad_norm": 0.7910304069519043, + "learning_rate": 1.7760593867508036e-05, + "loss": 1.0353, + "step": 16710 + }, + { + "epoch": 0.2408631891323451, + "grad_norm": 0.7080479860305786, + "learning_rate": 1.775765056249695e-05, + "loss": 1.0257, + "step": 16720 + }, + { + "epoch": 0.24100724606364435, + "grad_norm": 0.7581346035003662, + "learning_rate": 1.775470556875235e-05, + "loss": 1.0268, + "step": 16730 + }, + { + "epoch": 0.2411513029949436, + "grad_norm": 0.6228922605514526, + "learning_rate": 1.7751758886915324e-05, + "loss": 1.0651, + "step": 16740 + }, + { + "epoch": 0.24129535992624285, + "grad_norm": 0.6909641027450562, + "learning_rate": 1.7748810517627325e-05, + "loss": 1.0552, + "step": 16750 + }, + { + "epoch": 0.2414394168575421, + "grad_norm": 0.5827440023422241, + "learning_rate": 1.7745860461530173e-05, + "loss": 1.0317, + "step": 16760 + }, + { + "epoch": 0.24158347378884135, + "grad_norm": 0.6382594108581543, + "learning_rate": 1.774290871926605e-05, + "loss": 1.018, + "step": 16770 + }, + { + "epoch": 0.2417275307201406, + "grad_norm": 0.6372701525688171, + "learning_rate": 1.773995529147751e-05, + "loss": 1.0662, + "step": 16780 + }, + { + "epoch": 0.24187158765143985, + "grad_norm": 0.6797804832458496, + "learning_rate": 1.7737000178807483e-05, + "loss": 1.0404, + "step": 16790 + }, + { + "epoch": 0.2420156445827391, + "grad_norm": 0.5957540273666382, + "learning_rate": 1.7734043381899245e-05, + "loss": 1.0415, + "step": 16800 + }, + { + "epoch": 0.24215970151403834, + "grad_norm": 0.6584307551383972, + "learning_rate": 1.7731084901396457e-05, + "loss": 1.0577, + "step": 16810 + }, + { + "epoch": 0.2423037584453376, + "grad_norm": 0.6933635473251343, + "learning_rate": 1.7728124737943133e-05, + "loss": 1.0501, + "step": 16820 + }, + { + "epoch": 0.24244781537663684, + "grad_norm": 0.5589372515678406, + "learning_rate": 1.7725162892183663e-05, + "loss": 1.0187, + "step": 16830 + }, + { + "epoch": 0.2425918723079361, + "grad_norm": 0.607906699180603, + "learning_rate": 1.77221993647628e-05, + "loss": 1.0586, + "step": 16840 + }, + { + "epoch": 0.24273592923923534, + "grad_norm": 0.6130890250205994, + "learning_rate": 1.7719234156325664e-05, + "loss": 1.0521, + "step": 16850 + }, + { + "epoch": 0.2428799861705346, + "grad_norm": 0.6340318918228149, + "learning_rate": 1.771626726751774e-05, + "loss": 1.0362, + "step": 16860 + }, + { + "epoch": 0.24302404310183384, + "grad_norm": 0.6948375105857849, + "learning_rate": 1.7713298698984876e-05, + "loss": 1.0274, + "step": 16870 + }, + { + "epoch": 0.2431681000331331, + "grad_norm": 0.6357523798942566, + "learning_rate": 1.7710328451373292e-05, + "loss": 1.0578, + "step": 16880 + }, + { + "epoch": 0.24331215696443234, + "grad_norm": 0.6093547940254211, + "learning_rate": 1.7707356525329565e-05, + "loss": 1.0507, + "step": 16890 + }, + { + "epoch": 0.2434562138957316, + "grad_norm": 0.5905465483665466, + "learning_rate": 1.770438292150065e-05, + "loss": 1.044, + "step": 16900 + }, + { + "epoch": 0.24360027082703084, + "grad_norm": 0.47149771451950073, + "learning_rate": 1.7701407640533852e-05, + "loss": 1.0196, + "step": 16910 + }, + { + "epoch": 0.2437443277583301, + "grad_norm": 0.5732711553573608, + "learning_rate": 1.7698430683076854e-05, + "loss": 1.0316, + "step": 16920 + }, + { + "epoch": 0.24388838468962934, + "grad_norm": 0.5275248289108276, + "learning_rate": 1.769545204977769e-05, + "loss": 1.0371, + "step": 16930 + }, + { + "epoch": 0.2440324416209286, + "grad_norm": 0.559514045715332, + "learning_rate": 1.769247174128478e-05, + "loss": 1.018, + "step": 16940 + }, + { + "epoch": 0.24417649855222784, + "grad_norm": 0.6358952522277832, + "learning_rate": 1.7689489758246887e-05, + "loss": 1.0141, + "step": 16950 + }, + { + "epoch": 0.2443205554835271, + "grad_norm": 0.6493760943412781, + "learning_rate": 1.768650610131315e-05, + "loss": 1.0541, + "step": 16960 + }, + { + "epoch": 0.24446461241482634, + "grad_norm": 0.5447071194648743, + "learning_rate": 1.768352077113307e-05, + "loss": 1.0204, + "step": 16970 + }, + { + "epoch": 0.2446086693461256, + "grad_norm": 0.6011931896209717, + "learning_rate": 1.7680533768356517e-05, + "loss": 1.0661, + "step": 16980 + }, + { + "epoch": 0.24475272627742484, + "grad_norm": 0.558472752571106, + "learning_rate": 1.7677545093633713e-05, + "loss": 1.0344, + "step": 16990 + }, + { + "epoch": 0.2448967832087241, + "grad_norm": 0.601445734500885, + "learning_rate": 1.767455474761525e-05, + "loss": 1.0485, + "step": 17000 + }, + { + "epoch": 0.24504084014002334, + "grad_norm": 0.5469062328338623, + "learning_rate": 1.767156273095209e-05, + "loss": 1.0157, + "step": 17010 + }, + { + "epoch": 0.2451848970713226, + "grad_norm": 0.6900678873062134, + "learning_rate": 1.7668569044295558e-05, + "loss": 1.0375, + "step": 17020 + }, + { + "epoch": 0.24532895400262184, + "grad_norm": 0.5829229950904846, + "learning_rate": 1.7665573688297328e-05, + "loss": 1.018, + "step": 17030 + }, + { + "epoch": 0.2454730109339211, + "grad_norm": 0.5610002279281616, + "learning_rate": 1.7662576663609457e-05, + "loss": 1.022, + "step": 17040 + }, + { + "epoch": 0.24561706786522033, + "grad_norm": 0.564368724822998, + "learning_rate": 1.7659577970884346e-05, + "loss": 1.0282, + "step": 17050 + }, + { + "epoch": 0.2457611247965196, + "grad_norm": 0.7293027639389038, + "learning_rate": 1.765657761077478e-05, + "loss": 1.0301, + "step": 17060 + }, + { + "epoch": 0.24590518172781883, + "grad_norm": 0.7138099074363708, + "learning_rate": 1.765357558393389e-05, + "loss": 1.0271, + "step": 17070 + }, + { + "epoch": 0.2460492386591181, + "grad_norm": 0.5354404449462891, + "learning_rate": 1.7650571891015174e-05, + "loss": 1.0383, + "step": 17080 + }, + { + "epoch": 0.24619329559041733, + "grad_norm": 0.6015536785125732, + "learning_rate": 1.7647566532672495e-05, + "loss": 1.0432, + "step": 17090 + }, + { + "epoch": 0.2463373525217166, + "grad_norm": 0.6709904670715332, + "learning_rate": 1.7644559509560084e-05, + "loss": 1.0375, + "step": 17100 + }, + { + "epoch": 0.24648140945301583, + "grad_norm": 0.7079071998596191, + "learning_rate": 1.7641550822332525e-05, + "loss": 1.0468, + "step": 17110 + }, + { + "epoch": 0.2466254663843151, + "grad_norm": 0.6117041707038879, + "learning_rate": 1.7638540471644764e-05, + "loss": 1.0446, + "step": 17120 + }, + { + "epoch": 0.24676952331561433, + "grad_norm": 0.610614001750946, + "learning_rate": 1.763552845815212e-05, + "loss": 1.0107, + "step": 17130 + }, + { + "epoch": 0.24691358024691357, + "grad_norm": 0.612248957157135, + "learning_rate": 1.763251478251026e-05, + "loss": 1.0223, + "step": 17140 + }, + { + "epoch": 0.24705763717821283, + "grad_norm": 0.5730803608894348, + "learning_rate": 1.7629499445375225e-05, + "loss": 1.0188, + "step": 17150 + }, + { + "epoch": 0.24720169410951207, + "grad_norm": 0.6203054189682007, + "learning_rate": 1.7626482447403405e-05, + "loss": 1.0188, + "step": 17160 + }, + { + "epoch": 0.24734575104081133, + "grad_norm": 0.5931071043014526, + "learning_rate": 1.762346378925157e-05, + "loss": 1.0436, + "step": 17170 + }, + { + "epoch": 0.24748980797211056, + "grad_norm": 0.535603404045105, + "learning_rate": 1.7620443471576827e-05, + "loss": 1.0411, + "step": 17180 + }, + { + "epoch": 0.24763386490340983, + "grad_norm": 0.6503838896751404, + "learning_rate": 1.761742149503667e-05, + "loss": 1.0407, + "step": 17190 + }, + { + "epoch": 0.24777792183470906, + "grad_norm": 0.762205183506012, + "learning_rate": 1.761439786028893e-05, + "loss": 1.0111, + "step": 17200 + }, + { + "epoch": 0.24792197876600833, + "grad_norm": 0.7181898355484009, + "learning_rate": 1.7611372567991823e-05, + "loss": 1.0465, + "step": 17210 + }, + { + "epoch": 0.24806603569730756, + "grad_norm": 0.4939252436161041, + "learning_rate": 1.7608345618803903e-05, + "loss": 1.0378, + "step": 17220 + }, + { + "epoch": 0.24821009262860683, + "grad_norm": 0.4960174560546875, + "learning_rate": 1.76053170133841e-05, + "loss": 1.0362, + "step": 17230 + }, + { + "epoch": 0.24835414955990606, + "grad_norm": 0.6288329362869263, + "learning_rate": 1.7602286752391697e-05, + "loss": 1.0444, + "step": 17240 + }, + { + "epoch": 0.24849820649120533, + "grad_norm": 0.5479284524917603, + "learning_rate": 1.759925483648634e-05, + "loss": 1.0437, + "step": 17250 + }, + { + "epoch": 0.24864226342250456, + "grad_norm": 0.5709022283554077, + "learning_rate": 1.7596221266328043e-05, + "loss": 1.0353, + "step": 17260 + }, + { + "epoch": 0.24878632035380382, + "grad_norm": 0.6153503060340881, + "learning_rate": 1.7593186042577163e-05, + "loss": 1.0319, + "step": 17270 + }, + { + "epoch": 0.24893037728510306, + "grad_norm": 0.6913089752197266, + "learning_rate": 1.759014916589443e-05, + "loss": 1.0377, + "step": 17280 + }, + { + "epoch": 0.24907443421640232, + "grad_norm": 0.6153222322463989, + "learning_rate": 1.758711063694093e-05, + "loss": 1.0466, + "step": 17290 + }, + { + "epoch": 0.24921849114770156, + "grad_norm": 0.51612389087677, + "learning_rate": 1.7584070456378107e-05, + "loss": 1.0568, + "step": 17300 + }, + { + "epoch": 0.24936254807900082, + "grad_norm": 0.5424662828445435, + "learning_rate": 1.758102862486777e-05, + "loss": 1.0328, + "step": 17310 + }, + { + "epoch": 0.24950660501030006, + "grad_norm": 0.5843436121940613, + "learning_rate": 1.7577985143072077e-05, + "loss": 1.0266, + "step": 17320 + }, + { + "epoch": 0.24965066194159932, + "grad_norm": 0.5523232221603394, + "learning_rate": 1.7574940011653563e-05, + "loss": 1.0263, + "step": 17330 + }, + { + "epoch": 0.24979471887289856, + "grad_norm": 0.6400233507156372, + "learning_rate": 1.75718932312751e-05, + "loss": 1.0569, + "step": 17340 + }, + { + "epoch": 0.24993877580419782, + "grad_norm": 0.5132960677146912, + "learning_rate": 1.756884480259994e-05, + "loss": 1.0341, + "step": 17350 + }, + { + "epoch": 0.2500828327354971, + "grad_norm": 0.763143002986908, + "learning_rate": 1.7565794726291674e-05, + "loss": 1.0489, + "step": 17360 + }, + { + "epoch": 0.2502268896667963, + "grad_norm": 0.5453976392745972, + "learning_rate": 1.756274300301427e-05, + "loss": 1.0297, + "step": 17370 + }, + { + "epoch": 0.25037094659809556, + "grad_norm": 0.5535269379615784, + "learning_rate": 1.7559689633432038e-05, + "loss": 1.0351, + "step": 17380 + }, + { + "epoch": 0.2505150035293948, + "grad_norm": 0.6392086744308472, + "learning_rate": 1.7556634618209663e-05, + "loss": 1.0582, + "step": 17390 + }, + { + "epoch": 0.2506590604606941, + "grad_norm": 0.675186276435852, + "learning_rate": 1.7553577958012174e-05, + "loss": 1.0405, + "step": 17400 + }, + { + "epoch": 0.2508031173919933, + "grad_norm": 0.6513963937759399, + "learning_rate": 1.7550519653504966e-05, + "loss": 1.0504, + "step": 17410 + }, + { + "epoch": 0.25094717432329255, + "grad_norm": 0.537735104560852, + "learning_rate": 1.7547459705353787e-05, + "loss": 1.0369, + "step": 17420 + }, + { + "epoch": 0.2510912312545918, + "grad_norm": 0.6523123383522034, + "learning_rate": 1.7544398114224746e-05, + "loss": 1.0448, + "step": 17430 + }, + { + "epoch": 0.2512352881858911, + "grad_norm": 0.5637397766113281, + "learning_rate": 1.7541334880784314e-05, + "loss": 1.0542, + "step": 17440 + }, + { + "epoch": 0.2513793451171903, + "grad_norm": 0.5768483281135559, + "learning_rate": 1.753827000569931e-05, + "loss": 1.0274, + "step": 17450 + }, + { + "epoch": 0.25152340204848955, + "grad_norm": 0.717302680015564, + "learning_rate": 1.7535203489636915e-05, + "loss": 1.0417, + "step": 17460 + }, + { + "epoch": 0.2516674589797888, + "grad_norm": 0.6151435375213623, + "learning_rate": 1.753213533326467e-05, + "loss": 1.0323, + "step": 17470 + }, + { + "epoch": 0.2518115159110881, + "grad_norm": 0.4801254868507385, + "learning_rate": 1.752906553725047e-05, + "loss": 1.0328, + "step": 17480 + }, + { + "epoch": 0.2519555728423873, + "grad_norm": 0.5034201145172119, + "learning_rate": 1.7525994102262564e-05, + "loss": 1.0201, + "step": 17490 + }, + { + "epoch": 0.25209962977368655, + "grad_norm": 0.6454114317893982, + "learning_rate": 1.7522921028969562e-05, + "loss": 1.056, + "step": 17500 + }, + { + "epoch": 0.2522436867049858, + "grad_norm": 0.6408239006996155, + "learning_rate": 1.7519846318040435e-05, + "loss": 1.019, + "step": 17510 + }, + { + "epoch": 0.2523877436362851, + "grad_norm": 0.5316129326820374, + "learning_rate": 1.7516769970144497e-05, + "loss": 1.0436, + "step": 17520 + }, + { + "epoch": 0.2525318005675843, + "grad_norm": 0.575065016746521, + "learning_rate": 1.751369198595143e-05, + "loss": 1.0362, + "step": 17530 + }, + { + "epoch": 0.25267585749888355, + "grad_norm": 0.7316829562187195, + "learning_rate": 1.7510612366131273e-05, + "loss": 1.0051, + "step": 17540 + }, + { + "epoch": 0.2528199144301828, + "grad_norm": 0.7314066886901855, + "learning_rate": 1.750753111135441e-05, + "loss": 1.0318, + "step": 17550 + }, + { + "epoch": 0.2529639713614821, + "grad_norm": 0.6269857287406921, + "learning_rate": 1.750444822229159e-05, + "loss": 1.0308, + "step": 17560 + }, + { + "epoch": 0.2531080282927813, + "grad_norm": 0.7021530866622925, + "learning_rate": 1.7501363699613917e-05, + "loss": 1.0622, + "step": 17570 + }, + { + "epoch": 0.25325208522408055, + "grad_norm": 0.5551626086235046, + "learning_rate": 1.7498277543992848e-05, + "loss": 1.0363, + "step": 17580 + }, + { + "epoch": 0.2533961421553798, + "grad_norm": 0.5241049528121948, + "learning_rate": 1.7495189756100198e-05, + "loss": 1.0498, + "step": 17590 + }, + { + "epoch": 0.2535401990866791, + "grad_norm": 0.6174381971359253, + "learning_rate": 1.7492100336608133e-05, + "loss": 1.0316, + "step": 17600 + }, + { + "epoch": 0.2536842560179783, + "grad_norm": 0.49905723333358765, + "learning_rate": 1.748900928618918e-05, + "loss": 1.0334, + "step": 17610 + }, + { + "epoch": 0.25382831294927755, + "grad_norm": 0.5309996008872986, + "learning_rate": 1.7485916605516212e-05, + "loss": 1.0198, + "step": 17620 + }, + { + "epoch": 0.2539723698805768, + "grad_norm": 0.574708878993988, + "learning_rate": 1.748282229526247e-05, + "loss": 1.0464, + "step": 17630 + }, + { + "epoch": 0.2541164268118761, + "grad_norm": 0.7914347648620605, + "learning_rate": 1.7479726356101537e-05, + "loss": 1.0485, + "step": 17640 + }, + { + "epoch": 0.2542604837431753, + "grad_norm": 0.6053186655044556, + "learning_rate": 1.7476628788707364e-05, + "loss": 1.0203, + "step": 17650 + }, + { + "epoch": 0.25440454067447454, + "grad_norm": 0.6396844387054443, + "learning_rate": 1.747352959375424e-05, + "loss": 1.0334, + "step": 17660 + }, + { + "epoch": 0.2545485976057738, + "grad_norm": 0.5476585030555725, + "learning_rate": 1.7470428771916827e-05, + "loss": 1.0289, + "step": 17670 + }, + { + "epoch": 0.25469265453707307, + "grad_norm": 0.5465442538261414, + "learning_rate": 1.7467326323870117e-05, + "loss": 1.02, + "step": 17680 + }, + { + "epoch": 0.2548367114683723, + "grad_norm": 0.6411909461021423, + "learning_rate": 1.7464222250289482e-05, + "loss": 1.0334, + "step": 17690 + }, + { + "epoch": 0.25498076839967154, + "grad_norm": 0.5101737380027771, + "learning_rate": 1.7461116551850634e-05, + "loss": 1.0424, + "step": 17700 + }, + { + "epoch": 0.2551248253309708, + "grad_norm": 0.5736578106880188, + "learning_rate": 1.7458009229229636e-05, + "loss": 1.0282, + "step": 17710 + }, + { + "epoch": 0.25526888226227007, + "grad_norm": 0.6061928868293762, + "learning_rate": 1.745490028310291e-05, + "loss": 1.059, + "step": 17720 + }, + { + "epoch": 0.2554129391935693, + "grad_norm": 0.5113356113433838, + "learning_rate": 1.7451789714147235e-05, + "loss": 1.0301, + "step": 17730 + }, + { + "epoch": 0.25555699612486854, + "grad_norm": 0.6834507584571838, + "learning_rate": 1.7448677523039736e-05, + "loss": 1.054, + "step": 17740 + }, + { + "epoch": 0.2557010530561678, + "grad_norm": 0.5508944392204285, + "learning_rate": 1.7445563710457897e-05, + "loss": 1.0499, + "step": 17750 + }, + { + "epoch": 0.25584510998746707, + "grad_norm": 0.5912610292434692, + "learning_rate": 1.744244827707955e-05, + "loss": 1.0546, + "step": 17760 + }, + { + "epoch": 0.2559891669187663, + "grad_norm": 0.633089542388916, + "learning_rate": 1.743933122358288e-05, + "loss": 1.0337, + "step": 17770 + }, + { + "epoch": 0.25613322385006554, + "grad_norm": 0.8170694708824158, + "learning_rate": 1.743621255064643e-05, + "loss": 1.0327, + "step": 17780 + }, + { + "epoch": 0.2562772807813648, + "grad_norm": 0.5366178750991821, + "learning_rate": 1.7433092258949087e-05, + "loss": 1.0146, + "step": 17790 + }, + { + "epoch": 0.25642133771266407, + "grad_norm": 0.5773735642433167, + "learning_rate": 1.7429970349170104e-05, + "loss": 1.036, + "step": 17800 + }, + { + "epoch": 0.2565653946439633, + "grad_norm": 0.6646609902381897, + "learning_rate": 1.7426846821989066e-05, + "loss": 1.0362, + "step": 17810 + }, + { + "epoch": 0.25670945157526254, + "grad_norm": 0.5626872181892395, + "learning_rate": 1.7423721678085932e-05, + "loss": 1.0139, + "step": 17820 + }, + { + "epoch": 0.2568535085065618, + "grad_norm": 0.6005491614341736, + "learning_rate": 1.7420594918140998e-05, + "loss": 1.0261, + "step": 17830 + }, + { + "epoch": 0.25699756543786106, + "grad_norm": 0.61122727394104, + "learning_rate": 1.741746654283492e-05, + "loss": 1.0213, + "step": 17840 + }, + { + "epoch": 0.25714162236916027, + "grad_norm": 0.5532557964324951, + "learning_rate": 1.7414336552848696e-05, + "loss": 1.0215, + "step": 17850 + }, + { + "epoch": 0.25728567930045954, + "grad_norm": 0.5664517879486084, + "learning_rate": 1.7411204948863686e-05, + "loss": 1.019, + "step": 17860 + }, + { + "epoch": 0.2574297362317588, + "grad_norm": 0.6716731190681458, + "learning_rate": 1.74080717315616e-05, + "loss": 1.0369, + "step": 17870 + }, + { + "epoch": 0.25757379316305806, + "grad_norm": 0.5846714377403259, + "learning_rate": 1.7404936901624487e-05, + "loss": 1.0284, + "step": 17880 + }, + { + "epoch": 0.25771785009435727, + "grad_norm": 0.5675822496414185, + "learning_rate": 1.7401800459734762e-05, + "loss": 1.0316, + "step": 17890 + }, + { + "epoch": 0.25786190702565653, + "grad_norm": 0.5803831815719604, + "learning_rate": 1.739866240657519e-05, + "loss": 1.0162, + "step": 17900 + }, + { + "epoch": 0.2580059639569558, + "grad_norm": 0.5902814865112305, + "learning_rate": 1.7395522742828874e-05, + "loss": 1.0371, + "step": 17910 + }, + { + "epoch": 0.25815002088825506, + "grad_norm": 0.5528362989425659, + "learning_rate": 1.739238146917928e-05, + "loss": 1.0391, + "step": 17920 + }, + { + "epoch": 0.25829407781955427, + "grad_norm": 0.7447479963302612, + "learning_rate": 1.7389238586310213e-05, + "loss": 1.0342, + "step": 17930 + }, + { + "epoch": 0.25843813475085353, + "grad_norm": 0.5475665330886841, + "learning_rate": 1.7386094094905847e-05, + "loss": 1.0363, + "step": 17940 + }, + { + "epoch": 0.2585821916821528, + "grad_norm": 0.4762347638607025, + "learning_rate": 1.738294799565068e-05, + "loss": 1.0518, + "step": 17950 + }, + { + "epoch": 0.25872624861345206, + "grad_norm": 0.5582030415534973, + "learning_rate": 1.7379800289229587e-05, + "loss": 1.0203, + "step": 17960 + }, + { + "epoch": 0.25887030554475127, + "grad_norm": 0.6422776579856873, + "learning_rate": 1.7376650976327772e-05, + "loss": 1.0498, + "step": 17970 + }, + { + "epoch": 0.25901436247605053, + "grad_norm": 0.575137197971344, + "learning_rate": 1.7373500057630806e-05, + "loss": 1.023, + "step": 17980 + }, + { + "epoch": 0.2591584194073498, + "grad_norm": 0.5468251705169678, + "learning_rate": 1.7370347533824592e-05, + "loss": 1.0318, + "step": 17990 + }, + { + "epoch": 0.25930247633864906, + "grad_norm": 0.569378137588501, + "learning_rate": 1.7367193405595394e-05, + "loss": 1.0156, + "step": 18000 + }, + { + "epoch": 0.25944653326994827, + "grad_norm": 0.7267173528671265, + "learning_rate": 1.7364037673629822e-05, + "loss": 1.0593, + "step": 18010 + }, + { + "epoch": 0.25959059020124753, + "grad_norm": 0.6949495673179626, + "learning_rate": 1.7360880338614835e-05, + "loss": 1.0443, + "step": 18020 + }, + { + "epoch": 0.2597346471325468, + "grad_norm": 0.5617415904998779, + "learning_rate": 1.7357721401237744e-05, + "loss": 1.0242, + "step": 18030 + }, + { + "epoch": 0.25987870406384606, + "grad_norm": 0.5180186033248901, + "learning_rate": 1.73545608621862e-05, + "loss": 1.0444, + "step": 18040 + }, + { + "epoch": 0.26002276099514526, + "grad_norm": 0.5655116438865662, + "learning_rate": 1.7351398722148214e-05, + "loss": 1.0339, + "step": 18050 + }, + { + "epoch": 0.2601668179264445, + "grad_norm": 0.5050772428512573, + "learning_rate": 1.7348234981812143e-05, + "loss": 1.0264, + "step": 18060 + }, + { + "epoch": 0.2603108748577438, + "grad_norm": 0.6562170386314392, + "learning_rate": 1.734506964186668e-05, + "loss": 1.0326, + "step": 18070 + }, + { + "epoch": 0.26045493178904305, + "grad_norm": 0.610063374042511, + "learning_rate": 1.7341902703000883e-05, + "loss": 1.0114, + "step": 18080 + }, + { + "epoch": 0.26059898872034226, + "grad_norm": 0.6089850664138794, + "learning_rate": 1.733873416590415e-05, + "loss": 1.061, + "step": 18090 + }, + { + "epoch": 0.2607430456516415, + "grad_norm": 0.6482865214347839, + "learning_rate": 1.7335564031266224e-05, + "loss": 1.0238, + "step": 18100 + }, + { + "epoch": 0.2608871025829408, + "grad_norm": 0.5729652047157288, + "learning_rate": 1.7332392299777206e-05, + "loss": 1.0317, + "step": 18110 + }, + { + "epoch": 0.26103115951424005, + "grad_norm": 0.5370912551879883, + "learning_rate": 1.7329218972127532e-05, + "loss": 1.0246, + "step": 18120 + }, + { + "epoch": 0.26117521644553926, + "grad_norm": 0.5290127992630005, + "learning_rate": 1.7326044049007995e-05, + "loss": 1.0284, + "step": 18130 + }, + { + "epoch": 0.2613192733768385, + "grad_norm": 0.653229296207428, + "learning_rate": 1.7322867531109732e-05, + "loss": 1.034, + "step": 18140 + }, + { + "epoch": 0.2614633303081378, + "grad_norm": 0.5976172685623169, + "learning_rate": 1.7319689419124228e-05, + "loss": 1.0448, + "step": 18150 + }, + { + "epoch": 0.26160738723943705, + "grad_norm": 0.5717659592628479, + "learning_rate": 1.731650971374331e-05, + "loss": 1.0596, + "step": 18160 + }, + { + "epoch": 0.26175144417073626, + "grad_norm": 0.49149689078330994, + "learning_rate": 1.731332841565916e-05, + "loss": 1.0159, + "step": 18170 + }, + { + "epoch": 0.2618955011020355, + "grad_norm": 0.6619843244552612, + "learning_rate": 1.73101455255643e-05, + "loss": 1.0159, + "step": 18180 + }, + { + "epoch": 0.2620395580333348, + "grad_norm": 0.6329603791236877, + "learning_rate": 1.7306961044151608e-05, + "loss": 1.0431, + "step": 18190 + }, + { + "epoch": 0.26218361496463405, + "grad_norm": 0.5910669565200806, + "learning_rate": 1.730377497211429e-05, + "loss": 1.0214, + "step": 18200 + }, + { + "epoch": 0.26232767189593326, + "grad_norm": 0.5871611833572388, + "learning_rate": 1.7300587310145917e-05, + "loss": 1.0356, + "step": 18210 + }, + { + "epoch": 0.2624717288272325, + "grad_norm": 0.5438082218170166, + "learning_rate": 1.72973980589404e-05, + "loss": 1.0273, + "step": 18220 + }, + { + "epoch": 0.2626157857585318, + "grad_norm": 0.6762313842773438, + "learning_rate": 1.7294207219191995e-05, + "loss": 1.0485, + "step": 18230 + }, + { + "epoch": 0.26275984268983105, + "grad_norm": 0.48789089918136597, + "learning_rate": 1.72910147915953e-05, + "loss": 1.0238, + "step": 18240 + }, + { + "epoch": 0.26290389962113025, + "grad_norm": 0.5649069547653198, + "learning_rate": 1.728782077684526e-05, + "loss": 1.0525, + "step": 18250 + }, + { + "epoch": 0.2630479565524295, + "grad_norm": 0.5777900815010071, + "learning_rate": 1.7284625175637176e-05, + "loss": 1.0468, + "step": 18260 + }, + { + "epoch": 0.2631920134837288, + "grad_norm": 0.5941057801246643, + "learning_rate": 1.7281427988666686e-05, + "loss": 1.0515, + "step": 18270 + }, + { + "epoch": 0.26333607041502805, + "grad_norm": 0.525132954120636, + "learning_rate": 1.7278229216629767e-05, + "loss": 1.0359, + "step": 18280 + }, + { + "epoch": 0.26348012734632725, + "grad_norm": 0.6041716933250427, + "learning_rate": 1.727502886022275e-05, + "loss": 1.0122, + "step": 18290 + }, + { + "epoch": 0.2636241842776265, + "grad_norm": 0.8156771063804626, + "learning_rate": 1.7271826920142313e-05, + "loss": 1.0439, + "step": 18300 + }, + { + "epoch": 0.2637682412089258, + "grad_norm": 0.5602260231971741, + "learning_rate": 1.7268623397085467e-05, + "loss": 1.0141, + "step": 18310 + }, + { + "epoch": 0.26391229814022504, + "grad_norm": 0.6049859523773193, + "learning_rate": 1.7265418291749577e-05, + "loss": 1.0439, + "step": 18320 + }, + { + "epoch": 0.26405635507152425, + "grad_norm": 0.5453393459320068, + "learning_rate": 1.7262211604832355e-05, + "loss": 1.039, + "step": 18330 + }, + { + "epoch": 0.2642004120028235, + "grad_norm": 0.5879477262496948, + "learning_rate": 1.725900333703185e-05, + "loss": 1.0509, + "step": 18340 + }, + { + "epoch": 0.2643444689341228, + "grad_norm": 0.5193531513214111, + "learning_rate": 1.7255793489046452e-05, + "loss": 1.0346, + "step": 18350 + }, + { + "epoch": 0.26448852586542204, + "grad_norm": 0.5647846460342407, + "learning_rate": 1.7252582061574906e-05, + "loss": 1.0114, + "step": 18360 + }, + { + "epoch": 0.26463258279672125, + "grad_norm": 0.6013086438179016, + "learning_rate": 1.72493690553163e-05, + "loss": 1.0118, + "step": 18370 + }, + { + "epoch": 0.2647766397280205, + "grad_norm": 0.5349829792976379, + "learning_rate": 1.724615447097005e-05, + "loss": 1.0332, + "step": 18380 + }, + { + "epoch": 0.2649206966593198, + "grad_norm": 0.5422055125236511, + "learning_rate": 1.724293830923593e-05, + "loss": 1.0543, + "step": 18390 + }, + { + "epoch": 0.26506475359061904, + "grad_norm": 0.4812210500240326, + "learning_rate": 1.723972057081406e-05, + "loss": 1.0321, + "step": 18400 + }, + { + "epoch": 0.26520881052191825, + "grad_norm": 0.5967666506767273, + "learning_rate": 1.72365012564049e-05, + "loss": 1.0282, + "step": 18410 + }, + { + "epoch": 0.2653528674532175, + "grad_norm": 0.652559757232666, + "learning_rate": 1.723328036670923e-05, + "loss": 1.0318, + "step": 18420 + }, + { + "epoch": 0.2654969243845168, + "grad_norm": 0.5375730395317078, + "learning_rate": 1.723005790242822e-05, + "loss": 1.0408, + "step": 18430 + }, + { + "epoch": 0.26564098131581604, + "grad_norm": 0.5464618802070618, + "learning_rate": 1.7226833864263338e-05, + "loss": 1.0216, + "step": 18440 + }, + { + "epoch": 0.26578503824711525, + "grad_norm": 0.5174851417541504, + "learning_rate": 1.722360825291642e-05, + "loss": 1.0357, + "step": 18450 + }, + { + "epoch": 0.2659290951784145, + "grad_norm": 0.5950974225997925, + "learning_rate": 1.7220381069089635e-05, + "loss": 1.0263, + "step": 18460 + }, + { + "epoch": 0.2660731521097138, + "grad_norm": 0.6485792994499207, + "learning_rate": 1.7217152313485495e-05, + "loss": 1.0332, + "step": 18470 + }, + { + "epoch": 0.266217209041013, + "grad_norm": 0.6197564005851746, + "learning_rate": 1.7213921986806857e-05, + "loss": 1.0375, + "step": 18480 + }, + { + "epoch": 0.26636126597231224, + "grad_norm": 0.48383066058158875, + "learning_rate": 1.7210690089756917e-05, + "loss": 1.0362, + "step": 18490 + }, + { + "epoch": 0.2665053229036115, + "grad_norm": 0.641230046749115, + "learning_rate": 1.720745662303922e-05, + "loss": 1.0518, + "step": 18500 + }, + { + "epoch": 0.26664937983491077, + "grad_norm": 0.5075659155845642, + "learning_rate": 1.7204221587357638e-05, + "loss": 1.0454, + "step": 18510 + }, + { + "epoch": 0.26679343676621, + "grad_norm": 0.5523113012313843, + "learning_rate": 1.7200984983416402e-05, + "loss": 1.027, + "step": 18520 + }, + { + "epoch": 0.26693749369750924, + "grad_norm": 0.6356770992279053, + "learning_rate": 1.7197746811920073e-05, + "loss": 1.0468, + "step": 18530 + }, + { + "epoch": 0.2670815506288085, + "grad_norm": 0.5757063627243042, + "learning_rate": 1.7194507073573555e-05, + "loss": 1.0389, + "step": 18540 + }, + { + "epoch": 0.26722560756010777, + "grad_norm": 0.568065345287323, + "learning_rate": 1.7191265769082098e-05, + "loss": 1.0348, + "step": 18550 + }, + { + "epoch": 0.267369664491407, + "grad_norm": 0.5348308682441711, + "learning_rate": 1.7188022899151283e-05, + "loss": 1.0255, + "step": 18560 + }, + { + "epoch": 0.26751372142270624, + "grad_norm": 0.5122363567352295, + "learning_rate": 1.7184778464487046e-05, + "loss": 1.0179, + "step": 18570 + }, + { + "epoch": 0.2676577783540055, + "grad_norm": 0.6201374530792236, + "learning_rate": 1.7181532465795644e-05, + "loss": 1.0208, + "step": 18580 + }, + { + "epoch": 0.26780183528530477, + "grad_norm": 0.5712049603462219, + "learning_rate": 1.7178284903783698e-05, + "loss": 1.0393, + "step": 18590 + }, + { + "epoch": 0.267945892216604, + "grad_norm": 0.7647679448127747, + "learning_rate": 1.7175035779158157e-05, + "loss": 1.0241, + "step": 18600 + }, + { + "epoch": 0.26808994914790324, + "grad_norm": 0.7282312512397766, + "learning_rate": 1.71717850926263e-05, + "loss": 1.055, + "step": 18610 + }, + { + "epoch": 0.2682340060792025, + "grad_norm": 0.677687406539917, + "learning_rate": 1.7168532844895765e-05, + "loss": 1.0393, + "step": 18620 + }, + { + "epoch": 0.26837806301050177, + "grad_norm": 0.5776512622833252, + "learning_rate": 1.7165279036674523e-05, + "loss": 1.0328, + "step": 18630 + }, + { + "epoch": 0.268522119941801, + "grad_norm": 0.6276368498802185, + "learning_rate": 1.7162023668670874e-05, + "loss": 1.0437, + "step": 18640 + }, + { + "epoch": 0.26866617687310024, + "grad_norm": 0.547092616558075, + "learning_rate": 1.715876674159348e-05, + "loss": 1.0537, + "step": 18650 + }, + { + "epoch": 0.2688102338043995, + "grad_norm": 0.5741291642189026, + "learning_rate": 1.7155508256151315e-05, + "loss": 1.0172, + "step": 18660 + }, + { + "epoch": 0.26895429073569876, + "grad_norm": 0.5554395318031311, + "learning_rate": 1.7152248213053716e-05, + "loss": 1.0191, + "step": 18670 + }, + { + "epoch": 0.269098347666998, + "grad_norm": 0.5335158705711365, + "learning_rate": 1.7148986613010344e-05, + "loss": 1.0162, + "step": 18680 + }, + { + "epoch": 0.26924240459829724, + "grad_norm": 0.5677670240402222, + "learning_rate": 1.7145723456731208e-05, + "loss": 1.0108, + "step": 18690 + }, + { + "epoch": 0.2693864615295965, + "grad_norm": 0.7178367376327515, + "learning_rate": 1.714245874492665e-05, + "loss": 1.0671, + "step": 18700 + }, + { + "epoch": 0.26953051846089576, + "grad_norm": 0.5835092663764954, + "learning_rate": 1.7139192478307354e-05, + "loss": 1.0472, + "step": 18710 + }, + { + "epoch": 0.26967457539219497, + "grad_norm": 0.47994673252105713, + "learning_rate": 1.713592465758434e-05, + "loss": 1.0238, + "step": 18720 + }, + { + "epoch": 0.26981863232349423, + "grad_norm": 0.5740872621536255, + "learning_rate": 1.713265528346897e-05, + "loss": 1.0633, + "step": 18730 + }, + { + "epoch": 0.2699626892547935, + "grad_norm": 0.601265549659729, + "learning_rate": 1.712938435667294e-05, + "loss": 1.0476, + "step": 18740 + }, + { + "epoch": 0.27010674618609276, + "grad_norm": 0.6448707580566406, + "learning_rate": 1.7126111877908283e-05, + "loss": 1.0264, + "step": 18750 + }, + { + "epoch": 0.27025080311739197, + "grad_norm": 0.6133461594581604, + "learning_rate": 1.712283784788738e-05, + "loss": 1.0142, + "step": 18760 + }, + { + "epoch": 0.27039486004869123, + "grad_norm": 0.5708298087120056, + "learning_rate": 1.711956226732293e-05, + "loss": 1.0376, + "step": 18770 + }, + { + "epoch": 0.2705389169799905, + "grad_norm": 0.5778443813323975, + "learning_rate": 1.7116285136927992e-05, + "loss": 1.0372, + "step": 18780 + }, + { + "epoch": 0.27068297391128976, + "grad_norm": 0.495801717042923, + "learning_rate": 1.7113006457415952e-05, + "loss": 1.0356, + "step": 18790 + }, + { + "epoch": 0.27082703084258897, + "grad_norm": 0.5203620791435242, + "learning_rate": 1.710972622950053e-05, + "loss": 1.0115, + "step": 18800 + }, + { + "epoch": 0.27097108777388823, + "grad_norm": 0.5737788081169128, + "learning_rate": 1.710644445389578e-05, + "loss": 1.0738, + "step": 18810 + }, + { + "epoch": 0.2711151447051875, + "grad_norm": 0.5559195280075073, + "learning_rate": 1.710316113131611e-05, + "loss": 1.0377, + "step": 18820 + }, + { + "epoch": 0.27125920163648676, + "grad_norm": 0.4836956858634949, + "learning_rate": 1.709987626247625e-05, + "loss": 1.0487, + "step": 18830 + }, + { + "epoch": 0.27140325856778597, + "grad_norm": 0.5687073469161987, + "learning_rate": 1.709658984809127e-05, + "loss": 1.034, + "step": 18840 + }, + { + "epoch": 0.27154731549908523, + "grad_norm": 0.5611147880554199, + "learning_rate": 1.709330188887658e-05, + "loss": 1.026, + "step": 18850 + }, + { + "epoch": 0.2716913724303845, + "grad_norm": 0.6767673492431641, + "learning_rate": 1.7090012385547917e-05, + "loss": 1.0136, + "step": 18860 + }, + { + "epoch": 0.27183542936168376, + "grad_norm": 0.6080965399742126, + "learning_rate": 1.7086721338821366e-05, + "loss": 1.018, + "step": 18870 + }, + { + "epoch": 0.27197948629298296, + "grad_norm": 0.6415413618087769, + "learning_rate": 1.708342874941334e-05, + "loss": 1.0508, + "step": 18880 + }, + { + "epoch": 0.2721235432242822, + "grad_norm": 0.6209141612052917, + "learning_rate": 1.7080134618040594e-05, + "loss": 1.0544, + "step": 18890 + }, + { + "epoch": 0.2722676001555815, + "grad_norm": 0.7484108209609985, + "learning_rate": 1.7076838945420208e-05, + "loss": 1.0484, + "step": 18900 + }, + { + "epoch": 0.27241165708688075, + "grad_norm": 0.6004678010940552, + "learning_rate": 1.7073541732269613e-05, + "loss": 1.0143, + "step": 18910 + }, + { + "epoch": 0.27255571401817996, + "grad_norm": 0.5813729763031006, + "learning_rate": 1.707024297930656e-05, + "loss": 1.0467, + "step": 18920 + }, + { + "epoch": 0.2726997709494792, + "grad_norm": 0.5733342170715332, + "learning_rate": 1.7066942687249146e-05, + "loss": 1.0419, + "step": 18930 + }, + { + "epoch": 0.2728438278807785, + "grad_norm": 0.5739596486091614, + "learning_rate": 1.7063640856815795e-05, + "loss": 1.0229, + "step": 18940 + }, + { + "epoch": 0.27298788481207775, + "grad_norm": 0.5423083901405334, + "learning_rate": 1.7060337488725276e-05, + "loss": 1.0268, + "step": 18950 + }, + { + "epoch": 0.27313194174337696, + "grad_norm": 0.6381399035453796, + "learning_rate": 1.705703258369668e-05, + "loss": 1.0172, + "step": 18960 + }, + { + "epoch": 0.2732759986746762, + "grad_norm": 0.654782772064209, + "learning_rate": 1.7053726142449444e-05, + "loss": 1.0382, + "step": 18970 + }, + { + "epoch": 0.2734200556059755, + "grad_norm": 0.5004866719245911, + "learning_rate": 1.7050418165703332e-05, + "loss": 1.0208, + "step": 18980 + }, + { + "epoch": 0.27356411253727475, + "grad_norm": 0.6272166967391968, + "learning_rate": 1.7047108654178446e-05, + "loss": 1.0338, + "step": 18990 + }, + { + "epoch": 0.27370816946857396, + "grad_norm": 0.5967186093330383, + "learning_rate": 1.7043797608595222e-05, + "loss": 1.0332, + "step": 19000 + }, + { + "epoch": 0.2738522263998732, + "grad_norm": 0.5652865767478943, + "learning_rate": 1.7040485029674427e-05, + "loss": 1.0387, + "step": 19010 + }, + { + "epoch": 0.2739962833311725, + "grad_norm": 0.49113377928733826, + "learning_rate": 1.7037170918137163e-05, + "loss": 1.0373, + "step": 19020 + }, + { + "epoch": 0.27414034026247175, + "grad_norm": 0.6672168374061584, + "learning_rate": 1.703385527470487e-05, + "loss": 1.0214, + "step": 19030 + }, + { + "epoch": 0.27428439719377096, + "grad_norm": 0.5516164302825928, + "learning_rate": 1.7030538100099318e-05, + "loss": 1.0452, + "step": 19040 + }, + { + "epoch": 0.2744284541250702, + "grad_norm": 0.5690533518791199, + "learning_rate": 1.7027219395042604e-05, + "loss": 1.0477, + "step": 19050 + }, + { + "epoch": 0.2745725110563695, + "grad_norm": 0.6833709478378296, + "learning_rate": 1.702389916025717e-05, + "loss": 1.0593, + "step": 19060 + }, + { + "epoch": 0.27471656798766875, + "grad_norm": 0.5207808017730713, + "learning_rate": 1.7020577396465783e-05, + "loss": 1.0411, + "step": 19070 + }, + { + "epoch": 0.27486062491896796, + "grad_norm": 0.5609144568443298, + "learning_rate": 1.7017254104391544e-05, + "loss": 1.0375, + "step": 19080 + }, + { + "epoch": 0.2750046818502672, + "grad_norm": 0.5942318439483643, + "learning_rate": 1.7013929284757894e-05, + "loss": 1.0185, + "step": 19090 + }, + { + "epoch": 0.2751487387815665, + "grad_norm": 0.5625672340393066, + "learning_rate": 1.7010602938288592e-05, + "loss": 1.0308, + "step": 19100 + }, + { + "epoch": 0.27529279571286575, + "grad_norm": 0.503961443901062, + "learning_rate": 1.7007275065707745e-05, + "loss": 1.0325, + "step": 19110 + }, + { + "epoch": 0.27543685264416495, + "grad_norm": 0.7923728227615356, + "learning_rate": 1.7003945667739777e-05, + "loss": 1.0231, + "step": 19120 + }, + { + "epoch": 0.2755809095754642, + "grad_norm": 0.6111789345741272, + "learning_rate": 1.7000614745109462e-05, + "loss": 1.0258, + "step": 19130 + }, + { + "epoch": 0.2757249665067635, + "grad_norm": 0.562232494354248, + "learning_rate": 1.6997282298541887e-05, + "loss": 1.0291, + "step": 19140 + }, + { + "epoch": 0.27586902343806274, + "grad_norm": 0.7612364888191223, + "learning_rate": 1.6993948328762484e-05, + "loss": 1.0471, + "step": 19150 + }, + { + "epoch": 0.27601308036936195, + "grad_norm": 0.5305697917938232, + "learning_rate": 1.6990612836497012e-05, + "loss": 1.0221, + "step": 19160 + }, + { + "epoch": 0.2761571373006612, + "grad_norm": 0.5058618187904358, + "learning_rate": 1.6987275822471557e-05, + "loss": 1.0512, + "step": 19170 + }, + { + "epoch": 0.2763011942319605, + "grad_norm": 0.5611950755119324, + "learning_rate": 1.698393728741255e-05, + "loss": 1.0357, + "step": 19180 + }, + { + "epoch": 0.27644525116325974, + "grad_norm": 0.5557824373245239, + "learning_rate": 1.698059723204674e-05, + "loss": 1.056, + "step": 19190 + }, + { + "epoch": 0.27658930809455895, + "grad_norm": 0.6539870500564575, + "learning_rate": 1.6977255657101202e-05, + "loss": 1.0623, + "step": 19200 + }, + { + "epoch": 0.2767333650258582, + "grad_norm": 0.6483203172683716, + "learning_rate": 1.6973912563303364e-05, + "loss": 1.051, + "step": 19210 + }, + { + "epoch": 0.2768774219571575, + "grad_norm": 0.6784367561340332, + "learning_rate": 1.697056795138097e-05, + "loss": 1.0378, + "step": 19220 + }, + { + "epoch": 0.27702147888845674, + "grad_norm": 0.5995579957962036, + "learning_rate": 1.6967221822062084e-05, + "loss": 1.0218, + "step": 19230 + }, + { + "epoch": 0.27716553581975595, + "grad_norm": 0.6947809457778931, + "learning_rate": 1.6963874176075125e-05, + "loss": 1.035, + "step": 19240 + }, + { + "epoch": 0.2773095927510552, + "grad_norm": 0.5625107288360596, + "learning_rate": 1.6960525014148825e-05, + "loss": 1.0215, + "step": 19250 + }, + { + "epoch": 0.2774536496823545, + "grad_norm": 0.6118045449256897, + "learning_rate": 1.695717433701225e-05, + "loss": 1.0293, + "step": 19260 + }, + { + "epoch": 0.27759770661365374, + "grad_norm": 0.5792883634567261, + "learning_rate": 1.6953822145394795e-05, + "loss": 1.0274, + "step": 19270 + }, + { + "epoch": 0.27774176354495295, + "grad_norm": 0.7543878555297852, + "learning_rate": 1.6950468440026188e-05, + "loss": 1.0373, + "step": 19280 + }, + { + "epoch": 0.2778858204762522, + "grad_norm": 0.5332752466201782, + "learning_rate": 1.694711322163648e-05, + "loss": 1.0083, + "step": 19290 + }, + { + "epoch": 0.2780298774075515, + "grad_norm": 0.5911184549331665, + "learning_rate": 1.6943756490956063e-05, + "loss": 1.0533, + "step": 19300 + }, + { + "epoch": 0.27817393433885074, + "grad_norm": 0.49870455265045166, + "learning_rate": 1.6940398248715642e-05, + "loss": 1.0624, + "step": 19310 + }, + { + "epoch": 0.27831799127014994, + "grad_norm": 0.5611423254013062, + "learning_rate": 1.693703849564627e-05, + "loss": 1.0509, + "step": 19320 + }, + { + "epoch": 0.2784620482014492, + "grad_norm": 0.5123780965805054, + "learning_rate": 1.6933677232479313e-05, + "loss": 1.017, + "step": 19330 + }, + { + "epoch": 0.27860610513274847, + "grad_norm": 0.5750264525413513, + "learning_rate": 1.6930314459946474e-05, + "loss": 1.0338, + "step": 19340 + }, + { + "epoch": 0.27875016206404774, + "grad_norm": 0.6050014495849609, + "learning_rate": 1.6926950178779777e-05, + "loss": 1.0426, + "step": 19350 + }, + { + "epoch": 0.27889421899534694, + "grad_norm": 0.5525252819061279, + "learning_rate": 1.6923584389711587e-05, + "loss": 1.061, + "step": 19360 + }, + { + "epoch": 0.2790382759266462, + "grad_norm": 0.5604902505874634, + "learning_rate": 1.6920217093474583e-05, + "loss": 1.0249, + "step": 19370 + }, + { + "epoch": 0.27918233285794547, + "grad_norm": 0.5230039954185486, + "learning_rate": 1.6916848290801784e-05, + "loss": 1.0, + "step": 19380 + }, + { + "epoch": 0.27932638978924473, + "grad_norm": 0.612128734588623, + "learning_rate": 1.691347798242653e-05, + "loss": 1.0477, + "step": 19390 + }, + { + "epoch": 0.27947044672054394, + "grad_norm": 0.6256338953971863, + "learning_rate": 1.691010616908249e-05, + "loss": 1.0532, + "step": 19400 + }, + { + "epoch": 0.2796145036518432, + "grad_norm": 0.6843680739402771, + "learning_rate": 1.6906732851503665e-05, + "loss": 1.0241, + "step": 19410 + }, + { + "epoch": 0.27975856058314247, + "grad_norm": 0.5366109609603882, + "learning_rate": 1.6903358030424373e-05, + "loss": 1.0332, + "step": 19420 + }, + { + "epoch": 0.27990261751444173, + "grad_norm": 0.5890610814094543, + "learning_rate": 1.689998170657927e-05, + "loss": 1.0333, + "step": 19430 + }, + { + "epoch": 0.28004667444574094, + "grad_norm": 0.6060905456542969, + "learning_rate": 1.689660388070334e-05, + "loss": 1.017, + "step": 19440 + }, + { + "epoch": 0.2801907313770402, + "grad_norm": 0.6831796765327454, + "learning_rate": 1.6893224553531876e-05, + "loss": 1.0284, + "step": 19450 + }, + { + "epoch": 0.28033478830833947, + "grad_norm": 0.5462735295295715, + "learning_rate": 1.6889843725800523e-05, + "loss": 1.0192, + "step": 19460 + }, + { + "epoch": 0.28047884523963873, + "grad_norm": 0.5883505940437317, + "learning_rate": 1.6886461398245234e-05, + "loss": 1.0397, + "step": 19470 + }, + { + "epoch": 0.28062290217093794, + "grad_norm": 0.5593507885932922, + "learning_rate": 1.6883077571602303e-05, + "loss": 1.0486, + "step": 19480 + }, + { + "epoch": 0.2807669591022372, + "grad_norm": 0.5353315472602844, + "learning_rate": 1.6879692246608332e-05, + "loss": 1.0466, + "step": 19490 + }, + { + "epoch": 0.28091101603353646, + "grad_norm": 0.5512276291847229, + "learning_rate": 1.687630542400026e-05, + "loss": 1.0276, + "step": 19500 + }, + { + "epoch": 0.28105507296483573, + "grad_norm": 0.5370855927467346, + "learning_rate": 1.6872917104515364e-05, + "loss": 1.0362, + "step": 19510 + }, + { + "epoch": 0.28119912989613494, + "grad_norm": 0.6030184030532837, + "learning_rate": 1.6869527288891224e-05, + "loss": 1.0432, + "step": 19520 + }, + { + "epoch": 0.2813431868274342, + "grad_norm": 0.5397059917449951, + "learning_rate": 1.6866135977865758e-05, + "loss": 1.0351, + "step": 19530 + }, + { + "epoch": 0.28148724375873346, + "grad_norm": 0.641499400138855, + "learning_rate": 1.686274317217721e-05, + "loss": 1.0503, + "step": 19540 + }, + { + "epoch": 0.2816313006900327, + "grad_norm": 0.5896559953689575, + "learning_rate": 1.6859348872564137e-05, + "loss": 1.0373, + "step": 19550 + }, + { + "epoch": 0.28177535762133193, + "grad_norm": 0.5953365564346313, + "learning_rate": 1.685595307976545e-05, + "loss": 1.0426, + "step": 19560 + }, + { + "epoch": 0.2819194145526312, + "grad_norm": 0.5723481178283691, + "learning_rate": 1.6852555794520346e-05, + "loss": 1.0368, + "step": 19570 + }, + { + "epoch": 0.28206347148393046, + "grad_norm": 0.5377451181411743, + "learning_rate": 1.6849157017568382e-05, + "loss": 1.0357, + "step": 19580 + }, + { + "epoch": 0.2822075284152297, + "grad_norm": 0.5293382406234741, + "learning_rate": 1.684575674964942e-05, + "loss": 1.0455, + "step": 19590 + }, + { + "epoch": 0.28235158534652893, + "grad_norm": 0.7441731095314026, + "learning_rate": 1.684235499150365e-05, + "loss": 1.0389, + "step": 19600 + }, + { + "epoch": 0.2824956422778282, + "grad_norm": 0.49708226323127747, + "learning_rate": 1.6838951743871587e-05, + "loss": 1.0322, + "step": 19610 + }, + { + "epoch": 0.28263969920912746, + "grad_norm": 0.4691644608974457, + "learning_rate": 1.6835547007494073e-05, + "loss": 1.0408, + "step": 19620 + }, + { + "epoch": 0.2827837561404267, + "grad_norm": 0.6220194697380066, + "learning_rate": 1.6832140783112274e-05, + "loss": 1.0341, + "step": 19630 + }, + { + "epoch": 0.28292781307172593, + "grad_norm": 0.6367720365524292, + "learning_rate": 1.6828733071467674e-05, + "loss": 1.019, + "step": 19640 + }, + { + "epoch": 0.2830718700030252, + "grad_norm": 0.5318676233291626, + "learning_rate": 1.6825323873302088e-05, + "loss": 1.0198, + "step": 19650 + }, + { + "epoch": 0.28321592693432446, + "grad_norm": 0.5065328478813171, + "learning_rate": 1.682191318935765e-05, + "loss": 1.0715, + "step": 19660 + }, + { + "epoch": 0.2833599838656237, + "grad_norm": 0.5749115347862244, + "learning_rate": 1.681850102037682e-05, + "loss": 1.0481, + "step": 19670 + }, + { + "epoch": 0.28350404079692293, + "grad_norm": 0.5503406524658203, + "learning_rate": 1.6815087367102374e-05, + "loss": 1.0602, + "step": 19680 + }, + { + "epoch": 0.2836480977282222, + "grad_norm": 0.604962944984436, + "learning_rate": 1.6811672230277423e-05, + "loss": 1.0402, + "step": 19690 + }, + { + "epoch": 0.28379215465952146, + "grad_norm": 0.5853504538536072, + "learning_rate": 1.6808255610645405e-05, + "loss": 1.0404, + "step": 19700 + }, + { + "epoch": 0.2839362115908207, + "grad_norm": 0.5229265093803406, + "learning_rate": 1.680483750895005e-05, + "loss": 1.0282, + "step": 19710 + }, + { + "epoch": 0.2840802685221199, + "grad_norm": 0.5563735365867615, + "learning_rate": 1.6801417925935443e-05, + "loss": 1.03, + "step": 19720 + }, + { + "epoch": 0.2842243254534192, + "grad_norm": 0.5629031658172607, + "learning_rate": 1.679799686234598e-05, + "loss": 1.0443, + "step": 19730 + }, + { + "epoch": 0.28436838238471845, + "grad_norm": 0.5461305379867554, + "learning_rate": 1.6794574318926386e-05, + "loss": 1.0291, + "step": 19740 + }, + { + "epoch": 0.2845124393160177, + "grad_norm": 0.5271806120872498, + "learning_rate": 1.6791150296421683e-05, + "loss": 1.0134, + "step": 19750 + }, + { + "epoch": 0.2846564962473169, + "grad_norm": 0.49659666419029236, + "learning_rate": 1.6787724795577252e-05, + "loss": 1.0211, + "step": 19760 + }, + { + "epoch": 0.2848005531786162, + "grad_norm": 0.5652765035629272, + "learning_rate": 1.6784297817138766e-05, + "loss": 1.03, + "step": 19770 + }, + { + "epoch": 0.28494461010991545, + "grad_norm": 0.541502058506012, + "learning_rate": 1.6780869361852234e-05, + "loss": 1.0104, + "step": 19780 + }, + { + "epoch": 0.28508866704121466, + "grad_norm": 0.5626928806304932, + "learning_rate": 1.6777439430463987e-05, + "loss": 1.018, + "step": 19790 + }, + { + "epoch": 0.2852327239725139, + "grad_norm": 0.5848968625068665, + "learning_rate": 1.6774008023720667e-05, + "loss": 1.0196, + "step": 19800 + }, + { + "epoch": 0.2853767809038132, + "grad_norm": 0.5680802464485168, + "learning_rate": 1.6770575142369254e-05, + "loss": 1.0435, + "step": 19810 + }, + { + "epoch": 0.28552083783511245, + "grad_norm": 0.4836186468601227, + "learning_rate": 1.6767140787157025e-05, + "loss": 1.0315, + "step": 19820 + }, + { + "epoch": 0.28566489476641166, + "grad_norm": 0.5267698764801025, + "learning_rate": 1.6763704958831604e-05, + "loss": 1.0446, + "step": 19830 + }, + { + "epoch": 0.2858089516977109, + "grad_norm": 0.6138490438461304, + "learning_rate": 1.676026765814092e-05, + "loss": 1.0096, + "step": 19840 + }, + { + "epoch": 0.2859530086290102, + "grad_norm": 0.6735447645187378, + "learning_rate": 1.6756828885833224e-05, + "loss": 1.0333, + "step": 19850 + }, + { + "epoch": 0.28609706556030945, + "grad_norm": 0.5271909236907959, + "learning_rate": 1.6753388642657085e-05, + "loss": 1.0198, + "step": 19860 + }, + { + "epoch": 0.28624112249160866, + "grad_norm": 0.6136609315872192, + "learning_rate": 1.6749946929361405e-05, + "loss": 1.0168, + "step": 19870 + }, + { + "epoch": 0.2863851794229079, + "grad_norm": 0.5632956027984619, + "learning_rate": 1.6746503746695397e-05, + "loss": 1.0367, + "step": 19880 + }, + { + "epoch": 0.2865292363542072, + "grad_norm": 0.6717610359191895, + "learning_rate": 1.674305909540859e-05, + "loss": 1.0523, + "step": 19890 + }, + { + "epoch": 0.28667329328550645, + "grad_norm": 0.6363637447357178, + "learning_rate": 1.6739612976250836e-05, + "loss": 1.0364, + "step": 19900 + }, + { + "epoch": 0.28681735021680566, + "grad_norm": 0.5844237804412842, + "learning_rate": 1.673616538997231e-05, + "loss": 1.0168, + "step": 19910 + }, + { + "epoch": 0.2869614071481049, + "grad_norm": 0.6943037509918213, + "learning_rate": 1.6732716337323506e-05, + "loss": 1.0192, + "step": 19920 + }, + { + "epoch": 0.2871054640794042, + "grad_norm": 0.5689255595207214, + "learning_rate": 1.6729265819055232e-05, + "loss": 1.0391, + "step": 19930 + }, + { + "epoch": 0.28724952101070345, + "grad_norm": 0.5841550827026367, + "learning_rate": 1.6725813835918622e-05, + "loss": 1.0263, + "step": 19940 + }, + { + "epoch": 0.28739357794200265, + "grad_norm": 0.5294755101203918, + "learning_rate": 1.6722360388665118e-05, + "loss": 1.025, + "step": 19950 + }, + { + "epoch": 0.2875376348733019, + "grad_norm": 0.5748166441917419, + "learning_rate": 1.6718905478046497e-05, + "loss": 1.0463, + "step": 19960 + }, + { + "epoch": 0.2876816918046012, + "grad_norm": 0.5007639527320862, + "learning_rate": 1.6715449104814842e-05, + "loss": 1.0509, + "step": 19970 + }, + { + "epoch": 0.28782574873590044, + "grad_norm": 0.6602321267127991, + "learning_rate": 1.6711991269722552e-05, + "loss": 1.0537, + "step": 19980 + }, + { + "epoch": 0.28796980566719965, + "grad_norm": 0.5422115921974182, + "learning_rate": 1.6708531973522355e-05, + "loss": 1.0147, + "step": 19990 + }, + { + "epoch": 0.2881138625984989, + "grad_norm": 0.6622305512428284, + "learning_rate": 1.6705071216967294e-05, + "loss": 1.0313, + "step": 20000 + }, + { + "epoch": 0.2882579195297982, + "grad_norm": 0.658025324344635, + "learning_rate": 1.670160900081072e-05, + "loss": 1.0425, + "step": 20010 + }, + { + "epoch": 0.28840197646109744, + "grad_norm": 0.5682501196861267, + "learning_rate": 1.6698145325806322e-05, + "loss": 1.0084, + "step": 20020 + }, + { + "epoch": 0.28854603339239665, + "grad_norm": 0.5901638269424438, + "learning_rate": 1.6694680192708083e-05, + "loss": 1.0426, + "step": 20030 + }, + { + "epoch": 0.2886900903236959, + "grad_norm": 0.49289217591285706, + "learning_rate": 1.6691213602270325e-05, + "loss": 1.0386, + "step": 20040 + }, + { + "epoch": 0.2888341472549952, + "grad_norm": 0.6020851135253906, + "learning_rate": 1.6687745555247664e-05, + "loss": 1.0558, + "step": 20050 + }, + { + "epoch": 0.28897820418629444, + "grad_norm": 0.5392109155654907, + "learning_rate": 1.6684276052395056e-05, + "loss": 1.001, + "step": 20060 + }, + { + "epoch": 0.28912226111759365, + "grad_norm": 0.6716406941413879, + "learning_rate": 1.6680805094467763e-05, + "loss": 1.037, + "step": 20070 + }, + { + "epoch": 0.2892663180488929, + "grad_norm": 0.6591319441795349, + "learning_rate": 1.667733268222136e-05, + "loss": 1.0455, + "step": 20080 + }, + { + "epoch": 0.2894103749801922, + "grad_norm": 0.5621931552886963, + "learning_rate": 1.667385881641175e-05, + "loss": 1.0574, + "step": 20090 + }, + { + "epoch": 0.28955443191149144, + "grad_norm": 0.5402848720550537, + "learning_rate": 1.6670383497795136e-05, + "loss": 1.0321, + "step": 20100 + }, + { + "epoch": 0.28969848884279065, + "grad_norm": 0.6561526656150818, + "learning_rate": 1.6666906727128055e-05, + "loss": 1.0498, + "step": 20110 + }, + { + "epoch": 0.2898425457740899, + "grad_norm": 0.5861624479293823, + "learning_rate": 1.6663428505167353e-05, + "loss": 1.057, + "step": 20120 + }, + { + "epoch": 0.2899866027053892, + "grad_norm": 0.5896426439285278, + "learning_rate": 1.6659948832670183e-05, + "loss": 1.0493, + "step": 20130 + }, + { + "epoch": 0.29013065963668844, + "grad_norm": 0.5974220633506775, + "learning_rate": 1.665646771039403e-05, + "loss": 1.0337, + "step": 20140 + }, + { + "epoch": 0.29027471656798765, + "grad_norm": 0.6230769157409668, + "learning_rate": 1.6652985139096678e-05, + "loss": 1.0166, + "step": 20150 + }, + { + "epoch": 0.2904187734992869, + "grad_norm": 0.6564093828201294, + "learning_rate": 1.664950111953624e-05, + "loss": 1.0296, + "step": 20160 + }, + { + "epoch": 0.29056283043058617, + "grad_norm": 0.46283620595932007, + "learning_rate": 1.664601565247114e-05, + "loss": 1.0271, + "step": 20170 + }, + { + "epoch": 0.29070688736188544, + "grad_norm": 0.521889328956604, + "learning_rate": 1.664252873866011e-05, + "loss": 1.0379, + "step": 20180 + }, + { + "epoch": 0.29085094429318464, + "grad_norm": 0.6000293493270874, + "learning_rate": 1.6639040378862215e-05, + "loss": 1.0343, + "step": 20190 + }, + { + "epoch": 0.2909950012244839, + "grad_norm": 0.5207245945930481, + "learning_rate": 1.6635550573836806e-05, + "loss": 1.033, + "step": 20200 + }, + { + "epoch": 0.29113905815578317, + "grad_norm": 0.5225106477737427, + "learning_rate": 1.6632059324343583e-05, + "loss": 1.0286, + "step": 20210 + }, + { + "epoch": 0.29128311508708243, + "grad_norm": 0.5262543559074402, + "learning_rate": 1.662856663114253e-05, + "loss": 1.0267, + "step": 20220 + }, + { + "epoch": 0.29142717201838164, + "grad_norm": 0.5357270836830139, + "learning_rate": 1.6625072494993967e-05, + "loss": 1.0265, + "step": 20230 + }, + { + "epoch": 0.2915712289496809, + "grad_norm": 0.5790686011314392, + "learning_rate": 1.6621576916658508e-05, + "loss": 1.0194, + "step": 20240 + }, + { + "epoch": 0.29171528588098017, + "grad_norm": 0.5545904636383057, + "learning_rate": 1.6618079896897105e-05, + "loss": 1.0377, + "step": 20250 + }, + { + "epoch": 0.29185934281227943, + "grad_norm": 0.520649254322052, + "learning_rate": 1.6614581436471e-05, + "loss": 1.0359, + "step": 20260 + }, + { + "epoch": 0.29200339974357864, + "grad_norm": 0.5825948119163513, + "learning_rate": 1.6611081536141768e-05, + "loss": 1.0492, + "step": 20270 + }, + { + "epoch": 0.2921474566748779, + "grad_norm": 0.597335696220398, + "learning_rate": 1.660758019667128e-05, + "loss": 1.0203, + "step": 20280 + }, + { + "epoch": 0.29229151360617717, + "grad_norm": 0.6152435541152954, + "learning_rate": 1.6604077418821742e-05, + "loss": 1.0335, + "step": 20290 + }, + { + "epoch": 0.29243557053747643, + "grad_norm": 0.5295214653015137, + "learning_rate": 1.660057320335565e-05, + "loss": 1.0415, + "step": 20300 + }, + { + "epoch": 0.29257962746877564, + "grad_norm": 0.5754656195640564, + "learning_rate": 1.6597067551035827e-05, + "loss": 1.0281, + "step": 20310 + }, + { + "epoch": 0.2927236844000749, + "grad_norm": 0.6352607607841492, + "learning_rate": 1.6593560462625405e-05, + "loss": 1.0496, + "step": 20320 + }, + { + "epoch": 0.29286774133137417, + "grad_norm": 0.6430121064186096, + "learning_rate": 1.6590051938887824e-05, + "loss": 1.045, + "step": 20330 + }, + { + "epoch": 0.29301179826267343, + "grad_norm": 0.7143761515617371, + "learning_rate": 1.658654198058685e-05, + "loss": 1.0392, + "step": 20340 + }, + { + "epoch": 0.29315585519397264, + "grad_norm": 0.5576317310333252, + "learning_rate": 1.6583030588486547e-05, + "loss": 1.0267, + "step": 20350 + }, + { + "epoch": 0.2932999121252719, + "grad_norm": 0.7117175459861755, + "learning_rate": 1.6579517763351302e-05, + "loss": 1.035, + "step": 20360 + }, + { + "epoch": 0.29344396905657116, + "grad_norm": 0.5505491495132446, + "learning_rate": 1.65760035059458e-05, + "loss": 1.0217, + "step": 20370 + }, + { + "epoch": 0.2935880259878704, + "grad_norm": 0.5846567153930664, + "learning_rate": 1.6572487817035054e-05, + "loss": 1.0179, + "step": 20380 + }, + { + "epoch": 0.29373208291916963, + "grad_norm": 0.6030494570732117, + "learning_rate": 1.656897069738437e-05, + "loss": 1.05, + "step": 20390 + }, + { + "epoch": 0.2938761398504689, + "grad_norm": 0.5789952874183655, + "learning_rate": 1.6565452147759393e-05, + "loss": 1.0117, + "step": 20400 + }, + { + "epoch": 0.29402019678176816, + "grad_norm": 0.5466416478157043, + "learning_rate": 1.6561932168926053e-05, + "loss": 1.032, + "step": 20410 + }, + { + "epoch": 0.2941642537130674, + "grad_norm": 0.635989248752594, + "learning_rate": 1.6558410761650598e-05, + "loss": 1.0556, + "step": 20420 + }, + { + "epoch": 0.29430831064436663, + "grad_norm": 0.5572335720062256, + "learning_rate": 1.6554887926699595e-05, + "loss": 1.0115, + "step": 20430 + }, + { + "epoch": 0.2944523675756659, + "grad_norm": 0.5357279777526855, + "learning_rate": 1.6551363664839915e-05, + "loss": 1.019, + "step": 20440 + }, + { + "epoch": 0.29459642450696516, + "grad_norm": 0.5445215702056885, + "learning_rate": 1.654783797683874e-05, + "loss": 0.9884, + "step": 20450 + }, + { + "epoch": 0.2947404814382644, + "grad_norm": 0.5228425860404968, + "learning_rate": 1.6544310863463568e-05, + "loss": 1.0215, + "step": 20460 + }, + { + "epoch": 0.29488453836956363, + "grad_norm": 0.5866361260414124, + "learning_rate": 1.6540782325482195e-05, + "loss": 1.0442, + "step": 20470 + }, + { + "epoch": 0.2950285953008629, + "grad_norm": 0.5385952591896057, + "learning_rate": 1.6537252363662744e-05, + "loss": 1.0343, + "step": 20480 + }, + { + "epoch": 0.29517265223216216, + "grad_norm": 0.5235530138015747, + "learning_rate": 1.6533720978773634e-05, + "loss": 1.0344, + "step": 20490 + }, + { + "epoch": 0.2953167091634614, + "grad_norm": 0.6012077927589417, + "learning_rate": 1.65301881715836e-05, + "loss": 1.0438, + "step": 20500 + }, + { + "epoch": 0.29546076609476063, + "grad_norm": 0.58733731508255, + "learning_rate": 1.6526653942861685e-05, + "loss": 1.0227, + "step": 20510 + }, + { + "epoch": 0.2956048230260599, + "grad_norm": 0.662129282951355, + "learning_rate": 1.652311829337724e-05, + "loss": 1.0133, + "step": 20520 + }, + { + "epoch": 0.29574887995735916, + "grad_norm": 0.6669366955757141, + "learning_rate": 1.651958122389993e-05, + "loss": 1.0378, + "step": 20530 + }, + { + "epoch": 0.2958929368886584, + "grad_norm": 0.638819694519043, + "learning_rate": 1.651604273519973e-05, + "loss": 1.0532, + "step": 20540 + }, + { + "epoch": 0.29603699381995763, + "grad_norm": 0.5322790145874023, + "learning_rate": 1.651250282804691e-05, + "loss": 1.0466, + "step": 20550 + }, + { + "epoch": 0.2961810507512569, + "grad_norm": 0.569996178150177, + "learning_rate": 1.6508961503212063e-05, + "loss": 1.0276, + "step": 20560 + }, + { + "epoch": 0.29632510768255615, + "grad_norm": 0.6113317608833313, + "learning_rate": 1.650541876146609e-05, + "loss": 1.019, + "step": 20570 + }, + { + "epoch": 0.2964691646138554, + "grad_norm": 0.5505111217498779, + "learning_rate": 1.6501874603580195e-05, + "loss": 1.0411, + "step": 20580 + }, + { + "epoch": 0.2966132215451546, + "grad_norm": 0.5778427124023438, + "learning_rate": 1.6498329030325894e-05, + "loss": 1.0259, + "step": 20590 + }, + { + "epoch": 0.2967572784764539, + "grad_norm": 0.5522878170013428, + "learning_rate": 1.6494782042475007e-05, + "loss": 1.0457, + "step": 20600 + }, + { + "epoch": 0.29690133540775315, + "grad_norm": 0.6135400533676147, + "learning_rate": 1.649123364079966e-05, + "loss": 1.039, + "step": 20610 + }, + { + "epoch": 0.2970453923390524, + "grad_norm": 0.623113214969635, + "learning_rate": 1.6487683826072302e-05, + "loss": 1.0324, + "step": 20620 + }, + { + "epoch": 0.2971894492703516, + "grad_norm": 0.7084492444992065, + "learning_rate": 1.6484132599065673e-05, + "loss": 1.0225, + "step": 20630 + }, + { + "epoch": 0.2973335062016509, + "grad_norm": 0.6530845165252686, + "learning_rate": 1.648057996055282e-05, + "loss": 1.02, + "step": 20640 + }, + { + "epoch": 0.29747756313295015, + "grad_norm": 0.6086758375167847, + "learning_rate": 1.6477025911307115e-05, + "loss": 1.0203, + "step": 20650 + }, + { + "epoch": 0.2976216200642494, + "grad_norm": 0.6114258766174316, + "learning_rate": 1.6473470452102218e-05, + "loss": 1.0293, + "step": 20660 + }, + { + "epoch": 0.2977656769955486, + "grad_norm": 0.5102829933166504, + "learning_rate": 1.6469913583712103e-05, + "loss": 1.0324, + "step": 20670 + }, + { + "epoch": 0.2979097339268479, + "grad_norm": 0.5642375349998474, + "learning_rate": 1.6466355306911054e-05, + "loss": 1.0168, + "step": 20680 + }, + { + "epoch": 0.29805379085814715, + "grad_norm": 0.5559593439102173, + "learning_rate": 1.6462795622473658e-05, + "loss": 1.0528, + "step": 20690 + }, + { + "epoch": 0.2981978477894464, + "grad_norm": 0.5291323065757751, + "learning_rate": 1.645923453117481e-05, + "loss": 1.0136, + "step": 20700 + }, + { + "epoch": 0.2983419047207456, + "grad_norm": 0.5465613007545471, + "learning_rate": 1.6455672033789712e-05, + "loss": 1.0422, + "step": 20710 + }, + { + "epoch": 0.2984859616520449, + "grad_norm": 0.5516693592071533, + "learning_rate": 1.6452108131093866e-05, + "loss": 1.0241, + "step": 20720 + }, + { + "epoch": 0.29863001858334415, + "grad_norm": 0.652644157409668, + "learning_rate": 1.6448542823863088e-05, + "loss": 1.041, + "step": 20730 + }, + { + "epoch": 0.2987740755146434, + "grad_norm": 0.7366124987602234, + "learning_rate": 1.6444976112873493e-05, + "loss": 1.0127, + "step": 20740 + }, + { + "epoch": 0.2989181324459426, + "grad_norm": 0.49661996960639954, + "learning_rate": 1.644140799890151e-05, + "loss": 1.0279, + "step": 20750 + }, + { + "epoch": 0.2990621893772419, + "grad_norm": 0.5000258088111877, + "learning_rate": 1.643783848272386e-05, + "loss": 1.0221, + "step": 20760 + }, + { + "epoch": 0.29920624630854115, + "grad_norm": 0.5386760234832764, + "learning_rate": 1.6434267565117586e-05, + "loss": 1.0341, + "step": 20770 + }, + { + "epoch": 0.2993503032398404, + "grad_norm": 0.5642584562301636, + "learning_rate": 1.6430695246860026e-05, + "loss": 1.0624, + "step": 20780 + }, + { + "epoch": 0.2994943601711396, + "grad_norm": 0.5452033877372742, + "learning_rate": 1.6427121528728816e-05, + "loss": 1.0381, + "step": 20790 + }, + { + "epoch": 0.2996384171024389, + "grad_norm": 0.5538674592971802, + "learning_rate": 1.6423546411501916e-05, + "loss": 1.0195, + "step": 20800 + }, + { + "epoch": 0.29978247403373814, + "grad_norm": 0.6314508318901062, + "learning_rate": 1.6419969895957574e-05, + "loss": 1.0404, + "step": 20810 + }, + { + "epoch": 0.2999265309650374, + "grad_norm": 0.5990069508552551, + "learning_rate": 1.6416391982874348e-05, + "loss": 1.0206, + "step": 20820 + }, + { + "epoch": 0.3000705878963366, + "grad_norm": 0.5225420594215393, + "learning_rate": 1.6412812673031102e-05, + "loss": 1.0394, + "step": 20830 + }, + { + "epoch": 0.3002146448276359, + "grad_norm": 0.5708862543106079, + "learning_rate": 1.6409231967207004e-05, + "loss": 1.0171, + "step": 20840 + }, + { + "epoch": 0.30035870175893514, + "grad_norm": 0.6064662337303162, + "learning_rate": 1.6405649866181518e-05, + "loss": 1.0127, + "step": 20850 + }, + { + "epoch": 0.3005027586902344, + "grad_norm": 0.5953640341758728, + "learning_rate": 1.6402066370734426e-05, + "loss": 1.0358, + "step": 20860 + }, + { + "epoch": 0.3006468156215336, + "grad_norm": 0.603715181350708, + "learning_rate": 1.63984814816458e-05, + "loss": 1.0274, + "step": 20870 + }, + { + "epoch": 0.3007908725528329, + "grad_norm": 0.615307092666626, + "learning_rate": 1.639489519969602e-05, + "loss": 1.0424, + "step": 20880 + }, + { + "epoch": 0.30093492948413214, + "grad_norm": 0.5497210025787354, + "learning_rate": 1.6391307525665777e-05, + "loss": 1.0445, + "step": 20890 + }, + { + "epoch": 0.3010789864154314, + "grad_norm": 0.5842812657356262, + "learning_rate": 1.6387718460336052e-05, + "loss": 1.0537, + "step": 20900 + }, + { + "epoch": 0.3012230433467306, + "grad_norm": 0.4811582565307617, + "learning_rate": 1.6384128004488137e-05, + "loss": 1.0189, + "step": 20910 + }, + { + "epoch": 0.3013671002780299, + "grad_norm": 0.6424331665039062, + "learning_rate": 1.6380536158903627e-05, + "loss": 1.0306, + "step": 20920 + }, + { + "epoch": 0.30151115720932914, + "grad_norm": 0.6306112408638, + "learning_rate": 1.637694292436441e-05, + "loss": 1.0369, + "step": 20930 + }, + { + "epoch": 0.3016552141406284, + "grad_norm": 0.5689916610717773, + "learning_rate": 1.637334830165269e-05, + "loss": 1.0384, + "step": 20940 + }, + { + "epoch": 0.3017992710719276, + "grad_norm": 0.5546683669090271, + "learning_rate": 1.6369752291550965e-05, + "loss": 1.0277, + "step": 20950 + }, + { + "epoch": 0.3019433280032269, + "grad_norm": 0.559262216091156, + "learning_rate": 1.6366154894842038e-05, + "loss": 1.0368, + "step": 20960 + }, + { + "epoch": 0.30208738493452614, + "grad_norm": 0.5267927050590515, + "learning_rate": 1.6362556112309008e-05, + "loss": 1.0213, + "step": 20970 + }, + { + "epoch": 0.3022314418658254, + "grad_norm": 0.560387909412384, + "learning_rate": 1.635895594473529e-05, + "loss": 1.0386, + "step": 20980 + }, + { + "epoch": 0.3023754987971246, + "grad_norm": 0.5496255159378052, + "learning_rate": 1.6355354392904584e-05, + "loss": 1.0391, + "step": 20990 + }, + { + "epoch": 0.3025195557284239, + "grad_norm": 0.5311712622642517, + "learning_rate": 1.6351751457600894e-05, + "loss": 1.0272, + "step": 21000 + }, + { + "epoch": 0.30266361265972314, + "grad_norm": 0.5476694703102112, + "learning_rate": 1.634814713960854e-05, + "loss": 1.0277, + "step": 21010 + }, + { + "epoch": 0.3028076695910224, + "grad_norm": 0.5251296162605286, + "learning_rate": 1.634454143971212e-05, + "loss": 1.0362, + "step": 21020 + }, + { + "epoch": 0.3029517265223216, + "grad_norm": 0.46365106105804443, + "learning_rate": 1.6340934358696553e-05, + "loss": 1.0149, + "step": 21030 + }, + { + "epoch": 0.30309578345362087, + "grad_norm": 0.7171838879585266, + "learning_rate": 1.6337325897347052e-05, + "loss": 1.0523, + "step": 21040 + }, + { + "epoch": 0.30323984038492013, + "grad_norm": 0.6140922904014587, + "learning_rate": 1.6333716056449125e-05, + "loss": 1.0429, + "step": 21050 + }, + { + "epoch": 0.3033838973162194, + "grad_norm": 0.5282112956047058, + "learning_rate": 1.6330104836788588e-05, + "loss": 1.033, + "step": 21060 + }, + { + "epoch": 0.3035279542475186, + "grad_norm": 0.5564023852348328, + "learning_rate": 1.632649223915155e-05, + "loss": 1.052, + "step": 21070 + }, + { + "epoch": 0.30367201117881787, + "grad_norm": 0.5264879465103149, + "learning_rate": 1.6322878264324423e-05, + "loss": 1.0567, + "step": 21080 + }, + { + "epoch": 0.30381606811011713, + "grad_norm": 0.5021429061889648, + "learning_rate": 1.631926291309393e-05, + "loss": 1.0304, + "step": 21090 + }, + { + "epoch": 0.30396012504141634, + "grad_norm": 0.5093965530395508, + "learning_rate": 1.6315646186247067e-05, + "loss": 1.0304, + "step": 21100 + }, + { + "epoch": 0.3041041819727156, + "grad_norm": 0.556792140007019, + "learning_rate": 1.6312028084571157e-05, + "loss": 1.0294, + "step": 21110 + }, + { + "epoch": 0.30424823890401487, + "grad_norm": 0.5673565864562988, + "learning_rate": 1.630840860885381e-05, + "loss": 1.0155, + "step": 21120 + }, + { + "epoch": 0.30439229583531413, + "grad_norm": 0.9615490436553955, + "learning_rate": 1.6304787759882928e-05, + "loss": 1.0346, + "step": 21130 + }, + { + "epoch": 0.30453635276661334, + "grad_norm": 0.5927960872650146, + "learning_rate": 1.6301165538446732e-05, + "loss": 1.0463, + "step": 21140 + }, + { + "epoch": 0.3046804096979126, + "grad_norm": 0.5788955092430115, + "learning_rate": 1.6297541945333716e-05, + "loss": 1.0343, + "step": 21150 + }, + { + "epoch": 0.30482446662921187, + "grad_norm": 0.5964028835296631, + "learning_rate": 1.6293916981332695e-05, + "loss": 1.0474, + "step": 21160 + }, + { + "epoch": 0.30496852356051113, + "grad_norm": 0.5674517750740051, + "learning_rate": 1.6290290647232774e-05, + "loss": 1.0463, + "step": 21170 + }, + { + "epoch": 0.30511258049181034, + "grad_norm": 0.5272372961044312, + "learning_rate": 1.6286662943823352e-05, + "loss": 1.0276, + "step": 21180 + }, + { + "epoch": 0.3052566374231096, + "grad_norm": 0.46439695358276367, + "learning_rate": 1.628303387189413e-05, + "loss": 1.0119, + "step": 21190 + }, + { + "epoch": 0.30540069435440886, + "grad_norm": 0.5587084889411926, + "learning_rate": 1.6279403432235113e-05, + "loss": 1.0111, + "step": 21200 + }, + { + "epoch": 0.3055447512857081, + "grad_norm": 0.6520376801490784, + "learning_rate": 1.6275771625636592e-05, + "loss": 1.0378, + "step": 21210 + }, + { + "epoch": 0.30568880821700734, + "grad_norm": 0.534944474697113, + "learning_rate": 1.627213845288916e-05, + "loss": 1.022, + "step": 21220 + }, + { + "epoch": 0.3058328651483066, + "grad_norm": 0.642658531665802, + "learning_rate": 1.6268503914783714e-05, + "loss": 1.0178, + "step": 21230 + }, + { + "epoch": 0.30597692207960586, + "grad_norm": 0.5053707361221313, + "learning_rate": 1.6264868012111436e-05, + "loss": 1.0179, + "step": 21240 + }, + { + "epoch": 0.3061209790109051, + "grad_norm": 0.6561596393585205, + "learning_rate": 1.626123074566382e-05, + "loss": 1.0395, + "step": 21250 + }, + { + "epoch": 0.30626503594220433, + "grad_norm": 0.5093657374382019, + "learning_rate": 1.6257592116232642e-05, + "loss": 1.0322, + "step": 21260 + }, + { + "epoch": 0.3064090928735036, + "grad_norm": 0.5733696818351746, + "learning_rate": 1.6253952124609984e-05, + "loss": 1.0254, + "step": 21270 + }, + { + "epoch": 0.30655314980480286, + "grad_norm": 0.6285299062728882, + "learning_rate": 1.625031077158822e-05, + "loss": 1.0453, + "step": 21280 + }, + { + "epoch": 0.3066972067361021, + "grad_norm": 0.7546630501747131, + "learning_rate": 1.6246668057960026e-05, + "loss": 1.028, + "step": 21290 + }, + { + "epoch": 0.30684126366740133, + "grad_norm": 0.5427360534667969, + "learning_rate": 1.6243023984518368e-05, + "loss": 1.0203, + "step": 21300 + }, + { + "epoch": 0.3069853205987006, + "grad_norm": 0.594230592250824, + "learning_rate": 1.6239378552056514e-05, + "loss": 1.0062, + "step": 21310 + }, + { + "epoch": 0.30712937752999986, + "grad_norm": 0.521277666091919, + "learning_rate": 1.6235731761368022e-05, + "loss": 1.0194, + "step": 21320 + }, + { + "epoch": 0.3072734344612991, + "grad_norm": 0.5680876970291138, + "learning_rate": 1.6232083613246745e-05, + "loss": 1.0194, + "step": 21330 + }, + { + "epoch": 0.30741749139259833, + "grad_norm": 0.5643933415412903, + "learning_rate": 1.6228434108486833e-05, + "loss": 1.0392, + "step": 21340 + }, + { + "epoch": 0.3075615483238976, + "grad_norm": 0.6359453201293945, + "learning_rate": 1.6224783247882743e-05, + "loss": 1.0295, + "step": 21350 + }, + { + "epoch": 0.30770560525519686, + "grad_norm": 0.4792833924293518, + "learning_rate": 1.6221131032229208e-05, + "loss": 1.0182, + "step": 21360 + }, + { + "epoch": 0.3078496621864961, + "grad_norm": 0.5217666625976562, + "learning_rate": 1.6217477462321264e-05, + "loss": 1.0262, + "step": 21370 + }, + { + "epoch": 0.30799371911779533, + "grad_norm": 0.5734424591064453, + "learning_rate": 1.6213822538954255e-05, + "loss": 1.0242, + "step": 21380 + }, + { + "epoch": 0.3081377760490946, + "grad_norm": 0.4927959442138672, + "learning_rate": 1.6210166262923794e-05, + "loss": 1.0383, + "step": 21390 + }, + { + "epoch": 0.30828183298039386, + "grad_norm": 0.5450374484062195, + "learning_rate": 1.6206508635025808e-05, + "loss": 1.0074, + "step": 21400 + }, + { + "epoch": 0.3084258899116931, + "grad_norm": 0.49887827038764954, + "learning_rate": 1.620284965605651e-05, + "loss": 1.0352, + "step": 21410 + }, + { + "epoch": 0.3085699468429923, + "grad_norm": 0.5779523849487305, + "learning_rate": 1.619918932681241e-05, + "loss": 1.0316, + "step": 21420 + }, + { + "epoch": 0.3087140037742916, + "grad_norm": 0.5187239050865173, + "learning_rate": 1.6195527648090314e-05, + "loss": 1.0374, + "step": 21430 + }, + { + "epoch": 0.30885806070559085, + "grad_norm": 0.543380081653595, + "learning_rate": 1.6191864620687314e-05, + "loss": 1.0418, + "step": 21440 + }, + { + "epoch": 0.3090021176368901, + "grad_norm": 0.5300890207290649, + "learning_rate": 1.6188200245400802e-05, + "loss": 1.0279, + "step": 21450 + }, + { + "epoch": 0.3091461745681893, + "grad_norm": 0.530333936214447, + "learning_rate": 1.618453452302847e-05, + "loss": 1.0377, + "step": 21460 + }, + { + "epoch": 0.3092902314994886, + "grad_norm": 0.6680883765220642, + "learning_rate": 1.618086745436828e-05, + "loss": 1.03, + "step": 21470 + }, + { + "epoch": 0.30943428843078785, + "grad_norm": 0.4743339419364929, + "learning_rate": 1.6177199040218512e-05, + "loss": 1.0197, + "step": 21480 + }, + { + "epoch": 0.3095783453620871, + "grad_norm": 0.5322113037109375, + "learning_rate": 1.6173529281377728e-05, + "loss": 1.0161, + "step": 21490 + }, + { + "epoch": 0.3097224022933863, + "grad_norm": 0.5997210144996643, + "learning_rate": 1.6169858178644786e-05, + "loss": 1.0276, + "step": 21500 + }, + { + "epoch": 0.3098664592246856, + "grad_norm": 0.5533350706100464, + "learning_rate": 1.6166185732818828e-05, + "loss": 1.0364, + "step": 21510 + }, + { + "epoch": 0.31001051615598485, + "grad_norm": 0.7028584480285645, + "learning_rate": 1.6162511944699302e-05, + "loss": 1.0329, + "step": 21520 + }, + { + "epoch": 0.3101545730872841, + "grad_norm": 0.49295574426651, + "learning_rate": 1.6158836815085934e-05, + "loss": 1.0283, + "step": 21530 + }, + { + "epoch": 0.3102986300185833, + "grad_norm": 0.5209812521934509, + "learning_rate": 1.6155160344778752e-05, + "loss": 1.0188, + "step": 21540 + }, + { + "epoch": 0.3104426869498826, + "grad_norm": 0.6073362827301025, + "learning_rate": 1.615148253457808e-05, + "loss": 1.0439, + "step": 21550 + }, + { + "epoch": 0.31058674388118185, + "grad_norm": 0.48292115330696106, + "learning_rate": 1.6147803385284513e-05, + "loss": 1.0354, + "step": 21560 + }, + { + "epoch": 0.3107308008124811, + "grad_norm": 0.6610682010650635, + "learning_rate": 1.614412289769896e-05, + "loss": 1.0681, + "step": 21570 + }, + { + "epoch": 0.3108748577437803, + "grad_norm": 0.5379059910774231, + "learning_rate": 1.6140441072622614e-05, + "loss": 1.0363, + "step": 21580 + }, + { + "epoch": 0.3110189146750796, + "grad_norm": 0.6262497305870056, + "learning_rate": 1.6136757910856948e-05, + "loss": 1.0287, + "step": 21590 + }, + { + "epoch": 0.31116297160637885, + "grad_norm": 0.6049202680587769, + "learning_rate": 1.6133073413203745e-05, + "loss": 1.0502, + "step": 21600 + }, + { + "epoch": 0.3113070285376781, + "grad_norm": 0.5770962834358215, + "learning_rate": 1.6129387580465065e-05, + "loss": 1.0425, + "step": 21610 + }, + { + "epoch": 0.3114510854689773, + "grad_norm": 0.620063841342926, + "learning_rate": 1.612570041344326e-05, + "loss": 1.0446, + "step": 21620 + }, + { + "epoch": 0.3115951424002766, + "grad_norm": 0.5885114669799805, + "learning_rate": 1.6122011912940983e-05, + "loss": 1.0377, + "step": 21630 + }, + { + "epoch": 0.31173919933157584, + "grad_norm": 0.48093509674072266, + "learning_rate": 1.6118322079761167e-05, + "loss": 1.0221, + "step": 21640 + }, + { + "epoch": 0.3118832562628751, + "grad_norm": 0.6427713632583618, + "learning_rate": 1.6114630914707036e-05, + "loss": 1.0282, + "step": 21650 + }, + { + "epoch": 0.3120273131941743, + "grad_norm": 0.5798922777175903, + "learning_rate": 1.6110938418582106e-05, + "loss": 1.0243, + "step": 21660 + }, + { + "epoch": 0.3121713701254736, + "grad_norm": 0.6313838362693787, + "learning_rate": 1.6107244592190177e-05, + "loss": 1.0081, + "step": 21670 + }, + { + "epoch": 0.31231542705677284, + "grad_norm": 0.6102400422096252, + "learning_rate": 1.610354943633536e-05, + "loss": 1.0384, + "step": 21680 + }, + { + "epoch": 0.3124594839880721, + "grad_norm": 0.6214983463287354, + "learning_rate": 1.6099852951822023e-05, + "loss": 1.0444, + "step": 21690 + }, + { + "epoch": 0.3126035409193713, + "grad_norm": 0.5477856397628784, + "learning_rate": 1.6096155139454844e-05, + "loss": 1.0341, + "step": 21700 + }, + { + "epoch": 0.3127475978506706, + "grad_norm": 0.5215377807617188, + "learning_rate": 1.6092456000038793e-05, + "loss": 1.0133, + "step": 21710 + }, + { + "epoch": 0.31289165478196984, + "grad_norm": 0.5937027335166931, + "learning_rate": 1.6088755534379115e-05, + "loss": 1.0138, + "step": 21720 + }, + { + "epoch": 0.3130357117132691, + "grad_norm": 0.596572756767273, + "learning_rate": 1.608505374328135e-05, + "loss": 1.0309, + "step": 21730 + }, + { + "epoch": 0.3131797686445683, + "grad_norm": 0.5597870945930481, + "learning_rate": 1.6081350627551327e-05, + "loss": 1.0358, + "step": 21740 + }, + { + "epoch": 0.3133238255758676, + "grad_norm": 0.50466388463974, + "learning_rate": 1.6077646187995167e-05, + "loss": 1.0299, + "step": 21750 + }, + { + "epoch": 0.31346788250716684, + "grad_norm": 0.7967897653579712, + "learning_rate": 1.6073940425419274e-05, + "loss": 1.0408, + "step": 21760 + }, + { + "epoch": 0.3136119394384661, + "grad_norm": 0.4919719696044922, + "learning_rate": 1.6070233340630337e-05, + "loss": 1.0274, + "step": 21770 + }, + { + "epoch": 0.3137559963697653, + "grad_norm": 0.5507171750068665, + "learning_rate": 1.606652493443534e-05, + "loss": 1.0207, + "step": 21780 + }, + { + "epoch": 0.3139000533010646, + "grad_norm": 0.6266515254974365, + "learning_rate": 1.606281520764156e-05, + "loss": 1.0497, + "step": 21790 + }, + { + "epoch": 0.31404411023236384, + "grad_norm": 0.558059811592102, + "learning_rate": 1.6059104161056536e-05, + "loss": 1.0151, + "step": 21800 + }, + { + "epoch": 0.3141881671636631, + "grad_norm": 0.5087937712669373, + "learning_rate": 1.605539179548813e-05, + "loss": 1.0369, + "step": 21810 + }, + { + "epoch": 0.3143322240949623, + "grad_norm": 0.49334022402763367, + "learning_rate": 1.605167811174446e-05, + "loss": 1.0222, + "step": 21820 + }, + { + "epoch": 0.3144762810262616, + "grad_norm": 0.599881649017334, + "learning_rate": 1.604796311063395e-05, + "loss": 1.0373, + "step": 21830 + }, + { + "epoch": 0.31462033795756084, + "grad_norm": 0.6097472906112671, + "learning_rate": 1.60442467929653e-05, + "loss": 1.0463, + "step": 21840 + }, + { + "epoch": 0.3147643948888601, + "grad_norm": 0.5266076922416687, + "learning_rate": 1.6040529159547505e-05, + "loss": 1.0116, + "step": 21850 + }, + { + "epoch": 0.3149084518201593, + "grad_norm": 0.624330461025238, + "learning_rate": 1.6036810211189843e-05, + "loss": 1.0508, + "step": 21860 + }, + { + "epoch": 0.31505250875145857, + "grad_norm": 0.561284065246582, + "learning_rate": 1.6033089948701872e-05, + "loss": 1.0226, + "step": 21870 + }, + { + "epoch": 0.31519656568275783, + "grad_norm": 0.5418603420257568, + "learning_rate": 1.6029368372893446e-05, + "loss": 1.0101, + "step": 21880 + }, + { + "epoch": 0.3153406226140571, + "grad_norm": 0.48067712783813477, + "learning_rate": 1.6025645484574702e-05, + "loss": 1.0448, + "step": 21890 + }, + { + "epoch": 0.3154846795453563, + "grad_norm": 0.5670645833015442, + "learning_rate": 1.602192128455606e-05, + "loss": 1.0227, + "step": 21900 + }, + { + "epoch": 0.31562873647665557, + "grad_norm": 0.6337267756462097, + "learning_rate": 1.601819577364823e-05, + "loss": 1.0366, + "step": 21910 + }, + { + "epoch": 0.31577279340795483, + "grad_norm": 0.5901769995689392, + "learning_rate": 1.6014468952662197e-05, + "loss": 1.0099, + "step": 21920 + }, + { + "epoch": 0.3159168503392541, + "grad_norm": 0.5751497149467468, + "learning_rate": 1.601074082240924e-05, + "loss": 1.0581, + "step": 21930 + }, + { + "epoch": 0.3160609072705533, + "grad_norm": 0.6279475092887878, + "learning_rate": 1.600701138370093e-05, + "loss": 1.0286, + "step": 21940 + }, + { + "epoch": 0.31620496420185257, + "grad_norm": 0.5632752776145935, + "learning_rate": 1.6003280637349106e-05, + "loss": 1.0393, + "step": 21950 + }, + { + "epoch": 0.31634902113315183, + "grad_norm": 0.6024361848831177, + "learning_rate": 1.5999548584165904e-05, + "loss": 1.0155, + "step": 21960 + }, + { + "epoch": 0.3164930780644511, + "grad_norm": 0.5428387522697449, + "learning_rate": 1.599581522496374e-05, + "loss": 1.0058, + "step": 21970 + }, + { + "epoch": 0.3166371349957503, + "grad_norm": 0.5390846729278564, + "learning_rate": 1.599208056055531e-05, + "loss": 1.0435, + "step": 21980 + }, + { + "epoch": 0.31678119192704957, + "grad_norm": 0.5568037033081055, + "learning_rate": 1.5988344591753603e-05, + "loss": 1.0291, + "step": 21990 + }, + { + "epoch": 0.31692524885834883, + "grad_norm": 0.5895788073539734, + "learning_rate": 1.5984607319371887e-05, + "loss": 1.0252, + "step": 22000 + }, + { + "epoch": 0.3170693057896481, + "grad_norm": 0.5308651328086853, + "learning_rate": 1.5980868744223717e-05, + "loss": 1.0297, + "step": 22010 + }, + { + "epoch": 0.3172133627209473, + "grad_norm": 0.5814491510391235, + "learning_rate": 1.5977128867122932e-05, + "loss": 1.0617, + "step": 22020 + }, + { + "epoch": 0.31735741965224656, + "grad_norm": 0.5710555911064148, + "learning_rate": 1.597338768888364e-05, + "loss": 1.0447, + "step": 22030 + }, + { + "epoch": 0.3175014765835458, + "grad_norm": 0.5677223801612854, + "learning_rate": 1.5969645210320256e-05, + "loss": 1.0438, + "step": 22040 + }, + { + "epoch": 0.3176455335148451, + "grad_norm": 0.6458613276481628, + "learning_rate": 1.596590143224746e-05, + "loss": 1.0231, + "step": 22050 + }, + { + "epoch": 0.3177895904461443, + "grad_norm": 0.5588922500610352, + "learning_rate": 1.596215635548022e-05, + "loss": 1.0349, + "step": 22060 + }, + { + "epoch": 0.31793364737744356, + "grad_norm": 0.6394858360290527, + "learning_rate": 1.5958409980833786e-05, + "loss": 1.0245, + "step": 22070 + }, + { + "epoch": 0.3180777043087428, + "grad_norm": 0.6036264300346375, + "learning_rate": 1.5954662309123703e-05, + "loss": 1.0366, + "step": 22080 + }, + { + "epoch": 0.3182217612400421, + "grad_norm": 0.48761194944381714, + "learning_rate": 1.5950913341165778e-05, + "loss": 1.0286, + "step": 22090 + }, + { + "epoch": 0.3183658181713413, + "grad_norm": 0.5573323965072632, + "learning_rate": 1.5947163077776108e-05, + "loss": 1.0256, + "step": 22100 + }, + { + "epoch": 0.31850987510264056, + "grad_norm": 0.6502055525779724, + "learning_rate": 1.594341151977108e-05, + "loss": 1.0313, + "step": 22110 + }, + { + "epoch": 0.3186539320339398, + "grad_norm": 0.5464458465576172, + "learning_rate": 1.5939658667967354e-05, + "loss": 1.0487, + "step": 22120 + }, + { + "epoch": 0.3187979889652391, + "grad_norm": 0.5588178634643555, + "learning_rate": 1.5935904523181872e-05, + "loss": 1.0364, + "step": 22130 + }, + { + "epoch": 0.3189420458965383, + "grad_norm": 0.5737643241882324, + "learning_rate": 1.5932149086231857e-05, + "loss": 1.03, + "step": 22140 + }, + { + "epoch": 0.31908610282783756, + "grad_norm": 0.544349193572998, + "learning_rate": 1.592839235793483e-05, + "loss": 1.0531, + "step": 22150 + }, + { + "epoch": 0.3192301597591368, + "grad_norm": 0.42877405881881714, + "learning_rate": 1.5924634339108563e-05, + "loss": 1.0279, + "step": 22160 + }, + { + "epoch": 0.3193742166904361, + "grad_norm": 0.5407014489173889, + "learning_rate": 1.5920875030571127e-05, + "loss": 1.0114, + "step": 22170 + }, + { + "epoch": 0.3195182736217353, + "grad_norm": 0.5757893919944763, + "learning_rate": 1.591711443314088e-05, + "loss": 1.0254, + "step": 22180 + }, + { + "epoch": 0.31966233055303456, + "grad_norm": 0.5823472142219543, + "learning_rate": 1.5913352547636448e-05, + "loss": 1.0425, + "step": 22190 + }, + { + "epoch": 0.3198063874843338, + "grad_norm": 0.6235815286636353, + "learning_rate": 1.5909589374876743e-05, + "loss": 1.043, + "step": 22200 + }, + { + "epoch": 0.3199504444156331, + "grad_norm": 0.6385778784751892, + "learning_rate": 1.590582491568095e-05, + "loss": 1.0288, + "step": 22210 + }, + { + "epoch": 0.3200945013469323, + "grad_norm": 0.5611007809638977, + "learning_rate": 1.5902059170868544e-05, + "loss": 1.0391, + "step": 22220 + }, + { + "epoch": 0.32023855827823156, + "grad_norm": 0.5358127951622009, + "learning_rate": 1.5898292141259276e-05, + "loss": 1.0232, + "step": 22230 + }, + { + "epoch": 0.3203826152095308, + "grad_norm": 0.5716277360916138, + "learning_rate": 1.5894523827673177e-05, + "loss": 1.0329, + "step": 22240 + }, + { + "epoch": 0.3205266721408301, + "grad_norm": 0.5133422613143921, + "learning_rate": 1.5890754230930554e-05, + "loss": 1.0256, + "step": 22250 + }, + { + "epoch": 0.3206707290721293, + "grad_norm": 0.571601390838623, + "learning_rate": 1.5886983351851997e-05, + "loss": 1.0386, + "step": 22260 + }, + { + "epoch": 0.32081478600342855, + "grad_norm": 0.6445148587226868, + "learning_rate": 1.588321119125838e-05, + "loss": 1.0282, + "step": 22270 + }, + { + "epoch": 0.3209588429347278, + "grad_norm": 0.6041953563690186, + "learning_rate": 1.587943774997084e-05, + "loss": 1.0441, + "step": 22280 + }, + { + "epoch": 0.3211028998660271, + "grad_norm": 0.5699586272239685, + "learning_rate": 1.5875663028810813e-05, + "loss": 1.0164, + "step": 22290 + }, + { + "epoch": 0.3212469567973263, + "grad_norm": 0.5821050405502319, + "learning_rate": 1.58718870286e-05, + "loss": 1.043, + "step": 22300 + }, + { + "epoch": 0.32139101372862555, + "grad_norm": 0.6183272004127502, + "learning_rate": 1.586810975016038e-05, + "loss": 1.0416, + "step": 22310 + }, + { + "epoch": 0.3215350706599248, + "grad_norm": 0.5741239786148071, + "learning_rate": 1.5864331194314218e-05, + "loss": 1.0158, + "step": 22320 + }, + { + "epoch": 0.3216791275912241, + "grad_norm": 0.5490394830703735, + "learning_rate": 1.5860551361884057e-05, + "loss": 1.0356, + "step": 22330 + }, + { + "epoch": 0.3218231845225233, + "grad_norm": 0.5496690273284912, + "learning_rate": 1.585677025369271e-05, + "loss": 1.0209, + "step": 22340 + }, + { + "epoch": 0.32196724145382255, + "grad_norm": 0.6001868844032288, + "learning_rate": 1.585298787056327e-05, + "loss": 1.0258, + "step": 22350 + }, + { + "epoch": 0.3221112983851218, + "grad_norm": 0.46774235367774963, + "learning_rate": 1.5849204213319113e-05, + "loss": 1.0504, + "step": 22360 + }, + { + "epoch": 0.3222553553164211, + "grad_norm": 0.5711603164672852, + "learning_rate": 1.5845419282783892e-05, + "loss": 1.0252, + "step": 22370 + }, + { + "epoch": 0.3223994122477203, + "grad_norm": 0.5627217292785645, + "learning_rate": 1.5841633079781526e-05, + "loss": 1.0316, + "step": 22380 + }, + { + "epoch": 0.32254346917901955, + "grad_norm": 0.6105978488922119, + "learning_rate": 1.5837845605136227e-05, + "loss": 1.0253, + "step": 22390 + }, + { + "epoch": 0.3226875261103188, + "grad_norm": 0.7439790964126587, + "learning_rate": 1.5834056859672472e-05, + "loss": 1.0493, + "step": 22400 + }, + { + "epoch": 0.322831583041618, + "grad_norm": 0.5626582503318787, + "learning_rate": 1.583026684421502e-05, + "loss": 1.0007, + "step": 22410 + }, + { + "epoch": 0.3229756399729173, + "grad_norm": 0.5579158067703247, + "learning_rate": 1.5826475559588908e-05, + "loss": 1.0087, + "step": 22420 + }, + { + "epoch": 0.32311969690421655, + "grad_norm": 0.6444723010063171, + "learning_rate": 1.582268300661944e-05, + "loss": 1.0474, + "step": 22430 + }, + { + "epoch": 0.3232637538355158, + "grad_norm": 0.6335017085075378, + "learning_rate": 1.5818889186132202e-05, + "loss": 1.0355, + "step": 22440 + }, + { + "epoch": 0.323407810766815, + "grad_norm": 0.6375849843025208, + "learning_rate": 1.5815094098953063e-05, + "loss": 1.0245, + "step": 22450 + }, + { + "epoch": 0.3235518676981143, + "grad_norm": 0.59910649061203, + "learning_rate": 1.5811297745908158e-05, + "loss": 1.0304, + "step": 22460 + }, + { + "epoch": 0.32369592462941355, + "grad_norm": 0.5544905066490173, + "learning_rate": 1.58075001278239e-05, + "loss": 1.0252, + "step": 22470 + }, + { + "epoch": 0.3238399815607128, + "grad_norm": 0.5109099745750427, + "learning_rate": 1.580370124552698e-05, + "loss": 1.0265, + "step": 22480 + }, + { + "epoch": 0.323984038492012, + "grad_norm": 0.5076267123222351, + "learning_rate": 1.5799901099844358e-05, + "loss": 1.0116, + "step": 22490 + }, + { + "epoch": 0.3241280954233113, + "grad_norm": 0.5694175362586975, + "learning_rate": 1.579609969160328e-05, + "loss": 1.0207, + "step": 22500 + }, + { + "epoch": 0.32427215235461054, + "grad_norm": 0.5941329002380371, + "learning_rate": 1.5792297021631253e-05, + "loss": 1.0262, + "step": 22510 + }, + { + "epoch": 0.3244162092859098, + "grad_norm": 0.5924682021141052, + "learning_rate": 1.5788493090756074e-05, + "loss": 1.0336, + "step": 22520 + }, + { + "epoch": 0.324560266217209, + "grad_norm": 0.5476395487785339, + "learning_rate": 1.5784687899805796e-05, + "loss": 1.0153, + "step": 22530 + }, + { + "epoch": 0.3247043231485083, + "grad_norm": 0.6003282070159912, + "learning_rate": 1.5780881449608766e-05, + "loss": 1.0312, + "step": 22540 + }, + { + "epoch": 0.32484838007980754, + "grad_norm": 0.546464741230011, + "learning_rate": 1.577707374099359e-05, + "loss": 1.0356, + "step": 22550 + }, + { + "epoch": 0.3249924370111068, + "grad_norm": 0.5743100643157959, + "learning_rate": 1.5773264774789153e-05, + "loss": 1.0091, + "step": 22560 + }, + { + "epoch": 0.325136493942406, + "grad_norm": 0.5611959099769592, + "learning_rate": 1.5769454551824617e-05, + "loss": 1.0337, + "step": 22570 + }, + { + "epoch": 0.3252805508737053, + "grad_norm": 0.5726581811904907, + "learning_rate": 1.5765643072929416e-05, + "loss": 1.0267, + "step": 22580 + }, + { + "epoch": 0.32542460780500454, + "grad_norm": 0.6655681729316711, + "learning_rate": 1.576183033893326e-05, + "loss": 1.0579, + "step": 22590 + }, + { + "epoch": 0.3255686647363038, + "grad_norm": 0.5404876470565796, + "learning_rate": 1.575801635066612e-05, + "loss": 1.0413, + "step": 22600 + }, + { + "epoch": 0.325712721667603, + "grad_norm": 0.6277989745140076, + "learning_rate": 1.5754201108958248e-05, + "loss": 1.0108, + "step": 22610 + }, + { + "epoch": 0.3258567785989023, + "grad_norm": 0.5897507071495056, + "learning_rate": 1.575038461464018e-05, + "loss": 1.0362, + "step": 22620 + }, + { + "epoch": 0.32600083553020154, + "grad_norm": 0.5520694851875305, + "learning_rate": 1.5746566868542705e-05, + "loss": 1.0468, + "step": 22630 + }, + { + "epoch": 0.3261448924615008, + "grad_norm": 0.5689501762390137, + "learning_rate": 1.5742747871496898e-05, + "loss": 1.0434, + "step": 22640 + }, + { + "epoch": 0.3262889493928, + "grad_norm": 0.641187846660614, + "learning_rate": 1.57389276243341e-05, + "loss": 1.0428, + "step": 22650 + }, + { + "epoch": 0.3264330063240993, + "grad_norm": 0.5682580471038818, + "learning_rate": 1.5735106127885926e-05, + "loss": 1.0129, + "step": 22660 + }, + { + "epoch": 0.32657706325539854, + "grad_norm": 0.6321516036987305, + "learning_rate": 1.5731283382984266e-05, + "loss": 1.0351, + "step": 22670 + }, + { + "epoch": 0.3267211201866978, + "grad_norm": 0.6668967604637146, + "learning_rate": 1.5727459390461276e-05, + "loss": 1.0218, + "step": 22680 + }, + { + "epoch": 0.326865177117997, + "grad_norm": 0.5244312882423401, + "learning_rate": 1.5723634151149387e-05, + "loss": 1.0442, + "step": 22690 + }, + { + "epoch": 0.32700923404929627, + "grad_norm": 0.5505217909812927, + "learning_rate": 1.5719807665881298e-05, + "loss": 1.0164, + "step": 22700 + }, + { + "epoch": 0.32715329098059553, + "grad_norm": 0.5882081389427185, + "learning_rate": 1.5715979935489986e-05, + "loss": 1.0201, + "step": 22710 + }, + { + "epoch": 0.3272973479118948, + "grad_norm": 0.5344705581665039, + "learning_rate": 1.5712150960808698e-05, + "loss": 1.0266, + "step": 22720 + }, + { + "epoch": 0.327441404843194, + "grad_norm": 0.6028308272361755, + "learning_rate": 1.570832074267094e-05, + "loss": 1.0538, + "step": 22730 + }, + { + "epoch": 0.32758546177449327, + "grad_norm": 0.5094046592712402, + "learning_rate": 1.5704489281910507e-05, + "loss": 1.031, + "step": 22740 + }, + { + "epoch": 0.32772951870579253, + "grad_norm": 0.5897226333618164, + "learning_rate": 1.5700656579361445e-05, + "loss": 1.0227, + "step": 22750 + }, + { + "epoch": 0.3278735756370918, + "grad_norm": 0.5317791104316711, + "learning_rate": 1.569682263585809e-05, + "loss": 1.038, + "step": 22760 + }, + { + "epoch": 0.328017632568391, + "grad_norm": 0.5674440860748291, + "learning_rate": 1.5692987452235032e-05, + "loss": 1.0218, + "step": 22770 + }, + { + "epoch": 0.32816168949969027, + "grad_norm": 0.5383510589599609, + "learning_rate": 1.5689151029327145e-05, + "loss": 1.0221, + "step": 22780 + }, + { + "epoch": 0.32830574643098953, + "grad_norm": 0.5321672558784485, + "learning_rate": 1.568531336796956e-05, + "loss": 1.0299, + "step": 22790 + }, + { + "epoch": 0.3284498033622888, + "grad_norm": 0.5883285999298096, + "learning_rate": 1.5681474468997682e-05, + "loss": 1.0352, + "step": 22800 + }, + { + "epoch": 0.328593860293588, + "grad_norm": 0.5454659461975098, + "learning_rate": 1.5677634333247187e-05, + "loss": 1.0254, + "step": 22810 + }, + { + "epoch": 0.32873791722488727, + "grad_norm": 0.5320073962211609, + "learning_rate": 1.5673792961554025e-05, + "loss": 1.0432, + "step": 22820 + }, + { + "epoch": 0.32888197415618653, + "grad_norm": 0.5060919523239136, + "learning_rate": 1.56699503547544e-05, + "loss": 0.9997, + "step": 22830 + }, + { + "epoch": 0.3290260310874858, + "grad_norm": 0.5718419551849365, + "learning_rate": 1.5666106513684803e-05, + "loss": 1.0314, + "step": 22840 + }, + { + "epoch": 0.329170088018785, + "grad_norm": 0.6048396825790405, + "learning_rate": 1.5662261439181983e-05, + "loss": 1.0257, + "step": 22850 + }, + { + "epoch": 0.32931414495008426, + "grad_norm": 0.5688875913619995, + "learning_rate": 1.5658415132082957e-05, + "loss": 1.0253, + "step": 22860 + }, + { + "epoch": 0.32945820188138353, + "grad_norm": 0.6301843523979187, + "learning_rate": 1.5654567593225014e-05, + "loss": 1.036, + "step": 22870 + }, + { + "epoch": 0.3296022588126828, + "grad_norm": 0.5989998579025269, + "learning_rate": 1.565071882344571e-05, + "loss": 1.0325, + "step": 22880 + }, + { + "epoch": 0.329746315743982, + "grad_norm": 0.5171307921409607, + "learning_rate": 1.5646868823582874e-05, + "loss": 1.0229, + "step": 22890 + }, + { + "epoch": 0.32989037267528126, + "grad_norm": 0.6181734800338745, + "learning_rate": 1.564301759447459e-05, + "loss": 1.0229, + "step": 22900 + }, + { + "epoch": 0.3300344296065805, + "grad_norm": 0.5516754984855652, + "learning_rate": 1.5639165136959222e-05, + "loss": 1.0232, + "step": 22910 + }, + { + "epoch": 0.3301784865378798, + "grad_norm": 0.5356869101524353, + "learning_rate": 1.5635311451875393e-05, + "loss": 1.0472, + "step": 22920 + }, + { + "epoch": 0.330322543469179, + "grad_norm": 0.6694609522819519, + "learning_rate": 1.5631456540062004e-05, + "loss": 1.0524, + "step": 22930 + }, + { + "epoch": 0.33046660040047826, + "grad_norm": 0.592635989189148, + "learning_rate": 1.5627600402358213e-05, + "loss": 1.0216, + "step": 22940 + }, + { + "epoch": 0.3306106573317775, + "grad_norm": 0.5867111682891846, + "learning_rate": 1.5623743039603444e-05, + "loss": 1.0266, + "step": 22950 + }, + { + "epoch": 0.3307547142630768, + "grad_norm": 0.5420083403587341, + "learning_rate": 1.5619884452637397e-05, + "loss": 1.0142, + "step": 22960 + }, + { + "epoch": 0.330898771194376, + "grad_norm": 0.5669357180595398, + "learning_rate": 1.5616024642300032e-05, + "loss": 1.0425, + "step": 22970 + }, + { + "epoch": 0.33104282812567526, + "grad_norm": 0.5408803224563599, + "learning_rate": 1.561216360943157e-05, + "loss": 1.0381, + "step": 22980 + }, + { + "epoch": 0.3311868850569745, + "grad_norm": 0.5677891969680786, + "learning_rate": 1.5608301354872517e-05, + "loss": 1.0387, + "step": 22990 + }, + { + "epoch": 0.3313309419882738, + "grad_norm": 0.5275797247886658, + "learning_rate": 1.5604437879463623e-05, + "loss": 1.0127, + "step": 23000 + }, + { + "epoch": 0.331474998919573, + "grad_norm": 0.6289392709732056, + "learning_rate": 1.5600573184045918e-05, + "loss": 1.03, + "step": 23010 + }, + { + "epoch": 0.33161905585087226, + "grad_norm": 0.6317474246025085, + "learning_rate": 1.5596707269460684e-05, + "loss": 1.024, + "step": 23020 + }, + { + "epoch": 0.3317631127821715, + "grad_norm": 0.5193930268287659, + "learning_rate": 1.559284013654949e-05, + "loss": 1.0341, + "step": 23030 + }, + { + "epoch": 0.3319071697134708, + "grad_norm": 0.5708998441696167, + "learning_rate": 1.558897178615415e-05, + "loss": 1.0387, + "step": 23040 + }, + { + "epoch": 0.33205122664477, + "grad_norm": 0.6090611219406128, + "learning_rate": 1.558510221911675e-05, + "loss": 1.0426, + "step": 23050 + }, + { + "epoch": 0.33219528357606926, + "grad_norm": 0.5591376423835754, + "learning_rate": 1.5581231436279648e-05, + "loss": 1.0045, + "step": 23060 + }, + { + "epoch": 0.3323393405073685, + "grad_norm": 0.5508315563201904, + "learning_rate": 1.5577359438485452e-05, + "loss": 1.0523, + "step": 23070 + }, + { + "epoch": 0.3324833974386678, + "grad_norm": 0.5775851607322693, + "learning_rate": 1.557348622657705e-05, + "loss": 1.0117, + "step": 23080 + }, + { + "epoch": 0.332627454369967, + "grad_norm": 0.5561432242393494, + "learning_rate": 1.5569611801397578e-05, + "loss": 1.033, + "step": 23090 + }, + { + "epoch": 0.33277151130126625, + "grad_norm": 0.5185987949371338, + "learning_rate": 1.556573616379045e-05, + "loss": 1.0314, + "step": 23100 + }, + { + "epoch": 0.3329155682325655, + "grad_norm": 0.7127716541290283, + "learning_rate": 1.5561859314599338e-05, + "loss": 1.0239, + "step": 23110 + }, + { + "epoch": 0.3330596251638648, + "grad_norm": 0.6326349973678589, + "learning_rate": 1.555798125466818e-05, + "loss": 1.0128, + "step": 23120 + }, + { + "epoch": 0.333203682095164, + "grad_norm": 0.5786259174346924, + "learning_rate": 1.5554101984841176e-05, + "loss": 1.0421, + "step": 23130 + }, + { + "epoch": 0.33334773902646325, + "grad_norm": 0.5460976958274841, + "learning_rate": 1.5550221505962788e-05, + "loss": 1.0201, + "step": 23140 + }, + { + "epoch": 0.3334917959577625, + "grad_norm": 0.6431747078895569, + "learning_rate": 1.5546339818877745e-05, + "loss": 1.0124, + "step": 23150 + }, + { + "epoch": 0.3336358528890618, + "grad_norm": 0.592847466468811, + "learning_rate": 1.554245692443104e-05, + "loss": 1.0144, + "step": 23160 + }, + { + "epoch": 0.333779909820361, + "grad_norm": 0.5598251819610596, + "learning_rate": 1.5538572823467912e-05, + "loss": 1.0221, + "step": 23170 + }, + { + "epoch": 0.33392396675166025, + "grad_norm": 0.5220053195953369, + "learning_rate": 1.553468751683389e-05, + "loss": 1.0275, + "step": 23180 + }, + { + "epoch": 0.3340680236829595, + "grad_norm": 0.6287788152694702, + "learning_rate": 1.5530801005374742e-05, + "loss": 1.0321, + "step": 23190 + }, + { + "epoch": 0.3342120806142588, + "grad_norm": 0.5721694827079773, + "learning_rate": 1.5526913289936516e-05, + "loss": 1.032, + "step": 23200 + }, + { + "epoch": 0.334356137545558, + "grad_norm": 0.5819039344787598, + "learning_rate": 1.5523024371365508e-05, + "loss": 1.0313, + "step": 23210 + }, + { + "epoch": 0.33450019447685725, + "grad_norm": 0.6232460737228394, + "learning_rate": 1.551913425050829e-05, + "loss": 1.0248, + "step": 23220 + }, + { + "epoch": 0.3346442514081565, + "grad_norm": 0.6906763911247253, + "learning_rate": 1.551524292821168e-05, + "loss": 1.0398, + "step": 23230 + }, + { + "epoch": 0.3347883083394558, + "grad_norm": 0.6997624039649963, + "learning_rate": 1.5511350405322763e-05, + "loss": 1.0349, + "step": 23240 + }, + { + "epoch": 0.334932365270755, + "grad_norm": 0.5740306377410889, + "learning_rate": 1.5507456682688897e-05, + "loss": 1.0452, + "step": 23250 + }, + { + "epoch": 0.33507642220205425, + "grad_norm": 0.5284046530723572, + "learning_rate": 1.5503561761157683e-05, + "loss": 1.0334, + "step": 23260 + }, + { + "epoch": 0.3352204791333535, + "grad_norm": 0.5597389340400696, + "learning_rate": 1.5499665641577e-05, + "loss": 1.0208, + "step": 23270 + }, + { + "epoch": 0.3353645360646528, + "grad_norm": 0.5695542097091675, + "learning_rate": 1.5495768324794967e-05, + "loss": 1.0143, + "step": 23280 + }, + { + "epoch": 0.335508592995952, + "grad_norm": 0.6439692378044128, + "learning_rate": 1.549186981165999e-05, + "loss": 0.9941, + "step": 23290 + }, + { + "epoch": 0.33565264992725125, + "grad_norm": 0.6697540283203125, + "learning_rate": 1.5487970103020714e-05, + "loss": 1.0187, + "step": 23300 + }, + { + "epoch": 0.3357967068585505, + "grad_norm": 0.4710736870765686, + "learning_rate": 1.5484069199726054e-05, + "loss": 1.0441, + "step": 23310 + }, + { + "epoch": 0.3359407637898498, + "grad_norm": 0.5582054853439331, + "learning_rate": 1.5480167102625178e-05, + "loss": 1.0332, + "step": 23320 + }, + { + "epoch": 0.336084820721149, + "grad_norm": 1.0697526931762695, + "learning_rate": 1.5476263812567527e-05, + "loss": 1.0216, + "step": 23330 + }, + { + "epoch": 0.33622887765244824, + "grad_norm": 0.672410249710083, + "learning_rate": 1.5472359330402788e-05, + "loss": 1.041, + "step": 23340 + }, + { + "epoch": 0.3363729345837475, + "grad_norm": 0.7280809879302979, + "learning_rate": 1.5468453656980913e-05, + "loss": 1.0598, + "step": 23350 + }, + { + "epoch": 0.33651699151504677, + "grad_norm": 0.5943734049797058, + "learning_rate": 1.5464546793152118e-05, + "loss": 1.0243, + "step": 23360 + }, + { + "epoch": 0.336661048446346, + "grad_norm": 0.6002911329269409, + "learning_rate": 1.5460638739766863e-05, + "loss": 1.0226, + "step": 23370 + }, + { + "epoch": 0.33680510537764524, + "grad_norm": 0.67616206407547, + "learning_rate": 1.545672949767589e-05, + "loss": 1.0399, + "step": 23380 + }, + { + "epoch": 0.3369491623089445, + "grad_norm": 0.5300679206848145, + "learning_rate": 1.545281906773018e-05, + "loss": 1.0477, + "step": 23390 + }, + { + "epoch": 0.33709321924024377, + "grad_norm": 0.5449818968772888, + "learning_rate": 1.5448907450780976e-05, + "loss": 1.0286, + "step": 23400 + }, + { + "epoch": 0.337237276171543, + "grad_norm": 0.6804856061935425, + "learning_rate": 1.544499464767979e-05, + "loss": 1.0163, + "step": 23410 + }, + { + "epoch": 0.33738133310284224, + "grad_norm": 0.6101489067077637, + "learning_rate": 1.5441080659278383e-05, + "loss": 1.0497, + "step": 23420 + }, + { + "epoch": 0.3375253900341415, + "grad_norm": 0.6030386090278625, + "learning_rate": 1.5437165486428778e-05, + "loss": 1.0462, + "step": 23430 + }, + { + "epoch": 0.33766944696544077, + "grad_norm": 0.5325640439987183, + "learning_rate": 1.5433249129983254e-05, + "loss": 1.0021, + "step": 23440 + }, + { + "epoch": 0.33781350389674, + "grad_norm": 0.6616448760032654, + "learning_rate": 1.5429331590794344e-05, + "loss": 1.0719, + "step": 23450 + }, + { + "epoch": 0.33795756082803924, + "grad_norm": 0.6268752217292786, + "learning_rate": 1.5425412869714842e-05, + "loss": 1.0308, + "step": 23460 + }, + { + "epoch": 0.3381016177593385, + "grad_norm": 0.5952742099761963, + "learning_rate": 1.5421492967597807e-05, + "loss": 1.0377, + "step": 23470 + }, + { + "epoch": 0.33824567469063777, + "grad_norm": 0.814234733581543, + "learning_rate": 1.541757188529654e-05, + "loss": 1.0336, + "step": 23480 + }, + { + "epoch": 0.338389731621937, + "grad_norm": 0.6123931407928467, + "learning_rate": 1.5413649623664608e-05, + "loss": 0.9977, + "step": 23490 + }, + { + "epoch": 0.33853378855323624, + "grad_norm": 0.5388018488883972, + "learning_rate": 1.540972618355583e-05, + "loss": 1.0105, + "step": 23500 + }, + { + "epoch": 0.3386778454845355, + "grad_norm": 0.5089865922927856, + "learning_rate": 1.5405801565824297e-05, + "loss": 1.0377, + "step": 23510 + }, + { + "epoch": 0.33882190241583476, + "grad_norm": 0.6453099846839905, + "learning_rate": 1.540187577132433e-05, + "loss": 1.0003, + "step": 23520 + }, + { + "epoch": 0.33896595934713397, + "grad_norm": 0.7200059294700623, + "learning_rate": 1.539794880091053e-05, + "loss": 1.0176, + "step": 23530 + }, + { + "epoch": 0.33911001627843324, + "grad_norm": 0.5649188756942749, + "learning_rate": 1.539402065543774e-05, + "loss": 1.0176, + "step": 23540 + }, + { + "epoch": 0.3392540732097325, + "grad_norm": 0.5101891756057739, + "learning_rate": 1.5390091335761064e-05, + "loss": 1.0293, + "step": 23550 + }, + { + "epoch": 0.33939813014103176, + "grad_norm": 0.5435965061187744, + "learning_rate": 1.538616084273586e-05, + "loss": 1.0549, + "step": 23560 + }, + { + "epoch": 0.33954218707233097, + "grad_norm": 0.5960400700569153, + "learning_rate": 1.5382229177217744e-05, + "loss": 1.0303, + "step": 23570 + }, + { + "epoch": 0.33968624400363023, + "grad_norm": 0.603877067565918, + "learning_rate": 1.5378296340062584e-05, + "loss": 1.0318, + "step": 23580 + }, + { + "epoch": 0.3398303009349295, + "grad_norm": 0.6355979442596436, + "learning_rate": 1.5374362332126503e-05, + "loss": 1.0153, + "step": 23590 + }, + { + "epoch": 0.33997435786622876, + "grad_norm": 0.5628081560134888, + "learning_rate": 1.537042715426588e-05, + "loss": 1.0471, + "step": 23600 + }, + { + "epoch": 0.34011841479752797, + "grad_norm": 0.6306579113006592, + "learning_rate": 1.5366490807337353e-05, + "loss": 1.0363, + "step": 23610 + }, + { + "epoch": 0.34026247172882723, + "grad_norm": 0.5563225746154785, + "learning_rate": 1.5362553292197807e-05, + "loss": 1.0396, + "step": 23620 + }, + { + "epoch": 0.3404065286601265, + "grad_norm": 0.5619765520095825, + "learning_rate": 1.535861460970439e-05, + "loss": 1.0183, + "step": 23630 + }, + { + "epoch": 0.34055058559142576, + "grad_norm": 0.5668855309486389, + "learning_rate": 1.535467476071449e-05, + "loss": 1.0252, + "step": 23640 + }, + { + "epoch": 0.34069464252272497, + "grad_norm": 0.5778625011444092, + "learning_rate": 1.5350733746085762e-05, + "loss": 1.0362, + "step": 23650 + }, + { + "epoch": 0.34083869945402423, + "grad_norm": 0.6049732565879822, + "learning_rate": 1.5346791566676113e-05, + "loss": 1.0441, + "step": 23660 + }, + { + "epoch": 0.3409827563853235, + "grad_norm": 0.5564858317375183, + "learning_rate": 1.53428482233437e-05, + "loss": 1.0485, + "step": 23670 + }, + { + "epoch": 0.34112681331662276, + "grad_norm": 0.5889008641242981, + "learning_rate": 1.533890371694693e-05, + "loss": 1.0462, + "step": 23680 + }, + { + "epoch": 0.34127087024792196, + "grad_norm": 0.5665014386177063, + "learning_rate": 1.5334958048344473e-05, + "loss": 1.0071, + "step": 23690 + }, + { + "epoch": 0.34141492717922123, + "grad_norm": 0.5539933443069458, + "learning_rate": 1.5331011218395248e-05, + "loss": 1.0423, + "step": 23700 + }, + { + "epoch": 0.3415589841105205, + "grad_norm": 0.518696129322052, + "learning_rate": 1.5327063227958413e-05, + "loss": 1.0103, + "step": 23710 + }, + { + "epoch": 0.3417030410418197, + "grad_norm": 0.5499048233032227, + "learning_rate": 1.5323114077893405e-05, + "loss": 1.0173, + "step": 23720 + }, + { + "epoch": 0.34184709797311896, + "grad_norm": 0.6649884581565857, + "learning_rate": 1.5319163769059898e-05, + "loss": 1.0192, + "step": 23730 + }, + { + "epoch": 0.3419911549044182, + "grad_norm": 0.6036750078201294, + "learning_rate": 1.5315212302317814e-05, + "loss": 1.0453, + "step": 23740 + }, + { + "epoch": 0.3421352118357175, + "grad_norm": 0.5595616102218628, + "learning_rate": 1.5311259678527335e-05, + "loss": 1.0157, + "step": 23750 + }, + { + "epoch": 0.3422792687670167, + "grad_norm": 0.47189611196517944, + "learning_rate": 1.5307305898548893e-05, + "loss": 1.0045, + "step": 23760 + }, + { + "epoch": 0.34242332569831596, + "grad_norm": 0.602363109588623, + "learning_rate": 1.530335096324317e-05, + "loss": 1.0427, + "step": 23770 + }, + { + "epoch": 0.3425673826296152, + "grad_norm": 0.5799539685249329, + "learning_rate": 1.5299394873471104e-05, + "loss": 1.0324, + "step": 23780 + }, + { + "epoch": 0.3427114395609145, + "grad_norm": 0.5826029181480408, + "learning_rate": 1.5295437630093878e-05, + "loss": 1.035, + "step": 23790 + }, + { + "epoch": 0.3428554964922137, + "grad_norm": 0.570226788520813, + "learning_rate": 1.529147923397293e-05, + "loss": 1.0427, + "step": 23800 + }, + { + "epoch": 0.34299955342351296, + "grad_norm": 0.5779927372932434, + "learning_rate": 1.528751968596995e-05, + "loss": 1.0351, + "step": 23810 + }, + { + "epoch": 0.3431436103548122, + "grad_norm": 0.6387103796005249, + "learning_rate": 1.5283558986946875e-05, + "loss": 1.0208, + "step": 23820 + }, + { + "epoch": 0.3432876672861115, + "grad_norm": 0.6481809020042419, + "learning_rate": 1.5279597137765894e-05, + "loss": 1.0319, + "step": 23830 + }, + { + "epoch": 0.3434317242174107, + "grad_norm": 0.5976871252059937, + "learning_rate": 1.527563413928945e-05, + "loss": 1.0415, + "step": 23840 + }, + { + "epoch": 0.34357578114870996, + "grad_norm": 0.554764986038208, + "learning_rate": 1.527166999238023e-05, + "loss": 1.0219, + "step": 23850 + }, + { + "epoch": 0.3437198380800092, + "grad_norm": 0.5971626043319702, + "learning_rate": 1.526770469790118e-05, + "loss": 1.0197, + "step": 23860 + }, + { + "epoch": 0.3438638950113085, + "grad_norm": 0.6000232696533203, + "learning_rate": 1.5263738256715478e-05, + "loss": 1.0252, + "step": 23870 + }, + { + "epoch": 0.3440079519426077, + "grad_norm": 0.5533708930015564, + "learning_rate": 1.5259770669686576e-05, + "loss": 1.0173, + "step": 23880 + }, + { + "epoch": 0.34415200887390696, + "grad_norm": 0.6693439483642578, + "learning_rate": 1.5255801937678158e-05, + "loss": 1.0185, + "step": 23890 + }, + { + "epoch": 0.3442960658052062, + "grad_norm": 0.692731499671936, + "learning_rate": 1.5251832061554162e-05, + "loss": 1.0394, + "step": 23900 + }, + { + "epoch": 0.3444401227365055, + "grad_norm": 0.5547103881835938, + "learning_rate": 1.5247861042178773e-05, + "loss": 1.0183, + "step": 23910 + }, + { + "epoch": 0.3445841796678047, + "grad_norm": 0.6336583495140076, + "learning_rate": 1.5243888880416427e-05, + "loss": 1.0462, + "step": 23920 + }, + { + "epoch": 0.34472823659910395, + "grad_norm": 0.6708395481109619, + "learning_rate": 1.5239915577131817e-05, + "loss": 1.0276, + "step": 23930 + }, + { + "epoch": 0.3448722935304032, + "grad_norm": 0.5331196188926697, + "learning_rate": 1.5235941133189864e-05, + "loss": 1.0367, + "step": 23940 + }, + { + "epoch": 0.3450163504617025, + "grad_norm": 0.5452340245246887, + "learning_rate": 1.5231965549455763e-05, + "loss": 1.011, + "step": 23950 + }, + { + "epoch": 0.3451604073930017, + "grad_norm": 0.5825306177139282, + "learning_rate": 1.5227988826794932e-05, + "loss": 1.0511, + "step": 23960 + }, + { + "epoch": 0.34530446432430095, + "grad_norm": 0.594727098941803, + "learning_rate": 1.5224010966073058e-05, + "loss": 1.0342, + "step": 23970 + }, + { + "epoch": 0.3454485212556002, + "grad_norm": 0.5588743090629578, + "learning_rate": 1.5220031968156055e-05, + "loss": 1.022, + "step": 23980 + }, + { + "epoch": 0.3455925781868995, + "grad_norm": 0.5999211072921753, + "learning_rate": 1.521605183391011e-05, + "loss": 1.0344, + "step": 23990 + }, + { + "epoch": 0.3457366351181987, + "grad_norm": 0.5761189460754395, + "learning_rate": 1.5212070564201634e-05, + "loss": 1.034, + "step": 24000 + }, + { + "epoch": 0.34588069204949795, + "grad_norm": 0.5832364559173584, + "learning_rate": 1.5208088159897296e-05, + "loss": 1.0344, + "step": 24010 + }, + { + "epoch": 0.3460247489807972, + "grad_norm": 0.5501433610916138, + "learning_rate": 1.5204104621864014e-05, + "loss": 1.034, + "step": 24020 + }, + { + "epoch": 0.3461688059120965, + "grad_norm": 0.5442602634429932, + "learning_rate": 1.520011995096894e-05, + "loss": 1.052, + "step": 24030 + }, + { + "epoch": 0.3463128628433957, + "grad_norm": 0.6079826951026917, + "learning_rate": 1.5196134148079494e-05, + "loss": 1.0486, + "step": 24040 + }, + { + "epoch": 0.34645691977469495, + "grad_norm": 0.7023916244506836, + "learning_rate": 1.519214721406332e-05, + "loss": 1.028, + "step": 24050 + }, + { + "epoch": 0.3466009767059942, + "grad_norm": 0.6172901391983032, + "learning_rate": 1.5188159149788328e-05, + "loss": 1.0546, + "step": 24060 + }, + { + "epoch": 0.3467450336372935, + "grad_norm": 0.5976448059082031, + "learning_rate": 1.5184169956122659e-05, + "loss": 1.0409, + "step": 24070 + }, + { + "epoch": 0.3468890905685927, + "grad_norm": 0.6255366206169128, + "learning_rate": 1.5180179633934704e-05, + "loss": 1.0403, + "step": 24080 + }, + { + "epoch": 0.34703314749989195, + "grad_norm": 0.6168020963668823, + "learning_rate": 1.5176188184093103e-05, + "loss": 1.045, + "step": 24090 + }, + { + "epoch": 0.3471772044311912, + "grad_norm": 0.550489604473114, + "learning_rate": 1.5172195607466742e-05, + "loss": 1.0155, + "step": 24100 + }, + { + "epoch": 0.3473212613624905, + "grad_norm": 0.606447696685791, + "learning_rate": 1.5168201904924748e-05, + "loss": 1.0255, + "step": 24110 + }, + { + "epoch": 0.3474653182937897, + "grad_norm": 0.5619000792503357, + "learning_rate": 1.5164207077336492e-05, + "loss": 1.0493, + "step": 24120 + }, + { + "epoch": 0.34760937522508895, + "grad_norm": 0.5773958563804626, + "learning_rate": 1.51602111255716e-05, + "loss": 1.0324, + "step": 24130 + }, + { + "epoch": 0.3477534321563882, + "grad_norm": 0.545673668384552, + "learning_rate": 1.5156214050499927e-05, + "loss": 1.0233, + "step": 24140 + }, + { + "epoch": 0.3478974890876875, + "grad_norm": 0.5850799083709717, + "learning_rate": 1.5152215852991585e-05, + "loss": 1.0275, + "step": 24150 + }, + { + "epoch": 0.3480415460189867, + "grad_norm": 0.5293511748313904, + "learning_rate": 1.5148216533916929e-05, + "loss": 1.02, + "step": 24160 + }, + { + "epoch": 0.34818560295028594, + "grad_norm": 0.5992613434791565, + "learning_rate": 1.5144216094146554e-05, + "loss": 1.0212, + "step": 24170 + }, + { + "epoch": 0.3483296598815852, + "grad_norm": 0.5557295680046082, + "learning_rate": 1.5140214534551296e-05, + "loss": 1.0324, + "step": 24180 + }, + { + "epoch": 0.34847371681288447, + "grad_norm": 0.652137041091919, + "learning_rate": 1.5136211856002244e-05, + "loss": 1.0239, + "step": 24190 + }, + { + "epoch": 0.3486177737441837, + "grad_norm": 0.5570051074028015, + "learning_rate": 1.5132208059370727e-05, + "loss": 1.0176, + "step": 24200 + }, + { + "epoch": 0.34876183067548294, + "grad_norm": 0.49894794821739197, + "learning_rate": 1.5128203145528316e-05, + "loss": 1.0248, + "step": 24210 + }, + { + "epoch": 0.3489058876067822, + "grad_norm": 0.731623113155365, + "learning_rate": 1.512419711534682e-05, + "loss": 1.0075, + "step": 24220 + }, + { + "epoch": 0.34904994453808147, + "grad_norm": 0.9097506403923035, + "learning_rate": 1.5120189969698302e-05, + "loss": 1.0315, + "step": 24230 + }, + { + "epoch": 0.3491940014693807, + "grad_norm": 0.6409125328063965, + "learning_rate": 1.5116181709455062e-05, + "loss": 1.0406, + "step": 24240 + }, + { + "epoch": 0.34933805840067994, + "grad_norm": 0.5566344857215881, + "learning_rate": 1.5112172335489641e-05, + "loss": 1.0293, + "step": 24250 + }, + { + "epoch": 0.3494821153319792, + "grad_norm": 0.5710030198097229, + "learning_rate": 1.5108161848674821e-05, + "loss": 1.0333, + "step": 24260 + }, + { + "epoch": 0.34962617226327847, + "grad_norm": 0.5730279684066772, + "learning_rate": 1.5104150249883637e-05, + "loss": 1.0082, + "step": 24270 + }, + { + "epoch": 0.3497702291945777, + "grad_norm": 0.6711406707763672, + "learning_rate": 1.5100137539989356e-05, + "loss": 1.0169, + "step": 24280 + }, + { + "epoch": 0.34991428612587694, + "grad_norm": 0.5815849304199219, + "learning_rate": 1.509612371986549e-05, + "loss": 0.9991, + "step": 24290 + }, + { + "epoch": 0.3500583430571762, + "grad_norm": 0.6280797123908997, + "learning_rate": 1.5092108790385789e-05, + "loss": 1.0372, + "step": 24300 + }, + { + "epoch": 0.35020239998847547, + "grad_norm": 0.6219536662101746, + "learning_rate": 1.5088092752424248e-05, + "loss": 1.0266, + "step": 24310 + }, + { + "epoch": 0.3503464569197747, + "grad_norm": 0.6277016401290894, + "learning_rate": 1.508407560685511e-05, + "loss": 1.0106, + "step": 24320 + }, + { + "epoch": 0.35049051385107394, + "grad_norm": 0.6451740264892578, + "learning_rate": 1.5080057354552842e-05, + "loss": 1.0286, + "step": 24330 + }, + { + "epoch": 0.3506345707823732, + "grad_norm": 0.5524730086326599, + "learning_rate": 1.5076037996392172e-05, + "loss": 1.021, + "step": 24340 + }, + { + "epoch": 0.35077862771367246, + "grad_norm": 0.7687239050865173, + "learning_rate": 1.5072017533248051e-05, + "loss": 1.0272, + "step": 24350 + }, + { + "epoch": 0.35092268464497167, + "grad_norm": 0.6192282438278198, + "learning_rate": 1.5067995965995686e-05, + "loss": 1.0272, + "step": 24360 + }, + { + "epoch": 0.35106674157627094, + "grad_norm": 0.658404529094696, + "learning_rate": 1.5063973295510508e-05, + "loss": 1.0251, + "step": 24370 + }, + { + "epoch": 0.3512107985075702, + "grad_norm": 0.6252663731575012, + "learning_rate": 1.5059949522668201e-05, + "loss": 1.0236, + "step": 24380 + }, + { + "epoch": 0.35135485543886946, + "grad_norm": 0.5990842580795288, + "learning_rate": 1.5055924648344688e-05, + "loss": 1.0096, + "step": 24390 + }, + { + "epoch": 0.35149891237016867, + "grad_norm": 0.5357429385185242, + "learning_rate": 1.5051898673416121e-05, + "loss": 1.0309, + "step": 24400 + }, + { + "epoch": 0.35164296930146793, + "grad_norm": 0.5470159649848938, + "learning_rate": 1.5047871598758908e-05, + "loss": 1.0331, + "step": 24410 + }, + { + "epoch": 0.3517870262327672, + "grad_norm": 0.6631460785865784, + "learning_rate": 1.504384342524968e-05, + "loss": 1.021, + "step": 24420 + }, + { + "epoch": 0.35193108316406646, + "grad_norm": 0.5246572494506836, + "learning_rate": 1.5039814153765324e-05, + "loss": 1.0232, + "step": 24430 + }, + { + "epoch": 0.35207514009536567, + "grad_norm": 0.5777121186256409, + "learning_rate": 1.5035783785182945e-05, + "loss": 1.0422, + "step": 24440 + }, + { + "epoch": 0.35221919702666493, + "grad_norm": 0.6213065981864929, + "learning_rate": 1.5031752320379907e-05, + "loss": 1.0293, + "step": 24450 + }, + { + "epoch": 0.3523632539579642, + "grad_norm": 0.5165483951568604, + "learning_rate": 1.50277197602338e-05, + "loss": 1.0293, + "step": 24460 + }, + { + "epoch": 0.35250731088926346, + "grad_norm": 0.5039709806442261, + "learning_rate": 1.5023686105622464e-05, + "loss": 1.0158, + "step": 24470 + }, + { + "epoch": 0.35265136782056267, + "grad_norm": 0.6067390441894531, + "learning_rate": 1.5019651357423959e-05, + "loss": 1.0223, + "step": 24480 + }, + { + "epoch": 0.35279542475186193, + "grad_norm": 0.481794536113739, + "learning_rate": 1.5015615516516602e-05, + "loss": 1.0269, + "step": 24490 + }, + { + "epoch": 0.3529394816831612, + "grad_norm": 0.6830320954322815, + "learning_rate": 1.5011578583778936e-05, + "loss": 1.0401, + "step": 24500 + }, + { + "epoch": 0.35308353861446046, + "grad_norm": 0.608053982257843, + "learning_rate": 1.5007540560089746e-05, + "loss": 1.0263, + "step": 24510 + }, + { + "epoch": 0.35322759554575967, + "grad_norm": 0.6035851836204529, + "learning_rate": 1.500350144632806e-05, + "loss": 1.0097, + "step": 24520 + }, + { + "epoch": 0.35337165247705893, + "grad_norm": 0.5260176658630371, + "learning_rate": 1.4999461243373128e-05, + "loss": 1.0323, + "step": 24530 + }, + { + "epoch": 0.3535157094083582, + "grad_norm": 0.6285433173179626, + "learning_rate": 1.4995419952104454e-05, + "loss": 1.0178, + "step": 24540 + }, + { + "epoch": 0.35365976633965746, + "grad_norm": 0.744941234588623, + "learning_rate": 1.4991377573401766e-05, + "loss": 1.0183, + "step": 24550 + }, + { + "epoch": 0.35380382327095666, + "grad_norm": 0.6387519240379333, + "learning_rate": 1.4987334108145038e-05, + "loss": 1.0425, + "step": 24560 + }, + { + "epoch": 0.3539478802022559, + "grad_norm": 0.49635758996009827, + "learning_rate": 1.4983289557214472e-05, + "loss": 1.0217, + "step": 24570 + }, + { + "epoch": 0.3540919371335552, + "grad_norm": 0.5358009934425354, + "learning_rate": 1.4979243921490518e-05, + "loss": 1.0219, + "step": 24580 + }, + { + "epoch": 0.35423599406485445, + "grad_norm": 0.586286187171936, + "learning_rate": 1.4975197201853849e-05, + "loss": 1.0217, + "step": 24590 + }, + { + "epoch": 0.35438005099615366, + "grad_norm": 0.4951419234275818, + "learning_rate": 1.4971149399185381e-05, + "loss": 1.0422, + "step": 24600 + }, + { + "epoch": 0.3545241079274529, + "grad_norm": 0.5345513820648193, + "learning_rate": 1.4967100514366269e-05, + "loss": 1.0154, + "step": 24610 + }, + { + "epoch": 0.3546681648587522, + "grad_norm": 0.660944402217865, + "learning_rate": 1.4963050548277893e-05, + "loss": 1.0319, + "step": 24620 + }, + { + "epoch": 0.35481222179005145, + "grad_norm": 0.5842607021331787, + "learning_rate": 1.4958999501801879e-05, + "loss": 1.0256, + "step": 24630 + }, + { + "epoch": 0.35495627872135066, + "grad_norm": 0.5948303937911987, + "learning_rate": 1.495494737582008e-05, + "loss": 1.0074, + "step": 24640 + }, + { + "epoch": 0.3551003356526499, + "grad_norm": 0.5562019944190979, + "learning_rate": 1.4950894171214596e-05, + "loss": 1.0118, + "step": 24650 + }, + { + "epoch": 0.3552443925839492, + "grad_norm": 0.6218870878219604, + "learning_rate": 1.4946839888867748e-05, + "loss": 1.025, + "step": 24660 + }, + { + "epoch": 0.35538844951524845, + "grad_norm": 0.6354550123214722, + "learning_rate": 1.4942784529662094e-05, + "loss": 1.0077, + "step": 24670 + }, + { + "epoch": 0.35553250644654766, + "grad_norm": 0.6161721348762512, + "learning_rate": 1.493872809448044e-05, + "loss": 1.0306, + "step": 24680 + }, + { + "epoch": 0.3556765633778469, + "grad_norm": 0.5358957052230835, + "learning_rate": 1.4934670584205804e-05, + "loss": 1.0287, + "step": 24690 + }, + { + "epoch": 0.3558206203091462, + "grad_norm": 0.6816520094871521, + "learning_rate": 1.4930611999721457e-05, + "loss": 1.0331, + "step": 24700 + }, + { + "epoch": 0.35596467724044545, + "grad_norm": 0.5539494752883911, + "learning_rate": 1.4926552341910894e-05, + "loss": 1.0369, + "step": 24710 + }, + { + "epoch": 0.35610873417174466, + "grad_norm": 0.5865591168403625, + "learning_rate": 1.492249161165785e-05, + "loss": 1.0138, + "step": 24720 + }, + { + "epoch": 0.3562527911030439, + "grad_norm": 0.5584275722503662, + "learning_rate": 1.4918429809846289e-05, + "loss": 1.0406, + "step": 24730 + }, + { + "epoch": 0.3563968480343432, + "grad_norm": 0.57304447889328, + "learning_rate": 1.4914366937360405e-05, + "loss": 1.0351, + "step": 24740 + }, + { + "epoch": 0.35654090496564245, + "grad_norm": 0.5285123586654663, + "learning_rate": 1.4910302995084634e-05, + "loss": 1.0168, + "step": 24750 + }, + { + "epoch": 0.35668496189694165, + "grad_norm": 0.568499743938446, + "learning_rate": 1.490623798390364e-05, + "loss": 1.0351, + "step": 24760 + }, + { + "epoch": 0.3568290188282409, + "grad_norm": 0.5260944366455078, + "learning_rate": 1.4902171904702316e-05, + "loss": 1.0377, + "step": 24770 + }, + { + "epoch": 0.3569730757595402, + "grad_norm": 0.5984960794448853, + "learning_rate": 1.4898104758365794e-05, + "loss": 1.0291, + "step": 24780 + }, + { + "epoch": 0.35711713269083944, + "grad_norm": 0.6734877228736877, + "learning_rate": 1.4894036545779437e-05, + "loss": 1.0336, + "step": 24790 + }, + { + "epoch": 0.35726118962213865, + "grad_norm": 0.6885083913803101, + "learning_rate": 1.488996726782884e-05, + "loss": 1.0298, + "step": 24800 + }, + { + "epoch": 0.3574052465534379, + "grad_norm": 0.610966682434082, + "learning_rate": 1.488589692539982e-05, + "loss": 1.0235, + "step": 24810 + }, + { + "epoch": 0.3575493034847372, + "grad_norm": 0.5949985980987549, + "learning_rate": 1.4881825519378445e-05, + "loss": 1.0344, + "step": 24820 + }, + { + "epoch": 0.35769336041603644, + "grad_norm": 0.5474660396575928, + "learning_rate": 1.4877753050650996e-05, + "loss": 1.0556, + "step": 24830 + }, + { + "epoch": 0.35783741734733565, + "grad_norm": 0.6285277605056763, + "learning_rate": 1.4873679520103999e-05, + "loss": 0.9982, + "step": 24840 + }, + { + "epoch": 0.3579814742786349, + "grad_norm": 0.5509119033813477, + "learning_rate": 1.4869604928624202e-05, + "loss": 1.0232, + "step": 24850 + }, + { + "epoch": 0.3581255312099342, + "grad_norm": 0.5181789398193359, + "learning_rate": 1.486552927709859e-05, + "loss": 1.0532, + "step": 24860 + }, + { + "epoch": 0.35826958814123344, + "grad_norm": 0.5690596103668213, + "learning_rate": 1.4861452566414371e-05, + "loss": 1.0294, + "step": 24870 + }, + { + "epoch": 0.35841364507253265, + "grad_norm": 0.57747882604599, + "learning_rate": 1.4857374797458992e-05, + "loss": 1.007, + "step": 24880 + }, + { + "epoch": 0.3585577020038319, + "grad_norm": 0.5637708902359009, + "learning_rate": 1.485329597112013e-05, + "loss": 1.0343, + "step": 24890 + }, + { + "epoch": 0.3587017589351312, + "grad_norm": 0.5401881337165833, + "learning_rate": 1.4849216088285686e-05, + "loss": 1.0213, + "step": 24900 + }, + { + "epoch": 0.35884581586643044, + "grad_norm": 0.5868175029754639, + "learning_rate": 1.4845135149843795e-05, + "loss": 1.0397, + "step": 24910 + }, + { + "epoch": 0.35898987279772965, + "grad_norm": 0.5615954399108887, + "learning_rate": 1.4841053156682817e-05, + "loss": 1.0211, + "step": 24920 + }, + { + "epoch": 0.3591339297290289, + "grad_norm": 0.53164142370224, + "learning_rate": 1.4836970109691351e-05, + "loss": 1.026, + "step": 24930 + }, + { + "epoch": 0.3592779866603282, + "grad_norm": 0.5693374872207642, + "learning_rate": 1.4832886009758221e-05, + "loss": 1.0114, + "step": 24940 + }, + { + "epoch": 0.35942204359162744, + "grad_norm": 0.5792744159698486, + "learning_rate": 1.4828800857772472e-05, + "loss": 1.0355, + "step": 24950 + }, + { + "epoch": 0.35956610052292665, + "grad_norm": 0.5707711577415466, + "learning_rate": 1.4824714654623391e-05, + "loss": 1.0196, + "step": 24960 + }, + { + "epoch": 0.3597101574542259, + "grad_norm": 0.5481793880462646, + "learning_rate": 1.4820627401200487e-05, + "loss": 1.0078, + "step": 24970 + }, + { + "epoch": 0.3598542143855252, + "grad_norm": 0.5169560313224792, + "learning_rate": 1.4816539098393498e-05, + "loss": 1.017, + "step": 24980 + }, + { + "epoch": 0.35999827131682444, + "grad_norm": 0.5783670544624329, + "learning_rate": 1.4812449747092392e-05, + "loss": 1.0293, + "step": 24990 + }, + { + "epoch": 0.36014232824812364, + "grad_norm": 0.577237069606781, + "learning_rate": 1.4808359348187365e-05, + "loss": 1.0348, + "step": 25000 + }, + { + "epoch": 0.3602863851794229, + "grad_norm": 0.5775808691978455, + "learning_rate": 1.4804267902568838e-05, + "loss": 1.0284, + "step": 25010 + }, + { + "epoch": 0.36043044211072217, + "grad_norm": 0.6013045310974121, + "learning_rate": 1.4800175411127464e-05, + "loss": 1.0579, + "step": 25020 + }, + { + "epoch": 0.3605744990420214, + "grad_norm": 0.5317995548248291, + "learning_rate": 1.479608187475412e-05, + "loss": 1.0012, + "step": 25030 + }, + { + "epoch": 0.36071855597332064, + "grad_norm": 0.684230625629425, + "learning_rate": 1.4791987294339915e-05, + "loss": 1.0258, + "step": 25040 + }, + { + "epoch": 0.3608626129046199, + "grad_norm": 0.6365956664085388, + "learning_rate": 1.4787891670776182e-05, + "loss": 1.0191, + "step": 25050 + }, + { + "epoch": 0.36100666983591917, + "grad_norm": 0.5982478857040405, + "learning_rate": 1.478379500495448e-05, + "loss": 1.026, + "step": 25060 + }, + { + "epoch": 0.3611507267672184, + "grad_norm": 0.554524838924408, + "learning_rate": 1.47796972977666e-05, + "loss": 1.0095, + "step": 25070 + }, + { + "epoch": 0.36129478369851764, + "grad_norm": 0.6101455688476562, + "learning_rate": 1.4775598550104549e-05, + "loss": 1.0378, + "step": 25080 + }, + { + "epoch": 0.3614388406298169, + "grad_norm": 0.6337770223617554, + "learning_rate": 1.4771498762860582e-05, + "loss": 1.0378, + "step": 25090 + }, + { + "epoch": 0.36158289756111617, + "grad_norm": 0.5765113234519958, + "learning_rate": 1.4767397936927148e-05, + "loss": 1.0114, + "step": 25100 + }, + { + "epoch": 0.3617269544924154, + "grad_norm": 0.583540678024292, + "learning_rate": 1.4763296073196956e-05, + "loss": 1.0329, + "step": 25110 + }, + { + "epoch": 0.36187101142371464, + "grad_norm": 0.5872902870178223, + "learning_rate": 1.4759193172562916e-05, + "loss": 1.0399, + "step": 25120 + }, + { + "epoch": 0.3620150683550139, + "grad_norm": 0.6090887188911438, + "learning_rate": 1.4755089235918175e-05, + "loss": 1.0135, + "step": 25130 + }, + { + "epoch": 0.36215912528631317, + "grad_norm": 0.6019402146339417, + "learning_rate": 1.4750984264156103e-05, + "loss": 1.0042, + "step": 25140 + }, + { + "epoch": 0.3623031822176124, + "grad_norm": 0.6022039651870728, + "learning_rate": 1.4746878258170297e-05, + "loss": 1.0419, + "step": 25150 + }, + { + "epoch": 0.36244723914891164, + "grad_norm": 0.5955145955085754, + "learning_rate": 1.4742771218854582e-05, + "loss": 1.0424, + "step": 25160 + }, + { + "epoch": 0.3625912960802109, + "grad_norm": 0.5031068921089172, + "learning_rate": 1.4738663147102994e-05, + "loss": 1.0134, + "step": 25170 + }, + { + "epoch": 0.36273535301151016, + "grad_norm": 0.6688017249107361, + "learning_rate": 1.473455404380981e-05, + "loss": 1.0289, + "step": 25180 + }, + { + "epoch": 0.36287940994280937, + "grad_norm": 0.5259919762611389, + "learning_rate": 1.4730443909869522e-05, + "loss": 1.0243, + "step": 25190 + }, + { + "epoch": 0.36302346687410864, + "grad_norm": 0.587336003780365, + "learning_rate": 1.4726332746176855e-05, + "loss": 1.0306, + "step": 25200 + }, + { + "epoch": 0.3631675238054079, + "grad_norm": 0.6514984369277954, + "learning_rate": 1.4722220553626747e-05, + "loss": 1.0451, + "step": 25210 + }, + { + "epoch": 0.36331158073670716, + "grad_norm": 0.6073031425476074, + "learning_rate": 1.4718107333114366e-05, + "loss": 1.0448, + "step": 25220 + }, + { + "epoch": 0.36345563766800637, + "grad_norm": 0.5726466178894043, + "learning_rate": 1.4713993085535109e-05, + "loss": 1.0334, + "step": 25230 + }, + { + "epoch": 0.36359969459930563, + "grad_norm": 0.5648417472839355, + "learning_rate": 1.4709877811784586e-05, + "loss": 1.0026, + "step": 25240 + }, + { + "epoch": 0.3637437515306049, + "grad_norm": 0.6145786046981812, + "learning_rate": 1.4705761512758634e-05, + "loss": 1.038, + "step": 25250 + }, + { + "epoch": 0.36388780846190416, + "grad_norm": 0.6085208654403687, + "learning_rate": 1.4701644189353316e-05, + "loss": 1.0099, + "step": 25260 + }, + { + "epoch": 0.36403186539320337, + "grad_norm": 0.5955535173416138, + "learning_rate": 1.4697525842464918e-05, + "loss": 1.0114, + "step": 25270 + }, + { + "epoch": 0.36417592232450263, + "grad_norm": 0.5815196633338928, + "learning_rate": 1.4693406472989949e-05, + "loss": 1.031, + "step": 25280 + }, + { + "epoch": 0.3643199792558019, + "grad_norm": 0.5595164895057678, + "learning_rate": 1.4689286081825135e-05, + "loss": 1.019, + "step": 25290 + }, + { + "epoch": 0.36446403618710116, + "grad_norm": 0.5178977847099304, + "learning_rate": 1.4685164669867429e-05, + "loss": 1.0332, + "step": 25300 + }, + { + "epoch": 0.36460809311840037, + "grad_norm": 0.5986042022705078, + "learning_rate": 1.4681042238014007e-05, + "loss": 1.0385, + "step": 25310 + }, + { + "epoch": 0.36475215004969963, + "grad_norm": 0.5248829126358032, + "learning_rate": 1.4676918787162264e-05, + "loss": 1.0254, + "step": 25320 + }, + { + "epoch": 0.3648962069809989, + "grad_norm": 0.8148512244224548, + "learning_rate": 1.467279431820982e-05, + "loss": 1.0229, + "step": 25330 + }, + { + "epoch": 0.36504026391229816, + "grad_norm": 0.6069955825805664, + "learning_rate": 1.4668668832054513e-05, + "loss": 1.0203, + "step": 25340 + }, + { + "epoch": 0.36518432084359737, + "grad_norm": 0.5385414958000183, + "learning_rate": 1.4664542329594406e-05, + "loss": 1.0416, + "step": 25350 + }, + { + "epoch": 0.36532837777489663, + "grad_norm": 0.5793408751487732, + "learning_rate": 1.4660414811727782e-05, + "loss": 1.0406, + "step": 25360 + }, + { + "epoch": 0.3654724347061959, + "grad_norm": 0.6187202334403992, + "learning_rate": 1.4656286279353142e-05, + "loss": 1.0439, + "step": 25370 + }, + { + "epoch": 0.36561649163749516, + "grad_norm": 0.5589693784713745, + "learning_rate": 1.4652156733369216e-05, + "loss": 1.0141, + "step": 25380 + }, + { + "epoch": 0.36576054856879436, + "grad_norm": 0.5816443562507629, + "learning_rate": 1.4648026174674945e-05, + "loss": 1.0207, + "step": 25390 + }, + { + "epoch": 0.3659046055000936, + "grad_norm": 0.5740160346031189, + "learning_rate": 1.4643894604169493e-05, + "loss": 1.0227, + "step": 25400 + }, + { + "epoch": 0.3660486624313929, + "grad_norm": 0.6110251545906067, + "learning_rate": 1.4639762022752248e-05, + "loss": 1.0352, + "step": 25410 + }, + { + "epoch": 0.36619271936269215, + "grad_norm": 0.6512191891670227, + "learning_rate": 1.463562843132282e-05, + "loss": 1.0159, + "step": 25420 + }, + { + "epoch": 0.36633677629399136, + "grad_norm": 0.6277971267700195, + "learning_rate": 1.4631493830781027e-05, + "loss": 1.0473, + "step": 25430 + }, + { + "epoch": 0.3664808332252906, + "grad_norm": 0.5402994751930237, + "learning_rate": 1.4627358222026919e-05, + "loss": 1.0298, + "step": 25440 + }, + { + "epoch": 0.3666248901565899, + "grad_norm": 0.593584418296814, + "learning_rate": 1.4623221605960762e-05, + "loss": 1.0438, + "step": 25450 + }, + { + "epoch": 0.36676894708788915, + "grad_norm": 0.5621562600135803, + "learning_rate": 1.461908398348304e-05, + "loss": 1.0243, + "step": 25460 + }, + { + "epoch": 0.36691300401918836, + "grad_norm": 0.6253653168678284, + "learning_rate": 1.4614945355494452e-05, + "loss": 1.0258, + "step": 25470 + }, + { + "epoch": 0.3670570609504876, + "grad_norm": 0.6140376925468445, + "learning_rate": 1.4610805722895925e-05, + "loss": 1.0136, + "step": 25480 + }, + { + "epoch": 0.3672011178817869, + "grad_norm": 0.5688601732254028, + "learning_rate": 1.4606665086588598e-05, + "loss": 1.0476, + "step": 25490 + }, + { + "epoch": 0.36734517481308615, + "grad_norm": 0.6936155557632446, + "learning_rate": 1.460252344747383e-05, + "loss": 1.019, + "step": 25500 + }, + { + "epoch": 0.36748923174438536, + "grad_norm": 0.5645762085914612, + "learning_rate": 1.45983808064532e-05, + "loss": 1.0363, + "step": 25510 + }, + { + "epoch": 0.3676332886756846, + "grad_norm": 0.7030591368675232, + "learning_rate": 1.4594237164428503e-05, + "loss": 1.0098, + "step": 25520 + }, + { + "epoch": 0.3677773456069839, + "grad_norm": 0.6672507524490356, + "learning_rate": 1.4590092522301755e-05, + "loss": 0.9895, + "step": 25530 + }, + { + "epoch": 0.36792140253828315, + "grad_norm": 0.6040781736373901, + "learning_rate": 1.4585946880975182e-05, + "loss": 1.0227, + "step": 25540 + }, + { + "epoch": 0.36806545946958236, + "grad_norm": 0.617346465587616, + "learning_rate": 1.4581800241351237e-05, + "loss": 1.0323, + "step": 25550 + }, + { + "epoch": 0.3682095164008816, + "grad_norm": 0.5223129987716675, + "learning_rate": 1.4577652604332586e-05, + "loss": 1.0218, + "step": 25560 + }, + { + "epoch": 0.3683535733321809, + "grad_norm": 0.5572784543037415, + "learning_rate": 1.4573503970822112e-05, + "loss": 1.0052, + "step": 25570 + }, + { + "epoch": 0.36849763026348015, + "grad_norm": 0.6087919473648071, + "learning_rate": 1.4569354341722912e-05, + "loss": 1.0346, + "step": 25580 + }, + { + "epoch": 0.36864168719477935, + "grad_norm": 0.600691020488739, + "learning_rate": 1.456520371793831e-05, + "loss": 1.0277, + "step": 25590 + }, + { + "epoch": 0.3687857441260786, + "grad_norm": 0.6251884698867798, + "learning_rate": 1.4561052100371834e-05, + "loss": 1.0323, + "step": 25600 + }, + { + "epoch": 0.3689298010573779, + "grad_norm": 0.5772163271903992, + "learning_rate": 1.4556899489927236e-05, + "loss": 1.0405, + "step": 25610 + }, + { + "epoch": 0.36907385798867715, + "grad_norm": 0.5718433856964111, + "learning_rate": 1.4552745887508481e-05, + "loss": 1.0451, + "step": 25620 + }, + { + "epoch": 0.36921791491997635, + "grad_norm": 0.5430508255958557, + "learning_rate": 1.4548591294019751e-05, + "loss": 1.0465, + "step": 25630 + }, + { + "epoch": 0.3693619718512756, + "grad_norm": 0.640229344367981, + "learning_rate": 1.4544435710365448e-05, + "loss": 1.0179, + "step": 25640 + }, + { + "epoch": 0.3695060287825749, + "grad_norm": 0.46533769369125366, + "learning_rate": 1.4540279137450176e-05, + "loss": 1.007, + "step": 25650 + }, + { + "epoch": 0.36965008571387414, + "grad_norm": 0.588024914264679, + "learning_rate": 1.4536121576178772e-05, + "loss": 1.0287, + "step": 25660 + }, + { + "epoch": 0.36979414264517335, + "grad_norm": 0.5487402081489563, + "learning_rate": 1.4531963027456274e-05, + "loss": 1.0579, + "step": 25670 + }, + { + "epoch": 0.3699381995764726, + "grad_norm": 0.6008890867233276, + "learning_rate": 1.4527803492187947e-05, + "loss": 1.0253, + "step": 25680 + }, + { + "epoch": 0.3700822565077719, + "grad_norm": 0.4887695610523224, + "learning_rate": 1.452364297127926e-05, + "loss": 1.0325, + "step": 25690 + }, + { + "epoch": 0.37022631343907114, + "grad_norm": 0.5441385507583618, + "learning_rate": 1.4519481465635899e-05, + "loss": 1.025, + "step": 25700 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.5769540667533875, + "learning_rate": 1.4515318976163773e-05, + "loss": 1.0116, + "step": 25710 + }, + { + "epoch": 0.3705144273016696, + "grad_norm": 0.6632396578788757, + "learning_rate": 1.4511155503768993e-05, + "loss": 1.0221, + "step": 25720 + }, + { + "epoch": 0.3706584842329689, + "grad_norm": 0.5847335457801819, + "learning_rate": 1.450699104935789e-05, + "loss": 1.0506, + "step": 25730 + }, + { + "epoch": 0.37080254116426814, + "grad_norm": 0.5339410305023193, + "learning_rate": 1.450282561383701e-05, + "loss": 1.0105, + "step": 25740 + }, + { + "epoch": 0.37094659809556735, + "grad_norm": 0.540700376033783, + "learning_rate": 1.4498659198113112e-05, + "loss": 1.0405, + "step": 25750 + }, + { + "epoch": 0.3710906550268666, + "grad_norm": 0.5502351522445679, + "learning_rate": 1.4494491803093164e-05, + "loss": 1.0286, + "step": 25760 + }, + { + "epoch": 0.3712347119581659, + "grad_norm": 0.6045182943344116, + "learning_rate": 1.4490323429684353e-05, + "loss": 1.0259, + "step": 25770 + }, + { + "epoch": 0.37137876888946514, + "grad_norm": 0.5650545358657837, + "learning_rate": 1.4486154078794077e-05, + "loss": 1.0389, + "step": 25780 + }, + { + "epoch": 0.37152282582076435, + "grad_norm": 0.6249701976776123, + "learning_rate": 1.4481983751329942e-05, + "loss": 1.0472, + "step": 25790 + }, + { + "epoch": 0.3716668827520636, + "grad_norm": 0.541960597038269, + "learning_rate": 1.4477812448199773e-05, + "loss": 1.0154, + "step": 25800 + }, + { + "epoch": 0.3718109396833629, + "grad_norm": 0.5343725681304932, + "learning_rate": 1.4473640170311604e-05, + "loss": 1.0468, + "step": 25810 + }, + { + "epoch": 0.37195499661466214, + "grad_norm": 0.6055474877357483, + "learning_rate": 1.4469466918573688e-05, + "loss": 1.0401, + "step": 25820 + }, + { + "epoch": 0.37209905354596134, + "grad_norm": 0.5726013779640198, + "learning_rate": 1.446529269389448e-05, + "loss": 1.0388, + "step": 25830 + }, + { + "epoch": 0.3722431104772606, + "grad_norm": 0.706449031829834, + "learning_rate": 1.4461117497182649e-05, + "loss": 1.0231, + "step": 25840 + }, + { + "epoch": 0.37238716740855987, + "grad_norm": 0.599277675151825, + "learning_rate": 1.445694132934708e-05, + "loss": 1.0174, + "step": 25850 + }, + { + "epoch": 0.37253122433985913, + "grad_norm": 0.6309727430343628, + "learning_rate": 1.4452764191296868e-05, + "loss": 1.0354, + "step": 25860 + }, + { + "epoch": 0.37267528127115834, + "grad_norm": 0.5788888335227966, + "learning_rate": 1.4448586083941318e-05, + "loss": 1.0409, + "step": 25870 + }, + { + "epoch": 0.3728193382024576, + "grad_norm": 0.5666493773460388, + "learning_rate": 1.4444407008189944e-05, + "loss": 1.0122, + "step": 25880 + }, + { + "epoch": 0.37296339513375687, + "grad_norm": 0.5751312971115112, + "learning_rate": 1.444022696495248e-05, + "loss": 1.0414, + "step": 25890 + }, + { + "epoch": 0.37310745206505613, + "grad_norm": 0.5846768021583557, + "learning_rate": 1.4436045955138853e-05, + "loss": 1.0244, + "step": 25900 + }, + { + "epoch": 0.37325150899635534, + "grad_norm": 0.5709925293922424, + "learning_rate": 1.443186397965922e-05, + "loss": 1.0162, + "step": 25910 + }, + { + "epoch": 0.3733955659276546, + "grad_norm": 0.639036238193512, + "learning_rate": 1.4427681039423933e-05, + "loss": 1.0493, + "step": 25920 + }, + { + "epoch": 0.37353962285895387, + "grad_norm": 0.5406671762466431, + "learning_rate": 1.4423497135343565e-05, + "loss": 1.0402, + "step": 25930 + }, + { + "epoch": 0.37368367979025313, + "grad_norm": 0.5004711747169495, + "learning_rate": 1.4419312268328893e-05, + "loss": 0.9946, + "step": 25940 + }, + { + "epoch": 0.37382773672155234, + "grad_norm": 0.5443496108055115, + "learning_rate": 1.4415126439290905e-05, + "loss": 1.0169, + "step": 25950 + }, + { + "epoch": 0.3739717936528516, + "grad_norm": 0.6194301843643188, + "learning_rate": 1.4410939649140799e-05, + "loss": 0.999, + "step": 25960 + }, + { + "epoch": 0.37411585058415087, + "grad_norm": 0.6406512260437012, + "learning_rate": 1.4406751898789979e-05, + "loss": 1.0533, + "step": 25970 + }, + { + "epoch": 0.37425990751545013, + "grad_norm": 0.7409844994544983, + "learning_rate": 1.4402563189150062e-05, + "loss": 1.0251, + "step": 25980 + }, + { + "epoch": 0.37440396444674934, + "grad_norm": 0.5493165254592896, + "learning_rate": 1.439837352113287e-05, + "loss": 1.0166, + "step": 25990 + }, + { + "epoch": 0.3745480213780486, + "grad_norm": 0.5840917825698853, + "learning_rate": 1.4394182895650442e-05, + "loss": 1.0359, + "step": 26000 + }, + { + "epoch": 0.37469207830934786, + "grad_norm": 0.5663790702819824, + "learning_rate": 1.4389991313615014e-05, + "loss": 1.0124, + "step": 26010 + }, + { + "epoch": 0.37483613524064713, + "grad_norm": 0.6118389964103699, + "learning_rate": 1.4385798775939034e-05, + "loss": 0.9995, + "step": 26020 + }, + { + "epoch": 0.37498019217194634, + "grad_norm": 0.6556876301765442, + "learning_rate": 1.4381605283535167e-05, + "loss": 1.0479, + "step": 26030 + }, + { + "epoch": 0.3751242491032456, + "grad_norm": 0.5584632754325867, + "learning_rate": 1.4377410837316273e-05, + "loss": 1.0507, + "step": 26040 + }, + { + "epoch": 0.37526830603454486, + "grad_norm": 0.5240368843078613, + "learning_rate": 1.4373215438195427e-05, + "loss": 1.0178, + "step": 26050 + }, + { + "epoch": 0.3754123629658441, + "grad_norm": 0.5103548169136047, + "learning_rate": 1.4369019087085905e-05, + "loss": 1.0404, + "step": 26060 + }, + { + "epoch": 0.37555641989714333, + "grad_norm": 0.5511793494224548, + "learning_rate": 1.43648217849012e-05, + "loss": 1.0203, + "step": 26070 + }, + { + "epoch": 0.3757004768284426, + "grad_norm": 0.5841732621192932, + "learning_rate": 1.4360623532555007e-05, + "loss": 1.0362, + "step": 26080 + }, + { + "epoch": 0.37584453375974186, + "grad_norm": 0.6015898585319519, + "learning_rate": 1.4356424330961223e-05, + "loss": 1.0231, + "step": 26090 + }, + { + "epoch": 0.3759885906910411, + "grad_norm": 0.5784769654273987, + "learning_rate": 1.435222418103396e-05, + "loss": 1.0402, + "step": 26100 + }, + { + "epoch": 0.37613264762234033, + "grad_norm": 0.48664385080337524, + "learning_rate": 1.4348023083687533e-05, + "loss": 1.0176, + "step": 26110 + }, + { + "epoch": 0.3762767045536396, + "grad_norm": 0.7868779897689819, + "learning_rate": 1.4343821039836462e-05, + "loss": 1.0208, + "step": 26120 + }, + { + "epoch": 0.37642076148493886, + "grad_norm": 0.6292771100997925, + "learning_rate": 1.433961805039547e-05, + "loss": 1.0183, + "step": 26130 + }, + { + "epoch": 0.3765648184162381, + "grad_norm": 0.5502007603645325, + "learning_rate": 1.4335414116279495e-05, + "loss": 1.0462, + "step": 26140 + }, + { + "epoch": 0.37670887534753733, + "grad_norm": 0.5436489582061768, + "learning_rate": 1.4331209238403676e-05, + "loss": 1.0388, + "step": 26150 + }, + { + "epoch": 0.3768529322788366, + "grad_norm": 0.603003740310669, + "learning_rate": 1.4327003417683354e-05, + "loss": 1.0225, + "step": 26160 + }, + { + "epoch": 0.37699698921013586, + "grad_norm": 0.5382051467895508, + "learning_rate": 1.4322796655034078e-05, + "loss": 1.0256, + "step": 26170 + }, + { + "epoch": 0.3771410461414351, + "grad_norm": 0.5649926066398621, + "learning_rate": 1.4318588951371603e-05, + "loss": 1.0253, + "step": 26180 + }, + { + "epoch": 0.37728510307273433, + "grad_norm": 0.7416114807128906, + "learning_rate": 1.4314380307611892e-05, + "loss": 1.0275, + "step": 26190 + }, + { + "epoch": 0.3774291600040336, + "grad_norm": 0.6471835374832153, + "learning_rate": 1.4310170724671102e-05, + "loss": 1.0522, + "step": 26200 + }, + { + "epoch": 0.37757321693533286, + "grad_norm": 0.5750507712364197, + "learning_rate": 1.4305960203465605e-05, + "loss": 1.0319, + "step": 26210 + }, + { + "epoch": 0.3777172738666321, + "grad_norm": 0.5253751873970032, + "learning_rate": 1.4301748744911972e-05, + "loss": 1.0278, + "step": 26220 + }, + { + "epoch": 0.3778613307979313, + "grad_norm": 0.655426561832428, + "learning_rate": 1.4297536349926976e-05, + "loss": 1.0328, + "step": 26230 + }, + { + "epoch": 0.3780053877292306, + "grad_norm": 0.5094215869903564, + "learning_rate": 1.4293323019427607e-05, + "loss": 1.0205, + "step": 26240 + }, + { + "epoch": 0.37814944466052985, + "grad_norm": 0.6158928871154785, + "learning_rate": 1.428910875433104e-05, + "loss": 1.0379, + "step": 26250 + }, + { + "epoch": 0.3782935015918291, + "grad_norm": 0.6462520360946655, + "learning_rate": 1.4284893555554668e-05, + "loss": 1.0215, + "step": 26260 + }, + { + "epoch": 0.3784375585231283, + "grad_norm": 0.6016168594360352, + "learning_rate": 1.4280677424016076e-05, + "loss": 1.0268, + "step": 26270 + }, + { + "epoch": 0.3785816154544276, + "grad_norm": 0.6505822539329529, + "learning_rate": 1.4276460360633065e-05, + "loss": 1.0283, + "step": 26280 + }, + { + "epoch": 0.37872567238572685, + "grad_norm": 0.5918681025505066, + "learning_rate": 1.4272242366323622e-05, + "loss": 1.0257, + "step": 26290 + }, + { + "epoch": 0.3788697293170261, + "grad_norm": 0.5184618830680847, + "learning_rate": 1.4268023442005954e-05, + "loss": 1.022, + "step": 26300 + }, + { + "epoch": 0.3790137862483253, + "grad_norm": 0.442764014005661, + "learning_rate": 1.4263803588598459e-05, + "loss": 1.0165, + "step": 26310 + }, + { + "epoch": 0.3791578431796246, + "grad_norm": 0.5952311754226685, + "learning_rate": 1.4259582807019743e-05, + "loss": 1.0376, + "step": 26320 + }, + { + "epoch": 0.37930190011092385, + "grad_norm": 0.6148770451545715, + "learning_rate": 1.4255361098188614e-05, + "loss": 1.0247, + "step": 26330 + }, + { + "epoch": 0.3794459570422231, + "grad_norm": 0.5614669919013977, + "learning_rate": 1.4251138463024073e-05, + "loss": 1.0097, + "step": 26340 + }, + { + "epoch": 0.3795900139735223, + "grad_norm": 0.7027855515480042, + "learning_rate": 1.4246914902445332e-05, + "loss": 1.0456, + "step": 26350 + }, + { + "epoch": 0.3797340709048216, + "grad_norm": 0.5557788014411926, + "learning_rate": 1.4242690417371803e-05, + "loss": 1.0262, + "step": 26360 + }, + { + "epoch": 0.37987812783612085, + "grad_norm": 0.5281688570976257, + "learning_rate": 1.4238465008723103e-05, + "loss": 1.0073, + "step": 26370 + }, + { + "epoch": 0.38002218476742006, + "grad_norm": 0.640178918838501, + "learning_rate": 1.4234238677419037e-05, + "loss": 1.0119, + "step": 26380 + }, + { + "epoch": 0.3801662416987193, + "grad_norm": 0.6145185828208923, + "learning_rate": 1.423001142437962e-05, + "loss": 1.0468, + "step": 26390 + }, + { + "epoch": 0.3803102986300186, + "grad_norm": 0.5753335356712341, + "learning_rate": 1.4225783250525072e-05, + "loss": 1.0185, + "step": 26400 + }, + { + "epoch": 0.38045435556131785, + "grad_norm": 0.6809263825416565, + "learning_rate": 1.4221554156775806e-05, + "loss": 1.0168, + "step": 26410 + }, + { + "epoch": 0.38059841249261706, + "grad_norm": 0.5877491235733032, + "learning_rate": 1.4217324144052433e-05, + "loss": 1.0215, + "step": 26420 + }, + { + "epoch": 0.3807424694239163, + "grad_norm": 0.5531141757965088, + "learning_rate": 1.4213093213275773e-05, + "loss": 1.0352, + "step": 26430 + }, + { + "epoch": 0.3808865263552156, + "grad_norm": 0.5586743950843811, + "learning_rate": 1.4208861365366844e-05, + "loss": 1.0197, + "step": 26440 + }, + { + "epoch": 0.38103058328651485, + "grad_norm": 0.5900185108184814, + "learning_rate": 1.420462860124685e-05, + "loss": 1.029, + "step": 26450 + }, + { + "epoch": 0.38117464021781405, + "grad_norm": 0.6241628527641296, + "learning_rate": 1.420039492183722e-05, + "loss": 1.0361, + "step": 26460 + }, + { + "epoch": 0.3813186971491133, + "grad_norm": 0.6917476058006287, + "learning_rate": 1.4196160328059554e-05, + "loss": 1.0499, + "step": 26470 + }, + { + "epoch": 0.3814627540804126, + "grad_norm": 0.676550030708313, + "learning_rate": 1.4191924820835671e-05, + "loss": 1.0091, + "step": 26480 + }, + { + "epoch": 0.38160681101171184, + "grad_norm": 0.5810500383377075, + "learning_rate": 1.4187688401087585e-05, + "loss": 1.0176, + "step": 26490 + }, + { + "epoch": 0.38175086794301105, + "grad_norm": 0.49956226348876953, + "learning_rate": 1.4183451069737502e-05, + "loss": 1.0275, + "step": 26500 + }, + { + "epoch": 0.3818949248743103, + "grad_norm": 0.5411934852600098, + "learning_rate": 1.417921282770783e-05, + "loss": 1.0215, + "step": 26510 + }, + { + "epoch": 0.3820389818056096, + "grad_norm": 0.5320689082145691, + "learning_rate": 1.4174973675921182e-05, + "loss": 1.041, + "step": 26520 + }, + { + "epoch": 0.38218303873690884, + "grad_norm": 0.5580525398254395, + "learning_rate": 1.4170733615300356e-05, + "loss": 1.0327, + "step": 26530 + }, + { + "epoch": 0.38232709566820805, + "grad_norm": 0.8316383957862854, + "learning_rate": 1.4166492646768357e-05, + "loss": 1.0412, + "step": 26540 + }, + { + "epoch": 0.3824711525995073, + "grad_norm": 0.5757890343666077, + "learning_rate": 1.4162250771248387e-05, + "loss": 1.0321, + "step": 26550 + }, + { + "epoch": 0.3826152095308066, + "grad_norm": 0.6110444664955139, + "learning_rate": 1.4158007989663842e-05, + "loss": 1.0081, + "step": 26560 + }, + { + "epoch": 0.38275926646210584, + "grad_norm": 0.5812126398086548, + "learning_rate": 1.4153764302938316e-05, + "loss": 1.0547, + "step": 26570 + }, + { + "epoch": 0.38290332339340505, + "grad_norm": 0.6257738471031189, + "learning_rate": 1.4149519711995603e-05, + "loss": 1.0112, + "step": 26580 + }, + { + "epoch": 0.3830473803247043, + "grad_norm": 0.6122918128967285, + "learning_rate": 1.4145274217759693e-05, + "loss": 1.0463, + "step": 26590 + }, + { + "epoch": 0.3831914372560036, + "grad_norm": 0.6165433526039124, + "learning_rate": 1.4141027821154769e-05, + "loss": 1.0202, + "step": 26600 + }, + { + "epoch": 0.38333549418730284, + "grad_norm": 0.6220911741256714, + "learning_rate": 1.4136780523105212e-05, + "loss": 1.0134, + "step": 26610 + }, + { + "epoch": 0.38347955111860205, + "grad_norm": 0.5039299130439758, + "learning_rate": 1.4132532324535603e-05, + "loss": 1.028, + "step": 26620 + }, + { + "epoch": 0.3836236080499013, + "grad_norm": 0.5618889927864075, + "learning_rate": 1.4128283226370716e-05, + "loss": 1.0144, + "step": 26630 + }, + { + "epoch": 0.3837676649812006, + "grad_norm": 0.5898512005805969, + "learning_rate": 1.4124033229535518e-05, + "loss": 1.0148, + "step": 26640 + }, + { + "epoch": 0.38391172191249984, + "grad_norm": 0.5746092796325684, + "learning_rate": 1.4119782334955182e-05, + "loss": 1.0291, + "step": 26650 + }, + { + "epoch": 0.38405577884379904, + "grad_norm": 0.6095765233039856, + "learning_rate": 1.411553054355506e-05, + "loss": 1.0245, + "step": 26660 + }, + { + "epoch": 0.3841998357750983, + "grad_norm": 0.8533281683921814, + "learning_rate": 1.4111277856260714e-05, + "loss": 1.0252, + "step": 26670 + }, + { + "epoch": 0.38434389270639757, + "grad_norm": 0.5089832544326782, + "learning_rate": 1.4107024273997894e-05, + "loss": 1.0247, + "step": 26680 + }, + { + "epoch": 0.38448794963769684, + "grad_norm": 0.5801883339881897, + "learning_rate": 1.4102769797692543e-05, + "loss": 1.0141, + "step": 26690 + }, + { + "epoch": 0.38463200656899604, + "grad_norm": 0.5481066107749939, + "learning_rate": 1.4098514428270811e-05, + "loss": 1.025, + "step": 26700 + }, + { + "epoch": 0.3847760635002953, + "grad_norm": 0.5064467787742615, + "learning_rate": 1.4094258166659021e-05, + "loss": 1.0131, + "step": 26710 + }, + { + "epoch": 0.38492012043159457, + "grad_norm": 0.6142391562461853, + "learning_rate": 1.409000101378371e-05, + "loss": 1.0192, + "step": 26720 + }, + { + "epoch": 0.38506417736289383, + "grad_norm": 0.5667955875396729, + "learning_rate": 1.4085742970571601e-05, + "loss": 1.0255, + "step": 26730 + }, + { + "epoch": 0.38520823429419304, + "grad_norm": 0.6403192281723022, + "learning_rate": 1.408148403794961e-05, + "loss": 1.0183, + "step": 26740 + }, + { + "epoch": 0.3853522912254923, + "grad_norm": 0.7364640235900879, + "learning_rate": 1.4077224216844846e-05, + "loss": 1.0171, + "step": 26750 + }, + { + "epoch": 0.38549634815679157, + "grad_norm": 0.6199523210525513, + "learning_rate": 1.4072963508184618e-05, + "loss": 1.0094, + "step": 26760 + }, + { + "epoch": 0.38564040508809083, + "grad_norm": 0.5943057537078857, + "learning_rate": 1.406870191289642e-05, + "loss": 1.0512, + "step": 26770 + }, + { + "epoch": 0.38578446201939004, + "grad_norm": 0.5860393047332764, + "learning_rate": 1.4064439431907943e-05, + "loss": 1.0144, + "step": 26780 + }, + { + "epoch": 0.3859285189506893, + "grad_norm": 0.5969266295433044, + "learning_rate": 1.406017606614707e-05, + "loss": 1.0342, + "step": 26790 + }, + { + "epoch": 0.38607257588198857, + "grad_norm": 0.5967586636543274, + "learning_rate": 1.405591181654188e-05, + "loss": 1.0262, + "step": 26800 + }, + { + "epoch": 0.38621663281328783, + "grad_norm": 0.5471726655960083, + "learning_rate": 1.4051646684020637e-05, + "loss": 1.03, + "step": 26810 + }, + { + "epoch": 0.38636068974458704, + "grad_norm": 0.584466278553009, + "learning_rate": 1.4047380669511803e-05, + "loss": 1.0332, + "step": 26820 + }, + { + "epoch": 0.3865047466758863, + "grad_norm": 0.6537045240402222, + "learning_rate": 1.4043113773944033e-05, + "loss": 1.0442, + "step": 26830 + }, + { + "epoch": 0.38664880360718556, + "grad_norm": 0.578791618347168, + "learning_rate": 1.4038845998246165e-05, + "loss": 1.0035, + "step": 26840 + }, + { + "epoch": 0.38679286053848483, + "grad_norm": 0.6067990660667419, + "learning_rate": 1.4034577343347243e-05, + "loss": 1.0426, + "step": 26850 + }, + { + "epoch": 0.38693691746978404, + "grad_norm": 0.6514458060264587, + "learning_rate": 1.4030307810176488e-05, + "loss": 1.0303, + "step": 26860 + }, + { + "epoch": 0.3870809744010833, + "grad_norm": 0.5709391832351685, + "learning_rate": 1.4026037399663323e-05, + "loss": 1.0253, + "step": 26870 + }, + { + "epoch": 0.38722503133238256, + "grad_norm": 0.6085813045501709, + "learning_rate": 1.4021766112737357e-05, + "loss": 1.0311, + "step": 26880 + }, + { + "epoch": 0.3873690882636818, + "grad_norm": 0.5561932325363159, + "learning_rate": 1.4017493950328383e-05, + "loss": 1.0399, + "step": 26890 + }, + { + "epoch": 0.38751314519498103, + "grad_norm": 0.5875961780548096, + "learning_rate": 1.4013220913366401e-05, + "loss": 1.0219, + "step": 26900 + }, + { + "epoch": 0.3876572021262803, + "grad_norm": 0.5987101197242737, + "learning_rate": 1.4008947002781588e-05, + "loss": 1.0086, + "step": 26910 + }, + { + "epoch": 0.38780125905757956, + "grad_norm": 0.7484641075134277, + "learning_rate": 1.4004672219504318e-05, + "loss": 1.0279, + "step": 26920 + }, + { + "epoch": 0.3879453159888788, + "grad_norm": 0.7231214642524719, + "learning_rate": 1.4000396564465146e-05, + "loss": 1.0155, + "step": 26930 + }, + { + "epoch": 0.38808937292017803, + "grad_norm": 0.5435040593147278, + "learning_rate": 1.3996120038594829e-05, + "loss": 1.0326, + "step": 26940 + }, + { + "epoch": 0.3882334298514773, + "grad_norm": 0.5462003350257874, + "learning_rate": 1.3991842642824307e-05, + "loss": 1.042, + "step": 26950 + }, + { + "epoch": 0.38837748678277656, + "grad_norm": 0.5954107046127319, + "learning_rate": 1.3987564378084707e-05, + "loss": 1.0248, + "step": 26960 + }, + { + "epoch": 0.3885215437140758, + "grad_norm": 0.5133011341094971, + "learning_rate": 1.3983285245307355e-05, + "loss": 1.0416, + "step": 26970 + }, + { + "epoch": 0.38866560064537503, + "grad_norm": 0.6856930255889893, + "learning_rate": 1.397900524542375e-05, + "loss": 1.0472, + "step": 26980 + }, + { + "epoch": 0.3888096575766743, + "grad_norm": 0.5439833402633667, + "learning_rate": 1.3974724379365597e-05, + "loss": 0.997, + "step": 26990 + }, + { + "epoch": 0.38895371450797356, + "grad_norm": 0.6151694655418396, + "learning_rate": 1.3970442648064775e-05, + "loss": 1.0289, + "step": 27000 + }, + { + "epoch": 0.3890977714392728, + "grad_norm": 0.6477126479148865, + "learning_rate": 1.3966160052453361e-05, + "loss": 1.0496, + "step": 27010 + }, + { + "epoch": 0.38924182837057203, + "grad_norm": 0.620611310005188, + "learning_rate": 1.396187659346362e-05, + "loss": 1.0345, + "step": 27020 + }, + { + "epoch": 0.3893858853018713, + "grad_norm": 0.5189834833145142, + "learning_rate": 1.3957592272027995e-05, + "loss": 1.0183, + "step": 27030 + }, + { + "epoch": 0.38952994223317056, + "grad_norm": 0.621438205242157, + "learning_rate": 1.395330708907913e-05, + "loss": 1.0469, + "step": 27040 + }, + { + "epoch": 0.3896739991644698, + "grad_norm": 0.6799705624580383, + "learning_rate": 1.3949021045549844e-05, + "loss": 1.021, + "step": 27050 + }, + { + "epoch": 0.389818056095769, + "grad_norm": 0.552381694316864, + "learning_rate": 1.3944734142373157e-05, + "loss": 1.0358, + "step": 27060 + }, + { + "epoch": 0.3899621130270683, + "grad_norm": 0.5012443661689758, + "learning_rate": 1.3940446380482264e-05, + "loss": 1.0345, + "step": 27070 + }, + { + "epoch": 0.39010616995836755, + "grad_norm": 0.569753110408783, + "learning_rate": 1.3936157760810551e-05, + "loss": 1.049, + "step": 27080 + }, + { + "epoch": 0.3902502268896668, + "grad_norm": 0.5513450503349304, + "learning_rate": 1.3931868284291591e-05, + "loss": 1.0374, + "step": 27090 + }, + { + "epoch": 0.390394283820966, + "grad_norm": 0.5258768200874329, + "learning_rate": 1.392757795185915e-05, + "loss": 1.0163, + "step": 27100 + }, + { + "epoch": 0.3905383407522653, + "grad_norm": 0.6887435913085938, + "learning_rate": 1.392328676444717e-05, + "loss": 1.0386, + "step": 27110 + }, + { + "epoch": 0.39068239768356455, + "grad_norm": 0.5797517895698547, + "learning_rate": 1.391899472298978e-05, + "loss": 1.0269, + "step": 27120 + }, + { + "epoch": 0.3908264546148638, + "grad_norm": 0.573955774307251, + "learning_rate": 1.3914701828421304e-05, + "loss": 1.0304, + "step": 27130 + }, + { + "epoch": 0.390970511546163, + "grad_norm": 0.5322799682617188, + "learning_rate": 1.3910408081676241e-05, + "loss": 1.0395, + "step": 27140 + }, + { + "epoch": 0.3911145684774623, + "grad_norm": 0.6017953753471375, + "learning_rate": 1.3906113483689285e-05, + "loss": 1.0302, + "step": 27150 + }, + { + "epoch": 0.39125862540876155, + "grad_norm": 0.5898041129112244, + "learning_rate": 1.3901818035395307e-05, + "loss": 1.0364, + "step": 27160 + }, + { + "epoch": 0.3914026823400608, + "grad_norm": 0.5774397850036621, + "learning_rate": 1.3897521737729367e-05, + "loss": 1.0237, + "step": 27170 + }, + { + "epoch": 0.39154673927136, + "grad_norm": 0.6190448999404907, + "learning_rate": 1.3893224591626715e-05, + "loss": 1.049, + "step": 27180 + }, + { + "epoch": 0.3916907962026593, + "grad_norm": 0.6038142442703247, + "learning_rate": 1.3888926598022772e-05, + "loss": 1.026, + "step": 27190 + }, + { + "epoch": 0.39183485313395855, + "grad_norm": 0.6462919116020203, + "learning_rate": 1.3884627757853158e-05, + "loss": 1.0331, + "step": 27200 + }, + { + "epoch": 0.3919789100652578, + "grad_norm": 0.5720357298851013, + "learning_rate": 1.388032807205367e-05, + "loss": 1.0436, + "step": 27210 + }, + { + "epoch": 0.392122966996557, + "grad_norm": 0.522705614566803, + "learning_rate": 1.387602754156029e-05, + "loss": 1.0061, + "step": 27220 + }, + { + "epoch": 0.3922670239278563, + "grad_norm": 0.5933101177215576, + "learning_rate": 1.3871726167309186e-05, + "loss": 1.0235, + "step": 27230 + }, + { + "epoch": 0.39241108085915555, + "grad_norm": 0.5866686701774597, + "learning_rate": 1.3867423950236702e-05, + "loss": 1.0324, + "step": 27240 + }, + { + "epoch": 0.3925551377904548, + "grad_norm": 0.5963432192802429, + "learning_rate": 1.3863120891279378e-05, + "loss": 1.0311, + "step": 27250 + }, + { + "epoch": 0.392699194721754, + "grad_norm": 0.5158278346061707, + "learning_rate": 1.3858816991373925e-05, + "loss": 1.0346, + "step": 27260 + }, + { + "epoch": 0.3928432516530533, + "grad_norm": 0.5357363224029541, + "learning_rate": 1.3854512251457248e-05, + "loss": 1.036, + "step": 27270 + }, + { + "epoch": 0.39298730858435255, + "grad_norm": 0.6555647850036621, + "learning_rate": 1.3850206672466424e-05, + "loss": 1.0511, + "step": 27280 + }, + { + "epoch": 0.3931313655156518, + "grad_norm": 0.5619667768478394, + "learning_rate": 1.3845900255338726e-05, + "loss": 1.0176, + "step": 27290 + }, + { + "epoch": 0.393275422446951, + "grad_norm": 0.6229352951049805, + "learning_rate": 1.384159300101159e-05, + "loss": 1.0252, + "step": 27300 + }, + { + "epoch": 0.3934194793782503, + "grad_norm": 0.5728248357772827, + "learning_rate": 1.3837284910422659e-05, + "loss": 1.0265, + "step": 27310 + }, + { + "epoch": 0.39356353630954954, + "grad_norm": 0.6256684064865112, + "learning_rate": 1.3832975984509735e-05, + "loss": 1.016, + "step": 27320 + }, + { + "epoch": 0.3937075932408488, + "grad_norm": 0.5774276852607727, + "learning_rate": 1.3828666224210814e-05, + "loss": 1.0318, + "step": 27330 + }, + { + "epoch": 0.393851650172148, + "grad_norm": 0.6430305242538452, + "learning_rate": 1.3824355630464074e-05, + "loss": 1.0287, + "step": 27340 + }, + { + "epoch": 0.3939957071034473, + "grad_norm": 0.5507270693778992, + "learning_rate": 1.382004420420787e-05, + "loss": 1.0277, + "step": 27350 + }, + { + "epoch": 0.39413976403474654, + "grad_norm": 0.5930743217468262, + "learning_rate": 1.3815731946380744e-05, + "loss": 1.0334, + "step": 27360 + }, + { + "epoch": 0.3942838209660458, + "grad_norm": 0.5006609559059143, + "learning_rate": 1.3811418857921406e-05, + "loss": 1.0545, + "step": 27370 + }, + { + "epoch": 0.394427877897345, + "grad_norm": 0.6899300217628479, + "learning_rate": 1.3807104939768763e-05, + "loss": 1.0328, + "step": 27380 + }, + { + "epoch": 0.3945719348286443, + "grad_norm": 0.6164814233779907, + "learning_rate": 1.3802790192861893e-05, + "loss": 1.0314, + "step": 27390 + }, + { + "epoch": 0.39471599175994354, + "grad_norm": 0.6175230145454407, + "learning_rate": 1.3798474618140061e-05, + "loss": 1.0266, + "step": 27400 + }, + { + "epoch": 0.3948600486912428, + "grad_norm": 0.6572819352149963, + "learning_rate": 1.3794158216542698e-05, + "loss": 1.0425, + "step": 27410 + }, + { + "epoch": 0.395004105622542, + "grad_norm": 0.6338945031166077, + "learning_rate": 1.3789840989009438e-05, + "loss": 1.041, + "step": 27420 + }, + { + "epoch": 0.3951481625538413, + "grad_norm": 0.6209861636161804, + "learning_rate": 1.3785522936480074e-05, + "loss": 1.0361, + "step": 27430 + }, + { + "epoch": 0.39529221948514054, + "grad_norm": 0.5528213977813721, + "learning_rate": 1.3781204059894586e-05, + "loss": 1.0104, + "step": 27440 + }, + { + "epoch": 0.3954362764164398, + "grad_norm": 0.603303849697113, + "learning_rate": 1.3776884360193137e-05, + "loss": 1.0319, + "step": 27450 + }, + { + "epoch": 0.395580333347739, + "grad_norm": 0.5575588345527649, + "learning_rate": 1.3772563838316065e-05, + "loss": 1.0025, + "step": 27460 + }, + { + "epoch": 0.3957243902790383, + "grad_norm": 0.5700780749320984, + "learning_rate": 1.3768242495203889e-05, + "loss": 1.0279, + "step": 27470 + }, + { + "epoch": 0.39586844721033754, + "grad_norm": 0.5762060880661011, + "learning_rate": 1.3763920331797302e-05, + "loss": 1.0486, + "step": 27480 + }, + { + "epoch": 0.3960125041416368, + "grad_norm": 0.4911508858203888, + "learning_rate": 1.3759597349037186e-05, + "loss": 1.0127, + "step": 27490 + }, + { + "epoch": 0.396156561072936, + "grad_norm": 0.6626416444778442, + "learning_rate": 1.3755273547864584e-05, + "loss": 1.0403, + "step": 27500 + }, + { + "epoch": 0.39630061800423527, + "grad_norm": 0.6523149609565735, + "learning_rate": 1.3750948929220743e-05, + "loss": 1.0366, + "step": 27510 + }, + { + "epoch": 0.39644467493553454, + "grad_norm": 0.522388756275177, + "learning_rate": 1.374662349404706e-05, + "loss": 1.0388, + "step": 27520 + }, + { + "epoch": 0.3965887318668338, + "grad_norm": 0.5563465356826782, + "learning_rate": 1.3742297243285126e-05, + "loss": 1.0114, + "step": 27530 + }, + { + "epoch": 0.396732788798133, + "grad_norm": 0.5447655320167542, + "learning_rate": 1.3737970177876711e-05, + "loss": 1.0205, + "step": 27540 + }, + { + "epoch": 0.39687684572943227, + "grad_norm": 0.5525548458099365, + "learning_rate": 1.3733642298763752e-05, + "loss": 1.0128, + "step": 27550 + }, + { + "epoch": 0.39702090266073153, + "grad_norm": 0.5254085063934326, + "learning_rate": 1.3729313606888371e-05, + "loss": 1.0114, + "step": 27560 + }, + { + "epoch": 0.3971649595920308, + "grad_norm": 0.5455352067947388, + "learning_rate": 1.372498410319286e-05, + "loss": 1.0319, + "step": 27570 + }, + { + "epoch": 0.39730901652333, + "grad_norm": 0.5383720993995667, + "learning_rate": 1.3720653788619697e-05, + "loss": 1.004, + "step": 27580 + }, + { + "epoch": 0.39745307345462927, + "grad_norm": 0.5045921802520752, + "learning_rate": 1.3716322664111535e-05, + "loss": 1.016, + "step": 27590 + }, + { + "epoch": 0.39759713038592853, + "grad_norm": 0.65843665599823, + "learning_rate": 1.3711990730611189e-05, + "loss": 1.02, + "step": 27600 + }, + { + "epoch": 0.3977411873172278, + "grad_norm": 0.4848676025867462, + "learning_rate": 1.3707657989061675e-05, + "loss": 1.0226, + "step": 27610 + }, + { + "epoch": 0.397885244248527, + "grad_norm": 0.5090790390968323, + "learning_rate": 1.3703324440406159e-05, + "loss": 1.0214, + "step": 27620 + }, + { + "epoch": 0.39802930117982627, + "grad_norm": 0.5815185308456421, + "learning_rate": 1.3698990085588002e-05, + "loss": 0.9969, + "step": 27630 + }, + { + "epoch": 0.39817335811112553, + "grad_norm": 0.5621809959411621, + "learning_rate": 1.3694654925550728e-05, + "loss": 1.0293, + "step": 27640 + }, + { + "epoch": 0.3983174150424248, + "grad_norm": 0.5777491331100464, + "learning_rate": 1.3690318961238046e-05, + "loss": 1.0179, + "step": 27650 + }, + { + "epoch": 0.398461471973724, + "grad_norm": 0.5728102326393127, + "learning_rate": 1.3685982193593837e-05, + "loss": 1.0347, + "step": 27660 + }, + { + "epoch": 0.39860552890502327, + "grad_norm": 0.6662067770957947, + "learning_rate": 1.3681644623562149e-05, + "loss": 1.0244, + "step": 27670 + }, + { + "epoch": 0.39874958583632253, + "grad_norm": 0.6177421808242798, + "learning_rate": 1.3677306252087217e-05, + "loss": 1.0102, + "step": 27680 + }, + { + "epoch": 0.39889364276762174, + "grad_norm": 0.6331455707550049, + "learning_rate": 1.3672967080113443e-05, + "loss": 1.0363, + "step": 27690 + }, + { + "epoch": 0.399037699698921, + "grad_norm": 0.5854665040969849, + "learning_rate": 1.3668627108585404e-05, + "loss": 1.0074, + "step": 27700 + }, + { + "epoch": 0.39918175663022026, + "grad_norm": 0.5754724740982056, + "learning_rate": 1.3664286338447847e-05, + "loss": 1.0154, + "step": 27710 + }, + { + "epoch": 0.3993258135615195, + "grad_norm": 0.6391647458076477, + "learning_rate": 1.3659944770645708e-05, + "loss": 1.0295, + "step": 27720 + }, + { + "epoch": 0.39946987049281873, + "grad_norm": 0.5782113671302795, + "learning_rate": 1.3655602406124082e-05, + "loss": 1.0408, + "step": 27730 + }, + { + "epoch": 0.399613927424118, + "grad_norm": 0.6727347373962402, + "learning_rate": 1.3651259245828238e-05, + "loss": 1.049, + "step": 27740 + }, + { + "epoch": 0.39975798435541726, + "grad_norm": 0.5403684377670288, + "learning_rate": 1.3646915290703628e-05, + "loss": 1.0257, + "step": 27750 + }, + { + "epoch": 0.3999020412867165, + "grad_norm": 0.5497075319290161, + "learning_rate": 1.3642570541695867e-05, + "loss": 1.0169, + "step": 27760 + }, + { + "epoch": 0.40004609821801573, + "grad_norm": 0.570429801940918, + "learning_rate": 1.3638224999750751e-05, + "loss": 1.0356, + "step": 27770 + }, + { + "epoch": 0.400190155149315, + "grad_norm": 0.5894126296043396, + "learning_rate": 1.3633878665814238e-05, + "loss": 0.9985, + "step": 27780 + }, + { + "epoch": 0.40033421208061426, + "grad_norm": 0.5434573888778687, + "learning_rate": 1.362953154083247e-05, + "loss": 1.027, + "step": 27790 + }, + { + "epoch": 0.4004782690119135, + "grad_norm": 0.5834015011787415, + "learning_rate": 1.3625183625751756e-05, + "loss": 1.0195, + "step": 27800 + }, + { + "epoch": 0.40062232594321273, + "grad_norm": 0.5129935145378113, + "learning_rate": 1.3620834921518575e-05, + "loss": 1.0072, + "step": 27810 + }, + { + "epoch": 0.400766382874512, + "grad_norm": 0.6349966526031494, + "learning_rate": 1.3616485429079583e-05, + "loss": 1.0328, + "step": 27820 + }, + { + "epoch": 0.40091043980581126, + "grad_norm": 0.593727707862854, + "learning_rate": 1.36121351493816e-05, + "loss": 1.0206, + "step": 27830 + }, + { + "epoch": 0.4010544967371105, + "grad_norm": 0.5819534659385681, + "learning_rate": 1.3607784083371627e-05, + "loss": 1.0195, + "step": 27840 + }, + { + "epoch": 0.40119855366840973, + "grad_norm": 0.5432244539260864, + "learning_rate": 1.3603432231996825e-05, + "loss": 1.0278, + "step": 27850 + }, + { + "epoch": 0.401342610599709, + "grad_norm": 0.5749140381813049, + "learning_rate": 1.3599079596204539e-05, + "loss": 1.0351, + "step": 27860 + }, + { + "epoch": 0.40148666753100826, + "grad_norm": 0.7217517495155334, + "learning_rate": 1.3594726176942272e-05, + "loss": 1.0323, + "step": 27870 + }, + { + "epoch": 0.4016307244623075, + "grad_norm": 0.5909184813499451, + "learning_rate": 1.3590371975157707e-05, + "loss": 1.0403, + "step": 27880 + }, + { + "epoch": 0.40177478139360673, + "grad_norm": 0.6383644938468933, + "learning_rate": 1.358601699179869e-05, + "loss": 1.0043, + "step": 27890 + }, + { + "epoch": 0.401918838324906, + "grad_norm": 0.573214590549469, + "learning_rate": 1.3581661227813246e-05, + "loss": 1.0463, + "step": 27900 + }, + { + "epoch": 0.40206289525620525, + "grad_norm": 0.6027265787124634, + "learning_rate": 1.3577304684149559e-05, + "loss": 1.0263, + "step": 27910 + }, + { + "epoch": 0.4022069521875045, + "grad_norm": 0.5588923692703247, + "learning_rate": 1.3572947361755993e-05, + "loss": 1.0298, + "step": 27920 + }, + { + "epoch": 0.4023510091188037, + "grad_norm": 0.6305546164512634, + "learning_rate": 1.3568589261581075e-05, + "loss": 1.0306, + "step": 27930 + }, + { + "epoch": 0.402495066050103, + "grad_norm": 0.7798808217048645, + "learning_rate": 1.3564230384573502e-05, + "loss": 1.029, + "step": 27940 + }, + { + "epoch": 0.40263912298140225, + "grad_norm": 0.5395976901054382, + "learning_rate": 1.3559870731682145e-05, + "loss": 1.0239, + "step": 27950 + }, + { + "epoch": 0.4027831799127015, + "grad_norm": 0.532769501209259, + "learning_rate": 1.3555510303856036e-05, + "loss": 1.0146, + "step": 27960 + }, + { + "epoch": 0.4029272368440007, + "grad_norm": 0.9768617749214172, + "learning_rate": 1.3551149102044386e-05, + "loss": 1.0286, + "step": 27970 + }, + { + "epoch": 0.4030712937753, + "grad_norm": 0.5485137104988098, + "learning_rate": 1.354678712719656e-05, + "loss": 1.0266, + "step": 27980 + }, + { + "epoch": 0.40321535070659925, + "grad_norm": 0.5443882346153259, + "learning_rate": 1.3542424380262108e-05, + "loss": 1.0351, + "step": 27990 + }, + { + "epoch": 0.4033594076378985, + "grad_norm": 0.5557625889778137, + "learning_rate": 1.3538060862190735e-05, + "loss": 1.0104, + "step": 28000 + }, + { + "epoch": 0.4035034645691977, + "grad_norm": 0.5878540873527527, + "learning_rate": 1.3533696573932316e-05, + "loss": 1.0176, + "step": 28010 + }, + { + "epoch": 0.403647521500497, + "grad_norm": 0.615067720413208, + "learning_rate": 1.3529331516436905e-05, + "loss": 1.0285, + "step": 28020 + }, + { + "epoch": 0.40379157843179625, + "grad_norm": 0.5775823593139648, + "learning_rate": 1.3524965690654708e-05, + "loss": 1.0296, + "step": 28030 + }, + { + "epoch": 0.4039356353630955, + "grad_norm": 0.5731359720230103, + "learning_rate": 1.352059909753611e-05, + "loss": 1.0473, + "step": 28040 + }, + { + "epoch": 0.4040796922943947, + "grad_norm": 0.5543149709701538, + "learning_rate": 1.3516231738031651e-05, + "loss": 1.0276, + "step": 28050 + }, + { + "epoch": 0.404223749225694, + "grad_norm": 0.5918812155723572, + "learning_rate": 1.3511863613092052e-05, + "loss": 1.0372, + "step": 28060 + }, + { + "epoch": 0.40436780615699325, + "grad_norm": 0.5753766894340515, + "learning_rate": 1.3507494723668188e-05, + "loss": 1.0423, + "step": 28070 + }, + { + "epoch": 0.4045118630882925, + "grad_norm": 0.5637984275817871, + "learning_rate": 1.3503125070711108e-05, + "loss": 1.035, + "step": 28080 + }, + { + "epoch": 0.4046559200195917, + "grad_norm": 0.5202157497406006, + "learning_rate": 1.349875465517203e-05, + "loss": 1.0445, + "step": 28090 + }, + { + "epoch": 0.404799976950891, + "grad_norm": 0.6230756640434265, + "learning_rate": 1.3494383478002326e-05, + "loss": 1.0049, + "step": 28100 + }, + { + "epoch": 0.40494403388219025, + "grad_norm": 0.5658066272735596, + "learning_rate": 1.3490011540153545e-05, + "loss": 1.0238, + "step": 28110 + }, + { + "epoch": 0.4050880908134895, + "grad_norm": 0.549156665802002, + "learning_rate": 1.3485638842577395e-05, + "loss": 1.0304, + "step": 28120 + }, + { + "epoch": 0.4052321477447887, + "grad_norm": 0.5747542977333069, + "learning_rate": 1.3481265386225755e-05, + "loss": 1.0201, + "step": 28130 + }, + { + "epoch": 0.405376204676088, + "grad_norm": 0.5837199687957764, + "learning_rate": 1.3476891172050666e-05, + "loss": 1.0208, + "step": 28140 + }, + { + "epoch": 0.40552026160738724, + "grad_norm": 0.5938249826431274, + "learning_rate": 1.347251620100433e-05, + "loss": 1.0323, + "step": 28150 + }, + { + "epoch": 0.4056643185386865, + "grad_norm": 0.5567238926887512, + "learning_rate": 1.3468140474039123e-05, + "loss": 1.0231, + "step": 28160 + }, + { + "epoch": 0.4058083754699857, + "grad_norm": 0.6004190444946289, + "learning_rate": 1.3463763992107572e-05, + "loss": 1.0224, + "step": 28170 + }, + { + "epoch": 0.405952432401285, + "grad_norm": 0.6211457848548889, + "learning_rate": 1.3459386756162387e-05, + "loss": 1.0114, + "step": 28180 + }, + { + "epoch": 0.40609648933258424, + "grad_norm": 0.591342031955719, + "learning_rate": 1.3455008767156425e-05, + "loss": 1.0291, + "step": 28190 + }, + { + "epoch": 0.4062405462638835, + "grad_norm": 0.5131479501724243, + "learning_rate": 1.3450630026042719e-05, + "loss": 1.0301, + "step": 28200 + }, + { + "epoch": 0.4063846031951827, + "grad_norm": 0.5368046164512634, + "learning_rate": 1.3446250533774458e-05, + "loss": 1.0307, + "step": 28210 + }, + { + "epoch": 0.406528660126482, + "grad_norm": 0.5792551636695862, + "learning_rate": 1.3441870291304995e-05, + "loss": 1.0332, + "step": 28220 + }, + { + "epoch": 0.40667271705778124, + "grad_norm": 0.9220713376998901, + "learning_rate": 1.3437489299587848e-05, + "loss": 1.0206, + "step": 28230 + }, + { + "epoch": 0.4068167739890805, + "grad_norm": 0.5201154947280884, + "learning_rate": 1.3433107559576705e-05, + "loss": 1.0212, + "step": 28240 + }, + { + "epoch": 0.4069608309203797, + "grad_norm": 0.5643423795700073, + "learning_rate": 1.3428725072225407e-05, + "loss": 1.017, + "step": 28250 + }, + { + "epoch": 0.407104887851679, + "grad_norm": 0.5780232548713684, + "learning_rate": 1.3424341838487955e-05, + "loss": 1.0211, + "step": 28260 + }, + { + "epoch": 0.40724894478297824, + "grad_norm": 0.6155387163162231, + "learning_rate": 1.341995785931853e-05, + "loss": 1.0138, + "step": 28270 + }, + { + "epoch": 0.4073930017142775, + "grad_norm": 0.5612841844558716, + "learning_rate": 1.3415573135671451e-05, + "loss": 1.0248, + "step": 28280 + }, + { + "epoch": 0.4075370586455767, + "grad_norm": 0.6546777486801147, + "learning_rate": 1.341118766850122e-05, + "loss": 1.0305, + "step": 28290 + }, + { + "epoch": 0.407681115576876, + "grad_norm": 0.5699030756950378, + "learning_rate": 1.3406801458762493e-05, + "loss": 1.0195, + "step": 28300 + }, + { + "epoch": 0.40782517250817524, + "grad_norm": 0.7382329106330872, + "learning_rate": 1.3402414507410083e-05, + "loss": 1.0355, + "step": 28310 + }, + { + "epoch": 0.4079692294394745, + "grad_norm": 0.5104536414146423, + "learning_rate": 1.3398026815398974e-05, + "loss": 1.0278, + "step": 28320 + }, + { + "epoch": 0.4081132863707737, + "grad_norm": 0.5759681463241577, + "learning_rate": 1.33936383836843e-05, + "loss": 1.0253, + "step": 28330 + }, + { + "epoch": 0.408257343302073, + "grad_norm": 0.5067317485809326, + "learning_rate": 1.3389249213221367e-05, + "loss": 1.0196, + "step": 28340 + }, + { + "epoch": 0.40840140023337224, + "grad_norm": 0.6182513236999512, + "learning_rate": 1.3384859304965635e-05, + "loss": 1.0186, + "step": 28350 + }, + { + "epoch": 0.4085454571646715, + "grad_norm": 0.5657869577407837, + "learning_rate": 1.3380468659872726e-05, + "loss": 1.0375, + "step": 28360 + }, + { + "epoch": 0.4086895140959707, + "grad_norm": 0.607580840587616, + "learning_rate": 1.3376077278898421e-05, + "loss": 1.0309, + "step": 28370 + }, + { + "epoch": 0.40883357102726997, + "grad_norm": 0.5067529082298279, + "learning_rate": 1.3371685162998668e-05, + "loss": 1.0355, + "step": 28380 + }, + { + "epoch": 0.40897762795856923, + "grad_norm": 0.5086706280708313, + "learning_rate": 1.336729231312957e-05, + "loss": 1.022, + "step": 28390 + }, + { + "epoch": 0.4091216848898685, + "grad_norm": 0.5726044178009033, + "learning_rate": 1.3362898730247382e-05, + "loss": 1.0168, + "step": 28400 + }, + { + "epoch": 0.4092657418211677, + "grad_norm": 0.5389735102653503, + "learning_rate": 1.3358504415308533e-05, + "loss": 1.0243, + "step": 28410 + }, + { + "epoch": 0.40940979875246697, + "grad_norm": 0.6076347231864929, + "learning_rate": 1.3354109369269605e-05, + "loss": 1.0241, + "step": 28420 + }, + { + "epoch": 0.40955385568376623, + "grad_norm": 0.4856959283351898, + "learning_rate": 1.3349713593087337e-05, + "loss": 1.031, + "step": 28430 + }, + { + "epoch": 0.4096979126150655, + "grad_norm": 0.6383373737335205, + "learning_rate": 1.3345317087718628e-05, + "loss": 1.0165, + "step": 28440 + }, + { + "epoch": 0.4098419695463647, + "grad_norm": 0.5641618967056274, + "learning_rate": 1.3340919854120538e-05, + "loss": 1.0099, + "step": 28450 + }, + { + "epoch": 0.40998602647766397, + "grad_norm": 0.5254115462303162, + "learning_rate": 1.3336521893250285e-05, + "loss": 1.0275, + "step": 28460 + }, + { + "epoch": 0.41013008340896323, + "grad_norm": 0.5576873421669006, + "learning_rate": 1.3332123206065242e-05, + "loss": 1.0108, + "step": 28470 + }, + { + "epoch": 0.4102741403402625, + "grad_norm": 0.5620102286338806, + "learning_rate": 1.3327723793522944e-05, + "loss": 1.023, + "step": 28480 + }, + { + "epoch": 0.4104181972715617, + "grad_norm": 0.5105875730514526, + "learning_rate": 1.3323323656581082e-05, + "loss": 1.027, + "step": 28490 + }, + { + "epoch": 0.41056225420286097, + "grad_norm": 0.5693491101264954, + "learning_rate": 1.3318922796197507e-05, + "loss": 1.0561, + "step": 28500 + }, + { + "epoch": 0.41070631113416023, + "grad_norm": 0.555982768535614, + "learning_rate": 1.331452121333022e-05, + "loss": 1.0258, + "step": 28510 + }, + { + "epoch": 0.4108503680654595, + "grad_norm": 0.5636388063430786, + "learning_rate": 1.331011890893739e-05, + "loss": 1.0255, + "step": 28520 + }, + { + "epoch": 0.4109944249967587, + "grad_norm": 0.4831456243991852, + "learning_rate": 1.330571588397734e-05, + "loss": 1.0197, + "step": 28530 + }, + { + "epoch": 0.41113848192805796, + "grad_norm": 0.5975767970085144, + "learning_rate": 1.330131213940854e-05, + "loss": 1.007, + "step": 28540 + }, + { + "epoch": 0.4112825388593572, + "grad_norm": 0.5731145739555359, + "learning_rate": 1.329690767618963e-05, + "loss": 1.0209, + "step": 28550 + }, + { + "epoch": 0.4114265957906565, + "grad_norm": 0.6682479381561279, + "learning_rate": 1.32925024952794e-05, + "loss": 1.025, + "step": 28560 + }, + { + "epoch": 0.4115706527219557, + "grad_norm": 0.5810287594795227, + "learning_rate": 1.32880965976368e-05, + "loss": 1.0491, + "step": 28570 + }, + { + "epoch": 0.41171470965325496, + "grad_norm": 0.5970315933227539, + "learning_rate": 1.3283689984220927e-05, + "loss": 1.0477, + "step": 28580 + }, + { + "epoch": 0.4118587665845542, + "grad_norm": 0.6494952440261841, + "learning_rate": 1.3279282655991043e-05, + "loss": 0.9981, + "step": 28590 + }, + { + "epoch": 0.4120028235158535, + "grad_norm": 0.6970714926719666, + "learning_rate": 1.3274874613906564e-05, + "loss": 1.0077, + "step": 28600 + }, + { + "epoch": 0.4121468804471527, + "grad_norm": 0.6124892830848694, + "learning_rate": 1.3270465858927055e-05, + "loss": 1.0258, + "step": 28610 + }, + { + "epoch": 0.41229093737845196, + "grad_norm": 0.563629150390625, + "learning_rate": 1.3266056392012248e-05, + "loss": 1.0306, + "step": 28620 + }, + { + "epoch": 0.4124349943097512, + "grad_norm": 0.6846545934677124, + "learning_rate": 1.3261646214122017e-05, + "loss": 1.0161, + "step": 28630 + }, + { + "epoch": 0.4125790512410505, + "grad_norm": 0.5320466756820679, + "learning_rate": 1.3257235326216405e-05, + "loss": 1.0005, + "step": 28640 + }, + { + "epoch": 0.4127231081723497, + "grad_norm": 0.5725711584091187, + "learning_rate": 1.325282372925559e-05, + "loss": 1.0258, + "step": 28650 + }, + { + "epoch": 0.41286716510364896, + "grad_norm": 0.6050660610198975, + "learning_rate": 1.3248411424199924e-05, + "loss": 1.0354, + "step": 28660 + }, + { + "epoch": 0.4130112220349482, + "grad_norm": 0.6206074953079224, + "learning_rate": 1.3243998412009904e-05, + "loss": 1.0444, + "step": 28670 + }, + { + "epoch": 0.4131552789662475, + "grad_norm": 0.6172865033149719, + "learning_rate": 1.3239584693646179e-05, + "loss": 1.0292, + "step": 28680 + }, + { + "epoch": 0.4132993358975467, + "grad_norm": 0.4913505017757416, + "learning_rate": 1.3235170270069557e-05, + "loss": 1.0175, + "step": 28690 + }, + { + "epoch": 0.41344339282884596, + "grad_norm": 0.5030386447906494, + "learning_rate": 1.3230755142240997e-05, + "loss": 1.0412, + "step": 28700 + }, + { + "epoch": 0.4135874497601452, + "grad_norm": 0.5598655343055725, + "learning_rate": 1.3226339311121613e-05, + "loss": 1.0181, + "step": 28710 + }, + { + "epoch": 0.4137315066914445, + "grad_norm": 0.6742128133773804, + "learning_rate": 1.3221922777672665e-05, + "loss": 1.034, + "step": 28720 + }, + { + "epoch": 0.4138755636227437, + "grad_norm": 0.5599446296691895, + "learning_rate": 1.3217505542855573e-05, + "loss": 1.0297, + "step": 28730 + }, + { + "epoch": 0.41401962055404296, + "grad_norm": 0.5743834972381592, + "learning_rate": 1.3213087607631912e-05, + "loss": 1.0293, + "step": 28740 + }, + { + "epoch": 0.4141636774853422, + "grad_norm": 0.5799579620361328, + "learning_rate": 1.3208668972963406e-05, + "loss": 1.0457, + "step": 28750 + }, + { + "epoch": 0.4143077344166415, + "grad_norm": 0.7092347741127014, + "learning_rate": 1.3204249639811924e-05, + "loss": 1.006, + "step": 28760 + }, + { + "epoch": 0.4144517913479407, + "grad_norm": 0.5876005291938782, + "learning_rate": 1.3199829609139498e-05, + "loss": 1.0278, + "step": 28770 + }, + { + "epoch": 0.41459584827923995, + "grad_norm": 0.6291282176971436, + "learning_rate": 1.3195408881908312e-05, + "loss": 1.0118, + "step": 28780 + }, + { + "epoch": 0.4147399052105392, + "grad_norm": 0.5738189220428467, + "learning_rate": 1.319098745908069e-05, + "loss": 1.012, + "step": 28790 + }, + { + "epoch": 0.4148839621418385, + "grad_norm": 0.6768100261688232, + "learning_rate": 1.318656534161912e-05, + "loss": 1.049, + "step": 28800 + }, + { + "epoch": 0.4150280190731377, + "grad_norm": 0.5153951048851013, + "learning_rate": 1.318214253048623e-05, + "loss": 1.0227, + "step": 28810 + }, + { + "epoch": 0.41517207600443695, + "grad_norm": 0.5960916876792908, + "learning_rate": 1.3177719026644815e-05, + "loss": 1.0149, + "step": 28820 + }, + { + "epoch": 0.4153161329357362, + "grad_norm": 0.6023445129394531, + "learning_rate": 1.3173294831057799e-05, + "loss": 1.0334, + "step": 28830 + }, + { + "epoch": 0.4154601898670355, + "grad_norm": 0.6096762418746948, + "learning_rate": 1.3168869944688274e-05, + "loss": 1.0387, + "step": 28840 + }, + { + "epoch": 0.4156042467983347, + "grad_norm": 0.5115797519683838, + "learning_rate": 1.3164444368499479e-05, + "loss": 1.0066, + "step": 28850 + }, + { + "epoch": 0.41574830372963395, + "grad_norm": 0.5491281151771545, + "learning_rate": 1.3160018103454799e-05, + "loss": 1.0308, + "step": 28860 + }, + { + "epoch": 0.4158923606609332, + "grad_norm": 0.9223948121070862, + "learning_rate": 1.315559115051777e-05, + "loss": 1.025, + "step": 28870 + }, + { + "epoch": 0.4160364175922325, + "grad_norm": 0.6295514106750488, + "learning_rate": 1.3151163510652077e-05, + "loss": 1.0147, + "step": 28880 + }, + { + "epoch": 0.4161804745235317, + "grad_norm": 0.5565011501312256, + "learning_rate": 1.3146735184821559e-05, + "loss": 1.006, + "step": 28890 + }, + { + "epoch": 0.41632453145483095, + "grad_norm": 0.5805292725563049, + "learning_rate": 1.31423061739902e-05, + "loss": 1.0511, + "step": 28900 + }, + { + "epoch": 0.4164685883861302, + "grad_norm": 0.5794126987457275, + "learning_rate": 1.3137876479122138e-05, + "loss": 1.0367, + "step": 28910 + }, + { + "epoch": 0.4166126453174295, + "grad_norm": 0.5777976512908936, + "learning_rate": 1.3133446101181648e-05, + "loss": 1.0072, + "step": 28920 + }, + { + "epoch": 0.4167567022487287, + "grad_norm": 0.5612347722053528, + "learning_rate": 1.3129015041133174e-05, + "loss": 1.0263, + "step": 28930 + }, + { + "epoch": 0.41690075918002795, + "grad_norm": 0.8037853240966797, + "learning_rate": 1.3124583299941287e-05, + "loss": 1.0244, + "step": 28940 + }, + { + "epoch": 0.4170448161113272, + "grad_norm": 0.5883808732032776, + "learning_rate": 1.3120150878570716e-05, + "loss": 1.0409, + "step": 28950 + }, + { + "epoch": 0.4171888730426265, + "grad_norm": 0.5531783699989319, + "learning_rate": 1.3115717777986347e-05, + "loss": 1.0347, + "step": 28960 + }, + { + "epoch": 0.4173329299739257, + "grad_norm": 0.608101487159729, + "learning_rate": 1.3111283999153196e-05, + "loss": 1.0117, + "step": 28970 + }, + { + "epoch": 0.41747698690522494, + "grad_norm": 0.6610711216926575, + "learning_rate": 1.3106849543036441e-05, + "loss": 1.0363, + "step": 28980 + }, + { + "epoch": 0.4176210438365242, + "grad_norm": 0.5230353474617004, + "learning_rate": 1.3102414410601396e-05, + "loss": 1.0151, + "step": 28990 + }, + { + "epoch": 0.4177651007678234, + "grad_norm": 0.6251484751701355, + "learning_rate": 1.3097978602813535e-05, + "loss": 1.0223, + "step": 29000 + }, + { + "epoch": 0.4179091576991227, + "grad_norm": 0.6061869859695435, + "learning_rate": 1.3093542120638467e-05, + "loss": 1.0048, + "step": 29010 + }, + { + "epoch": 0.41805321463042194, + "grad_norm": 0.6269460916519165, + "learning_rate": 1.3089104965041954e-05, + "loss": 1.0439, + "step": 29020 + }, + { + "epoch": 0.4181972715617212, + "grad_norm": 0.6348804831504822, + "learning_rate": 1.3084667136989907e-05, + "loss": 1.0204, + "step": 29030 + }, + { + "epoch": 0.4183413284930204, + "grad_norm": 0.6431804895401001, + "learning_rate": 1.3080228637448375e-05, + "loss": 1.0229, + "step": 29040 + }, + { + "epoch": 0.4184853854243197, + "grad_norm": 0.6093942523002625, + "learning_rate": 1.3075789467383562e-05, + "loss": 1.0388, + "step": 29050 + }, + { + "epoch": 0.41862944235561894, + "grad_norm": 0.5368409156799316, + "learning_rate": 1.3071349627761812e-05, + "loss": 1.0095, + "step": 29060 + }, + { + "epoch": 0.4187734992869182, + "grad_norm": 0.5583363175392151, + "learning_rate": 1.3066909119549615e-05, + "loss": 1.037, + "step": 29070 + }, + { + "epoch": 0.4189175562182174, + "grad_norm": 0.5977921485900879, + "learning_rate": 1.3062467943713615e-05, + "loss": 1.0186, + "step": 29080 + }, + { + "epoch": 0.4190616131495167, + "grad_norm": 0.5360631346702576, + "learning_rate": 1.3058026101220587e-05, + "loss": 1.0257, + "step": 29090 + }, + { + "epoch": 0.41920567008081594, + "grad_norm": 0.5536994934082031, + "learning_rate": 1.3053583593037462e-05, + "loss": 1.0247, + "step": 29100 + }, + { + "epoch": 0.4193497270121152, + "grad_norm": 0.5214248895645142, + "learning_rate": 1.3049140420131314e-05, + "loss": 1.0255, + "step": 29110 + }, + { + "epoch": 0.4194937839434144, + "grad_norm": 0.6232405304908752, + "learning_rate": 1.3044696583469364e-05, + "loss": 1.0303, + "step": 29120 + }, + { + "epoch": 0.4196378408747137, + "grad_norm": 0.6169807314872742, + "learning_rate": 1.3040252084018961e-05, + "loss": 1.0357, + "step": 29130 + }, + { + "epoch": 0.41978189780601294, + "grad_norm": 0.5029460787773132, + "learning_rate": 1.3035806922747624e-05, + "loss": 1.0225, + "step": 29140 + }, + { + "epoch": 0.4199259547373122, + "grad_norm": 0.6531195044517517, + "learning_rate": 1.3031361100623001e-05, + "loss": 1.0172, + "step": 29150 + }, + { + "epoch": 0.4200700116686114, + "grad_norm": 0.5460676550865173, + "learning_rate": 1.3026914618612878e-05, + "loss": 1.0082, + "step": 29160 + }, + { + "epoch": 0.4202140685999107, + "grad_norm": 0.6486418843269348, + "learning_rate": 1.3022467477685204e-05, + "loss": 1.0097, + "step": 29170 + }, + { + "epoch": 0.42035812553120994, + "grad_norm": 0.5953981876373291, + "learning_rate": 1.3018019678808052e-05, + "loss": 1.0222, + "step": 29180 + }, + { + "epoch": 0.4205021824625092, + "grad_norm": 0.6229760050773621, + "learning_rate": 1.3013571222949655e-05, + "loss": 1.0505, + "step": 29190 + }, + { + "epoch": 0.4206462393938084, + "grad_norm": 0.5768680572509766, + "learning_rate": 1.300912211107837e-05, + "loss": 1.0713, + "step": 29200 + }, + { + "epoch": 0.42079029632510767, + "grad_norm": 0.576744019985199, + "learning_rate": 1.3004672344162711e-05, + "loss": 1.0369, + "step": 29210 + }, + { + "epoch": 0.42093435325640693, + "grad_norm": 0.5998263359069824, + "learning_rate": 1.3000221923171337e-05, + "loss": 1.0399, + "step": 29220 + }, + { + "epoch": 0.4210784101877062, + "grad_norm": 0.5798830986022949, + "learning_rate": 1.2995770849073034e-05, + "loss": 1.0456, + "step": 29230 + }, + { + "epoch": 0.4212224671190054, + "grad_norm": 0.5114676356315613, + "learning_rate": 1.2991319122836748e-05, + "loss": 0.9902, + "step": 29240 + }, + { + "epoch": 0.42136652405030467, + "grad_norm": 0.654665470123291, + "learning_rate": 1.298686674543155e-05, + "loss": 1.0353, + "step": 29250 + }, + { + "epoch": 0.42151058098160393, + "grad_norm": 0.5200009942054749, + "learning_rate": 1.298241371782667e-05, + "loss": 1.0128, + "step": 29260 + }, + { + "epoch": 0.4216546379129032, + "grad_norm": 0.55632483959198, + "learning_rate": 1.2977960040991462e-05, + "loss": 1.0357, + "step": 29270 + }, + { + "epoch": 0.4217986948442024, + "grad_norm": 0.6114822626113892, + "learning_rate": 1.2973505715895436e-05, + "loss": 1.0351, + "step": 29280 + }, + { + "epoch": 0.42194275177550167, + "grad_norm": 0.5504259467124939, + "learning_rate": 1.2969050743508234e-05, + "loss": 1.0245, + "step": 29290 + }, + { + "epoch": 0.42208680870680093, + "grad_norm": 0.6451894044876099, + "learning_rate": 1.2964595124799643e-05, + "loss": 1.0103, + "step": 29300 + }, + { + "epoch": 0.4222308656381002, + "grad_norm": 0.5632631182670593, + "learning_rate": 1.2960138860739592e-05, + "loss": 0.9943, + "step": 29310 + }, + { + "epoch": 0.4223749225693994, + "grad_norm": 0.5638245940208435, + "learning_rate": 1.2955681952298145e-05, + "loss": 1.0094, + "step": 29320 + }, + { + "epoch": 0.42251897950069867, + "grad_norm": 0.5850462913513184, + "learning_rate": 1.295122440044551e-05, + "loss": 1.0328, + "step": 29330 + }, + { + "epoch": 0.42266303643199793, + "grad_norm": 0.6049823760986328, + "learning_rate": 1.2946766206152037e-05, + "loss": 1.0323, + "step": 29340 + }, + { + "epoch": 0.4228070933632972, + "grad_norm": 0.638034462928772, + "learning_rate": 1.294230737038821e-05, + "loss": 1.0246, + "step": 29350 + }, + { + "epoch": 0.4229511502945964, + "grad_norm": 0.5273005962371826, + "learning_rate": 1.2937847894124657e-05, + "loss": 1.0363, + "step": 29360 + }, + { + "epoch": 0.42309520722589566, + "grad_norm": 0.6056172251701355, + "learning_rate": 1.293338777833215e-05, + "loss": 1.0167, + "step": 29370 + }, + { + "epoch": 0.4232392641571949, + "grad_norm": 0.5097370743751526, + "learning_rate": 1.2928927023981583e-05, + "loss": 1.013, + "step": 29380 + }, + { + "epoch": 0.4233833210884942, + "grad_norm": 0.5481967329978943, + "learning_rate": 1.2924465632044012e-05, + "loss": 1.0239, + "step": 29390 + }, + { + "epoch": 0.4235273780197934, + "grad_norm": 0.5808329582214355, + "learning_rate": 1.2920003603490612e-05, + "loss": 1.0276, + "step": 29400 + }, + { + "epoch": 0.42367143495109266, + "grad_norm": 0.5588293075561523, + "learning_rate": 1.2915540939292712e-05, + "loss": 1.0301, + "step": 29410 + }, + { + "epoch": 0.4238154918823919, + "grad_norm": 0.5422973036766052, + "learning_rate": 1.291107764042177e-05, + "loss": 1.0499, + "step": 29420 + }, + { + "epoch": 0.4239595488136912, + "grad_norm": 0.5828791856765747, + "learning_rate": 1.2906613707849381e-05, + "loss": 1.0357, + "step": 29430 + }, + { + "epoch": 0.4241036057449904, + "grad_norm": 0.5760287642478943, + "learning_rate": 1.2902149142547287e-05, + "loss": 1.0273, + "step": 29440 + }, + { + "epoch": 0.42424766267628966, + "grad_norm": 0.7837143540382385, + "learning_rate": 1.289768394548736e-05, + "loss": 1.0388, + "step": 29450 + }, + { + "epoch": 0.4243917196075889, + "grad_norm": 0.5347105264663696, + "learning_rate": 1.289321811764161e-05, + "loss": 1.0087, + "step": 29460 + }, + { + "epoch": 0.4245357765388882, + "grad_norm": 0.6786202788352966, + "learning_rate": 1.2888751659982187e-05, + "loss": 1.0355, + "step": 29470 + }, + { + "epoch": 0.4246798334701874, + "grad_norm": 0.5785194039344788, + "learning_rate": 1.2884284573481381e-05, + "loss": 1.0353, + "step": 29480 + }, + { + "epoch": 0.42482389040148666, + "grad_norm": 0.6395455598831177, + "learning_rate": 1.2879816859111612e-05, + "loss": 1.0215, + "step": 29490 + }, + { + "epoch": 0.4249679473327859, + "grad_norm": 0.539808988571167, + "learning_rate": 1.2875348517845437e-05, + "loss": 1.0088, + "step": 29500 + }, + { + "epoch": 0.4251120042640852, + "grad_norm": 0.5646077394485474, + "learning_rate": 1.2870879550655557e-05, + "loss": 1.036, + "step": 29510 + }, + { + "epoch": 0.4252560611953844, + "grad_norm": 0.6132586598396301, + "learning_rate": 1.2866409958514803e-05, + "loss": 1.0476, + "step": 29520 + }, + { + "epoch": 0.42540011812668366, + "grad_norm": 0.523238480091095, + "learning_rate": 1.2861939742396141e-05, + "loss": 1.0276, + "step": 29530 + }, + { + "epoch": 0.4255441750579829, + "grad_norm": 0.53263258934021, + "learning_rate": 1.2857468903272678e-05, + "loss": 1.018, + "step": 29540 + }, + { + "epoch": 0.4256882319892822, + "grad_norm": 0.5227144360542297, + "learning_rate": 1.2852997442117655e-05, + "loss": 1.021, + "step": 29550 + }, + { + "epoch": 0.4258322889205814, + "grad_norm": 0.5284324288368225, + "learning_rate": 1.2848525359904447e-05, + "loss": 1.0247, + "step": 29560 + }, + { + "epoch": 0.42597634585188066, + "grad_norm": 0.7121564745903015, + "learning_rate": 1.2844052657606558e-05, + "loss": 1.0468, + "step": 29570 + }, + { + "epoch": 0.4261204027831799, + "grad_norm": 0.549347996711731, + "learning_rate": 1.2839579336197645e-05, + "loss": 1.0286, + "step": 29580 + }, + { + "epoch": 0.4262644597144792, + "grad_norm": 0.5873895883560181, + "learning_rate": 1.283510539665148e-05, + "loss": 1.0255, + "step": 29590 + }, + { + "epoch": 0.4264085166457784, + "grad_norm": 0.6166974902153015, + "learning_rate": 1.2830630839941985e-05, + "loss": 1.0211, + "step": 29600 + }, + { + "epoch": 0.42655257357707765, + "grad_norm": 0.5977450609207153, + "learning_rate": 1.2826155667043199e-05, + "loss": 1.0338, + "step": 29610 + }, + { + "epoch": 0.4266966305083769, + "grad_norm": 0.659603476524353, + "learning_rate": 1.2821679878929312e-05, + "loss": 1.0208, + "step": 29620 + }, + { + "epoch": 0.4268406874396762, + "grad_norm": 0.5155239105224609, + "learning_rate": 1.2817203476574643e-05, + "loss": 1.0167, + "step": 29630 + }, + { + "epoch": 0.4269847443709754, + "grad_norm": 0.5760207772254944, + "learning_rate": 1.2812726460953637e-05, + "loss": 1.0268, + "step": 29640 + }, + { + "epoch": 0.42712880130227465, + "grad_norm": 0.6081256866455078, + "learning_rate": 1.2808248833040885e-05, + "loss": 1.0351, + "step": 29650 + }, + { + "epoch": 0.4272728582335739, + "grad_norm": 0.5349990725517273, + "learning_rate": 1.28037705938111e-05, + "loss": 1.0107, + "step": 29660 + }, + { + "epoch": 0.4274169151648732, + "grad_norm": 0.5976602435112, + "learning_rate": 1.2799291744239138e-05, + "loss": 1.0191, + "step": 29670 + }, + { + "epoch": 0.4275609720961724, + "grad_norm": 0.5629780888557434, + "learning_rate": 1.2794812285299976e-05, + "loss": 1.0314, + "step": 29680 + }, + { + "epoch": 0.42770502902747165, + "grad_norm": 0.6238521933555603, + "learning_rate": 1.2790332217968734e-05, + "loss": 1.0431, + "step": 29690 + }, + { + "epoch": 0.4278490859587709, + "grad_norm": 0.5768886208534241, + "learning_rate": 1.2785851543220664e-05, + "loss": 1.0422, + "step": 29700 + }, + { + "epoch": 0.4279931428900702, + "grad_norm": 0.6511243581771851, + "learning_rate": 1.2781370262031139e-05, + "loss": 1.0429, + "step": 29710 + }, + { + "epoch": 0.4281371998213694, + "grad_norm": 0.5374072194099426, + "learning_rate": 1.277688837537568e-05, + "loss": 1.0032, + "step": 29720 + }, + { + "epoch": 0.42828125675266865, + "grad_norm": 0.7526559829711914, + "learning_rate": 1.2772405884229928e-05, + "loss": 1.0355, + "step": 29730 + }, + { + "epoch": 0.4284253136839679, + "grad_norm": 0.5825052261352539, + "learning_rate": 1.2767922789569665e-05, + "loss": 1.0183, + "step": 29740 + }, + { + "epoch": 0.4285693706152672, + "grad_norm": 0.5553596615791321, + "learning_rate": 1.2763439092370788e-05, + "loss": 1.023, + "step": 29750 + }, + { + "epoch": 0.4287134275465664, + "grad_norm": 0.5329517722129822, + "learning_rate": 1.2758954793609344e-05, + "loss": 1.012, + "step": 29760 + }, + { + "epoch": 0.42885748447786565, + "grad_norm": 0.6524930596351624, + "learning_rate": 1.27544698942615e-05, + "loss": 1.0298, + "step": 29770 + }, + { + "epoch": 0.4290015414091649, + "grad_norm": 0.6755332350730896, + "learning_rate": 1.274998439530356e-05, + "loss": 1.0237, + "step": 29780 + }, + { + "epoch": 0.4291455983404642, + "grad_norm": 0.5230400562286377, + "learning_rate": 1.2745498297711953e-05, + "loss": 1.0342, + "step": 29790 + }, + { + "epoch": 0.4292896552717634, + "grad_norm": 0.6378017663955688, + "learning_rate": 1.274101160246324e-05, + "loss": 1.0366, + "step": 29800 + }, + { + "epoch": 0.42943371220306265, + "grad_norm": 0.5726189613342285, + "learning_rate": 1.2736524310534118e-05, + "loss": 1.0337, + "step": 29810 + }, + { + "epoch": 0.4295777691343619, + "grad_norm": 0.6010988354682922, + "learning_rate": 1.2732036422901397e-05, + "loss": 1.0252, + "step": 29820 + }, + { + "epoch": 0.42972182606566117, + "grad_norm": 0.571182370185852, + "learning_rate": 1.272754794054204e-05, + "loss": 1.0287, + "step": 29830 + }, + { + "epoch": 0.4298658829969604, + "grad_norm": 0.6563334465026855, + "learning_rate": 1.272305886443312e-05, + "loss": 0.9941, + "step": 29840 + }, + { + "epoch": 0.43000993992825964, + "grad_norm": 0.6256719827651978, + "learning_rate": 1.2718569195551853e-05, + "loss": 1.022, + "step": 29850 + }, + { + "epoch": 0.4301539968595589, + "grad_norm": 0.5974683165550232, + "learning_rate": 1.271407893487557e-05, + "loss": 1.0317, + "step": 29860 + }, + { + "epoch": 0.43029805379085817, + "grad_norm": 0.6402578949928284, + "learning_rate": 1.2709588083381747e-05, + "loss": 1.0149, + "step": 29870 + }, + { + "epoch": 0.4304421107221574, + "grad_norm": 0.5927873849868774, + "learning_rate": 1.2705096642047976e-05, + "loss": 1.0182, + "step": 29880 + }, + { + "epoch": 0.43058616765345664, + "grad_norm": 0.5713891386985779, + "learning_rate": 1.2700604611851984e-05, + "loss": 1.0227, + "step": 29890 + }, + { + "epoch": 0.4307302245847559, + "grad_norm": 0.6035123467445374, + "learning_rate": 1.2696111993771623e-05, + "loss": 1.0242, + "step": 29900 + }, + { + "epoch": 0.43087428151605517, + "grad_norm": 0.6411575078964233, + "learning_rate": 1.269161878878487e-05, + "loss": 1.0063, + "step": 29910 + }, + { + "epoch": 0.4310183384473544, + "grad_norm": 0.6706348061561584, + "learning_rate": 1.2687124997869844e-05, + "loss": 1.0255, + "step": 29920 + }, + { + "epoch": 0.43116239537865364, + "grad_norm": 0.5613524317741394, + "learning_rate": 1.268263062200477e-05, + "loss": 1.0292, + "step": 29930 + }, + { + "epoch": 0.4313064523099529, + "grad_norm": 0.6087634563446045, + "learning_rate": 1.2678135662168019e-05, + "loss": 1.0366, + "step": 29940 + }, + { + "epoch": 0.43145050924125217, + "grad_norm": 0.5440014004707336, + "learning_rate": 1.2673640119338075e-05, + "loss": 1.0036, + "step": 29950 + }, + { + "epoch": 0.4315945661725514, + "grad_norm": 0.652259111404419, + "learning_rate": 1.2669143994493565e-05, + "loss": 1.0345, + "step": 29960 + }, + { + "epoch": 0.43173862310385064, + "grad_norm": 0.5840180516242981, + "learning_rate": 1.2664647288613225e-05, + "loss": 1.0218, + "step": 29970 + }, + { + "epoch": 0.4318826800351499, + "grad_norm": 0.5811200141906738, + "learning_rate": 1.266015000267593e-05, + "loss": 1.0154, + "step": 29980 + }, + { + "epoch": 0.43202673696644917, + "grad_norm": 0.5535847544670105, + "learning_rate": 1.2655652137660677e-05, + "loss": 1.0318, + "step": 29990 + }, + { + "epoch": 0.4321707938977484, + "grad_norm": 0.5859915614128113, + "learning_rate": 1.2651153694546588e-05, + "loss": 1.0538, + "step": 30000 + }, + { + "epoch": 0.43231485082904764, + "grad_norm": 0.5436146259307861, + "learning_rate": 1.2646654674312915e-05, + "loss": 1.0287, + "step": 30010 + }, + { + "epoch": 0.4324589077603469, + "grad_norm": 0.5253572463989258, + "learning_rate": 1.2642155077939024e-05, + "loss": 1.0233, + "step": 30020 + }, + { + "epoch": 0.43260296469164616, + "grad_norm": 0.613762378692627, + "learning_rate": 1.2637654906404424e-05, + "loss": 1.0229, + "step": 30030 + }, + { + "epoch": 0.43274702162294537, + "grad_norm": 0.5489577054977417, + "learning_rate": 1.263315416068874e-05, + "loss": 1.0234, + "step": 30040 + }, + { + "epoch": 0.43289107855424463, + "grad_norm": 0.5223692059516907, + "learning_rate": 1.2628652841771716e-05, + "loss": 1.0226, + "step": 30050 + }, + { + "epoch": 0.4330351354855439, + "grad_norm": 0.5891116857528687, + "learning_rate": 1.2624150950633232e-05, + "loss": 1.021, + "step": 30060 + }, + { + "epoch": 0.43317919241684316, + "grad_norm": 0.597817599773407, + "learning_rate": 1.2619648488253287e-05, + "loss": 1.0257, + "step": 30070 + }, + { + "epoch": 0.43332324934814237, + "grad_norm": 0.6240730881690979, + "learning_rate": 1.2615145455612004e-05, + "loss": 1.0283, + "step": 30080 + }, + { + "epoch": 0.43346730627944163, + "grad_norm": 0.5432168245315552, + "learning_rate": 1.261064185368963e-05, + "loss": 1.0271, + "step": 30090 + }, + { + "epoch": 0.4336113632107409, + "grad_norm": 0.5634761452674866, + "learning_rate": 1.2606137683466539e-05, + "loss": 1.0175, + "step": 30100 + }, + { + "epoch": 0.43375542014204016, + "grad_norm": 0.5914860963821411, + "learning_rate": 1.2601632945923226e-05, + "loss": 1.0329, + "step": 30110 + }, + { + "epoch": 0.43389947707333937, + "grad_norm": 0.5869556069374084, + "learning_rate": 1.259712764204031e-05, + "loss": 1.0356, + "step": 30120 + }, + { + "epoch": 0.43404353400463863, + "grad_norm": 0.5884944200515747, + "learning_rate": 1.2592621772798534e-05, + "loss": 1.0151, + "step": 30130 + }, + { + "epoch": 0.4341875909359379, + "grad_norm": 0.6159077882766724, + "learning_rate": 1.2588115339178763e-05, + "loss": 1.0218, + "step": 30140 + }, + { + "epoch": 0.43433164786723716, + "grad_norm": 0.6196834444999695, + "learning_rate": 1.2583608342161986e-05, + "loss": 1.0351, + "step": 30150 + }, + { + "epoch": 0.43447570479853637, + "grad_norm": 0.570512056350708, + "learning_rate": 1.2579100782729312e-05, + "loss": 1.003, + "step": 30160 + }, + { + "epoch": 0.43461976172983563, + "grad_norm": 0.5434126853942871, + "learning_rate": 1.257459266186198e-05, + "loss": 1.0119, + "step": 30170 + }, + { + "epoch": 0.4347638186611349, + "grad_norm": 0.7198238372802734, + "learning_rate": 1.257008398054134e-05, + "loss": 1.0183, + "step": 30180 + }, + { + "epoch": 0.43490787559243416, + "grad_norm": 0.5934227705001831, + "learning_rate": 1.2565574739748872e-05, + "loss": 1.0364, + "step": 30190 + }, + { + "epoch": 0.43505193252373336, + "grad_norm": 0.7064135670661926, + "learning_rate": 1.2561064940466174e-05, + "loss": 1.0267, + "step": 30200 + }, + { + "epoch": 0.43519598945503263, + "grad_norm": 0.5537364482879639, + "learning_rate": 1.255655458367497e-05, + "loss": 1.0267, + "step": 30210 + }, + { + "epoch": 0.4353400463863319, + "grad_norm": 0.5636672377586365, + "learning_rate": 1.2552043670357101e-05, + "loss": 1.0037, + "step": 30220 + }, + { + "epoch": 0.43548410331763115, + "grad_norm": 0.5693824887275696, + "learning_rate": 1.254753220149453e-05, + "loss": 1.0178, + "step": 30230 + }, + { + "epoch": 0.43562816024893036, + "grad_norm": 0.6389200091362, + "learning_rate": 1.254302017806934e-05, + "loss": 1.0218, + "step": 30240 + }, + { + "epoch": 0.4357722171802296, + "grad_norm": 0.6115773916244507, + "learning_rate": 1.2538507601063744e-05, + "loss": 1.0276, + "step": 30250 + }, + { + "epoch": 0.4359162741115289, + "grad_norm": 0.5919804573059082, + "learning_rate": 1.2533994471460054e-05, + "loss": 1.0168, + "step": 30260 + }, + { + "epoch": 0.43606033104282815, + "grad_norm": 0.6357947587966919, + "learning_rate": 1.252948079024073e-05, + "loss": 1.0153, + "step": 30270 + }, + { + "epoch": 0.43620438797412736, + "grad_norm": 0.6063383221626282, + "learning_rate": 1.2524966558388327e-05, + "loss": 1.0095, + "step": 30280 + }, + { + "epoch": 0.4363484449054266, + "grad_norm": 0.5542093515396118, + "learning_rate": 1.2520451776885544e-05, + "loss": 1.0246, + "step": 30290 + }, + { + "epoch": 0.4364925018367259, + "grad_norm": 0.6694737076759338, + "learning_rate": 1.2515936446715171e-05, + "loss": 1.0491, + "step": 30300 + }, + { + "epoch": 0.4366365587680251, + "grad_norm": 0.5313624143600464, + "learning_rate": 1.2511420568860144e-05, + "loss": 1.0447, + "step": 30310 + }, + { + "epoch": 0.43678061569932436, + "grad_norm": 0.5757784843444824, + "learning_rate": 1.2506904144303502e-05, + "loss": 1.0369, + "step": 30320 + }, + { + "epoch": 0.4369246726306236, + "grad_norm": 0.9457381963729858, + "learning_rate": 1.2502387174028416e-05, + "loss": 1.0399, + "step": 30330 + }, + { + "epoch": 0.4370687295619229, + "grad_norm": 0.5293096303939819, + "learning_rate": 1.2497869659018158e-05, + "loss": 1.0197, + "step": 30340 + }, + { + "epoch": 0.4372127864932221, + "grad_norm": 0.6232894659042358, + "learning_rate": 1.2493351600256136e-05, + "loss": 1.0265, + "step": 30350 + }, + { + "epoch": 0.43735684342452136, + "grad_norm": 0.6530959010124207, + "learning_rate": 1.2488832998725866e-05, + "loss": 1.0193, + "step": 30360 + }, + { + "epoch": 0.4375009003558206, + "grad_norm": 0.5749871134757996, + "learning_rate": 1.2484313855410983e-05, + "loss": 1.0378, + "step": 30370 + }, + { + "epoch": 0.4376449572871199, + "grad_norm": 0.560028612613678, + "learning_rate": 1.2479794171295248e-05, + "loss": 1.0262, + "step": 30380 + }, + { + "epoch": 0.4377890142184191, + "grad_norm": 0.5265219211578369, + "learning_rate": 1.2475273947362528e-05, + "loss": 1.0136, + "step": 30390 + }, + { + "epoch": 0.43793307114971836, + "grad_norm": 0.6045346856117249, + "learning_rate": 1.247075318459682e-05, + "loss": 1.027, + "step": 30400 + }, + { + "epoch": 0.4380771280810176, + "grad_norm": 0.5903724431991577, + "learning_rate": 1.2466231883982223e-05, + "loss": 1.0223, + "step": 30410 + }, + { + "epoch": 0.4382211850123169, + "grad_norm": 0.551878809928894, + "learning_rate": 1.246171004650297e-05, + "loss": 1.0125, + "step": 30420 + }, + { + "epoch": 0.4383652419436161, + "grad_norm": 0.613456130027771, + "learning_rate": 1.2457187673143396e-05, + "loss": 1.0231, + "step": 30430 + }, + { + "epoch": 0.43850929887491535, + "grad_norm": 0.6330251097679138, + "learning_rate": 1.2452664764887965e-05, + "loss": 1.0084, + "step": 30440 + }, + { + "epoch": 0.4386533558062146, + "grad_norm": 0.5188472270965576, + "learning_rate": 1.244814132272125e-05, + "loss": 1.0383, + "step": 30450 + }, + { + "epoch": 0.4387974127375139, + "grad_norm": 0.5829715728759766, + "learning_rate": 1.244361734762794e-05, + "loss": 1.0361, + "step": 30460 + }, + { + "epoch": 0.4389414696688131, + "grad_norm": 0.7289334535598755, + "learning_rate": 1.2439092840592843e-05, + "loss": 1.0452, + "step": 30470 + }, + { + "epoch": 0.43908552660011235, + "grad_norm": 0.6511844396591187, + "learning_rate": 1.2434567802600881e-05, + "loss": 1.0338, + "step": 30480 + }, + { + "epoch": 0.4392295835314116, + "grad_norm": 0.7280377149581909, + "learning_rate": 1.2430042234637094e-05, + "loss": 1.0261, + "step": 30490 + }, + { + "epoch": 0.4393736404627109, + "grad_norm": 0.5383123755455017, + "learning_rate": 1.2425516137686636e-05, + "loss": 1.0147, + "step": 30500 + }, + { + "epoch": 0.4395176973940101, + "grad_norm": 0.5853512287139893, + "learning_rate": 1.2420989512734774e-05, + "loss": 1.0188, + "step": 30510 + }, + { + "epoch": 0.43966175432530935, + "grad_norm": 0.6809625029563904, + "learning_rate": 1.2416462360766892e-05, + "loss": 1.0317, + "step": 30520 + }, + { + "epoch": 0.4398058112566086, + "grad_norm": 0.9652009606361389, + "learning_rate": 1.241193468276849e-05, + "loss": 1.0355, + "step": 30530 + }, + { + "epoch": 0.4399498681879079, + "grad_norm": 0.6148524880409241, + "learning_rate": 1.2407406479725179e-05, + "loss": 1.0355, + "step": 30540 + }, + { + "epoch": 0.4400939251192071, + "grad_norm": 0.5160987973213196, + "learning_rate": 1.2402877752622689e-05, + "loss": 1.0306, + "step": 30550 + }, + { + "epoch": 0.44023798205050635, + "grad_norm": 0.5704976916313171, + "learning_rate": 1.239834850244686e-05, + "loss": 1.0531, + "step": 30560 + }, + { + "epoch": 0.4403820389818056, + "grad_norm": 0.608162522315979, + "learning_rate": 1.2393818730183647e-05, + "loss": 1.029, + "step": 30570 + }, + { + "epoch": 0.4405260959131049, + "grad_norm": 0.6319136619567871, + "learning_rate": 1.2389288436819118e-05, + "loss": 1.0027, + "step": 30580 + }, + { + "epoch": 0.4406701528444041, + "grad_norm": 0.6754544377326965, + "learning_rate": 1.2384757623339458e-05, + "loss": 1.0246, + "step": 30590 + }, + { + "epoch": 0.44081420977570335, + "grad_norm": 0.882371723651886, + "learning_rate": 1.2380226290730959e-05, + "loss": 1.0206, + "step": 30600 + }, + { + "epoch": 0.4409582667070026, + "grad_norm": 0.6008939743041992, + "learning_rate": 1.2375694439980036e-05, + "loss": 1.0049, + "step": 30610 + }, + { + "epoch": 0.4411023236383019, + "grad_norm": 0.5888626575469971, + "learning_rate": 1.2371162072073202e-05, + "loss": 1.0442, + "step": 30620 + }, + { + "epoch": 0.4412463805696011, + "grad_norm": 0.5692532658576965, + "learning_rate": 1.2366629187997096e-05, + "loss": 1.0428, + "step": 30630 + }, + { + "epoch": 0.44139043750090035, + "grad_norm": 0.5510519742965698, + "learning_rate": 1.2362095788738464e-05, + "loss": 1.0398, + "step": 30640 + }, + { + "epoch": 0.4415344944321996, + "grad_norm": 0.5545852780342102, + "learning_rate": 1.2357561875284166e-05, + "loss": 1.0053, + "step": 30650 + }, + { + "epoch": 0.4416785513634989, + "grad_norm": 0.5512057542800903, + "learning_rate": 1.2353027448621169e-05, + "loss": 1.0344, + "step": 30660 + }, + { + "epoch": 0.4418226082947981, + "grad_norm": 0.608013391494751, + "learning_rate": 1.2348492509736554e-05, + "loss": 1.0538, + "step": 30670 + }, + { + "epoch": 0.44196666522609734, + "grad_norm": 0.6366941928863525, + "learning_rate": 1.234395705961752e-05, + "loss": 1.0188, + "step": 30680 + }, + { + "epoch": 0.4421107221573966, + "grad_norm": 0.530868649482727, + "learning_rate": 1.2339421099251367e-05, + "loss": 1.0388, + "step": 30690 + }, + { + "epoch": 0.44225477908869587, + "grad_norm": 0.5997594594955444, + "learning_rate": 1.2334884629625515e-05, + "loss": 1.0111, + "step": 30700 + }, + { + "epoch": 0.4423988360199951, + "grad_norm": 0.8025027513504028, + "learning_rate": 1.2330347651727485e-05, + "loss": 1.0087, + "step": 30710 + }, + { + "epoch": 0.44254289295129434, + "grad_norm": 0.538600504398346, + "learning_rate": 1.2325810166544923e-05, + "loss": 1.02, + "step": 30720 + }, + { + "epoch": 0.4426869498825936, + "grad_norm": 0.5707302689552307, + "learning_rate": 1.2321272175065571e-05, + "loss": 1.0435, + "step": 30730 + }, + { + "epoch": 0.44283100681389287, + "grad_norm": 0.6243785619735718, + "learning_rate": 1.2316733678277286e-05, + "loss": 1.0324, + "step": 30740 + }, + { + "epoch": 0.4429750637451921, + "grad_norm": 0.5545337796211243, + "learning_rate": 1.2312194677168041e-05, + "loss": 1.038, + "step": 30750 + }, + { + "epoch": 0.44311912067649134, + "grad_norm": 0.6085401773452759, + "learning_rate": 1.230765517272591e-05, + "loss": 1.0137, + "step": 30760 + }, + { + "epoch": 0.4432631776077906, + "grad_norm": 0.5168399214744568, + "learning_rate": 1.2303115165939086e-05, + "loss": 1.0088, + "step": 30770 + }, + { + "epoch": 0.44340723453908987, + "grad_norm": 0.5611743927001953, + "learning_rate": 1.2298574657795856e-05, + "loss": 1.0207, + "step": 30780 + }, + { + "epoch": 0.4435512914703891, + "grad_norm": 0.5813621878623962, + "learning_rate": 1.2294033649284635e-05, + "loss": 1.0046, + "step": 30790 + }, + { + "epoch": 0.44369534840168834, + "grad_norm": 0.5414305925369263, + "learning_rate": 1.2289492141393937e-05, + "loss": 1.032, + "step": 30800 + }, + { + "epoch": 0.4438394053329876, + "grad_norm": 0.5455127358436584, + "learning_rate": 1.2284950135112382e-05, + "loss": 1.0306, + "step": 30810 + }, + { + "epoch": 0.44398346226428687, + "grad_norm": 0.5503478050231934, + "learning_rate": 1.22804076314287e-05, + "loss": 1.0046, + "step": 30820 + }, + { + "epoch": 0.4441275191955861, + "grad_norm": 0.5376619100570679, + "learning_rate": 1.2275864631331737e-05, + "loss": 1.0277, + "step": 30830 + }, + { + "epoch": 0.44427157612688534, + "grad_norm": 0.5993817448616028, + "learning_rate": 1.2271321135810442e-05, + "loss": 1.0183, + "step": 30840 + }, + { + "epoch": 0.4444156330581846, + "grad_norm": 0.6439812183380127, + "learning_rate": 1.2266777145853866e-05, + "loss": 1.0425, + "step": 30850 + }, + { + "epoch": 0.44455968998948386, + "grad_norm": 0.5761047601699829, + "learning_rate": 1.2262232662451175e-05, + "loss": 1.051, + "step": 30860 + }, + { + "epoch": 0.44470374692078307, + "grad_norm": 0.553991436958313, + "learning_rate": 1.225768768659164e-05, + "loss": 1.018, + "step": 30870 + }, + { + "epoch": 0.44484780385208234, + "grad_norm": 0.7612801790237427, + "learning_rate": 1.2253142219264644e-05, + "loss": 1.0281, + "step": 30880 + }, + { + "epoch": 0.4449918607833816, + "grad_norm": 0.6323403120040894, + "learning_rate": 1.2248596261459666e-05, + "loss": 1.0406, + "step": 30890 + }, + { + "epoch": 0.44513591771468086, + "grad_norm": 0.5343186259269714, + "learning_rate": 1.2244049814166303e-05, + "loss": 1.0222, + "step": 30900 + }, + { + "epoch": 0.44527997464598007, + "grad_norm": 0.5989657640457153, + "learning_rate": 1.2239502878374255e-05, + "loss": 1.0303, + "step": 30910 + }, + { + "epoch": 0.44542403157727933, + "grad_norm": 0.5075747966766357, + "learning_rate": 1.2234955455073319e-05, + "loss": 1.0193, + "step": 30920 + }, + { + "epoch": 0.4455680885085786, + "grad_norm": 0.5390758514404297, + "learning_rate": 1.2230407545253414e-05, + "loss": 1.0204, + "step": 30930 + }, + { + "epoch": 0.44571214543987786, + "grad_norm": 0.5804110765457153, + "learning_rate": 1.222585914990455e-05, + "loss": 1.0334, + "step": 30940 + }, + { + "epoch": 0.44585620237117707, + "grad_norm": 3.104297637939453, + "learning_rate": 1.2221310270016865e-05, + "loss": 1.0631, + "step": 30950 + }, + { + "epoch": 0.44600025930247633, + "grad_norm": 0.5167530179023743, + "learning_rate": 1.221676090658057e-05, + "loss": 1.0337, + "step": 30960 + }, + { + "epoch": 0.4461443162337756, + "grad_norm": 0.5759089589118958, + "learning_rate": 1.2212211060586008e-05, + "loss": 1.0183, + "step": 30970 + }, + { + "epoch": 0.44628837316507486, + "grad_norm": 0.5175536870956421, + "learning_rate": 1.2207660733023613e-05, + "loss": 1.0247, + "step": 30980 + }, + { + "epoch": 0.44643243009637407, + "grad_norm": 0.519580602645874, + "learning_rate": 1.2203109924883933e-05, + "loss": 1.0394, + "step": 30990 + }, + { + "epoch": 0.44657648702767333, + "grad_norm": 0.6472852826118469, + "learning_rate": 1.2198558637157616e-05, + "loss": 1.0076, + "step": 31000 + }, + { + "epoch": 0.4467205439589726, + "grad_norm": 0.5493713021278381, + "learning_rate": 1.2194006870835408e-05, + "loss": 1.0292, + "step": 31010 + }, + { + "epoch": 0.44686460089027186, + "grad_norm": 0.5693567991256714, + "learning_rate": 1.2189454626908176e-05, + "loss": 1.0217, + "step": 31020 + }, + { + "epoch": 0.44700865782157106, + "grad_norm": 0.6454607844352722, + "learning_rate": 1.2184901906366872e-05, + "loss": 1.0142, + "step": 31030 + }, + { + "epoch": 0.44715271475287033, + "grad_norm": 0.7012841105461121, + "learning_rate": 1.2180348710202565e-05, + "loss": 0.9985, + "step": 31040 + }, + { + "epoch": 0.4472967716841696, + "grad_norm": 0.6283087134361267, + "learning_rate": 1.2175795039406422e-05, + "loss": 1.0346, + "step": 31050 + }, + { + "epoch": 0.44744082861546886, + "grad_norm": 0.7161290645599365, + "learning_rate": 1.2171240894969714e-05, + "loss": 1.0284, + "step": 31060 + }, + { + "epoch": 0.44758488554676806, + "grad_norm": 0.6098495125770569, + "learning_rate": 1.2166686277883815e-05, + "loss": 1.0147, + "step": 31070 + }, + { + "epoch": 0.4477289424780673, + "grad_norm": 0.6446240544319153, + "learning_rate": 1.21621311891402e-05, + "loss": 1.0227, + "step": 31080 + }, + { + "epoch": 0.4478729994093666, + "grad_norm": 0.5811617970466614, + "learning_rate": 1.2157575629730457e-05, + "loss": 1.021, + "step": 31090 + }, + { + "epoch": 0.44801705634066585, + "grad_norm": 0.6839554309844971, + "learning_rate": 1.215301960064626e-05, + "loss": 1.031, + "step": 31100 + }, + { + "epoch": 0.44816111327196506, + "grad_norm": 0.5694627165794373, + "learning_rate": 1.2148463102879398e-05, + "loss": 1.0507, + "step": 31110 + }, + { + "epoch": 0.4483051702032643, + "grad_norm": 0.6384556889533997, + "learning_rate": 1.2143906137421755e-05, + "loss": 1.0203, + "step": 31120 + }, + { + "epoch": 0.4484492271345636, + "grad_norm": 0.5794015526771545, + "learning_rate": 1.2139348705265321e-05, + "loss": 1.0188, + "step": 31130 + }, + { + "epoch": 0.44859328406586285, + "grad_norm": 0.5144084692001343, + "learning_rate": 1.2134790807402189e-05, + "loss": 1.0178, + "step": 31140 + }, + { + "epoch": 0.44873734099716206, + "grad_norm": 0.5534980297088623, + "learning_rate": 1.2130232444824543e-05, + "loss": 1.0415, + "step": 31150 + }, + { + "epoch": 0.4488813979284613, + "grad_norm": 0.48926809430122375, + "learning_rate": 1.2125673618524684e-05, + "loss": 1.0094, + "step": 31160 + }, + { + "epoch": 0.4490254548597606, + "grad_norm": 0.6032081246376038, + "learning_rate": 1.2121114329495003e-05, + "loss": 1.0162, + "step": 31170 + }, + { + "epoch": 0.44916951179105985, + "grad_norm": 0.5034138560295105, + "learning_rate": 1.2116554578727993e-05, + "loss": 1.0238, + "step": 31180 + }, + { + "epoch": 0.44931356872235906, + "grad_norm": 0.6093688011169434, + "learning_rate": 1.2111994367216244e-05, + "loss": 1.0209, + "step": 31190 + }, + { + "epoch": 0.4494576256536583, + "grad_norm": 0.6029537320137024, + "learning_rate": 1.2107433695952462e-05, + "loss": 1.0096, + "step": 31200 + }, + { + "epoch": 0.4496016825849576, + "grad_norm": 0.5483503937721252, + "learning_rate": 1.2102872565929435e-05, + "loss": 1.034, + "step": 31210 + }, + { + "epoch": 0.44974573951625685, + "grad_norm": 0.590434193611145, + "learning_rate": 1.2098310978140057e-05, + "loss": 1.0231, + "step": 31220 + }, + { + "epoch": 0.44988979644755606, + "grad_norm": 0.6409574747085571, + "learning_rate": 1.2093748933577328e-05, + "loss": 1.0312, + "step": 31230 + }, + { + "epoch": 0.4500338533788553, + "grad_norm": 0.5614639520645142, + "learning_rate": 1.208918643323434e-05, + "loss": 0.9887, + "step": 31240 + }, + { + "epoch": 0.4501779103101546, + "grad_norm": 0.621324896812439, + "learning_rate": 1.2084623478104285e-05, + "loss": 1.0357, + "step": 31250 + }, + { + "epoch": 0.45032196724145385, + "grad_norm": 0.5756137371063232, + "learning_rate": 1.2080060069180452e-05, + "loss": 1.0237, + "step": 31260 + }, + { + "epoch": 0.45046602417275305, + "grad_norm": 0.5862130522727966, + "learning_rate": 1.2075496207456242e-05, + "loss": 1.0317, + "step": 31270 + }, + { + "epoch": 0.4506100811040523, + "grad_norm": 0.6214578151702881, + "learning_rate": 1.2070931893925138e-05, + "loss": 1.0306, + "step": 31280 + }, + { + "epoch": 0.4507541380353516, + "grad_norm": 0.5534659624099731, + "learning_rate": 1.2066367129580727e-05, + "loss": 1.0242, + "step": 31290 + }, + { + "epoch": 0.45089819496665084, + "grad_norm": 0.5505697131156921, + "learning_rate": 1.2061801915416698e-05, + "loss": 1.0337, + "step": 31300 + }, + { + "epoch": 0.45104225189795005, + "grad_norm": 0.6099808216094971, + "learning_rate": 1.2057236252426834e-05, + "loss": 1.0237, + "step": 31310 + }, + { + "epoch": 0.4511863088292493, + "grad_norm": 0.5561302304267883, + "learning_rate": 1.205267014160502e-05, + "loss": 1.0099, + "step": 31320 + }, + { + "epoch": 0.4513303657605486, + "grad_norm": 0.5727132558822632, + "learning_rate": 1.2048103583945233e-05, + "loss": 1.0377, + "step": 31330 + }, + { + "epoch": 0.45147442269184784, + "grad_norm": 0.576480507850647, + "learning_rate": 1.2043536580441546e-05, + "loss": 1.0063, + "step": 31340 + }, + { + "epoch": 0.45161847962314705, + "grad_norm": 0.6703700423240662, + "learning_rate": 1.2038969132088139e-05, + "loss": 1.0376, + "step": 31350 + }, + { + "epoch": 0.4517625365544463, + "grad_norm": 0.6149612069129944, + "learning_rate": 1.2034401239879279e-05, + "loss": 1.0205, + "step": 31360 + }, + { + "epoch": 0.4519065934857456, + "grad_norm": 0.683622419834137, + "learning_rate": 1.2029832904809333e-05, + "loss": 1.0164, + "step": 31370 + }, + { + "epoch": 0.45205065041704484, + "grad_norm": 0.6892789602279663, + "learning_rate": 1.2025264127872764e-05, + "loss": 1.0329, + "step": 31380 + }, + { + "epoch": 0.45219470734834405, + "grad_norm": 0.5351040959358215, + "learning_rate": 1.2020694910064137e-05, + "loss": 1.0384, + "step": 31390 + }, + { + "epoch": 0.4523387642796433, + "grad_norm": 0.6014117002487183, + "learning_rate": 1.2016125252378098e-05, + "loss": 1.0268, + "step": 31400 + }, + { + "epoch": 0.4524828212109426, + "grad_norm": 0.6096225380897522, + "learning_rate": 1.2011555155809407e-05, + "loss": 1.0224, + "step": 31410 + }, + { + "epoch": 0.45262687814224184, + "grad_norm": 0.6388968229293823, + "learning_rate": 1.2006984621352906e-05, + "loss": 1.0193, + "step": 31420 + }, + { + "epoch": 0.45277093507354105, + "grad_norm": 0.5522310733795166, + "learning_rate": 1.2002413650003545e-05, + "loss": 1.0332, + "step": 31430 + }, + { + "epoch": 0.4529149920048403, + "grad_norm": 0.7236217260360718, + "learning_rate": 1.199784224275635e-05, + "loss": 1.0283, + "step": 31440 + }, + { + "epoch": 0.4530590489361396, + "grad_norm": 0.6079700589179993, + "learning_rate": 1.1993270400606462e-05, + "loss": 1.0207, + "step": 31450 + }, + { + "epoch": 0.45320310586743884, + "grad_norm": 0.4999813735485077, + "learning_rate": 1.1988698124549102e-05, + "loss": 1.0106, + "step": 31460 + }, + { + "epoch": 0.45334716279873805, + "grad_norm": 0.5409412384033203, + "learning_rate": 1.1984125415579596e-05, + "loss": 1.021, + "step": 31470 + }, + { + "epoch": 0.4534912197300373, + "grad_norm": 0.5420812964439392, + "learning_rate": 1.197955227469336e-05, + "loss": 1.0084, + "step": 31480 + }, + { + "epoch": 0.4536352766613366, + "grad_norm": 0.5634649991989136, + "learning_rate": 1.1974978702885898e-05, + "loss": 1.0361, + "step": 31490 + }, + { + "epoch": 0.45377933359263584, + "grad_norm": 0.7330309748649597, + "learning_rate": 1.1970404701152822e-05, + "loss": 1.0362, + "step": 31500 + }, + { + "epoch": 0.45392339052393504, + "grad_norm": 0.53804612159729, + "learning_rate": 1.196583027048982e-05, + "loss": 1.0525, + "step": 31510 + }, + { + "epoch": 0.4540674474552343, + "grad_norm": 0.5847958326339722, + "learning_rate": 1.196125541189269e-05, + "loss": 1.022, + "step": 31520 + }, + { + "epoch": 0.45421150438653357, + "grad_norm": 0.6122546792030334, + "learning_rate": 1.1956680126357308e-05, + "loss": 1.0401, + "step": 31530 + }, + { + "epoch": 0.45435556131783283, + "grad_norm": 0.6637943387031555, + "learning_rate": 1.1952104414879658e-05, + "loss": 1.0365, + "step": 31540 + }, + { + "epoch": 0.45449961824913204, + "grad_norm": 0.6475297212600708, + "learning_rate": 1.1947528278455807e-05, + "loss": 1.0001, + "step": 31550 + }, + { + "epoch": 0.4546436751804313, + "grad_norm": 0.6540977358818054, + "learning_rate": 1.1942951718081912e-05, + "loss": 1.0261, + "step": 31560 + }, + { + "epoch": 0.45478773211173057, + "grad_norm": 0.497886061668396, + "learning_rate": 1.1938374734754237e-05, + "loss": 1.0266, + "step": 31570 + }, + { + "epoch": 0.45493178904302983, + "grad_norm": 0.5904746055603027, + "learning_rate": 1.1933797329469117e-05, + "loss": 1.0093, + "step": 31580 + }, + { + "epoch": 0.45507584597432904, + "grad_norm": 0.5959451794624329, + "learning_rate": 1.1929219503222997e-05, + "loss": 1.0248, + "step": 31590 + }, + { + "epoch": 0.4552199029056283, + "grad_norm": 0.7836245894432068, + "learning_rate": 1.1924641257012403e-05, + "loss": 1.0199, + "step": 31600 + }, + { + "epoch": 0.45536395983692757, + "grad_norm": 0.6108161211013794, + "learning_rate": 1.192006259183396e-05, + "loss": 1.0467, + "step": 31610 + }, + { + "epoch": 0.4555080167682268, + "grad_norm": 0.5521235466003418, + "learning_rate": 1.191548350868438e-05, + "loss": 1.0296, + "step": 31620 + }, + { + "epoch": 0.45565207369952604, + "grad_norm": 0.5361019372940063, + "learning_rate": 1.191090400856046e-05, + "loss": 1.0242, + "step": 31630 + }, + { + "epoch": 0.4557961306308253, + "grad_norm": 0.5625245571136475, + "learning_rate": 1.19063240924591e-05, + "loss": 1.0098, + "step": 31640 + }, + { + "epoch": 0.45594018756212457, + "grad_norm": 0.5830609202384949, + "learning_rate": 1.1901743761377285e-05, + "loss": 1.0268, + "step": 31650 + }, + { + "epoch": 0.4560842444934238, + "grad_norm": 0.5644638538360596, + "learning_rate": 1.1897163016312085e-05, + "loss": 1.0124, + "step": 31660 + }, + { + "epoch": 0.45622830142472304, + "grad_norm": 0.5439152717590332, + "learning_rate": 1.1892581858260665e-05, + "loss": 0.9989, + "step": 31670 + }, + { + "epoch": 0.4563723583560223, + "grad_norm": 0.5330991148948669, + "learning_rate": 1.1888000288220288e-05, + "loss": 1.0315, + "step": 31680 + }, + { + "epoch": 0.45651641528732156, + "grad_norm": 0.6343188881874084, + "learning_rate": 1.1883418307188292e-05, + "loss": 1.0204, + "step": 31690 + }, + { + "epoch": 0.45666047221862077, + "grad_norm": 0.6865444183349609, + "learning_rate": 1.1878835916162111e-05, + "loss": 1.025, + "step": 31700 + }, + { + "epoch": 0.45680452914992004, + "grad_norm": 0.5597052574157715, + "learning_rate": 1.187425311613927e-05, + "loss": 1.0154, + "step": 31710 + }, + { + "epoch": 0.4569485860812193, + "grad_norm": 0.6149247288703918, + "learning_rate": 1.186966990811738e-05, + "loss": 1.0293, + "step": 31720 + }, + { + "epoch": 0.45709264301251856, + "grad_norm": 0.5930432081222534, + "learning_rate": 1.1865086293094146e-05, + "loss": 1.026, + "step": 31730 + }, + { + "epoch": 0.45723669994381777, + "grad_norm": 0.6211383938789368, + "learning_rate": 1.1860502272067352e-05, + "loss": 1.0215, + "step": 31740 + }, + { + "epoch": 0.45738075687511703, + "grad_norm": 0.6969831585884094, + "learning_rate": 1.1855917846034879e-05, + "loss": 1.035, + "step": 31750 + }, + { + "epoch": 0.4575248138064163, + "grad_norm": 0.7310178875923157, + "learning_rate": 1.1851333015994696e-05, + "loss": 1.032, + "step": 31760 + }, + { + "epoch": 0.45766887073771556, + "grad_norm": 0.5876138806343079, + "learning_rate": 1.1846747782944852e-05, + "loss": 1.0438, + "step": 31770 + }, + { + "epoch": 0.45781292766901477, + "grad_norm": 0.566362738609314, + "learning_rate": 1.1842162147883492e-05, + "loss": 1.034, + "step": 31780 + }, + { + "epoch": 0.45795698460031403, + "grad_norm": 0.45695582032203674, + "learning_rate": 1.1837576111808846e-05, + "loss": 1.0179, + "step": 31790 + }, + { + "epoch": 0.4581010415316133, + "grad_norm": 0.5838852524757385, + "learning_rate": 1.1832989675719231e-05, + "loss": 1.0158, + "step": 31800 + }, + { + "epoch": 0.45824509846291256, + "grad_norm": 0.563360333442688, + "learning_rate": 1.1828402840613045e-05, + "loss": 1.0113, + "step": 31810 + }, + { + "epoch": 0.45838915539421177, + "grad_norm": 0.5359305739402771, + "learning_rate": 1.1823815607488789e-05, + "loss": 1.0321, + "step": 31820 + }, + { + "epoch": 0.45853321232551103, + "grad_norm": 0.6095662713050842, + "learning_rate": 1.1819227977345032e-05, + "loss": 1.0093, + "step": 31830 + }, + { + "epoch": 0.4586772692568103, + "grad_norm": 0.5786446928977966, + "learning_rate": 1.1814639951180443e-05, + "loss": 1.015, + "step": 31840 + }, + { + "epoch": 0.45882132618810956, + "grad_norm": 0.6685968637466431, + "learning_rate": 1.1810051529993765e-05, + "loss": 1.0243, + "step": 31850 + }, + { + "epoch": 0.45896538311940877, + "grad_norm": 0.5776757597923279, + "learning_rate": 1.1805462714783842e-05, + "loss": 1.0309, + "step": 31860 + }, + { + "epoch": 0.45910944005070803, + "grad_norm": 0.5636850595474243, + "learning_rate": 1.180087350654959e-05, + "loss": 1.0263, + "step": 31870 + }, + { + "epoch": 0.4592534969820073, + "grad_norm": 0.5590022802352905, + "learning_rate": 1.179628390629002e-05, + "loss": 1.0022, + "step": 31880 + }, + { + "epoch": 0.45939755391330656, + "grad_norm": 0.6480368375778198, + "learning_rate": 1.1791693915004223e-05, + "loss": 1.0125, + "step": 31890 + }, + { + "epoch": 0.45954161084460576, + "grad_norm": 0.5446181297302246, + "learning_rate": 1.178710353369138e-05, + "loss": 1.038, + "step": 31900 + }, + { + "epoch": 0.459685667775905, + "grad_norm": 0.545379638671875, + "learning_rate": 1.1782512763350748e-05, + "loss": 1.019, + "step": 31910 + }, + { + "epoch": 0.4598297247072043, + "grad_norm": 0.5600593686103821, + "learning_rate": 1.1777921604981677e-05, + "loss": 1.002, + "step": 31920 + }, + { + "epoch": 0.45997378163850355, + "grad_norm": 0.5947697162628174, + "learning_rate": 1.17733300595836e-05, + "loss": 1.011, + "step": 31930 + }, + { + "epoch": 0.46011783856980276, + "grad_norm": 0.5739585757255554, + "learning_rate": 1.1768738128156033e-05, + "loss": 1.024, + "step": 31940 + }, + { + "epoch": 0.460261895501102, + "grad_norm": 0.5938307642936707, + "learning_rate": 1.1764145811698571e-05, + "loss": 1.0214, + "step": 31950 + }, + { + "epoch": 0.4604059524324013, + "grad_norm": 0.5985384583473206, + "learning_rate": 1.1759553111210906e-05, + "loss": 1.0321, + "step": 31960 + }, + { + "epoch": 0.46055000936370055, + "grad_norm": 0.4748883843421936, + "learning_rate": 1.1754960027692801e-05, + "loss": 1.0317, + "step": 31970 + }, + { + "epoch": 0.46069406629499976, + "grad_norm": 0.6445233225822449, + "learning_rate": 1.1750366562144109e-05, + "loss": 1.0444, + "step": 31980 + }, + { + "epoch": 0.460838123226299, + "grad_norm": 0.5735971331596375, + "learning_rate": 1.174577271556476e-05, + "loss": 1.0375, + "step": 31990 + }, + { + "epoch": 0.4609821801575983, + "grad_norm": 0.597209632396698, + "learning_rate": 1.1741178488954777e-05, + "loss": 1.0271, + "step": 32000 + }, + { + "epoch": 0.46112623708889755, + "grad_norm": 0.5384397506713867, + "learning_rate": 1.1736583883314254e-05, + "loss": 1.0414, + "step": 32010 + }, + { + "epoch": 0.46127029402019676, + "grad_norm": 0.6700142025947571, + "learning_rate": 1.1731988899643374e-05, + "loss": 1.0253, + "step": 32020 + }, + { + "epoch": 0.461414350951496, + "grad_norm": 0.5831916928291321, + "learning_rate": 1.1727393538942407e-05, + "loss": 1.0199, + "step": 32030 + }, + { + "epoch": 0.4615584078827953, + "grad_norm": 0.49577924609184265, + "learning_rate": 1.1722797802211693e-05, + "loss": 1.0225, + "step": 32040 + }, + { + "epoch": 0.46170246481409455, + "grad_norm": 0.5894416570663452, + "learning_rate": 1.171820169045167e-05, + "loss": 1.0188, + "step": 32050 + }, + { + "epoch": 0.46184652174539376, + "grad_norm": 0.5564287900924683, + "learning_rate": 1.1713605204662834e-05, + "loss": 1.0465, + "step": 32060 + }, + { + "epoch": 0.461990578676693, + "grad_norm": 0.6189201474189758, + "learning_rate": 1.1709008345845786e-05, + "loss": 1.0252, + "step": 32070 + }, + { + "epoch": 0.4621346356079923, + "grad_norm": 0.5556772351264954, + "learning_rate": 1.17044111150012e-05, + "loss": 1.0326, + "step": 32080 + }, + { + "epoch": 0.46227869253929155, + "grad_norm": 0.6864781379699707, + "learning_rate": 1.1699813513129823e-05, + "loss": 1.0486, + "step": 32090 + }, + { + "epoch": 0.46242274947059075, + "grad_norm": 0.580782413482666, + "learning_rate": 1.1695215541232497e-05, + "loss": 1.021, + "step": 32100 + }, + { + "epoch": 0.46256680640189, + "grad_norm": 0.6461674571037292, + "learning_rate": 1.1690617200310128e-05, + "loss": 1.0224, + "step": 32110 + }, + { + "epoch": 0.4627108633331893, + "grad_norm": 0.533871054649353, + "learning_rate": 1.1686018491363722e-05, + "loss": 1.0225, + "step": 32120 + }, + { + "epoch": 0.46285492026448855, + "grad_norm": 0.6044811010360718, + "learning_rate": 1.1681419415394344e-05, + "loss": 1.0316, + "step": 32130 + }, + { + "epoch": 0.46299897719578775, + "grad_norm": 0.6584555506706238, + "learning_rate": 1.1676819973403158e-05, + "loss": 1.0336, + "step": 32140 + }, + { + "epoch": 0.463143034127087, + "grad_norm": 0.6285313963890076, + "learning_rate": 1.1672220166391392e-05, + "loss": 1.0362, + "step": 32150 + }, + { + "epoch": 0.4632870910583863, + "grad_norm": 0.5741084814071655, + "learning_rate": 1.1667619995360368e-05, + "loss": 1.0185, + "step": 32160 + }, + { + "epoch": 0.46343114798968554, + "grad_norm": 0.5344382524490356, + "learning_rate": 1.1663019461311475e-05, + "loss": 1.0215, + "step": 32170 + }, + { + "epoch": 0.46357520492098475, + "grad_norm": 0.6228933334350586, + "learning_rate": 1.1658418565246183e-05, + "loss": 1.043, + "step": 32180 + }, + { + "epoch": 0.463719261852284, + "grad_norm": 0.7229593992233276, + "learning_rate": 1.1653817308166055e-05, + "loss": 0.9985, + "step": 32190 + }, + { + "epoch": 0.4638633187835833, + "grad_norm": 0.56514972448349, + "learning_rate": 1.1649215691072706e-05, + "loss": 1.0177, + "step": 32200 + }, + { + "epoch": 0.46400737571488254, + "grad_norm": 0.5896420478820801, + "learning_rate": 1.1644613714967857e-05, + "loss": 1.0303, + "step": 32210 + }, + { + "epoch": 0.46415143264618175, + "grad_norm": 0.5764533877372742, + "learning_rate": 1.1640011380853285e-05, + "loss": 1.0082, + "step": 32220 + }, + { + "epoch": 0.464295489577481, + "grad_norm": 0.6431474685668945, + "learning_rate": 1.1635408689730862e-05, + "loss": 1.0407, + "step": 32230 + }, + { + "epoch": 0.4644395465087803, + "grad_norm": 0.5689139366149902, + "learning_rate": 1.1630805642602527e-05, + "loss": 1.0358, + "step": 32240 + }, + { + "epoch": 0.46458360344007954, + "grad_norm": 0.530646026134491, + "learning_rate": 1.1626202240470302e-05, + "loss": 1.0332, + "step": 32250 + }, + { + "epoch": 0.46472766037137875, + "grad_norm": 0.642479658126831, + "learning_rate": 1.162159848433628e-05, + "loss": 1.033, + "step": 32260 + }, + { + "epoch": 0.464871717302678, + "grad_norm": 0.5798544883728027, + "learning_rate": 1.1616994375202642e-05, + "loss": 1.0399, + "step": 32270 + }, + { + "epoch": 0.4650157742339773, + "grad_norm": 0.5376886129379272, + "learning_rate": 1.1612389914071633e-05, + "loss": 1.0265, + "step": 32280 + }, + { + "epoch": 0.46515983116527654, + "grad_norm": 0.6260908842086792, + "learning_rate": 1.1607785101945582e-05, + "loss": 1.0245, + "step": 32290 + }, + { + "epoch": 0.46530388809657575, + "grad_norm": 0.597516655921936, + "learning_rate": 1.1603179939826896e-05, + "loss": 1.0122, + "step": 32300 + }, + { + "epoch": 0.465447945027875, + "grad_norm": 0.5453872680664062, + "learning_rate": 1.1598574428718049e-05, + "loss": 1.0166, + "step": 32310 + }, + { + "epoch": 0.4655920019591743, + "grad_norm": 0.5635013580322266, + "learning_rate": 1.1593968569621604e-05, + "loss": 1.0116, + "step": 32320 + }, + { + "epoch": 0.46573605889047354, + "grad_norm": 0.6421080827713013, + "learning_rate": 1.1589362363540189e-05, + "loss": 1.023, + "step": 32330 + }, + { + "epoch": 0.46588011582177274, + "grad_norm": 0.7370414137840271, + "learning_rate": 1.1584755811476511e-05, + "loss": 1.047, + "step": 32340 + }, + { + "epoch": 0.466024172753072, + "grad_norm": 0.6795259714126587, + "learning_rate": 1.1580148914433359e-05, + "loss": 1.0235, + "step": 32350 + }, + { + "epoch": 0.46616822968437127, + "grad_norm": 0.5798462629318237, + "learning_rate": 1.1575541673413582e-05, + "loss": 1.0129, + "step": 32360 + }, + { + "epoch": 0.46631228661567053, + "grad_norm": 0.6684741973876953, + "learning_rate": 1.157093408942012e-05, + "loss": 1.0319, + "step": 32370 + }, + { + "epoch": 0.46645634354696974, + "grad_norm": 0.53980952501297, + "learning_rate": 1.1566326163455979e-05, + "loss": 1.0186, + "step": 32380 + }, + { + "epoch": 0.466600400478269, + "grad_norm": 0.6894479990005493, + "learning_rate": 1.1561717896524238e-05, + "loss": 1.0438, + "step": 32390 + }, + { + "epoch": 0.46674445740956827, + "grad_norm": 0.6048723459243774, + "learning_rate": 1.1557109289628052e-05, + "loss": 1.0076, + "step": 32400 + }, + { + "epoch": 0.46688851434086753, + "grad_norm": 0.7372193932533264, + "learning_rate": 1.1552500343770658e-05, + "loss": 1.029, + "step": 32410 + }, + { + "epoch": 0.46703257127216674, + "grad_norm": 0.5819730758666992, + "learning_rate": 1.1547891059955356e-05, + "loss": 1.012, + "step": 32420 + }, + { + "epoch": 0.467176628203466, + "grad_norm": 0.6845980882644653, + "learning_rate": 1.154328143918552e-05, + "loss": 1.0259, + "step": 32430 + }, + { + "epoch": 0.46732068513476527, + "grad_norm": 0.6390533447265625, + "learning_rate": 1.1538671482464608e-05, + "loss": 1.0455, + "step": 32440 + }, + { + "epoch": 0.46746474206606453, + "grad_norm": 0.6333168745040894, + "learning_rate": 1.1534061190796137e-05, + "loss": 1.0301, + "step": 32450 + }, + { + "epoch": 0.46760879899736374, + "grad_norm": 0.5310613512992859, + "learning_rate": 1.152945056518371e-05, + "loss": 1.0354, + "step": 32460 + }, + { + "epoch": 0.467752855928663, + "grad_norm": 0.5582602620124817, + "learning_rate": 1.152483960663099e-05, + "loss": 1.0252, + "step": 32470 + }, + { + "epoch": 0.46789691285996227, + "grad_norm": 0.5743705630302429, + "learning_rate": 1.1520228316141729e-05, + "loss": 1.0403, + "step": 32480 + }, + { + "epoch": 0.46804096979126153, + "grad_norm": 0.5354068875312805, + "learning_rate": 1.151561669471973e-05, + "loss": 1.0301, + "step": 32490 + }, + { + "epoch": 0.46818502672256074, + "grad_norm": 0.6218494772911072, + "learning_rate": 1.1511004743368886e-05, + "loss": 1.0412, + "step": 32500 + }, + { + "epoch": 0.46832908365386, + "grad_norm": 0.5262079834938049, + "learning_rate": 1.1506392463093155e-05, + "loss": 1.0183, + "step": 32510 + }, + { + "epoch": 0.46847314058515926, + "grad_norm": 0.7889408469200134, + "learning_rate": 1.1501779854896564e-05, + "loss": 1.0526, + "step": 32520 + }, + { + "epoch": 0.46861719751645853, + "grad_norm": 0.671790361404419, + "learning_rate": 1.1497166919783216e-05, + "loss": 1.0363, + "step": 32530 + }, + { + "epoch": 0.46876125444775774, + "grad_norm": 0.6060637831687927, + "learning_rate": 1.1492553658757281e-05, + "loss": 1.0227, + "step": 32540 + }, + { + "epoch": 0.468905311379057, + "grad_norm": 0.5498833060264587, + "learning_rate": 1.1487940072823007e-05, + "loss": 1.0177, + "step": 32550 + }, + { + "epoch": 0.46904936831035626, + "grad_norm": 0.5617979764938354, + "learning_rate": 1.1483326162984704e-05, + "loss": 1.0172, + "step": 32560 + }, + { + "epoch": 0.4691934252416555, + "grad_norm": 0.5950319170951843, + "learning_rate": 1.1478711930246757e-05, + "loss": 1.0409, + "step": 32570 + }, + { + "epoch": 0.46933748217295473, + "grad_norm": 0.6482478976249695, + "learning_rate": 1.1474097375613624e-05, + "loss": 0.9901, + "step": 32580 + }, + { + "epoch": 0.469481539104254, + "grad_norm": 0.533887505531311, + "learning_rate": 1.1469482500089826e-05, + "loss": 1.0164, + "step": 32590 + }, + { + "epoch": 0.46962559603555326, + "grad_norm": 0.5833592414855957, + "learning_rate": 1.1464867304679961e-05, + "loss": 1.0464, + "step": 32600 + }, + { + "epoch": 0.4697696529668525, + "grad_norm": 0.5561684370040894, + "learning_rate": 1.1460251790388688e-05, + "loss": 1.0359, + "step": 32610 + }, + { + "epoch": 0.46991370989815173, + "grad_norm": 0.702983558177948, + "learning_rate": 1.1455635958220748e-05, + "loss": 1.0174, + "step": 32620 + }, + { + "epoch": 0.470057766829451, + "grad_norm": 0.575308620929718, + "learning_rate": 1.1451019809180938e-05, + "loss": 1.0305, + "step": 32630 + }, + { + "epoch": 0.47020182376075026, + "grad_norm": 0.6303951144218445, + "learning_rate": 1.1446403344274133e-05, + "loss": 1.0359, + "step": 32640 + }, + { + "epoch": 0.4703458806920495, + "grad_norm": 0.6195650100708008, + "learning_rate": 1.1441786564505275e-05, + "loss": 1.0303, + "step": 32650 + }, + { + "epoch": 0.47048993762334873, + "grad_norm": 0.5987343192100525, + "learning_rate": 1.1437169470879368e-05, + "loss": 1.0285, + "step": 32660 + }, + { + "epoch": 0.470633994554648, + "grad_norm": 0.589579164981842, + "learning_rate": 1.1432552064401497e-05, + "loss": 1.0139, + "step": 32670 + }, + { + "epoch": 0.47077805148594726, + "grad_norm": 0.5205335021018982, + "learning_rate": 1.14279343460768e-05, + "loss": 1.0344, + "step": 32680 + }, + { + "epoch": 0.4709221084172465, + "grad_norm": 0.5689437985420227, + "learning_rate": 1.1423316316910496e-05, + "loss": 1.0214, + "step": 32690 + }, + { + "epoch": 0.47106616534854573, + "grad_norm": 0.5881633758544922, + "learning_rate": 1.1418697977907862e-05, + "loss": 1.022, + "step": 32700 + }, + { + "epoch": 0.471210222279845, + "grad_norm": 0.582982063293457, + "learning_rate": 1.1414079330074252e-05, + "loss": 1.0009, + "step": 32710 + }, + { + "epoch": 0.47135427921114426, + "grad_norm": 0.6023359298706055, + "learning_rate": 1.140946037441508e-05, + "loss": 1.0255, + "step": 32720 + }, + { + "epoch": 0.4714983361424435, + "grad_norm": 0.6213319301605225, + "learning_rate": 1.1404841111935824e-05, + "loss": 1.0354, + "step": 32730 + }, + { + "epoch": 0.4716423930737427, + "grad_norm": 0.5460777878761292, + "learning_rate": 1.1400221543642043e-05, + "loss": 1.0209, + "step": 32740 + }, + { + "epoch": 0.471786450005042, + "grad_norm": 0.6849192380905151, + "learning_rate": 1.1395601670539344e-05, + "loss": 1.0097, + "step": 32750 + }, + { + "epoch": 0.47193050693634125, + "grad_norm": 0.5484505891799927, + "learning_rate": 1.1390981493633415e-05, + "loss": 1.0063, + "step": 32760 + }, + { + "epoch": 0.4720745638676405, + "grad_norm": 0.6390265226364136, + "learning_rate": 1.1386361013930002e-05, + "loss": 1.0419, + "step": 32770 + }, + { + "epoch": 0.4722186207989397, + "grad_norm": 0.6577069163322449, + "learning_rate": 1.1381740232434925e-05, + "loss": 1.0162, + "step": 32780 + }, + { + "epoch": 0.472362677730239, + "grad_norm": 0.5977217555046082, + "learning_rate": 1.1377119150154058e-05, + "loss": 1.022, + "step": 32790 + }, + { + "epoch": 0.47250673466153825, + "grad_norm": 0.5634925365447998, + "learning_rate": 1.137249776809335e-05, + "loss": 1.0225, + "step": 32800 + }, + { + "epoch": 0.4726507915928375, + "grad_norm": 0.5434150099754333, + "learning_rate": 1.1367876087258806e-05, + "loss": 1.0155, + "step": 32810 + }, + { + "epoch": 0.4727948485241367, + "grad_norm": 0.5795437693595886, + "learning_rate": 1.1363254108656513e-05, + "loss": 1.0211, + "step": 32820 + }, + { + "epoch": 0.472938905455436, + "grad_norm": 0.6080933213233948, + "learning_rate": 1.1358631833292609e-05, + "loss": 1.0364, + "step": 32830 + }, + { + "epoch": 0.47308296238673525, + "grad_norm": 0.5317662954330444, + "learning_rate": 1.1354009262173292e-05, + "loss": 1.0074, + "step": 32840 + }, + { + "epoch": 0.4732270193180345, + "grad_norm": 0.686359703540802, + "learning_rate": 1.1349386396304842e-05, + "loss": 1.0229, + "step": 32850 + }, + { + "epoch": 0.4733710762493337, + "grad_norm": 0.6074622869491577, + "learning_rate": 1.1344763236693583e-05, + "loss": 1.0143, + "step": 32860 + }, + { + "epoch": 0.473515133180633, + "grad_norm": 0.6043309569358826, + "learning_rate": 1.1340139784345923e-05, + "loss": 1.0402, + "step": 32870 + }, + { + "epoch": 0.47365919011193225, + "grad_norm": 0.5316594243049622, + "learning_rate": 1.1335516040268315e-05, + "loss": 1.0117, + "step": 32880 + }, + { + "epoch": 0.4738032470432315, + "grad_norm": 0.5331855416297913, + "learning_rate": 1.1330892005467293e-05, + "loss": 0.9849, + "step": 32890 + }, + { + "epoch": 0.4739473039745307, + "grad_norm": 0.5734590888023376, + "learning_rate": 1.132626768094944e-05, + "loss": 1.0353, + "step": 32900 + }, + { + "epoch": 0.47409136090583, + "grad_norm": 0.5718367099761963, + "learning_rate": 1.1321643067721407e-05, + "loss": 1.0156, + "step": 32910 + }, + { + "epoch": 0.47423541783712925, + "grad_norm": 0.5481789708137512, + "learning_rate": 1.1317018166789911e-05, + "loss": 1.0032, + "step": 32920 + }, + { + "epoch": 0.4743794747684285, + "grad_norm": 0.5542817115783691, + "learning_rate": 1.1312392979161729e-05, + "loss": 1.0224, + "step": 32930 + }, + { + "epoch": 0.4745235316997277, + "grad_norm": 0.5004504323005676, + "learning_rate": 1.1307767505843699e-05, + "loss": 1.0404, + "step": 32940 + }, + { + "epoch": 0.474667588631027, + "grad_norm": 0.5400373339653015, + "learning_rate": 1.130314174784272e-05, + "loss": 1.0238, + "step": 32950 + }, + { + "epoch": 0.47481164556232625, + "grad_norm": 0.5386858582496643, + "learning_rate": 1.1298515706165764e-05, + "loss": 1.009, + "step": 32960 + }, + { + "epoch": 0.47495570249362545, + "grad_norm": 0.6059908866882324, + "learning_rate": 1.1293889381819854e-05, + "loss": 1.0092, + "step": 32970 + }, + { + "epoch": 0.4750997594249247, + "grad_norm": 0.5509114861488342, + "learning_rate": 1.1289262775812069e-05, + "loss": 1.0033, + "step": 32980 + }, + { + "epoch": 0.475243816356224, + "grad_norm": 0.4810161888599396, + "learning_rate": 1.1284635889149566e-05, + "loss": 1.0172, + "step": 32990 + }, + { + "epoch": 0.47538787328752324, + "grad_norm": 0.5593041777610779, + "learning_rate": 1.1280008722839552e-05, + "loss": 1.0277, + "step": 33000 + }, + { + "epoch": 0.47553193021882245, + "grad_norm": 0.5800256133079529, + "learning_rate": 1.1275381277889298e-05, + "loss": 1.013, + "step": 33010 + }, + { + "epoch": 0.4756759871501217, + "grad_norm": 0.6317630410194397, + "learning_rate": 1.1270753555306129e-05, + "loss": 1.0329, + "step": 33020 + }, + { + "epoch": 0.475820044081421, + "grad_norm": 0.5637384057044983, + "learning_rate": 1.1266125556097444e-05, + "loss": 1.0178, + "step": 33030 + }, + { + "epoch": 0.47596410101272024, + "grad_norm": 0.5414950251579285, + "learning_rate": 1.1261497281270693e-05, + "loss": 1.0151, + "step": 33040 + }, + { + "epoch": 0.47610815794401945, + "grad_norm": 0.6262162327766418, + "learning_rate": 1.1256868731833383e-05, + "loss": 1.017, + "step": 33050 + }, + { + "epoch": 0.4762522148753187, + "grad_norm": 0.6255475878715515, + "learning_rate": 1.1252239908793093e-05, + "loss": 1.0204, + "step": 33060 + }, + { + "epoch": 0.476396271806618, + "grad_norm": 0.6530277132987976, + "learning_rate": 1.1247610813157446e-05, + "loss": 1.0071, + "step": 33070 + }, + { + "epoch": 0.47654032873791724, + "grad_norm": 0.6401610970497131, + "learning_rate": 1.1242981445934138e-05, + "loss": 1.0194, + "step": 33080 + }, + { + "epoch": 0.47668438566921645, + "grad_norm": 0.643420398235321, + "learning_rate": 1.1238351808130911e-05, + "loss": 1.0111, + "step": 33090 + }, + { + "epoch": 0.4768284426005157, + "grad_norm": 0.5949244499206543, + "learning_rate": 1.1233721900755583e-05, + "loss": 1.0173, + "step": 33100 + }, + { + "epoch": 0.476972499531815, + "grad_norm": 0.5837469696998596, + "learning_rate": 1.1229091724816016e-05, + "loss": 1.0326, + "step": 33110 + }, + { + "epoch": 0.47711655646311424, + "grad_norm": 0.5850193500518799, + "learning_rate": 1.1224461281320136e-05, + "loss": 1.0177, + "step": 33120 + }, + { + "epoch": 0.47726061339441345, + "grad_norm": 0.5880247950553894, + "learning_rate": 1.1219830571275928e-05, + "loss": 1.0042, + "step": 33130 + }, + { + "epoch": 0.4774046703257127, + "grad_norm": 0.5486767292022705, + "learning_rate": 1.1215199595691425e-05, + "loss": 1.0246, + "step": 33140 + }, + { + "epoch": 0.477548727257012, + "grad_norm": 0.6419997811317444, + "learning_rate": 1.1210568355574743e-05, + "loss": 1.0189, + "step": 33150 + }, + { + "epoch": 0.47769278418831124, + "grad_norm": 0.6350610852241516, + "learning_rate": 1.1205936851934025e-05, + "loss": 1.0442, + "step": 33160 + }, + { + "epoch": 0.47783684111961044, + "grad_norm": 0.6184735894203186, + "learning_rate": 1.120130508577749e-05, + "loss": 1.0016, + "step": 33170 + }, + { + "epoch": 0.4779808980509097, + "grad_norm": 0.6295711994171143, + "learning_rate": 1.1196673058113413e-05, + "loss": 1.0218, + "step": 33180 + }, + { + "epoch": 0.47812495498220897, + "grad_norm": 0.5689663887023926, + "learning_rate": 1.1192040769950115e-05, + "loss": 1.0158, + "step": 33190 + }, + { + "epoch": 0.47826901191350824, + "grad_norm": 0.7592531442642212, + "learning_rate": 1.1187408222295988e-05, + "loss": 1.0292, + "step": 33200 + }, + { + "epoch": 0.47841306884480744, + "grad_norm": 0.5784085988998413, + "learning_rate": 1.118277541615947e-05, + "loss": 1.0266, + "step": 33210 + }, + { + "epoch": 0.4785571257761067, + "grad_norm": 0.637199878692627, + "learning_rate": 1.1178142352549064e-05, + "loss": 1.0219, + "step": 33220 + }, + { + "epoch": 0.47870118270740597, + "grad_norm": 0.5977768301963806, + "learning_rate": 1.1173509032473316e-05, + "loss": 1.0314, + "step": 33230 + }, + { + "epoch": 0.47884523963870523, + "grad_norm": 0.5544858574867249, + "learning_rate": 1.1168875456940842e-05, + "loss": 1.0043, + "step": 33240 + }, + { + "epoch": 0.47898929657000444, + "grad_norm": 0.5969414114952087, + "learning_rate": 1.1164241626960304e-05, + "loss": 1.0297, + "step": 33250 + }, + { + "epoch": 0.4791333535013037, + "grad_norm": 0.6712751984596252, + "learning_rate": 1.1159607543540429e-05, + "loss": 1.0134, + "step": 33260 + }, + { + "epoch": 0.47927741043260297, + "grad_norm": 0.6637882590293884, + "learning_rate": 1.1154973207689985e-05, + "loss": 1.0377, + "step": 33270 + }, + { + "epoch": 0.47942146736390223, + "grad_norm": 0.5263668298721313, + "learning_rate": 1.1150338620417806e-05, + "loss": 1.0034, + "step": 33280 + }, + { + "epoch": 0.47956552429520144, + "grad_norm": 0.6545845866203308, + "learning_rate": 1.1145703782732781e-05, + "loss": 1.029, + "step": 33290 + }, + { + "epoch": 0.4797095812265007, + "grad_norm": 0.5800336599349976, + "learning_rate": 1.1141068695643845e-05, + "loss": 1.0074, + "step": 33300 + }, + { + "epoch": 0.47985363815779997, + "grad_norm": 0.5339868068695068, + "learning_rate": 1.1136433360159996e-05, + "loss": 1.0223, + "step": 33310 + }, + { + "epoch": 0.47999769508909923, + "grad_norm": 0.6196845173835754, + "learning_rate": 1.1131797777290281e-05, + "loss": 1.036, + "step": 33320 + }, + { + "epoch": 0.48014175202039844, + "grad_norm": 0.6900539994239807, + "learning_rate": 1.1127161948043806e-05, + "loss": 1.024, + "step": 33330 + }, + { + "epoch": 0.4802858089516977, + "grad_norm": 0.635518491268158, + "learning_rate": 1.1122525873429718e-05, + "loss": 1.0273, + "step": 33340 + }, + { + "epoch": 0.48042986588299696, + "grad_norm": 0.5423782467842102, + "learning_rate": 1.1117889554457238e-05, + "loss": 1.0158, + "step": 33350 + }, + { + "epoch": 0.48057392281429623, + "grad_norm": 1.0638518333435059, + "learning_rate": 1.111325299213562e-05, + "loss": 1.0243, + "step": 33360 + }, + { + "epoch": 0.48071797974559544, + "grad_norm": 0.6649375557899475, + "learning_rate": 1.1108616187474186e-05, + "loss": 1.0199, + "step": 33370 + }, + { + "epoch": 0.4808620366768947, + "grad_norm": 0.5941753387451172, + "learning_rate": 1.11039791414823e-05, + "loss": 1.0284, + "step": 33380 + }, + { + "epoch": 0.48100609360819396, + "grad_norm": 0.5361167192459106, + "learning_rate": 1.1099341855169383e-05, + "loss": 1.0069, + "step": 33390 + }, + { + "epoch": 0.4811501505394932, + "grad_norm": 0.5737732648849487, + "learning_rate": 1.1094704329544914e-05, + "loss": 0.9989, + "step": 33400 + }, + { + "epoch": 0.48129420747079243, + "grad_norm": 0.6742610931396484, + "learning_rate": 1.109006656561841e-05, + "loss": 1.0337, + "step": 33410 + }, + { + "epoch": 0.4814382644020917, + "grad_norm": 0.5616894960403442, + "learning_rate": 1.1085428564399453e-05, + "loss": 1.0373, + "step": 33420 + }, + { + "epoch": 0.48158232133339096, + "grad_norm": 0.5926318764686584, + "learning_rate": 1.1080790326897668e-05, + "loss": 1.0238, + "step": 33430 + }, + { + "epoch": 0.4817263782646902, + "grad_norm": 0.5741323232650757, + "learning_rate": 1.107615185412274e-05, + "loss": 1.0243, + "step": 33440 + }, + { + "epoch": 0.48187043519598943, + "grad_norm": 0.5415228009223938, + "learning_rate": 1.1071513147084403e-05, + "loss": 1.0177, + "step": 33450 + }, + { + "epoch": 0.4820144921272887, + "grad_norm": 0.6227560043334961, + "learning_rate": 1.1066874206792431e-05, + "loss": 1.002, + "step": 33460 + }, + { + "epoch": 0.48215854905858796, + "grad_norm": 0.6032860279083252, + "learning_rate": 1.1062235034256663e-05, + "loss": 1.0171, + "step": 33470 + }, + { + "epoch": 0.4823026059898872, + "grad_norm": 0.5680851340293884, + "learning_rate": 1.1057595630486984e-05, + "loss": 1.0107, + "step": 33480 + }, + { + "epoch": 0.48244666292118643, + "grad_norm": 0.7578117251396179, + "learning_rate": 1.1052955996493321e-05, + "loss": 1.0253, + "step": 33490 + }, + { + "epoch": 0.4825907198524857, + "grad_norm": 0.5712920427322388, + "learning_rate": 1.1048316133285667e-05, + "loss": 1.0229, + "step": 33500 + }, + { + "epoch": 0.48273477678378496, + "grad_norm": 0.6681069731712341, + "learning_rate": 1.104367604187405e-05, + "loss": 1.0232, + "step": 33510 + }, + { + "epoch": 0.4828788337150842, + "grad_norm": 0.6597923040390015, + "learning_rate": 1.1039035723268558e-05, + "loss": 1.0236, + "step": 33520 + }, + { + "epoch": 0.48302289064638343, + "grad_norm": 0.5945020318031311, + "learning_rate": 1.103439517847932e-05, + "loss": 1.0021, + "step": 33530 + }, + { + "epoch": 0.4831669475776827, + "grad_norm": 0.6446981430053711, + "learning_rate": 1.1029754408516525e-05, + "loss": 1.0425, + "step": 33540 + }, + { + "epoch": 0.48331100450898196, + "grad_norm": 0.5626109838485718, + "learning_rate": 1.1025113414390398e-05, + "loss": 0.9943, + "step": 33550 + }, + { + "epoch": 0.4834550614402812, + "grad_norm": 0.680046021938324, + "learning_rate": 1.1020472197111224e-05, + "loss": 1.041, + "step": 33560 + }, + { + "epoch": 0.4835991183715804, + "grad_norm": 0.5631656050682068, + "learning_rate": 1.1015830757689327e-05, + "loss": 1.0309, + "step": 33570 + }, + { + "epoch": 0.4837431753028797, + "grad_norm": 0.5655036568641663, + "learning_rate": 1.101118909713509e-05, + "loss": 1.0381, + "step": 33580 + }, + { + "epoch": 0.48388723223417895, + "grad_norm": 0.556452214717865, + "learning_rate": 1.1006547216458937e-05, + "loss": 1.0099, + "step": 33590 + }, + { + "epoch": 0.4840312891654782, + "grad_norm": 0.6915555596351624, + "learning_rate": 1.1001905116671334e-05, + "loss": 1.0386, + "step": 33600 + }, + { + "epoch": 0.4841753460967774, + "grad_norm": 0.5771707892417908, + "learning_rate": 1.0997262798782815e-05, + "loss": 1.0076, + "step": 33610 + }, + { + "epoch": 0.4843194030280767, + "grad_norm": 0.9634688496589661, + "learning_rate": 1.099262026380394e-05, + "loss": 1.0201, + "step": 33620 + }, + { + "epoch": 0.48446345995937595, + "grad_norm": 0.657273530960083, + "learning_rate": 1.0987977512745327e-05, + "loss": 1.046, + "step": 33630 + }, + { + "epoch": 0.4846075168906752, + "grad_norm": 0.49862417578697205, + "learning_rate": 1.0983334546617637e-05, + "loss": 1.0432, + "step": 33640 + }, + { + "epoch": 0.4847515738219744, + "grad_norm": 0.6185719966888428, + "learning_rate": 1.0978691366431583e-05, + "loss": 1.0118, + "step": 33650 + }, + { + "epoch": 0.4848956307532737, + "grad_norm": 0.5877535939216614, + "learning_rate": 1.097404797319792e-05, + "loss": 1.0326, + "step": 33660 + }, + { + "epoch": 0.48503968768457295, + "grad_norm": 0.5418586730957031, + "learning_rate": 1.0969404367927448e-05, + "loss": 1.0337, + "step": 33670 + }, + { + "epoch": 0.4851837446158722, + "grad_norm": 0.5355377197265625, + "learning_rate": 1.096476055163102e-05, + "loss": 1.031, + "step": 33680 + }, + { + "epoch": 0.4853278015471714, + "grad_norm": 0.5953190326690674, + "learning_rate": 1.0960116525319526e-05, + "loss": 1.0469, + "step": 33690 + }, + { + "epoch": 0.4854718584784707, + "grad_norm": 0.621695876121521, + "learning_rate": 1.0955472290003914e-05, + "loss": 1.0275, + "step": 33700 + }, + { + "epoch": 0.48561591540976995, + "grad_norm": 0.5835665464401245, + "learning_rate": 1.0950827846695161e-05, + "loss": 1.0165, + "step": 33710 + }, + { + "epoch": 0.4857599723410692, + "grad_norm": 0.6365586519241333, + "learning_rate": 1.0946183196404305e-05, + "loss": 1.0236, + "step": 33720 + }, + { + "epoch": 0.4859040292723684, + "grad_norm": 0.6362375617027283, + "learning_rate": 1.0941538340142418e-05, + "loss": 1.0102, + "step": 33730 + }, + { + "epoch": 0.4860480862036677, + "grad_norm": 0.7200132608413696, + "learning_rate": 1.0936893278920626e-05, + "loss": 1.0244, + "step": 33740 + }, + { + "epoch": 0.48619214313496695, + "grad_norm": 0.6358307600021362, + "learning_rate": 1.0932248013750089e-05, + "loss": 1.0335, + "step": 33750 + }, + { + "epoch": 0.4863362000662662, + "grad_norm": 0.6214737892150879, + "learning_rate": 1.0927602545642018e-05, + "loss": 1.0381, + "step": 33760 + }, + { + "epoch": 0.4864802569975654, + "grad_norm": 0.5125634074211121, + "learning_rate": 1.0922956875607673e-05, + "loss": 1.03, + "step": 33770 + }, + { + "epoch": 0.4866243139288647, + "grad_norm": 0.6336681842803955, + "learning_rate": 1.0918311004658346e-05, + "loss": 1.03, + "step": 33780 + }, + { + "epoch": 0.48676837086016395, + "grad_norm": 0.5649718642234802, + "learning_rate": 1.0913664933805381e-05, + "loss": 0.9968, + "step": 33790 + }, + { + "epoch": 0.4869124277914632, + "grad_norm": 0.5822668075561523, + "learning_rate": 1.0909018664060164e-05, + "loss": 1.0357, + "step": 33800 + }, + { + "epoch": 0.4870564847227624, + "grad_norm": 0.5345937609672546, + "learning_rate": 1.0904372196434127e-05, + "loss": 1.0025, + "step": 33810 + }, + { + "epoch": 0.4872005416540617, + "grad_norm": 0.5745434761047363, + "learning_rate": 1.0899725531938736e-05, + "loss": 1.0289, + "step": 33820 + }, + { + "epoch": 0.48734459858536094, + "grad_norm": 0.5992249250411987, + "learning_rate": 1.089507867158551e-05, + "loss": 1.0441, + "step": 33830 + }, + { + "epoch": 0.4874886555166602, + "grad_norm": 0.6297547221183777, + "learning_rate": 1.0890431616386004e-05, + "loss": 1.0193, + "step": 33840 + }, + { + "epoch": 0.4876327124479594, + "grad_norm": 0.5767676830291748, + "learning_rate": 1.0885784367351817e-05, + "loss": 1.0103, + "step": 33850 + }, + { + "epoch": 0.4877767693792587, + "grad_norm": 0.5850902199745178, + "learning_rate": 1.0881136925494593e-05, + "loss": 1.0475, + "step": 33860 + }, + { + "epoch": 0.48792082631055794, + "grad_norm": 0.5311706066131592, + "learning_rate": 1.0876489291826015e-05, + "loss": 1.0303, + "step": 33870 + }, + { + "epoch": 0.4880648832418572, + "grad_norm": 0.5796907544136047, + "learning_rate": 1.0871841467357813e-05, + "loss": 1.017, + "step": 33880 + }, + { + "epoch": 0.4882089401731564, + "grad_norm": 0.6047660708427429, + "learning_rate": 1.0867193453101748e-05, + "loss": 1.0114, + "step": 33890 + }, + { + "epoch": 0.4883529971044557, + "grad_norm": 0.7163272500038147, + "learning_rate": 1.0862545250069631e-05, + "loss": 1.0197, + "step": 33900 + }, + { + "epoch": 0.48849705403575494, + "grad_norm": 0.6854219436645508, + "learning_rate": 1.085789685927331e-05, + "loss": 1.0231, + "step": 33910 + }, + { + "epoch": 0.4886411109670542, + "grad_norm": 0.5965855717658997, + "learning_rate": 1.0853248281724682e-05, + "loss": 1.003, + "step": 33920 + }, + { + "epoch": 0.4887851678983534, + "grad_norm": 0.6373468041419983, + "learning_rate": 1.0848599518435674e-05, + "loss": 1.0242, + "step": 33930 + }, + { + "epoch": 0.4889292248296527, + "grad_norm": 0.5388720631599426, + "learning_rate": 1.0843950570418251e-05, + "loss": 1.0382, + "step": 33940 + }, + { + "epoch": 0.48907328176095194, + "grad_norm": 0.6404698491096497, + "learning_rate": 1.0839301438684439e-05, + "loss": 1.0324, + "step": 33950 + }, + { + "epoch": 0.4892173386922512, + "grad_norm": 0.692626416683197, + "learning_rate": 1.0834652124246276e-05, + "loss": 1.0235, + "step": 33960 + }, + { + "epoch": 0.4893613956235504, + "grad_norm": 0.5482460856437683, + "learning_rate": 1.0830002628115863e-05, + "loss": 1.0054, + "step": 33970 + }, + { + "epoch": 0.4895054525548497, + "grad_norm": 0.5376763939857483, + "learning_rate": 1.0825352951305324e-05, + "loss": 1.0273, + "step": 33980 + }, + { + "epoch": 0.48964950948614894, + "grad_norm": 0.5511714816093445, + "learning_rate": 1.0820703094826837e-05, + "loss": 1.0065, + "step": 33990 + }, + { + "epoch": 0.4897935664174482, + "grad_norm": 0.7800193428993225, + "learning_rate": 1.0816053059692607e-05, + "loss": 1.0419, + "step": 34000 + }, + { + "epoch": 0.4899376233487474, + "grad_norm": 0.6549273133277893, + "learning_rate": 1.0811402846914881e-05, + "loss": 1.0188, + "step": 34010 + }, + { + "epoch": 0.49008168028004667, + "grad_norm": 0.5511816740036011, + "learning_rate": 1.0806752457505949e-05, + "loss": 1.0514, + "step": 34020 + }, + { + "epoch": 0.49022573721134594, + "grad_norm": 0.6253392696380615, + "learning_rate": 1.0802101892478139e-05, + "loss": 1.0365, + "step": 34030 + }, + { + "epoch": 0.4903697941426452, + "grad_norm": 0.5437934994697571, + "learning_rate": 1.0797451152843813e-05, + "loss": 1.0175, + "step": 34040 + }, + { + "epoch": 0.4905138510739444, + "grad_norm": 0.580740213394165, + "learning_rate": 1.0792800239615366e-05, + "loss": 1.0405, + "step": 34050 + }, + { + "epoch": 0.49065790800524367, + "grad_norm": 0.646485447883606, + "learning_rate": 1.0788149153805247e-05, + "loss": 1.036, + "step": 34060 + }, + { + "epoch": 0.49080196493654293, + "grad_norm": 0.6106576919555664, + "learning_rate": 1.078349789642593e-05, + "loss": 1.0081, + "step": 34070 + }, + { + "epoch": 0.4909460218678422, + "grad_norm": 0.6716938614845276, + "learning_rate": 1.0778846468489927e-05, + "loss": 1.0205, + "step": 34080 + }, + { + "epoch": 0.4910900787991414, + "grad_norm": 0.6112418174743652, + "learning_rate": 1.0774194871009795e-05, + "loss": 1.003, + "step": 34090 + }, + { + "epoch": 0.49123413573044067, + "grad_norm": 0.5627857446670532, + "learning_rate": 1.076954310499812e-05, + "loss": 1.0165, + "step": 34100 + }, + { + "epoch": 0.49137819266173993, + "grad_norm": 0.5897433161735535, + "learning_rate": 1.0764891171467527e-05, + "loss": 1.0239, + "step": 34110 + }, + { + "epoch": 0.4915222495930392, + "grad_norm": 0.5515845417976379, + "learning_rate": 1.0760239071430674e-05, + "loss": 1.0372, + "step": 34120 + }, + { + "epoch": 0.4916663065243384, + "grad_norm": 0.5582568049430847, + "learning_rate": 1.0755586805900264e-05, + "loss": 1.0286, + "step": 34130 + }, + { + "epoch": 0.49181036345563767, + "grad_norm": 0.5665695071220398, + "learning_rate": 1.0750934375889031e-05, + "loss": 1.0143, + "step": 34140 + }, + { + "epoch": 0.49195442038693693, + "grad_norm": 0.6537351012229919, + "learning_rate": 1.0746281782409741e-05, + "loss": 1.0258, + "step": 34150 + }, + { + "epoch": 0.4920984773182362, + "grad_norm": 0.5903180837631226, + "learning_rate": 1.0741629026475204e-05, + "loss": 1.0406, + "step": 34160 + }, + { + "epoch": 0.4922425342495354, + "grad_norm": 0.5804015398025513, + "learning_rate": 1.073697610909826e-05, + "loss": 1.0212, + "step": 34170 + }, + { + "epoch": 0.49238659118083467, + "grad_norm": 0.724968433380127, + "learning_rate": 1.0732323031291782e-05, + "loss": 1.0573, + "step": 34180 + }, + { + "epoch": 0.49253064811213393, + "grad_norm": 0.627497136592865, + "learning_rate": 1.0727669794068683e-05, + "loss": 1.0238, + "step": 34190 + }, + { + "epoch": 0.4926747050434332, + "grad_norm": 0.5886906981468201, + "learning_rate": 1.0723016398441907e-05, + "loss": 1.0327, + "step": 34200 + }, + { + "epoch": 0.4928187619747324, + "grad_norm": 0.5589420795440674, + "learning_rate": 1.0718362845424437e-05, + "loss": 1.0396, + "step": 34210 + }, + { + "epoch": 0.49296281890603166, + "grad_norm": 0.6463192105293274, + "learning_rate": 1.0713709136029282e-05, + "loss": 1.0181, + "step": 34220 + }, + { + "epoch": 0.4931068758373309, + "grad_norm": 0.6273395419120789, + "learning_rate": 1.0709055271269497e-05, + "loss": 1.0283, + "step": 34230 + }, + { + "epoch": 0.4932509327686302, + "grad_norm": 0.6272040009498596, + "learning_rate": 1.0704401252158156e-05, + "loss": 1.0094, + "step": 34240 + }, + { + "epoch": 0.4933949896999294, + "grad_norm": 0.6010400056838989, + "learning_rate": 1.0699747079708388e-05, + "loss": 1.0057, + "step": 34250 + }, + { + "epoch": 0.49353904663122866, + "grad_norm": 0.6007885336875916, + "learning_rate": 1.0695092754933327e-05, + "loss": 1.0289, + "step": 34260 + }, + { + "epoch": 0.4936831035625279, + "grad_norm": 0.6088549494743347, + "learning_rate": 1.0690438278846164e-05, + "loss": 1.0276, + "step": 34270 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 0.6549152731895447, + "learning_rate": 1.0685783652460113e-05, + "loss": 1.0132, + "step": 34280 + }, + { + "epoch": 0.4939712174251264, + "grad_norm": 0.6156216859817505, + "learning_rate": 1.0681128876788422e-05, + "loss": 1.0263, + "step": 34290 + }, + { + "epoch": 0.49411527435642566, + "grad_norm": 0.5971141457557678, + "learning_rate": 1.0676473952844369e-05, + "loss": 1.037, + "step": 34300 + }, + { + "epoch": 0.4942593312877249, + "grad_norm": 0.5692846179008484, + "learning_rate": 1.067181888164127e-05, + "loss": 1.0347, + "step": 34310 + }, + { + "epoch": 0.49440338821902413, + "grad_norm": 0.47606226801872253, + "learning_rate": 1.0667163664192467e-05, + "loss": 1.0308, + "step": 34320 + }, + { + "epoch": 0.4945474451503234, + "grad_norm": 0.6196852922439575, + "learning_rate": 1.0662508301511335e-05, + "loss": 1.0273, + "step": 34330 + }, + { + "epoch": 0.49469150208162266, + "grad_norm": 0.637514591217041, + "learning_rate": 1.065785279461129e-05, + "loss": 1.0484, + "step": 34340 + }, + { + "epoch": 0.4948355590129219, + "grad_norm": 0.6438940763473511, + "learning_rate": 1.0653197144505762e-05, + "loss": 1.0269, + "step": 34350 + }, + { + "epoch": 0.49497961594422113, + "grad_norm": 0.5789345502853394, + "learning_rate": 1.0648541352208233e-05, + "loss": 1.032, + "step": 34360 + }, + { + "epoch": 0.4951236728755204, + "grad_norm": 0.635887086391449, + "learning_rate": 1.0643885418732191e-05, + "loss": 1.0272, + "step": 34370 + }, + { + "epoch": 0.49526772980681966, + "grad_norm": 0.603382408618927, + "learning_rate": 1.063922934509118e-05, + "loss": 1.0058, + "step": 34380 + }, + { + "epoch": 0.4954117867381189, + "grad_norm": 0.61844402551651, + "learning_rate": 1.063457313229876e-05, + "loss": 1.034, + "step": 34390 + }, + { + "epoch": 0.49555584366941813, + "grad_norm": 0.6242319345474243, + "learning_rate": 1.0629916781368519e-05, + "loss": 1.0437, + "step": 34400 + }, + { + "epoch": 0.4956999006007174, + "grad_norm": 0.5652687549591064, + "learning_rate": 1.0625260293314086e-05, + "loss": 1.0034, + "step": 34410 + }, + { + "epoch": 0.49584395753201665, + "grad_norm": 0.7617804408073425, + "learning_rate": 1.0620603669149113e-05, + "loss": 1.0066, + "step": 34420 + }, + { + "epoch": 0.4959880144633159, + "grad_norm": 0.5702570080757141, + "learning_rate": 1.0615946909887285e-05, + "loss": 1.0242, + "step": 34430 + }, + { + "epoch": 0.4961320713946151, + "grad_norm": 0.5408039689064026, + "learning_rate": 1.0611290016542307e-05, + "loss": 1.01, + "step": 34440 + }, + { + "epoch": 0.4962761283259144, + "grad_norm": 0.5169128179550171, + "learning_rate": 1.0606632990127927e-05, + "loss": 1.031, + "step": 34450 + }, + { + "epoch": 0.49642018525721365, + "grad_norm": 0.6136772632598877, + "learning_rate": 1.0601975831657913e-05, + "loss": 1.028, + "step": 34460 + }, + { + "epoch": 0.4965642421885129, + "grad_norm": 0.615149974822998, + "learning_rate": 1.0597318542146067e-05, + "loss": 1.0001, + "step": 34470 + }, + { + "epoch": 0.4967082991198121, + "grad_norm": 0.6651378870010376, + "learning_rate": 1.0592661122606211e-05, + "loss": 1.0231, + "step": 34480 + }, + { + "epoch": 0.4968523560511114, + "grad_norm": 0.677888810634613, + "learning_rate": 1.0588003574052207e-05, + "loss": 1.052, + "step": 34490 + }, + { + "epoch": 0.49699641298241065, + "grad_norm": 0.6368032097816467, + "learning_rate": 1.058334589749794e-05, + "loss": 0.9976, + "step": 34500 + }, + { + "epoch": 0.4971404699137099, + "grad_norm": 0.5970730185508728, + "learning_rate": 1.057868809395731e-05, + "loss": 1.0335, + "step": 34510 + }, + { + "epoch": 0.4972845268450091, + "grad_norm": 0.6341341137886047, + "learning_rate": 1.0574030164444272e-05, + "loss": 0.9998, + "step": 34520 + }, + { + "epoch": 0.4974285837763084, + "grad_norm": 0.5348761677742004, + "learning_rate": 1.056937210997278e-05, + "loss": 1.027, + "step": 34530 + }, + { + "epoch": 0.49757264070760765, + "grad_norm": 0.7168810367584229, + "learning_rate": 1.0564713931556838e-05, + "loss": 1.0289, + "step": 34540 + }, + { + "epoch": 0.4977166976389069, + "grad_norm": 0.6505715847015381, + "learning_rate": 1.0560055630210462e-05, + "loss": 1.0192, + "step": 34550 + }, + { + "epoch": 0.4978607545702061, + "grad_norm": 0.6991880536079407, + "learning_rate": 1.05553972069477e-05, + "loss": 1.0298, + "step": 34560 + }, + { + "epoch": 0.4980048115015054, + "grad_norm": 0.613568902015686, + "learning_rate": 1.0550738662782632e-05, + "loss": 1.0156, + "step": 34570 + }, + { + "epoch": 0.49814886843280465, + "grad_norm": 0.5543942451477051, + "learning_rate": 1.054607999872935e-05, + "loss": 1.0038, + "step": 34580 + }, + { + "epoch": 0.4982929253641039, + "grad_norm": 0.5577853918075562, + "learning_rate": 1.054142121580199e-05, + "loss": 1.0423, + "step": 34590 + }, + { + "epoch": 0.4984369822954031, + "grad_norm": 0.5902289748191833, + "learning_rate": 1.0536762315014695e-05, + "loss": 0.9978, + "step": 34600 + }, + { + "epoch": 0.4985810392267024, + "grad_norm": 0.6642874479293823, + "learning_rate": 1.0532103297381653e-05, + "loss": 1.03, + "step": 34610 + }, + { + "epoch": 0.49872509615800165, + "grad_norm": 0.7516694664955139, + "learning_rate": 1.0527444163917063e-05, + "loss": 1.0166, + "step": 34620 + }, + { + "epoch": 0.4988691530893009, + "grad_norm": 0.6212608814239502, + "learning_rate": 1.0522784915635152e-05, + "loss": 1.0402, + "step": 34630 + }, + { + "epoch": 0.4990132100206001, + "grad_norm": 0.5182382464408875, + "learning_rate": 1.0518125553550178e-05, + "loss": 0.9924, + "step": 34640 + }, + { + "epoch": 0.4991572669518994, + "grad_norm": 0.6416479349136353, + "learning_rate": 1.051346607867642e-05, + "loss": 1.0154, + "step": 34650 + }, + { + "epoch": 0.49930132388319864, + "grad_norm": 0.6422526240348816, + "learning_rate": 1.0508806492028178e-05, + "loss": 1.0409, + "step": 34660 + }, + { + "epoch": 0.4994453808144979, + "grad_norm": 0.6135118007659912, + "learning_rate": 1.0504146794619783e-05, + "loss": 1.0238, + "step": 34670 + }, + { + "epoch": 0.4995894377457971, + "grad_norm": 0.5611364245414734, + "learning_rate": 1.0499486987465584e-05, + "loss": 1.0196, + "step": 34680 + }, + { + "epoch": 0.4997334946770964, + "grad_norm": 0.5582852363586426, + "learning_rate": 1.0494827071579961e-05, + "loss": 1.0308, + "step": 34690 + }, + { + "epoch": 0.49987755160839564, + "grad_norm": 0.5679566860198975, + "learning_rate": 1.0490167047977306e-05, + "loss": 1.0191, + "step": 34700 + }, + { + "epoch": 0.5000216085396949, + "grad_norm": 0.570986270904541, + "learning_rate": 1.048550691767205e-05, + "loss": 1.0233, + "step": 34710 + }, + { + "epoch": 0.5001656654709942, + "grad_norm": 0.6344956159591675, + "learning_rate": 1.0480846681678635e-05, + "loss": 1.0339, + "step": 34720 + }, + { + "epoch": 0.5003097224022934, + "grad_norm": 0.6062827110290527, + "learning_rate": 1.0476186341011532e-05, + "loss": 1.029, + "step": 34730 + }, + { + "epoch": 0.5004537793335926, + "grad_norm": 0.5521675944328308, + "learning_rate": 1.0471525896685229e-05, + "loss": 1.0248, + "step": 34740 + }, + { + "epoch": 0.5005978362648918, + "grad_norm": 0.5854989886283875, + "learning_rate": 1.0466865349714243e-05, + "loss": 1.037, + "step": 34750 + }, + { + "epoch": 0.5007418931961911, + "grad_norm": 0.5962767601013184, + "learning_rate": 1.0462204701113115e-05, + "loss": 1.0155, + "step": 34760 + }, + { + "epoch": 0.5008859501274904, + "grad_norm": 0.6587475538253784, + "learning_rate": 1.0457543951896396e-05, + "loss": 1.0172, + "step": 34770 + }, + { + "epoch": 0.5010300070587896, + "grad_norm": 0.6376791596412659, + "learning_rate": 1.045288310307867e-05, + "loss": 1.0258, + "step": 34780 + }, + { + "epoch": 0.5011740639900889, + "grad_norm": 0.5904030799865723, + "learning_rate": 1.0448222155674542e-05, + "loss": 1.0061, + "step": 34790 + }, + { + "epoch": 0.5013181209213882, + "grad_norm": 0.5428909659385681, + "learning_rate": 1.0443561110698635e-05, + "loss": 1.0313, + "step": 34800 + }, + { + "epoch": 0.5014621778526874, + "grad_norm": 0.5389203429222107, + "learning_rate": 1.0438899969165588e-05, + "loss": 1.0197, + "step": 34810 + }, + { + "epoch": 0.5016062347839866, + "grad_norm": 0.5955771207809448, + "learning_rate": 1.0434238732090076e-05, + "loss": 1.0094, + "step": 34820 + }, + { + "epoch": 0.5017502917152858, + "grad_norm": 0.5243788361549377, + "learning_rate": 1.0429577400486784e-05, + "loss": 1.017, + "step": 34830 + }, + { + "epoch": 0.5018943486465851, + "grad_norm": 0.5687932968139648, + "learning_rate": 1.0424915975370416e-05, + "loss": 1.0045, + "step": 34840 + }, + { + "epoch": 0.5020384055778844, + "grad_norm": 0.5824376940727234, + "learning_rate": 1.0420254457755699e-05, + "loss": 1.0205, + "step": 34850 + }, + { + "epoch": 0.5021824625091836, + "grad_norm": 0.6246141195297241, + "learning_rate": 1.0415592848657388e-05, + "loss": 1.0298, + "step": 34860 + }, + { + "epoch": 0.5023265194404829, + "grad_norm": 0.6343791484832764, + "learning_rate": 1.0410931149090247e-05, + "loss": 1.0195, + "step": 34870 + }, + { + "epoch": 0.5024705763717822, + "grad_norm": 0.5543587803840637, + "learning_rate": 1.0406269360069058e-05, + "loss": 1.0114, + "step": 34880 + }, + { + "epoch": 0.5026146333030814, + "grad_norm": 0.5309613347053528, + "learning_rate": 1.040160748260864e-05, + "loss": 1.003, + "step": 34890 + }, + { + "epoch": 0.5027586902343806, + "grad_norm": 0.5168134570121765, + "learning_rate": 1.039694551772381e-05, + "loss": 1.0155, + "step": 34900 + }, + { + "epoch": 0.5029027471656798, + "grad_norm": 0.5596436858177185, + "learning_rate": 1.039228346642942e-05, + "loss": 1.0319, + "step": 34910 + }, + { + "epoch": 0.5030468040969791, + "grad_norm": 0.6299206614494324, + "learning_rate": 1.0387621329740327e-05, + "loss": 1.0344, + "step": 34920 + }, + { + "epoch": 0.5031908610282784, + "grad_norm": 0.5811081528663635, + "learning_rate": 1.0382959108671422e-05, + "loss": 1.0113, + "step": 34930 + }, + { + "epoch": 0.5033349179595776, + "grad_norm": 0.6343690752983093, + "learning_rate": 1.03782968042376e-05, + "loss": 1.0361, + "step": 34940 + }, + { + "epoch": 0.5034789748908769, + "grad_norm": 0.68972247838974, + "learning_rate": 1.037363441745378e-05, + "loss": 1.0302, + "step": 34950 + }, + { + "epoch": 0.5036230318221762, + "grad_norm": 0.5717976093292236, + "learning_rate": 1.0368971949334906e-05, + "loss": 1.0431, + "step": 34960 + }, + { + "epoch": 0.5037670887534754, + "grad_norm": 0.6288382411003113, + "learning_rate": 1.0364309400895921e-05, + "loss": 1.0018, + "step": 34970 + }, + { + "epoch": 0.5039111456847746, + "grad_norm": 0.6394724249839783, + "learning_rate": 1.0359646773151815e-05, + "loss": 1.016, + "step": 34980 + }, + { + "epoch": 0.5040552026160738, + "grad_norm": 0.5719991326332092, + "learning_rate": 1.0354984067117562e-05, + "loss": 1.0485, + "step": 34990 + }, + { + "epoch": 0.5041992595473731, + "grad_norm": 0.5724058151245117, + "learning_rate": 1.0350321283808176e-05, + "loss": 1.0177, + "step": 35000 + }, + { + "epoch": 0.5043433164786724, + "grad_norm": 0.5895094871520996, + "learning_rate": 1.0345658424238676e-05, + "loss": 1.0298, + "step": 35010 + }, + { + "epoch": 0.5044873734099716, + "grad_norm": 0.6532093286514282, + "learning_rate": 1.0340995489424108e-05, + "loss": 1.0175, + "step": 35020 + }, + { + "epoch": 0.5046314303412709, + "grad_norm": 0.548981785774231, + "learning_rate": 1.0336332480379527e-05, + "loss": 1.0035, + "step": 35030 + }, + { + "epoch": 0.5047754872725702, + "grad_norm": 0.5826421976089478, + "learning_rate": 1.0331669398120003e-05, + "loss": 1.0139, + "step": 35040 + }, + { + "epoch": 0.5049195442038694, + "grad_norm": 0.6140947937965393, + "learning_rate": 1.0327006243660631e-05, + "loss": 0.9947, + "step": 35050 + }, + { + "epoch": 0.5050636011351686, + "grad_norm": 0.6252985000610352, + "learning_rate": 1.0322343018016505e-05, + "loss": 1.0032, + "step": 35060 + }, + { + "epoch": 0.5052076580664678, + "grad_norm": 0.7099573016166687, + "learning_rate": 1.0317679722202753e-05, + "loss": 1.0339, + "step": 35070 + }, + { + "epoch": 0.5053517149977671, + "grad_norm": 0.6047699451446533, + "learning_rate": 1.0313016357234507e-05, + "loss": 1.042, + "step": 35080 + }, + { + "epoch": 0.5054957719290664, + "grad_norm": 0.4941195249557495, + "learning_rate": 1.0308352924126918e-05, + "loss": 1.02, + "step": 35090 + }, + { + "epoch": 0.5056398288603656, + "grad_norm": 0.6888431906700134, + "learning_rate": 1.0303689423895154e-05, + "loss": 1.0059, + "step": 35100 + }, + { + "epoch": 0.5057838857916649, + "grad_norm": 0.576938807964325, + "learning_rate": 1.0299025857554388e-05, + "loss": 1.0056, + "step": 35110 + }, + { + "epoch": 0.5059279427229642, + "grad_norm": 0.5949046015739441, + "learning_rate": 1.0294362226119821e-05, + "loss": 1.0313, + "step": 35120 + }, + { + "epoch": 0.5060719996542634, + "grad_norm": 0.6250497102737427, + "learning_rate": 1.0289698530606655e-05, + "loss": 1.0393, + "step": 35130 + }, + { + "epoch": 0.5062160565855626, + "grad_norm": 0.5415104031562805, + "learning_rate": 1.0285034772030118e-05, + "loss": 1.0405, + "step": 35140 + }, + { + "epoch": 0.5063601135168618, + "grad_norm": 0.5248748064041138, + "learning_rate": 1.028037095140544e-05, + "loss": 1.0341, + "step": 35150 + }, + { + "epoch": 0.5065041704481611, + "grad_norm": 0.5896631479263306, + "learning_rate": 1.0275707069747877e-05, + "loss": 1.0092, + "step": 35160 + }, + { + "epoch": 0.5066482273794604, + "grad_norm": 0.6074146628379822, + "learning_rate": 1.0271043128072689e-05, + "loss": 1.0019, + "step": 35170 + }, + { + "epoch": 0.5067922843107596, + "grad_norm": 0.5902014374732971, + "learning_rate": 1.0266379127395148e-05, + "loss": 1.0069, + "step": 35180 + }, + { + "epoch": 0.5069363412420589, + "grad_norm": 0.5882689356803894, + "learning_rate": 1.0261715068730547e-05, + "loss": 1.0195, + "step": 35190 + }, + { + "epoch": 0.5070803981733581, + "grad_norm": 0.641704797744751, + "learning_rate": 1.0257050953094185e-05, + "loss": 1.0367, + "step": 35200 + }, + { + "epoch": 0.5072244551046574, + "grad_norm": 0.6412422060966492, + "learning_rate": 1.025238678150138e-05, + "loss": 1.0184, + "step": 35210 + }, + { + "epoch": 0.5073685120359566, + "grad_norm": 0.7109228372573853, + "learning_rate": 1.024772255496745e-05, + "loss": 1.0031, + "step": 35220 + }, + { + "epoch": 0.5075125689672558, + "grad_norm": 0.5909984111785889, + "learning_rate": 1.0243058274507737e-05, + "loss": 1.0108, + "step": 35230 + }, + { + "epoch": 0.5076566258985551, + "grad_norm": 0.6337960362434387, + "learning_rate": 1.0238393941137594e-05, + "loss": 1.0462, + "step": 35240 + }, + { + "epoch": 0.5078006828298544, + "grad_norm": 0.6581072807312012, + "learning_rate": 1.0233729555872378e-05, + "loss": 1.0186, + "step": 35250 + }, + { + "epoch": 0.5079447397611536, + "grad_norm": 0.6113468408584595, + "learning_rate": 1.022906511972746e-05, + "loss": 1.033, + "step": 35260 + }, + { + "epoch": 0.5080887966924529, + "grad_norm": 0.6098412871360779, + "learning_rate": 1.0224400633718226e-05, + "loss": 1.0161, + "step": 35270 + }, + { + "epoch": 0.5082328536237521, + "grad_norm": 0.5138407945632935, + "learning_rate": 1.0219736098860071e-05, + "loss": 1.0242, + "step": 35280 + }, + { + "epoch": 0.5083769105550514, + "grad_norm": 0.5745452046394348, + "learning_rate": 1.0215071516168395e-05, + "loss": 1.0172, + "step": 35290 + }, + { + "epoch": 0.5085209674863506, + "grad_norm": 0.5427656173706055, + "learning_rate": 1.0210406886658622e-05, + "loss": 1.0169, + "step": 35300 + }, + { + "epoch": 0.5086650244176498, + "grad_norm": 0.5485026240348816, + "learning_rate": 1.020574221134617e-05, + "loss": 1.0222, + "step": 35310 + }, + { + "epoch": 0.5088090813489491, + "grad_norm": 0.5079200863838196, + "learning_rate": 1.0201077491246477e-05, + "loss": 1.0131, + "step": 35320 + }, + { + "epoch": 0.5089531382802484, + "grad_norm": 0.5675054788589478, + "learning_rate": 1.0196412727374985e-05, + "loss": 1.0168, + "step": 35330 + }, + { + "epoch": 0.5090971952115476, + "grad_norm": 0.6245012879371643, + "learning_rate": 1.0191747920747155e-05, + "loss": 1.0033, + "step": 35340 + }, + { + "epoch": 0.5092412521428469, + "grad_norm": 0.573479175567627, + "learning_rate": 1.0187083072378448e-05, + "loss": 1.0219, + "step": 35350 + }, + { + "epoch": 0.5093853090741461, + "grad_norm": 0.6171622276306152, + "learning_rate": 1.0182418183284335e-05, + "loss": 1.0157, + "step": 35360 + }, + { + "epoch": 0.5095293660054454, + "grad_norm": 0.5683724880218506, + "learning_rate": 1.0177753254480303e-05, + "loss": 1.0207, + "step": 35370 + }, + { + "epoch": 0.5096734229367446, + "grad_norm": 0.6862166523933411, + "learning_rate": 1.0173088286981837e-05, + "loss": 1.0102, + "step": 35380 + }, + { + "epoch": 0.5098174798680438, + "grad_norm": 0.6910626292228699, + "learning_rate": 1.016842328180444e-05, + "loss": 0.9967, + "step": 35390 + }, + { + "epoch": 0.5099615367993431, + "grad_norm": 0.6231151223182678, + "learning_rate": 1.0163758239963615e-05, + "loss": 1.0224, + "step": 35400 + }, + { + "epoch": 0.5101055937306423, + "grad_norm": 0.5692195892333984, + "learning_rate": 1.0159093162474885e-05, + "loss": 1.0038, + "step": 35410 + }, + { + "epoch": 0.5102496506619416, + "grad_norm": 0.5317984819412231, + "learning_rate": 1.0154428050353766e-05, + "loss": 1.0122, + "step": 35420 + }, + { + "epoch": 0.5103937075932409, + "grad_norm": 0.6425004005432129, + "learning_rate": 1.0149762904615787e-05, + "loss": 1.0318, + "step": 35430 + }, + { + "epoch": 0.5105377645245401, + "grad_norm": 0.5365632772445679, + "learning_rate": 1.0145097726276494e-05, + "loss": 1.0341, + "step": 35440 + }, + { + "epoch": 0.5106818214558394, + "grad_norm": 0.6597155928611755, + "learning_rate": 1.0140432516351426e-05, + "loss": 1.0348, + "step": 35450 + }, + { + "epoch": 0.5108258783871386, + "grad_norm": 0.6425182223320007, + "learning_rate": 1.013576727585614e-05, + "loss": 1.0259, + "step": 35460 + }, + { + "epoch": 0.5109699353184378, + "grad_norm": 0.5821989178657532, + "learning_rate": 1.0131102005806186e-05, + "loss": 0.995, + "step": 35470 + }, + { + "epoch": 0.5111139922497371, + "grad_norm": 0.5270401835441589, + "learning_rate": 1.0126436707217137e-05, + "loss": 1.0172, + "step": 35480 + }, + { + "epoch": 0.5112580491810363, + "grad_norm": 0.6593245267868042, + "learning_rate": 1.012177138110456e-05, + "loss": 1.0218, + "step": 35490 + }, + { + "epoch": 0.5114021061123356, + "grad_norm": 0.628489077091217, + "learning_rate": 1.0117106028484034e-05, + "loss": 1.0383, + "step": 35500 + }, + { + "epoch": 0.5115461630436349, + "grad_norm": 0.57815021276474, + "learning_rate": 1.011244065037114e-05, + "loss": 1.0133, + "step": 35510 + }, + { + "epoch": 0.5116902199749341, + "grad_norm": 0.6188305020332336, + "learning_rate": 1.0107775247781466e-05, + "loss": 1.0236, + "step": 35520 + }, + { + "epoch": 0.5118342769062334, + "grad_norm": 0.5830217003822327, + "learning_rate": 1.0103109821730612e-05, + "loss": 1.0076, + "step": 35530 + }, + { + "epoch": 0.5119783338375326, + "grad_norm": 0.6795600652694702, + "learning_rate": 1.0098444373234169e-05, + "loss": 1.0349, + "step": 35540 + }, + { + "epoch": 0.5121223907688318, + "grad_norm": 0.5690765976905823, + "learning_rate": 1.0093778903307744e-05, + "loss": 1.0115, + "step": 35550 + }, + { + "epoch": 0.5122664477001311, + "grad_norm": 0.5599063038825989, + "learning_rate": 1.0089113412966944e-05, + "loss": 1.0372, + "step": 35560 + }, + { + "epoch": 0.5124105046314303, + "grad_norm": 0.6547520160675049, + "learning_rate": 1.0084447903227386e-05, + "loss": 1.0256, + "step": 35570 + }, + { + "epoch": 0.5125545615627296, + "grad_norm": 0.6033701300621033, + "learning_rate": 1.0079782375104685e-05, + "loss": 1.0372, + "step": 35580 + }, + { + "epoch": 0.5126986184940289, + "grad_norm": 0.6120834350585938, + "learning_rate": 1.0075116829614458e-05, + "loss": 1.0205, + "step": 35590 + }, + { + "epoch": 0.5128426754253281, + "grad_norm": 0.5898239612579346, + "learning_rate": 1.0070451267772338e-05, + "loss": 0.9922, + "step": 35600 + }, + { + "epoch": 0.5129867323566273, + "grad_norm": 0.5969788432121277, + "learning_rate": 1.0065785690593946e-05, + "loss": 1.0304, + "step": 35610 + }, + { + "epoch": 0.5131307892879265, + "grad_norm": 0.5550798773765564, + "learning_rate": 1.0061120099094917e-05, + "loss": 1.0057, + "step": 35620 + }, + { + "epoch": 0.5132748462192258, + "grad_norm": 0.5665841102600098, + "learning_rate": 1.0056454494290884e-05, + "loss": 1.0097, + "step": 35630 + }, + { + "epoch": 0.5134189031505251, + "grad_norm": 0.5943402647972107, + "learning_rate": 1.0051788877197488e-05, + "loss": 1.0073, + "step": 35640 + }, + { + "epoch": 0.5135629600818243, + "grad_norm": 0.6273964643478394, + "learning_rate": 1.0047123248830369e-05, + "loss": 1.0515, + "step": 35650 + }, + { + "epoch": 0.5137070170131236, + "grad_norm": 0.8690207004547119, + "learning_rate": 1.0042457610205168e-05, + "loss": 1.0301, + "step": 35660 + }, + { + "epoch": 0.5138510739444229, + "grad_norm": 0.6672706007957458, + "learning_rate": 1.0037791962337531e-05, + "loss": 1.0015, + "step": 35670 + }, + { + "epoch": 0.5139951308757221, + "grad_norm": 0.5607325434684753, + "learning_rate": 1.0033126306243104e-05, + "loss": 1.0194, + "step": 35680 + }, + { + "epoch": 0.5141391878070213, + "grad_norm": 0.6259201169013977, + "learning_rate": 1.002846064293754e-05, + "loss": 1.0093, + "step": 35690 + }, + { + "epoch": 0.5142832447383205, + "grad_norm": 0.6983173489570618, + "learning_rate": 1.0023794973436483e-05, + "loss": 1.0134, + "step": 35700 + }, + { + "epoch": 0.5144273016696198, + "grad_norm": 0.5167803168296814, + "learning_rate": 1.0019129298755593e-05, + "loss": 1.0212, + "step": 35710 + }, + { + "epoch": 0.5145713586009191, + "grad_norm": 0.5578393936157227, + "learning_rate": 1.0014463619910513e-05, + "loss": 1.0078, + "step": 35720 + }, + { + "epoch": 0.5147154155322183, + "grad_norm": 0.5481727123260498, + "learning_rate": 1.0009797937916905e-05, + "loss": 1.0314, + "step": 35730 + }, + { + "epoch": 0.5148594724635176, + "grad_norm": 0.5455896258354187, + "learning_rate": 1.000513225379042e-05, + "loss": 1.0023, + "step": 35740 + }, + { + "epoch": 0.5150035293948169, + "grad_norm": 0.6557184457778931, + "learning_rate": 1.0000466568546716e-05, + "loss": 1.0311, + "step": 35750 + }, + { + "epoch": 0.5151475863261161, + "grad_norm": 0.7582526206970215, + "learning_rate": 9.995800883201446e-06, + "loss": 0.9772, + "step": 35760 + }, + { + "epoch": 0.5152916432574153, + "grad_norm": 0.5622434616088867, + "learning_rate": 9.991135198770266e-06, + "loss": 1.0282, + "step": 35770 + }, + { + "epoch": 0.5154357001887145, + "grad_norm": 0.5806698799133301, + "learning_rate": 9.98646951626883e-06, + "loss": 1.0392, + "step": 35780 + }, + { + "epoch": 0.5155797571200138, + "grad_norm": 0.5843561291694641, + "learning_rate": 9.981803836712794e-06, + "loss": 1.0311, + "step": 35790 + }, + { + "epoch": 0.5157238140513131, + "grad_norm": 0.5436504483222961, + "learning_rate": 9.977138161117813e-06, + "loss": 1.0078, + "step": 35800 + }, + { + "epoch": 0.5158678709826123, + "grad_norm": 0.6146788001060486, + "learning_rate": 9.972472490499536e-06, + "loss": 1.039, + "step": 35810 + }, + { + "epoch": 0.5160119279139116, + "grad_norm": 0.5172025561332703, + "learning_rate": 9.967806825873621e-06, + "loss": 1.0053, + "step": 35820 + }, + { + "epoch": 0.5161559848452109, + "grad_norm": 0.5545276999473572, + "learning_rate": 9.963141168255714e-06, + "loss": 1.0309, + "step": 35830 + }, + { + "epoch": 0.5163000417765101, + "grad_norm": 0.5263173580169678, + "learning_rate": 9.958475518661467e-06, + "loss": 1.0071, + "step": 35840 + }, + { + "epoch": 0.5164440987078093, + "grad_norm": 0.6488673686981201, + "learning_rate": 9.95380987810653e-06, + "loss": 1.0285, + "step": 35850 + }, + { + "epoch": 0.5165881556391085, + "grad_norm": 0.6554338932037354, + "learning_rate": 9.949144247606538e-06, + "loss": 1.0137, + "step": 35860 + }, + { + "epoch": 0.5167322125704078, + "grad_norm": 0.5225178003311157, + "learning_rate": 9.94447862817715e-06, + "loss": 1.029, + "step": 35870 + }, + { + "epoch": 0.5168762695017071, + "grad_norm": 0.6226567029953003, + "learning_rate": 9.939813020833995e-06, + "loss": 1.0378, + "step": 35880 + }, + { + "epoch": 0.5170203264330063, + "grad_norm": 0.6217008233070374, + "learning_rate": 9.935147426592712e-06, + "loss": 0.9921, + "step": 35890 + }, + { + "epoch": 0.5171643833643056, + "grad_norm": 0.6131475567817688, + "learning_rate": 9.930481846468944e-06, + "loss": 1.0511, + "step": 35900 + }, + { + "epoch": 0.5173084402956049, + "grad_norm": 0.5917423367500305, + "learning_rate": 9.925816281478318e-06, + "loss": 1.0134, + "step": 35910 + }, + { + "epoch": 0.5174524972269041, + "grad_norm": 0.6239330768585205, + "learning_rate": 9.921150732636461e-06, + "loss": 1.0211, + "step": 35920 + }, + { + "epoch": 0.5175965541582033, + "grad_norm": 0.6124416589736938, + "learning_rate": 9.916485200959005e-06, + "loss": 1.006, + "step": 35930 + }, + { + "epoch": 0.5177406110895025, + "grad_norm": 0.5896172523498535, + "learning_rate": 9.911819687461567e-06, + "loss": 1.0296, + "step": 35940 + }, + { + "epoch": 0.5178846680208018, + "grad_norm": 0.6174232959747314, + "learning_rate": 9.907154193159768e-06, + "loss": 1.0337, + "step": 35950 + }, + { + "epoch": 0.5180287249521011, + "grad_norm": 0.6552242636680603, + "learning_rate": 9.902488719069221e-06, + "loss": 1.0386, + "step": 35960 + }, + { + "epoch": 0.5181727818834003, + "grad_norm": 0.6238149404525757, + "learning_rate": 9.897823266205532e-06, + "loss": 1.0338, + "step": 35970 + }, + { + "epoch": 0.5183168388146996, + "grad_norm": 0.6092976331710815, + "learning_rate": 9.893157835584309e-06, + "loss": 1.0355, + "step": 35980 + }, + { + "epoch": 0.5184608957459989, + "grad_norm": 0.6302141547203064, + "learning_rate": 9.888492428221156e-06, + "loss": 1.0335, + "step": 35990 + }, + { + "epoch": 0.5186049526772981, + "grad_norm": 0.5200178623199463, + "learning_rate": 9.883827045131656e-06, + "loss": 1.022, + "step": 36000 + }, + { + "epoch": 0.5187490096085973, + "grad_norm": 0.5777044296264648, + "learning_rate": 9.87916168733141e-06, + "loss": 1.0219, + "step": 36010 + }, + { + "epoch": 0.5188930665398965, + "grad_norm": 0.5704400539398193, + "learning_rate": 9.874496355836e-06, + "loss": 1.0066, + "step": 36020 + }, + { + "epoch": 0.5190371234711958, + "grad_norm": 0.5675187706947327, + "learning_rate": 9.869831051660997e-06, + "loss": 1.0325, + "step": 36030 + }, + { + "epoch": 0.5191811804024951, + "grad_norm": 0.5844601392745972, + "learning_rate": 9.86516577582198e-06, + "loss": 1.0187, + "step": 36040 + }, + { + "epoch": 0.5193252373337943, + "grad_norm": 0.6567334532737732, + "learning_rate": 9.860500529334512e-06, + "loss": 1.0397, + "step": 36050 + }, + { + "epoch": 0.5194692942650936, + "grad_norm": 0.6387859582901001, + "learning_rate": 9.855835313214156e-06, + "loss": 1.0088, + "step": 36060 + }, + { + "epoch": 0.5196133511963928, + "grad_norm": 0.5364038348197937, + "learning_rate": 9.851170128476464e-06, + "loss": 1.0275, + "step": 36070 + }, + { + "epoch": 0.5197574081276921, + "grad_norm": 0.5949365496635437, + "learning_rate": 9.846504976136981e-06, + "loss": 1.0256, + "step": 36080 + }, + { + "epoch": 0.5199014650589913, + "grad_norm": 0.718833327293396, + "learning_rate": 9.841839857211249e-06, + "loss": 1.0149, + "step": 36090 + }, + { + "epoch": 0.5200455219902905, + "grad_norm": 0.540953516960144, + "learning_rate": 9.837174772714797e-06, + "loss": 1.0208, + "step": 36100 + }, + { + "epoch": 0.5201895789215898, + "grad_norm": 0.7185535430908203, + "learning_rate": 9.83250972366315e-06, + "loss": 1.0198, + "step": 36110 + }, + { + "epoch": 0.520333635852889, + "grad_norm": 0.5654457807540894, + "learning_rate": 9.827844711071826e-06, + "loss": 1.0394, + "step": 36120 + }, + { + "epoch": 0.5204776927841883, + "grad_norm": 0.6072353720664978, + "learning_rate": 9.823179735956337e-06, + "loss": 1.0043, + "step": 36130 + }, + { + "epoch": 0.5206217497154876, + "grad_norm": 0.6299914121627808, + "learning_rate": 9.818514799332174e-06, + "loss": 1.0225, + "step": 36140 + }, + { + "epoch": 0.5207658066467868, + "grad_norm": 0.5617902278900146, + "learning_rate": 9.813849902214841e-06, + "loss": 1.0307, + "step": 36150 + }, + { + "epoch": 0.5209098635780861, + "grad_norm": 0.556522011756897, + "learning_rate": 9.809185045619812e-06, + "loss": 1.0163, + "step": 36160 + }, + { + "epoch": 0.5210539205093853, + "grad_norm": 0.6089711785316467, + "learning_rate": 9.80452023056257e-06, + "loss": 1.0104, + "step": 36170 + }, + { + "epoch": 0.5211979774406845, + "grad_norm": 0.5726987719535828, + "learning_rate": 9.799855458058577e-06, + "loss": 1.019, + "step": 36180 + }, + { + "epoch": 0.5213420343719838, + "grad_norm": 0.6261593103408813, + "learning_rate": 9.795190729123288e-06, + "loss": 1.0262, + "step": 36190 + }, + { + "epoch": 0.521486091303283, + "grad_norm": 0.6123784184455872, + "learning_rate": 9.790526044772152e-06, + "loss": 1.0351, + "step": 36200 + }, + { + "epoch": 0.5216301482345823, + "grad_norm": 0.623529314994812, + "learning_rate": 9.785861406020607e-06, + "loss": 1.0525, + "step": 36210 + }, + { + "epoch": 0.5217742051658816, + "grad_norm": 0.5684060454368591, + "learning_rate": 9.781196813884079e-06, + "loss": 1.0217, + "step": 36220 + }, + { + "epoch": 0.5219182620971808, + "grad_norm": 0.49141550064086914, + "learning_rate": 9.776532269377986e-06, + "loss": 1.0235, + "step": 36230 + }, + { + "epoch": 0.5220623190284801, + "grad_norm": 0.5897905230522156, + "learning_rate": 9.77186777351774e-06, + "loss": 1.0118, + "step": 36240 + }, + { + "epoch": 0.5222063759597793, + "grad_norm": 0.5766646265983582, + "learning_rate": 9.767203327318723e-06, + "loss": 1.017, + "step": 36250 + }, + { + "epoch": 0.5223504328910785, + "grad_norm": 0.6184492111206055, + "learning_rate": 9.76253893179634e-06, + "loss": 0.9995, + "step": 36260 + }, + { + "epoch": 0.5224944898223778, + "grad_norm": 0.5500518679618835, + "learning_rate": 9.757874587965948e-06, + "loss": 1.0252, + "step": 36270 + }, + { + "epoch": 0.522638546753677, + "grad_norm": 0.5596465468406677, + "learning_rate": 9.753210296842924e-06, + "loss": 1.021, + "step": 36280 + }, + { + "epoch": 0.5227826036849763, + "grad_norm": 0.6081998944282532, + "learning_rate": 9.748546059442613e-06, + "loss": 1.014, + "step": 36290 + }, + { + "epoch": 0.5229266606162756, + "grad_norm": 0.5707460641860962, + "learning_rate": 9.743881876780353e-06, + "loss": 1.0102, + "step": 36300 + }, + { + "epoch": 0.5230707175475748, + "grad_norm": 0.5708239078521729, + "learning_rate": 9.739217749871476e-06, + "loss": 1.0477, + "step": 36310 + }, + { + "epoch": 0.5232147744788741, + "grad_norm": 0.8087396621704102, + "learning_rate": 9.7345536797313e-06, + "loss": 1.0201, + "step": 36320 + }, + { + "epoch": 0.5233588314101733, + "grad_norm": 0.5321927666664124, + "learning_rate": 9.729889667375123e-06, + "loss": 1.0304, + "step": 36330 + }, + { + "epoch": 0.5235028883414725, + "grad_norm": 0.5228332877159119, + "learning_rate": 9.725225713818242e-06, + "loss": 0.9842, + "step": 36340 + }, + { + "epoch": 0.5236469452727718, + "grad_norm": 0.54778653383255, + "learning_rate": 9.72056182007593e-06, + "loss": 1.0389, + "step": 36350 + }, + { + "epoch": 0.523791002204071, + "grad_norm": 0.6246420741081238, + "learning_rate": 9.715897987163457e-06, + "loss": 1.0301, + "step": 36360 + }, + { + "epoch": 0.5239350591353703, + "grad_norm": 0.6011121273040771, + "learning_rate": 9.711234216096071e-06, + "loss": 1.0303, + "step": 36370 + }, + { + "epoch": 0.5240791160666696, + "grad_norm": 0.6510663032531738, + "learning_rate": 9.706570507889013e-06, + "loss": 0.9917, + "step": 36380 + }, + { + "epoch": 0.5242231729979688, + "grad_norm": 0.5987183451652527, + "learning_rate": 9.701906863557509e-06, + "loss": 1.0175, + "step": 36390 + }, + { + "epoch": 0.5243672299292681, + "grad_norm": 0.5923366546630859, + "learning_rate": 9.69724328411677e-06, + "loss": 1.0189, + "step": 36400 + }, + { + "epoch": 0.5245112868605672, + "grad_norm": 0.5356194376945496, + "learning_rate": 9.692579770581986e-06, + "loss": 0.991, + "step": 36410 + }, + { + "epoch": 0.5246553437918665, + "grad_norm": 0.6098308563232422, + "learning_rate": 9.687916323968352e-06, + "loss": 1.0258, + "step": 36420 + }, + { + "epoch": 0.5247994007231658, + "grad_norm": 0.5701761245727539, + "learning_rate": 9.683252945291027e-06, + "loss": 1.0266, + "step": 36430 + }, + { + "epoch": 0.524943457654465, + "grad_norm": 0.627974808216095, + "learning_rate": 9.678589635565162e-06, + "loss": 1.0131, + "step": 36440 + }, + { + "epoch": 0.5250875145857643, + "grad_norm": 0.6633008122444153, + "learning_rate": 9.673926395805903e-06, + "loss": 1.0196, + "step": 36450 + }, + { + "epoch": 0.5252315715170636, + "grad_norm": 0.5533243417739868, + "learning_rate": 9.66926322702837e-06, + "loss": 1.0284, + "step": 36460 + }, + { + "epoch": 0.5253756284483628, + "grad_norm": 0.6051493883132935, + "learning_rate": 9.664600130247667e-06, + "loss": 1.0272, + "step": 36470 + }, + { + "epoch": 0.5255196853796621, + "grad_norm": 0.6262309551239014, + "learning_rate": 9.65993710647889e-06, + "loss": 1.0189, + "step": 36480 + }, + { + "epoch": 0.5256637423109612, + "grad_norm": 0.5696543455123901, + "learning_rate": 9.655274156737114e-06, + "loss": 1.0353, + "step": 36490 + }, + { + "epoch": 0.5258077992422605, + "grad_norm": 0.6242706179618835, + "learning_rate": 9.6506112820374e-06, + "loss": 1.0196, + "step": 36500 + }, + { + "epoch": 0.5259518561735598, + "grad_norm": 0.6055695414543152, + "learning_rate": 9.64594848339479e-06, + "loss": 1.0171, + "step": 36510 + }, + { + "epoch": 0.526095913104859, + "grad_norm": 0.6551834940910339, + "learning_rate": 9.641285761824307e-06, + "loss": 1.0006, + "step": 36520 + }, + { + "epoch": 0.5262399700361583, + "grad_norm": 0.5793470144271851, + "learning_rate": 9.636623118340968e-06, + "loss": 1.0456, + "step": 36530 + }, + { + "epoch": 0.5263840269674576, + "grad_norm": 0.6057597994804382, + "learning_rate": 9.631960553959767e-06, + "loss": 1.0198, + "step": 36540 + }, + { + "epoch": 0.5265280838987568, + "grad_norm": 0.6657388806343079, + "learning_rate": 9.62729806969567e-06, + "loss": 1.0386, + "step": 36550 + }, + { + "epoch": 0.5266721408300561, + "grad_norm": 0.6059638857841492, + "learning_rate": 9.622635666563647e-06, + "loss": 1.0233, + "step": 36560 + }, + { + "epoch": 0.5268161977613552, + "grad_norm": 0.5604829788208008, + "learning_rate": 9.617973345578632e-06, + "loss": 0.9891, + "step": 36570 + }, + { + "epoch": 0.5269602546926545, + "grad_norm": 0.5700094103813171, + "learning_rate": 9.613311107755547e-06, + "loss": 1.0126, + "step": 36580 + }, + { + "epoch": 0.5271043116239538, + "grad_norm": 0.6769119501113892, + "learning_rate": 9.608648954109302e-06, + "loss": 1.0184, + "step": 36590 + }, + { + "epoch": 0.527248368555253, + "grad_norm": 0.6489994525909424, + "learning_rate": 9.60398688565478e-06, + "loss": 1.0208, + "step": 36600 + }, + { + "epoch": 0.5273924254865523, + "grad_norm": 0.580607533454895, + "learning_rate": 9.599324903406851e-06, + "loss": 0.9943, + "step": 36610 + }, + { + "epoch": 0.5275364824178516, + "grad_norm": 0.7023763060569763, + "learning_rate": 9.594663008380363e-06, + "loss": 1.0256, + "step": 36620 + }, + { + "epoch": 0.5276805393491508, + "grad_norm": 0.5354670286178589, + "learning_rate": 9.590001201590144e-06, + "loss": 1.0263, + "step": 36630 + }, + { + "epoch": 0.5278245962804501, + "grad_norm": 0.5822432041168213, + "learning_rate": 9.585339484051011e-06, + "loss": 0.9996, + "step": 36640 + }, + { + "epoch": 0.5279686532117492, + "grad_norm": 0.5705580115318298, + "learning_rate": 9.580677856777753e-06, + "loss": 1.0092, + "step": 36650 + }, + { + "epoch": 0.5281127101430485, + "grad_norm": 0.5426855683326721, + "learning_rate": 9.576016320785135e-06, + "loss": 1.0097, + "step": 36660 + }, + { + "epoch": 0.5282567670743478, + "grad_norm": 0.6067710518836975, + "learning_rate": 9.571354877087921e-06, + "loss": 1.0058, + "step": 36670 + }, + { + "epoch": 0.528400824005647, + "grad_norm": 0.5888494849205017, + "learning_rate": 9.566693526700835e-06, + "loss": 0.9949, + "step": 36680 + }, + { + "epoch": 0.5285448809369463, + "grad_norm": 0.5788161754608154, + "learning_rate": 9.562032270638587e-06, + "loss": 1.0075, + "step": 36690 + }, + { + "epoch": 0.5286889378682456, + "grad_norm": 0.6695165634155273, + "learning_rate": 9.557371109915878e-06, + "loss": 1.0411, + "step": 36700 + }, + { + "epoch": 0.5288329947995448, + "grad_norm": 0.6557411551475525, + "learning_rate": 9.552710045547367e-06, + "loss": 1.0201, + "step": 36710 + }, + { + "epoch": 0.5289770517308441, + "grad_norm": 0.6170535087585449, + "learning_rate": 9.548049078547715e-06, + "loss": 1.0118, + "step": 36720 + }, + { + "epoch": 0.5291211086621432, + "grad_norm": 0.6295254230499268, + "learning_rate": 9.54338820993154e-06, + "loss": 1.0351, + "step": 36730 + }, + { + "epoch": 0.5292651655934425, + "grad_norm": 0.5872475504875183, + "learning_rate": 9.538727440713451e-06, + "loss": 1.0398, + "step": 36740 + }, + { + "epoch": 0.5294092225247418, + "grad_norm": 0.5091297626495361, + "learning_rate": 9.534066771908038e-06, + "loss": 1.0198, + "step": 36750 + }, + { + "epoch": 0.529553279456041, + "grad_norm": 0.6274429559707642, + "learning_rate": 9.529406204529861e-06, + "loss": 1.0141, + "step": 36760 + }, + { + "epoch": 0.5296973363873403, + "grad_norm": 0.6691988706588745, + "learning_rate": 9.524745739593459e-06, + "loss": 1.0108, + "step": 36770 + }, + { + "epoch": 0.5298413933186396, + "grad_norm": 0.6400824785232544, + "learning_rate": 9.520085378113356e-06, + "loss": 1.0095, + "step": 36780 + }, + { + "epoch": 0.5299854502499388, + "grad_norm": 0.6180183291435242, + "learning_rate": 9.515425121104047e-06, + "loss": 1.0142, + "step": 36790 + }, + { + "epoch": 0.5301295071812381, + "grad_norm": 0.5982281565666199, + "learning_rate": 9.510764969579999e-06, + "loss": 1.0265, + "step": 36800 + }, + { + "epoch": 0.5302735641125372, + "grad_norm": 0.5235705375671387, + "learning_rate": 9.506104924555674e-06, + "loss": 1.0019, + "step": 36810 + }, + { + "epoch": 0.5304176210438365, + "grad_norm": 0.6294726729393005, + "learning_rate": 9.501444987045487e-06, + "loss": 1.0044, + "step": 36820 + }, + { + "epoch": 0.5305616779751358, + "grad_norm": 0.6173301935195923, + "learning_rate": 9.496785158063855e-06, + "loss": 1.0391, + "step": 36830 + }, + { + "epoch": 0.530705734906435, + "grad_norm": 0.5367320775985718, + "learning_rate": 9.492125438625151e-06, + "loss": 0.9951, + "step": 36840 + }, + { + "epoch": 0.5308497918377343, + "grad_norm": 0.7068583369255066, + "learning_rate": 9.48746582974373e-06, + "loss": 0.9917, + "step": 36850 + }, + { + "epoch": 0.5309938487690335, + "grad_norm": 0.6781473755836487, + "learning_rate": 9.482806332433931e-06, + "loss": 1.0262, + "step": 36860 + }, + { + "epoch": 0.5311379057003328, + "grad_norm": 0.6276537775993347, + "learning_rate": 9.478146947710058e-06, + "loss": 1.0212, + "step": 36870 + }, + { + "epoch": 0.5312819626316321, + "grad_norm": 0.6010395288467407, + "learning_rate": 9.473487676586392e-06, + "loss": 1.0201, + "step": 36880 + }, + { + "epoch": 0.5314260195629312, + "grad_norm": 0.6485746502876282, + "learning_rate": 9.4688285200772e-06, + "loss": 1.041, + "step": 36890 + }, + { + "epoch": 0.5315700764942305, + "grad_norm": 0.6540676951408386, + "learning_rate": 9.46416947919671e-06, + "loss": 1.0225, + "step": 36900 + }, + { + "epoch": 0.5317141334255298, + "grad_norm": 0.5813535451889038, + "learning_rate": 9.45951055495913e-06, + "loss": 1.0233, + "step": 36910 + }, + { + "epoch": 0.531858190356829, + "grad_norm": 0.5422252416610718, + "learning_rate": 9.45485174837865e-06, + "loss": 1.0098, + "step": 36920 + }, + { + "epoch": 0.5320022472881283, + "grad_norm": 0.6572553515434265, + "learning_rate": 9.45019306046942e-06, + "loss": 1.0414, + "step": 36930 + }, + { + "epoch": 0.5321463042194275, + "grad_norm": 0.6319551467895508, + "learning_rate": 9.44553449224558e-06, + "loss": 1.0117, + "step": 36940 + }, + { + "epoch": 0.5322903611507268, + "grad_norm": 0.6119747161865234, + "learning_rate": 9.440876044721235e-06, + "loss": 1.0091, + "step": 36950 + }, + { + "epoch": 0.532434418082026, + "grad_norm": 0.5634505748748779, + "learning_rate": 9.436217718910453e-06, + "loss": 1.0233, + "step": 36960 + }, + { + "epoch": 0.5325784750133252, + "grad_norm": 0.6269254684448242, + "learning_rate": 9.431559515827304e-06, + "loss": 1.0379, + "step": 36970 + }, + { + "epoch": 0.5327225319446245, + "grad_norm": 0.7109168171882629, + "learning_rate": 9.426901436485804e-06, + "loss": 1.0136, + "step": 36980 + }, + { + "epoch": 0.5328665888759238, + "grad_norm": 0.5518725514411926, + "learning_rate": 9.422243481899955e-06, + "loss": 1.0228, + "step": 36990 + }, + { + "epoch": 0.533010645807223, + "grad_norm": 0.6496334671974182, + "learning_rate": 9.417585653083729e-06, + "loss": 1.0124, + "step": 37000 + }, + { + "epoch": 0.5331547027385223, + "grad_norm": 0.5915142297744751, + "learning_rate": 9.412927951051075e-06, + "loss": 1.0227, + "step": 37010 + }, + { + "epoch": 0.5332987596698215, + "grad_norm": 0.526029109954834, + "learning_rate": 9.408270376815904e-06, + "loss": 1.0271, + "step": 37020 + }, + { + "epoch": 0.5334428166011208, + "grad_norm": 0.728598415851593, + "learning_rate": 9.40361293139211e-06, + "loss": 1.0282, + "step": 37030 + }, + { + "epoch": 0.53358687353242, + "grad_norm": 0.6483193039894104, + "learning_rate": 9.398955615793552e-06, + "loss": 1.031, + "step": 37040 + }, + { + "epoch": 0.5337309304637192, + "grad_norm": 0.5777032971382141, + "learning_rate": 9.394298431034068e-06, + "loss": 1.0444, + "step": 37050 + }, + { + "epoch": 0.5338749873950185, + "grad_norm": 0.6195807456970215, + "learning_rate": 9.38964137812746e-06, + "loss": 1.0227, + "step": 37060 + }, + { + "epoch": 0.5340190443263177, + "grad_norm": 1.3067591190338135, + "learning_rate": 9.384984458087502e-06, + "loss": 1.0208, + "step": 37070 + }, + { + "epoch": 0.534163101257617, + "grad_norm": 0.6034696698188782, + "learning_rate": 9.380327671927945e-06, + "loss": 1.03, + "step": 37080 + }, + { + "epoch": 0.5343071581889163, + "grad_norm": 0.5766692757606506, + "learning_rate": 9.375671020662506e-06, + "loss": 1.0335, + "step": 37090 + }, + { + "epoch": 0.5344512151202155, + "grad_norm": 0.568112850189209, + "learning_rate": 9.37101450530487e-06, + "loss": 1.0324, + "step": 37100 + }, + { + "epoch": 0.5345952720515148, + "grad_norm": 0.6545660495758057, + "learning_rate": 9.366358126868704e-06, + "loss": 1.0358, + "step": 37110 + }, + { + "epoch": 0.534739328982814, + "grad_norm": 0.5522094368934631, + "learning_rate": 9.361701886367631e-06, + "loss": 1.007, + "step": 37120 + }, + { + "epoch": 0.5348833859141132, + "grad_norm": 0.6215907335281372, + "learning_rate": 9.357045784815248e-06, + "loss": 1.01, + "step": 37130 + }, + { + "epoch": 0.5350274428454125, + "grad_norm": 0.5886874198913574, + "learning_rate": 9.352389823225133e-06, + "loss": 1.0249, + "step": 37140 + }, + { + "epoch": 0.5351714997767117, + "grad_norm": 0.5882801413536072, + "learning_rate": 9.347734002610815e-06, + "loss": 1.0173, + "step": 37150 + }, + { + "epoch": 0.535315556708011, + "grad_norm": 0.6239109039306641, + "learning_rate": 9.343078323985811e-06, + "loss": 1.0131, + "step": 37160 + }, + { + "epoch": 0.5354596136393103, + "grad_norm": 1.379630446434021, + "learning_rate": 9.338422788363592e-06, + "loss": 1.0173, + "step": 37170 + }, + { + "epoch": 0.5356036705706095, + "grad_norm": 0.552575409412384, + "learning_rate": 9.333767396757605e-06, + "loss": 1.0072, + "step": 37180 + }, + { + "epoch": 0.5357477275019088, + "grad_norm": 0.6046473383903503, + "learning_rate": 9.329112150181267e-06, + "loss": 1.0255, + "step": 37190 + }, + { + "epoch": 0.535891784433208, + "grad_norm": 0.6556302905082703, + "learning_rate": 9.32445704964796e-06, + "loss": 1.0379, + "step": 37200 + }, + { + "epoch": 0.5360358413645072, + "grad_norm": 0.6306343078613281, + "learning_rate": 9.319802096171027e-06, + "loss": 1.0471, + "step": 37210 + }, + { + "epoch": 0.5361798982958065, + "grad_norm": 0.5234145522117615, + "learning_rate": 9.315147290763804e-06, + "loss": 1.0176, + "step": 37220 + }, + { + "epoch": 0.5363239552271057, + "grad_norm": 0.6313613653182983, + "learning_rate": 9.310492634439563e-06, + "loss": 1.0286, + "step": 37230 + }, + { + "epoch": 0.536468012158405, + "grad_norm": 0.6118576526641846, + "learning_rate": 9.305838128211561e-06, + "loss": 1.0173, + "step": 37240 + }, + { + "epoch": 0.5366120690897043, + "grad_norm": 0.6831321716308594, + "learning_rate": 9.301183773093027e-06, + "loss": 1.0193, + "step": 37250 + }, + { + "epoch": 0.5367561260210035, + "grad_norm": 0.677731454372406, + "learning_rate": 9.296529570097144e-06, + "loss": 1.0434, + "step": 37260 + }, + { + "epoch": 0.5369001829523028, + "grad_norm": 0.5814643502235413, + "learning_rate": 9.291875520237068e-06, + "loss": 1.0105, + "step": 37270 + }, + { + "epoch": 0.537044239883602, + "grad_norm": 0.6593078970909119, + "learning_rate": 9.287221624525926e-06, + "loss": 1.0187, + "step": 37280 + }, + { + "epoch": 0.5371882968149012, + "grad_norm": 0.6539261341094971, + "learning_rate": 9.2825678839768e-06, + "loss": 1.0189, + "step": 37290 + }, + { + "epoch": 0.5373323537462005, + "grad_norm": 0.5706778764724731, + "learning_rate": 9.277914299602751e-06, + "loss": 1.0185, + "step": 37300 + }, + { + "epoch": 0.5374764106774997, + "grad_norm": 0.6825644969940186, + "learning_rate": 9.273260872416797e-06, + "loss": 1.0336, + "step": 37310 + }, + { + "epoch": 0.537620467608799, + "grad_norm": 0.6412229537963867, + "learning_rate": 9.268607603431924e-06, + "loss": 1.0211, + "step": 37320 + }, + { + "epoch": 0.5377645245400983, + "grad_norm": 0.5434486865997314, + "learning_rate": 9.263954493661089e-06, + "loss": 1.0397, + "step": 37330 + }, + { + "epoch": 0.5379085814713975, + "grad_norm": 0.6290640234947205, + "learning_rate": 9.259301544117208e-06, + "loss": 1.0232, + "step": 37340 + }, + { + "epoch": 0.5380526384026968, + "grad_norm": 0.5575625896453857, + "learning_rate": 9.254648755813158e-06, + "loss": 1.0141, + "step": 37350 + }, + { + "epoch": 0.538196695333996, + "grad_norm": 0.598358690738678, + "learning_rate": 9.249996129761797e-06, + "loss": 1.0191, + "step": 37360 + }, + { + "epoch": 0.5383407522652952, + "grad_norm": 0.6271530389785767, + "learning_rate": 9.245343666975925e-06, + "loss": 1.008, + "step": 37370 + }, + { + "epoch": 0.5384848091965945, + "grad_norm": 0.5654636025428772, + "learning_rate": 9.240691368468334e-06, + "loss": 1.0376, + "step": 37380 + }, + { + "epoch": 0.5386288661278937, + "grad_norm": 0.5937756299972534, + "learning_rate": 9.236039235251755e-06, + "loss": 1.0006, + "step": 37390 + }, + { + "epoch": 0.538772923059193, + "grad_norm": 0.6633250117301941, + "learning_rate": 9.231387268338893e-06, + "loss": 1.0344, + "step": 37400 + }, + { + "epoch": 0.5389169799904923, + "grad_norm": 0.5502985715866089, + "learning_rate": 9.226735468742422e-06, + "loss": 1.0333, + "step": 37410 + }, + { + "epoch": 0.5390610369217915, + "grad_norm": 0.7371459007263184, + "learning_rate": 9.222083837474973e-06, + "loss": 1.0179, + "step": 37420 + }, + { + "epoch": 0.5392050938530908, + "grad_norm": 0.6678913831710815, + "learning_rate": 9.217432375549139e-06, + "loss": 1.0201, + "step": 37430 + }, + { + "epoch": 0.5393491507843899, + "grad_norm": 0.5894361734390259, + "learning_rate": 9.212781083977483e-06, + "loss": 1.0252, + "step": 37440 + }, + { + "epoch": 0.5394932077156892, + "grad_norm": 0.5343698263168335, + "learning_rate": 9.208129963772526e-06, + "loss": 1.0109, + "step": 37450 + }, + { + "epoch": 0.5396372646469885, + "grad_norm": 0.5272938013076782, + "learning_rate": 9.203479015946749e-06, + "loss": 1.0229, + "step": 37460 + }, + { + "epoch": 0.5397813215782877, + "grad_norm": 0.59951251745224, + "learning_rate": 9.198828241512604e-06, + "loss": 1.0244, + "step": 37470 + }, + { + "epoch": 0.539925378509587, + "grad_norm": 0.648590087890625, + "learning_rate": 9.194177641482498e-06, + "loss": 1.0296, + "step": 37480 + }, + { + "epoch": 0.5400694354408863, + "grad_norm": 0.5928313136100769, + "learning_rate": 9.189527216868803e-06, + "loss": 1.0143, + "step": 37490 + }, + { + "epoch": 0.5402134923721855, + "grad_norm": 0.5701433420181274, + "learning_rate": 9.184876968683857e-06, + "loss": 0.9927, + "step": 37500 + }, + { + "epoch": 0.5403575493034848, + "grad_norm": 0.6457295417785645, + "learning_rate": 9.180226897939943e-06, + "loss": 1.0295, + "step": 37510 + }, + { + "epoch": 0.5405016062347839, + "grad_norm": 0.6101258993148804, + "learning_rate": 9.17557700564933e-06, + "loss": 1.0108, + "step": 37520 + }, + { + "epoch": 0.5406456631660832, + "grad_norm": 0.6400893330574036, + "learning_rate": 9.17092729282423e-06, + "loss": 1.0145, + "step": 37530 + }, + { + "epoch": 0.5407897200973825, + "grad_norm": 0.6215956211090088, + "learning_rate": 9.166277760476816e-06, + "loss": 1.0132, + "step": 37540 + }, + { + "epoch": 0.5409337770286817, + "grad_norm": 0.5928720235824585, + "learning_rate": 9.161628409619237e-06, + "loss": 1.0219, + "step": 37550 + }, + { + "epoch": 0.541077833959981, + "grad_norm": 0.959450364112854, + "learning_rate": 9.156979241263587e-06, + "loss": 1.0046, + "step": 37560 + }, + { + "epoch": 0.5412218908912803, + "grad_norm": 0.672532856464386, + "learning_rate": 9.152330256421926e-06, + "loss": 1.0412, + "step": 37570 + }, + { + "epoch": 0.5413659478225795, + "grad_norm": 0.5589152574539185, + "learning_rate": 9.147681456106275e-06, + "loss": 1.0102, + "step": 37580 + }, + { + "epoch": 0.5415100047538788, + "grad_norm": 0.5893039107322693, + "learning_rate": 9.14303284132861e-06, + "loss": 1.0068, + "step": 37590 + }, + { + "epoch": 0.5416540616851779, + "grad_norm": 0.7029055953025818, + "learning_rate": 9.138384413100879e-06, + "loss": 1.0132, + "step": 37600 + }, + { + "epoch": 0.5417981186164772, + "grad_norm": 0.6449434757232666, + "learning_rate": 9.133736172434973e-06, + "loss": 1.0129, + "step": 37610 + }, + { + "epoch": 0.5419421755477765, + "grad_norm": 0.6333742141723633, + "learning_rate": 9.129088120342749e-06, + "loss": 1.0305, + "step": 37620 + }, + { + "epoch": 0.5420862324790757, + "grad_norm": 0.6525691747665405, + "learning_rate": 9.124440257836031e-06, + "loss": 1.0266, + "step": 37630 + }, + { + "epoch": 0.542230289410375, + "grad_norm": 0.6647065281867981, + "learning_rate": 9.119792585926593e-06, + "loss": 1.0265, + "step": 37640 + }, + { + "epoch": 0.5423743463416743, + "grad_norm": 0.5731229186058044, + "learning_rate": 9.11514510562616e-06, + "loss": 1.0033, + "step": 37650 + }, + { + "epoch": 0.5425184032729735, + "grad_norm": 0.5933045744895935, + "learning_rate": 9.110497817946436e-06, + "loss": 1.0069, + "step": 37660 + }, + { + "epoch": 0.5426624602042728, + "grad_norm": 0.6093741655349731, + "learning_rate": 9.105850723899065e-06, + "loss": 1.002, + "step": 37670 + }, + { + "epoch": 0.5428065171355719, + "grad_norm": 0.570600152015686, + "learning_rate": 9.101203824495655e-06, + "loss": 1.0284, + "step": 37680 + }, + { + "epoch": 0.5429505740668712, + "grad_norm": 0.5646870732307434, + "learning_rate": 9.096557120747775e-06, + "loss": 1.0093, + "step": 37690 + }, + { + "epoch": 0.5430946309981705, + "grad_norm": 0.6539302468299866, + "learning_rate": 9.091910613666945e-06, + "loss": 1.0299, + "step": 37700 + }, + { + "epoch": 0.5432386879294697, + "grad_norm": 0.547149658203125, + "learning_rate": 9.087264304264648e-06, + "loss": 1.0283, + "step": 37710 + }, + { + "epoch": 0.543382744860769, + "grad_norm": 0.5525507926940918, + "learning_rate": 9.082618193552323e-06, + "loss": 1.0208, + "step": 37720 + }, + { + "epoch": 0.5435268017920682, + "grad_norm": 0.5298338532447815, + "learning_rate": 9.077972282541358e-06, + "loss": 1.0146, + "step": 37730 + }, + { + "epoch": 0.5436708587233675, + "grad_norm": 0.5054857730865479, + "learning_rate": 9.07332657224311e-06, + "loss": 1.0214, + "step": 37740 + }, + { + "epoch": 0.5438149156546668, + "grad_norm": 0.6309981942176819, + "learning_rate": 9.068681063668886e-06, + "loss": 1.0446, + "step": 37750 + }, + { + "epoch": 0.5439589725859659, + "grad_norm": 0.6483060121536255, + "learning_rate": 9.064035757829941e-06, + "loss": 1.0095, + "step": 37760 + }, + { + "epoch": 0.5441030295172652, + "grad_norm": 0.577060341835022, + "learning_rate": 9.059390655737508e-06, + "loss": 0.9945, + "step": 37770 + }, + { + "epoch": 0.5442470864485645, + "grad_norm": 0.6026631593704224, + "learning_rate": 9.05474575840275e-06, + "loss": 1.0073, + "step": 37780 + }, + { + "epoch": 0.5443911433798637, + "grad_norm": 0.6109444499015808, + "learning_rate": 9.050101066836798e-06, + "loss": 1.0399, + "step": 37790 + }, + { + "epoch": 0.544535200311163, + "grad_norm": 0.6298561096191406, + "learning_rate": 9.045456582050743e-06, + "loss": 1.0078, + "step": 37800 + }, + { + "epoch": 0.5446792572424622, + "grad_norm": 0.6335732340812683, + "learning_rate": 9.040812305055619e-06, + "loss": 1.0087, + "step": 37810 + }, + { + "epoch": 0.5448233141737615, + "grad_norm": 0.5873390436172485, + "learning_rate": 9.036168236862426e-06, + "loss": 1.0298, + "step": 37820 + }, + { + "epoch": 0.5449673711050608, + "grad_norm": 0.5091093182563782, + "learning_rate": 9.03152437848211e-06, + "loss": 1.0363, + "step": 37830 + }, + { + "epoch": 0.5451114280363599, + "grad_norm": 0.6849727034568787, + "learning_rate": 9.02688073092558e-06, + "loss": 1.0126, + "step": 37840 + }, + { + "epoch": 0.5452554849676592, + "grad_norm": 0.7993967533111572, + "learning_rate": 9.022237295203689e-06, + "loss": 1.0182, + "step": 37850 + }, + { + "epoch": 0.5453995418989585, + "grad_norm": 0.6131024956703186, + "learning_rate": 9.01759407232725e-06, + "loss": 1.0363, + "step": 37860 + }, + { + "epoch": 0.5455435988302577, + "grad_norm": 0.5998750925064087, + "learning_rate": 9.01295106330703e-06, + "loss": 1.0188, + "step": 37870 + }, + { + "epoch": 0.545687655761557, + "grad_norm": 0.7071616649627686, + "learning_rate": 9.008308269153747e-06, + "loss": 1.021, + "step": 37880 + }, + { + "epoch": 0.5458317126928562, + "grad_norm": 0.5982471108436584, + "learning_rate": 9.003665690878076e-06, + "loss": 1.0386, + "step": 37890 + }, + { + "epoch": 0.5459757696241555, + "grad_norm": 0.6303359866142273, + "learning_rate": 8.999023329490634e-06, + "loss": 1.0283, + "step": 37900 + }, + { + "epoch": 0.5461198265554548, + "grad_norm": 0.5582040548324585, + "learning_rate": 8.99438118600201e-06, + "loss": 1.0347, + "step": 37910 + }, + { + "epoch": 0.5462638834867539, + "grad_norm": 0.6476597189903259, + "learning_rate": 8.989739261422722e-06, + "loss": 0.9984, + "step": 37920 + }, + { + "epoch": 0.5464079404180532, + "grad_norm": 0.6058054566383362, + "learning_rate": 8.985097556763269e-06, + "loss": 1.0102, + "step": 37930 + }, + { + "epoch": 0.5465519973493524, + "grad_norm": 0.5832399129867554, + "learning_rate": 8.980456073034074e-06, + "loss": 1.0269, + "step": 37940 + }, + { + "epoch": 0.5466960542806517, + "grad_norm": 0.6475029587745667, + "learning_rate": 8.975814811245525e-06, + "loss": 1.0046, + "step": 37950 + }, + { + "epoch": 0.546840111211951, + "grad_norm": 0.5944557785987854, + "learning_rate": 8.971173772407967e-06, + "loss": 1.0239, + "step": 37960 + }, + { + "epoch": 0.5469841681432502, + "grad_norm": 0.637930691242218, + "learning_rate": 8.966532957531684e-06, + "loss": 1.0331, + "step": 37970 + }, + { + "epoch": 0.5471282250745495, + "grad_norm": 0.6158673167228699, + "learning_rate": 8.961892367626918e-06, + "loss": 1.0276, + "step": 37980 + }, + { + "epoch": 0.5472722820058488, + "grad_norm": 0.6054377555847168, + "learning_rate": 8.957252003703865e-06, + "loss": 1.0443, + "step": 37990 + }, + { + "epoch": 0.5474163389371479, + "grad_norm": 0.5525330305099487, + "learning_rate": 8.952611866772665e-06, + "loss": 1.0318, + "step": 38000 + }, + { + "epoch": 0.5475603958684472, + "grad_norm": 0.5811078548431396, + "learning_rate": 8.947971957843412e-06, + "loss": 1.0214, + "step": 38010 + }, + { + "epoch": 0.5477044527997464, + "grad_norm": 0.597833514213562, + "learning_rate": 8.943332277926154e-06, + "loss": 1.0117, + "step": 38020 + }, + { + "epoch": 0.5478485097310457, + "grad_norm": 0.5681318044662476, + "learning_rate": 8.938692828030877e-06, + "loss": 0.993, + "step": 38030 + }, + { + "epoch": 0.547992566662345, + "grad_norm": 0.555644690990448, + "learning_rate": 8.934053609167535e-06, + "loss": 1.0162, + "step": 38040 + }, + { + "epoch": 0.5481366235936442, + "grad_norm": 0.6647034883499146, + "learning_rate": 8.92941462234602e-06, + "loss": 1.0087, + "step": 38050 + }, + { + "epoch": 0.5482806805249435, + "grad_norm": 0.6532979607582092, + "learning_rate": 8.924775868576166e-06, + "loss": 1.0149, + "step": 38060 + }, + { + "epoch": 0.5484247374562428, + "grad_norm": 0.5512877702713013, + "learning_rate": 8.92013734886778e-06, + "loss": 1.0309, + "step": 38070 + }, + { + "epoch": 0.5485687943875419, + "grad_norm": 0.6826861500740051, + "learning_rate": 8.915499064230593e-06, + "loss": 1.0214, + "step": 38080 + }, + { + "epoch": 0.5487128513188412, + "grad_norm": 0.5934279561042786, + "learning_rate": 8.910861015674297e-06, + "loss": 1.015, + "step": 38090 + }, + { + "epoch": 0.5488569082501404, + "grad_norm": 0.7294298410415649, + "learning_rate": 8.906223204208535e-06, + "loss": 0.9978, + "step": 38100 + }, + { + "epoch": 0.5490009651814397, + "grad_norm": 0.6452908515930176, + "learning_rate": 8.901585630842894e-06, + "loss": 1.0172, + "step": 38110 + }, + { + "epoch": 0.549145022112739, + "grad_norm": 0.7616612911224365, + "learning_rate": 8.896948296586905e-06, + "loss": 1.0127, + "step": 38120 + }, + { + "epoch": 0.5492890790440382, + "grad_norm": 0.5813748240470886, + "learning_rate": 8.89231120245006e-06, + "loss": 1.0176, + "step": 38130 + }, + { + "epoch": 0.5494331359753375, + "grad_norm": 0.6211594343185425, + "learning_rate": 8.88767434944178e-06, + "loss": 1.0237, + "step": 38140 + }, + { + "epoch": 0.5495771929066368, + "grad_norm": 0.6435821056365967, + "learning_rate": 8.883037738571453e-06, + "loss": 1.0501, + "step": 38150 + }, + { + "epoch": 0.5497212498379359, + "grad_norm": 0.5843872427940369, + "learning_rate": 8.878401370848406e-06, + "loss": 1.0338, + "step": 38160 + }, + { + "epoch": 0.5498653067692352, + "grad_norm": 0.5781148076057434, + "learning_rate": 8.8737652472819e-06, + "loss": 1.0203, + "step": 38170 + }, + { + "epoch": 0.5500093637005344, + "grad_norm": 0.588342547416687, + "learning_rate": 8.86912936888117e-06, + "loss": 1.0159, + "step": 38180 + }, + { + "epoch": 0.5501534206318337, + "grad_norm": 0.5943087339401245, + "learning_rate": 8.864493736655373e-06, + "loss": 1.0093, + "step": 38190 + }, + { + "epoch": 0.550297477563133, + "grad_norm": 0.6095810532569885, + "learning_rate": 8.859858351613624e-06, + "loss": 1.0255, + "step": 38200 + }, + { + "epoch": 0.5504415344944322, + "grad_norm": 0.6482893824577332, + "learning_rate": 8.855223214764986e-06, + "loss": 1.021, + "step": 38210 + }, + { + "epoch": 0.5505855914257315, + "grad_norm": 0.5073266625404358, + "learning_rate": 8.850588327118461e-06, + "loss": 1.0388, + "step": 38220 + }, + { + "epoch": 0.5507296483570306, + "grad_norm": 0.5247241258621216, + "learning_rate": 8.845953689682999e-06, + "loss": 1.001, + "step": 38230 + }, + { + "epoch": 0.5508737052883299, + "grad_norm": 0.6343132853507996, + "learning_rate": 8.841319303467502e-06, + "loss": 1.0319, + "step": 38240 + }, + { + "epoch": 0.5510177622196292, + "grad_norm": 0.5247365832328796, + "learning_rate": 8.836685169480805e-06, + "loss": 1.0214, + "step": 38250 + }, + { + "epoch": 0.5511618191509284, + "grad_norm": 0.6870401501655579, + "learning_rate": 8.832051288731701e-06, + "loss": 1.0393, + "step": 38260 + }, + { + "epoch": 0.5513058760822277, + "grad_norm": 0.6118807792663574, + "learning_rate": 8.82741766222892e-06, + "loss": 1.0278, + "step": 38270 + }, + { + "epoch": 0.551449933013527, + "grad_norm": 0.5001921057701111, + "learning_rate": 8.822784290981136e-06, + "loss": 1.0279, + "step": 38280 + }, + { + "epoch": 0.5515939899448262, + "grad_norm": 0.5460012555122375, + "learning_rate": 8.818151175996974e-06, + "loss": 0.9962, + "step": 38290 + }, + { + "epoch": 0.5517380468761255, + "grad_norm": 0.5944710373878479, + "learning_rate": 8.813518318285e-06, + "loss": 1.0026, + "step": 38300 + }, + { + "epoch": 0.5518821038074246, + "grad_norm": 0.6419718861579895, + "learning_rate": 8.808885718853713e-06, + "loss": 1.0186, + "step": 38310 + }, + { + "epoch": 0.5520261607387239, + "grad_norm": 0.5534757375717163, + "learning_rate": 8.804253378711583e-06, + "loss": 1.037, + "step": 38320 + }, + { + "epoch": 0.5521702176700232, + "grad_norm": 0.5966949462890625, + "learning_rate": 8.799621298866992e-06, + "loss": 1.0134, + "step": 38330 + }, + { + "epoch": 0.5523142746013224, + "grad_norm": 0.5333777666091919, + "learning_rate": 8.794989480328284e-06, + "loss": 1.0146, + "step": 38340 + }, + { + "epoch": 0.5524583315326217, + "grad_norm": 0.6215424537658691, + "learning_rate": 8.790357924103745e-06, + "loss": 1.0216, + "step": 38350 + }, + { + "epoch": 0.552602388463921, + "grad_norm": 0.5424696207046509, + "learning_rate": 8.785726631201596e-06, + "loss": 1.0054, + "step": 38360 + }, + { + "epoch": 0.5527464453952202, + "grad_norm": 0.6233113408088684, + "learning_rate": 8.781095602630013e-06, + "loss": 1.0158, + "step": 38370 + }, + { + "epoch": 0.5528905023265195, + "grad_norm": 0.6187631487846375, + "learning_rate": 8.776464839397099e-06, + "loss": 1.0449, + "step": 38380 + }, + { + "epoch": 0.5530345592578186, + "grad_norm": 0.7542943954467773, + "learning_rate": 8.77183434251091e-06, + "loss": 1.0275, + "step": 38390 + }, + { + "epoch": 0.5531786161891179, + "grad_norm": 0.6114190816879272, + "learning_rate": 8.767204112979446e-06, + "loss": 1.0139, + "step": 38400 + }, + { + "epoch": 0.5533226731204172, + "grad_norm": 0.6118826270103455, + "learning_rate": 8.762574151810639e-06, + "loss": 1.0123, + "step": 38410 + }, + { + "epoch": 0.5534667300517164, + "grad_norm": 0.6880956292152405, + "learning_rate": 8.757944460012367e-06, + "loss": 1.0474, + "step": 38420 + }, + { + "epoch": 0.5536107869830157, + "grad_norm": 0.5714311003684998, + "learning_rate": 8.753315038592453e-06, + "loss": 1.0332, + "step": 38430 + }, + { + "epoch": 0.553754843914315, + "grad_norm": 0.5913240909576416, + "learning_rate": 8.74868588855866e-06, + "loss": 1.0102, + "step": 38440 + }, + { + "epoch": 0.5538989008456142, + "grad_norm": 0.6488410830497742, + "learning_rate": 8.74405701091868e-06, + "loss": 1.0199, + "step": 38450 + }, + { + "epoch": 0.5540429577769135, + "grad_norm": 0.7068957686424255, + "learning_rate": 8.73942840668017e-06, + "loss": 1.0234, + "step": 38460 + }, + { + "epoch": 0.5541870147082126, + "grad_norm": 0.5866946578025818, + "learning_rate": 8.7348000768507e-06, + "loss": 1.0068, + "step": 38470 + }, + { + "epoch": 0.5543310716395119, + "grad_norm": 0.6052505373954773, + "learning_rate": 8.730172022437806e-06, + "loss": 1.0189, + "step": 38480 + }, + { + "epoch": 0.5544751285708112, + "grad_norm": 0.6822125315666199, + "learning_rate": 8.725544244448943e-06, + "loss": 0.9985, + "step": 38490 + }, + { + "epoch": 0.5546191855021104, + "grad_norm": 0.5761526226997375, + "learning_rate": 8.720916743891516e-06, + "loss": 1.0262, + "step": 38500 + }, + { + "epoch": 0.5547632424334097, + "grad_norm": 0.5955058336257935, + "learning_rate": 8.71628952177287e-06, + "loss": 1.0238, + "step": 38510 + }, + { + "epoch": 0.554907299364709, + "grad_norm": 0.547618567943573, + "learning_rate": 8.711662579100286e-06, + "loss": 1.0246, + "step": 38520 + }, + { + "epoch": 0.5550513562960082, + "grad_norm": 0.6268759965896606, + "learning_rate": 8.707035916880985e-06, + "loss": 1.0133, + "step": 38530 + }, + { + "epoch": 0.5551954132273075, + "grad_norm": 0.5458856821060181, + "learning_rate": 8.70240953612213e-06, + "loss": 1.0192, + "step": 38540 + }, + { + "epoch": 0.5553394701586066, + "grad_norm": 0.6251266598701477, + "learning_rate": 8.697783437830817e-06, + "loss": 1.0445, + "step": 38550 + }, + { + "epoch": 0.5554835270899059, + "grad_norm": 0.5710606575012207, + "learning_rate": 8.693157623014085e-06, + "loss": 1.0103, + "step": 38560 + }, + { + "epoch": 0.5556275840212052, + "grad_norm": 0.6082000732421875, + "learning_rate": 8.688532092678914e-06, + "loss": 1.0202, + "step": 38570 + }, + { + "epoch": 0.5557716409525044, + "grad_norm": 0.5642730593681335, + "learning_rate": 8.68390684783221e-06, + "loss": 1.0211, + "step": 38580 + }, + { + "epoch": 0.5559156978838037, + "grad_norm": 0.5714201927185059, + "learning_rate": 8.679281889480833e-06, + "loss": 1.0094, + "step": 38590 + }, + { + "epoch": 0.556059754815103, + "grad_norm": 0.5635415315628052, + "learning_rate": 8.674657218631573e-06, + "loss": 1.0095, + "step": 38600 + }, + { + "epoch": 0.5562038117464022, + "grad_norm": 0.6005164384841919, + "learning_rate": 8.670032836291144e-06, + "loss": 1.0325, + "step": 38610 + }, + { + "epoch": 0.5563478686777015, + "grad_norm": 0.6503455638885498, + "learning_rate": 8.665408743466227e-06, + "loss": 1.0041, + "step": 38620 + }, + { + "epoch": 0.5564919256090006, + "grad_norm": 0.6297188401222229, + "learning_rate": 8.660784941163415e-06, + "loss": 1.0167, + "step": 38630 + }, + { + "epoch": 0.5566359825402999, + "grad_norm": 0.5665830373764038, + "learning_rate": 8.656161430389243e-06, + "loss": 1.0255, + "step": 38640 + }, + { + "epoch": 0.5567800394715992, + "grad_norm": 0.625270664691925, + "learning_rate": 8.651538212150194e-06, + "loss": 1.0129, + "step": 38650 + }, + { + "epoch": 0.5569240964028984, + "grad_norm": 0.5851070284843445, + "learning_rate": 8.646915287452672e-06, + "loss": 1.0318, + "step": 38660 + }, + { + "epoch": 0.5570681533341977, + "grad_norm": 0.6134948134422302, + "learning_rate": 8.642292657303024e-06, + "loss": 1.016, + "step": 38670 + }, + { + "epoch": 0.5572122102654969, + "grad_norm": 0.6355244517326355, + "learning_rate": 8.637670322707537e-06, + "loss": 1.0134, + "step": 38680 + }, + { + "epoch": 0.5573562671967962, + "grad_norm": 0.6000000834465027, + "learning_rate": 8.633048284672427e-06, + "loss": 1.0254, + "step": 38690 + }, + { + "epoch": 0.5575003241280955, + "grad_norm": 0.6704204082489014, + "learning_rate": 8.62842654420385e-06, + "loss": 1.0106, + "step": 38700 + }, + { + "epoch": 0.5576443810593946, + "grad_norm": 0.6079708337783813, + "learning_rate": 8.623805102307894e-06, + "loss": 1.0194, + "step": 38710 + }, + { + "epoch": 0.5577884379906939, + "grad_norm": 0.6132945418357849, + "learning_rate": 8.619183959990577e-06, + "loss": 1.0171, + "step": 38720 + }, + { + "epoch": 0.5579324949219931, + "grad_norm": 0.5544208288192749, + "learning_rate": 8.61456311825787e-06, + "loss": 1.0318, + "step": 38730 + }, + { + "epoch": 0.5580765518532924, + "grad_norm": 0.6078389883041382, + "learning_rate": 8.609942578115657e-06, + "loss": 1.0017, + "step": 38740 + }, + { + "epoch": 0.5582206087845917, + "grad_norm": 0.6498771905899048, + "learning_rate": 8.605322340569768e-06, + "loss": 0.9872, + "step": 38750 + }, + { + "epoch": 0.5583646657158909, + "grad_norm": 0.6382272243499756, + "learning_rate": 8.600702406625968e-06, + "loss": 1.0337, + "step": 38760 + }, + { + "epoch": 0.5585087226471902, + "grad_norm": 0.5370135307312012, + "learning_rate": 8.596082777289951e-06, + "loss": 1.017, + "step": 38770 + }, + { + "epoch": 0.5586527795784895, + "grad_norm": 0.6954641342163086, + "learning_rate": 8.591463453567344e-06, + "loss": 1.0291, + "step": 38780 + }, + { + "epoch": 0.5587968365097886, + "grad_norm": 0.5990657806396484, + "learning_rate": 8.586844436463714e-06, + "loss": 1.0099, + "step": 38790 + }, + { + "epoch": 0.5589408934410879, + "grad_norm": 0.6652782559394836, + "learning_rate": 8.582225726984554e-06, + "loss": 1.0046, + "step": 38800 + }, + { + "epoch": 0.5590849503723871, + "grad_norm": 0.536297619342804, + "learning_rate": 8.577607326135296e-06, + "loss": 1.0215, + "step": 38810 + }, + { + "epoch": 0.5592290073036864, + "grad_norm": 0.6417962312698364, + "learning_rate": 8.572989234921302e-06, + "loss": 1.0219, + "step": 38820 + }, + { + "epoch": 0.5593730642349857, + "grad_norm": 0.5660361647605896, + "learning_rate": 8.568371454347865e-06, + "loss": 1.0373, + "step": 38830 + }, + { + "epoch": 0.5595171211662849, + "grad_norm": 0.6714792847633362, + "learning_rate": 8.563753985420213e-06, + "loss": 1.0255, + "step": 38840 + }, + { + "epoch": 0.5596611780975842, + "grad_norm": 0.5720904469490051, + "learning_rate": 8.559136829143509e-06, + "loss": 1.0114, + "step": 38850 + }, + { + "epoch": 0.5598052350288835, + "grad_norm": 0.5660542845726013, + "learning_rate": 8.554519986522834e-06, + "loss": 1.0315, + "step": 38860 + }, + { + "epoch": 0.5599492919601826, + "grad_norm": 0.5986666679382324, + "learning_rate": 8.549903458563223e-06, + "loss": 1.0192, + "step": 38870 + }, + { + "epoch": 0.5600933488914819, + "grad_norm": 0.5407901406288147, + "learning_rate": 8.545287246269624e-06, + "loss": 1.0218, + "step": 38880 + }, + { + "epoch": 0.5602374058227811, + "grad_norm": 0.6441916227340698, + "learning_rate": 8.540671350646922e-06, + "loss": 1.0291, + "step": 38890 + }, + { + "epoch": 0.5603814627540804, + "grad_norm": 0.5515192747116089, + "learning_rate": 8.536055772699936e-06, + "loss": 1.0116, + "step": 38900 + }, + { + "epoch": 0.5605255196853797, + "grad_norm": 0.6288701295852661, + "learning_rate": 8.531440513433412e-06, + "loss": 1.0225, + "step": 38910 + }, + { + "epoch": 0.5606695766166789, + "grad_norm": 0.6084778904914856, + "learning_rate": 8.526825573852031e-06, + "loss": 1.0242, + "step": 38920 + }, + { + "epoch": 0.5608136335479782, + "grad_norm": 0.5939294099807739, + "learning_rate": 8.522210954960402e-06, + "loss": 1.0129, + "step": 38930 + }, + { + "epoch": 0.5609576904792775, + "grad_norm": 0.5160016417503357, + "learning_rate": 8.517596657763058e-06, + "loss": 1.0129, + "step": 38940 + }, + { + "epoch": 0.5611017474105766, + "grad_norm": 0.5904541015625, + "learning_rate": 8.512982683264474e-06, + "loss": 1.047, + "step": 38950 + }, + { + "epoch": 0.5612458043418759, + "grad_norm": 0.7362788915634155, + "learning_rate": 8.508369032469046e-06, + "loss": 1.0259, + "step": 38960 + }, + { + "epoch": 0.5613898612731751, + "grad_norm": 0.5475773811340332, + "learning_rate": 8.503755706381102e-06, + "loss": 1.0202, + "step": 38970 + }, + { + "epoch": 0.5615339182044744, + "grad_norm": 0.7003915309906006, + "learning_rate": 8.499142706004902e-06, + "loss": 1.0291, + "step": 38980 + }, + { + "epoch": 0.5616779751357737, + "grad_norm": 0.6023433804512024, + "learning_rate": 8.494530032344631e-06, + "loss": 1.0329, + "step": 38990 + }, + { + "epoch": 0.5618220320670729, + "grad_norm": 0.6179013848304749, + "learning_rate": 8.489917686404399e-06, + "loss": 1.0038, + "step": 39000 + }, + { + "epoch": 0.5619660889983722, + "grad_norm": 0.5738324522972107, + "learning_rate": 8.48530566918826e-06, + "loss": 1.0077, + "step": 39010 + }, + { + "epoch": 0.5621101459296715, + "grad_norm": 0.6717596650123596, + "learning_rate": 8.480693981700177e-06, + "loss": 1.0261, + "step": 39020 + }, + { + "epoch": 0.5622542028609706, + "grad_norm": 0.5486605763435364, + "learning_rate": 8.47608262494406e-06, + "loss": 1.0282, + "step": 39030 + }, + { + "epoch": 0.5623982597922699, + "grad_norm": 0.5822954177856445, + "learning_rate": 8.471471599923734e-06, + "loss": 1.0205, + "step": 39040 + }, + { + "epoch": 0.5625423167235691, + "grad_norm": 0.5565518736839294, + "learning_rate": 8.46686090764295e-06, + "loss": 1.0209, + "step": 39050 + }, + { + "epoch": 0.5626863736548684, + "grad_norm": 0.6196978688240051, + "learning_rate": 8.462250549105398e-06, + "loss": 0.9893, + "step": 39060 + }, + { + "epoch": 0.5628304305861677, + "grad_norm": 0.5636383295059204, + "learning_rate": 8.45764052531469e-06, + "loss": 1.0217, + "step": 39070 + }, + { + "epoch": 0.5629744875174669, + "grad_norm": 0.6413935422897339, + "learning_rate": 8.453030837274361e-06, + "loss": 1.0175, + "step": 39080 + }, + { + "epoch": 0.5631185444487662, + "grad_norm": 0.6890947818756104, + "learning_rate": 8.448421485987879e-06, + "loss": 1.0189, + "step": 39090 + }, + { + "epoch": 0.5632626013800655, + "grad_norm": 0.6520262360572815, + "learning_rate": 8.443812472458639e-06, + "loss": 1.0184, + "step": 39100 + }, + { + "epoch": 0.5634066583113646, + "grad_norm": 0.5905381441116333, + "learning_rate": 8.43920379768995e-06, + "loss": 1.0186, + "step": 39110 + }, + { + "epoch": 0.5635507152426639, + "grad_norm": 0.5987008810043335, + "learning_rate": 8.434595462685066e-06, + "loss": 1.0303, + "step": 39120 + }, + { + "epoch": 0.5636947721739631, + "grad_norm": 0.6232080459594727, + "learning_rate": 8.429987468447152e-06, + "loss": 0.9983, + "step": 39130 + }, + { + "epoch": 0.5638388291052624, + "grad_norm": 0.5960439443588257, + "learning_rate": 8.425379815979312e-06, + "loss": 1.0218, + "step": 39140 + }, + { + "epoch": 0.5639828860365617, + "grad_norm": 0.6017009019851685, + "learning_rate": 8.420772506284565e-06, + "loss": 1.0276, + "step": 39150 + }, + { + "epoch": 0.5641269429678609, + "grad_norm": 0.6649116277694702, + "learning_rate": 8.416165540365851e-06, + "loss": 1.0369, + "step": 39160 + }, + { + "epoch": 0.5642709998991602, + "grad_norm": 0.531055748462677, + "learning_rate": 8.411558919226058e-06, + "loss": 1.0049, + "step": 39170 + }, + { + "epoch": 0.5644150568304594, + "grad_norm": 0.5994618535041809, + "learning_rate": 8.406952643867971e-06, + "loss": 1.0194, + "step": 39180 + }, + { + "epoch": 0.5645591137617586, + "grad_norm": 0.5408610701560974, + "learning_rate": 8.402346715294318e-06, + "loss": 1.0259, + "step": 39190 + }, + { + "epoch": 0.5647031706930579, + "grad_norm": 0.5614040493965149, + "learning_rate": 8.397741134507746e-06, + "loss": 1.0216, + "step": 39200 + }, + { + "epoch": 0.5648472276243571, + "grad_norm": 0.6325517892837524, + "learning_rate": 8.393135902510827e-06, + "loss": 1.0164, + "step": 39210 + }, + { + "epoch": 0.5649912845556564, + "grad_norm": 0.5664529204368591, + "learning_rate": 8.388531020306052e-06, + "loss": 1.0293, + "step": 39220 + }, + { + "epoch": 0.5651353414869557, + "grad_norm": 0.5711817741394043, + "learning_rate": 8.383926488895847e-06, + "loss": 1.0219, + "step": 39230 + }, + { + "epoch": 0.5652793984182549, + "grad_norm": 0.6091551780700684, + "learning_rate": 8.379322309282549e-06, + "loss": 1.0097, + "step": 39240 + }, + { + "epoch": 0.5654234553495542, + "grad_norm": 0.6787410974502563, + "learning_rate": 8.374718482468429e-06, + "loss": 1.0396, + "step": 39250 + }, + { + "epoch": 0.5655675122808534, + "grad_norm": 0.5833188891410828, + "learning_rate": 8.370115009455675e-06, + "loss": 1.0235, + "step": 39260 + }, + { + "epoch": 0.5657115692121526, + "grad_norm": 0.6172269582748413, + "learning_rate": 8.365511891246397e-06, + "loss": 1.0174, + "step": 39270 + }, + { + "epoch": 0.5658556261434519, + "grad_norm": 0.6320672035217285, + "learning_rate": 8.360909128842635e-06, + "loss": 1.0466, + "step": 39280 + }, + { + "epoch": 0.5659996830747511, + "grad_norm": 0.6311730742454529, + "learning_rate": 8.356306723246344e-06, + "loss": 1.0224, + "step": 39290 + }, + { + "epoch": 0.5661437400060504, + "grad_norm": 0.6683375239372253, + "learning_rate": 8.351704675459403e-06, + "loss": 1.0202, + "step": 39300 + }, + { + "epoch": 0.5662877969373497, + "grad_norm": 0.5916944742202759, + "learning_rate": 8.347102986483616e-06, + "loss": 1.0169, + "step": 39310 + }, + { + "epoch": 0.5664318538686489, + "grad_norm": 0.5895016193389893, + "learning_rate": 8.34250165732071e-06, + "loss": 1.0346, + "step": 39320 + }, + { + "epoch": 0.5665759107999482, + "grad_norm": 0.6572079062461853, + "learning_rate": 8.337900688972322e-06, + "loss": 1.0159, + "step": 39330 + }, + { + "epoch": 0.5667199677312474, + "grad_norm": 0.5720778107643127, + "learning_rate": 8.33330008244003e-06, + "loss": 1.0176, + "step": 39340 + }, + { + "epoch": 0.5668640246625466, + "grad_norm": 0.6016258597373962, + "learning_rate": 8.328699838725315e-06, + "loss": 1.014, + "step": 39350 + }, + { + "epoch": 0.5670080815938459, + "grad_norm": 0.7664455771446228, + "learning_rate": 8.324099958829593e-06, + "loss": 1.0235, + "step": 39360 + }, + { + "epoch": 0.5671521385251451, + "grad_norm": 0.5646085143089294, + "learning_rate": 8.319500443754189e-06, + "loss": 1.0343, + "step": 39370 + }, + { + "epoch": 0.5672961954564444, + "grad_norm": 0.5941106081008911, + "learning_rate": 8.314901294500356e-06, + "loss": 1.0136, + "step": 39380 + }, + { + "epoch": 0.5674402523877436, + "grad_norm": 0.6006180047988892, + "learning_rate": 8.310302512069265e-06, + "loss": 1.0105, + "step": 39390 + }, + { + "epoch": 0.5675843093190429, + "grad_norm": 0.5841224789619446, + "learning_rate": 8.305704097462011e-06, + "loss": 1.0177, + "step": 39400 + }, + { + "epoch": 0.5677283662503422, + "grad_norm": 0.8114508390426636, + "learning_rate": 8.301106051679595e-06, + "loss": 1.0129, + "step": 39410 + }, + { + "epoch": 0.5678724231816414, + "grad_norm": 0.5688908696174622, + "learning_rate": 8.296508375722963e-06, + "loss": 1.0179, + "step": 39420 + }, + { + "epoch": 0.5680164801129406, + "grad_norm": 0.5788629055023193, + "learning_rate": 8.291911070592955e-06, + "loss": 1.0178, + "step": 39430 + }, + { + "epoch": 0.5681605370442399, + "grad_norm": 0.5609277486801147, + "learning_rate": 8.28731413729034e-06, + "loss": 1.0384, + "step": 39440 + }, + { + "epoch": 0.5683045939755391, + "grad_norm": 0.69785475730896, + "learning_rate": 8.282717576815816e-06, + "loss": 1.0213, + "step": 39450 + }, + { + "epoch": 0.5684486509068384, + "grad_norm": 0.6239933371543884, + "learning_rate": 8.278121390169981e-06, + "loss": 1.011, + "step": 39460 + }, + { + "epoch": 0.5685927078381376, + "grad_norm": 0.5641295313835144, + "learning_rate": 8.273525578353371e-06, + "loss": 1.0264, + "step": 39470 + }, + { + "epoch": 0.5687367647694369, + "grad_norm": 0.615608811378479, + "learning_rate": 8.268930142366425e-06, + "loss": 1.0126, + "step": 39480 + }, + { + "epoch": 0.5688808217007362, + "grad_norm": 0.5850027799606323, + "learning_rate": 8.264335083209502e-06, + "loss": 1.0081, + "step": 39490 + }, + { + "epoch": 0.5690248786320354, + "grad_norm": 0.5991293787956238, + "learning_rate": 8.259740401882892e-06, + "loss": 1.0207, + "step": 39500 + }, + { + "epoch": 0.5691689355633346, + "grad_norm": 0.5950531959533691, + "learning_rate": 8.255146099386791e-06, + "loss": 0.9916, + "step": 39510 + }, + { + "epoch": 0.5693129924946339, + "grad_norm": 0.6409483551979065, + "learning_rate": 8.25055217672131e-06, + "loss": 1.024, + "step": 39520 + }, + { + "epoch": 0.5694570494259331, + "grad_norm": 0.602638304233551, + "learning_rate": 8.24595863488649e-06, + "loss": 1.0202, + "step": 39530 + }, + { + "epoch": 0.5696011063572324, + "grad_norm": 0.636147677898407, + "learning_rate": 8.241365474882279e-06, + "loss": 0.9985, + "step": 39540 + }, + { + "epoch": 0.5697451632885316, + "grad_norm": 0.5812684893608093, + "learning_rate": 8.236772697708537e-06, + "loss": 1.0187, + "step": 39550 + }, + { + "epoch": 0.5698892202198309, + "grad_norm": 0.7413349151611328, + "learning_rate": 8.232180304365063e-06, + "loss": 1.0333, + "step": 39560 + }, + { + "epoch": 0.5700332771511302, + "grad_norm": 0.5983883142471313, + "learning_rate": 8.227588295851542e-06, + "loss": 1.0297, + "step": 39570 + }, + { + "epoch": 0.5701773340824293, + "grad_norm": 0.6596150994300842, + "learning_rate": 8.222996673167605e-06, + "loss": 1.0081, + "step": 39580 + }, + { + "epoch": 0.5703213910137286, + "grad_norm": 0.6228833794593811, + "learning_rate": 8.218405437312776e-06, + "loss": 1.0255, + "step": 39590 + }, + { + "epoch": 0.5704654479450278, + "grad_norm": 0.5467576384544373, + "learning_rate": 8.213814589286506e-06, + "loss": 1.0071, + "step": 39600 + }, + { + "epoch": 0.5706095048763271, + "grad_norm": 0.6596918106079102, + "learning_rate": 8.209224130088159e-06, + "loss": 1.0227, + "step": 39610 + }, + { + "epoch": 0.5707535618076264, + "grad_norm": 0.5788407921791077, + "learning_rate": 8.204634060717016e-06, + "loss": 1.0154, + "step": 39620 + }, + { + "epoch": 0.5708976187389256, + "grad_norm": 0.6458188891410828, + "learning_rate": 8.20004438217227e-06, + "loss": 1.0196, + "step": 39630 + }, + { + "epoch": 0.5710416756702249, + "grad_norm": 0.5770555138587952, + "learning_rate": 8.195455095453034e-06, + "loss": 1.0107, + "step": 39640 + }, + { + "epoch": 0.5711857326015242, + "grad_norm": 0.5458569526672363, + "learning_rate": 8.190866201558329e-06, + "loss": 1.0212, + "step": 39650 + }, + { + "epoch": 0.5713297895328233, + "grad_norm": 0.5996442437171936, + "learning_rate": 8.186277701487092e-06, + "loss": 1.0139, + "step": 39660 + }, + { + "epoch": 0.5714738464641226, + "grad_norm": 0.6025555729866028, + "learning_rate": 8.181689596238182e-06, + "loss": 1.0094, + "step": 39670 + }, + { + "epoch": 0.5716179033954218, + "grad_norm": 0.6486698389053345, + "learning_rate": 8.177101886810361e-06, + "loss": 1.0082, + "step": 39680 + }, + { + "epoch": 0.5717619603267211, + "grad_norm": 0.5293046832084656, + "learning_rate": 8.172514574202315e-06, + "loss": 1.0315, + "step": 39690 + }, + { + "epoch": 0.5719060172580204, + "grad_norm": 0.5688088536262512, + "learning_rate": 8.167927659412636e-06, + "loss": 1.0206, + "step": 39700 + }, + { + "epoch": 0.5720500741893196, + "grad_norm": 0.6041375398635864, + "learning_rate": 8.16334114343983e-06, + "loss": 1.0173, + "step": 39710 + }, + { + "epoch": 0.5721941311206189, + "grad_norm": 0.6499061584472656, + "learning_rate": 8.15875502728232e-06, + "loss": 1.0233, + "step": 39720 + }, + { + "epoch": 0.5723381880519182, + "grad_norm": 0.6422781944274902, + "learning_rate": 8.154169311938444e-06, + "loss": 1.0212, + "step": 39730 + }, + { + "epoch": 0.5724822449832173, + "grad_norm": 0.5883256196975708, + "learning_rate": 8.149583998406442e-06, + "loss": 1.0222, + "step": 39740 + }, + { + "epoch": 0.5726263019145166, + "grad_norm": 0.6779961585998535, + "learning_rate": 8.14499908768448e-06, + "loss": 1.0258, + "step": 39750 + }, + { + "epoch": 0.5727703588458158, + "grad_norm": 0.6148270964622498, + "learning_rate": 8.140414580770626e-06, + "loss": 1.0124, + "step": 39760 + }, + { + "epoch": 0.5729144157771151, + "grad_norm": 0.6151580810546875, + "learning_rate": 8.135830478662862e-06, + "loss": 1.0236, + "step": 39770 + }, + { + "epoch": 0.5730584727084144, + "grad_norm": 0.6776831150054932, + "learning_rate": 8.131246782359088e-06, + "loss": 0.9998, + "step": 39780 + }, + { + "epoch": 0.5732025296397136, + "grad_norm": 0.5680509805679321, + "learning_rate": 8.12666349285711e-06, + "loss": 1.0156, + "step": 39790 + }, + { + "epoch": 0.5733465865710129, + "grad_norm": 0.59038907289505, + "learning_rate": 8.122080611154648e-06, + "loss": 1.0111, + "step": 39800 + }, + { + "epoch": 0.5734906435023122, + "grad_norm": 0.5553265810012817, + "learning_rate": 8.117498138249334e-06, + "loss": 1.0138, + "step": 39810 + }, + { + "epoch": 0.5736347004336113, + "grad_norm": 0.5936552286148071, + "learning_rate": 8.112916075138699e-06, + "loss": 1.0273, + "step": 39820 + }, + { + "epoch": 0.5737787573649106, + "grad_norm": 0.5887789130210876, + "learning_rate": 8.108334422820209e-06, + "loss": 1.0294, + "step": 39830 + }, + { + "epoch": 0.5739228142962098, + "grad_norm": 0.5883054137229919, + "learning_rate": 8.103753182291218e-06, + "loss": 1.0328, + "step": 39840 + }, + { + "epoch": 0.5740668712275091, + "grad_norm": 0.6597990393638611, + "learning_rate": 8.099172354548997e-06, + "loss": 0.9939, + "step": 39850 + }, + { + "epoch": 0.5742109281588084, + "grad_norm": 0.5739467144012451, + "learning_rate": 8.094591940590736e-06, + "loss": 1.0016, + "step": 39860 + }, + { + "epoch": 0.5743549850901076, + "grad_norm": 0.7081357836723328, + "learning_rate": 8.090011941413523e-06, + "loss": 1.0383, + "step": 39870 + }, + { + "epoch": 0.5744990420214069, + "grad_norm": 0.5651097297668457, + "learning_rate": 8.08543235801436e-06, + "loss": 1.0135, + "step": 39880 + }, + { + "epoch": 0.5746430989527062, + "grad_norm": 0.5493505001068115, + "learning_rate": 8.080853191390163e-06, + "loss": 1.0137, + "step": 39890 + }, + { + "epoch": 0.5747871558840053, + "grad_norm": 0.46422478556632996, + "learning_rate": 8.07627444253775e-06, + "loss": 1.0405, + "step": 39900 + }, + { + "epoch": 0.5749312128153046, + "grad_norm": 0.6994962692260742, + "learning_rate": 8.071696112453856e-06, + "loss": 1.0172, + "step": 39910 + }, + { + "epoch": 0.5750752697466038, + "grad_norm": 0.5448575019836426, + "learning_rate": 8.067118202135116e-06, + "loss": 1.0244, + "step": 39920 + }, + { + "epoch": 0.5752193266779031, + "grad_norm": 0.6547740697860718, + "learning_rate": 8.062540712578079e-06, + "loss": 1.0255, + "step": 39930 + }, + { + "epoch": 0.5753633836092024, + "grad_norm": 0.5272354483604431, + "learning_rate": 8.057963644779203e-06, + "loss": 1.0259, + "step": 39940 + }, + { + "epoch": 0.5755074405405016, + "grad_norm": 0.6416448950767517, + "learning_rate": 8.053386999734853e-06, + "loss": 1.0147, + "step": 39950 + }, + { + "epoch": 0.5756514974718009, + "grad_norm": 0.5450991988182068, + "learning_rate": 8.048810778441292e-06, + "loss": 1.0131, + "step": 39960 + }, + { + "epoch": 0.5757955544031002, + "grad_norm": 0.6209630966186523, + "learning_rate": 8.044234981894716e-06, + "loss": 1.0338, + "step": 39970 + }, + { + "epoch": 0.5759396113343993, + "grad_norm": 0.5697899460792542, + "learning_rate": 8.039659611091201e-06, + "loss": 0.9978, + "step": 39980 + }, + { + "epoch": 0.5760836682656986, + "grad_norm": 0.7120861411094666, + "learning_rate": 8.035084667026745e-06, + "loss": 1.0166, + "step": 39990 + }, + { + "epoch": 0.5762277251969978, + "grad_norm": 0.6045985221862793, + "learning_rate": 8.030510150697252e-06, + "loss": 1.0234, + "step": 40000 + }, + { + "epoch": 0.5763717821282971, + "grad_norm": 0.6500576734542847, + "learning_rate": 8.025936063098529e-06, + "loss": 1.0437, + "step": 40010 + }, + { + "epoch": 0.5765158390595964, + "grad_norm": 0.581506073474884, + "learning_rate": 8.021362405226295e-06, + "loss": 1.0293, + "step": 40020 + }, + { + "epoch": 0.5766598959908956, + "grad_norm": 0.5469965934753418, + "learning_rate": 8.016789178076169e-06, + "loss": 1.0253, + "step": 40030 + }, + { + "epoch": 0.5768039529221949, + "grad_norm": 0.5983647108078003, + "learning_rate": 8.012216382643681e-06, + "loss": 1.0006, + "step": 40040 + }, + { + "epoch": 0.5769480098534941, + "grad_norm": 0.6301031708717346, + "learning_rate": 8.007644019924264e-06, + "loss": 1.0354, + "step": 40050 + }, + { + "epoch": 0.5770920667847933, + "grad_norm": 0.6649234294891357, + "learning_rate": 8.00307209091326e-06, + "loss": 1.0477, + "step": 40060 + }, + { + "epoch": 0.5772361237160926, + "grad_norm": 0.577212929725647, + "learning_rate": 7.998500596605913e-06, + "loss": 1.0236, + "step": 40070 + }, + { + "epoch": 0.5773801806473918, + "grad_norm": 0.6051406860351562, + "learning_rate": 7.993929537997377e-06, + "loss": 1.0237, + "step": 40080 + }, + { + "epoch": 0.5775242375786911, + "grad_norm": 0.5686997771263123, + "learning_rate": 7.989358916082709e-06, + "loss": 1.0298, + "step": 40090 + }, + { + "epoch": 0.5776682945099904, + "grad_norm": 0.6738128066062927, + "learning_rate": 7.984788731856862e-06, + "loss": 1.0051, + "step": 40100 + }, + { + "epoch": 0.5778123514412896, + "grad_norm": 0.5925938487052917, + "learning_rate": 7.980218986314715e-06, + "loss": 1.0238, + "step": 40110 + }, + { + "epoch": 0.5779564083725889, + "grad_norm": 0.5643150210380554, + "learning_rate": 7.975649680451024e-06, + "loss": 1.014, + "step": 40120 + }, + { + "epoch": 0.5781004653038881, + "grad_norm": 0.5942860841751099, + "learning_rate": 7.97108081526048e-06, + "loss": 1.0026, + "step": 40130 + }, + { + "epoch": 0.5782445222351873, + "grad_norm": 0.6095684170722961, + "learning_rate": 7.96651239173765e-06, + "loss": 1.0163, + "step": 40140 + }, + { + "epoch": 0.5783885791664866, + "grad_norm": 0.621647298336029, + "learning_rate": 7.961944410877017e-06, + "loss": 1.0221, + "step": 40150 + }, + { + "epoch": 0.5785326360977858, + "grad_norm": 0.613663911819458, + "learning_rate": 7.957376873672976e-06, + "loss": 1.0195, + "step": 40160 + }, + { + "epoch": 0.5786766930290851, + "grad_norm": 0.5918740034103394, + "learning_rate": 7.952809781119809e-06, + "loss": 0.9951, + "step": 40170 + }, + { + "epoch": 0.5788207499603844, + "grad_norm": 0.5893607139587402, + "learning_rate": 7.94824313421171e-06, + "loss": 1.0272, + "step": 40180 + }, + { + "epoch": 0.5789648068916836, + "grad_norm": 0.7715088725090027, + "learning_rate": 7.943676933942779e-06, + "loss": 1.0101, + "step": 40190 + }, + { + "epoch": 0.5791088638229829, + "grad_norm": 0.6940677762031555, + "learning_rate": 7.939111181307016e-06, + "loss": 1.0139, + "step": 40200 + }, + { + "epoch": 0.5792529207542821, + "grad_norm": 0.5966167449951172, + "learning_rate": 7.93454587729831e-06, + "loss": 1.0219, + "step": 40210 + }, + { + "epoch": 0.5793969776855813, + "grad_norm": 0.6023448705673218, + "learning_rate": 7.929981022910483e-06, + "loss": 1.0295, + "step": 40220 + }, + { + "epoch": 0.5795410346168806, + "grad_norm": 0.7437616586685181, + "learning_rate": 7.925416619137223e-06, + "loss": 1.0075, + "step": 40230 + }, + { + "epoch": 0.5796850915481798, + "grad_norm": 0.5959208607673645, + "learning_rate": 7.920852666972151e-06, + "loss": 1.0185, + "step": 40240 + }, + { + "epoch": 0.5798291484794791, + "grad_norm": 0.5967977046966553, + "learning_rate": 7.916289167408773e-06, + "loss": 1.0219, + "step": 40250 + }, + { + "epoch": 0.5799732054107783, + "grad_norm": 0.6647666096687317, + "learning_rate": 7.911726121440495e-06, + "loss": 1.0124, + "step": 40260 + }, + { + "epoch": 0.5801172623420776, + "grad_norm": 0.5860805511474609, + "learning_rate": 7.907163530060633e-06, + "loss": 1.0203, + "step": 40270 + }, + { + "epoch": 0.5802613192733769, + "grad_norm": 0.5214429497718811, + "learning_rate": 7.9026013942624e-06, + "loss": 1.0174, + "step": 40280 + }, + { + "epoch": 0.5804053762046761, + "grad_norm": 0.8571519255638123, + "learning_rate": 7.89803971503891e-06, + "loss": 1.0173, + "step": 40290 + }, + { + "epoch": 0.5805494331359753, + "grad_norm": 0.7096226215362549, + "learning_rate": 7.893478493383178e-06, + "loss": 1.0248, + "step": 40300 + }, + { + "epoch": 0.5806934900672746, + "grad_norm": 0.5956733226776123, + "learning_rate": 7.888917730288119e-06, + "loss": 1.0308, + "step": 40310 + }, + { + "epoch": 0.5808375469985738, + "grad_norm": 0.5794342756271362, + "learning_rate": 7.884357426746547e-06, + "loss": 1.0284, + "step": 40320 + }, + { + "epoch": 0.5809816039298731, + "grad_norm": 0.590449333190918, + "learning_rate": 7.879797583751177e-06, + "loss": 1.017, + "step": 40330 + }, + { + "epoch": 0.5811256608611723, + "grad_norm": 0.5910434722900391, + "learning_rate": 7.875238202294623e-06, + "loss": 1.0041, + "step": 40340 + }, + { + "epoch": 0.5812697177924716, + "grad_norm": 0.8716016411781311, + "learning_rate": 7.870679283369406e-06, + "loss": 1.0237, + "step": 40350 + }, + { + "epoch": 0.5814137747237709, + "grad_norm": 0.5578383207321167, + "learning_rate": 7.866120827967935e-06, + "loss": 1.0137, + "step": 40360 + }, + { + "epoch": 0.5815578316550701, + "grad_norm": 0.6493157148361206, + "learning_rate": 7.861562837082518e-06, + "loss": 1.0199, + "step": 40370 + }, + { + "epoch": 0.5817018885863693, + "grad_norm": 0.5964250564575195, + "learning_rate": 7.857005311705377e-06, + "loss": 1.0359, + "step": 40380 + }, + { + "epoch": 0.5818459455176686, + "grad_norm": 0.7357345223426819, + "learning_rate": 7.852448252828616e-06, + "loss": 1.0286, + "step": 40390 + }, + { + "epoch": 0.5819900024489678, + "grad_norm": 0.6434196829795837, + "learning_rate": 7.84789166144424e-06, + "loss": 1.0342, + "step": 40400 + }, + { + "epoch": 0.5821340593802671, + "grad_norm": 0.642611026763916, + "learning_rate": 7.843335538544165e-06, + "loss": 1.009, + "step": 40410 + }, + { + "epoch": 0.5822781163115663, + "grad_norm": 0.6158607602119446, + "learning_rate": 7.838779885120193e-06, + "loss": 1.0187, + "step": 40420 + }, + { + "epoch": 0.5824221732428656, + "grad_norm": 0.6186838150024414, + "learning_rate": 7.834224702164022e-06, + "loss": 1.019, + "step": 40430 + }, + { + "epoch": 0.5825662301741649, + "grad_norm": 0.6436375975608826, + "learning_rate": 7.82966999066726e-06, + "loss": 0.9933, + "step": 40440 + }, + { + "epoch": 0.5827102871054641, + "grad_norm": 0.7230498194694519, + "learning_rate": 7.8251157516214e-06, + "loss": 1.0291, + "step": 40450 + }, + { + "epoch": 0.5828543440367633, + "grad_norm": 0.6493755578994751, + "learning_rate": 7.820561986017838e-06, + "loss": 1.0078, + "step": 40460 + }, + { + "epoch": 0.5829984009680625, + "grad_norm": 0.5933564305305481, + "learning_rate": 7.816008694847867e-06, + "loss": 1.0167, + "step": 40470 + }, + { + "epoch": 0.5831424578993618, + "grad_norm": 0.6012160181999207, + "learning_rate": 7.811455879102673e-06, + "loss": 1.0276, + "step": 40480 + }, + { + "epoch": 0.5832865148306611, + "grad_norm": 0.6838956475257874, + "learning_rate": 7.806903539773344e-06, + "loss": 1.0081, + "step": 40490 + }, + { + "epoch": 0.5834305717619603, + "grad_norm": 0.5689899921417236, + "learning_rate": 7.802351677850865e-06, + "loss": 1.0194, + "step": 40500 + }, + { + "epoch": 0.5835746286932596, + "grad_norm": 0.7117645144462585, + "learning_rate": 7.797800294326104e-06, + "loss": 0.9945, + "step": 40510 + }, + { + "epoch": 0.5837186856245589, + "grad_norm": 0.6395501494407654, + "learning_rate": 7.793249390189843e-06, + "loss": 0.9716, + "step": 40520 + }, + { + "epoch": 0.5838627425558581, + "grad_norm": 0.5645208358764648, + "learning_rate": 7.788698966432746e-06, + "loss": 1.0202, + "step": 40530 + }, + { + "epoch": 0.5840067994871573, + "grad_norm": 0.7860140800476074, + "learning_rate": 7.784149024045378e-06, + "loss": 1.0158, + "step": 40540 + }, + { + "epoch": 0.5841508564184565, + "grad_norm": 0.5279279351234436, + "learning_rate": 7.7795995640182e-06, + "loss": 1.0267, + "step": 40550 + }, + { + "epoch": 0.5842949133497558, + "grad_norm": 0.5274000763893127, + "learning_rate": 7.775050587341566e-06, + "loss": 1.0154, + "step": 40560 + }, + { + "epoch": 0.5844389702810551, + "grad_norm": 0.5330712795257568, + "learning_rate": 7.770502095005728e-06, + "loss": 1.0367, + "step": 40570 + }, + { + "epoch": 0.5845830272123543, + "grad_norm": 0.6997164487838745, + "learning_rate": 7.765954088000829e-06, + "loss": 1.0196, + "step": 40580 + }, + { + "epoch": 0.5847270841436536, + "grad_norm": 0.6270565986633301, + "learning_rate": 7.761406567316901e-06, + "loss": 1.0052, + "step": 40590 + }, + { + "epoch": 0.5848711410749529, + "grad_norm": 0.5885422229766846, + "learning_rate": 7.756859533943886e-06, + "loss": 1.0377, + "step": 40600 + }, + { + "epoch": 0.5850151980062521, + "grad_norm": 0.5697892308235168, + "learning_rate": 7.75231298887161e-06, + "loss": 0.9945, + "step": 40610 + }, + { + "epoch": 0.5851592549375513, + "grad_norm": 0.6186272501945496, + "learning_rate": 7.74776693308978e-06, + "loss": 1.0199, + "step": 40620 + }, + { + "epoch": 0.5853033118688505, + "grad_norm": 0.6105360388755798, + "learning_rate": 7.743221367588026e-06, + "loss": 1.0317, + "step": 40630 + }, + { + "epoch": 0.5854473688001498, + "grad_norm": 0.6047841906547546, + "learning_rate": 7.738676293355847e-06, + "loss": 1.0216, + "step": 40640 + }, + { + "epoch": 0.5855914257314491, + "grad_norm": 0.6993504166603088, + "learning_rate": 7.734131711382643e-06, + "loss": 1.0026, + "step": 40650 + }, + { + "epoch": 0.5857354826627483, + "grad_norm": 0.5617213845252991, + "learning_rate": 7.729587622657707e-06, + "loss": 0.9951, + "step": 40660 + }, + { + "epoch": 0.5858795395940476, + "grad_norm": 0.6542547345161438, + "learning_rate": 7.725044028170225e-06, + "loss": 1.0054, + "step": 40670 + }, + { + "epoch": 0.5860235965253469, + "grad_norm": 0.5995214581489563, + "learning_rate": 7.720500928909276e-06, + "loss": 1.033, + "step": 40680 + }, + { + "epoch": 0.5861676534566461, + "grad_norm": 0.5970878601074219, + "learning_rate": 7.715958325863831e-06, + "loss": 0.9994, + "step": 40690 + }, + { + "epoch": 0.5863117103879453, + "grad_norm": 0.5853875875473022, + "learning_rate": 7.711416220022747e-06, + "loss": 1.0091, + "step": 40700 + }, + { + "epoch": 0.5864557673192445, + "grad_norm": 0.6795983910560608, + "learning_rate": 7.706874612374781e-06, + "loss": 1.0137, + "step": 40710 + }, + { + "epoch": 0.5865998242505438, + "grad_norm": 0.5505003929138184, + "learning_rate": 7.702333503908582e-06, + "loss": 1.0143, + "step": 40720 + }, + { + "epoch": 0.5867438811818431, + "grad_norm": 0.6230047345161438, + "learning_rate": 7.697792895612676e-06, + "loss": 1.0478, + "step": 40730 + }, + { + "epoch": 0.5868879381131423, + "grad_norm": 0.6251411437988281, + "learning_rate": 7.693252788475503e-06, + "loss": 1.0301, + "step": 40740 + }, + { + "epoch": 0.5870319950444416, + "grad_norm": 0.5603410005569458, + "learning_rate": 7.688713183485379e-06, + "loss": 1.0227, + "step": 40750 + }, + { + "epoch": 0.5871760519757409, + "grad_norm": 0.5499513745307922, + "learning_rate": 7.684174081630504e-06, + "loss": 1.0216, + "step": 40760 + }, + { + "epoch": 0.5873201089070401, + "grad_norm": 0.5878617763519287, + "learning_rate": 7.67963548389899e-06, + "loss": 1.0204, + "step": 40770 + }, + { + "epoch": 0.5874641658383393, + "grad_norm": 0.672378420829773, + "learning_rate": 7.675097391278815e-06, + "loss": 1.0213, + "step": 40780 + }, + { + "epoch": 0.5876082227696385, + "grad_norm": 0.7302099466323853, + "learning_rate": 7.670559804757874e-06, + "loss": 1.0247, + "step": 40790 + }, + { + "epoch": 0.5877522797009378, + "grad_norm": 0.5528120994567871, + "learning_rate": 7.666022725323924e-06, + "loss": 0.9972, + "step": 40800 + }, + { + "epoch": 0.5878963366322371, + "grad_norm": 0.6364361643791199, + "learning_rate": 7.66148615396463e-06, + "loss": 1.0111, + "step": 40810 + }, + { + "epoch": 0.5880403935635363, + "grad_norm": 0.6810481548309326, + "learning_rate": 7.656950091667542e-06, + "loss": 1.0189, + "step": 40820 + }, + { + "epoch": 0.5881844504948356, + "grad_norm": 0.5953986048698425, + "learning_rate": 7.652414539420096e-06, + "loss": 1.0248, + "step": 40830 + }, + { + "epoch": 0.5883285074261349, + "grad_norm": 0.7078180909156799, + "learning_rate": 7.647879498209617e-06, + "loss": 1.0214, + "step": 40840 + }, + { + "epoch": 0.588472564357434, + "grad_norm": 0.5777971744537354, + "learning_rate": 7.643344969023326e-06, + "loss": 1.008, + "step": 40850 + }, + { + "epoch": 0.5886166212887333, + "grad_norm": 0.5660359263420105, + "learning_rate": 7.638810952848328e-06, + "loss": 1.016, + "step": 40860 + }, + { + "epoch": 0.5887606782200325, + "grad_norm": 0.5861753821372986, + "learning_rate": 7.634277450671607e-06, + "loss": 1.0284, + "step": 40870 + }, + { + "epoch": 0.5889047351513318, + "grad_norm": 0.677720844745636, + "learning_rate": 7.629744463480053e-06, + "loss": 0.9968, + "step": 40880 + }, + { + "epoch": 0.5890487920826311, + "grad_norm": 0.5892419815063477, + "learning_rate": 7.625211992260431e-06, + "loss": 0.9945, + "step": 40890 + }, + { + "epoch": 0.5891928490139303, + "grad_norm": 0.6533917188644409, + "learning_rate": 7.6206800379994e-06, + "loss": 1.0413, + "step": 40900 + }, + { + "epoch": 0.5893369059452296, + "grad_norm": 0.6236209273338318, + "learning_rate": 7.616148601683503e-06, + "loss": 1.0205, + "step": 40910 + }, + { + "epoch": 0.5894809628765288, + "grad_norm": 0.6106456518173218, + "learning_rate": 7.611617684299164e-06, + "loss": 1.0172, + "step": 40920 + }, + { + "epoch": 0.589625019807828, + "grad_norm": 0.7330796718597412, + "learning_rate": 7.607087286832714e-06, + "loss": 1.0321, + "step": 40930 + }, + { + "epoch": 0.5897690767391273, + "grad_norm": 0.6348506212234497, + "learning_rate": 7.602557410270349e-06, + "loss": 1.0331, + "step": 40940 + }, + { + "epoch": 0.5899131336704265, + "grad_norm": 0.665501594543457, + "learning_rate": 7.598028055598161e-06, + "loss": 1.0295, + "step": 40950 + }, + { + "epoch": 0.5900571906017258, + "grad_norm": 0.6182237267494202, + "learning_rate": 7.593499223802132e-06, + "loss": 0.9935, + "step": 40960 + }, + { + "epoch": 0.590201247533025, + "grad_norm": 0.765225350856781, + "learning_rate": 7.588970915868126e-06, + "loss": 1.0425, + "step": 40970 + }, + { + "epoch": 0.5903453044643243, + "grad_norm": 0.6308184862136841, + "learning_rate": 7.584443132781887e-06, + "loss": 0.9983, + "step": 40980 + }, + { + "epoch": 0.5904893613956236, + "grad_norm": 0.7328409552574158, + "learning_rate": 7.5799158755290595e-06, + "loss": 1.0314, + "step": 40990 + }, + { + "epoch": 0.5906334183269228, + "grad_norm": 0.5854573249816895, + "learning_rate": 7.575389145095157e-06, + "loss": 1.0183, + "step": 41000 + }, + { + "epoch": 0.590777475258222, + "grad_norm": 0.6093853116035461, + "learning_rate": 7.570862942465593e-06, + "loss": 1.0023, + "step": 41010 + }, + { + "epoch": 0.5909215321895213, + "grad_norm": 0.637779951095581, + "learning_rate": 7.566337268625657e-06, + "loss": 1.0249, + "step": 41020 + }, + { + "epoch": 0.5910655891208205, + "grad_norm": 0.6483780741691589, + "learning_rate": 7.561812124560521e-06, + "loss": 1.016, + "step": 41030 + }, + { + "epoch": 0.5912096460521198, + "grad_norm": 0.6446595788002014, + "learning_rate": 7.557287511255255e-06, + "loss": 1.0353, + "step": 41040 + }, + { + "epoch": 0.591353702983419, + "grad_norm": 0.6424724459648132, + "learning_rate": 7.552763429694802e-06, + "loss": 1.0421, + "step": 41050 + }, + { + "epoch": 0.5914977599147183, + "grad_norm": 0.6585966348648071, + "learning_rate": 7.548239880863984e-06, + "loss": 1.0247, + "step": 41060 + }, + { + "epoch": 0.5916418168460176, + "grad_norm": 0.6525462865829468, + "learning_rate": 7.543716865747528e-06, + "loss": 0.9947, + "step": 41070 + }, + { + "epoch": 0.5917858737773168, + "grad_norm": 0.5727991461753845, + "learning_rate": 7.539194385330024e-06, + "loss": 1.0184, + "step": 41080 + }, + { + "epoch": 0.591929930708616, + "grad_norm": 0.6609602570533752, + "learning_rate": 7.534672440595955e-06, + "loss": 1.0233, + "step": 41090 + }, + { + "epoch": 0.5920739876399153, + "grad_norm": 0.6003077030181885, + "learning_rate": 7.530151032529687e-06, + "loss": 1.028, + "step": 41100 + }, + { + "epoch": 0.5922180445712145, + "grad_norm": 0.7327446341514587, + "learning_rate": 7.5256301621154645e-06, + "loss": 1.0289, + "step": 41110 + }, + { + "epoch": 0.5923621015025138, + "grad_norm": 0.6378503441810608, + "learning_rate": 7.521109830337427e-06, + "loss": 1.0174, + "step": 41120 + }, + { + "epoch": 0.592506158433813, + "grad_norm": 0.640765905380249, + "learning_rate": 7.516590038179581e-06, + "loss": 1.0193, + "step": 41130 + }, + { + "epoch": 0.5926502153651123, + "grad_norm": 0.6114451289176941, + "learning_rate": 7.512070786625823e-06, + "loss": 1.0193, + "step": 41140 + }, + { + "epoch": 0.5927942722964116, + "grad_norm": 0.614895224571228, + "learning_rate": 7.507552076659935e-06, + "loss": 1.0146, + "step": 41150 + }, + { + "epoch": 0.5929383292277108, + "grad_norm": 0.6712912321090698, + "learning_rate": 7.5030339092655805e-06, + "loss": 1.0069, + "step": 41160 + }, + { + "epoch": 0.59308238615901, + "grad_norm": 0.6977723836898804, + "learning_rate": 7.498516285426291e-06, + "loss": 1.0238, + "step": 41170 + }, + { + "epoch": 0.5932264430903093, + "grad_norm": 0.6460567712783813, + "learning_rate": 7.493999206125504e-06, + "loss": 1.0366, + "step": 41180 + }, + { + "epoch": 0.5933705000216085, + "grad_norm": 0.5149868726730347, + "learning_rate": 7.4894826723465185e-06, + "loss": 1.0075, + "step": 41190 + }, + { + "epoch": 0.5935145569529078, + "grad_norm": 0.6791356205940247, + "learning_rate": 7.48496668507252e-06, + "loss": 1.0297, + "step": 41200 + }, + { + "epoch": 0.593658613884207, + "grad_norm": 0.6235713362693787, + "learning_rate": 7.480451245286582e-06, + "loss": 1.0368, + "step": 41210 + }, + { + "epoch": 0.5938026708155063, + "grad_norm": 0.6560283303260803, + "learning_rate": 7.4759363539716475e-06, + "loss": 1.0177, + "step": 41220 + }, + { + "epoch": 0.5939467277468056, + "grad_norm": 0.6092167496681213, + "learning_rate": 7.471422012110552e-06, + "loss": 1.0235, + "step": 41230 + }, + { + "epoch": 0.5940907846781048, + "grad_norm": 0.5454646944999695, + "learning_rate": 7.466908220686002e-06, + "loss": 1.016, + "step": 41240 + }, + { + "epoch": 0.594234841609404, + "grad_norm": 0.687719464302063, + "learning_rate": 7.4623949806805875e-06, + "loss": 1.0207, + "step": 41250 + }, + { + "epoch": 0.5943788985407032, + "grad_norm": 0.8912255764007568, + "learning_rate": 7.457882293076781e-06, + "loss": 1.0085, + "step": 41260 + }, + { + "epoch": 0.5945229554720025, + "grad_norm": 0.596967875957489, + "learning_rate": 7.4533701588569306e-06, + "loss": 1.0137, + "step": 41270 + }, + { + "epoch": 0.5946670124033018, + "grad_norm": 0.6342841386795044, + "learning_rate": 7.448858579003264e-06, + "loss": 1.0155, + "step": 41280 + }, + { + "epoch": 0.594811069334601, + "grad_norm": 0.7018398642539978, + "learning_rate": 7.444347554497894e-06, + "loss": 1.0055, + "step": 41290 + }, + { + "epoch": 0.5949551262659003, + "grad_norm": 0.5752993822097778, + "learning_rate": 7.439837086322807e-06, + "loss": 1.025, + "step": 41300 + }, + { + "epoch": 0.5950991831971996, + "grad_norm": 0.5885398387908936, + "learning_rate": 7.435327175459863e-06, + "loss": 1.0178, + "step": 41310 + }, + { + "epoch": 0.5952432401284988, + "grad_norm": 0.6342470645904541, + "learning_rate": 7.430817822890821e-06, + "loss": 1.0219, + "step": 41320 + }, + { + "epoch": 0.595387297059798, + "grad_norm": 0.6557644009590149, + "learning_rate": 7.4263090295972895e-06, + "loss": 1.0171, + "step": 41330 + }, + { + "epoch": 0.5955313539910972, + "grad_norm": 0.6283184289932251, + "learning_rate": 7.4218007965607855e-06, + "loss": 1.017, + "step": 41340 + }, + { + "epoch": 0.5956754109223965, + "grad_norm": 0.586904764175415, + "learning_rate": 7.417293124762679e-06, + "loss": 1.0469, + "step": 41350 + }, + { + "epoch": 0.5958194678536958, + "grad_norm": 0.6193545460700989, + "learning_rate": 7.412786015184228e-06, + "loss": 0.9911, + "step": 41360 + }, + { + "epoch": 0.595963524784995, + "grad_norm": 0.7516299486160278, + "learning_rate": 7.408279468806574e-06, + "loss": 1.0397, + "step": 41370 + }, + { + "epoch": 0.5961075817162943, + "grad_norm": 0.5326626896858215, + "learning_rate": 7.403773486610726e-06, + "loss": 1.0237, + "step": 41380 + }, + { + "epoch": 0.5962516386475936, + "grad_norm": 0.6334313750267029, + "learning_rate": 7.399268069577573e-06, + "loss": 1.0405, + "step": 41390 + }, + { + "epoch": 0.5963956955788928, + "grad_norm": 0.63750159740448, + "learning_rate": 7.394763218687886e-06, + "loss": 0.9907, + "step": 41400 + }, + { + "epoch": 0.596539752510192, + "grad_norm": 0.640842080116272, + "learning_rate": 7.390258934922308e-06, + "loss": 1.0224, + "step": 41410 + }, + { + "epoch": 0.5966838094414912, + "grad_norm": 0.5708299875259399, + "learning_rate": 7.385755219261357e-06, + "loss": 1.0164, + "step": 41420 + }, + { + "epoch": 0.5968278663727905, + "grad_norm": 0.5873056650161743, + "learning_rate": 7.381252072685432e-06, + "loss": 1.009, + "step": 41430 + }, + { + "epoch": 0.5969719233040898, + "grad_norm": 0.5703116059303284, + "learning_rate": 7.376749496174802e-06, + "loss": 1.0041, + "step": 41440 + }, + { + "epoch": 0.597115980235389, + "grad_norm": 0.6429044008255005, + "learning_rate": 7.3722474907096235e-06, + "loss": 1.0084, + "step": 41450 + }, + { + "epoch": 0.5972600371666883, + "grad_norm": 0.6282838582992554, + "learning_rate": 7.367746057269917e-06, + "loss": 1.0217, + "step": 41460 + }, + { + "epoch": 0.5974040940979876, + "grad_norm": 0.5623292326927185, + "learning_rate": 7.363245196835575e-06, + "loss": 1.0143, + "step": 41470 + }, + { + "epoch": 0.5975481510292868, + "grad_norm": 0.6059540510177612, + "learning_rate": 7.358744910386388e-06, + "loss": 1.0177, + "step": 41480 + }, + { + "epoch": 0.597692207960586, + "grad_norm": 0.6665140390396118, + "learning_rate": 7.354245198901995e-06, + "loss": 1.0433, + "step": 41490 + }, + { + "epoch": 0.5978362648918852, + "grad_norm": 0.7568016648292542, + "learning_rate": 7.34974606336192e-06, + "loss": 1.029, + "step": 41500 + }, + { + "epoch": 0.5979803218231845, + "grad_norm": 0.6323282122612, + "learning_rate": 7.345247504745571e-06, + "loss": 1.0153, + "step": 41510 + }, + { + "epoch": 0.5981243787544838, + "grad_norm": 0.539384126663208, + "learning_rate": 7.340749524032217e-06, + "loss": 1.0182, + "step": 41520 + }, + { + "epoch": 0.598268435685783, + "grad_norm": 0.6559541821479797, + "learning_rate": 7.336252122201005e-06, + "loss": 1.023, + "step": 41530 + }, + { + "epoch": 0.5984124926170823, + "grad_norm": 0.5994787812232971, + "learning_rate": 7.331755300230963e-06, + "loss": 1.0097, + "step": 41540 + }, + { + "epoch": 0.5985565495483816, + "grad_norm": 0.6237807273864746, + "learning_rate": 7.32725905910098e-06, + "loss": 1.0167, + "step": 41550 + }, + { + "epoch": 0.5987006064796808, + "grad_norm": 0.6201650500297546, + "learning_rate": 7.3227633997898314e-06, + "loss": 1.0167, + "step": 41560 + }, + { + "epoch": 0.59884466341098, + "grad_norm": 0.6548067331314087, + "learning_rate": 7.318268323276161e-06, + "loss": 1.0083, + "step": 41570 + }, + { + "epoch": 0.5989887203422792, + "grad_norm": 0.7271896600723267, + "learning_rate": 7.313773830538475e-06, + "loss": 1.0256, + "step": 41580 + }, + { + "epoch": 0.5991327772735785, + "grad_norm": 0.6019732356071472, + "learning_rate": 7.309279922555178e-06, + "loss": 1.0147, + "step": 41590 + }, + { + "epoch": 0.5992768342048778, + "grad_norm": 0.6316788792610168, + "learning_rate": 7.30478660030452e-06, + "loss": 1.0212, + "step": 41600 + }, + { + "epoch": 0.599420891136177, + "grad_norm": 0.7106907367706299, + "learning_rate": 7.3002938647646336e-06, + "loss": 1.0165, + "step": 41610 + }, + { + "epoch": 0.5995649480674763, + "grad_norm": 0.7058002352714539, + "learning_rate": 7.295801716913537e-06, + "loss": 1.0231, + "step": 41620 + }, + { + "epoch": 0.5997090049987756, + "grad_norm": 0.5363566279411316, + "learning_rate": 7.291310157729101e-06, + "loss": 1.0047, + "step": 41630 + }, + { + "epoch": 0.5998530619300748, + "grad_norm": 0.6057727932929993, + "learning_rate": 7.286819188189075e-06, + "loss": 1.0115, + "step": 41640 + }, + { + "epoch": 0.599997118861374, + "grad_norm": 0.6345871090888977, + "learning_rate": 7.282328809271084e-06, + "loss": 1.0186, + "step": 41650 + }, + { + "epoch": 0.6001411757926732, + "grad_norm": 0.6517109274864197, + "learning_rate": 7.277839021952621e-06, + "loss": 1.0287, + "step": 41660 + }, + { + "epoch": 0.6002852327239725, + "grad_norm": 0.5592643618583679, + "learning_rate": 7.273349827211052e-06, + "loss": 1.0158, + "step": 41670 + }, + { + "epoch": 0.6004292896552718, + "grad_norm": 0.6520673632621765, + "learning_rate": 7.268861226023612e-06, + "loss": 1.0457, + "step": 41680 + }, + { + "epoch": 0.600573346586571, + "grad_norm": 0.6690886616706848, + "learning_rate": 7.264373219367402e-06, + "loss": 1.0384, + "step": 41690 + }, + { + "epoch": 0.6007174035178703, + "grad_norm": 0.5864752531051636, + "learning_rate": 7.259885808219409e-06, + "loss": 1.0268, + "step": 41700 + }, + { + "epoch": 0.6008614604491695, + "grad_norm": 0.6361005306243896, + "learning_rate": 7.255398993556477e-06, + "loss": 1.0331, + "step": 41710 + }, + { + "epoch": 0.6010055173804688, + "grad_norm": 0.7044946551322937, + "learning_rate": 7.250912776355317e-06, + "loss": 1.0202, + "step": 41720 + }, + { + "epoch": 0.601149574311768, + "grad_norm": 0.5614191889762878, + "learning_rate": 7.246427157592526e-06, + "loss": 1.0063, + "step": 41730 + }, + { + "epoch": 0.6012936312430672, + "grad_norm": 0.7571386694908142, + "learning_rate": 7.241942138244557e-06, + "loss": 1.0251, + "step": 41740 + }, + { + "epoch": 0.6014376881743665, + "grad_norm": 0.6337618231773376, + "learning_rate": 7.237457719287735e-06, + "loss": 1.0114, + "step": 41750 + }, + { + "epoch": 0.6015817451056658, + "grad_norm": 0.5836769342422485, + "learning_rate": 7.232973901698259e-06, + "loss": 1.0166, + "step": 41760 + }, + { + "epoch": 0.601725802036965, + "grad_norm": 0.6328428983688354, + "learning_rate": 7.228490686452192e-06, + "loss": 1.0107, + "step": 41770 + }, + { + "epoch": 0.6018698589682643, + "grad_norm": 0.6117316484451294, + "learning_rate": 7.224008074525472e-06, + "loss": 1.012, + "step": 41780 + }, + { + "epoch": 0.6020139158995635, + "grad_norm": 0.6610920429229736, + "learning_rate": 7.219526066893898e-06, + "loss": 1.0401, + "step": 41790 + }, + { + "epoch": 0.6021579728308628, + "grad_norm": 0.636726975440979, + "learning_rate": 7.215044664533143e-06, + "loss": 1.0295, + "step": 41800 + }, + { + "epoch": 0.602302029762162, + "grad_norm": 0.5862812995910645, + "learning_rate": 7.210563868418746e-06, + "loss": 1.0029, + "step": 41810 + }, + { + "epoch": 0.6024460866934612, + "grad_norm": 0.7004019618034363, + "learning_rate": 7.206083679526116e-06, + "loss": 1.0148, + "step": 41820 + }, + { + "epoch": 0.6025901436247605, + "grad_norm": 0.6236006617546082, + "learning_rate": 7.2016040988305235e-06, + "loss": 1.0302, + "step": 41830 + }, + { + "epoch": 0.6027342005560598, + "grad_norm": 0.6367349028587341, + "learning_rate": 7.1971251273071175e-06, + "loss": 1.0014, + "step": 41840 + }, + { + "epoch": 0.602878257487359, + "grad_norm": 0.6152836084365845, + "learning_rate": 7.192646765930908e-06, + "loss": 1.0532, + "step": 41850 + }, + { + "epoch": 0.6030223144186583, + "grad_norm": 0.6188142895698547, + "learning_rate": 7.188169015676762e-06, + "loss": 1.034, + "step": 41860 + }, + { + "epoch": 0.6031663713499575, + "grad_norm": 0.5735546350479126, + "learning_rate": 7.183691877519441e-06, + "loss": 1.0185, + "step": 41870 + }, + { + "epoch": 0.6033104282812568, + "grad_norm": 0.5709615349769592, + "learning_rate": 7.17921535243354e-06, + "loss": 1.0239, + "step": 41880 + }, + { + "epoch": 0.603454485212556, + "grad_norm": 0.5740513205528259, + "learning_rate": 7.17473944139355e-06, + "loss": 1.0162, + "step": 41890 + }, + { + "epoch": 0.6035985421438552, + "grad_norm": 0.5819230079650879, + "learning_rate": 7.170264145373806e-06, + "loss": 1.0214, + "step": 41900 + }, + { + "epoch": 0.6037425990751545, + "grad_norm": 0.49948909878730774, + "learning_rate": 7.165789465348521e-06, + "loss": 1.0277, + "step": 41910 + }, + { + "epoch": 0.6038866560064537, + "grad_norm": 0.7643324136734009, + "learning_rate": 7.161315402291773e-06, + "loss": 1.0223, + "step": 41920 + }, + { + "epoch": 0.604030712937753, + "grad_norm": 0.6517783403396606, + "learning_rate": 7.156841957177503e-06, + "loss": 1.0227, + "step": 41930 + }, + { + "epoch": 0.6041747698690523, + "grad_norm": 0.6694622039794922, + "learning_rate": 7.152369130979514e-06, + "loss": 1.0044, + "step": 41940 + }, + { + "epoch": 0.6043188268003515, + "grad_norm": 0.5882413983345032, + "learning_rate": 7.147896924671485e-06, + "loss": 1.0138, + "step": 41950 + }, + { + "epoch": 0.6044628837316508, + "grad_norm": 0.775355875492096, + "learning_rate": 7.14342533922695e-06, + "loss": 1.0195, + "step": 41960 + }, + { + "epoch": 0.60460694066295, + "grad_norm": 0.6620295643806458, + "learning_rate": 7.1389543756193115e-06, + "loss": 1.0273, + "step": 41970 + }, + { + "epoch": 0.6047509975942492, + "grad_norm": 0.6419587731361389, + "learning_rate": 7.134484034821838e-06, + "loss": 1.0224, + "step": 41980 + }, + { + "epoch": 0.6048950545255485, + "grad_norm": 0.5880239605903625, + "learning_rate": 7.130014317807658e-06, + "loss": 1.014, + "step": 41990 + }, + { + "epoch": 0.6050391114568477, + "grad_norm": 0.5835565328598022, + "learning_rate": 7.1255452255497725e-06, + "loss": 1.0388, + "step": 42000 + }, + { + "epoch": 0.605183168388147, + "grad_norm": 0.5859873294830322, + "learning_rate": 7.121076759021041e-06, + "loss": 1.012, + "step": 42010 + }, + { + "epoch": 0.6053272253194463, + "grad_norm": 0.6449272632598877, + "learning_rate": 7.116608919194177e-06, + "loss": 1.0003, + "step": 42020 + }, + { + "epoch": 0.6054712822507455, + "grad_norm": 0.6205201745033264, + "learning_rate": 7.112141707041782e-06, + "loss": 1.0274, + "step": 42030 + }, + { + "epoch": 0.6056153391820448, + "grad_norm": 0.6808909773826599, + "learning_rate": 7.107675123536295e-06, + "loss": 1.0246, + "step": 42040 + }, + { + "epoch": 0.605759396113344, + "grad_norm": 0.6364637017250061, + "learning_rate": 7.103209169650034e-06, + "loss": 1.0184, + "step": 42050 + }, + { + "epoch": 0.6059034530446432, + "grad_norm": 0.8138787150382996, + "learning_rate": 7.098743846355177e-06, + "loss": 1.0194, + "step": 42060 + }, + { + "epoch": 0.6060475099759425, + "grad_norm": 0.6228765845298767, + "learning_rate": 7.094279154623762e-06, + "loss": 1.0058, + "step": 42070 + }, + { + "epoch": 0.6061915669072417, + "grad_norm": 0.616649329662323, + "learning_rate": 7.089815095427689e-06, + "loss": 1.0056, + "step": 42080 + }, + { + "epoch": 0.606335623838541, + "grad_norm": 0.529268205165863, + "learning_rate": 7.085351669738724e-06, + "loss": 1.011, + "step": 42090 + }, + { + "epoch": 0.6064796807698403, + "grad_norm": 0.6124671697616577, + "learning_rate": 7.0808888785284934e-06, + "loss": 1.0158, + "step": 42100 + }, + { + "epoch": 0.6066237377011395, + "grad_norm": 0.6179755330085754, + "learning_rate": 7.076426722768486e-06, + "loss": 1.0192, + "step": 42110 + }, + { + "epoch": 0.6067677946324388, + "grad_norm": 0.6596282720565796, + "learning_rate": 7.071965203430051e-06, + "loss": 1.0272, + "step": 42120 + }, + { + "epoch": 0.606911851563738, + "grad_norm": 0.7349212765693665, + "learning_rate": 7.067504321484393e-06, + "loss": 1.0299, + "step": 42130 + }, + { + "epoch": 0.6070559084950372, + "grad_norm": 0.6352936625480652, + "learning_rate": 7.063044077902598e-06, + "loss": 1.0095, + "step": 42140 + }, + { + "epoch": 0.6071999654263365, + "grad_norm": 0.5938249230384827, + "learning_rate": 7.0585844736555895e-06, + "loss": 0.9927, + "step": 42150 + }, + { + "epoch": 0.6073440223576357, + "grad_norm": 0.6572223901748657, + "learning_rate": 7.054125509714161e-06, + "loss": 1.0236, + "step": 42160 + }, + { + "epoch": 0.607488079288935, + "grad_norm": 0.5948213934898376, + "learning_rate": 7.049667187048974e-06, + "loss": 0.9959, + "step": 42170 + }, + { + "epoch": 0.6076321362202343, + "grad_norm": 0.6174025535583496, + "learning_rate": 7.045209506630539e-06, + "loss": 1.0259, + "step": 42180 + }, + { + "epoch": 0.6077761931515335, + "grad_norm": 0.500306248664856, + "learning_rate": 7.040752469429233e-06, + "loss": 1.0236, + "step": 42190 + }, + { + "epoch": 0.6079202500828327, + "grad_norm": 0.6655672788619995, + "learning_rate": 7.036296076415292e-06, + "loss": 1.0338, + "step": 42200 + }, + { + "epoch": 0.6080643070141319, + "grad_norm": 0.6235907673835754, + "learning_rate": 7.03184032855881e-06, + "loss": 1.027, + "step": 42210 + }, + { + "epoch": 0.6082083639454312, + "grad_norm": 0.610526442527771, + "learning_rate": 7.0273852268297436e-06, + "loss": 1.0035, + "step": 42220 + }, + { + "epoch": 0.6083524208767305, + "grad_norm": 0.6061323881149292, + "learning_rate": 7.022930772197907e-06, + "loss": 1.0097, + "step": 42230 + }, + { + "epoch": 0.6084964778080297, + "grad_norm": 0.5258249044418335, + "learning_rate": 7.01847696563297e-06, + "loss": 1.0219, + "step": 42240 + }, + { + "epoch": 0.608640534739329, + "grad_norm": 0.6494345664978027, + "learning_rate": 7.01402380810447e-06, + "loss": 1.0222, + "step": 42250 + }, + { + "epoch": 0.6087845916706283, + "grad_norm": 0.7211364507675171, + "learning_rate": 7.009571300581797e-06, + "loss": 1.0248, + "step": 42260 + }, + { + "epoch": 0.6089286486019275, + "grad_norm": 0.7034059762954712, + "learning_rate": 7.005119444034193e-06, + "loss": 1.0098, + "step": 42270 + }, + { + "epoch": 0.6090727055332267, + "grad_norm": 0.5944812297821045, + "learning_rate": 7.000668239430777e-06, + "loss": 1.022, + "step": 42280 + }, + { + "epoch": 0.6092167624645259, + "grad_norm": 0.6565080881118774, + "learning_rate": 6.996217687740509e-06, + "loss": 1.023, + "step": 42290 + }, + { + "epoch": 0.6093608193958252, + "grad_norm": 0.6511583924293518, + "learning_rate": 6.99176778993221e-06, + "loss": 1.0014, + "step": 42300 + }, + { + "epoch": 0.6095048763271245, + "grad_norm": 0.6816222071647644, + "learning_rate": 6.987318546974567e-06, + "loss": 1.0163, + "step": 42310 + }, + { + "epoch": 0.6096489332584237, + "grad_norm": 0.6753540635108948, + "learning_rate": 6.9828699598361135e-06, + "loss": 1.0092, + "step": 42320 + }, + { + "epoch": 0.609792990189723, + "grad_norm": 0.661775529384613, + "learning_rate": 6.978422029485251e-06, + "loss": 1.0315, + "step": 42330 + }, + { + "epoch": 0.6099370471210223, + "grad_norm": 0.5916600823402405, + "learning_rate": 6.973974756890231e-06, + "loss": 1.0193, + "step": 42340 + }, + { + "epoch": 0.6100811040523215, + "grad_norm": 0.6581755876541138, + "learning_rate": 6.969528143019158e-06, + "loss": 1.0378, + "step": 42350 + }, + { + "epoch": 0.6102251609836207, + "grad_norm": 0.6407250761985779, + "learning_rate": 6.965082188840007e-06, + "loss": 1.0293, + "step": 42360 + }, + { + "epoch": 0.6103692179149199, + "grad_norm": 0.604726254940033, + "learning_rate": 6.9606368953205965e-06, + "loss": 1.025, + "step": 42370 + }, + { + "epoch": 0.6105132748462192, + "grad_norm": 0.6147050261497498, + "learning_rate": 6.956192263428603e-06, + "loss": 1.0104, + "step": 42380 + }, + { + "epoch": 0.6106573317775185, + "grad_norm": 0.6224717497825623, + "learning_rate": 6.951748294131567e-06, + "loss": 1.0144, + "step": 42390 + }, + { + "epoch": 0.6108013887088177, + "grad_norm": 0.7217327356338501, + "learning_rate": 6.947304988396879e-06, + "loss": 1.0395, + "step": 42400 + }, + { + "epoch": 0.610945445640117, + "grad_norm": 0.6371968388557434, + "learning_rate": 6.942862347191777e-06, + "loss": 1.0182, + "step": 42410 + }, + { + "epoch": 0.6110895025714163, + "grad_norm": 0.5764115452766418, + "learning_rate": 6.938420371483374e-06, + "loss": 1.0339, + "step": 42420 + }, + { + "epoch": 0.6112335595027155, + "grad_norm": 0.5761376023292542, + "learning_rate": 6.9339790622386165e-06, + "loss": 1.0158, + "step": 42430 + }, + { + "epoch": 0.6113776164340147, + "grad_norm": 0.5998538732528687, + "learning_rate": 6.929538420424327e-06, + "loss": 1.0219, + "step": 42440 + }, + { + "epoch": 0.6115216733653139, + "grad_norm": 0.599068284034729, + "learning_rate": 6.925098447007163e-06, + "loss": 1.0374, + "step": 42450 + }, + { + "epoch": 0.6116657302966132, + "grad_norm": 0.6097978949546814, + "learning_rate": 6.920659142953649e-06, + "loss": 0.9993, + "step": 42460 + }, + { + "epoch": 0.6118097872279125, + "grad_norm": 0.5823416113853455, + "learning_rate": 6.916220509230161e-06, + "loss": 1.0009, + "step": 42470 + }, + { + "epoch": 0.6119538441592117, + "grad_norm": 0.5852572917938232, + "learning_rate": 6.9117825468029276e-06, + "loss": 1.0335, + "step": 42480 + }, + { + "epoch": 0.612097901090511, + "grad_norm": 0.6183345913887024, + "learning_rate": 6.90734525663803e-06, + "loss": 1.031, + "step": 42490 + }, + { + "epoch": 0.6122419580218103, + "grad_norm": 0.7070101499557495, + "learning_rate": 6.902908639701408e-06, + "loss": 1.0149, + "step": 42500 + }, + { + "epoch": 0.6123860149531095, + "grad_norm": 0.6024273037910461, + "learning_rate": 6.898472696958853e-06, + "loss": 1.0142, + "step": 42510 + }, + { + "epoch": 0.6125300718844087, + "grad_norm": 0.5777057409286499, + "learning_rate": 6.894037429376e-06, + "loss": 1.0045, + "step": 42520 + }, + { + "epoch": 0.6126741288157079, + "grad_norm": 0.7443797588348389, + "learning_rate": 6.889602837918357e-06, + "loss": 1.0165, + "step": 42530 + }, + { + "epoch": 0.6128181857470072, + "grad_norm": 0.5777792930603027, + "learning_rate": 6.885168923551265e-06, + "loss": 0.9996, + "step": 42540 + }, + { + "epoch": 0.6129622426783065, + "grad_norm": 0.5409834980964661, + "learning_rate": 6.880735687239932e-06, + "loss": 1.0398, + "step": 42550 + }, + { + "epoch": 0.6131062996096057, + "grad_norm": 0.6305665373802185, + "learning_rate": 6.876303129949411e-06, + "loss": 1.0203, + "step": 42560 + }, + { + "epoch": 0.613250356540905, + "grad_norm": 0.6016412377357483, + "learning_rate": 6.871871252644602e-06, + "loss": 1.0159, + "step": 42570 + }, + { + "epoch": 0.6133944134722042, + "grad_norm": 0.6214472651481628, + "learning_rate": 6.867440056290274e-06, + "loss": 1.0169, + "step": 42580 + }, + { + "epoch": 0.6135384704035035, + "grad_norm": 0.6808599829673767, + "learning_rate": 6.863009541851031e-06, + "loss": 1.0195, + "step": 42590 + }, + { + "epoch": 0.6136825273348027, + "grad_norm": 0.6278876662254333, + "learning_rate": 6.858579710291333e-06, + "loss": 1.0107, + "step": 42600 + }, + { + "epoch": 0.6138265842661019, + "grad_norm": 0.5400162935256958, + "learning_rate": 6.854150562575499e-06, + "loss": 1.0244, + "step": 42610 + }, + { + "epoch": 0.6139706411974012, + "grad_norm": 0.6503427624702454, + "learning_rate": 6.84972209966769e-06, + "loss": 1.0059, + "step": 42620 + }, + { + "epoch": 0.6141146981287005, + "grad_norm": 0.5955107808113098, + "learning_rate": 6.845294322531919e-06, + "loss": 1.0378, + "step": 42630 + }, + { + "epoch": 0.6142587550599997, + "grad_norm": 0.5728549361228943, + "learning_rate": 6.840867232132058e-06, + "loss": 0.9921, + "step": 42640 + }, + { + "epoch": 0.614402811991299, + "grad_norm": 0.9009476900100708, + "learning_rate": 6.836440829431818e-06, + "loss": 1.0149, + "step": 42650 + }, + { + "epoch": 0.6145468689225982, + "grad_norm": 0.7177078723907471, + "learning_rate": 6.83201511539477e-06, + "loss": 1.029, + "step": 42660 + }, + { + "epoch": 0.6146909258538975, + "grad_norm": 0.6054080128669739, + "learning_rate": 6.827590090984331e-06, + "loss": 1.0007, + "step": 42670 + }, + { + "epoch": 0.6148349827851967, + "grad_norm": 0.611545979976654, + "learning_rate": 6.82316575716376e-06, + "loss": 1.0054, + "step": 42680 + }, + { + "epoch": 0.6149790397164959, + "grad_norm": 0.5639975666999817, + "learning_rate": 6.818742114896185e-06, + "loss": 1.0116, + "step": 42690 + }, + { + "epoch": 0.6151230966477952, + "grad_norm": 0.5729649662971497, + "learning_rate": 6.814319165144564e-06, + "loss": 1.0003, + "step": 42700 + }, + { + "epoch": 0.6152671535790945, + "grad_norm": 0.5668555498123169, + "learning_rate": 6.809896908871713e-06, + "loss": 1.0205, + "step": 42710 + }, + { + "epoch": 0.6154112105103937, + "grad_norm": 0.5614668130874634, + "learning_rate": 6.8054753470402975e-06, + "loss": 1.0034, + "step": 42720 + }, + { + "epoch": 0.615555267441693, + "grad_norm": 0.5896543264389038, + "learning_rate": 6.801054480612832e-06, + "loss": 0.9952, + "step": 42730 + }, + { + "epoch": 0.6156993243729922, + "grad_norm": 0.6210322380065918, + "learning_rate": 6.7966343105516755e-06, + "loss": 1.0286, + "step": 42740 + }, + { + "epoch": 0.6158433813042915, + "grad_norm": 0.6250858306884766, + "learning_rate": 6.79221483781904e-06, + "loss": 1.0118, + "step": 42750 + }, + { + "epoch": 0.6159874382355907, + "grad_norm": 0.6140739321708679, + "learning_rate": 6.7877960633769824e-06, + "loss": 1.032, + "step": 42760 + }, + { + "epoch": 0.6161314951668899, + "grad_norm": 0.5905389785766602, + "learning_rate": 6.783377988187407e-06, + "loss": 1.0275, + "step": 42770 + }, + { + "epoch": 0.6162755520981892, + "grad_norm": 0.6868119239807129, + "learning_rate": 6.778960613212073e-06, + "loss": 1.0188, + "step": 42780 + }, + { + "epoch": 0.6164196090294884, + "grad_norm": 0.6418941617012024, + "learning_rate": 6.774543939412579e-06, + "loss": 1.0118, + "step": 42790 + }, + { + "epoch": 0.6165636659607877, + "grad_norm": 0.6080740094184875, + "learning_rate": 6.770127967750374e-06, + "loss": 1.0286, + "step": 42800 + }, + { + "epoch": 0.616707722892087, + "grad_norm": 0.6652899384498596, + "learning_rate": 6.765712699186757e-06, + "loss": 1.0483, + "step": 42810 + }, + { + "epoch": 0.6168517798233862, + "grad_norm": 0.6463445425033569, + "learning_rate": 6.761298134682862e-06, + "loss": 1.0196, + "step": 42820 + }, + { + "epoch": 0.6169958367546855, + "grad_norm": 0.658723771572113, + "learning_rate": 6.756884275199691e-06, + "loss": 1.0043, + "step": 42830 + }, + { + "epoch": 0.6171398936859847, + "grad_norm": 0.6873076558113098, + "learning_rate": 6.7524711216980745e-06, + "loss": 1.0129, + "step": 42840 + }, + { + "epoch": 0.6172839506172839, + "grad_norm": 0.5704846382141113, + "learning_rate": 6.748058675138691e-06, + "loss": 1.0447, + "step": 42850 + }, + { + "epoch": 0.6174280075485832, + "grad_norm": 0.6206031441688538, + "learning_rate": 6.743646936482075e-06, + "loss": 1.0307, + "step": 42860 + }, + { + "epoch": 0.6175720644798824, + "grad_norm": 0.6279870271682739, + "learning_rate": 6.739235906688602e-06, + "loss": 1.0138, + "step": 42870 + }, + { + "epoch": 0.6177161214111817, + "grad_norm": 0.657698392868042, + "learning_rate": 6.734825586718485e-06, + "loss": 1.0203, + "step": 42880 + }, + { + "epoch": 0.617860178342481, + "grad_norm": 0.5187119245529175, + "learning_rate": 6.730415977531799e-06, + "loss": 1.0308, + "step": 42890 + }, + { + "epoch": 0.6180042352737802, + "grad_norm": 0.5883438587188721, + "learning_rate": 6.726007080088447e-06, + "loss": 1.0359, + "step": 42900 + }, + { + "epoch": 0.6181482922050795, + "grad_norm": 0.5652337670326233, + "learning_rate": 6.721598895348192e-06, + "loss": 1.0271, + "step": 42910 + }, + { + "epoch": 0.6182923491363786, + "grad_norm": 0.6093305945396423, + "learning_rate": 6.717191424270631e-06, + "loss": 1.0251, + "step": 42920 + }, + { + "epoch": 0.6184364060676779, + "grad_norm": 0.5216379165649414, + "learning_rate": 6.712784667815209e-06, + "loss": 1.006, + "step": 42930 + }, + { + "epoch": 0.6185804629989772, + "grad_norm": 0.6014308333396912, + "learning_rate": 6.70837862694122e-06, + "loss": 1.0324, + "step": 42940 + }, + { + "epoch": 0.6187245199302764, + "grad_norm": 0.6771434545516968, + "learning_rate": 6.703973302607798e-06, + "loss": 1.01, + "step": 42950 + }, + { + "epoch": 0.6188685768615757, + "grad_norm": 0.778387188911438, + "learning_rate": 6.699568695773913e-06, + "loss": 1.0241, + "step": 42960 + }, + { + "epoch": 0.619012633792875, + "grad_norm": 0.5566554665565491, + "learning_rate": 6.6951648073984e-06, + "loss": 1.0038, + "step": 42970 + }, + { + "epoch": 0.6191566907241742, + "grad_norm": 0.5816327333450317, + "learning_rate": 6.690761638439915e-06, + "loss": 1.0235, + "step": 42980 + }, + { + "epoch": 0.6193007476554735, + "grad_norm": 0.5478515625, + "learning_rate": 6.686359189856969e-06, + "loss": 1.0165, + "step": 42990 + }, + { + "epoch": 0.6194448045867726, + "grad_norm": 0.6159368753433228, + "learning_rate": 6.681957462607918e-06, + "loss": 1.0179, + "step": 43000 + }, + { + "epoch": 0.6195888615180719, + "grad_norm": 0.580099880695343, + "learning_rate": 6.6775564576509534e-06, + "loss": 1.018, + "step": 43010 + }, + { + "epoch": 0.6197329184493712, + "grad_norm": 0.6514555215835571, + "learning_rate": 6.6731561759441155e-06, + "loss": 1.0196, + "step": 43020 + }, + { + "epoch": 0.6198769753806704, + "grad_norm": 0.61688232421875, + "learning_rate": 6.668756618445285e-06, + "loss": 1.0349, + "step": 43030 + }, + { + "epoch": 0.6200210323119697, + "grad_norm": 0.721116840839386, + "learning_rate": 6.664357786112181e-06, + "loss": 1.0309, + "step": 43040 + }, + { + "epoch": 0.620165089243269, + "grad_norm": 0.5536653995513916, + "learning_rate": 6.659959679902376e-06, + "loss": 1.015, + "step": 43050 + }, + { + "epoch": 0.6203091461745682, + "grad_norm": 0.5684986114501953, + "learning_rate": 6.655562300773273e-06, + "loss": 1.0214, + "step": 43060 + }, + { + "epoch": 0.6204532031058675, + "grad_norm": 0.6205838322639465, + "learning_rate": 6.651165649682115e-06, + "loss": 1.0194, + "step": 43070 + }, + { + "epoch": 0.6205972600371666, + "grad_norm": 0.6007439494132996, + "learning_rate": 6.646769727586004e-06, + "loss": 1.0302, + "step": 43080 + }, + { + "epoch": 0.6207413169684659, + "grad_norm": 0.7089323401451111, + "learning_rate": 6.642374535441865e-06, + "loss": 1.0239, + "step": 43090 + }, + { + "epoch": 0.6208853738997652, + "grad_norm": 0.7257370948791504, + "learning_rate": 6.637980074206467e-06, + "loss": 1.0189, + "step": 43100 + }, + { + "epoch": 0.6210294308310644, + "grad_norm": 0.5735003352165222, + "learning_rate": 6.6335863448364305e-06, + "loss": 1.0265, + "step": 43110 + }, + { + "epoch": 0.6211734877623637, + "grad_norm": 0.5978228449821472, + "learning_rate": 6.629193348288205e-06, + "loss": 1.0179, + "step": 43120 + }, + { + "epoch": 0.621317544693663, + "grad_norm": 0.6640300750732422, + "learning_rate": 6.624801085518089e-06, + "loss": 1.0142, + "step": 43130 + }, + { + "epoch": 0.6214616016249622, + "grad_norm": 0.6534985303878784, + "learning_rate": 6.620409557482216e-06, + "loss": 1.0149, + "step": 43140 + }, + { + "epoch": 0.6216056585562615, + "grad_norm": 0.608762800693512, + "learning_rate": 6.616018765136558e-06, + "loss": 1.0123, + "step": 43150 + }, + { + "epoch": 0.6217497154875606, + "grad_norm": 0.5738025903701782, + "learning_rate": 6.611628709436937e-06, + "loss": 1.0383, + "step": 43160 + }, + { + "epoch": 0.6218937724188599, + "grad_norm": 0.5874068140983582, + "learning_rate": 6.607239391339002e-06, + "loss": 0.9957, + "step": 43170 + }, + { + "epoch": 0.6220378293501592, + "grad_norm": 0.6023411750793457, + "learning_rate": 6.602850811798245e-06, + "loss": 1.0301, + "step": 43180 + }, + { + "epoch": 0.6221818862814584, + "grad_norm": 0.8262518048286438, + "learning_rate": 6.598462971770006e-06, + "loss": 1.0215, + "step": 43190 + }, + { + "epoch": 0.6223259432127577, + "grad_norm": 0.7225953936576843, + "learning_rate": 6.594075872209456e-06, + "loss": 1.0256, + "step": 43200 + }, + { + "epoch": 0.622470000144057, + "grad_norm": 0.5936123132705688, + "learning_rate": 6.589689514071598e-06, + "loss": 1.0334, + "step": 43210 + }, + { + "epoch": 0.6226140570753562, + "grad_norm": 0.6910178065299988, + "learning_rate": 6.585303898311293e-06, + "loss": 1.001, + "step": 43220 + }, + { + "epoch": 0.6227581140066555, + "grad_norm": 0.5673627853393555, + "learning_rate": 6.580919025883218e-06, + "loss": 0.9986, + "step": 43230 + }, + { + "epoch": 0.6229021709379546, + "grad_norm": 0.5227172374725342, + "learning_rate": 6.5765348977419106e-06, + "loss": 1.0018, + "step": 43240 + }, + { + "epoch": 0.6230462278692539, + "grad_norm": 0.572830080986023, + "learning_rate": 6.572151514841727e-06, + "loss": 1.0269, + "step": 43250 + }, + { + "epoch": 0.6231902848005532, + "grad_norm": 0.6719442009925842, + "learning_rate": 6.567768878136869e-06, + "loss": 1.0249, + "step": 43260 + }, + { + "epoch": 0.6233343417318524, + "grad_norm": 0.5868476629257202, + "learning_rate": 6.563386988581382e-06, + "loss": 1.0237, + "step": 43270 + }, + { + "epoch": 0.6234783986631517, + "grad_norm": 0.5998367071151733, + "learning_rate": 6.559005847129139e-06, + "loss": 1.0069, + "step": 43280 + }, + { + "epoch": 0.623622455594451, + "grad_norm": 0.6530718207359314, + "learning_rate": 6.554625454733852e-06, + "loss": 1.0166, + "step": 43290 + }, + { + "epoch": 0.6237665125257502, + "grad_norm": 0.6501248478889465, + "learning_rate": 6.550245812349077e-06, + "loss": 0.9883, + "step": 43300 + }, + { + "epoch": 0.6239105694570495, + "grad_norm": 0.5071753263473511, + "learning_rate": 6.545866920928199e-06, + "loss": 1.0081, + "step": 43310 + }, + { + "epoch": 0.6240546263883486, + "grad_norm": 0.6153305172920227, + "learning_rate": 6.5414887814244396e-06, + "loss": 1.0083, + "step": 43320 + }, + { + "epoch": 0.6241986833196479, + "grad_norm": 0.6175336837768555, + "learning_rate": 6.537111394790865e-06, + "loss": 0.9848, + "step": 43330 + }, + { + "epoch": 0.6243427402509472, + "grad_norm": 0.7310288548469543, + "learning_rate": 6.532734761980368e-06, + "loss": 1.0238, + "step": 43340 + }, + { + "epoch": 0.6244867971822464, + "grad_norm": 0.7438796758651733, + "learning_rate": 6.5283588839456836e-06, + "loss": 1.0042, + "step": 43350 + }, + { + "epoch": 0.6246308541135457, + "grad_norm": 0.6033474206924438, + "learning_rate": 6.523983761639381e-06, + "loss": 1.0002, + "step": 43360 + }, + { + "epoch": 0.624774911044845, + "grad_norm": 0.5989162921905518, + "learning_rate": 6.519609396013855e-06, + "loss": 1.0127, + "step": 43370 + }, + { + "epoch": 0.6249189679761442, + "grad_norm": 0.6061357259750366, + "learning_rate": 6.515235788021359e-06, + "loss": 1.0082, + "step": 43380 + }, + { + "epoch": 0.6250630249074435, + "grad_norm": 0.80193692445755, + "learning_rate": 6.510862938613958e-06, + "loss": 1.0086, + "step": 43390 + }, + { + "epoch": 0.6252070818387426, + "grad_norm": 0.6099424362182617, + "learning_rate": 6.506490848743559e-06, + "loss": 1.0282, + "step": 43400 + }, + { + "epoch": 0.6253511387700419, + "grad_norm": 0.630855143070221, + "learning_rate": 6.502119519361911e-06, + "loss": 1.0116, + "step": 43410 + }, + { + "epoch": 0.6254951957013412, + "grad_norm": 0.5582447648048401, + "learning_rate": 6.4977489514205915e-06, + "loss": 1.0261, + "step": 43420 + }, + { + "epoch": 0.6256392526326404, + "grad_norm": 0.6460801362991333, + "learning_rate": 6.4933791458710095e-06, + "loss": 1.0183, + "step": 43430 + }, + { + "epoch": 0.6257833095639397, + "grad_norm": 0.6204429268836975, + "learning_rate": 6.489010103664415e-06, + "loss": 1.0305, + "step": 43440 + }, + { + "epoch": 0.625927366495239, + "grad_norm": 0.5879707336425781, + "learning_rate": 6.484641825751885e-06, + "loss": 1.0109, + "step": 43450 + }, + { + "epoch": 0.6260714234265382, + "grad_norm": 0.6646779775619507, + "learning_rate": 6.480274313084338e-06, + "loss": 1.0228, + "step": 43460 + }, + { + "epoch": 0.6262154803578374, + "grad_norm": 0.5505615472793579, + "learning_rate": 6.475907566612518e-06, + "loss": 1.0212, + "step": 43470 + }, + { + "epoch": 0.6263595372891366, + "grad_norm": 0.6140030026435852, + "learning_rate": 6.471541587287003e-06, + "loss": 1.0113, + "step": 43480 + }, + { + "epoch": 0.6265035942204359, + "grad_norm": 0.5819932818412781, + "learning_rate": 6.467176376058213e-06, + "loss": 1.0014, + "step": 43490 + }, + { + "epoch": 0.6266476511517352, + "grad_norm": 0.574243426322937, + "learning_rate": 6.4628119338763904e-06, + "loss": 1.0067, + "step": 43500 + }, + { + "epoch": 0.6267917080830344, + "grad_norm": 0.5761387348175049, + "learning_rate": 6.45844826169161e-06, + "loss": 1.0292, + "step": 43510 + }, + { + "epoch": 0.6269357650143337, + "grad_norm": 0.639982283115387, + "learning_rate": 6.454085360453792e-06, + "loss": 0.976, + "step": 43520 + }, + { + "epoch": 0.6270798219456329, + "grad_norm": 0.6161608695983887, + "learning_rate": 6.449723231112674e-06, + "loss": 1.027, + "step": 43530 + }, + { + "epoch": 0.6272238788769322, + "grad_norm": 0.678606390953064, + "learning_rate": 6.445361874617831e-06, + "loss": 1.0406, + "step": 43540 + }, + { + "epoch": 0.6273679358082314, + "grad_norm": 0.601266622543335, + "learning_rate": 6.441001291918673e-06, + "loss": 1.0126, + "step": 43550 + }, + { + "epoch": 0.6275119927395306, + "grad_norm": 0.601625919342041, + "learning_rate": 6.436641483964434e-06, + "loss": 1.0175, + "step": 43560 + }, + { + "epoch": 0.6276560496708299, + "grad_norm": 0.5323131084442139, + "learning_rate": 6.432282451704191e-06, + "loss": 1.0012, + "step": 43570 + }, + { + "epoch": 0.6278001066021291, + "grad_norm": 0.6979629993438721, + "learning_rate": 6.427924196086841e-06, + "loss": 1.0329, + "step": 43580 + }, + { + "epoch": 0.6279441635334284, + "grad_norm": 0.5388063788414001, + "learning_rate": 6.423566718061116e-06, + "loss": 1.0152, + "step": 43590 + }, + { + "epoch": 0.6280882204647277, + "grad_norm": 0.649337112903595, + "learning_rate": 6.41921001857558e-06, + "loss": 1.0278, + "step": 43600 + }, + { + "epoch": 0.6282322773960269, + "grad_norm": 0.6489685773849487, + "learning_rate": 6.414854098578629e-06, + "loss": 1.0177, + "step": 43610 + }, + { + "epoch": 0.6283763343273262, + "grad_norm": 0.6887600421905518, + "learning_rate": 6.410498959018476e-06, + "loss": 1.0225, + "step": 43620 + }, + { + "epoch": 0.6285203912586254, + "grad_norm": 0.7380510568618774, + "learning_rate": 6.4061446008431895e-06, + "loss": 1.0289, + "step": 43630 + }, + { + "epoch": 0.6286644481899246, + "grad_norm": 0.6783304810523987, + "learning_rate": 6.401791025000646e-06, + "loss": 1.052, + "step": 43640 + }, + { + "epoch": 0.6288085051212239, + "grad_norm": 0.6634083390235901, + "learning_rate": 6.397438232438553e-06, + "loss": 1.0182, + "step": 43650 + }, + { + "epoch": 0.6289525620525231, + "grad_norm": 0.6285718679428101, + "learning_rate": 6.393086224104464e-06, + "loss": 1.0127, + "step": 43660 + }, + { + "epoch": 0.6290966189838224, + "grad_norm": 0.5838115215301514, + "learning_rate": 6.388735000945745e-06, + "loss": 1.0106, + "step": 43670 + }, + { + "epoch": 0.6292406759151217, + "grad_norm": 0.5893847942352295, + "learning_rate": 6.3843845639096e-06, + "loss": 1.0177, + "step": 43680 + }, + { + "epoch": 0.6293847328464209, + "grad_norm": 0.6196548342704773, + "learning_rate": 6.380034913943058e-06, + "loss": 1.0028, + "step": 43690 + }, + { + "epoch": 0.6295287897777202, + "grad_norm": 0.6291428804397583, + "learning_rate": 6.375686051992975e-06, + "loss": 1.0266, + "step": 43700 + }, + { + "epoch": 0.6296728467090194, + "grad_norm": 0.6694349050521851, + "learning_rate": 6.3713379790060445e-06, + "loss": 1.0248, + "step": 43710 + }, + { + "epoch": 0.6298169036403186, + "grad_norm": 0.6268457174301147, + "learning_rate": 6.366990695928778e-06, + "loss": 1.0144, + "step": 43720 + }, + { + "epoch": 0.6299609605716179, + "grad_norm": 0.652484655380249, + "learning_rate": 6.3626442037075175e-06, + "loss": 1.0055, + "step": 43730 + }, + { + "epoch": 0.6301050175029171, + "grad_norm": 0.6141695380210876, + "learning_rate": 6.358298503288438e-06, + "loss": 1.0028, + "step": 43740 + }, + { + "epoch": 0.6302490744342164, + "grad_norm": 0.6055964827537537, + "learning_rate": 6.35395359561754e-06, + "loss": 1.0261, + "step": 43750 + }, + { + "epoch": 0.6303931313655157, + "grad_norm": 0.5862328410148621, + "learning_rate": 6.349609481640641e-06, + "loss": 1.0067, + "step": 43760 + }, + { + "epoch": 0.6305371882968149, + "grad_norm": 0.6251682639122009, + "learning_rate": 6.345266162303405e-06, + "loss": 1.0071, + "step": 43770 + }, + { + "epoch": 0.6306812452281142, + "grad_norm": 0.6669208407402039, + "learning_rate": 6.340923638551304e-06, + "loss": 1.0134, + "step": 43780 + }, + { + "epoch": 0.6308253021594133, + "grad_norm": 0.6178964376449585, + "learning_rate": 6.336581911329656e-06, + "loss": 1.0239, + "step": 43790 + }, + { + "epoch": 0.6309693590907126, + "grad_norm": 0.7305578589439392, + "learning_rate": 6.332240981583586e-06, + "loss": 1.0052, + "step": 43800 + }, + { + "epoch": 0.6311134160220119, + "grad_norm": 0.6335142850875854, + "learning_rate": 6.327900850258057e-06, + "loss": 1.0306, + "step": 43810 + }, + { + "epoch": 0.6312574729533111, + "grad_norm": 0.641200840473175, + "learning_rate": 6.323561518297857e-06, + "loss": 1.0324, + "step": 43820 + }, + { + "epoch": 0.6314015298846104, + "grad_norm": 0.599928617477417, + "learning_rate": 6.319222986647599e-06, + "loss": 1.0069, + "step": 43830 + }, + { + "epoch": 0.6315455868159097, + "grad_norm": 0.5591249465942383, + "learning_rate": 6.314885256251717e-06, + "loss": 1.0023, + "step": 43840 + }, + { + "epoch": 0.6316896437472089, + "grad_norm": 0.58012455701828, + "learning_rate": 6.310548328054482e-06, + "loss": 1.0185, + "step": 43850 + }, + { + "epoch": 0.6318337006785082, + "grad_norm": 0.5824377536773682, + "learning_rate": 6.30621220299998e-06, + "loss": 1.0052, + "step": 43860 + }, + { + "epoch": 0.6319777576098073, + "grad_norm": 0.6389300227165222, + "learning_rate": 6.3018768820321216e-06, + "loss": 1.0307, + "step": 43870 + }, + { + "epoch": 0.6321218145411066, + "grad_norm": 0.586188793182373, + "learning_rate": 6.297542366094653e-06, + "loss": 1.0025, + "step": 43880 + }, + { + "epoch": 0.6322658714724059, + "grad_norm": 0.6241875290870667, + "learning_rate": 6.293208656131135e-06, + "loss": 1.0152, + "step": 43890 + }, + { + "epoch": 0.6324099284037051, + "grad_norm": 0.6493820548057556, + "learning_rate": 6.28887575308496e-06, + "loss": 1.0012, + "step": 43900 + }, + { + "epoch": 0.6325539853350044, + "grad_norm": 0.6596702933311462, + "learning_rate": 6.284543657899338e-06, + "loss": 1.0102, + "step": 43910 + }, + { + "epoch": 0.6326980422663037, + "grad_norm": 0.6422639489173889, + "learning_rate": 6.280212371517302e-06, + "loss": 1.0222, + "step": 43920 + }, + { + "epoch": 0.6328420991976029, + "grad_norm": 0.6775549650192261, + "learning_rate": 6.275881894881726e-06, + "loss": 1.0628, + "step": 43930 + }, + { + "epoch": 0.6329861561289022, + "grad_norm": 0.5756018161773682, + "learning_rate": 6.271552228935284e-06, + "loss": 1.0296, + "step": 43940 + }, + { + "epoch": 0.6331302130602013, + "grad_norm": 0.5965772867202759, + "learning_rate": 6.267223374620486e-06, + "loss": 1.0149, + "step": 43950 + }, + { + "epoch": 0.6332742699915006, + "grad_norm": 0.5774609446525574, + "learning_rate": 6.262895332879666e-06, + "loss": 1.01, + "step": 43960 + }, + { + "epoch": 0.6334183269227999, + "grad_norm": 0.6798967719078064, + "learning_rate": 6.258568104654981e-06, + "loss": 1.0075, + "step": 43970 + }, + { + "epoch": 0.6335623838540991, + "grad_norm": 0.621671736240387, + "learning_rate": 6.254241690888403e-06, + "loss": 1.0107, + "step": 43980 + }, + { + "epoch": 0.6337064407853984, + "grad_norm": 0.5781314969062805, + "learning_rate": 6.249916092521738e-06, + "loss": 1.032, + "step": 43990 + }, + { + "epoch": 0.6338504977166977, + "grad_norm": 0.5990108251571655, + "learning_rate": 6.245591310496606e-06, + "loss": 1.0236, + "step": 44000 + }, + { + "epoch": 0.6339945546479969, + "grad_norm": 0.5996828079223633, + "learning_rate": 6.241267345754456e-06, + "loss": 0.9997, + "step": 44010 + }, + { + "epoch": 0.6341386115792962, + "grad_norm": 0.6769276857376099, + "learning_rate": 6.236944199236555e-06, + "loss": 1.0181, + "step": 44020 + }, + { + "epoch": 0.6342826685105953, + "grad_norm": 0.5939728021621704, + "learning_rate": 6.2326218718839836e-06, + "loss": 0.9951, + "step": 44030 + }, + { + "epoch": 0.6344267254418946, + "grad_norm": 0.5858749151229858, + "learning_rate": 6.228300364637665e-06, + "loss": 1.0081, + "step": 44040 + }, + { + "epoch": 0.6345707823731939, + "grad_norm": 0.6309555172920227, + "learning_rate": 6.223979678438325e-06, + "loss": 1.0111, + "step": 44050 + }, + { + "epoch": 0.6347148393044931, + "grad_norm": 0.5809255242347717, + "learning_rate": 6.219659814226515e-06, + "loss": 1.0231, + "step": 44060 + }, + { + "epoch": 0.6348588962357924, + "grad_norm": 0.6437588334083557, + "learning_rate": 6.21534077294262e-06, + "loss": 1.0263, + "step": 44070 + }, + { + "epoch": 0.6350029531670917, + "grad_norm": 0.5920225977897644, + "learning_rate": 6.211022555526829e-06, + "loss": 1.0422, + "step": 44080 + }, + { + "epoch": 0.6351470100983909, + "grad_norm": 0.6045619249343872, + "learning_rate": 6.206705162919154e-06, + "loss": 1.0228, + "step": 44090 + }, + { + "epoch": 0.6352910670296902, + "grad_norm": 0.5552858710289001, + "learning_rate": 6.202388596059439e-06, + "loss": 1.0113, + "step": 44100 + }, + { + "epoch": 0.6354351239609893, + "grad_norm": 0.7025449275970459, + "learning_rate": 6.198072855887337e-06, + "loss": 1.0206, + "step": 44110 + }, + { + "epoch": 0.6355791808922886, + "grad_norm": 0.7095867395401001, + "learning_rate": 6.193757943342329e-06, + "loss": 0.9855, + "step": 44120 + }, + { + "epoch": 0.6357232378235879, + "grad_norm": 0.6301383972167969, + "learning_rate": 6.189443859363709e-06, + "loss": 1.0123, + "step": 44130 + }, + { + "epoch": 0.6358672947548871, + "grad_norm": 0.5972219705581665, + "learning_rate": 6.185130604890592e-06, + "loss": 1.0374, + "step": 44140 + }, + { + "epoch": 0.6360113516861864, + "grad_norm": 0.649064838886261, + "learning_rate": 6.180818180861918e-06, + "loss": 1.0203, + "step": 44150 + }, + { + "epoch": 0.6361554086174857, + "grad_norm": 0.5843871235847473, + "learning_rate": 6.176506588216445e-06, + "loss": 1.0171, + "step": 44160 + }, + { + "epoch": 0.6362994655487849, + "grad_norm": 0.6294156312942505, + "learning_rate": 6.172195827892733e-06, + "loss": 0.9945, + "step": 44170 + }, + { + "epoch": 0.6364435224800842, + "grad_norm": 0.5691481828689575, + "learning_rate": 6.167885900829193e-06, + "loss": 1.0032, + "step": 44180 + }, + { + "epoch": 0.6365875794113833, + "grad_norm": 0.5907384157180786, + "learning_rate": 6.163576807964024e-06, + "loss": 1.0088, + "step": 44190 + }, + { + "epoch": 0.6367316363426826, + "grad_norm": 0.555263876914978, + "learning_rate": 6.15926855023526e-06, + "loss": 1.0111, + "step": 44200 + }, + { + "epoch": 0.6368756932739819, + "grad_norm": 0.5968215465545654, + "learning_rate": 6.154961128580752e-06, + "loss": 1.0179, + "step": 44210 + }, + { + "epoch": 0.6370197502052811, + "grad_norm": 0.6137616634368896, + "learning_rate": 6.150654543938161e-06, + "loss": 0.991, + "step": 44220 + }, + { + "epoch": 0.6371638071365804, + "grad_norm": 0.5981063842773438, + "learning_rate": 6.146348797244978e-06, + "loss": 1.0065, + "step": 44230 + }, + { + "epoch": 0.6373078640678796, + "grad_norm": 0.6552395820617676, + "learning_rate": 6.142043889438499e-06, + "loss": 1.0328, + "step": 44240 + }, + { + "epoch": 0.6374519209991789, + "grad_norm": 0.5807239413261414, + "learning_rate": 6.1377398214558435e-06, + "loss": 1.0153, + "step": 44250 + }, + { + "epoch": 0.6375959779304782, + "grad_norm": 0.626849353313446, + "learning_rate": 6.1334365942339505e-06, + "loss": 0.9922, + "step": 44260 + }, + { + "epoch": 0.6377400348617773, + "grad_norm": 0.6650713086128235, + "learning_rate": 6.129134208709572e-06, + "loss": 1.0205, + "step": 44270 + }, + { + "epoch": 0.6378840917930766, + "grad_norm": 0.6666887998580933, + "learning_rate": 6.124832665819274e-06, + "loss": 1.0311, + "step": 44280 + }, + { + "epoch": 0.6380281487243759, + "grad_norm": 0.6410893201828003, + "learning_rate": 6.1205319664994515e-06, + "loss": 1.0218, + "step": 44290 + }, + { + "epoch": 0.6381722056556751, + "grad_norm": 0.6107708811759949, + "learning_rate": 6.116232111686302e-06, + "loss": 1.027, + "step": 44300 + }, + { + "epoch": 0.6383162625869744, + "grad_norm": 0.6178907155990601, + "learning_rate": 6.1119331023158415e-06, + "loss": 0.9946, + "step": 44310 + }, + { + "epoch": 0.6384603195182736, + "grad_norm": 0.612699568271637, + "learning_rate": 6.1076349393239134e-06, + "loss": 1.0263, + "step": 44320 + }, + { + "epoch": 0.6386043764495729, + "grad_norm": 0.6232872605323792, + "learning_rate": 6.103337623646157e-06, + "loss": 1.0208, + "step": 44330 + }, + { + "epoch": 0.6387484333808722, + "grad_norm": 0.6625041365623474, + "learning_rate": 6.099041156218053e-06, + "loss": 1.0127, + "step": 44340 + }, + { + "epoch": 0.6388924903121713, + "grad_norm": 0.614063560962677, + "learning_rate": 6.094745537974873e-06, + "loss": 1.0228, + "step": 44350 + }, + { + "epoch": 0.6390365472434706, + "grad_norm": 0.6330206394195557, + "learning_rate": 6.0904507698517145e-06, + "loss": 1.0347, + "step": 44360 + }, + { + "epoch": 0.6391806041747699, + "grad_norm": 0.5692284107208252, + "learning_rate": 6.086156852783493e-06, + "loss": 1.0066, + "step": 44370 + }, + { + "epoch": 0.6393246611060691, + "grad_norm": 0.6342631578445435, + "learning_rate": 6.081863787704932e-06, + "loss": 0.9999, + "step": 44380 + }, + { + "epoch": 0.6394687180373684, + "grad_norm": 0.5760719776153564, + "learning_rate": 6.077571575550574e-06, + "loss": 1.0193, + "step": 44390 + }, + { + "epoch": 0.6396127749686676, + "grad_norm": 0.626284658908844, + "learning_rate": 6.0732802172547745e-06, + "loss": 1.011, + "step": 44400 + }, + { + "epoch": 0.6397568318999669, + "grad_norm": 0.6053497195243835, + "learning_rate": 6.068989713751703e-06, + "loss": 1.0192, + "step": 44410 + }, + { + "epoch": 0.6399008888312662, + "grad_norm": 0.6346369385719299, + "learning_rate": 6.06470006597534e-06, + "loss": 1.0104, + "step": 44420 + }, + { + "epoch": 0.6400449457625653, + "grad_norm": 0.6166449189186096, + "learning_rate": 6.060411274859488e-06, + "loss": 1.0118, + "step": 44430 + }, + { + "epoch": 0.6401890026938646, + "grad_norm": 0.7997186779975891, + "learning_rate": 6.056123341337752e-06, + "loss": 1.0358, + "step": 44440 + }, + { + "epoch": 0.6403330596251638, + "grad_norm": 0.71084064245224, + "learning_rate": 6.051836266343561e-06, + "loss": 1.0074, + "step": 44450 + }, + { + "epoch": 0.6404771165564631, + "grad_norm": 0.7153636813163757, + "learning_rate": 6.0475500508101525e-06, + "loss": 1.0221, + "step": 44460 + }, + { + "epoch": 0.6406211734877624, + "grad_norm": 0.5669187903404236, + "learning_rate": 6.043264695670568e-06, + "loss": 1.0162, + "step": 44470 + }, + { + "epoch": 0.6407652304190616, + "grad_norm": 0.6377536058425903, + "learning_rate": 6.038980201857681e-06, + "loss": 1.0407, + "step": 44480 + }, + { + "epoch": 0.6409092873503609, + "grad_norm": 0.5607938766479492, + "learning_rate": 6.0346965703041594e-06, + "loss": 1.0044, + "step": 44490 + }, + { + "epoch": 0.6410533442816602, + "grad_norm": 0.5617562532424927, + "learning_rate": 6.030413801942492e-06, + "loss": 1.015, + "step": 44500 + }, + { + "epoch": 0.6411974012129593, + "grad_norm": 0.6788965463638306, + "learning_rate": 6.026131897704981e-06, + "loss": 1.01, + "step": 44510 + }, + { + "epoch": 0.6413414581442586, + "grad_norm": 0.6697009801864624, + "learning_rate": 6.021850858523737e-06, + "loss": 1.0085, + "step": 44520 + }, + { + "epoch": 0.6414855150755578, + "grad_norm": 0.7084874510765076, + "learning_rate": 6.01757068533068e-06, + "loss": 1.0094, + "step": 44530 + }, + { + "epoch": 0.6416295720068571, + "grad_norm": 0.7182435989379883, + "learning_rate": 6.01329137905755e-06, + "loss": 1.0516, + "step": 44540 + }, + { + "epoch": 0.6417736289381564, + "grad_norm": 0.6141085028648376, + "learning_rate": 6.0090129406358874e-06, + "loss": 1.0195, + "step": 44550 + }, + { + "epoch": 0.6419176858694556, + "grad_norm": 0.633694589138031, + "learning_rate": 6.004735370997055e-06, + "loss": 1.0002, + "step": 44560 + }, + { + "epoch": 0.6420617428007549, + "grad_norm": 0.7533735632896423, + "learning_rate": 6.000458671072218e-06, + "loss": 1.042, + "step": 44570 + }, + { + "epoch": 0.6422057997320542, + "grad_norm": 0.5945227146148682, + "learning_rate": 5.996182841792349e-06, + "loss": 1.0289, + "step": 44580 + }, + { + "epoch": 0.6423498566633533, + "grad_norm": 0.6342747211456299, + "learning_rate": 5.9919078840882485e-06, + "loss": 1.0154, + "step": 44590 + }, + { + "epoch": 0.6424939135946526, + "grad_norm": 0.6527490615844727, + "learning_rate": 5.987633798890509e-06, + "loss": 1.0011, + "step": 44600 + }, + { + "epoch": 0.6426379705259518, + "grad_norm": 0.6016778349876404, + "learning_rate": 5.983360587129537e-06, + "loss": 1.0321, + "step": 44610 + }, + { + "epoch": 0.6427820274572511, + "grad_norm": 0.584942102432251, + "learning_rate": 5.979088249735559e-06, + "loss": 1.0102, + "step": 44620 + }, + { + "epoch": 0.6429260843885504, + "grad_norm": 0.6678653359413147, + "learning_rate": 5.974816787638601e-06, + "loss": 1.006, + "step": 44630 + }, + { + "epoch": 0.6430701413198496, + "grad_norm": 0.6083000898361206, + "learning_rate": 5.970546201768498e-06, + "loss": 1.0164, + "step": 44640 + }, + { + "epoch": 0.6432141982511489, + "grad_norm": 0.6137006878852844, + "learning_rate": 5.966276493054903e-06, + "loss": 1.0366, + "step": 44650 + }, + { + "epoch": 0.6433582551824482, + "grad_norm": 0.6589402556419373, + "learning_rate": 5.962007662427267e-06, + "loss": 0.9968, + "step": 44660 + }, + { + "epoch": 0.6435023121137473, + "grad_norm": 0.6604875326156616, + "learning_rate": 5.957739710814863e-06, + "loss": 1.0236, + "step": 44670 + }, + { + "epoch": 0.6436463690450466, + "grad_norm": 0.6565546989440918, + "learning_rate": 5.953472639146759e-06, + "loss": 1.0022, + "step": 44680 + }, + { + "epoch": 0.6437904259763458, + "grad_norm": 0.650482714176178, + "learning_rate": 5.949206448351838e-06, + "loss": 1.0101, + "step": 44690 + }, + { + "epoch": 0.6439344829076451, + "grad_norm": 0.7212525010108948, + "learning_rate": 5.944941139358795e-06, + "loss": 1.0131, + "step": 44700 + }, + { + "epoch": 0.6440785398389444, + "grad_norm": 0.7000559568405151, + "learning_rate": 5.9406767130961275e-06, + "loss": 1.0256, + "step": 44710 + }, + { + "epoch": 0.6442225967702436, + "grad_norm": 0.6735440492630005, + "learning_rate": 5.936413170492137e-06, + "loss": 1.0272, + "step": 44720 + }, + { + "epoch": 0.6443666537015429, + "grad_norm": 0.569595992565155, + "learning_rate": 5.9321505124749455e-06, + "loss": 0.9982, + "step": 44730 + }, + { + "epoch": 0.6445107106328422, + "grad_norm": 0.7124947309494019, + "learning_rate": 5.927888739972471e-06, + "loss": 1.0342, + "step": 44740 + }, + { + "epoch": 0.6446547675641413, + "grad_norm": 0.6878064274787903, + "learning_rate": 5.923627853912441e-06, + "loss": 1.0333, + "step": 44750 + }, + { + "epoch": 0.6447988244954406, + "grad_norm": 0.5419682860374451, + "learning_rate": 5.919367855222396e-06, + "loss": 1.002, + "step": 44760 + }, + { + "epoch": 0.6449428814267398, + "grad_norm": 0.6019362211227417, + "learning_rate": 5.915108744829674e-06, + "loss": 0.9952, + "step": 44770 + }, + { + "epoch": 0.6450869383580391, + "grad_norm": 0.6707798838615417, + "learning_rate": 5.910850523661429e-06, + "loss": 1.0258, + "step": 44780 + }, + { + "epoch": 0.6452309952893384, + "grad_norm": 0.5261653065681458, + "learning_rate": 5.906593192644615e-06, + "loss": 1.0214, + "step": 44790 + }, + { + "epoch": 0.6453750522206376, + "grad_norm": 0.6842942833900452, + "learning_rate": 5.902336752705992e-06, + "loss": 1.0161, + "step": 44800 + }, + { + "epoch": 0.6455191091519369, + "grad_norm": 0.6605587005615234, + "learning_rate": 5.8980812047721335e-06, + "loss": 1.0134, + "step": 44810 + }, + { + "epoch": 0.645663166083236, + "grad_norm": 0.5872989892959595, + "learning_rate": 5.89382654976941e-06, + "loss": 1.0131, + "step": 44820 + }, + { + "epoch": 0.6458072230145353, + "grad_norm": 0.6560587882995605, + "learning_rate": 5.889572788624001e-06, + "loss": 1.0239, + "step": 44830 + }, + { + "epoch": 0.6459512799458346, + "grad_norm": 0.5824592709541321, + "learning_rate": 5.885319922261894e-06, + "loss": 1.0144, + "step": 44840 + }, + { + "epoch": 0.6460953368771338, + "grad_norm": 0.5636458396911621, + "learning_rate": 5.881067951608881e-06, + "loss": 1.024, + "step": 44850 + }, + { + "epoch": 0.6462393938084331, + "grad_norm": 0.6905114650726318, + "learning_rate": 5.876816877590546e-06, + "loss": 1.0265, + "step": 44860 + }, + { + "epoch": 0.6463834507397324, + "grad_norm": 0.6577576398849487, + "learning_rate": 5.872566701132305e-06, + "loss": 1.0203, + "step": 44870 + }, + { + "epoch": 0.6465275076710316, + "grad_norm": 0.6715118288993835, + "learning_rate": 5.8683174231593485e-06, + "loss": 1.0153, + "step": 44880 + }, + { + "epoch": 0.6466715646023309, + "grad_norm": 0.6628469228744507, + "learning_rate": 5.8640690445966985e-06, + "loss": 1.0208, + "step": 44890 + }, + { + "epoch": 0.64681562153363, + "grad_norm": 0.7258967161178589, + "learning_rate": 5.8598215663691606e-06, + "loss": 1.0065, + "step": 44900 + }, + { + "epoch": 0.6469596784649293, + "grad_norm": 0.6283655762672424, + "learning_rate": 5.8555749894013514e-06, + "loss": 1.0174, + "step": 44910 + }, + { + "epoch": 0.6471037353962286, + "grad_norm": 0.6564602851867676, + "learning_rate": 5.8513293146176975e-06, + "loss": 1.0164, + "step": 44920 + }, + { + "epoch": 0.6472477923275278, + "grad_norm": 0.7157777547836304, + "learning_rate": 5.847084542942419e-06, + "loss": 1.0132, + "step": 44930 + }, + { + "epoch": 0.6473918492588271, + "grad_norm": 0.7218253016471863, + "learning_rate": 5.842840675299546e-06, + "loss": 1.0143, + "step": 44940 + }, + { + "epoch": 0.6475359061901264, + "grad_norm": 0.5819610953330994, + "learning_rate": 5.838597712612911e-06, + "loss": 1.0164, + "step": 44950 + }, + { + "epoch": 0.6476799631214256, + "grad_norm": 0.5980701446533203, + "learning_rate": 5.834355655806148e-06, + "loss": 1.0173, + "step": 44960 + }, + { + "epoch": 0.6478240200527249, + "grad_norm": 0.6305130124092102, + "learning_rate": 5.830114505802693e-06, + "loss": 1.0087, + "step": 44970 + }, + { + "epoch": 0.647968076984024, + "grad_norm": 0.621622622013092, + "learning_rate": 5.825874263525787e-06, + "loss": 1.0086, + "step": 44980 + }, + { + "epoch": 0.6481121339153233, + "grad_norm": 0.6889428496360779, + "learning_rate": 5.821634929898468e-06, + "loss": 1.0297, + "step": 44990 + }, + { + "epoch": 0.6482561908466226, + "grad_norm": 0.6638681292533875, + "learning_rate": 5.817396505843586e-06, + "loss": 1.0162, + "step": 45000 + }, + { + "epoch": 0.6484002477779218, + "grad_norm": 0.57753586769104, + "learning_rate": 5.813158992283788e-06, + "loss": 1.0267, + "step": 45010 + }, + { + "epoch": 0.6485443047092211, + "grad_norm": 0.634385883808136, + "learning_rate": 5.808922390141519e-06, + "loss": 1.0385, + "step": 45020 + }, + { + "epoch": 0.6486883616405204, + "grad_norm": 0.571789562702179, + "learning_rate": 5.804686700339031e-06, + "loss": 1.0261, + "step": 45030 + }, + { + "epoch": 0.6488324185718196, + "grad_norm": 0.6051955819129944, + "learning_rate": 5.800451923798372e-06, + "loss": 1.027, + "step": 45040 + }, + { + "epoch": 0.6489764755031189, + "grad_norm": 0.6441025137901306, + "learning_rate": 5.796218061441393e-06, + "loss": 0.9946, + "step": 45050 + }, + { + "epoch": 0.649120532434418, + "grad_norm": 0.701108992099762, + "learning_rate": 5.79198511418976e-06, + "loss": 0.998, + "step": 45060 + }, + { + "epoch": 0.6492645893657173, + "grad_norm": 0.6444332599639893, + "learning_rate": 5.787753082964913e-06, + "loss": 1.0082, + "step": 45070 + }, + { + "epoch": 0.6494086462970166, + "grad_norm": 0.6186667084693909, + "learning_rate": 5.783521968688107e-06, + "loss": 1.0365, + "step": 45080 + }, + { + "epoch": 0.6495527032283158, + "grad_norm": 0.7480693459510803, + "learning_rate": 5.779291772280408e-06, + "loss": 1.0116, + "step": 45090 + }, + { + "epoch": 0.6496967601596151, + "grad_norm": 0.6325919032096863, + "learning_rate": 5.775062494662664e-06, + "loss": 1.0168, + "step": 45100 + }, + { + "epoch": 0.6498408170909143, + "grad_norm": 0.7014506459236145, + "learning_rate": 5.770834136755532e-06, + "loss": 1.0212, + "step": 45110 + }, + { + "epoch": 0.6499848740222136, + "grad_norm": 0.6091514825820923, + "learning_rate": 5.766606699479468e-06, + "loss": 1.0341, + "step": 45120 + }, + { + "epoch": 0.6501289309535129, + "grad_norm": 0.6190800666809082, + "learning_rate": 5.7623801837547215e-06, + "loss": 0.9984, + "step": 45130 + }, + { + "epoch": 0.650272987884812, + "grad_norm": 0.7025150656700134, + "learning_rate": 5.7581545905013546e-06, + "loss": 1.0082, + "step": 45140 + }, + { + "epoch": 0.6504170448161113, + "grad_norm": 0.641904354095459, + "learning_rate": 5.753929920639216e-06, + "loss": 1.0048, + "step": 45150 + }, + { + "epoch": 0.6505611017474106, + "grad_norm": 0.7571387887001038, + "learning_rate": 5.749706175087961e-06, + "loss": 1.0437, + "step": 45160 + }, + { + "epoch": 0.6507051586787098, + "grad_norm": 0.6438396573066711, + "learning_rate": 5.745483354767038e-06, + "loss": 1.012, + "step": 45170 + }, + { + "epoch": 0.6508492156100091, + "grad_norm": 0.6127173900604248, + "learning_rate": 5.741261460595697e-06, + "loss": 1.0125, + "step": 45180 + }, + { + "epoch": 0.6509932725413083, + "grad_norm": 0.568905234336853, + "learning_rate": 5.737040493492985e-06, + "loss": 0.9998, + "step": 45190 + }, + { + "epoch": 0.6511373294726076, + "grad_norm": 0.6029191017150879, + "learning_rate": 5.732820454377757e-06, + "loss": 1.002, + "step": 45200 + }, + { + "epoch": 0.6512813864039069, + "grad_norm": 0.6624503135681152, + "learning_rate": 5.72860134416864e-06, + "loss": 1.0013, + "step": 45210 + }, + { + "epoch": 0.651425443335206, + "grad_norm": 0.7227169275283813, + "learning_rate": 5.724383163784093e-06, + "loss": 1.0325, + "step": 45220 + }, + { + "epoch": 0.6515695002665053, + "grad_norm": 0.7109741568565369, + "learning_rate": 5.720165914142348e-06, + "loss": 1.0249, + "step": 45230 + }, + { + "epoch": 0.6517135571978045, + "grad_norm": 0.7473984956741333, + "learning_rate": 5.715949596161443e-06, + "loss": 1.027, + "step": 45240 + }, + { + "epoch": 0.6518576141291038, + "grad_norm": 0.6912552118301392, + "learning_rate": 5.7117342107592135e-06, + "loss": 1.0129, + "step": 45250 + }, + { + "epoch": 0.6520016710604031, + "grad_norm": 0.5782814621925354, + "learning_rate": 5.707519758853288e-06, + "loss": 1.0191, + "step": 45260 + }, + { + "epoch": 0.6521457279917023, + "grad_norm": 0.6410259008407593, + "learning_rate": 5.703306241361094e-06, + "loss": 1.015, + "step": 45270 + }, + { + "epoch": 0.6522897849230016, + "grad_norm": 0.6248414516448975, + "learning_rate": 5.699093659199861e-06, + "loss": 1.0051, + "step": 45280 + }, + { + "epoch": 0.6524338418543009, + "grad_norm": 0.6174207925796509, + "learning_rate": 5.694882013286609e-06, + "loss": 1.0286, + "step": 45290 + }, + { + "epoch": 0.6525778987856, + "grad_norm": 0.6624664664268494, + "learning_rate": 5.690671304538151e-06, + "loss": 1.0518, + "step": 45300 + }, + { + "epoch": 0.6527219557168993, + "grad_norm": 0.6410439014434814, + "learning_rate": 5.6864615338711046e-06, + "loss": 1.008, + "step": 45310 + }, + { + "epoch": 0.6528660126481985, + "grad_norm": 0.6226246356964111, + "learning_rate": 5.682252702201874e-06, + "loss": 1.0026, + "step": 45320 + }, + { + "epoch": 0.6530100695794978, + "grad_norm": 0.5791779160499573, + "learning_rate": 5.678044810446668e-06, + "loss": 1.0417, + "step": 45330 + }, + { + "epoch": 0.6531541265107971, + "grad_norm": 0.676311731338501, + "learning_rate": 5.673837859521492e-06, + "loss": 1.0227, + "step": 45340 + }, + { + "epoch": 0.6532981834420963, + "grad_norm": 0.6526930332183838, + "learning_rate": 5.669631850342124e-06, + "loss": 1.009, + "step": 45350 + }, + { + "epoch": 0.6534422403733956, + "grad_norm": 0.5468018651008606, + "learning_rate": 5.665426783824171e-06, + "loss": 1.0152, + "step": 45360 + }, + { + "epoch": 0.6535862973046949, + "grad_norm": 0.5868613123893738, + "learning_rate": 5.66122266088301e-06, + "loss": 1.0262, + "step": 45370 + }, + { + "epoch": 0.653730354235994, + "grad_norm": 0.6021547913551331, + "learning_rate": 5.6570194824338235e-06, + "loss": 1.0243, + "step": 45380 + }, + { + "epoch": 0.6538744111672933, + "grad_norm": 0.6306912302970886, + "learning_rate": 5.652817249391583e-06, + "loss": 1.0468, + "step": 45390 + }, + { + "epoch": 0.6540184680985925, + "grad_norm": 0.6520081758499146, + "learning_rate": 5.648615962671058e-06, + "loss": 1.0283, + "step": 45400 + }, + { + "epoch": 0.6541625250298918, + "grad_norm": 0.5919010639190674, + "learning_rate": 5.644415623186807e-06, + "loss": 1.0154, + "step": 45410 + }, + { + "epoch": 0.6543065819611911, + "grad_norm": 0.637642502784729, + "learning_rate": 5.640216231853195e-06, + "loss": 1.0315, + "step": 45420 + }, + { + "epoch": 0.6544506388924903, + "grad_norm": 0.6275111436843872, + "learning_rate": 5.636017789584365e-06, + "loss": 1.0067, + "step": 45430 + }, + { + "epoch": 0.6545946958237896, + "grad_norm": 0.7290439605712891, + "learning_rate": 5.6318202972942595e-06, + "loss": 1.0065, + "step": 45440 + }, + { + "epoch": 0.6547387527550889, + "grad_norm": 0.5982494354248047, + "learning_rate": 5.627623755896618e-06, + "loss": 1.0209, + "step": 45450 + }, + { + "epoch": 0.654882809686388, + "grad_norm": 0.7246947288513184, + "learning_rate": 5.6234281663049625e-06, + "loss": 1.0423, + "step": 45460 + }, + { + "epoch": 0.6550268666176873, + "grad_norm": 0.6050679087638855, + "learning_rate": 5.619233529432626e-06, + "loss": 0.9994, + "step": 45470 + }, + { + "epoch": 0.6551709235489865, + "grad_norm": 0.600634753704071, + "learning_rate": 5.61503984619272e-06, + "loss": 1.0036, + "step": 45480 + }, + { + "epoch": 0.6553149804802858, + "grad_norm": 0.9534344673156738, + "learning_rate": 5.610847117498141e-06, + "loss": 1.0101, + "step": 45490 + }, + { + "epoch": 0.6554590374115851, + "grad_norm": 0.6611275672912598, + "learning_rate": 5.6066553442616026e-06, + "loss": 0.9951, + "step": 45500 + }, + { + "epoch": 0.6556030943428843, + "grad_norm": 0.6024879217147827, + "learning_rate": 5.602464527395587e-06, + "loss": 1.0037, + "step": 45510 + }, + { + "epoch": 0.6557471512741836, + "grad_norm": 0.557040274143219, + "learning_rate": 5.59827466781238e-06, + "loss": 1.0151, + "step": 45520 + }, + { + "epoch": 0.6558912082054829, + "grad_norm": 0.721174418926239, + "learning_rate": 5.594085766424057e-06, + "loss": 1.0248, + "step": 45530 + }, + { + "epoch": 0.656035265136782, + "grad_norm": 0.7238805294036865, + "learning_rate": 5.589897824142479e-06, + "loss": 1.0108, + "step": 45540 + }, + { + "epoch": 0.6561793220680813, + "grad_norm": 0.6601713299751282, + "learning_rate": 5.5857108418793095e-06, + "loss": 1.027, + "step": 45550 + }, + { + "epoch": 0.6563233789993805, + "grad_norm": 0.7457177639007568, + "learning_rate": 5.581524820545996e-06, + "loss": 1.024, + "step": 45560 + }, + { + "epoch": 0.6564674359306798, + "grad_norm": 0.7239423990249634, + "learning_rate": 5.577339761053775e-06, + "loss": 1.0035, + "step": 45570 + }, + { + "epoch": 0.6566114928619791, + "grad_norm": 0.5816134214401245, + "learning_rate": 5.573155664313679e-06, + "loss": 1.022, + "step": 45580 + }, + { + "epoch": 0.6567555497932783, + "grad_norm": 0.6314454674720764, + "learning_rate": 5.5689725312365254e-06, + "loss": 1.0348, + "step": 45590 + }, + { + "epoch": 0.6568996067245776, + "grad_norm": 0.6644384860992432, + "learning_rate": 5.564790362732922e-06, + "loss": 1.023, + "step": 45600 + }, + { + "epoch": 0.6570436636558769, + "grad_norm": 0.6335042119026184, + "learning_rate": 5.560609159713282e-06, + "loss": 1.0395, + "step": 45610 + }, + { + "epoch": 0.657187720587176, + "grad_norm": 0.616644561290741, + "learning_rate": 5.556428923087781e-06, + "loss": 1.0313, + "step": 45620 + }, + { + "epoch": 0.6573317775184753, + "grad_norm": 0.6697009205818176, + "learning_rate": 5.552249653766401e-06, + "loss": 1.0231, + "step": 45630 + }, + { + "epoch": 0.6574758344497745, + "grad_norm": 0.6900887489318848, + "learning_rate": 5.548071352658918e-06, + "loss": 1.0255, + "step": 45640 + }, + { + "epoch": 0.6576198913810738, + "grad_norm": 0.6656466722488403, + "learning_rate": 5.543894020674887e-06, + "loss": 1.0222, + "step": 45650 + }, + { + "epoch": 0.6577639483123731, + "grad_norm": 0.6971928477287292, + "learning_rate": 5.5397176587236575e-06, + "loss": 1.0136, + "step": 45660 + }, + { + "epoch": 0.6579080052436723, + "grad_norm": 0.630372166633606, + "learning_rate": 5.535542267714362e-06, + "loss": 1.0155, + "step": 45670 + }, + { + "epoch": 0.6580520621749716, + "grad_norm": 0.6088457703590393, + "learning_rate": 5.531367848555925e-06, + "loss": 1.0325, + "step": 45680 + }, + { + "epoch": 0.6581961191062708, + "grad_norm": 0.6991166472434998, + "learning_rate": 5.527194402157065e-06, + "loss": 1.037, + "step": 45690 + }, + { + "epoch": 0.65834017603757, + "grad_norm": 0.7558432221412659, + "learning_rate": 5.5230219294262834e-06, + "loss": 1.009, + "step": 45700 + }, + { + "epoch": 0.6584842329688693, + "grad_norm": 0.6258352398872375, + "learning_rate": 5.518850431271867e-06, + "loss": 0.9941, + "step": 45710 + }, + { + "epoch": 0.6586282899001685, + "grad_norm": 0.7778695821762085, + "learning_rate": 5.514679908601894e-06, + "loss": 1.0382, + "step": 45720 + }, + { + "epoch": 0.6587723468314678, + "grad_norm": 0.5646911263465881, + "learning_rate": 5.510510362324228e-06, + "loss": 1.0158, + "step": 45730 + }, + { + "epoch": 0.6589164037627671, + "grad_norm": 0.6628832817077637, + "learning_rate": 5.5063417933465224e-06, + "loss": 1.0066, + "step": 45740 + }, + { + "epoch": 0.6590604606940663, + "grad_norm": 0.7023528814315796, + "learning_rate": 5.5021742025762245e-06, + "loss": 1.0069, + "step": 45750 + }, + { + "epoch": 0.6592045176253656, + "grad_norm": 0.6527752876281738, + "learning_rate": 5.498007590920549e-06, + "loss": 1.0201, + "step": 45760 + }, + { + "epoch": 0.6593485745566648, + "grad_norm": 0.611611545085907, + "learning_rate": 5.493841959286519e-06, + "loss": 1.0115, + "step": 45770 + }, + { + "epoch": 0.659492631487964, + "grad_norm": 0.5528119206428528, + "learning_rate": 5.489677308580932e-06, + "loss": 1.0375, + "step": 45780 + }, + { + "epoch": 0.6596366884192633, + "grad_norm": 0.6203455924987793, + "learning_rate": 5.485513639710375e-06, + "loss": 1.0061, + "step": 45790 + }, + { + "epoch": 0.6597807453505625, + "grad_norm": 0.6796348690986633, + "learning_rate": 5.481350953581221e-06, + "loss": 1.0151, + "step": 45800 + }, + { + "epoch": 0.6599248022818618, + "grad_norm": 0.6706281900405884, + "learning_rate": 5.47718925109963e-06, + "loss": 1.0089, + "step": 45810 + }, + { + "epoch": 0.660068859213161, + "grad_norm": 0.6729138493537903, + "learning_rate": 5.473028533171543e-06, + "loss": 1.0127, + "step": 45820 + }, + { + "epoch": 0.6602129161444603, + "grad_norm": 0.6415603756904602, + "learning_rate": 5.468868800702698e-06, + "loss": 0.9967, + "step": 45830 + }, + { + "epoch": 0.6603569730757596, + "grad_norm": 0.565737247467041, + "learning_rate": 5.464710054598607e-06, + "loss": 1.0264, + "step": 45840 + }, + { + "epoch": 0.6605010300070588, + "grad_norm": 0.6968961358070374, + "learning_rate": 5.460552295764575e-06, + "loss": 1.0321, + "step": 45850 + }, + { + "epoch": 0.660645086938358, + "grad_norm": 0.638938844203949, + "learning_rate": 5.456395525105686e-06, + "loss": 1.0028, + "step": 45860 + }, + { + "epoch": 0.6607891438696573, + "grad_norm": 0.7692632675170898, + "learning_rate": 5.452239743526808e-06, + "loss": 1.001, + "step": 45870 + }, + { + "epoch": 0.6609332008009565, + "grad_norm": 0.7068073749542236, + "learning_rate": 5.448084951932604e-06, + "loss": 1.0274, + "step": 45880 + }, + { + "epoch": 0.6610772577322558, + "grad_norm": 0.5941410064697266, + "learning_rate": 5.443931151227517e-06, + "loss": 1.0312, + "step": 45890 + }, + { + "epoch": 0.661221314663555, + "grad_norm": 0.6203605532646179, + "learning_rate": 5.43977834231576e-06, + "loss": 1.0148, + "step": 45900 + }, + { + "epoch": 0.6613653715948543, + "grad_norm": 0.6441105008125305, + "learning_rate": 5.435626526101354e-06, + "loss": 1.0096, + "step": 45910 + }, + { + "epoch": 0.6615094285261536, + "grad_norm": 0.8502092957496643, + "learning_rate": 5.4314757034880876e-06, + "loss": 1.0068, + "step": 45920 + }, + { + "epoch": 0.6616534854574528, + "grad_norm": 0.6751788258552551, + "learning_rate": 5.4273258753795365e-06, + "loss": 1.0128, + "step": 45930 + }, + { + "epoch": 0.661797542388752, + "grad_norm": 0.6241414546966553, + "learning_rate": 5.423177042679064e-06, + "loss": 1.0185, + "step": 45940 + }, + { + "epoch": 0.6619415993200513, + "grad_norm": 0.5784713625907898, + "learning_rate": 5.419029206289811e-06, + "loss": 1.0159, + "step": 45950 + }, + { + "epoch": 0.6620856562513505, + "grad_norm": 0.6208174824714661, + "learning_rate": 5.414882367114701e-06, + "loss": 1.0197, + "step": 45960 + }, + { + "epoch": 0.6622297131826498, + "grad_norm": 0.6757817268371582, + "learning_rate": 5.4107365260564525e-06, + "loss": 1.0115, + "step": 45970 + }, + { + "epoch": 0.662373770113949, + "grad_norm": 0.6738228797912598, + "learning_rate": 5.406591684017552e-06, + "loss": 0.9987, + "step": 45980 + }, + { + "epoch": 0.6625178270452483, + "grad_norm": 0.5967774391174316, + "learning_rate": 5.402447841900278e-06, + "loss": 1.0077, + "step": 45990 + }, + { + "epoch": 0.6626618839765476, + "grad_norm": 0.6555769443511963, + "learning_rate": 5.398305000606685e-06, + "loss": 1.0269, + "step": 46000 + }, + { + "epoch": 0.6628059409078468, + "grad_norm": 0.6483225226402283, + "learning_rate": 5.394163161038608e-06, + "loss": 0.996, + "step": 46010 + }, + { + "epoch": 0.662949997839146, + "grad_norm": 0.6243937611579895, + "learning_rate": 5.390022324097681e-06, + "loss": 1.0199, + "step": 46020 + }, + { + "epoch": 0.6630940547704453, + "grad_norm": 0.614417314529419, + "learning_rate": 5.385882490685297e-06, + "loss": 1.0502, + "step": 46030 + }, + { + "epoch": 0.6632381117017445, + "grad_norm": 0.7182544469833374, + "learning_rate": 5.381743661702638e-06, + "loss": 1.0235, + "step": 46040 + }, + { + "epoch": 0.6633821686330438, + "grad_norm": 0.6448560357093811, + "learning_rate": 5.377605838050679e-06, + "loss": 1.0316, + "step": 46050 + }, + { + "epoch": 0.663526225564343, + "grad_norm": 0.61222904920578, + "learning_rate": 5.373469020630162e-06, + "loss": 1.0101, + "step": 46060 + }, + { + "epoch": 0.6636702824956423, + "grad_norm": 0.6609721183776855, + "learning_rate": 5.369333210341618e-06, + "loss": 1.0079, + "step": 46070 + }, + { + "epoch": 0.6638143394269416, + "grad_norm": 0.6307268738746643, + "learning_rate": 5.365198408085352e-06, + "loss": 1.0144, + "step": 46080 + }, + { + "epoch": 0.6639583963582408, + "grad_norm": 0.5823897123336792, + "learning_rate": 5.361064614761455e-06, + "loss": 1.0328, + "step": 46090 + }, + { + "epoch": 0.66410245328954, + "grad_norm": 0.645470380783081, + "learning_rate": 5.356931831269798e-06, + "loss": 1.0163, + "step": 46100 + }, + { + "epoch": 0.6642465102208392, + "grad_norm": 0.6113483309745789, + "learning_rate": 5.3528000585100305e-06, + "loss": 1.0279, + "step": 46110 + }, + { + "epoch": 0.6643905671521385, + "grad_norm": 0.5694353580474854, + "learning_rate": 5.348669297381583e-06, + "loss": 1.026, + "step": 46120 + }, + { + "epoch": 0.6645346240834378, + "grad_norm": 0.5848360657691956, + "learning_rate": 5.344539548783665e-06, + "loss": 1.0331, + "step": 46130 + }, + { + "epoch": 0.664678681014737, + "grad_norm": 0.633795440196991, + "learning_rate": 5.3404108136152634e-06, + "loss": 1.0262, + "step": 46140 + }, + { + "epoch": 0.6648227379460363, + "grad_norm": 0.5898723006248474, + "learning_rate": 5.336283092775145e-06, + "loss": 1.0066, + "step": 46150 + }, + { + "epoch": 0.6649667948773356, + "grad_norm": 0.5714325308799744, + "learning_rate": 5.332156387161871e-06, + "loss": 1.0232, + "step": 46160 + }, + { + "epoch": 0.6651108518086347, + "grad_norm": 0.5904276967048645, + "learning_rate": 5.3280306976737515e-06, + "loss": 0.9951, + "step": 46170 + }, + { + "epoch": 0.665254908739934, + "grad_norm": 0.6491442918777466, + "learning_rate": 5.323906025208898e-06, + "loss": 1.0156, + "step": 46180 + }, + { + "epoch": 0.6653989656712332, + "grad_norm": 0.5478079319000244, + "learning_rate": 5.319782370665198e-06, + "loss": 1.0167, + "step": 46190 + }, + { + "epoch": 0.6655430226025325, + "grad_norm": 0.6193534731864929, + "learning_rate": 5.315659734940315e-06, + "loss": 0.9997, + "step": 46200 + }, + { + "epoch": 0.6656870795338318, + "grad_norm": 0.6399695873260498, + "learning_rate": 5.311538118931685e-06, + "loss": 1.0263, + "step": 46210 + }, + { + "epoch": 0.665831136465131, + "grad_norm": 0.6257351636886597, + "learning_rate": 5.30741752353653e-06, + "loss": 1.0472, + "step": 46220 + }, + { + "epoch": 0.6659751933964303, + "grad_norm": 0.6219946146011353, + "learning_rate": 5.3032979496518424e-06, + "loss": 1.0001, + "step": 46230 + }, + { + "epoch": 0.6661192503277296, + "grad_norm": 0.6141021251678467, + "learning_rate": 5.2991793981744024e-06, + "loss": 1.0091, + "step": 46240 + }, + { + "epoch": 0.6662633072590287, + "grad_norm": 0.8189109563827515, + "learning_rate": 5.295061870000761e-06, + "loss": 1.0213, + "step": 46250 + }, + { + "epoch": 0.666407364190328, + "grad_norm": 0.5471169352531433, + "learning_rate": 5.290945366027245e-06, + "loss": 1.0116, + "step": 46260 + }, + { + "epoch": 0.6665514211216272, + "grad_norm": 0.6610715985298157, + "learning_rate": 5.286829887149961e-06, + "loss": 1.0138, + "step": 46270 + }, + { + "epoch": 0.6666954780529265, + "grad_norm": 0.6075613498687744, + "learning_rate": 5.282715434264794e-06, + "loss": 0.9949, + "step": 46280 + }, + { + "epoch": 0.6668395349842258, + "grad_norm": 0.7018479704856873, + "learning_rate": 5.2786020082673975e-06, + "loss": 1.014, + "step": 46290 + }, + { + "epoch": 0.666983591915525, + "grad_norm": 0.6411354541778564, + "learning_rate": 5.274489610053219e-06, + "loss": 1.016, + "step": 46300 + }, + { + "epoch": 0.6671276488468243, + "grad_norm": 0.6892482042312622, + "learning_rate": 5.270378240517456e-06, + "loss": 1.0397, + "step": 46310 + }, + { + "epoch": 0.6672717057781236, + "grad_norm": 0.6826410889625549, + "learning_rate": 5.26626790055511e-06, + "loss": 1.0184, + "step": 46320 + }, + { + "epoch": 0.6674157627094227, + "grad_norm": 0.641666054725647, + "learning_rate": 5.262158591060939e-06, + "loss": 1.021, + "step": 46330 + }, + { + "epoch": 0.667559819640722, + "grad_norm": 0.6144629716873169, + "learning_rate": 5.2580503129294855e-06, + "loss": 1.0263, + "step": 46340 + }, + { + "epoch": 0.6677038765720212, + "grad_norm": 0.7096126079559326, + "learning_rate": 5.2539430670550625e-06, + "loss": 1.0034, + "step": 46350 + }, + { + "epoch": 0.6678479335033205, + "grad_norm": 0.6550100445747375, + "learning_rate": 5.249836854331763e-06, + "loss": 1.0139, + "step": 46360 + }, + { + "epoch": 0.6679919904346198, + "grad_norm": 0.6665602922439575, + "learning_rate": 5.245731675653447e-06, + "loss": 1.0414, + "step": 46370 + }, + { + "epoch": 0.668136047365919, + "grad_norm": 0.5964598059654236, + "learning_rate": 5.241627531913765e-06, + "loss": 1.0247, + "step": 46380 + }, + { + "epoch": 0.6682801042972183, + "grad_norm": 0.798501193523407, + "learning_rate": 5.237524424006125e-06, + "loss": 1.0345, + "step": 46390 + }, + { + "epoch": 0.6684241612285176, + "grad_norm": 0.5867642760276794, + "learning_rate": 5.233422352823721e-06, + "loss": 0.9878, + "step": 46400 + }, + { + "epoch": 0.6685682181598167, + "grad_norm": 0.5655377507209778, + "learning_rate": 5.229321319259517e-06, + "loss": 1.0238, + "step": 46410 + }, + { + "epoch": 0.668712275091116, + "grad_norm": 0.6586424708366394, + "learning_rate": 5.225221324206244e-06, + "loss": 0.9973, + "step": 46420 + }, + { + "epoch": 0.6688563320224152, + "grad_norm": 0.579627275466919, + "learning_rate": 5.221122368556426e-06, + "loss": 1.0146, + "step": 46430 + }, + { + "epoch": 0.6690003889537145, + "grad_norm": 0.6135485172271729, + "learning_rate": 5.217024453202348e-06, + "loss": 1.0023, + "step": 46440 + }, + { + "epoch": 0.6691444458850138, + "grad_norm": 0.7205511331558228, + "learning_rate": 5.212927579036058e-06, + "loss": 1.0071, + "step": 46450 + }, + { + "epoch": 0.669288502816313, + "grad_norm": 0.5820243954658508, + "learning_rate": 5.208831746949398e-06, + "loss": 1.018, + "step": 46460 + }, + { + "epoch": 0.6694325597476123, + "grad_norm": 0.6837009787559509, + "learning_rate": 5.204736957833975e-06, + "loss": 1.0251, + "step": 46470 + }, + { + "epoch": 0.6695766166789116, + "grad_norm": 0.6436097025871277, + "learning_rate": 5.200643212581165e-06, + "loss": 1.0201, + "step": 46480 + }, + { + "epoch": 0.6697206736102107, + "grad_norm": 0.5506840348243713, + "learning_rate": 5.196550512082119e-06, + "loss": 1.013, + "step": 46490 + }, + { + "epoch": 0.66986473054151, + "grad_norm": 0.6284917593002319, + "learning_rate": 5.192458857227764e-06, + "loss": 1.0183, + "step": 46500 + }, + { + "epoch": 0.6700087874728092, + "grad_norm": 0.5801482796669006, + "learning_rate": 5.18836824890879e-06, + "loss": 1.0164, + "step": 46510 + }, + { + "epoch": 0.6701528444041085, + "grad_norm": 0.6342283487319946, + "learning_rate": 5.184278688015678e-06, + "loss": 1.0397, + "step": 46520 + }, + { + "epoch": 0.6702969013354078, + "grad_norm": 0.6485182642936707, + "learning_rate": 5.180190175438662e-06, + "loss": 0.9983, + "step": 46530 + }, + { + "epoch": 0.670440958266707, + "grad_norm": 0.614905059337616, + "learning_rate": 5.176102712067755e-06, + "loss": 1.0289, + "step": 46540 + }, + { + "epoch": 0.6705850151980063, + "grad_norm": 0.6317541003227234, + "learning_rate": 5.172016298792743e-06, + "loss": 1.0153, + "step": 46550 + }, + { + "epoch": 0.6707290721293055, + "grad_norm": 0.5996819734573364, + "learning_rate": 5.167930936503176e-06, + "loss": 1.0203, + "step": 46560 + }, + { + "epoch": 0.6708731290606047, + "grad_norm": 0.7232657074928284, + "learning_rate": 5.163846626088394e-06, + "loss": 1.028, + "step": 46570 + }, + { + "epoch": 0.671017185991904, + "grad_norm": 0.6078417897224426, + "learning_rate": 5.159763368437482e-06, + "loss": 0.9903, + "step": 46580 + }, + { + "epoch": 0.6711612429232032, + "grad_norm": 0.6394873261451721, + "learning_rate": 5.15568116443931e-06, + "loss": 1.0242, + "step": 46590 + }, + { + "epoch": 0.6713052998545025, + "grad_norm": 0.6627945899963379, + "learning_rate": 5.151600014982525e-06, + "loss": 1.0312, + "step": 46600 + }, + { + "epoch": 0.6714493567858018, + "grad_norm": 0.7143929600715637, + "learning_rate": 5.1475199209555325e-06, + "loss": 1.0465, + "step": 46610 + }, + { + "epoch": 0.671593413717101, + "grad_norm": 0.7025628685951233, + "learning_rate": 5.143440883246514e-06, + "loss": 1.0271, + "step": 46620 + }, + { + "epoch": 0.6717374706484003, + "grad_norm": 0.6601547002792358, + "learning_rate": 5.1393629027434186e-06, + "loss": 1.0001, + "step": 46630 + }, + { + "epoch": 0.6718815275796995, + "grad_norm": 0.6126776933670044, + "learning_rate": 5.135285980333962e-06, + "loss": 1.0093, + "step": 46640 + }, + { + "epoch": 0.6720255845109987, + "grad_norm": 0.6820292472839355, + "learning_rate": 5.131210116905645e-06, + "loss": 1.0215, + "step": 46650 + }, + { + "epoch": 0.672169641442298, + "grad_norm": 0.6530098915100098, + "learning_rate": 5.127135313345719e-06, + "loss": 0.9993, + "step": 46660 + }, + { + "epoch": 0.6723136983735972, + "grad_norm": 0.5953645706176758, + "learning_rate": 5.123061570541214e-06, + "loss": 1.0027, + "step": 46670 + }, + { + "epoch": 0.6724577553048965, + "grad_norm": 0.6368499994277954, + "learning_rate": 5.118988889378927e-06, + "loss": 1.0375, + "step": 46680 + }, + { + "epoch": 0.6726018122361958, + "grad_norm": 0.699311375617981, + "learning_rate": 5.114917270745426e-06, + "loss": 1.0243, + "step": 46690 + }, + { + "epoch": 0.672745869167495, + "grad_norm": 0.7033451795578003, + "learning_rate": 5.110846715527042e-06, + "loss": 1.0249, + "step": 46700 + }, + { + "epoch": 0.6728899260987943, + "grad_norm": 0.5182821154594421, + "learning_rate": 5.106777224609889e-06, + "loss": 1.0024, + "step": 46710 + }, + { + "epoch": 0.6730339830300935, + "grad_norm": 0.5992767214775085, + "learning_rate": 5.1027087988798294e-06, + "loss": 1.0258, + "step": 46720 + }, + { + "epoch": 0.6731780399613927, + "grad_norm": 0.6217204928398132, + "learning_rate": 5.098641439222502e-06, + "loss": 1.0065, + "step": 46730 + }, + { + "epoch": 0.673322096892692, + "grad_norm": 0.6292192935943604, + "learning_rate": 5.094575146523322e-06, + "loss": 1.0067, + "step": 46740 + }, + { + "epoch": 0.6734661538239912, + "grad_norm": 0.6448467969894409, + "learning_rate": 5.090509921667463e-06, + "loss": 1.023, + "step": 46750 + }, + { + "epoch": 0.6736102107552905, + "grad_norm": 0.850192666053772, + "learning_rate": 5.086445765539867e-06, + "loss": 1.0341, + "step": 46760 + }, + { + "epoch": 0.6737542676865897, + "grad_norm": 0.6021695733070374, + "learning_rate": 5.082382679025246e-06, + "loss": 1.0352, + "step": 46770 + }, + { + "epoch": 0.673898324617889, + "grad_norm": 0.5934077501296997, + "learning_rate": 5.0783206630080725e-06, + "loss": 0.9999, + "step": 46780 + }, + { + "epoch": 0.6740423815491883, + "grad_norm": 0.5981540083885193, + "learning_rate": 5.074259718372601e-06, + "loss": 0.996, + "step": 46790 + }, + { + "epoch": 0.6741864384804875, + "grad_norm": 0.678403377532959, + "learning_rate": 5.0701998460028365e-06, + "loss": 0.9898, + "step": 46800 + }, + { + "epoch": 0.6743304954117867, + "grad_norm": 0.6320533156394958, + "learning_rate": 5.066141046782561e-06, + "loss": 0.9972, + "step": 46810 + }, + { + "epoch": 0.674474552343086, + "grad_norm": 0.5987135767936707, + "learning_rate": 5.062083321595316e-06, + "loss": 1.0253, + "step": 46820 + }, + { + "epoch": 0.6746186092743852, + "grad_norm": 0.6316280961036682, + "learning_rate": 5.058026671324413e-06, + "loss": 1.0177, + "step": 46830 + }, + { + "epoch": 0.6747626662056845, + "grad_norm": 0.622235119342804, + "learning_rate": 5.053971096852925e-06, + "loss": 1.013, + "step": 46840 + }, + { + "epoch": 0.6749067231369837, + "grad_norm": 0.6065994501113892, + "learning_rate": 5.049916599063708e-06, + "loss": 1.0176, + "step": 46850 + }, + { + "epoch": 0.675050780068283, + "grad_norm": 0.6721647381782532, + "learning_rate": 5.045863178839352e-06, + "loss": 1.0146, + "step": 46860 + }, + { + "epoch": 0.6751948369995823, + "grad_norm": 0.6812637448310852, + "learning_rate": 5.041810837062244e-06, + "loss": 1.0335, + "step": 46870 + }, + { + "epoch": 0.6753388939308815, + "grad_norm": 0.7654386162757874, + "learning_rate": 5.037759574614518e-06, + "loss": 1.0149, + "step": 46880 + }, + { + "epoch": 0.6754829508621807, + "grad_norm": 0.6054534912109375, + "learning_rate": 5.033709392378079e-06, + "loss": 0.9968, + "step": 46890 + }, + { + "epoch": 0.67562700779348, + "grad_norm": 0.6277738213539124, + "learning_rate": 5.029660291234595e-06, + "loss": 1.0318, + "step": 46900 + }, + { + "epoch": 0.6757710647247792, + "grad_norm": 0.6766389012336731, + "learning_rate": 5.025612272065499e-06, + "loss": 0.9958, + "step": 46910 + }, + { + "epoch": 0.6759151216560785, + "grad_norm": 0.6448894739151001, + "learning_rate": 5.021565335751985e-06, + "loss": 1.0266, + "step": 46920 + }, + { + "epoch": 0.6760591785873777, + "grad_norm": 0.6101272702217102, + "learning_rate": 5.017519483175026e-06, + "loss": 0.9839, + "step": 46930 + }, + { + "epoch": 0.676203235518677, + "grad_norm": 0.6158223748207092, + "learning_rate": 5.013474715215345e-06, + "loss": 1.0151, + "step": 46940 + }, + { + "epoch": 0.6763472924499763, + "grad_norm": 0.6153269410133362, + "learning_rate": 5.009431032753422e-06, + "loss": 1.024, + "step": 46950 + }, + { + "epoch": 0.6764913493812755, + "grad_norm": 0.6961709260940552, + "learning_rate": 5.005388436669522e-06, + "loss": 1.0205, + "step": 46960 + }, + { + "epoch": 0.6766354063125747, + "grad_norm": 0.6553806662559509, + "learning_rate": 5.001346927843655e-06, + "loss": 1.0261, + "step": 46970 + }, + { + "epoch": 0.676779463243874, + "grad_norm": 0.7360643744468689, + "learning_rate": 4.99730650715561e-06, + "loss": 1.0089, + "step": 46980 + }, + { + "epoch": 0.6769235201751732, + "grad_norm": 0.6369828581809998, + "learning_rate": 4.993267175484928e-06, + "loss": 1.018, + "step": 46990 + }, + { + "epoch": 0.6770675771064725, + "grad_norm": 0.6678699851036072, + "learning_rate": 4.9892289337109065e-06, + "loss": 1.034, + "step": 47000 + }, + { + "epoch": 0.6772116340377717, + "grad_norm": 0.6522915959358215, + "learning_rate": 4.985191782712628e-06, + "loss": 1.0115, + "step": 47010 + }, + { + "epoch": 0.677355690969071, + "grad_norm": 0.6694148182868958, + "learning_rate": 4.981155723368918e-06, + "loss": 1.0043, + "step": 47020 + }, + { + "epoch": 0.6774997479003703, + "grad_norm": 0.6815006136894226, + "learning_rate": 4.977120756558372e-06, + "loss": 0.9957, + "step": 47030 + }, + { + "epoch": 0.6776438048316695, + "grad_norm": 0.7229025363922119, + "learning_rate": 4.973086883159347e-06, + "loss": 1.0124, + "step": 47040 + }, + { + "epoch": 0.6777878617629687, + "grad_norm": 0.6131969690322876, + "learning_rate": 4.9690541040499605e-06, + "loss": 1.0133, + "step": 47050 + }, + { + "epoch": 0.6779319186942679, + "grad_norm": 0.6573668122291565, + "learning_rate": 4.965022420108091e-06, + "loss": 1.0062, + "step": 47060 + }, + { + "epoch": 0.6780759756255672, + "grad_norm": 0.5714235305786133, + "learning_rate": 4.960991832211386e-06, + "loss": 1.0364, + "step": 47070 + }, + { + "epoch": 0.6782200325568665, + "grad_norm": 0.6412636637687683, + "learning_rate": 4.956962341237246e-06, + "loss": 1.0069, + "step": 47080 + }, + { + "epoch": 0.6783640894881657, + "grad_norm": 0.6453899145126343, + "learning_rate": 4.952933948062836e-06, + "loss": 1.0163, + "step": 47090 + }, + { + "epoch": 0.678508146419465, + "grad_norm": 0.6912153363227844, + "learning_rate": 4.948906653565081e-06, + "loss": 1.0182, + "step": 47100 + }, + { + "epoch": 0.6786522033507643, + "grad_norm": 0.5970426201820374, + "learning_rate": 4.944880458620663e-06, + "loss": 1.0248, + "step": 47110 + }, + { + "epoch": 0.6787962602820635, + "grad_norm": 0.7003148794174194, + "learning_rate": 4.940855364106043e-06, + "loss": 1.012, + "step": 47120 + }, + { + "epoch": 0.6789403172133627, + "grad_norm": 0.7518492937088013, + "learning_rate": 4.9368313708974144e-06, + "loss": 0.9998, + "step": 47130 + }, + { + "epoch": 0.6790843741446619, + "grad_norm": 0.6701333522796631, + "learning_rate": 4.932808479870747e-06, + "loss": 1.0081, + "step": 47140 + }, + { + "epoch": 0.6792284310759612, + "grad_norm": 0.6354277729988098, + "learning_rate": 4.928786691901774e-06, + "loss": 1.0086, + "step": 47150 + }, + { + "epoch": 0.6793724880072605, + "grad_norm": 0.6811854839324951, + "learning_rate": 4.924766007865982e-06, + "loss": 0.9978, + "step": 47160 + }, + { + "epoch": 0.6795165449385597, + "grad_norm": 0.8547168374061584, + "learning_rate": 4.920746428638617e-06, + "loss": 1.009, + "step": 47170 + }, + { + "epoch": 0.679660601869859, + "grad_norm": 0.7384396195411682, + "learning_rate": 4.916727955094685e-06, + "loss": 1.0118, + "step": 47180 + }, + { + "epoch": 0.6798046588011583, + "grad_norm": 0.6684306263923645, + "learning_rate": 4.912710588108951e-06, + "loss": 1.0177, + "step": 47190 + }, + { + "epoch": 0.6799487157324575, + "grad_norm": 0.7980669140815735, + "learning_rate": 4.908694328555945e-06, + "loss": 1.0269, + "step": 47200 + }, + { + "epoch": 0.6800927726637567, + "grad_norm": 0.6420135498046875, + "learning_rate": 4.904679177309951e-06, + "loss": 1.0206, + "step": 47210 + }, + { + "epoch": 0.6802368295950559, + "grad_norm": 0.5965870022773743, + "learning_rate": 4.900665135245008e-06, + "loss": 1.016, + "step": 47220 + }, + { + "epoch": 0.6803808865263552, + "grad_norm": 0.6588281393051147, + "learning_rate": 4.896652203234922e-06, + "loss": 1.0155, + "step": 47230 + }, + { + "epoch": 0.6805249434576545, + "grad_norm": 0.6783021688461304, + "learning_rate": 4.89264038215325e-06, + "loss": 1.0047, + "step": 47240 + }, + { + "epoch": 0.6806690003889537, + "grad_norm": 0.6871261596679688, + "learning_rate": 4.888629672873306e-06, + "loss": 1.0126, + "step": 47250 + }, + { + "epoch": 0.680813057320253, + "grad_norm": 0.6095453500747681, + "learning_rate": 4.884620076268179e-06, + "loss": 1.0229, + "step": 47260 + }, + { + "epoch": 0.6809571142515523, + "grad_norm": 0.6512758135795593, + "learning_rate": 4.880611593210691e-06, + "loss": 1.0221, + "step": 47270 + }, + { + "epoch": 0.6811011711828515, + "grad_norm": 0.5744214057922363, + "learning_rate": 4.876604224573432e-06, + "loss": 1.0143, + "step": 47280 + }, + { + "epoch": 0.6812452281141507, + "grad_norm": 0.7972787022590637, + "learning_rate": 4.872597971228762e-06, + "loss": 1.0161, + "step": 47290 + }, + { + "epoch": 0.6813892850454499, + "grad_norm": 0.7407034039497375, + "learning_rate": 4.86859283404878e-06, + "loss": 1.0169, + "step": 47300 + }, + { + "epoch": 0.6815333419767492, + "grad_norm": 0.5624092817306519, + "learning_rate": 4.86458881390535e-06, + "loss": 1.0317, + "step": 47310 + }, + { + "epoch": 0.6816773989080485, + "grad_norm": 0.6977633833885193, + "learning_rate": 4.860585911670093e-06, + "loss": 1.0119, + "step": 47320 + }, + { + "epoch": 0.6818214558393477, + "grad_norm": 0.6715294122695923, + "learning_rate": 4.85658412821438e-06, + "loss": 1.0192, + "step": 47330 + }, + { + "epoch": 0.681965512770647, + "grad_norm": 0.6478626728057861, + "learning_rate": 4.852583464409354e-06, + "loss": 1.0003, + "step": 47340 + }, + { + "epoch": 0.6821095697019463, + "grad_norm": 0.6032857894897461, + "learning_rate": 4.848583921125898e-06, + "loss": 1.0095, + "step": 47350 + }, + { + "epoch": 0.6822536266332455, + "grad_norm": 0.6184641122817993, + "learning_rate": 4.844585499234659e-06, + "loss": 1.0115, + "step": 47360 + }, + { + "epoch": 0.6823976835645447, + "grad_norm": 0.6499150395393372, + "learning_rate": 4.840588199606037e-06, + "loss": 1.0388, + "step": 47370 + }, + { + "epoch": 0.6825417404958439, + "grad_norm": 0.6112958788871765, + "learning_rate": 4.8365920231101905e-06, + "loss": 1.0212, + "step": 47380 + }, + { + "epoch": 0.6826857974271432, + "grad_norm": 0.567670464515686, + "learning_rate": 4.832596970617027e-06, + "loss": 0.9819, + "step": 47390 + }, + { + "epoch": 0.6828298543584425, + "grad_norm": 0.67822265625, + "learning_rate": 4.828603042996227e-06, + "loss": 1.0072, + "step": 47400 + }, + { + "epoch": 0.6829739112897417, + "grad_norm": 0.6570102572441101, + "learning_rate": 4.824610241117198e-06, + "loss": 1.0118, + "step": 47410 + }, + { + "epoch": 0.683117968221041, + "grad_norm": 0.6754155158996582, + "learning_rate": 4.820618565849128e-06, + "loss": 1.0106, + "step": 47420 + }, + { + "epoch": 0.6832620251523402, + "grad_norm": 0.5989766120910645, + "learning_rate": 4.816628018060946e-06, + "loss": 0.9924, + "step": 47430 + }, + { + "epoch": 0.6834060820836394, + "grad_norm": 0.6305521130561829, + "learning_rate": 4.812638598621341e-06, + "loss": 1.0204, + "step": 47440 + }, + { + "epoch": 0.6835501390149387, + "grad_norm": 0.6485593914985657, + "learning_rate": 4.808650308398753e-06, + "loss": 1.0256, + "step": 47450 + }, + { + "epoch": 0.6836941959462379, + "grad_norm": 0.9397467374801636, + "learning_rate": 4.804663148261377e-06, + "loss": 1.029, + "step": 47460 + }, + { + "epoch": 0.6838382528775372, + "grad_norm": 0.8456627726554871, + "learning_rate": 4.8006771190771615e-06, + "loss": 1.0245, + "step": 47470 + }, + { + "epoch": 0.6839823098088365, + "grad_norm": 0.6226358413696289, + "learning_rate": 4.796692221713816e-06, + "loss": 1.0101, + "step": 47480 + }, + { + "epoch": 0.6841263667401357, + "grad_norm": 0.5895450711250305, + "learning_rate": 4.792708457038797e-06, + "loss": 1.037, + "step": 47490 + }, + { + "epoch": 0.684270423671435, + "grad_norm": 1.2034680843353271, + "learning_rate": 4.788725825919306e-06, + "loss": 1.0181, + "step": 47500 + } + ], + "logging_steps": 10, + "max_steps": 69417, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4867563694146191e+18, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +}