diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,38831 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999977398116359, + "eval_steps": 5000, + "global_step": 55305, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00018081506912786112, + "grad_norm": 40.21875, + "learning_rate": 9.999971747693341e-06, + "loss": 17.52, + "step": 10 + }, + { + "epoch": 0.00036163013825572224, + "grad_norm": 40.4375, + "learning_rate": 9.999943495386682e-06, + "loss": 17.8709, + "step": 20 + }, + { + "epoch": 0.0005424452073835833, + "grad_norm": 40.875, + "learning_rate": 9.999915243080021e-06, + "loss": 17.6916, + "step": 30 + }, + { + "epoch": 0.0007232602765114445, + "grad_norm": 39.96875, + "learning_rate": 9.999886990773362e-06, + "loss": 17.9391, + "step": 40 + }, + { + "epoch": 0.0009040753456393055, + "grad_norm": 39.09375, + "learning_rate": 9.999858738466704e-06, + "loss": 17.3349, + "step": 50 + }, + { + "epoch": 0.0010848904147671666, + "grad_norm": 43.1875, + "learning_rate": 9.999830486160043e-06, + "loss": 17.7175, + "step": 60 + }, + { + "epoch": 0.0012657054838950278, + "grad_norm": 38.1875, + "learning_rate": 9.999802233853383e-06, + "loss": 17.9345, + "step": 70 + }, + { + "epoch": 0.001446520553022889, + "grad_norm": 41.875, + "learning_rate": 9.999773981546724e-06, + "loss": 17.5544, + "step": 80 + }, + { + "epoch": 0.0016273356221507501, + "grad_norm": 41.03125, + "learning_rate": 9.999745729240065e-06, + "loss": 17.255, + "step": 90 + }, + { + "epoch": 0.001808150691278611, + "grad_norm": 38.90625, + "learning_rate": 9.999717476933405e-06, + "loss": 17.2562, + "step": 100 + }, + { + "epoch": 0.0019889657604064723, + "grad_norm": 38.71875, + "learning_rate": 9.999689224626746e-06, + "loss": 17.4383, + "step": 110 + }, + { + "epoch": 0.002169780829534333, + "grad_norm": 41.75, + "learning_rate": 9.999660972320085e-06, + "loss": 17.9809, + "step": 120 + }, + { + "epoch": 0.0023505958986621946, + "grad_norm": 38.84375, + "learning_rate": 9.999632720013426e-06, + "loss": 17.7502, + "step": 130 + }, + { + "epoch": 0.0025314109677900556, + "grad_norm": 41.21875, + "learning_rate": 9.999604467706768e-06, + "loss": 17.4189, + "step": 140 + }, + { + "epoch": 0.002712226036917917, + "grad_norm": 39.1875, + "learning_rate": 9.999576215400107e-06, + "loss": 17.8842, + "step": 150 + }, + { + "epoch": 0.002893041106045778, + "grad_norm": 40.75, + "learning_rate": 9.999547963093447e-06, + "loss": 17.6147, + "step": 160 + }, + { + "epoch": 0.003073856175173639, + "grad_norm": 39.1875, + "learning_rate": 9.999519710786788e-06, + "loss": 17.6027, + "step": 170 + }, + { + "epoch": 0.0032546712443015002, + "grad_norm": 39.375, + "learning_rate": 9.999491458480129e-06, + "loss": 17.581, + "step": 180 + }, + { + "epoch": 0.003435486313429361, + "grad_norm": 40.15625, + "learning_rate": 9.99946320617347e-06, + "loss": 17.5783, + "step": 190 + }, + { + "epoch": 0.003616301382557222, + "grad_norm": 39.53125, + "learning_rate": 9.999434953866808e-06, + "loss": 17.3805, + "step": 200 + }, + { + "epoch": 0.0037971164516850836, + "grad_norm": 39.53125, + "learning_rate": 9.999406701560149e-06, + "loss": 17.3353, + "step": 210 + }, + { + "epoch": 0.0039779315208129445, + "grad_norm": 40.25, + "learning_rate": 9.999378449253491e-06, + "loss": 17.6241, + "step": 220 + }, + { + "epoch": 0.004158746589940806, + "grad_norm": 39.03125, + "learning_rate": 9.999350196946832e-06, + "loss": 17.5579, + "step": 230 + }, + { + "epoch": 0.004339561659068666, + "grad_norm": 42.71875, + "learning_rate": 9.99932194464017e-06, + "loss": 17.7354, + "step": 240 + }, + { + "epoch": 0.004520376728196528, + "grad_norm": 40.78125, + "learning_rate": 9.999293692333511e-06, + "loss": 17.3126, + "step": 250 + }, + { + "epoch": 0.004701191797324389, + "grad_norm": 38.1875, + "learning_rate": 9.999265440026852e-06, + "loss": 17.5456, + "step": 260 + }, + { + "epoch": 0.00488200686645225, + "grad_norm": 42.21875, + "learning_rate": 9.999237187720193e-06, + "loss": 17.7592, + "step": 270 + }, + { + "epoch": 0.005062821935580111, + "grad_norm": 41.8125, + "learning_rate": 9.999208935413533e-06, + "loss": 17.3461, + "step": 280 + }, + { + "epoch": 0.0052436370047079725, + "grad_norm": 44.0, + "learning_rate": 9.999180683106872e-06, + "loss": 17.643, + "step": 290 + }, + { + "epoch": 0.005424452073835834, + "grad_norm": 40.21875, + "learning_rate": 9.999152430800213e-06, + "loss": 17.3788, + "step": 300 + }, + { + "epoch": 0.005605267142963694, + "grad_norm": 42.9375, + "learning_rate": 9.999124178493555e-06, + "loss": 17.5415, + "step": 310 + }, + { + "epoch": 0.005786082212091556, + "grad_norm": 43.09375, + "learning_rate": 9.999095926186894e-06, + "loss": 17.9753, + "step": 320 + }, + { + "epoch": 0.005966897281219417, + "grad_norm": 38.375, + "learning_rate": 9.999067673880235e-06, + "loss": 17.3605, + "step": 330 + }, + { + "epoch": 0.006147712350347278, + "grad_norm": 39.875, + "learning_rate": 9.999039421573575e-06, + "loss": 17.8647, + "step": 340 + }, + { + "epoch": 0.006328527419475139, + "grad_norm": 40.0, + "learning_rate": 9.999011169266916e-06, + "loss": 17.6431, + "step": 350 + }, + { + "epoch": 0.0065093424886030005, + "grad_norm": 42.40625, + "learning_rate": 9.998982916960256e-06, + "loss": 17.4284, + "step": 360 + }, + { + "epoch": 0.006690157557730861, + "grad_norm": 41.46875, + "learning_rate": 9.998954664653595e-06, + "loss": 17.6202, + "step": 370 + }, + { + "epoch": 0.006870972626858722, + "grad_norm": 42.3125, + "learning_rate": 9.998926412346936e-06, + "loss": 17.5108, + "step": 380 + }, + { + "epoch": 0.007051787695986584, + "grad_norm": 41.4375, + "learning_rate": 9.998898160040277e-06, + "loss": 17.2915, + "step": 390 + }, + { + "epoch": 0.007232602765114444, + "grad_norm": 41.625, + "learning_rate": 9.998869907733619e-06, + "loss": 17.2113, + "step": 400 + }, + { + "epoch": 0.007413417834242306, + "grad_norm": 39.375, + "learning_rate": 9.998841655426958e-06, + "loss": 17.551, + "step": 410 + }, + { + "epoch": 0.007594232903370167, + "grad_norm": 39.09375, + "learning_rate": 9.998813403120298e-06, + "loss": 17.6326, + "step": 420 + }, + { + "epoch": 0.007775047972498028, + "grad_norm": 39.65625, + "learning_rate": 9.998785150813639e-06, + "loss": 17.1406, + "step": 430 + }, + { + "epoch": 0.007955863041625889, + "grad_norm": 39.03125, + "learning_rate": 9.99875689850698e-06, + "loss": 17.8196, + "step": 440 + }, + { + "epoch": 0.00813667811075375, + "grad_norm": 39.4375, + "learning_rate": 9.99872864620032e-06, + "loss": 17.2403, + "step": 450 + }, + { + "epoch": 0.008317493179881612, + "grad_norm": 40.375, + "learning_rate": 9.99870039389366e-06, + "loss": 17.9381, + "step": 460 + }, + { + "epoch": 0.008498308249009472, + "grad_norm": 41.3125, + "learning_rate": 9.998672141587e-06, + "loss": 17.7649, + "step": 470 + }, + { + "epoch": 0.008679123318137333, + "grad_norm": 38.625, + "learning_rate": 9.99864388928034e-06, + "loss": 17.6915, + "step": 480 + }, + { + "epoch": 0.008859938387265195, + "grad_norm": 39.78125, + "learning_rate": 9.998615636973681e-06, + "loss": 17.434, + "step": 490 + }, + { + "epoch": 0.009040753456393056, + "grad_norm": 42.09375, + "learning_rate": 9.998587384667022e-06, + "loss": 17.4497, + "step": 500 + }, + { + "epoch": 0.009221568525520916, + "grad_norm": 39.0, + "learning_rate": 9.998559132360362e-06, + "loss": 17.5049, + "step": 510 + }, + { + "epoch": 0.009402383594648778, + "grad_norm": 39.4375, + "learning_rate": 9.998530880053703e-06, + "loss": 17.8357, + "step": 520 + }, + { + "epoch": 0.009583198663776639, + "grad_norm": 40.9375, + "learning_rate": 9.998502627747044e-06, + "loss": 17.334, + "step": 530 + }, + { + "epoch": 0.0097640137329045, + "grad_norm": 39.9375, + "learning_rate": 9.998474375440384e-06, + "loss": 17.6756, + "step": 540 + }, + { + "epoch": 0.009944828802032362, + "grad_norm": 40.15625, + "learning_rate": 9.998446123133723e-06, + "loss": 18.0823, + "step": 550 + }, + { + "epoch": 0.010125643871160222, + "grad_norm": 43.5, + "learning_rate": 9.998417870827064e-06, + "loss": 17.8315, + "step": 560 + }, + { + "epoch": 0.010306458940288083, + "grad_norm": 40.0625, + "learning_rate": 9.998389618520406e-06, + "loss": 17.7426, + "step": 570 + }, + { + "epoch": 0.010487274009415945, + "grad_norm": 39.5, + "learning_rate": 9.998361366213745e-06, + "loss": 17.3527, + "step": 580 + }, + { + "epoch": 0.010668089078543806, + "grad_norm": 38.9375, + "learning_rate": 9.998333113907086e-06, + "loss": 17.8237, + "step": 590 + }, + { + "epoch": 0.010848904147671668, + "grad_norm": 39.46875, + "learning_rate": 9.998304861600426e-06, + "loss": 17.406, + "step": 600 + }, + { + "epoch": 0.011029719216799528, + "grad_norm": 40.6875, + "learning_rate": 9.998276609293767e-06, + "loss": 17.7848, + "step": 610 + }, + { + "epoch": 0.011210534285927389, + "grad_norm": 40.5, + "learning_rate": 9.998248356987108e-06, + "loss": 17.4076, + "step": 620 + }, + { + "epoch": 0.011391349355055251, + "grad_norm": 42.5, + "learning_rate": 9.998220104680446e-06, + "loss": 17.6821, + "step": 630 + }, + { + "epoch": 0.011572164424183112, + "grad_norm": 40.125, + "learning_rate": 9.998191852373787e-06, + "loss": 17.5884, + "step": 640 + }, + { + "epoch": 0.011752979493310972, + "grad_norm": 37.4375, + "learning_rate": 9.998163600067128e-06, + "loss": 17.4682, + "step": 650 + }, + { + "epoch": 0.011933794562438834, + "grad_norm": 39.96875, + "learning_rate": 9.99813534776047e-06, + "loss": 17.4463, + "step": 660 + }, + { + "epoch": 0.012114609631566695, + "grad_norm": 41.09375, + "learning_rate": 9.998107095453809e-06, + "loss": 17.7127, + "step": 670 + }, + { + "epoch": 0.012295424700694555, + "grad_norm": 41.34375, + "learning_rate": 9.99807884314715e-06, + "loss": 17.7266, + "step": 680 + }, + { + "epoch": 0.012476239769822418, + "grad_norm": 39.75, + "learning_rate": 9.99805059084049e-06, + "loss": 17.7013, + "step": 690 + }, + { + "epoch": 0.012657054838950278, + "grad_norm": 37.25, + "learning_rate": 9.99802233853383e-06, + "loss": 17.6106, + "step": 700 + }, + { + "epoch": 0.012837869908078139, + "grad_norm": 41.125, + "learning_rate": 9.997994086227171e-06, + "loss": 17.6656, + "step": 710 + }, + { + "epoch": 0.013018684977206001, + "grad_norm": 43.5, + "learning_rate": 9.99796583392051e-06, + "loss": 17.5118, + "step": 720 + }, + { + "epoch": 0.013199500046333862, + "grad_norm": 40.21875, + "learning_rate": 9.997937581613851e-06, + "loss": 17.6598, + "step": 730 + }, + { + "epoch": 0.013380315115461722, + "grad_norm": 41.9375, + "learning_rate": 9.997909329307192e-06, + "loss": 17.5925, + "step": 740 + }, + { + "epoch": 0.013561130184589584, + "grad_norm": 38.28125, + "learning_rate": 9.997881077000532e-06, + "loss": 17.5799, + "step": 750 + }, + { + "epoch": 0.013741945253717445, + "grad_norm": 38.5, + "learning_rate": 9.997852824693873e-06, + "loss": 17.5959, + "step": 760 + }, + { + "epoch": 0.013922760322845305, + "grad_norm": 38.21875, + "learning_rate": 9.997824572387213e-06, + "loss": 17.2816, + "step": 770 + }, + { + "epoch": 0.014103575391973168, + "grad_norm": 39.53125, + "learning_rate": 9.997796320080554e-06, + "loss": 17.6993, + "step": 780 + }, + { + "epoch": 0.014284390461101028, + "grad_norm": 40.71875, + "learning_rate": 9.997768067773895e-06, + "loss": 17.8703, + "step": 790 + }, + { + "epoch": 0.014465205530228889, + "grad_norm": 40.8125, + "learning_rate": 9.997739815467234e-06, + "loss": 17.6, + "step": 800 + }, + { + "epoch": 0.014646020599356751, + "grad_norm": 41.25, + "learning_rate": 9.997711563160574e-06, + "loss": 17.4995, + "step": 810 + }, + { + "epoch": 0.014826835668484611, + "grad_norm": 37.78125, + "learning_rate": 9.997683310853915e-06, + "loss": 17.3134, + "step": 820 + }, + { + "epoch": 0.015007650737612472, + "grad_norm": 40.28125, + "learning_rate": 9.997655058547256e-06, + "loss": 17.6378, + "step": 830 + }, + { + "epoch": 0.015188465806740334, + "grad_norm": 41.625, + "learning_rate": 9.997626806240596e-06, + "loss": 18.0105, + "step": 840 + }, + { + "epoch": 0.015369280875868195, + "grad_norm": 40.625, + "learning_rate": 9.997598553933937e-06, + "loss": 17.5278, + "step": 850 + }, + { + "epoch": 0.015550095944996055, + "grad_norm": 43.21875, + "learning_rate": 9.997570301627277e-06, + "loss": 17.677, + "step": 860 + }, + { + "epoch": 0.015730911014123918, + "grad_norm": 40.5, + "learning_rate": 9.997542049320618e-06, + "loss": 17.4019, + "step": 870 + }, + { + "epoch": 0.015911726083251778, + "grad_norm": 40.5, + "learning_rate": 9.997513797013959e-06, + "loss": 17.5114, + "step": 880 + }, + { + "epoch": 0.01609254115237964, + "grad_norm": 42.28125, + "learning_rate": 9.997485544707298e-06, + "loss": 17.2865, + "step": 890 + }, + { + "epoch": 0.0162733562215075, + "grad_norm": 41.9375, + "learning_rate": 9.997457292400638e-06, + "loss": 17.6203, + "step": 900 + }, + { + "epoch": 0.016454171290635363, + "grad_norm": 41.71875, + "learning_rate": 9.997429040093979e-06, + "loss": 17.9291, + "step": 910 + }, + { + "epoch": 0.016634986359763224, + "grad_norm": 40.8125, + "learning_rate": 9.99740078778732e-06, + "loss": 17.7519, + "step": 920 + }, + { + "epoch": 0.016815801428891084, + "grad_norm": 40.34375, + "learning_rate": 9.99737253548066e-06, + "loss": 17.777, + "step": 930 + }, + { + "epoch": 0.016996616498018945, + "grad_norm": 40.15625, + "learning_rate": 9.997344283174e-06, + "loss": 17.1429, + "step": 940 + }, + { + "epoch": 0.017177431567146805, + "grad_norm": 39.5625, + "learning_rate": 9.997316030867341e-06, + "loss": 17.6653, + "step": 950 + }, + { + "epoch": 0.017358246636274666, + "grad_norm": 39.96875, + "learning_rate": 9.997287778560682e-06, + "loss": 17.7769, + "step": 960 + }, + { + "epoch": 0.01753906170540253, + "grad_norm": 36.65625, + "learning_rate": 9.997259526254023e-06, + "loss": 17.4157, + "step": 970 + }, + { + "epoch": 0.01771987677453039, + "grad_norm": 40.8125, + "learning_rate": 9.997231273947361e-06, + "loss": 17.7347, + "step": 980 + }, + { + "epoch": 0.01790069184365825, + "grad_norm": 43.8125, + "learning_rate": 9.997203021640702e-06, + "loss": 17.3825, + "step": 990 + }, + { + "epoch": 0.01808150691278611, + "grad_norm": 39.0625, + "learning_rate": 9.997174769334043e-06, + "loss": 17.6844, + "step": 1000 + }, + { + "epoch": 0.018262321981913972, + "grad_norm": 40.84375, + "learning_rate": 9.997146517027383e-06, + "loss": 17.6222, + "step": 1010 + }, + { + "epoch": 0.018443137051041832, + "grad_norm": 38.9375, + "learning_rate": 9.997118264720724e-06, + "loss": 17.6019, + "step": 1020 + }, + { + "epoch": 0.018623952120169696, + "grad_norm": 39.96875, + "learning_rate": 9.997090012414065e-06, + "loss": 17.5789, + "step": 1030 + }, + { + "epoch": 0.018804767189297557, + "grad_norm": 42.78125, + "learning_rate": 9.997061760107405e-06, + "loss": 17.1038, + "step": 1040 + }, + { + "epoch": 0.018985582258425417, + "grad_norm": 41.65625, + "learning_rate": 9.997033507800746e-06, + "loss": 17.6586, + "step": 1050 + }, + { + "epoch": 0.019166397327553278, + "grad_norm": 38.9375, + "learning_rate": 9.997005255494085e-06, + "loss": 17.4448, + "step": 1060 + }, + { + "epoch": 0.01934721239668114, + "grad_norm": 41.21875, + "learning_rate": 9.996977003187425e-06, + "loss": 16.9077, + "step": 1070 + }, + { + "epoch": 0.019528027465809, + "grad_norm": 36.1875, + "learning_rate": 9.996948750880766e-06, + "loss": 17.3949, + "step": 1080 + }, + { + "epoch": 0.019708842534936863, + "grad_norm": 40.28125, + "learning_rate": 9.996920498574107e-06, + "loss": 18.0129, + "step": 1090 + }, + { + "epoch": 0.019889657604064723, + "grad_norm": 39.625, + "learning_rate": 9.996892246267447e-06, + "loss": 17.3197, + "step": 1100 + }, + { + "epoch": 0.020070472673192584, + "grad_norm": 40.46875, + "learning_rate": 9.996863993960788e-06, + "loss": 17.4881, + "step": 1110 + }, + { + "epoch": 0.020251287742320444, + "grad_norm": 39.78125, + "learning_rate": 9.996835741654128e-06, + "loss": 17.6111, + "step": 1120 + }, + { + "epoch": 0.020432102811448305, + "grad_norm": 41.59375, + "learning_rate": 9.996807489347469e-06, + "loss": 17.5654, + "step": 1130 + }, + { + "epoch": 0.020612917880576166, + "grad_norm": 39.625, + "learning_rate": 9.99677923704081e-06, + "loss": 17.5175, + "step": 1140 + }, + { + "epoch": 0.02079373294970403, + "grad_norm": 42.03125, + "learning_rate": 9.996750984734149e-06, + "loss": 18.2359, + "step": 1150 + }, + { + "epoch": 0.02097454801883189, + "grad_norm": 41.8125, + "learning_rate": 9.99672273242749e-06, + "loss": 17.6192, + "step": 1160 + }, + { + "epoch": 0.02115536308795975, + "grad_norm": 43.875, + "learning_rate": 9.99669448012083e-06, + "loss": 17.8928, + "step": 1170 + }, + { + "epoch": 0.02133617815708761, + "grad_norm": 39.25, + "learning_rate": 9.99666622781417e-06, + "loss": 17.5018, + "step": 1180 + }, + { + "epoch": 0.02151699322621547, + "grad_norm": 40.75, + "learning_rate": 9.996637975507511e-06, + "loss": 17.8533, + "step": 1190 + }, + { + "epoch": 0.021697808295343336, + "grad_norm": 39.5, + "learning_rate": 9.996609723200852e-06, + "loss": 17.491, + "step": 1200 + }, + { + "epoch": 0.021878623364471196, + "grad_norm": 38.75, + "learning_rate": 9.996581470894192e-06, + "loss": 17.6635, + "step": 1210 + }, + { + "epoch": 0.022059438433599057, + "grad_norm": 38.0625, + "learning_rate": 9.996553218587533e-06, + "loss": 17.7756, + "step": 1220 + }, + { + "epoch": 0.022240253502726917, + "grad_norm": 43.21875, + "learning_rate": 9.996524966280872e-06, + "loss": 17.5359, + "step": 1230 + }, + { + "epoch": 0.022421068571854778, + "grad_norm": 38.53125, + "learning_rate": 9.996496713974213e-06, + "loss": 17.4919, + "step": 1240 + }, + { + "epoch": 0.022601883640982638, + "grad_norm": 40.84375, + "learning_rate": 9.996468461667553e-06, + "loss": 17.6023, + "step": 1250 + }, + { + "epoch": 0.022782698710110502, + "grad_norm": 41.0625, + "learning_rate": 9.996440209360894e-06, + "loss": 17.3854, + "step": 1260 + }, + { + "epoch": 0.022963513779238363, + "grad_norm": 42.78125, + "learning_rate": 9.996411957054234e-06, + "loss": 17.7068, + "step": 1270 + }, + { + "epoch": 0.023144328848366223, + "grad_norm": 40.46875, + "learning_rate": 9.996383704747575e-06, + "loss": 17.4914, + "step": 1280 + }, + { + "epoch": 0.023325143917494084, + "grad_norm": 38.875, + "learning_rate": 9.996355452440916e-06, + "loss": 18.045, + "step": 1290 + }, + { + "epoch": 0.023505958986621944, + "grad_norm": 42.40625, + "learning_rate": 9.996327200134256e-06, + "loss": 17.5793, + "step": 1300 + }, + { + "epoch": 0.023686774055749805, + "grad_norm": 41.09375, + "learning_rate": 9.996298947827597e-06, + "loss": 17.5581, + "step": 1310 + }, + { + "epoch": 0.02386758912487767, + "grad_norm": 41.59375, + "learning_rate": 9.996270695520936e-06, + "loss": 17.4817, + "step": 1320 + }, + { + "epoch": 0.02404840419400553, + "grad_norm": 40.1875, + "learning_rate": 9.996242443214276e-06, + "loss": 17.7449, + "step": 1330 + }, + { + "epoch": 0.02422921926313339, + "grad_norm": 42.84375, + "learning_rate": 9.996214190907617e-06, + "loss": 17.4616, + "step": 1340 + }, + { + "epoch": 0.02441003433226125, + "grad_norm": 38.65625, + "learning_rate": 9.996185938600958e-06, + "loss": 18.0316, + "step": 1350 + }, + { + "epoch": 0.02459084940138911, + "grad_norm": 44.84375, + "learning_rate": 9.996157686294298e-06, + "loss": 17.4498, + "step": 1360 + }, + { + "epoch": 0.02477166447051697, + "grad_norm": 36.1875, + "learning_rate": 9.996129433987639e-06, + "loss": 17.7884, + "step": 1370 + }, + { + "epoch": 0.024952479539644835, + "grad_norm": 37.9375, + "learning_rate": 9.99610118168098e-06, + "loss": 17.6003, + "step": 1380 + }, + { + "epoch": 0.025133294608772696, + "grad_norm": 41.4375, + "learning_rate": 9.99607292937432e-06, + "loss": 17.8514, + "step": 1390 + }, + { + "epoch": 0.025314109677900556, + "grad_norm": 38.875, + "learning_rate": 9.99604467706766e-06, + "loss": 17.455, + "step": 1400 + }, + { + "epoch": 0.025494924747028417, + "grad_norm": 39.84375, + "learning_rate": 9.996016424761e-06, + "loss": 17.4442, + "step": 1410 + }, + { + "epoch": 0.025675739816156277, + "grad_norm": 41.46875, + "learning_rate": 9.99598817245434e-06, + "loss": 17.4482, + "step": 1420 + }, + { + "epoch": 0.025856554885284138, + "grad_norm": 38.0, + "learning_rate": 9.995959920147681e-06, + "loss": 17.3167, + "step": 1430 + }, + { + "epoch": 0.026037369954412002, + "grad_norm": 40.40625, + "learning_rate": 9.995931667841022e-06, + "loss": 17.3963, + "step": 1440 + }, + { + "epoch": 0.026218185023539863, + "grad_norm": 42.09375, + "learning_rate": 9.995903415534362e-06, + "loss": 17.8684, + "step": 1450 + }, + { + "epoch": 0.026399000092667723, + "grad_norm": 39.0625, + "learning_rate": 9.995875163227703e-06, + "loss": 17.4063, + "step": 1460 + }, + { + "epoch": 0.026579815161795584, + "grad_norm": 37.8125, + "learning_rate": 9.995846910921044e-06, + "loss": 17.4437, + "step": 1470 + }, + { + "epoch": 0.026760630230923444, + "grad_norm": 41.8125, + "learning_rate": 9.995818658614384e-06, + "loss": 17.8306, + "step": 1480 + }, + { + "epoch": 0.026941445300051305, + "grad_norm": 38.875, + "learning_rate": 9.995790406307723e-06, + "loss": 17.4603, + "step": 1490 + }, + { + "epoch": 0.02712226036917917, + "grad_norm": 43.15625, + "learning_rate": 9.995762154001064e-06, + "loss": 17.4142, + "step": 1500 + }, + { + "epoch": 0.02730307543830703, + "grad_norm": 40.59375, + "learning_rate": 9.995733901694404e-06, + "loss": 17.9184, + "step": 1510 + }, + { + "epoch": 0.02748389050743489, + "grad_norm": 40.6875, + "learning_rate": 9.995705649387745e-06, + "loss": 17.6185, + "step": 1520 + }, + { + "epoch": 0.02766470557656275, + "grad_norm": 40.40625, + "learning_rate": 9.995677397081086e-06, + "loss": 17.5548, + "step": 1530 + }, + { + "epoch": 0.02784552064569061, + "grad_norm": 40.5, + "learning_rate": 9.995649144774426e-06, + "loss": 17.4174, + "step": 1540 + }, + { + "epoch": 0.028026335714818475, + "grad_norm": 41.5, + "learning_rate": 9.995620892467767e-06, + "loss": 17.5004, + "step": 1550 + }, + { + "epoch": 0.028207150783946335, + "grad_norm": 40.625, + "learning_rate": 9.995592640161107e-06, + "loss": 17.4335, + "step": 1560 + }, + { + "epoch": 0.028387965853074196, + "grad_norm": 41.59375, + "learning_rate": 9.995564387854448e-06, + "loss": 17.5609, + "step": 1570 + }, + { + "epoch": 0.028568780922202056, + "grad_norm": 41.6875, + "learning_rate": 9.995536135547787e-06, + "loss": 17.2265, + "step": 1580 + }, + { + "epoch": 0.028749595991329917, + "grad_norm": 39.5, + "learning_rate": 9.995507883241128e-06, + "loss": 17.6429, + "step": 1590 + }, + { + "epoch": 0.028930411060457777, + "grad_norm": 37.625, + "learning_rate": 9.995479630934468e-06, + "loss": 17.6421, + "step": 1600 + }, + { + "epoch": 0.02911122612958564, + "grad_norm": 42.34375, + "learning_rate": 9.995451378627809e-06, + "loss": 17.8708, + "step": 1610 + }, + { + "epoch": 0.029292041198713502, + "grad_norm": 38.375, + "learning_rate": 9.99542312632115e-06, + "loss": 17.6356, + "step": 1620 + }, + { + "epoch": 0.029472856267841362, + "grad_norm": 37.03125, + "learning_rate": 9.99539487401449e-06, + "loss": 17.4266, + "step": 1630 + }, + { + "epoch": 0.029653671336969223, + "grad_norm": 38.125, + "learning_rate": 9.99536662170783e-06, + "loss": 17.4123, + "step": 1640 + }, + { + "epoch": 0.029834486406097083, + "grad_norm": 38.3125, + "learning_rate": 9.995338369401171e-06, + "loss": 17.4872, + "step": 1650 + }, + { + "epoch": 0.030015301475224944, + "grad_norm": 40.0, + "learning_rate": 9.99531011709451e-06, + "loss": 17.6213, + "step": 1660 + }, + { + "epoch": 0.030196116544352808, + "grad_norm": 39.46875, + "learning_rate": 9.995281864787851e-06, + "loss": 17.2009, + "step": 1670 + }, + { + "epoch": 0.03037693161348067, + "grad_norm": 42.125, + "learning_rate": 9.995253612481191e-06, + "loss": 17.5288, + "step": 1680 + }, + { + "epoch": 0.03055774668260853, + "grad_norm": 37.625, + "learning_rate": 9.995225360174532e-06, + "loss": 17.842, + "step": 1690 + }, + { + "epoch": 0.03073856175173639, + "grad_norm": 37.90625, + "learning_rate": 9.995197107867873e-06, + "loss": 17.5651, + "step": 1700 + }, + { + "epoch": 0.03091937682086425, + "grad_norm": 40.3125, + "learning_rate": 9.995168855561213e-06, + "loss": 17.3393, + "step": 1710 + }, + { + "epoch": 0.03110019188999211, + "grad_norm": 38.65625, + "learning_rate": 9.995140603254554e-06, + "loss": 17.9935, + "step": 1720 + }, + { + "epoch": 0.03128100695911997, + "grad_norm": 41.78125, + "learning_rate": 9.995112350947895e-06, + "loss": 17.5959, + "step": 1730 + }, + { + "epoch": 0.031461822028247835, + "grad_norm": 40.25, + "learning_rate": 9.995084098641235e-06, + "loss": 17.9083, + "step": 1740 + }, + { + "epoch": 0.03164263709737569, + "grad_norm": 39.9375, + "learning_rate": 9.995055846334574e-06, + "loss": 17.3785, + "step": 1750 + }, + { + "epoch": 0.031823452166503556, + "grad_norm": 40.625, + "learning_rate": 9.995027594027915e-06, + "loss": 17.5619, + "step": 1760 + }, + { + "epoch": 0.03200426723563142, + "grad_norm": 41.5625, + "learning_rate": 9.994999341721255e-06, + "loss": 17.3343, + "step": 1770 + }, + { + "epoch": 0.03218508230475928, + "grad_norm": 40.65625, + "learning_rate": 9.994971089414596e-06, + "loss": 17.5575, + "step": 1780 + }, + { + "epoch": 0.03236589737388714, + "grad_norm": 38.125, + "learning_rate": 9.994942837107937e-06, + "loss": 17.4286, + "step": 1790 + }, + { + "epoch": 0.032546712443015, + "grad_norm": 40.96875, + "learning_rate": 9.994914584801277e-06, + "loss": 17.5324, + "step": 1800 + }, + { + "epoch": 0.03272752751214286, + "grad_norm": 41.71875, + "learning_rate": 9.994886332494618e-06, + "loss": 17.3849, + "step": 1810 + }, + { + "epoch": 0.032908342581270726, + "grad_norm": 41.25, + "learning_rate": 9.994858080187959e-06, + "loss": 17.6226, + "step": 1820 + }, + { + "epoch": 0.03308915765039858, + "grad_norm": 40.59375, + "learning_rate": 9.994829827881299e-06, + "loss": 17.5047, + "step": 1830 + }, + { + "epoch": 0.03326997271952645, + "grad_norm": 41.1875, + "learning_rate": 9.994801575574638e-06, + "loss": 17.7064, + "step": 1840 + }, + { + "epoch": 0.033450787788654304, + "grad_norm": 40.0625, + "learning_rate": 9.994773323267979e-06, + "loss": 17.5485, + "step": 1850 + }, + { + "epoch": 0.03363160285778217, + "grad_norm": 39.65625, + "learning_rate": 9.99474507096132e-06, + "loss": 17.5458, + "step": 1860 + }, + { + "epoch": 0.033812417926910025, + "grad_norm": 39.1875, + "learning_rate": 9.99471681865466e-06, + "loss": 17.452, + "step": 1870 + }, + { + "epoch": 0.03399323299603789, + "grad_norm": 40.6875, + "learning_rate": 9.994688566348e-06, + "loss": 18.0397, + "step": 1880 + }, + { + "epoch": 0.03417404806516575, + "grad_norm": 41.1875, + "learning_rate": 9.994660314041341e-06, + "loss": 17.6931, + "step": 1890 + }, + { + "epoch": 0.03435486313429361, + "grad_norm": 41.28125, + "learning_rate": 9.994632061734682e-06, + "loss": 17.7594, + "step": 1900 + }, + { + "epoch": 0.034535678203421474, + "grad_norm": 41.15625, + "learning_rate": 9.994603809428022e-06, + "loss": 17.4341, + "step": 1910 + }, + { + "epoch": 0.03471649327254933, + "grad_norm": 40.5, + "learning_rate": 9.994575557121361e-06, + "loss": 17.5042, + "step": 1920 + }, + { + "epoch": 0.034897308341677195, + "grad_norm": 40.78125, + "learning_rate": 9.994547304814702e-06, + "loss": 17.4923, + "step": 1930 + }, + { + "epoch": 0.03507812341080506, + "grad_norm": 40.8125, + "learning_rate": 9.994519052508043e-06, + "loss": 17.8844, + "step": 1940 + }, + { + "epoch": 0.035258938479932916, + "grad_norm": 38.875, + "learning_rate": 9.994490800201383e-06, + "loss": 17.2922, + "step": 1950 + }, + { + "epoch": 0.03543975354906078, + "grad_norm": 38.40625, + "learning_rate": 9.994462547894724e-06, + "loss": 17.5523, + "step": 1960 + }, + { + "epoch": 0.03562056861818864, + "grad_norm": 38.71875, + "learning_rate": 9.994434295588063e-06, + "loss": 17.504, + "step": 1970 + }, + { + "epoch": 0.0358013836873165, + "grad_norm": 40.1875, + "learning_rate": 9.994406043281405e-06, + "loss": 17.8244, + "step": 1980 + }, + { + "epoch": 0.035982198756444365, + "grad_norm": 39.78125, + "learning_rate": 9.994377790974746e-06, + "loss": 17.5912, + "step": 1990 + }, + { + "epoch": 0.03616301382557222, + "grad_norm": 37.8125, + "learning_rate": 9.994349538668086e-06, + "loss": 17.4715, + "step": 2000 + }, + { + "epoch": 0.036343828894700086, + "grad_norm": 39.78125, + "learning_rate": 9.994321286361425e-06, + "loss": 17.6859, + "step": 2010 + }, + { + "epoch": 0.036524643963827944, + "grad_norm": 38.96875, + "learning_rate": 9.994293034054766e-06, + "loss": 17.9146, + "step": 2020 + }, + { + "epoch": 0.03670545903295581, + "grad_norm": 40.71875, + "learning_rate": 9.994264781748106e-06, + "loss": 17.4209, + "step": 2030 + }, + { + "epoch": 0.036886274102083665, + "grad_norm": 38.4375, + "learning_rate": 9.994236529441447e-06, + "loss": 17.432, + "step": 2040 + }, + { + "epoch": 0.03706708917121153, + "grad_norm": 40.875, + "learning_rate": 9.994208277134788e-06, + "loss": 17.6924, + "step": 2050 + }, + { + "epoch": 0.03724790424033939, + "grad_norm": 40.34375, + "learning_rate": 9.994180024828128e-06, + "loss": 17.1496, + "step": 2060 + }, + { + "epoch": 0.03742871930946725, + "grad_norm": 38.3125, + "learning_rate": 9.994151772521469e-06, + "loss": 17.0407, + "step": 2070 + }, + { + "epoch": 0.037609534378595114, + "grad_norm": 39.40625, + "learning_rate": 9.99412352021481e-06, + "loss": 18.0761, + "step": 2080 + }, + { + "epoch": 0.03779034944772297, + "grad_norm": 37.71875, + "learning_rate": 9.994095267908149e-06, + "loss": 17.3811, + "step": 2090 + }, + { + "epoch": 0.037971164516850835, + "grad_norm": 41.65625, + "learning_rate": 9.99406701560149e-06, + "loss": 17.4023, + "step": 2100 + }, + { + "epoch": 0.0381519795859787, + "grad_norm": 40.40625, + "learning_rate": 9.99403876329483e-06, + "loss": 17.4827, + "step": 2110 + }, + { + "epoch": 0.038332794655106556, + "grad_norm": 41.96875, + "learning_rate": 9.99401051098817e-06, + "loss": 17.3661, + "step": 2120 + }, + { + "epoch": 0.03851360972423442, + "grad_norm": 39.375, + "learning_rate": 9.993982258681511e-06, + "loss": 17.6976, + "step": 2130 + }, + { + "epoch": 0.03869442479336228, + "grad_norm": 39.09375, + "learning_rate": 9.993954006374852e-06, + "loss": 17.472, + "step": 2140 + }, + { + "epoch": 0.03887523986249014, + "grad_norm": 40.9375, + "learning_rate": 9.993925754068192e-06, + "loss": 17.3894, + "step": 2150 + }, + { + "epoch": 0.039056054931618, + "grad_norm": 38.4375, + "learning_rate": 9.993897501761533e-06, + "loss": 17.4943, + "step": 2160 + }, + { + "epoch": 0.03923687000074586, + "grad_norm": 40.0625, + "learning_rate": 9.993869249454874e-06, + "loss": 17.8319, + "step": 2170 + }, + { + "epoch": 0.039417685069873726, + "grad_norm": 37.40625, + "learning_rate": 9.993840997148212e-06, + "loss": 17.5854, + "step": 2180 + }, + { + "epoch": 0.03959850013900158, + "grad_norm": 41.96875, + "learning_rate": 9.993812744841553e-06, + "loss": 17.6529, + "step": 2190 + }, + { + "epoch": 0.03977931520812945, + "grad_norm": 38.375, + "learning_rate": 9.993784492534894e-06, + "loss": 17.0541, + "step": 2200 + }, + { + "epoch": 0.039960130277257304, + "grad_norm": 40.90625, + "learning_rate": 9.993756240228234e-06, + "loss": 17.2947, + "step": 2210 + }, + { + "epoch": 0.04014094534638517, + "grad_norm": 40.5625, + "learning_rate": 9.993727987921575e-06, + "loss": 17.6078, + "step": 2220 + }, + { + "epoch": 0.04032176041551303, + "grad_norm": 38.375, + "learning_rate": 9.993699735614914e-06, + "loss": 17.3897, + "step": 2230 + }, + { + "epoch": 0.04050257548464089, + "grad_norm": 41.78125, + "learning_rate": 9.993671483308256e-06, + "loss": 17.3831, + "step": 2240 + }, + { + "epoch": 0.04068339055376875, + "grad_norm": 41.6875, + "learning_rate": 9.993643231001597e-06, + "loss": 17.4514, + "step": 2250 + }, + { + "epoch": 0.04086420562289661, + "grad_norm": 43.09375, + "learning_rate": 9.993614978694937e-06, + "loss": 17.9483, + "step": 2260 + }, + { + "epoch": 0.041045020692024474, + "grad_norm": 39.53125, + "learning_rate": 9.993586726388276e-06, + "loss": 17.9368, + "step": 2270 + }, + { + "epoch": 0.04122583576115233, + "grad_norm": 40.15625, + "learning_rate": 9.993558474081617e-06, + "loss": 17.7421, + "step": 2280 + }, + { + "epoch": 0.041406650830280195, + "grad_norm": 42.21875, + "learning_rate": 9.993530221774958e-06, + "loss": 17.6865, + "step": 2290 + }, + { + "epoch": 0.04158746589940806, + "grad_norm": 39.625, + "learning_rate": 9.993501969468298e-06, + "loss": 17.6578, + "step": 2300 + }, + { + "epoch": 0.041768280968535916, + "grad_norm": 40.3125, + "learning_rate": 9.993473717161639e-06, + "loss": 17.5698, + "step": 2310 + }, + { + "epoch": 0.04194909603766378, + "grad_norm": 39.3125, + "learning_rate": 9.993445464854978e-06, + "loss": 17.1494, + "step": 2320 + }, + { + "epoch": 0.04212991110679164, + "grad_norm": 42.96875, + "learning_rate": 9.99341721254832e-06, + "loss": 17.4623, + "step": 2330 + }, + { + "epoch": 0.0423107261759195, + "grad_norm": 40.5, + "learning_rate": 9.99338896024166e-06, + "loss": 17.5555, + "step": 2340 + }, + { + "epoch": 0.042491541245047365, + "grad_norm": 42.125, + "learning_rate": 9.993360707935e-06, + "loss": 17.5949, + "step": 2350 + }, + { + "epoch": 0.04267235631417522, + "grad_norm": 41.34375, + "learning_rate": 9.99333245562834e-06, + "loss": 17.2163, + "step": 2360 + }, + { + "epoch": 0.042853171383303086, + "grad_norm": 41.3125, + "learning_rate": 9.993304203321681e-06, + "loss": 17.95, + "step": 2370 + }, + { + "epoch": 0.04303398645243094, + "grad_norm": 43.40625, + "learning_rate": 9.993275951015021e-06, + "loss": 17.4045, + "step": 2380 + }, + { + "epoch": 0.04321480152155881, + "grad_norm": 37.78125, + "learning_rate": 9.993247698708362e-06, + "loss": 17.2573, + "step": 2390 + }, + { + "epoch": 0.04339561659068667, + "grad_norm": 42.15625, + "learning_rate": 9.993219446401701e-06, + "loss": 17.1407, + "step": 2400 + }, + { + "epoch": 0.04357643165981453, + "grad_norm": 37.84375, + "learning_rate": 9.993191194095043e-06, + "loss": 17.4419, + "step": 2410 + }, + { + "epoch": 0.04375724672894239, + "grad_norm": 39.46875, + "learning_rate": 9.993162941788384e-06, + "loss": 17.9597, + "step": 2420 + }, + { + "epoch": 0.04393806179807025, + "grad_norm": 40.5, + "learning_rate": 9.993134689481725e-06, + "loss": 17.3377, + "step": 2430 + }, + { + "epoch": 0.04411887686719811, + "grad_norm": 37.03125, + "learning_rate": 9.993106437175064e-06, + "loss": 17.517, + "step": 2440 + }, + { + "epoch": 0.04429969193632597, + "grad_norm": 41.6875, + "learning_rate": 9.993078184868404e-06, + "loss": 17.1929, + "step": 2450 + }, + { + "epoch": 0.044480507005453834, + "grad_norm": 38.75, + "learning_rate": 9.993049932561745e-06, + "loss": 17.5776, + "step": 2460 + }, + { + "epoch": 0.0446613220745817, + "grad_norm": 40.28125, + "learning_rate": 9.993021680255085e-06, + "loss": 17.2231, + "step": 2470 + }, + { + "epoch": 0.044842137143709555, + "grad_norm": 42.0625, + "learning_rate": 9.992993427948426e-06, + "loss": 17.4934, + "step": 2480 + }, + { + "epoch": 0.04502295221283742, + "grad_norm": 41.21875, + "learning_rate": 9.992965175641765e-06, + "loss": 17.5628, + "step": 2490 + }, + { + "epoch": 0.045203767281965276, + "grad_norm": 41.59375, + "learning_rate": 9.992936923335107e-06, + "loss": 17.804, + "step": 2500 + }, + { + "epoch": 0.04538458235109314, + "grad_norm": 42.8125, + "learning_rate": 9.992908671028448e-06, + "loss": 17.9538, + "step": 2510 + }, + { + "epoch": 0.045565397420221004, + "grad_norm": 39.09375, + "learning_rate": 9.992880418721787e-06, + "loss": 17.6877, + "step": 2520 + }, + { + "epoch": 0.04574621248934886, + "grad_norm": 39.9375, + "learning_rate": 9.992852166415127e-06, + "loss": 18.0231, + "step": 2530 + }, + { + "epoch": 0.045927027558476725, + "grad_norm": 43.3125, + "learning_rate": 9.992823914108468e-06, + "loss": 17.272, + "step": 2540 + }, + { + "epoch": 0.04610784262760458, + "grad_norm": 40.6875, + "learning_rate": 9.992795661801809e-06, + "loss": 17.1814, + "step": 2550 + }, + { + "epoch": 0.046288657696732446, + "grad_norm": 39.4375, + "learning_rate": 9.99276740949515e-06, + "loss": 17.5537, + "step": 2560 + }, + { + "epoch": 0.046469472765860304, + "grad_norm": 39.75, + "learning_rate": 9.992739157188488e-06, + "loss": 17.979, + "step": 2570 + }, + { + "epoch": 0.04665028783498817, + "grad_norm": 42.15625, + "learning_rate": 9.992710904881829e-06, + "loss": 17.4319, + "step": 2580 + }, + { + "epoch": 0.04683110290411603, + "grad_norm": 38.28125, + "learning_rate": 9.992682652575171e-06, + "loss": 17.3438, + "step": 2590 + }, + { + "epoch": 0.04701191797324389, + "grad_norm": 40.9375, + "learning_rate": 9.992654400268512e-06, + "loss": 17.3572, + "step": 2600 + }, + { + "epoch": 0.04719273304237175, + "grad_norm": 41.59375, + "learning_rate": 9.99262614796185e-06, + "loss": 17.6413, + "step": 2610 + }, + { + "epoch": 0.04737354811149961, + "grad_norm": 40.46875, + "learning_rate": 9.992597895655191e-06, + "loss": 17.2812, + "step": 2620 + }, + { + "epoch": 0.047554363180627474, + "grad_norm": 41.09375, + "learning_rate": 9.992569643348532e-06, + "loss": 17.3716, + "step": 2630 + }, + { + "epoch": 0.04773517824975534, + "grad_norm": 42.53125, + "learning_rate": 9.992541391041873e-06, + "loss": 17.6471, + "step": 2640 + }, + { + "epoch": 0.047915993318883195, + "grad_norm": 39.65625, + "learning_rate": 9.992513138735213e-06, + "loss": 17.5608, + "step": 2650 + }, + { + "epoch": 0.04809680838801106, + "grad_norm": 38.15625, + "learning_rate": 9.992484886428552e-06, + "loss": 17.6671, + "step": 2660 + }, + { + "epoch": 0.048277623457138916, + "grad_norm": 38.21875, + "learning_rate": 9.992456634121893e-06, + "loss": 17.71, + "step": 2670 + }, + { + "epoch": 0.04845843852626678, + "grad_norm": 40.46875, + "learning_rate": 9.992428381815235e-06, + "loss": 17.377, + "step": 2680 + }, + { + "epoch": 0.04863925359539464, + "grad_norm": 40.5625, + "learning_rate": 9.992400129508574e-06, + "loss": 17.4225, + "step": 2690 + }, + { + "epoch": 0.0488200686645225, + "grad_norm": 40.78125, + "learning_rate": 9.992371877201915e-06, + "loss": 17.4072, + "step": 2700 + }, + { + "epoch": 0.049000883733650365, + "grad_norm": 40.4375, + "learning_rate": 9.992343624895255e-06, + "loss": 17.1139, + "step": 2710 + }, + { + "epoch": 0.04918169880277822, + "grad_norm": 38.21875, + "learning_rate": 9.992315372588596e-06, + "loss": 17.3573, + "step": 2720 + }, + { + "epoch": 0.049362513871906086, + "grad_norm": 39.53125, + "learning_rate": 9.992287120281937e-06, + "loss": 17.2752, + "step": 2730 + }, + { + "epoch": 0.04954332894103394, + "grad_norm": 38.6875, + "learning_rate": 9.992258867975277e-06, + "loss": 17.4226, + "step": 2740 + }, + { + "epoch": 0.04972414401016181, + "grad_norm": 40.25, + "learning_rate": 9.992230615668616e-06, + "loss": 17.6635, + "step": 2750 + }, + { + "epoch": 0.04990495907928967, + "grad_norm": 42.0, + "learning_rate": 9.992202363361958e-06, + "loss": 17.8238, + "step": 2760 + }, + { + "epoch": 0.05008577414841753, + "grad_norm": 39.90625, + "learning_rate": 9.992174111055299e-06, + "loss": 17.3426, + "step": 2770 + }, + { + "epoch": 0.05026658921754539, + "grad_norm": 40.28125, + "learning_rate": 9.992145858748638e-06, + "loss": 17.4383, + "step": 2780 + }, + { + "epoch": 0.05044740428667325, + "grad_norm": 40.46875, + "learning_rate": 9.992117606441979e-06, + "loss": 17.505, + "step": 2790 + }, + { + "epoch": 0.05062821935580111, + "grad_norm": 42.5, + "learning_rate": 9.99208935413532e-06, + "loss": 17.3786, + "step": 2800 + }, + { + "epoch": 0.05080903442492898, + "grad_norm": 43.625, + "learning_rate": 9.99206110182866e-06, + "loss": 17.5904, + "step": 2810 + }, + { + "epoch": 0.050989849494056834, + "grad_norm": 39.0625, + "learning_rate": 9.992032849522e-06, + "loss": 17.7643, + "step": 2820 + }, + { + "epoch": 0.0511706645631847, + "grad_norm": 39.875, + "learning_rate": 9.99200459721534e-06, + "loss": 17.3822, + "step": 2830 + }, + { + "epoch": 0.051351479632312555, + "grad_norm": 40.78125, + "learning_rate": 9.99197634490868e-06, + "loss": 17.8304, + "step": 2840 + }, + { + "epoch": 0.05153229470144042, + "grad_norm": 38.1875, + "learning_rate": 9.991948092602022e-06, + "loss": 17.4109, + "step": 2850 + }, + { + "epoch": 0.051713109770568276, + "grad_norm": 43.90625, + "learning_rate": 9.991919840295363e-06, + "loss": 17.5073, + "step": 2860 + }, + { + "epoch": 0.05189392483969614, + "grad_norm": 36.53125, + "learning_rate": 9.991891587988702e-06, + "loss": 17.6826, + "step": 2870 + }, + { + "epoch": 0.052074739908824004, + "grad_norm": 39.5625, + "learning_rate": 9.991863335682042e-06, + "loss": 17.5374, + "step": 2880 + }, + { + "epoch": 0.05225555497795186, + "grad_norm": 42.03125, + "learning_rate": 9.991835083375383e-06, + "loss": 17.7052, + "step": 2890 + }, + { + "epoch": 0.052436370047079725, + "grad_norm": 39.3125, + "learning_rate": 9.991806831068724e-06, + "loss": 17.8823, + "step": 2900 + }, + { + "epoch": 0.05261718511620758, + "grad_norm": 39.125, + "learning_rate": 9.991778578762064e-06, + "loss": 17.2184, + "step": 2910 + }, + { + "epoch": 0.052798000185335446, + "grad_norm": 41.375, + "learning_rate": 9.991750326455403e-06, + "loss": 17.5451, + "step": 2920 + }, + { + "epoch": 0.05297881525446331, + "grad_norm": 41.8125, + "learning_rate": 9.991722074148744e-06, + "loss": 17.6575, + "step": 2930 + }, + { + "epoch": 0.05315963032359117, + "grad_norm": 41.625, + "learning_rate": 9.991693821842086e-06, + "loss": 17.2601, + "step": 2940 + }, + { + "epoch": 0.05334044539271903, + "grad_norm": 39.6875, + "learning_rate": 9.991665569535425e-06, + "loss": 17.8104, + "step": 2950 + }, + { + "epoch": 0.05352126046184689, + "grad_norm": 36.96875, + "learning_rate": 9.991637317228766e-06, + "loss": 17.6773, + "step": 2960 + }, + { + "epoch": 0.05370207553097475, + "grad_norm": 39.15625, + "learning_rate": 9.991609064922106e-06, + "loss": 17.3477, + "step": 2970 + }, + { + "epoch": 0.05388289060010261, + "grad_norm": 41.5, + "learning_rate": 9.991580812615447e-06, + "loss": 17.2803, + "step": 2980 + }, + { + "epoch": 0.05406370566923047, + "grad_norm": 39.28125, + "learning_rate": 9.991552560308788e-06, + "loss": 17.2301, + "step": 2990 + }, + { + "epoch": 0.05424452073835834, + "grad_norm": 37.625, + "learning_rate": 9.991524308002127e-06, + "loss": 17.5987, + "step": 3000 + }, + { + "epoch": 0.054425335807486194, + "grad_norm": 39.34375, + "learning_rate": 9.991496055695467e-06, + "loss": 17.4069, + "step": 3010 + }, + { + "epoch": 0.05460615087661406, + "grad_norm": 43.03125, + "learning_rate": 9.991467803388808e-06, + "loss": 17.0933, + "step": 3020 + }, + { + "epoch": 0.054786965945741915, + "grad_norm": 38.65625, + "learning_rate": 9.99143955108215e-06, + "loss": 17.2305, + "step": 3030 + }, + { + "epoch": 0.05496778101486978, + "grad_norm": 40.4375, + "learning_rate": 9.991411298775489e-06, + "loss": 17.7806, + "step": 3040 + }, + { + "epoch": 0.05514859608399764, + "grad_norm": 39.28125, + "learning_rate": 9.99138304646883e-06, + "loss": 17.7382, + "step": 3050 + }, + { + "epoch": 0.0553294111531255, + "grad_norm": 42.875, + "learning_rate": 9.99135479416217e-06, + "loss": 17.4769, + "step": 3060 + }, + { + "epoch": 0.055510226222253364, + "grad_norm": 40.8125, + "learning_rate": 9.991326541855511e-06, + "loss": 17.1672, + "step": 3070 + }, + { + "epoch": 0.05569104129138122, + "grad_norm": 38.9375, + "learning_rate": 9.991298289548852e-06, + "loss": 17.4499, + "step": 3080 + }, + { + "epoch": 0.055871856360509085, + "grad_norm": 44.28125, + "learning_rate": 9.99127003724219e-06, + "loss": 17.5775, + "step": 3090 + }, + { + "epoch": 0.05605267142963695, + "grad_norm": 41.03125, + "learning_rate": 9.991241784935531e-06, + "loss": 17.6459, + "step": 3100 + }, + { + "epoch": 0.056233486498764806, + "grad_norm": 40.75, + "learning_rate": 9.991213532628873e-06, + "loss": 17.3404, + "step": 3110 + }, + { + "epoch": 0.05641430156789267, + "grad_norm": 41.5, + "learning_rate": 9.991185280322212e-06, + "loss": 17.4062, + "step": 3120 + }, + { + "epoch": 0.05659511663702053, + "grad_norm": 40.59375, + "learning_rate": 9.991157028015553e-06, + "loss": 17.7076, + "step": 3130 + }, + { + "epoch": 0.05677593170614839, + "grad_norm": 38.625, + "learning_rate": 9.991128775708894e-06, + "loss": 17.6773, + "step": 3140 + }, + { + "epoch": 0.05695674677527625, + "grad_norm": 42.0, + "learning_rate": 9.991100523402234e-06, + "loss": 17.3732, + "step": 3150 + }, + { + "epoch": 0.05713756184440411, + "grad_norm": 39.59375, + "learning_rate": 9.991072271095575e-06, + "loss": 17.4142, + "step": 3160 + }, + { + "epoch": 0.057318376913531977, + "grad_norm": 41.375, + "learning_rate": 9.991044018788915e-06, + "loss": 17.4713, + "step": 3170 + }, + { + "epoch": 0.057499191982659834, + "grad_norm": 40.1875, + "learning_rate": 9.991015766482254e-06, + "loss": 17.4469, + "step": 3180 + }, + { + "epoch": 0.0576800070517877, + "grad_norm": 37.5, + "learning_rate": 9.990987514175595e-06, + "loss": 17.2832, + "step": 3190 + }, + { + "epoch": 0.057860822120915555, + "grad_norm": 40.53125, + "learning_rate": 9.990959261868937e-06, + "loss": 17.5141, + "step": 3200 + }, + { + "epoch": 0.05804163719004342, + "grad_norm": 38.5625, + "learning_rate": 9.990931009562276e-06, + "loss": 17.3238, + "step": 3210 + }, + { + "epoch": 0.05822245225917128, + "grad_norm": 40.375, + "learning_rate": 9.990902757255617e-06, + "loss": 17.4824, + "step": 3220 + }, + { + "epoch": 0.05840326732829914, + "grad_norm": 40.34375, + "learning_rate": 9.990874504948957e-06, + "loss": 17.458, + "step": 3230 + }, + { + "epoch": 0.058584082397427004, + "grad_norm": 41.15625, + "learning_rate": 9.990846252642298e-06, + "loss": 17.5533, + "step": 3240 + }, + { + "epoch": 0.05876489746655486, + "grad_norm": 40.9375, + "learning_rate": 9.990818000335639e-06, + "loss": 17.7103, + "step": 3250 + }, + { + "epoch": 0.058945712535682725, + "grad_norm": 40.15625, + "learning_rate": 9.990789748028978e-06, + "loss": 17.4866, + "step": 3260 + }, + { + "epoch": 0.05912652760481058, + "grad_norm": 41.3125, + "learning_rate": 9.990761495722318e-06, + "loss": 17.6451, + "step": 3270 + }, + { + "epoch": 0.059307342673938446, + "grad_norm": 39.9375, + "learning_rate": 9.990733243415659e-06, + "loss": 18.1264, + "step": 3280 + }, + { + "epoch": 0.05948815774306631, + "grad_norm": 40.03125, + "learning_rate": 9.990704991109001e-06, + "loss": 17.2858, + "step": 3290 + }, + { + "epoch": 0.05966897281219417, + "grad_norm": 43.34375, + "learning_rate": 9.99067673880234e-06, + "loss": 17.6059, + "step": 3300 + }, + { + "epoch": 0.05984978788132203, + "grad_norm": 39.65625, + "learning_rate": 9.99064848649568e-06, + "loss": 17.8113, + "step": 3310 + }, + { + "epoch": 0.06003060295044989, + "grad_norm": 42.0, + "learning_rate": 9.990620234189021e-06, + "loss": 17.2323, + "step": 3320 + }, + { + "epoch": 0.06021141801957775, + "grad_norm": 39.40625, + "learning_rate": 9.990591981882362e-06, + "loss": 17.2295, + "step": 3330 + }, + { + "epoch": 0.060392233088705616, + "grad_norm": 40.65625, + "learning_rate": 9.990563729575703e-06, + "loss": 17.2785, + "step": 3340 + }, + { + "epoch": 0.06057304815783347, + "grad_norm": 38.9375, + "learning_rate": 9.990535477269042e-06, + "loss": 17.3906, + "step": 3350 + }, + { + "epoch": 0.06075386322696134, + "grad_norm": 42.09375, + "learning_rate": 9.990507224962382e-06, + "loss": 17.4731, + "step": 3360 + }, + { + "epoch": 0.060934678296089194, + "grad_norm": 39.21875, + "learning_rate": 9.990478972655723e-06, + "loss": 17.5231, + "step": 3370 + }, + { + "epoch": 0.06111549336521706, + "grad_norm": 43.0625, + "learning_rate": 9.990450720349063e-06, + "loss": 17.6285, + "step": 3380 + }, + { + "epoch": 0.061296308434344915, + "grad_norm": 39.1875, + "learning_rate": 9.990422468042404e-06, + "loss": 17.5402, + "step": 3390 + }, + { + "epoch": 0.06147712350347278, + "grad_norm": 41.25, + "learning_rate": 9.990394215735745e-06, + "loss": 17.6009, + "step": 3400 + }, + { + "epoch": 0.06165793857260064, + "grad_norm": 40.9375, + "learning_rate": 9.990365963429085e-06, + "loss": 17.5682, + "step": 3410 + }, + { + "epoch": 0.0618387536417285, + "grad_norm": 41.125, + "learning_rate": 9.990337711122426e-06, + "loss": 17.8649, + "step": 3420 + }, + { + "epoch": 0.062019568710856364, + "grad_norm": 41.28125, + "learning_rate": 9.990309458815765e-06, + "loss": 17.6951, + "step": 3430 + }, + { + "epoch": 0.06220038377998422, + "grad_norm": 38.96875, + "learning_rate": 9.990281206509105e-06, + "loss": 17.3738, + "step": 3440 + }, + { + "epoch": 0.062381198849112085, + "grad_norm": 40.03125, + "learning_rate": 9.990252954202446e-06, + "loss": 17.4536, + "step": 3450 + }, + { + "epoch": 0.06256201391823994, + "grad_norm": 38.5625, + "learning_rate": 9.990224701895788e-06, + "loss": 17.4441, + "step": 3460 + }, + { + "epoch": 0.06274282898736781, + "grad_norm": 41.59375, + "learning_rate": 9.990196449589127e-06, + "loss": 17.284, + "step": 3470 + }, + { + "epoch": 0.06292364405649567, + "grad_norm": 41.25, + "learning_rate": 9.990168197282468e-06, + "loss": 17.529, + "step": 3480 + }, + { + "epoch": 0.06310445912562353, + "grad_norm": 39.25, + "learning_rate": 9.990139944975809e-06, + "loss": 17.4169, + "step": 3490 + }, + { + "epoch": 0.06328527419475138, + "grad_norm": 37.78125, + "learning_rate": 9.99011169266915e-06, + "loss": 17.2194, + "step": 3500 + }, + { + "epoch": 0.06346608926387926, + "grad_norm": 43.84375, + "learning_rate": 9.99008344036249e-06, + "loss": 17.5379, + "step": 3510 + }, + { + "epoch": 0.06364690433300711, + "grad_norm": 40.5, + "learning_rate": 9.990055188055829e-06, + "loss": 17.5937, + "step": 3520 + }, + { + "epoch": 0.06382771940213497, + "grad_norm": 41.125, + "learning_rate": 9.99002693574917e-06, + "loss": 17.8142, + "step": 3530 + }, + { + "epoch": 0.06400853447126284, + "grad_norm": 40.78125, + "learning_rate": 9.98999868344251e-06, + "loss": 17.6586, + "step": 3540 + }, + { + "epoch": 0.0641893495403907, + "grad_norm": 41.03125, + "learning_rate": 9.98997043113585e-06, + "loss": 17.0579, + "step": 3550 + }, + { + "epoch": 0.06437016460951855, + "grad_norm": 37.84375, + "learning_rate": 9.989942178829191e-06, + "loss": 17.0986, + "step": 3560 + }, + { + "epoch": 0.06455097967864643, + "grad_norm": 37.84375, + "learning_rate": 9.989913926522532e-06, + "loss": 17.384, + "step": 3570 + }, + { + "epoch": 0.06473179474777428, + "grad_norm": 41.625, + "learning_rate": 9.989885674215872e-06, + "loss": 17.5132, + "step": 3580 + }, + { + "epoch": 0.06491260981690214, + "grad_norm": 40.5, + "learning_rate": 9.989857421909213e-06, + "loss": 17.5032, + "step": 3590 + }, + { + "epoch": 0.06509342488603, + "grad_norm": 40.96875, + "learning_rate": 9.989829169602554e-06, + "loss": 17.913, + "step": 3600 + }, + { + "epoch": 0.06527423995515787, + "grad_norm": 40.78125, + "learning_rate": 9.989800917295893e-06, + "loss": 17.4051, + "step": 3610 + }, + { + "epoch": 0.06545505502428572, + "grad_norm": 38.3125, + "learning_rate": 9.989772664989233e-06, + "loss": 17.7376, + "step": 3620 + }, + { + "epoch": 0.06563587009341358, + "grad_norm": 39.40625, + "learning_rate": 9.989744412682574e-06, + "loss": 17.9015, + "step": 3630 + }, + { + "epoch": 0.06581668516254145, + "grad_norm": 39.53125, + "learning_rate": 9.989716160375915e-06, + "loss": 17.7884, + "step": 3640 + }, + { + "epoch": 0.06599750023166931, + "grad_norm": 36.15625, + "learning_rate": 9.989687908069255e-06, + "loss": 17.105, + "step": 3650 + }, + { + "epoch": 0.06617831530079717, + "grad_norm": 39.09375, + "learning_rate": 9.989659655762596e-06, + "loss": 17.6172, + "step": 3660 + }, + { + "epoch": 0.06635913036992502, + "grad_norm": 42.15625, + "learning_rate": 9.989631403455936e-06, + "loss": 17.6919, + "step": 3670 + }, + { + "epoch": 0.0665399454390529, + "grad_norm": 38.90625, + "learning_rate": 9.989603151149277e-06, + "loss": 17.4987, + "step": 3680 + }, + { + "epoch": 0.06672076050818075, + "grad_norm": 40.875, + "learning_rate": 9.989574898842616e-06, + "loss": 17.4662, + "step": 3690 + }, + { + "epoch": 0.06690157557730861, + "grad_norm": 40.0625, + "learning_rate": 9.989546646535957e-06, + "loss": 17.3278, + "step": 3700 + }, + { + "epoch": 0.06708239064643648, + "grad_norm": 39.28125, + "learning_rate": 9.989518394229297e-06, + "loss": 17.5842, + "step": 3710 + }, + { + "epoch": 0.06726320571556434, + "grad_norm": 42.03125, + "learning_rate": 9.989490141922638e-06, + "loss": 17.4774, + "step": 3720 + }, + { + "epoch": 0.0674440207846922, + "grad_norm": 37.84375, + "learning_rate": 9.989461889615978e-06, + "loss": 17.6076, + "step": 3730 + }, + { + "epoch": 0.06762483585382005, + "grad_norm": 39.59375, + "learning_rate": 9.989433637309319e-06, + "loss": 17.2923, + "step": 3740 + }, + { + "epoch": 0.06780565092294792, + "grad_norm": 40.625, + "learning_rate": 9.98940538500266e-06, + "loss": 17.6126, + "step": 3750 + }, + { + "epoch": 0.06798646599207578, + "grad_norm": 41.84375, + "learning_rate": 9.989377132696e-06, + "loss": 17.6107, + "step": 3760 + }, + { + "epoch": 0.06816728106120364, + "grad_norm": 40.8125, + "learning_rate": 9.989348880389341e-06, + "loss": 17.3268, + "step": 3770 + }, + { + "epoch": 0.0683480961303315, + "grad_norm": 39.25, + "learning_rate": 9.98932062808268e-06, + "loss": 17.3873, + "step": 3780 + }, + { + "epoch": 0.06852891119945936, + "grad_norm": 41.4375, + "learning_rate": 9.98929237577602e-06, + "loss": 17.6123, + "step": 3790 + }, + { + "epoch": 0.06870972626858722, + "grad_norm": 42.4375, + "learning_rate": 9.989264123469361e-06, + "loss": 17.2932, + "step": 3800 + }, + { + "epoch": 0.06889054133771509, + "grad_norm": 39.71875, + "learning_rate": 9.989235871162702e-06, + "loss": 17.8571, + "step": 3810 + }, + { + "epoch": 0.06907135640684295, + "grad_norm": 38.625, + "learning_rate": 9.989207618856042e-06, + "loss": 17.2305, + "step": 3820 + }, + { + "epoch": 0.0692521714759708, + "grad_norm": 40.53125, + "learning_rate": 9.989179366549383e-06, + "loss": 17.0817, + "step": 3830 + }, + { + "epoch": 0.06943298654509866, + "grad_norm": 41.34375, + "learning_rate": 9.989151114242724e-06, + "loss": 17.5897, + "step": 3840 + }, + { + "epoch": 0.06961380161422653, + "grad_norm": 39.0625, + "learning_rate": 9.989122861936064e-06, + "loss": 17.8866, + "step": 3850 + }, + { + "epoch": 0.06979461668335439, + "grad_norm": 38.96875, + "learning_rate": 9.989094609629403e-06, + "loss": 17.5472, + "step": 3860 + }, + { + "epoch": 0.06997543175248225, + "grad_norm": 41.03125, + "learning_rate": 9.989066357322744e-06, + "loss": 17.7536, + "step": 3870 + }, + { + "epoch": 0.07015624682161012, + "grad_norm": 39.5625, + "learning_rate": 9.989038105016084e-06, + "loss": 17.7344, + "step": 3880 + }, + { + "epoch": 0.07033706189073798, + "grad_norm": 39.15625, + "learning_rate": 9.989009852709425e-06, + "loss": 17.0403, + "step": 3890 + }, + { + "epoch": 0.07051787695986583, + "grad_norm": 40.09375, + "learning_rate": 9.988981600402766e-06, + "loss": 17.368, + "step": 3900 + }, + { + "epoch": 0.07069869202899369, + "grad_norm": 37.21875, + "learning_rate": 9.988953348096106e-06, + "loss": 17.6145, + "step": 3910 + }, + { + "epoch": 0.07087950709812156, + "grad_norm": 39.3125, + "learning_rate": 9.988925095789447e-06, + "loss": 17.3629, + "step": 3920 + }, + { + "epoch": 0.07106032216724942, + "grad_norm": 42.5625, + "learning_rate": 9.988896843482787e-06, + "loss": 17.8597, + "step": 3930 + }, + { + "epoch": 0.07124113723637727, + "grad_norm": 41.15625, + "learning_rate": 9.988868591176128e-06, + "loss": 17.4615, + "step": 3940 + }, + { + "epoch": 0.07142195230550515, + "grad_norm": 42.1875, + "learning_rate": 9.988840338869467e-06, + "loss": 17.6934, + "step": 3950 + }, + { + "epoch": 0.071602767374633, + "grad_norm": 43.25, + "learning_rate": 9.988812086562808e-06, + "loss": 17.637, + "step": 3960 + }, + { + "epoch": 0.07178358244376086, + "grad_norm": 40.65625, + "learning_rate": 9.988783834256148e-06, + "loss": 17.3732, + "step": 3970 + }, + { + "epoch": 0.07196439751288873, + "grad_norm": 41.5625, + "learning_rate": 9.988755581949489e-06, + "loss": 17.4592, + "step": 3980 + }, + { + "epoch": 0.07214521258201659, + "grad_norm": 41.125, + "learning_rate": 9.98872732964283e-06, + "loss": 17.7686, + "step": 3990 + }, + { + "epoch": 0.07232602765114444, + "grad_norm": 41.65625, + "learning_rate": 9.98869907733617e-06, + "loss": 17.8232, + "step": 4000 + }, + { + "epoch": 0.0725068427202723, + "grad_norm": 40.3125, + "learning_rate": 9.98867082502951e-06, + "loss": 17.597, + "step": 4010 + }, + { + "epoch": 0.07268765778940017, + "grad_norm": 40.71875, + "learning_rate": 9.988642572722851e-06, + "loss": 17.6848, + "step": 4020 + }, + { + "epoch": 0.07286847285852803, + "grad_norm": 41.96875, + "learning_rate": 9.988614320416192e-06, + "loss": 17.1169, + "step": 4030 + }, + { + "epoch": 0.07304928792765589, + "grad_norm": 41.0, + "learning_rate": 9.988586068109531e-06, + "loss": 17.301, + "step": 4040 + }, + { + "epoch": 0.07323010299678376, + "grad_norm": 39.8125, + "learning_rate": 9.988557815802872e-06, + "loss": 17.5104, + "step": 4050 + }, + { + "epoch": 0.07341091806591162, + "grad_norm": 40.1875, + "learning_rate": 9.988529563496212e-06, + "loss": 17.302, + "step": 4060 + }, + { + "epoch": 0.07359173313503947, + "grad_norm": 39.625, + "learning_rate": 9.988501311189553e-06, + "loss": 17.4633, + "step": 4070 + }, + { + "epoch": 0.07377254820416733, + "grad_norm": 41.84375, + "learning_rate": 9.988473058882893e-06, + "loss": 17.6813, + "step": 4080 + }, + { + "epoch": 0.0739533632732952, + "grad_norm": 37.4375, + "learning_rate": 9.988444806576234e-06, + "loss": 17.2189, + "step": 4090 + }, + { + "epoch": 0.07413417834242306, + "grad_norm": 39.5, + "learning_rate": 9.988416554269575e-06, + "loss": 17.2021, + "step": 4100 + }, + { + "epoch": 0.07431499341155091, + "grad_norm": 38.96875, + "learning_rate": 9.988388301962915e-06, + "loss": 17.6964, + "step": 4110 + }, + { + "epoch": 0.07449580848067879, + "grad_norm": 39.375, + "learning_rate": 9.988360049656254e-06, + "loss": 17.2008, + "step": 4120 + }, + { + "epoch": 0.07467662354980664, + "grad_norm": 39.0, + "learning_rate": 9.988331797349595e-06, + "loss": 17.7864, + "step": 4130 + }, + { + "epoch": 0.0748574386189345, + "grad_norm": 42.84375, + "learning_rate": 9.988303545042935e-06, + "loss": 17.8637, + "step": 4140 + }, + { + "epoch": 0.07503825368806236, + "grad_norm": 43.34375, + "learning_rate": 9.988275292736276e-06, + "loss": 17.4759, + "step": 4150 + }, + { + "epoch": 0.07521906875719023, + "grad_norm": 41.5, + "learning_rate": 9.988247040429617e-06, + "loss": 17.9535, + "step": 4160 + }, + { + "epoch": 0.07539988382631808, + "grad_norm": 47.34375, + "learning_rate": 9.988218788122957e-06, + "loss": 17.3817, + "step": 4170 + }, + { + "epoch": 0.07558069889544594, + "grad_norm": 40.53125, + "learning_rate": 9.988190535816298e-06, + "loss": 17.4104, + "step": 4180 + }, + { + "epoch": 0.07576151396457381, + "grad_norm": 40.25, + "learning_rate": 9.988162283509639e-06, + "loss": 17.018, + "step": 4190 + }, + { + "epoch": 0.07594232903370167, + "grad_norm": 40.9375, + "learning_rate": 9.98813403120298e-06, + "loss": 17.7528, + "step": 4200 + }, + { + "epoch": 0.07612314410282953, + "grad_norm": 40.125, + "learning_rate": 9.988105778896318e-06, + "loss": 17.5826, + "step": 4210 + }, + { + "epoch": 0.0763039591719574, + "grad_norm": 40.28125, + "learning_rate": 9.988077526589659e-06, + "loss": 17.3862, + "step": 4220 + }, + { + "epoch": 0.07648477424108525, + "grad_norm": 38.6875, + "learning_rate": 9.988049274283e-06, + "loss": 17.879, + "step": 4230 + }, + { + "epoch": 0.07666558931021311, + "grad_norm": 40.53125, + "learning_rate": 9.98802102197634e-06, + "loss": 17.4999, + "step": 4240 + }, + { + "epoch": 0.07684640437934097, + "grad_norm": 42.0, + "learning_rate": 9.98799276966968e-06, + "loss": 17.3519, + "step": 4250 + }, + { + "epoch": 0.07702721944846884, + "grad_norm": 40.78125, + "learning_rate": 9.987964517363021e-06, + "loss": 17.3579, + "step": 4260 + }, + { + "epoch": 0.0772080345175967, + "grad_norm": 42.59375, + "learning_rate": 9.987936265056362e-06, + "loss": 17.225, + "step": 4270 + }, + { + "epoch": 0.07738884958672455, + "grad_norm": 39.5625, + "learning_rate": 9.987908012749702e-06, + "loss": 17.7747, + "step": 4280 + }, + { + "epoch": 0.07756966465585242, + "grad_norm": 41.15625, + "learning_rate": 9.987879760443041e-06, + "loss": 17.5272, + "step": 4290 + }, + { + "epoch": 0.07775047972498028, + "grad_norm": 40.03125, + "learning_rate": 9.987851508136382e-06, + "loss": 17.151, + "step": 4300 + }, + { + "epoch": 0.07793129479410814, + "grad_norm": 39.5625, + "learning_rate": 9.987823255829723e-06, + "loss": 17.4248, + "step": 4310 + }, + { + "epoch": 0.078112109863236, + "grad_norm": 39.09375, + "learning_rate": 9.987795003523063e-06, + "loss": 17.5838, + "step": 4320 + }, + { + "epoch": 0.07829292493236387, + "grad_norm": 40.125, + "learning_rate": 9.987766751216404e-06, + "loss": 17.3888, + "step": 4330 + }, + { + "epoch": 0.07847374000149172, + "grad_norm": 43.3125, + "learning_rate": 9.987738498909745e-06, + "loss": 17.7687, + "step": 4340 + }, + { + "epoch": 0.07865455507061958, + "grad_norm": 43.1875, + "learning_rate": 9.987710246603085e-06, + "loss": 17.5069, + "step": 4350 + }, + { + "epoch": 0.07883537013974745, + "grad_norm": 38.375, + "learning_rate": 9.987681994296426e-06, + "loss": 17.6491, + "step": 4360 + }, + { + "epoch": 0.07901618520887531, + "grad_norm": 39.75, + "learning_rate": 9.987653741989766e-06, + "loss": 17.67, + "step": 4370 + }, + { + "epoch": 0.07919700027800317, + "grad_norm": 41.40625, + "learning_rate": 9.987625489683105e-06, + "loss": 17.4607, + "step": 4380 + }, + { + "epoch": 0.07937781534713104, + "grad_norm": 41.09375, + "learning_rate": 9.987597237376446e-06, + "loss": 17.0653, + "step": 4390 + }, + { + "epoch": 0.0795586304162589, + "grad_norm": 39.4375, + "learning_rate": 9.987568985069787e-06, + "loss": 17.7432, + "step": 4400 + }, + { + "epoch": 0.07973944548538675, + "grad_norm": 40.34375, + "learning_rate": 9.987540732763127e-06, + "loss": 17.5271, + "step": 4410 + }, + { + "epoch": 0.07992026055451461, + "grad_norm": 43.625, + "learning_rate": 9.987512480456468e-06, + "loss": 17.3105, + "step": 4420 + }, + { + "epoch": 0.08010107562364248, + "grad_norm": 37.25, + "learning_rate": 9.987484228149808e-06, + "loss": 17.6018, + "step": 4430 + }, + { + "epoch": 0.08028189069277034, + "grad_norm": 43.4375, + "learning_rate": 9.987455975843149e-06, + "loss": 17.2744, + "step": 4440 + }, + { + "epoch": 0.08046270576189819, + "grad_norm": 37.03125, + "learning_rate": 9.98742772353649e-06, + "loss": 17.3233, + "step": 4450 + }, + { + "epoch": 0.08064352083102606, + "grad_norm": 39.6875, + "learning_rate": 9.98739947122983e-06, + "loss": 17.4399, + "step": 4460 + }, + { + "epoch": 0.08082433590015392, + "grad_norm": 40.5, + "learning_rate": 9.98737121892317e-06, + "loss": 16.9782, + "step": 4470 + }, + { + "epoch": 0.08100515096928178, + "grad_norm": 38.1875, + "learning_rate": 9.98734296661651e-06, + "loss": 16.7517, + "step": 4480 + }, + { + "epoch": 0.08118596603840963, + "grad_norm": 39.96875, + "learning_rate": 9.98731471430985e-06, + "loss": 17.4957, + "step": 4490 + }, + { + "epoch": 0.0813667811075375, + "grad_norm": 41.03125, + "learning_rate": 9.987286462003191e-06, + "loss": 17.537, + "step": 4500 + }, + { + "epoch": 0.08154759617666536, + "grad_norm": 40.09375, + "learning_rate": 9.987258209696532e-06, + "loss": 17.3668, + "step": 4510 + }, + { + "epoch": 0.08172841124579322, + "grad_norm": 41.09375, + "learning_rate": 9.987229957389872e-06, + "loss": 17.4374, + "step": 4520 + }, + { + "epoch": 0.08190922631492109, + "grad_norm": 43.90625, + "learning_rate": 9.987201705083213e-06, + "loss": 17.6475, + "step": 4530 + }, + { + "epoch": 0.08209004138404895, + "grad_norm": 39.0625, + "learning_rate": 9.987173452776554e-06, + "loss": 17.6052, + "step": 4540 + }, + { + "epoch": 0.0822708564531768, + "grad_norm": 36.59375, + "learning_rate": 9.987145200469892e-06, + "loss": 17.2227, + "step": 4550 + }, + { + "epoch": 0.08245167152230466, + "grad_norm": 38.625, + "learning_rate": 9.987116948163233e-06, + "loss": 17.2557, + "step": 4560 + }, + { + "epoch": 0.08263248659143253, + "grad_norm": 39.46875, + "learning_rate": 9.987088695856574e-06, + "loss": 17.3571, + "step": 4570 + }, + { + "epoch": 0.08281330166056039, + "grad_norm": 43.625, + "learning_rate": 9.987060443549914e-06, + "loss": 17.6732, + "step": 4580 + }, + { + "epoch": 0.08299411672968825, + "grad_norm": 38.9375, + "learning_rate": 9.987032191243255e-06, + "loss": 17.7018, + "step": 4590 + }, + { + "epoch": 0.08317493179881612, + "grad_norm": 41.875, + "learning_rate": 9.987003938936596e-06, + "loss": 17.6475, + "step": 4600 + }, + { + "epoch": 0.08335574686794398, + "grad_norm": 40.90625, + "learning_rate": 9.986975686629936e-06, + "loss": 17.6004, + "step": 4610 + }, + { + "epoch": 0.08353656193707183, + "grad_norm": 41.4375, + "learning_rate": 9.986947434323277e-06, + "loss": 17.108, + "step": 4620 + }, + { + "epoch": 0.0837173770061997, + "grad_norm": 39.03125, + "learning_rate": 9.986919182016617e-06, + "loss": 17.4432, + "step": 4630 + }, + { + "epoch": 0.08389819207532756, + "grad_norm": 39.5625, + "learning_rate": 9.986890929709956e-06, + "loss": 17.2823, + "step": 4640 + }, + { + "epoch": 0.08407900714445542, + "grad_norm": 38.40625, + "learning_rate": 9.986862677403297e-06, + "loss": 17.3734, + "step": 4650 + }, + { + "epoch": 0.08425982221358327, + "grad_norm": 41.15625, + "learning_rate": 9.986834425096638e-06, + "loss": 17.538, + "step": 4660 + }, + { + "epoch": 0.08444063728271115, + "grad_norm": 38.46875, + "learning_rate": 9.986806172789978e-06, + "loss": 16.9801, + "step": 4670 + }, + { + "epoch": 0.084621452351839, + "grad_norm": 39.53125, + "learning_rate": 9.986777920483319e-06, + "loss": 17.5482, + "step": 4680 + }, + { + "epoch": 0.08480226742096686, + "grad_norm": 39.46875, + "learning_rate": 9.98674966817666e-06, + "loss": 17.2156, + "step": 4690 + }, + { + "epoch": 0.08498308249009473, + "grad_norm": 37.34375, + "learning_rate": 9.98672141587e-06, + "loss": 17.5817, + "step": 4700 + }, + { + "epoch": 0.08516389755922259, + "grad_norm": 39.0, + "learning_rate": 9.98669316356334e-06, + "loss": 17.4391, + "step": 4710 + }, + { + "epoch": 0.08534471262835044, + "grad_norm": 39.71875, + "learning_rate": 9.98666491125668e-06, + "loss": 17.4249, + "step": 4720 + }, + { + "epoch": 0.0855255276974783, + "grad_norm": 38.25, + "learning_rate": 9.98663665895002e-06, + "loss": 17.2424, + "step": 4730 + }, + { + "epoch": 0.08570634276660617, + "grad_norm": 40.40625, + "learning_rate": 9.986608406643361e-06, + "loss": 17.6555, + "step": 4740 + }, + { + "epoch": 0.08588715783573403, + "grad_norm": 39.625, + "learning_rate": 9.986580154336702e-06, + "loss": 17.662, + "step": 4750 + }, + { + "epoch": 0.08606797290486189, + "grad_norm": 38.4375, + "learning_rate": 9.986551902030042e-06, + "loss": 17.5323, + "step": 4760 + }, + { + "epoch": 0.08624878797398976, + "grad_norm": 39.28125, + "learning_rate": 9.986523649723381e-06, + "loss": 17.3838, + "step": 4770 + }, + { + "epoch": 0.08642960304311761, + "grad_norm": 40.09375, + "learning_rate": 9.986495397416723e-06, + "loss": 17.4733, + "step": 4780 + }, + { + "epoch": 0.08661041811224547, + "grad_norm": 39.1875, + "learning_rate": 9.986467145110064e-06, + "loss": 17.9583, + "step": 4790 + }, + { + "epoch": 0.08679123318137334, + "grad_norm": 40.65625, + "learning_rate": 9.986438892803405e-06, + "loss": 17.6036, + "step": 4800 + }, + { + "epoch": 0.0869720482505012, + "grad_norm": 41.4375, + "learning_rate": 9.986410640496744e-06, + "loss": 17.6934, + "step": 4810 + }, + { + "epoch": 0.08715286331962906, + "grad_norm": 38.84375, + "learning_rate": 9.986382388190084e-06, + "loss": 17.2059, + "step": 4820 + }, + { + "epoch": 0.08733367838875691, + "grad_norm": 37.75, + "learning_rate": 9.986354135883425e-06, + "loss": 17.1567, + "step": 4830 + }, + { + "epoch": 0.08751449345788478, + "grad_norm": 42.1875, + "learning_rate": 9.986325883576765e-06, + "loss": 17.3249, + "step": 4840 + }, + { + "epoch": 0.08769530852701264, + "grad_norm": 38.25, + "learning_rate": 9.986297631270106e-06, + "loss": 17.2945, + "step": 4850 + }, + { + "epoch": 0.0878761235961405, + "grad_norm": 40.0625, + "learning_rate": 9.986269378963447e-06, + "loss": 17.2515, + "step": 4860 + }, + { + "epoch": 0.08805693866526837, + "grad_norm": 38.9375, + "learning_rate": 9.986241126656787e-06, + "loss": 17.6603, + "step": 4870 + }, + { + "epoch": 0.08823775373439623, + "grad_norm": 42.0625, + "learning_rate": 9.986212874350128e-06, + "loss": 17.6273, + "step": 4880 + }, + { + "epoch": 0.08841856880352408, + "grad_norm": 37.9375, + "learning_rate": 9.986184622043467e-06, + "loss": 17.3321, + "step": 4890 + }, + { + "epoch": 0.08859938387265194, + "grad_norm": 38.15625, + "learning_rate": 9.986156369736808e-06, + "loss": 17.5066, + "step": 4900 + }, + { + "epoch": 0.08878019894177981, + "grad_norm": 40.125, + "learning_rate": 9.986128117430148e-06, + "loss": 17.2144, + "step": 4910 + }, + { + "epoch": 0.08896101401090767, + "grad_norm": 39.6875, + "learning_rate": 9.986099865123489e-06, + "loss": 17.3687, + "step": 4920 + }, + { + "epoch": 0.08914182908003553, + "grad_norm": 41.3125, + "learning_rate": 9.98607161281683e-06, + "loss": 17.7729, + "step": 4930 + }, + { + "epoch": 0.0893226441491634, + "grad_norm": 38.0, + "learning_rate": 9.98604336051017e-06, + "loss": 17.2783, + "step": 4940 + }, + { + "epoch": 0.08950345921829125, + "grad_norm": 37.96875, + "learning_rate": 9.98601510820351e-06, + "loss": 17.4247, + "step": 4950 + }, + { + "epoch": 0.08968427428741911, + "grad_norm": 38.84375, + "learning_rate": 9.985986855896851e-06, + "loss": 17.406, + "step": 4960 + }, + { + "epoch": 0.08986508935654697, + "grad_norm": 40.875, + "learning_rate": 9.985958603590192e-06, + "loss": 17.4239, + "step": 4970 + }, + { + "epoch": 0.09004590442567484, + "grad_norm": 38.9375, + "learning_rate": 9.98593035128353e-06, + "loss": 17.6854, + "step": 4980 + }, + { + "epoch": 0.0902267194948027, + "grad_norm": 39.03125, + "learning_rate": 9.985902098976871e-06, + "loss": 17.2287, + "step": 4990 + }, + { + "epoch": 0.09040753456393055, + "grad_norm": 39.625, + "learning_rate": 9.985873846670212e-06, + "loss": 17.9483, + "step": 5000 + }, + { + "epoch": 0.09040753456393055, + "eval_loss": 2.186025381088257, + "eval_runtime": 229.8838, + "eval_samples_per_second": 3158.374, + "eval_steps_per_second": 49.351, + "step": 5000 + }, + { + "epoch": 0.09058834963305842, + "grad_norm": 38.96875, + "learning_rate": 9.985845594363553e-06, + "loss": 17.5609, + "step": 5010 + }, + { + "epoch": 0.09076916470218628, + "grad_norm": 39.28125, + "learning_rate": 9.985817342056893e-06, + "loss": 17.8373, + "step": 5020 + }, + { + "epoch": 0.09094997977131414, + "grad_norm": 39.90625, + "learning_rate": 9.985789089750232e-06, + "loss": 17.0478, + "step": 5030 + }, + { + "epoch": 0.09113079484044201, + "grad_norm": 43.09375, + "learning_rate": 9.985760837443575e-06, + "loss": 17.4107, + "step": 5040 + }, + { + "epoch": 0.09131160990956987, + "grad_norm": 37.3125, + "learning_rate": 9.985732585136915e-06, + "loss": 17.773, + "step": 5050 + }, + { + "epoch": 0.09149242497869772, + "grad_norm": 38.21875, + "learning_rate": 9.985704332830256e-06, + "loss": 16.9747, + "step": 5060 + }, + { + "epoch": 0.09167324004782558, + "grad_norm": 39.78125, + "learning_rate": 9.985676080523595e-06, + "loss": 16.9955, + "step": 5070 + }, + { + "epoch": 0.09185405511695345, + "grad_norm": 41.53125, + "learning_rate": 9.985647828216935e-06, + "loss": 17.2063, + "step": 5080 + }, + { + "epoch": 0.09203487018608131, + "grad_norm": 39.90625, + "learning_rate": 9.985619575910276e-06, + "loss": 17.2967, + "step": 5090 + }, + { + "epoch": 0.09221568525520916, + "grad_norm": 39.34375, + "learning_rate": 9.985591323603617e-06, + "loss": 17.641, + "step": 5100 + }, + { + "epoch": 0.09239650032433704, + "grad_norm": 40.5625, + "learning_rate": 9.985563071296957e-06, + "loss": 17.6372, + "step": 5110 + }, + { + "epoch": 0.09257731539346489, + "grad_norm": 39.96875, + "learning_rate": 9.985534818990296e-06, + "loss": 17.3905, + "step": 5120 + }, + { + "epoch": 0.09275813046259275, + "grad_norm": 38.375, + "learning_rate": 9.985506566683638e-06, + "loss": 17.634, + "step": 5130 + }, + { + "epoch": 0.09293894553172061, + "grad_norm": 39.9375, + "learning_rate": 9.985478314376979e-06, + "loss": 17.8938, + "step": 5140 + }, + { + "epoch": 0.09311976060084848, + "grad_norm": 39.84375, + "learning_rate": 9.985450062070318e-06, + "loss": 17.6147, + "step": 5150 + }, + { + "epoch": 0.09330057566997634, + "grad_norm": 39.0625, + "learning_rate": 9.985421809763659e-06, + "loss": 17.5877, + "step": 5160 + }, + { + "epoch": 0.09348139073910419, + "grad_norm": 38.65625, + "learning_rate": 9.985393557457e-06, + "loss": 17.0614, + "step": 5170 + }, + { + "epoch": 0.09366220580823206, + "grad_norm": 41.8125, + "learning_rate": 9.98536530515034e-06, + "loss": 17.3577, + "step": 5180 + }, + { + "epoch": 0.09384302087735992, + "grad_norm": 42.65625, + "learning_rate": 9.98533705284368e-06, + "loss": 17.6818, + "step": 5190 + }, + { + "epoch": 0.09402383594648778, + "grad_norm": 41.125, + "learning_rate": 9.98530880053702e-06, + "loss": 17.375, + "step": 5200 + }, + { + "epoch": 0.09420465101561565, + "grad_norm": 39.90625, + "learning_rate": 9.985280548230362e-06, + "loss": 17.1533, + "step": 5210 + }, + { + "epoch": 0.0943854660847435, + "grad_norm": 37.84375, + "learning_rate": 9.985252295923702e-06, + "loss": 17.5257, + "step": 5220 + }, + { + "epoch": 0.09456628115387136, + "grad_norm": 41.53125, + "learning_rate": 9.985224043617043e-06, + "loss": 17.6709, + "step": 5230 + }, + { + "epoch": 0.09474709622299922, + "grad_norm": 41.71875, + "learning_rate": 9.985195791310382e-06, + "loss": 17.9804, + "step": 5240 + }, + { + "epoch": 0.09492791129212709, + "grad_norm": 39.6875, + "learning_rate": 9.985167539003723e-06, + "loss": 17.0051, + "step": 5250 + }, + { + "epoch": 0.09510872636125495, + "grad_norm": 39.0, + "learning_rate": 9.985139286697063e-06, + "loss": 17.7626, + "step": 5260 + }, + { + "epoch": 0.0952895414303828, + "grad_norm": 39.0, + "learning_rate": 9.985111034390404e-06, + "loss": 17.0476, + "step": 5270 + }, + { + "epoch": 0.09547035649951068, + "grad_norm": 40.15625, + "learning_rate": 9.985082782083744e-06, + "loss": 17.5495, + "step": 5280 + }, + { + "epoch": 0.09565117156863853, + "grad_norm": 41.78125, + "learning_rate": 9.985054529777083e-06, + "loss": 17.558, + "step": 5290 + }, + { + "epoch": 0.09583198663776639, + "grad_norm": 36.9375, + "learning_rate": 9.985026277470426e-06, + "loss": 17.4099, + "step": 5300 + }, + { + "epoch": 0.09601280170689425, + "grad_norm": 43.0625, + "learning_rate": 9.984998025163766e-06, + "loss": 17.2957, + "step": 5310 + }, + { + "epoch": 0.09619361677602212, + "grad_norm": 39.71875, + "learning_rate": 9.984969772857105e-06, + "loss": 17.5661, + "step": 5320 + }, + { + "epoch": 0.09637443184514997, + "grad_norm": 41.65625, + "learning_rate": 9.984941520550446e-06, + "loss": 17.8033, + "step": 5330 + }, + { + "epoch": 0.09655524691427783, + "grad_norm": 42.1875, + "learning_rate": 9.984913268243786e-06, + "loss": 17.4909, + "step": 5340 + }, + { + "epoch": 0.0967360619834057, + "grad_norm": 38.46875, + "learning_rate": 9.984885015937127e-06, + "loss": 17.7694, + "step": 5350 + }, + { + "epoch": 0.09691687705253356, + "grad_norm": 39.5, + "learning_rate": 9.984856763630468e-06, + "loss": 17.4628, + "step": 5360 + }, + { + "epoch": 0.09709769212166142, + "grad_norm": 40.75, + "learning_rate": 9.984828511323808e-06, + "loss": 17.667, + "step": 5370 + }, + { + "epoch": 0.09727850719078927, + "grad_norm": 40.0625, + "learning_rate": 9.984800259017147e-06, + "loss": 17.372, + "step": 5380 + }, + { + "epoch": 0.09745932225991714, + "grad_norm": 41.0, + "learning_rate": 9.98477200671049e-06, + "loss": 17.2629, + "step": 5390 + }, + { + "epoch": 0.097640137329045, + "grad_norm": 39.5, + "learning_rate": 9.98474375440383e-06, + "loss": 17.4202, + "step": 5400 + }, + { + "epoch": 0.09782095239817286, + "grad_norm": 40.03125, + "learning_rate": 9.984715502097169e-06, + "loss": 17.287, + "step": 5410 + }, + { + "epoch": 0.09800176746730073, + "grad_norm": 43.90625, + "learning_rate": 9.98468724979051e-06, + "loss": 17.6449, + "step": 5420 + }, + { + "epoch": 0.09818258253642859, + "grad_norm": 40.5625, + "learning_rate": 9.98465899748385e-06, + "loss": 16.9249, + "step": 5430 + }, + { + "epoch": 0.09836339760555644, + "grad_norm": 39.375, + "learning_rate": 9.984630745177191e-06, + "loss": 17.5399, + "step": 5440 + }, + { + "epoch": 0.09854421267468431, + "grad_norm": 38.96875, + "learning_rate": 9.984602492870532e-06, + "loss": 17.6102, + "step": 5450 + }, + { + "epoch": 0.09872502774381217, + "grad_norm": 39.1875, + "learning_rate": 9.98457424056387e-06, + "loss": 17.4345, + "step": 5460 + }, + { + "epoch": 0.09890584281294003, + "grad_norm": 41.65625, + "learning_rate": 9.984545988257211e-06, + "loss": 17.5696, + "step": 5470 + }, + { + "epoch": 0.09908665788206789, + "grad_norm": 41.75, + "learning_rate": 9.984517735950553e-06, + "loss": 17.2881, + "step": 5480 + }, + { + "epoch": 0.09926747295119576, + "grad_norm": 39.4375, + "learning_rate": 9.984489483643894e-06, + "loss": 17.1251, + "step": 5490 + }, + { + "epoch": 0.09944828802032361, + "grad_norm": 43.1875, + "learning_rate": 9.984461231337233e-06, + "loss": 17.6069, + "step": 5500 + }, + { + "epoch": 0.09962910308945147, + "grad_norm": 40.8125, + "learning_rate": 9.984432979030574e-06, + "loss": 17.2116, + "step": 5510 + }, + { + "epoch": 0.09980991815857934, + "grad_norm": 42.84375, + "learning_rate": 9.984404726723914e-06, + "loss": 17.4008, + "step": 5520 + }, + { + "epoch": 0.0999907332277072, + "grad_norm": 40.09375, + "learning_rate": 9.984376474417255e-06, + "loss": 17.622, + "step": 5530 + }, + { + "epoch": 0.10017154829683506, + "grad_norm": 41.125, + "learning_rate": 9.984348222110595e-06, + "loss": 17.6973, + "step": 5540 + }, + { + "epoch": 0.10035236336596291, + "grad_norm": 42.34375, + "learning_rate": 9.984319969803934e-06, + "loss": 17.1871, + "step": 5550 + }, + { + "epoch": 0.10053317843509078, + "grad_norm": 40.40625, + "learning_rate": 9.984291717497277e-06, + "loss": 17.496, + "step": 5560 + }, + { + "epoch": 0.10071399350421864, + "grad_norm": 41.71875, + "learning_rate": 9.984263465190617e-06, + "loss": 17.375, + "step": 5570 + }, + { + "epoch": 0.1008948085733465, + "grad_norm": 41.625, + "learning_rate": 9.984235212883956e-06, + "loss": 17.9584, + "step": 5580 + }, + { + "epoch": 0.10107562364247437, + "grad_norm": 39.875, + "learning_rate": 9.984206960577297e-06, + "loss": 17.8407, + "step": 5590 + }, + { + "epoch": 0.10125643871160223, + "grad_norm": 41.84375, + "learning_rate": 9.984178708270638e-06, + "loss": 17.1891, + "step": 5600 + }, + { + "epoch": 0.10143725378073008, + "grad_norm": 40.5, + "learning_rate": 9.984150455963978e-06, + "loss": 17.3538, + "step": 5610 + }, + { + "epoch": 0.10161806884985795, + "grad_norm": 39.09375, + "learning_rate": 9.984122203657319e-06, + "loss": 17.3117, + "step": 5620 + }, + { + "epoch": 0.10179888391898581, + "grad_norm": 37.90625, + "learning_rate": 9.984093951350658e-06, + "loss": 17.2763, + "step": 5630 + }, + { + "epoch": 0.10197969898811367, + "grad_norm": 40.46875, + "learning_rate": 9.984065699043998e-06, + "loss": 17.175, + "step": 5640 + }, + { + "epoch": 0.10216051405724152, + "grad_norm": 37.46875, + "learning_rate": 9.98403744673734e-06, + "loss": 17.5182, + "step": 5650 + }, + { + "epoch": 0.1023413291263694, + "grad_norm": 39.03125, + "learning_rate": 9.984009194430681e-06, + "loss": 17.2687, + "step": 5660 + }, + { + "epoch": 0.10252214419549725, + "grad_norm": 39.875, + "learning_rate": 9.98398094212402e-06, + "loss": 17.5617, + "step": 5670 + }, + { + "epoch": 0.10270295926462511, + "grad_norm": 40.375, + "learning_rate": 9.98395268981736e-06, + "loss": 17.4767, + "step": 5680 + }, + { + "epoch": 0.10288377433375298, + "grad_norm": 40.40625, + "learning_rate": 9.983924437510701e-06, + "loss": 17.7439, + "step": 5690 + }, + { + "epoch": 0.10306458940288084, + "grad_norm": 40.34375, + "learning_rate": 9.983896185204042e-06, + "loss": 17.0064, + "step": 5700 + }, + { + "epoch": 0.1032454044720087, + "grad_norm": 39.90625, + "learning_rate": 9.983867932897383e-06, + "loss": 17.699, + "step": 5710 + }, + { + "epoch": 0.10342621954113655, + "grad_norm": 38.4375, + "learning_rate": 9.983839680590722e-06, + "loss": 17.5337, + "step": 5720 + }, + { + "epoch": 0.10360703461026442, + "grad_norm": 38.375, + "learning_rate": 9.983811428284062e-06, + "loss": 17.6822, + "step": 5730 + }, + { + "epoch": 0.10378784967939228, + "grad_norm": 38.90625, + "learning_rate": 9.983783175977405e-06, + "loss": 17.4954, + "step": 5740 + }, + { + "epoch": 0.10396866474852014, + "grad_norm": 42.28125, + "learning_rate": 9.983754923670743e-06, + "loss": 17.6689, + "step": 5750 + }, + { + "epoch": 0.10414947981764801, + "grad_norm": 40.75, + "learning_rate": 9.983726671364084e-06, + "loss": 17.2288, + "step": 5760 + }, + { + "epoch": 0.10433029488677587, + "grad_norm": 41.59375, + "learning_rate": 9.983698419057425e-06, + "loss": 17.3645, + "step": 5770 + }, + { + "epoch": 0.10451110995590372, + "grad_norm": 40.65625, + "learning_rate": 9.983670166750765e-06, + "loss": 17.2011, + "step": 5780 + }, + { + "epoch": 0.10469192502503159, + "grad_norm": 40.65625, + "learning_rate": 9.983641914444106e-06, + "loss": 17.4304, + "step": 5790 + }, + { + "epoch": 0.10487274009415945, + "grad_norm": 39.40625, + "learning_rate": 9.983613662137447e-06, + "loss": 17.1882, + "step": 5800 + }, + { + "epoch": 0.10505355516328731, + "grad_norm": 40.46875, + "learning_rate": 9.983585409830785e-06, + "loss": 17.7, + "step": 5810 + }, + { + "epoch": 0.10523437023241516, + "grad_norm": 41.25, + "learning_rate": 9.983557157524126e-06, + "loss": 17.4315, + "step": 5820 + }, + { + "epoch": 0.10541518530154304, + "grad_norm": 40.75, + "learning_rate": 9.983528905217468e-06, + "loss": 17.2083, + "step": 5830 + }, + { + "epoch": 0.10559600037067089, + "grad_norm": 39.40625, + "learning_rate": 9.983500652910807e-06, + "loss": 17.4995, + "step": 5840 + }, + { + "epoch": 0.10577681543979875, + "grad_norm": 41.59375, + "learning_rate": 9.983472400604148e-06, + "loss": 17.4644, + "step": 5850 + }, + { + "epoch": 0.10595763050892662, + "grad_norm": 39.875, + "learning_rate": 9.983444148297489e-06, + "loss": 16.9171, + "step": 5860 + }, + { + "epoch": 0.10613844557805448, + "grad_norm": 40.15625, + "learning_rate": 9.98341589599083e-06, + "loss": 17.2418, + "step": 5870 + }, + { + "epoch": 0.10631926064718233, + "grad_norm": 41.6875, + "learning_rate": 9.98338764368417e-06, + "loss": 17.2949, + "step": 5880 + }, + { + "epoch": 0.10650007571631019, + "grad_norm": 42.09375, + "learning_rate": 9.983359391377509e-06, + "loss": 17.4008, + "step": 5890 + }, + { + "epoch": 0.10668089078543806, + "grad_norm": 40.59375, + "learning_rate": 9.98333113907085e-06, + "loss": 17.65, + "step": 5900 + }, + { + "epoch": 0.10686170585456592, + "grad_norm": 38.59375, + "learning_rate": 9.98330288676419e-06, + "loss": 16.9491, + "step": 5910 + }, + { + "epoch": 0.10704252092369378, + "grad_norm": 40.59375, + "learning_rate": 9.983274634457532e-06, + "loss": 17.8632, + "step": 5920 + }, + { + "epoch": 0.10722333599282165, + "grad_norm": 39.875, + "learning_rate": 9.983246382150871e-06, + "loss": 17.4305, + "step": 5930 + }, + { + "epoch": 0.1074041510619495, + "grad_norm": 40.5, + "learning_rate": 9.983218129844212e-06, + "loss": 17.4213, + "step": 5940 + }, + { + "epoch": 0.10758496613107736, + "grad_norm": 39.25, + "learning_rate": 9.983189877537553e-06, + "loss": 17.216, + "step": 5950 + }, + { + "epoch": 0.10776578120020522, + "grad_norm": 38.375, + "learning_rate": 9.983161625230893e-06, + "loss": 17.5143, + "step": 5960 + }, + { + "epoch": 0.10794659626933309, + "grad_norm": 40.9375, + "learning_rate": 9.983133372924234e-06, + "loss": 17.9775, + "step": 5970 + }, + { + "epoch": 0.10812741133846095, + "grad_norm": 40.09375, + "learning_rate": 9.983105120617573e-06, + "loss": 17.2851, + "step": 5980 + }, + { + "epoch": 0.1083082264075888, + "grad_norm": 37.90625, + "learning_rate": 9.983076868310913e-06, + "loss": 17.2725, + "step": 5990 + }, + { + "epoch": 0.10848904147671667, + "grad_norm": 36.4375, + "learning_rate": 9.983048616004256e-06, + "loss": 17.4561, + "step": 6000 + }, + { + "epoch": 0.10866985654584453, + "grad_norm": 39.15625, + "learning_rate": 9.983020363697595e-06, + "loss": 17.251, + "step": 6010 + }, + { + "epoch": 0.10885067161497239, + "grad_norm": 39.34375, + "learning_rate": 9.982992111390935e-06, + "loss": 17.4456, + "step": 6020 + }, + { + "epoch": 0.10903148668410026, + "grad_norm": 41.3125, + "learning_rate": 9.982963859084276e-06, + "loss": 17.8138, + "step": 6030 + }, + { + "epoch": 0.10921230175322812, + "grad_norm": 41.53125, + "learning_rate": 9.982935606777616e-06, + "loss": 17.5189, + "step": 6040 + }, + { + "epoch": 0.10939311682235597, + "grad_norm": 40.59375, + "learning_rate": 9.982907354470957e-06, + "loss": 17.5637, + "step": 6050 + }, + { + "epoch": 0.10957393189148383, + "grad_norm": 42.21875, + "learning_rate": 9.982879102164296e-06, + "loss": 17.297, + "step": 6060 + }, + { + "epoch": 0.1097547469606117, + "grad_norm": 39.03125, + "learning_rate": 9.982850849857637e-06, + "loss": 17.6551, + "step": 6070 + }, + { + "epoch": 0.10993556202973956, + "grad_norm": 37.84375, + "learning_rate": 9.982822597550977e-06, + "loss": 17.2923, + "step": 6080 + }, + { + "epoch": 0.11011637709886742, + "grad_norm": 40.25, + "learning_rate": 9.98279434524432e-06, + "loss": 17.8248, + "step": 6090 + }, + { + "epoch": 0.11029719216799529, + "grad_norm": 40.84375, + "learning_rate": 9.982766092937658e-06, + "loss": 17.666, + "step": 6100 + }, + { + "epoch": 0.11047800723712314, + "grad_norm": 42.9375, + "learning_rate": 9.982737840630999e-06, + "loss": 17.6284, + "step": 6110 + }, + { + "epoch": 0.110658822306251, + "grad_norm": 40.5625, + "learning_rate": 9.98270958832434e-06, + "loss": 16.9809, + "step": 6120 + }, + { + "epoch": 0.11083963737537886, + "grad_norm": 42.5625, + "learning_rate": 9.98268133601768e-06, + "loss": 17.6463, + "step": 6130 + }, + { + "epoch": 0.11102045244450673, + "grad_norm": 41.46875, + "learning_rate": 9.982653083711021e-06, + "loss": 17.5154, + "step": 6140 + }, + { + "epoch": 0.11120126751363459, + "grad_norm": 39.40625, + "learning_rate": 9.98262483140436e-06, + "loss": 17.27, + "step": 6150 + }, + { + "epoch": 0.11138208258276244, + "grad_norm": 39.96875, + "learning_rate": 9.9825965790977e-06, + "loss": 17.2718, + "step": 6160 + }, + { + "epoch": 0.11156289765189031, + "grad_norm": 39.78125, + "learning_rate": 9.982568326791041e-06, + "loss": 17.3054, + "step": 6170 + }, + { + "epoch": 0.11174371272101817, + "grad_norm": 37.53125, + "learning_rate": 9.982540074484382e-06, + "loss": 17.0941, + "step": 6180 + }, + { + "epoch": 0.11192452779014603, + "grad_norm": 40.9375, + "learning_rate": 9.982511822177722e-06, + "loss": 16.9333, + "step": 6190 + }, + { + "epoch": 0.1121053428592739, + "grad_norm": 41.40625, + "learning_rate": 9.982483569871063e-06, + "loss": 17.4357, + "step": 6200 + }, + { + "epoch": 0.11228615792840176, + "grad_norm": 37.4375, + "learning_rate": 9.982455317564404e-06, + "loss": 17.1446, + "step": 6210 + }, + { + "epoch": 0.11246697299752961, + "grad_norm": 40.0, + "learning_rate": 9.982427065257744e-06, + "loss": 17.581, + "step": 6220 + }, + { + "epoch": 0.11264778806665747, + "grad_norm": 40.78125, + "learning_rate": 9.982398812951085e-06, + "loss": 17.6953, + "step": 6230 + }, + { + "epoch": 0.11282860313578534, + "grad_norm": 42.96875, + "learning_rate": 9.982370560644424e-06, + "loss": 17.8289, + "step": 6240 + }, + { + "epoch": 0.1130094182049132, + "grad_norm": 41.375, + "learning_rate": 9.982342308337764e-06, + "loss": 17.8248, + "step": 6250 + }, + { + "epoch": 0.11319023327404105, + "grad_norm": 41.34375, + "learning_rate": 9.982314056031105e-06, + "loss": 17.7385, + "step": 6260 + }, + { + "epoch": 0.11337104834316893, + "grad_norm": 39.9375, + "learning_rate": 9.982285803724446e-06, + "loss": 17.1312, + "step": 6270 + }, + { + "epoch": 0.11355186341229678, + "grad_norm": 37.65625, + "learning_rate": 9.982257551417786e-06, + "loss": 17.5599, + "step": 6280 + }, + { + "epoch": 0.11373267848142464, + "grad_norm": 37.71875, + "learning_rate": 9.982229299111127e-06, + "loss": 17.3697, + "step": 6290 + }, + { + "epoch": 0.1139134935505525, + "grad_norm": 42.46875, + "learning_rate": 9.982201046804468e-06, + "loss": 17.8049, + "step": 6300 + }, + { + "epoch": 0.11409430861968037, + "grad_norm": 40.0625, + "learning_rate": 9.982172794497808e-06, + "loss": 17.5022, + "step": 6310 + }, + { + "epoch": 0.11427512368880823, + "grad_norm": 42.5, + "learning_rate": 9.982144542191147e-06, + "loss": 17.1551, + "step": 6320 + }, + { + "epoch": 0.11445593875793608, + "grad_norm": 39.125, + "learning_rate": 9.982116289884488e-06, + "loss": 17.5444, + "step": 6330 + }, + { + "epoch": 0.11463675382706395, + "grad_norm": 39.8125, + "learning_rate": 9.982088037577828e-06, + "loss": 16.926, + "step": 6340 + }, + { + "epoch": 0.11481756889619181, + "grad_norm": 41.40625, + "learning_rate": 9.98205978527117e-06, + "loss": 17.1707, + "step": 6350 + }, + { + "epoch": 0.11499838396531967, + "grad_norm": 38.71875, + "learning_rate": 9.98203153296451e-06, + "loss": 17.3683, + "step": 6360 + }, + { + "epoch": 0.11517919903444752, + "grad_norm": 41.25, + "learning_rate": 9.98200328065785e-06, + "loss": 17.3285, + "step": 6370 + }, + { + "epoch": 0.1153600141035754, + "grad_norm": 41.75, + "learning_rate": 9.98197502835119e-06, + "loss": 17.3364, + "step": 6380 + }, + { + "epoch": 0.11554082917270325, + "grad_norm": 40.5, + "learning_rate": 9.981946776044531e-06, + "loss": 17.5414, + "step": 6390 + }, + { + "epoch": 0.11572164424183111, + "grad_norm": 40.625, + "learning_rate": 9.981918523737872e-06, + "loss": 17.6574, + "step": 6400 + }, + { + "epoch": 0.11590245931095898, + "grad_norm": 40.75, + "learning_rate": 9.981890271431211e-06, + "loss": 17.3751, + "step": 6410 + }, + { + "epoch": 0.11608327438008684, + "grad_norm": 38.53125, + "learning_rate": 9.981862019124552e-06, + "loss": 16.8837, + "step": 6420 + }, + { + "epoch": 0.1162640894492147, + "grad_norm": 41.625, + "learning_rate": 9.981833766817892e-06, + "loss": 17.3399, + "step": 6430 + }, + { + "epoch": 0.11644490451834257, + "grad_norm": 41.375, + "learning_rate": 9.981805514511233e-06, + "loss": 17.4723, + "step": 6440 + }, + { + "epoch": 0.11662571958747042, + "grad_norm": 38.75, + "learning_rate": 9.981777262204573e-06, + "loss": 17.4737, + "step": 6450 + }, + { + "epoch": 0.11680653465659828, + "grad_norm": 38.96875, + "learning_rate": 9.981749009897914e-06, + "loss": 17.5064, + "step": 6460 + }, + { + "epoch": 0.11698734972572614, + "grad_norm": 37.75, + "learning_rate": 9.981720757591255e-06, + "loss": 17.5465, + "step": 6470 + }, + { + "epoch": 0.11716816479485401, + "grad_norm": 41.84375, + "learning_rate": 9.981692505284595e-06, + "loss": 17.2815, + "step": 6480 + }, + { + "epoch": 0.11734897986398186, + "grad_norm": 41.25, + "learning_rate": 9.981664252977934e-06, + "loss": 17.0824, + "step": 6490 + }, + { + "epoch": 0.11752979493310972, + "grad_norm": 40.78125, + "learning_rate": 9.981636000671275e-06, + "loss": 17.3115, + "step": 6500 + }, + { + "epoch": 0.11771061000223759, + "grad_norm": 39.625, + "learning_rate": 9.981607748364616e-06, + "loss": 17.4792, + "step": 6510 + }, + { + "epoch": 0.11789142507136545, + "grad_norm": 39.09375, + "learning_rate": 9.981579496057956e-06, + "loss": 17.1819, + "step": 6520 + }, + { + "epoch": 0.1180722401404933, + "grad_norm": 37.71875, + "learning_rate": 9.981551243751297e-06, + "loss": 17.3972, + "step": 6530 + }, + { + "epoch": 0.11825305520962116, + "grad_norm": 38.34375, + "learning_rate": 9.981522991444637e-06, + "loss": 17.372, + "step": 6540 + }, + { + "epoch": 0.11843387027874903, + "grad_norm": 39.28125, + "learning_rate": 9.981494739137978e-06, + "loss": 17.452, + "step": 6550 + }, + { + "epoch": 0.11861468534787689, + "grad_norm": 41.28125, + "learning_rate": 9.981466486831319e-06, + "loss": 17.7057, + "step": 6560 + }, + { + "epoch": 0.11879550041700475, + "grad_norm": 40.375, + "learning_rate": 9.98143823452466e-06, + "loss": 17.2223, + "step": 6570 + }, + { + "epoch": 0.11897631548613262, + "grad_norm": 41.78125, + "learning_rate": 9.981409982217998e-06, + "loss": 17.7114, + "step": 6580 + }, + { + "epoch": 0.11915713055526048, + "grad_norm": 38.75, + "learning_rate": 9.981381729911339e-06, + "loss": 17.115, + "step": 6590 + }, + { + "epoch": 0.11933794562438833, + "grad_norm": 38.3125, + "learning_rate": 9.98135347760468e-06, + "loss": 17.3579, + "step": 6600 + }, + { + "epoch": 0.1195187606935162, + "grad_norm": 42.71875, + "learning_rate": 9.98132522529802e-06, + "loss": 17.7693, + "step": 6610 + }, + { + "epoch": 0.11969957576264406, + "grad_norm": 39.25, + "learning_rate": 9.98129697299136e-06, + "loss": 17.7085, + "step": 6620 + }, + { + "epoch": 0.11988039083177192, + "grad_norm": 39.09375, + "learning_rate": 9.981268720684701e-06, + "loss": 17.2631, + "step": 6630 + }, + { + "epoch": 0.12006120590089978, + "grad_norm": 40.78125, + "learning_rate": 9.981240468378042e-06, + "loss": 17.6135, + "step": 6640 + }, + { + "epoch": 0.12024202097002765, + "grad_norm": 38.34375, + "learning_rate": 9.981212216071383e-06, + "loss": 17.5299, + "step": 6650 + }, + { + "epoch": 0.1204228360391555, + "grad_norm": 40.09375, + "learning_rate": 9.981183963764723e-06, + "loss": 17.2463, + "step": 6660 + }, + { + "epoch": 0.12060365110828336, + "grad_norm": 39.75, + "learning_rate": 9.981155711458062e-06, + "loss": 17.5251, + "step": 6670 + }, + { + "epoch": 0.12078446617741123, + "grad_norm": 38.8125, + "learning_rate": 9.981127459151403e-06, + "loss": 17.2288, + "step": 6680 + }, + { + "epoch": 0.12096528124653909, + "grad_norm": 43.625, + "learning_rate": 9.981099206844743e-06, + "loss": 17.6455, + "step": 6690 + }, + { + "epoch": 0.12114609631566695, + "grad_norm": 38.875, + "learning_rate": 9.981070954538084e-06, + "loss": 17.074, + "step": 6700 + }, + { + "epoch": 0.1213269113847948, + "grad_norm": 41.8125, + "learning_rate": 9.981042702231425e-06, + "loss": 17.5405, + "step": 6710 + }, + { + "epoch": 0.12150772645392267, + "grad_norm": 40.6875, + "learning_rate": 9.981014449924765e-06, + "loss": 17.4071, + "step": 6720 + }, + { + "epoch": 0.12168854152305053, + "grad_norm": 39.75, + "learning_rate": 9.980986197618106e-06, + "loss": 17.665, + "step": 6730 + }, + { + "epoch": 0.12186935659217839, + "grad_norm": 39.625, + "learning_rate": 9.980957945311446e-06, + "loss": 17.5599, + "step": 6740 + }, + { + "epoch": 0.12205017166130626, + "grad_norm": 39.5625, + "learning_rate": 9.980929693004785e-06, + "loss": 17.3555, + "step": 6750 + }, + { + "epoch": 0.12223098673043412, + "grad_norm": 38.625, + "learning_rate": 9.980901440698126e-06, + "loss": 17.6027, + "step": 6760 + }, + { + "epoch": 0.12241180179956197, + "grad_norm": 37.125, + "learning_rate": 9.980873188391467e-06, + "loss": 17.5312, + "step": 6770 + }, + { + "epoch": 0.12259261686868983, + "grad_norm": 41.4375, + "learning_rate": 9.980844936084807e-06, + "loss": 17.3242, + "step": 6780 + }, + { + "epoch": 0.1227734319378177, + "grad_norm": 39.375, + "learning_rate": 9.980816683778148e-06, + "loss": 17.6528, + "step": 6790 + }, + { + "epoch": 0.12295424700694556, + "grad_norm": 41.71875, + "learning_rate": 9.980788431471488e-06, + "loss": 17.2223, + "step": 6800 + }, + { + "epoch": 0.12313506207607341, + "grad_norm": 40.0, + "learning_rate": 9.980760179164829e-06, + "loss": 17.4004, + "step": 6810 + }, + { + "epoch": 0.12331587714520129, + "grad_norm": 41.09375, + "learning_rate": 9.98073192685817e-06, + "loss": 17.1266, + "step": 6820 + }, + { + "epoch": 0.12349669221432914, + "grad_norm": 40.125, + "learning_rate": 9.98070367455151e-06, + "loss": 17.7438, + "step": 6830 + }, + { + "epoch": 0.123677507283457, + "grad_norm": 37.96875, + "learning_rate": 9.98067542224485e-06, + "loss": 17.5249, + "step": 6840 + }, + { + "epoch": 0.12385832235258487, + "grad_norm": 41.78125, + "learning_rate": 9.98064716993819e-06, + "loss": 17.5363, + "step": 6850 + }, + { + "epoch": 0.12403913742171273, + "grad_norm": 39.8125, + "learning_rate": 9.98061891763153e-06, + "loss": 17.8214, + "step": 6860 + }, + { + "epoch": 0.12421995249084058, + "grad_norm": 39.53125, + "learning_rate": 9.980590665324871e-06, + "loss": 17.4741, + "step": 6870 + }, + { + "epoch": 0.12440076755996844, + "grad_norm": 41.5625, + "learning_rate": 9.980562413018212e-06, + "loss": 17.3098, + "step": 6880 + }, + { + "epoch": 0.12458158262909631, + "grad_norm": 38.9375, + "learning_rate": 9.980534160711552e-06, + "loss": 17.4718, + "step": 6890 + }, + { + "epoch": 0.12476239769822417, + "grad_norm": 39.40625, + "learning_rate": 9.980505908404893e-06, + "loss": 17.3257, + "step": 6900 + }, + { + "epoch": 0.12494321276735203, + "grad_norm": 40.84375, + "learning_rate": 9.980477656098234e-06, + "loss": 17.6889, + "step": 6910 + }, + { + "epoch": 0.12512402783647988, + "grad_norm": 42.15625, + "learning_rate": 9.980449403791573e-06, + "loss": 17.4467, + "step": 6920 + }, + { + "epoch": 0.12530484290560776, + "grad_norm": 42.0625, + "learning_rate": 9.980421151484913e-06, + "loss": 17.0792, + "step": 6930 + }, + { + "epoch": 0.12548565797473563, + "grad_norm": 40.75, + "learning_rate": 9.980392899178254e-06, + "loss": 17.2743, + "step": 6940 + }, + { + "epoch": 0.12566647304386347, + "grad_norm": 42.0625, + "learning_rate": 9.980364646871594e-06, + "loss": 17.3742, + "step": 6950 + }, + { + "epoch": 0.12584728811299134, + "grad_norm": 40.6875, + "learning_rate": 9.980336394564935e-06, + "loss": 17.4063, + "step": 6960 + }, + { + "epoch": 0.1260281031821192, + "grad_norm": 38.96875, + "learning_rate": 9.980308142258276e-06, + "loss": 17.4938, + "step": 6970 + }, + { + "epoch": 0.12620891825124705, + "grad_norm": 39.71875, + "learning_rate": 9.980279889951616e-06, + "loss": 17.3211, + "step": 6980 + }, + { + "epoch": 0.12638973332037493, + "grad_norm": 39.125, + "learning_rate": 9.980251637644957e-06, + "loss": 17.2774, + "step": 6990 + }, + { + "epoch": 0.12657054838950277, + "grad_norm": 43.125, + "learning_rate": 9.980223385338298e-06, + "loss": 17.5812, + "step": 7000 + }, + { + "epoch": 0.12675136345863064, + "grad_norm": 40.75, + "learning_rate": 9.980195133031636e-06, + "loss": 17.6814, + "step": 7010 + }, + { + "epoch": 0.1269321785277585, + "grad_norm": 36.75, + "learning_rate": 9.980166880724977e-06, + "loss": 17.2395, + "step": 7020 + }, + { + "epoch": 0.12711299359688635, + "grad_norm": 40.15625, + "learning_rate": 9.980138628418318e-06, + "loss": 17.2053, + "step": 7030 + }, + { + "epoch": 0.12729380866601422, + "grad_norm": 41.3125, + "learning_rate": 9.980110376111658e-06, + "loss": 17.4385, + "step": 7040 + }, + { + "epoch": 0.1274746237351421, + "grad_norm": 42.375, + "learning_rate": 9.980082123804999e-06, + "loss": 17.9981, + "step": 7050 + }, + { + "epoch": 0.12765543880426994, + "grad_norm": 41.9375, + "learning_rate": 9.98005387149834e-06, + "loss": 17.6429, + "step": 7060 + }, + { + "epoch": 0.1278362538733978, + "grad_norm": 42.5625, + "learning_rate": 9.98002561919168e-06, + "loss": 17.5678, + "step": 7070 + }, + { + "epoch": 0.12801706894252568, + "grad_norm": 39.5625, + "learning_rate": 9.97999736688502e-06, + "loss": 17.5928, + "step": 7080 + }, + { + "epoch": 0.12819788401165352, + "grad_norm": 42.46875, + "learning_rate": 9.979969114578361e-06, + "loss": 17.4203, + "step": 7090 + }, + { + "epoch": 0.1283786990807814, + "grad_norm": 43.09375, + "learning_rate": 9.9799408622717e-06, + "loss": 17.4189, + "step": 7100 + }, + { + "epoch": 0.12855951414990927, + "grad_norm": 40.625, + "learning_rate": 9.979912609965041e-06, + "loss": 17.4796, + "step": 7110 + }, + { + "epoch": 0.1287403292190371, + "grad_norm": 38.875, + "learning_rate": 9.979884357658382e-06, + "loss": 17.6867, + "step": 7120 + }, + { + "epoch": 0.12892114428816498, + "grad_norm": 39.90625, + "learning_rate": 9.979856105351722e-06, + "loss": 17.7465, + "step": 7130 + }, + { + "epoch": 0.12910195935729285, + "grad_norm": 39.125, + "learning_rate": 9.979827853045063e-06, + "loss": 17.5646, + "step": 7140 + }, + { + "epoch": 0.1292827744264207, + "grad_norm": 38.46875, + "learning_rate": 9.979799600738403e-06, + "loss": 17.3168, + "step": 7150 + }, + { + "epoch": 0.12946358949554856, + "grad_norm": 41.0, + "learning_rate": 9.979771348431744e-06, + "loss": 17.292, + "step": 7160 + }, + { + "epoch": 0.1296444045646764, + "grad_norm": 42.75, + "learning_rate": 9.979743096125085e-06, + "loss": 17.3749, + "step": 7170 + }, + { + "epoch": 0.12982521963380428, + "grad_norm": 39.96875, + "learning_rate": 9.979714843818424e-06, + "loss": 17.1644, + "step": 7180 + }, + { + "epoch": 0.13000603470293215, + "grad_norm": 43.15625, + "learning_rate": 9.979686591511764e-06, + "loss": 17.7358, + "step": 7190 + }, + { + "epoch": 0.13018684977206, + "grad_norm": 39.59375, + "learning_rate": 9.979658339205105e-06, + "loss": 17.423, + "step": 7200 + }, + { + "epoch": 0.13036766484118786, + "grad_norm": 39.46875, + "learning_rate": 9.979630086898446e-06, + "loss": 17.3277, + "step": 7210 + }, + { + "epoch": 0.13054847991031573, + "grad_norm": 40.9375, + "learning_rate": 9.979601834591786e-06, + "loss": 17.3163, + "step": 7220 + }, + { + "epoch": 0.13072929497944358, + "grad_norm": 40.03125, + "learning_rate": 9.979573582285127e-06, + "loss": 17.2494, + "step": 7230 + }, + { + "epoch": 0.13091011004857145, + "grad_norm": 42.1875, + "learning_rate": 9.979545329978467e-06, + "loss": 17.5062, + "step": 7240 + }, + { + "epoch": 0.13109092511769932, + "grad_norm": 40.21875, + "learning_rate": 9.979517077671808e-06, + "loss": 17.5579, + "step": 7250 + }, + { + "epoch": 0.13127174018682716, + "grad_norm": 40.25, + "learning_rate": 9.979488825365149e-06, + "loss": 17.6391, + "step": 7260 + }, + { + "epoch": 0.13145255525595503, + "grad_norm": 39.78125, + "learning_rate": 9.979460573058488e-06, + "loss": 17.387, + "step": 7270 + }, + { + "epoch": 0.1316333703250829, + "grad_norm": 41.9375, + "learning_rate": 9.979432320751828e-06, + "loss": 17.5774, + "step": 7280 + }, + { + "epoch": 0.13181418539421075, + "grad_norm": 41.125, + "learning_rate": 9.979404068445169e-06, + "loss": 17.1301, + "step": 7290 + }, + { + "epoch": 0.13199500046333862, + "grad_norm": 41.15625, + "learning_rate": 9.97937581613851e-06, + "loss": 17.3479, + "step": 7300 + }, + { + "epoch": 0.1321758155324665, + "grad_norm": 39.1875, + "learning_rate": 9.97934756383185e-06, + "loss": 17.3513, + "step": 7310 + }, + { + "epoch": 0.13235663060159433, + "grad_norm": 41.75, + "learning_rate": 9.97931931152519e-06, + "loss": 16.835, + "step": 7320 + }, + { + "epoch": 0.1325374456707222, + "grad_norm": 40.0625, + "learning_rate": 9.979291059218531e-06, + "loss": 17.5654, + "step": 7330 + }, + { + "epoch": 0.13271826073985005, + "grad_norm": 39.6875, + "learning_rate": 9.979262806911872e-06, + "loss": 17.58, + "step": 7340 + }, + { + "epoch": 0.13289907580897792, + "grad_norm": 40.90625, + "learning_rate": 9.97923455460521e-06, + "loss": 17.6938, + "step": 7350 + }, + { + "epoch": 0.1330798908781058, + "grad_norm": 39.90625, + "learning_rate": 9.979206302298551e-06, + "loss": 16.8821, + "step": 7360 + }, + { + "epoch": 0.13326070594723363, + "grad_norm": 39.15625, + "learning_rate": 9.979178049991892e-06, + "loss": 17.4715, + "step": 7370 + }, + { + "epoch": 0.1334415210163615, + "grad_norm": 41.09375, + "learning_rate": 9.979149797685233e-06, + "loss": 17.1391, + "step": 7380 + }, + { + "epoch": 0.13362233608548937, + "grad_norm": 38.96875, + "learning_rate": 9.979121545378573e-06, + "loss": 17.4993, + "step": 7390 + }, + { + "epoch": 0.13380315115461722, + "grad_norm": 40.90625, + "learning_rate": 9.979093293071914e-06, + "loss": 17.5661, + "step": 7400 + }, + { + "epoch": 0.1339839662237451, + "grad_norm": 40.09375, + "learning_rate": 9.979065040765255e-06, + "loss": 17.7374, + "step": 7410 + }, + { + "epoch": 0.13416478129287296, + "grad_norm": 42.15625, + "learning_rate": 9.979036788458595e-06, + "loss": 17.6661, + "step": 7420 + }, + { + "epoch": 0.1343455963620008, + "grad_norm": 38.03125, + "learning_rate": 9.979008536151936e-06, + "loss": 16.8962, + "step": 7430 + }, + { + "epoch": 0.13452641143112867, + "grad_norm": 41.15625, + "learning_rate": 9.978980283845275e-06, + "loss": 17.0925, + "step": 7440 + }, + { + "epoch": 0.13470722650025654, + "grad_norm": 38.03125, + "learning_rate": 9.978952031538615e-06, + "loss": 17.3493, + "step": 7450 + }, + { + "epoch": 0.1348880415693844, + "grad_norm": 40.75, + "learning_rate": 9.978923779231956e-06, + "loss": 17.4892, + "step": 7460 + }, + { + "epoch": 0.13506885663851226, + "grad_norm": 40.3125, + "learning_rate": 9.978895526925297e-06, + "loss": 17.3092, + "step": 7470 + }, + { + "epoch": 0.1352496717076401, + "grad_norm": 41.5, + "learning_rate": 9.978867274618637e-06, + "loss": 17.4271, + "step": 7480 + }, + { + "epoch": 0.13543048677676797, + "grad_norm": 38.15625, + "learning_rate": 9.978839022311978e-06, + "loss": 17.9422, + "step": 7490 + }, + { + "epoch": 0.13561130184589584, + "grad_norm": 39.90625, + "learning_rate": 9.978810770005318e-06, + "loss": 17.0924, + "step": 7500 + }, + { + "epoch": 0.1357921169150237, + "grad_norm": 42.84375, + "learning_rate": 9.978782517698659e-06, + "loss": 17.3601, + "step": 7510 + }, + { + "epoch": 0.13597293198415156, + "grad_norm": 39.3125, + "learning_rate": 9.978754265391998e-06, + "loss": 17.6276, + "step": 7520 + }, + { + "epoch": 0.13615374705327943, + "grad_norm": 39.46875, + "learning_rate": 9.978726013085339e-06, + "loss": 17.1647, + "step": 7530 + }, + { + "epoch": 0.13633456212240727, + "grad_norm": 40.625, + "learning_rate": 9.97869776077868e-06, + "loss": 17.5063, + "step": 7540 + }, + { + "epoch": 0.13651537719153514, + "grad_norm": 40.0, + "learning_rate": 9.97866950847202e-06, + "loss": 17.4359, + "step": 7550 + }, + { + "epoch": 0.136696192260663, + "grad_norm": 40.28125, + "learning_rate": 9.97864125616536e-06, + "loss": 17.7257, + "step": 7560 + }, + { + "epoch": 0.13687700732979086, + "grad_norm": 41.8125, + "learning_rate": 9.978613003858701e-06, + "loss": 17.6415, + "step": 7570 + }, + { + "epoch": 0.13705782239891873, + "grad_norm": 41.25, + "learning_rate": 9.978584751552042e-06, + "loss": 17.0992, + "step": 7580 + }, + { + "epoch": 0.1372386374680466, + "grad_norm": 38.84375, + "learning_rate": 9.978556499245382e-06, + "loss": 17.3908, + "step": 7590 + }, + { + "epoch": 0.13741945253717444, + "grad_norm": 39.09375, + "learning_rate": 9.978528246938723e-06, + "loss": 17.303, + "step": 7600 + }, + { + "epoch": 0.1376002676063023, + "grad_norm": 39.40625, + "learning_rate": 9.978499994632062e-06, + "loss": 17.3098, + "step": 7610 + }, + { + "epoch": 0.13778108267543018, + "grad_norm": 39.28125, + "learning_rate": 9.978471742325403e-06, + "loss": 17.2351, + "step": 7620 + }, + { + "epoch": 0.13796189774455803, + "grad_norm": 39.8125, + "learning_rate": 9.978443490018743e-06, + "loss": 17.6227, + "step": 7630 + }, + { + "epoch": 0.1381427128136859, + "grad_norm": 41.125, + "learning_rate": 9.978415237712084e-06, + "loss": 17.4154, + "step": 7640 + }, + { + "epoch": 0.13832352788281374, + "grad_norm": 41.15625, + "learning_rate": 9.978386985405424e-06, + "loss": 17.1675, + "step": 7650 + }, + { + "epoch": 0.1385043429519416, + "grad_norm": 39.875, + "learning_rate": 9.978358733098763e-06, + "loss": 17.3523, + "step": 7660 + }, + { + "epoch": 0.13868515802106948, + "grad_norm": 40.625, + "learning_rate": 9.978330480792106e-06, + "loss": 17.9028, + "step": 7670 + }, + { + "epoch": 0.13886597309019733, + "grad_norm": 40.1875, + "learning_rate": 9.978302228485446e-06, + "loss": 17.5639, + "step": 7680 + }, + { + "epoch": 0.1390467881593252, + "grad_norm": 39.40625, + "learning_rate": 9.978273976178787e-06, + "loss": 17.2738, + "step": 7690 + }, + { + "epoch": 0.13922760322845307, + "grad_norm": 40.1875, + "learning_rate": 9.978245723872126e-06, + "loss": 17.1962, + "step": 7700 + }, + { + "epoch": 0.1394084182975809, + "grad_norm": 39.21875, + "learning_rate": 9.978217471565466e-06, + "loss": 17.376, + "step": 7710 + }, + { + "epoch": 0.13958923336670878, + "grad_norm": 37.5, + "learning_rate": 9.978189219258807e-06, + "loss": 17.0739, + "step": 7720 + }, + { + "epoch": 0.13977004843583665, + "grad_norm": 38.5, + "learning_rate": 9.978160966952148e-06, + "loss": 16.9097, + "step": 7730 + }, + { + "epoch": 0.1399508635049645, + "grad_norm": 41.0625, + "learning_rate": 9.978132714645488e-06, + "loss": 17.4405, + "step": 7740 + }, + { + "epoch": 0.14013167857409237, + "grad_norm": 43.0625, + "learning_rate": 9.978104462338829e-06, + "loss": 17.3024, + "step": 7750 + }, + { + "epoch": 0.14031249364322024, + "grad_norm": 39.5625, + "learning_rate": 9.97807621003217e-06, + "loss": 17.0304, + "step": 7760 + }, + { + "epoch": 0.14049330871234808, + "grad_norm": 41.3125, + "learning_rate": 9.97804795772551e-06, + "loss": 17.3244, + "step": 7770 + }, + { + "epoch": 0.14067412378147595, + "grad_norm": 39.9375, + "learning_rate": 9.978019705418849e-06, + "loss": 17.3528, + "step": 7780 + }, + { + "epoch": 0.14085493885060382, + "grad_norm": 39.40625, + "learning_rate": 9.97799145311219e-06, + "loss": 17.2244, + "step": 7790 + }, + { + "epoch": 0.14103575391973167, + "grad_norm": 38.0625, + "learning_rate": 9.97796320080553e-06, + "loss": 16.99, + "step": 7800 + }, + { + "epoch": 0.14121656898885954, + "grad_norm": 41.0625, + "learning_rate": 9.977934948498871e-06, + "loss": 17.0707, + "step": 7810 + }, + { + "epoch": 0.14139738405798738, + "grad_norm": 40.28125, + "learning_rate": 9.977906696192212e-06, + "loss": 17.4142, + "step": 7820 + }, + { + "epoch": 0.14157819912711525, + "grad_norm": 41.125, + "learning_rate": 9.97787844388555e-06, + "loss": 17.4165, + "step": 7830 + }, + { + "epoch": 0.14175901419624312, + "grad_norm": 41.78125, + "learning_rate": 9.977850191578893e-06, + "loss": 17.5218, + "step": 7840 + }, + { + "epoch": 0.14193982926537096, + "grad_norm": 38.90625, + "learning_rate": 9.977821939272233e-06, + "loss": 17.1668, + "step": 7850 + }, + { + "epoch": 0.14212064433449884, + "grad_norm": 37.46875, + "learning_rate": 9.977793686965574e-06, + "loss": 16.9904, + "step": 7860 + }, + { + "epoch": 0.1423014594036267, + "grad_norm": 37.21875, + "learning_rate": 9.977765434658913e-06, + "loss": 17.0856, + "step": 7870 + }, + { + "epoch": 0.14248227447275455, + "grad_norm": 36.71875, + "learning_rate": 9.977737182352254e-06, + "loss": 17.1422, + "step": 7880 + }, + { + "epoch": 0.14266308954188242, + "grad_norm": 42.21875, + "learning_rate": 9.977708930045594e-06, + "loss": 17.0785, + "step": 7890 + }, + { + "epoch": 0.1428439046110103, + "grad_norm": 40.71875, + "learning_rate": 9.977680677738935e-06, + "loss": 17.4634, + "step": 7900 + }, + { + "epoch": 0.14302471968013813, + "grad_norm": 42.5625, + "learning_rate": 9.977652425432276e-06, + "loss": 17.3942, + "step": 7910 + }, + { + "epoch": 0.143205534749266, + "grad_norm": 43.75, + "learning_rate": 9.977624173125614e-06, + "loss": 17.7189, + "step": 7920 + }, + { + "epoch": 0.14338634981839388, + "grad_norm": 41.53125, + "learning_rate": 9.977595920818957e-06, + "loss": 17.412, + "step": 7930 + }, + { + "epoch": 0.14356716488752172, + "grad_norm": 42.875, + "learning_rate": 9.977567668512297e-06, + "loss": 17.591, + "step": 7940 + }, + { + "epoch": 0.1437479799566496, + "grad_norm": 39.125, + "learning_rate": 9.977539416205636e-06, + "loss": 17.5684, + "step": 7950 + }, + { + "epoch": 0.14392879502577746, + "grad_norm": 40.40625, + "learning_rate": 9.977511163898977e-06, + "loss": 17.3622, + "step": 7960 + }, + { + "epoch": 0.1441096100949053, + "grad_norm": 40.53125, + "learning_rate": 9.977482911592318e-06, + "loss": 17.1179, + "step": 7970 + }, + { + "epoch": 0.14429042516403318, + "grad_norm": 39.71875, + "learning_rate": 9.977454659285658e-06, + "loss": 17.5406, + "step": 7980 + }, + { + "epoch": 0.14447124023316102, + "grad_norm": 42.40625, + "learning_rate": 9.977426406978999e-06, + "loss": 17.0921, + "step": 7990 + }, + { + "epoch": 0.1446520553022889, + "grad_norm": 42.5625, + "learning_rate": 9.97739815467234e-06, + "loss": 17.3135, + "step": 8000 + }, + { + "epoch": 0.14483287037141676, + "grad_norm": 40.5625, + "learning_rate": 9.977369902365678e-06, + "loss": 17.4295, + "step": 8010 + }, + { + "epoch": 0.1450136854405446, + "grad_norm": 37.84375, + "learning_rate": 9.97734165005902e-06, + "loss": 17.4965, + "step": 8020 + }, + { + "epoch": 0.14519450050967248, + "grad_norm": 39.5, + "learning_rate": 9.977313397752361e-06, + "loss": 17.6603, + "step": 8030 + }, + { + "epoch": 0.14537531557880035, + "grad_norm": 40.25, + "learning_rate": 9.9772851454457e-06, + "loss": 16.9875, + "step": 8040 + }, + { + "epoch": 0.1455561306479282, + "grad_norm": 42.625, + "learning_rate": 9.977256893139041e-06, + "loss": 17.1946, + "step": 8050 + }, + { + "epoch": 0.14573694571705606, + "grad_norm": 40.46875, + "learning_rate": 9.977228640832381e-06, + "loss": 17.4784, + "step": 8060 + }, + { + "epoch": 0.14591776078618393, + "grad_norm": 38.75, + "learning_rate": 9.977200388525722e-06, + "loss": 17.3109, + "step": 8070 + }, + { + "epoch": 0.14609857585531177, + "grad_norm": 44.78125, + "learning_rate": 9.977172136219063e-06, + "loss": 17.7436, + "step": 8080 + }, + { + "epoch": 0.14627939092443965, + "grad_norm": 42.53125, + "learning_rate": 9.977143883912402e-06, + "loss": 17.1768, + "step": 8090 + }, + { + "epoch": 0.14646020599356752, + "grad_norm": 40.125, + "learning_rate": 9.977115631605744e-06, + "loss": 17.3665, + "step": 8100 + }, + { + "epoch": 0.14664102106269536, + "grad_norm": 38.21875, + "learning_rate": 9.977087379299085e-06, + "loss": 17.5109, + "step": 8110 + }, + { + "epoch": 0.14682183613182323, + "grad_norm": 39.15625, + "learning_rate": 9.977059126992425e-06, + "loss": 17.9137, + "step": 8120 + }, + { + "epoch": 0.1470026512009511, + "grad_norm": 43.375, + "learning_rate": 9.977030874685764e-06, + "loss": 17.3533, + "step": 8130 + }, + { + "epoch": 0.14718346627007894, + "grad_norm": 43.71875, + "learning_rate": 9.977002622379105e-06, + "loss": 17.4239, + "step": 8140 + }, + { + "epoch": 0.14736428133920682, + "grad_norm": 39.4375, + "learning_rate": 9.976974370072445e-06, + "loss": 17.8833, + "step": 8150 + }, + { + "epoch": 0.14754509640833466, + "grad_norm": 42.09375, + "learning_rate": 9.976946117765786e-06, + "loss": 17.5879, + "step": 8160 + }, + { + "epoch": 0.14772591147746253, + "grad_norm": 40.875, + "learning_rate": 9.976917865459127e-06, + "loss": 17.5351, + "step": 8170 + }, + { + "epoch": 0.1479067265465904, + "grad_norm": 38.78125, + "learning_rate": 9.976889613152466e-06, + "loss": 17.1707, + "step": 8180 + }, + { + "epoch": 0.14808754161571824, + "grad_norm": 38.96875, + "learning_rate": 9.976861360845808e-06, + "loss": 17.5665, + "step": 8190 + }, + { + "epoch": 0.14826835668484611, + "grad_norm": 41.84375, + "learning_rate": 9.976833108539148e-06, + "loss": 17.9471, + "step": 8200 + }, + { + "epoch": 0.14844917175397399, + "grad_norm": 42.0625, + "learning_rate": 9.976804856232487e-06, + "loss": 17.341, + "step": 8210 + }, + { + "epoch": 0.14862998682310183, + "grad_norm": 41.71875, + "learning_rate": 9.976776603925828e-06, + "loss": 17.2915, + "step": 8220 + }, + { + "epoch": 0.1488108018922297, + "grad_norm": 40.15625, + "learning_rate": 9.976748351619169e-06, + "loss": 17.2771, + "step": 8230 + }, + { + "epoch": 0.14899161696135757, + "grad_norm": 40.4375, + "learning_rate": 9.97672009931251e-06, + "loss": 17.4774, + "step": 8240 + }, + { + "epoch": 0.1491724320304854, + "grad_norm": 38.84375, + "learning_rate": 9.97669184700585e-06, + "loss": 17.43, + "step": 8250 + }, + { + "epoch": 0.14935324709961328, + "grad_norm": 39.875, + "learning_rate": 9.976663594699189e-06, + "loss": 17.3579, + "step": 8260 + }, + { + "epoch": 0.14953406216874116, + "grad_norm": 39.125, + "learning_rate": 9.97663534239253e-06, + "loss": 17.5017, + "step": 8270 + }, + { + "epoch": 0.149714877237869, + "grad_norm": 39.9375, + "learning_rate": 9.976607090085872e-06, + "loss": 17.2222, + "step": 8280 + }, + { + "epoch": 0.14989569230699687, + "grad_norm": 38.375, + "learning_rate": 9.976578837779212e-06, + "loss": 17.222, + "step": 8290 + }, + { + "epoch": 0.1500765073761247, + "grad_norm": 40.15625, + "learning_rate": 9.976550585472551e-06, + "loss": 17.5821, + "step": 8300 + }, + { + "epoch": 0.15025732244525258, + "grad_norm": 39.03125, + "learning_rate": 9.976522333165892e-06, + "loss": 17.1667, + "step": 8310 + }, + { + "epoch": 0.15043813751438045, + "grad_norm": 41.3125, + "learning_rate": 9.976494080859233e-06, + "loss": 17.5639, + "step": 8320 + }, + { + "epoch": 0.1506189525835083, + "grad_norm": 40.84375, + "learning_rate": 9.976465828552573e-06, + "loss": 17.3709, + "step": 8330 + }, + { + "epoch": 0.15079976765263617, + "grad_norm": 41.09375, + "learning_rate": 9.976437576245914e-06, + "loss": 17.223, + "step": 8340 + }, + { + "epoch": 0.15098058272176404, + "grad_norm": 36.84375, + "learning_rate": 9.976409323939253e-06, + "loss": 17.4693, + "step": 8350 + }, + { + "epoch": 0.15116139779089188, + "grad_norm": 41.71875, + "learning_rate": 9.976381071632593e-06, + "loss": 17.4322, + "step": 8360 + }, + { + "epoch": 0.15134221286001975, + "grad_norm": 41.96875, + "learning_rate": 9.976352819325936e-06, + "loss": 17.8045, + "step": 8370 + }, + { + "epoch": 0.15152302792914762, + "grad_norm": 40.59375, + "learning_rate": 9.976324567019275e-06, + "loss": 17.2138, + "step": 8380 + }, + { + "epoch": 0.15170384299827547, + "grad_norm": 39.6875, + "learning_rate": 9.976296314712615e-06, + "loss": 17.234, + "step": 8390 + }, + { + "epoch": 0.15188465806740334, + "grad_norm": 41.84375, + "learning_rate": 9.976268062405956e-06, + "loss": 17.3728, + "step": 8400 + }, + { + "epoch": 0.1520654731365312, + "grad_norm": 38.75, + "learning_rate": 9.976239810099296e-06, + "loss": 17.6384, + "step": 8410 + }, + { + "epoch": 0.15224628820565905, + "grad_norm": 40.46875, + "learning_rate": 9.976211557792637e-06, + "loss": 17.4172, + "step": 8420 + }, + { + "epoch": 0.15242710327478692, + "grad_norm": 40.84375, + "learning_rate": 9.976183305485978e-06, + "loss": 17.0899, + "step": 8430 + }, + { + "epoch": 0.1526079183439148, + "grad_norm": 39.71875, + "learning_rate": 9.976155053179317e-06, + "loss": 17.2965, + "step": 8440 + }, + { + "epoch": 0.15278873341304264, + "grad_norm": 40.75, + "learning_rate": 9.976126800872659e-06, + "loss": 17.5926, + "step": 8450 + }, + { + "epoch": 0.1529695484821705, + "grad_norm": 39.375, + "learning_rate": 9.976098548566e-06, + "loss": 17.4443, + "step": 8460 + }, + { + "epoch": 0.15315036355129835, + "grad_norm": 39.46875, + "learning_rate": 9.976070296259339e-06, + "loss": 17.2535, + "step": 8470 + }, + { + "epoch": 0.15333117862042622, + "grad_norm": 41.65625, + "learning_rate": 9.976042043952679e-06, + "loss": 17.4198, + "step": 8480 + }, + { + "epoch": 0.1535119936895541, + "grad_norm": 39.84375, + "learning_rate": 9.97601379164602e-06, + "loss": 17.372, + "step": 8490 + }, + { + "epoch": 0.15369280875868194, + "grad_norm": 40.28125, + "learning_rate": 9.97598553933936e-06, + "loss": 17.0233, + "step": 8500 + }, + { + "epoch": 0.1538736238278098, + "grad_norm": 39.65625, + "learning_rate": 9.975957287032701e-06, + "loss": 17.4556, + "step": 8510 + }, + { + "epoch": 0.15405443889693768, + "grad_norm": 40.1875, + "learning_rate": 9.97592903472604e-06, + "loss": 17.3115, + "step": 8520 + }, + { + "epoch": 0.15423525396606552, + "grad_norm": 42.46875, + "learning_rate": 9.97590078241938e-06, + "loss": 17.3447, + "step": 8530 + }, + { + "epoch": 0.1544160690351934, + "grad_norm": 37.8125, + "learning_rate": 9.975872530112723e-06, + "loss": 17.3876, + "step": 8540 + }, + { + "epoch": 0.15459688410432126, + "grad_norm": 39.03125, + "learning_rate": 9.975844277806064e-06, + "loss": 17.5301, + "step": 8550 + }, + { + "epoch": 0.1547776991734491, + "grad_norm": 38.96875, + "learning_rate": 9.975816025499402e-06, + "loss": 17.4396, + "step": 8560 + }, + { + "epoch": 0.15495851424257698, + "grad_norm": 40.84375, + "learning_rate": 9.975787773192743e-06, + "loss": 17.4825, + "step": 8570 + }, + { + "epoch": 0.15513932931170485, + "grad_norm": 39.96875, + "learning_rate": 9.975759520886084e-06, + "loss": 17.7308, + "step": 8580 + }, + { + "epoch": 0.1553201443808327, + "grad_norm": 40.34375, + "learning_rate": 9.975731268579424e-06, + "loss": 17.5016, + "step": 8590 + }, + { + "epoch": 0.15550095944996056, + "grad_norm": 40.40625, + "learning_rate": 9.975703016272765e-06, + "loss": 17.2755, + "step": 8600 + }, + { + "epoch": 0.15568177451908843, + "grad_norm": 37.09375, + "learning_rate": 9.975674763966104e-06, + "loss": 17.3056, + "step": 8610 + }, + { + "epoch": 0.15586258958821628, + "grad_norm": 41.09375, + "learning_rate": 9.975646511659444e-06, + "loss": 17.0374, + "step": 8620 + }, + { + "epoch": 0.15604340465734415, + "grad_norm": 41.15625, + "learning_rate": 9.975618259352787e-06, + "loss": 17.2957, + "step": 8630 + }, + { + "epoch": 0.156224219726472, + "grad_norm": 40.1875, + "learning_rate": 9.975590007046126e-06, + "loss": 17.1892, + "step": 8640 + }, + { + "epoch": 0.15640503479559986, + "grad_norm": 37.3125, + "learning_rate": 9.975561754739466e-06, + "loss": 17.7739, + "step": 8650 + }, + { + "epoch": 0.15658584986472773, + "grad_norm": 39.90625, + "learning_rate": 9.975533502432807e-06, + "loss": 17.861, + "step": 8660 + }, + { + "epoch": 0.15676666493385558, + "grad_norm": 39.96875, + "learning_rate": 9.975505250126148e-06, + "loss": 17.779, + "step": 8670 + }, + { + "epoch": 0.15694748000298345, + "grad_norm": 40.65625, + "learning_rate": 9.975476997819488e-06, + "loss": 17.6016, + "step": 8680 + }, + { + "epoch": 0.15712829507211132, + "grad_norm": 41.34375, + "learning_rate": 9.975448745512827e-06, + "loss": 17.1837, + "step": 8690 + }, + { + "epoch": 0.15730911014123916, + "grad_norm": 42.15625, + "learning_rate": 9.975420493206168e-06, + "loss": 17.3432, + "step": 8700 + }, + { + "epoch": 0.15748992521036703, + "grad_norm": 40.5, + "learning_rate": 9.975392240899508e-06, + "loss": 17.4633, + "step": 8710 + }, + { + "epoch": 0.1576707402794949, + "grad_norm": 40.875, + "learning_rate": 9.97536398859285e-06, + "loss": 17.3936, + "step": 8720 + }, + { + "epoch": 0.15785155534862275, + "grad_norm": 39.34375, + "learning_rate": 9.97533573628619e-06, + "loss": 17.2713, + "step": 8730 + }, + { + "epoch": 0.15803237041775062, + "grad_norm": 42.21875, + "learning_rate": 9.97530748397953e-06, + "loss": 17.163, + "step": 8740 + }, + { + "epoch": 0.1582131854868785, + "grad_norm": 41.75, + "learning_rate": 9.975279231672871e-06, + "loss": 17.3989, + "step": 8750 + }, + { + "epoch": 0.15839400055600633, + "grad_norm": 38.4375, + "learning_rate": 9.975250979366211e-06, + "loss": 17.3244, + "step": 8760 + }, + { + "epoch": 0.1585748156251342, + "grad_norm": 41.0, + "learning_rate": 9.975222727059552e-06, + "loss": 17.2113, + "step": 8770 + }, + { + "epoch": 0.15875563069426207, + "grad_norm": 37.875, + "learning_rate": 9.975194474752891e-06, + "loss": 17.142, + "step": 8780 + }, + { + "epoch": 0.15893644576338992, + "grad_norm": 41.15625, + "learning_rate": 9.975166222446232e-06, + "loss": 17.2934, + "step": 8790 + }, + { + "epoch": 0.1591172608325178, + "grad_norm": 41.5, + "learning_rate": 9.975137970139574e-06, + "loss": 17.4551, + "step": 8800 + }, + { + "epoch": 0.15929807590164563, + "grad_norm": 40.90625, + "learning_rate": 9.975109717832913e-06, + "loss": 17.2555, + "step": 8810 + }, + { + "epoch": 0.1594788909707735, + "grad_norm": 40.90625, + "learning_rate": 9.975081465526254e-06, + "loss": 17.4699, + "step": 8820 + }, + { + "epoch": 0.15965970603990137, + "grad_norm": 43.53125, + "learning_rate": 9.975053213219594e-06, + "loss": 17.3129, + "step": 8830 + }, + { + "epoch": 0.15984052110902922, + "grad_norm": 38.34375, + "learning_rate": 9.975024960912935e-06, + "loss": 17.7102, + "step": 8840 + }, + { + "epoch": 0.1600213361781571, + "grad_norm": 37.28125, + "learning_rate": 9.974996708606275e-06, + "loss": 16.9641, + "step": 8850 + }, + { + "epoch": 0.16020215124728496, + "grad_norm": 41.25, + "learning_rate": 9.974968456299616e-06, + "loss": 17.6302, + "step": 8860 + }, + { + "epoch": 0.1603829663164128, + "grad_norm": 43.28125, + "learning_rate": 9.974940203992955e-06, + "loss": 17.493, + "step": 8870 + }, + { + "epoch": 0.16056378138554067, + "grad_norm": 38.625, + "learning_rate": 9.974911951686296e-06, + "loss": 17.4427, + "step": 8880 + }, + { + "epoch": 0.16074459645466854, + "grad_norm": 41.375, + "learning_rate": 9.974883699379638e-06, + "loss": 17.3359, + "step": 8890 + }, + { + "epoch": 0.16092541152379639, + "grad_norm": 38.375, + "learning_rate": 9.974855447072977e-06, + "loss": 17.4055, + "step": 8900 + }, + { + "epoch": 0.16110622659292426, + "grad_norm": 38.34375, + "learning_rate": 9.974827194766317e-06, + "loss": 17.0807, + "step": 8910 + }, + { + "epoch": 0.16128704166205213, + "grad_norm": 42.09375, + "learning_rate": 9.974798942459658e-06, + "loss": 17.5087, + "step": 8920 + }, + { + "epoch": 0.16146785673117997, + "grad_norm": 38.9375, + "learning_rate": 9.974770690152999e-06, + "loss": 17.475, + "step": 8930 + }, + { + "epoch": 0.16164867180030784, + "grad_norm": 41.15625, + "learning_rate": 9.97474243784634e-06, + "loss": 17.2472, + "step": 8940 + }, + { + "epoch": 0.1618294868694357, + "grad_norm": 40.28125, + "learning_rate": 9.974714185539678e-06, + "loss": 17.5978, + "step": 8950 + }, + { + "epoch": 0.16201030193856356, + "grad_norm": 40.59375, + "learning_rate": 9.974685933233019e-06, + "loss": 17.1928, + "step": 8960 + }, + { + "epoch": 0.16219111700769143, + "grad_norm": 39.4375, + "learning_rate": 9.97465768092636e-06, + "loss": 17.3915, + "step": 8970 + }, + { + "epoch": 0.16237193207681927, + "grad_norm": 42.625, + "learning_rate": 9.974629428619702e-06, + "loss": 17.7899, + "step": 8980 + }, + { + "epoch": 0.16255274714594714, + "grad_norm": 39.375, + "learning_rate": 9.97460117631304e-06, + "loss": 17.9558, + "step": 8990 + }, + { + "epoch": 0.162733562215075, + "grad_norm": 38.84375, + "learning_rate": 9.974572924006381e-06, + "loss": 17.5389, + "step": 9000 + }, + { + "epoch": 0.16291437728420285, + "grad_norm": 40.59375, + "learning_rate": 9.974544671699722e-06, + "loss": 17.6044, + "step": 9010 + }, + { + "epoch": 0.16309519235333073, + "grad_norm": 39.28125, + "learning_rate": 9.974516419393063e-06, + "loss": 17.5651, + "step": 9020 + }, + { + "epoch": 0.1632760074224586, + "grad_norm": 40.25, + "learning_rate": 9.974488167086403e-06, + "loss": 17.4637, + "step": 9030 + }, + { + "epoch": 0.16345682249158644, + "grad_norm": 38.9375, + "learning_rate": 9.974459914779742e-06, + "loss": 17.348, + "step": 9040 + }, + { + "epoch": 0.1636376375607143, + "grad_norm": 40.28125, + "learning_rate": 9.974431662473083e-06, + "loss": 17.2592, + "step": 9050 + }, + { + "epoch": 0.16381845262984218, + "grad_norm": 39.75, + "learning_rate": 9.974403410166423e-06, + "loss": 17.5499, + "step": 9060 + }, + { + "epoch": 0.16399926769897002, + "grad_norm": 37.625, + "learning_rate": 9.974375157859764e-06, + "loss": 17.1442, + "step": 9070 + }, + { + "epoch": 0.1641800827680979, + "grad_norm": 40.5, + "learning_rate": 9.974346905553105e-06, + "loss": 17.0087, + "step": 9080 + }, + { + "epoch": 0.16436089783722577, + "grad_norm": 38.125, + "learning_rate": 9.974318653246445e-06, + "loss": 17.2556, + "step": 9090 + }, + { + "epoch": 0.1645417129063536, + "grad_norm": 37.9375, + "learning_rate": 9.974290400939786e-06, + "loss": 17.1482, + "step": 9100 + }, + { + "epoch": 0.16472252797548148, + "grad_norm": 40.4375, + "learning_rate": 9.974262148633126e-06, + "loss": 17.108, + "step": 9110 + }, + { + "epoch": 0.16490334304460932, + "grad_norm": 39.875, + "learning_rate": 9.974233896326465e-06, + "loss": 17.4758, + "step": 9120 + }, + { + "epoch": 0.1650841581137372, + "grad_norm": 40.90625, + "learning_rate": 9.974205644019806e-06, + "loss": 17.3085, + "step": 9130 + }, + { + "epoch": 0.16526497318286507, + "grad_norm": 38.59375, + "learning_rate": 9.974177391713147e-06, + "loss": 17.3685, + "step": 9140 + }, + { + "epoch": 0.1654457882519929, + "grad_norm": 43.0, + "learning_rate": 9.974149139406489e-06, + "loss": 17.382, + "step": 9150 + }, + { + "epoch": 0.16562660332112078, + "grad_norm": 41.53125, + "learning_rate": 9.974120887099828e-06, + "loss": 17.3543, + "step": 9160 + }, + { + "epoch": 0.16580741839024865, + "grad_norm": 41.0, + "learning_rate": 9.974092634793169e-06, + "loss": 17.5792, + "step": 9170 + }, + { + "epoch": 0.1659882334593765, + "grad_norm": 38.46875, + "learning_rate": 9.97406438248651e-06, + "loss": 17.8702, + "step": 9180 + }, + { + "epoch": 0.16616904852850437, + "grad_norm": 39.03125, + "learning_rate": 9.97403613017985e-06, + "loss": 17.4478, + "step": 9190 + }, + { + "epoch": 0.16634986359763224, + "grad_norm": 41.59375, + "learning_rate": 9.97400787787319e-06, + "loss": 17.2742, + "step": 9200 + }, + { + "epoch": 0.16653067866676008, + "grad_norm": 40.15625, + "learning_rate": 9.97397962556653e-06, + "loss": 17.5207, + "step": 9210 + }, + { + "epoch": 0.16671149373588795, + "grad_norm": 39.09375, + "learning_rate": 9.97395137325987e-06, + "loss": 17.5423, + "step": 9220 + }, + { + "epoch": 0.16689230880501582, + "grad_norm": 40.40625, + "learning_rate": 9.97392312095321e-06, + "loss": 17.708, + "step": 9230 + }, + { + "epoch": 0.16707312387414366, + "grad_norm": 43.5625, + "learning_rate": 9.973894868646551e-06, + "loss": 17.2694, + "step": 9240 + }, + { + "epoch": 0.16725393894327154, + "grad_norm": 39.75, + "learning_rate": 9.973866616339892e-06, + "loss": 17.4095, + "step": 9250 + }, + { + "epoch": 0.1674347540123994, + "grad_norm": 41.375, + "learning_rate": 9.973838364033232e-06, + "loss": 17.4842, + "step": 9260 + }, + { + "epoch": 0.16761556908152725, + "grad_norm": 38.125, + "learning_rate": 9.973810111726573e-06, + "loss": 17.4138, + "step": 9270 + }, + { + "epoch": 0.16779638415065512, + "grad_norm": 40.1875, + "learning_rate": 9.973781859419914e-06, + "loss": 17.4215, + "step": 9280 + }, + { + "epoch": 0.16797719921978296, + "grad_norm": 41.1875, + "learning_rate": 9.973753607113254e-06, + "loss": 17.5086, + "step": 9290 + }, + { + "epoch": 0.16815801428891083, + "grad_norm": 38.34375, + "learning_rate": 9.973725354806593e-06, + "loss": 17.5304, + "step": 9300 + }, + { + "epoch": 0.1683388293580387, + "grad_norm": 42.34375, + "learning_rate": 9.973697102499934e-06, + "loss": 17.6362, + "step": 9310 + }, + { + "epoch": 0.16851964442716655, + "grad_norm": 40.84375, + "learning_rate": 9.973668850193274e-06, + "loss": 17.7299, + "step": 9320 + }, + { + "epoch": 0.16870045949629442, + "grad_norm": 40.1875, + "learning_rate": 9.973640597886615e-06, + "loss": 17.6307, + "step": 9330 + }, + { + "epoch": 0.1688812745654223, + "grad_norm": 39.125, + "learning_rate": 9.973612345579956e-06, + "loss": 17.3122, + "step": 9340 + }, + { + "epoch": 0.16906208963455013, + "grad_norm": 40.5625, + "learning_rate": 9.973584093273296e-06, + "loss": 17.3277, + "step": 9350 + }, + { + "epoch": 0.169242904703678, + "grad_norm": 40.40625, + "learning_rate": 9.973555840966637e-06, + "loss": 17.4785, + "step": 9360 + }, + { + "epoch": 0.16942371977280588, + "grad_norm": 39.90625, + "learning_rate": 9.973527588659978e-06, + "loss": 17.1665, + "step": 9370 + }, + { + "epoch": 0.16960453484193372, + "grad_norm": 39.625, + "learning_rate": 9.973499336353317e-06, + "loss": 17.2563, + "step": 9380 + }, + { + "epoch": 0.1697853499110616, + "grad_norm": 37.71875, + "learning_rate": 9.973471084046657e-06, + "loss": 17.3261, + "step": 9390 + }, + { + "epoch": 0.16996616498018946, + "grad_norm": 38.5, + "learning_rate": 9.973442831739998e-06, + "loss": 17.3375, + "step": 9400 + }, + { + "epoch": 0.1701469800493173, + "grad_norm": 43.25, + "learning_rate": 9.973414579433338e-06, + "loss": 17.1612, + "step": 9410 + }, + { + "epoch": 0.17032779511844517, + "grad_norm": 41.4375, + "learning_rate": 9.973386327126679e-06, + "loss": 17.3999, + "step": 9420 + }, + { + "epoch": 0.17050861018757305, + "grad_norm": 42.9375, + "learning_rate": 9.97335807482002e-06, + "loss": 17.4043, + "step": 9430 + }, + { + "epoch": 0.1706894252567009, + "grad_norm": 42.65625, + "learning_rate": 9.97332982251336e-06, + "loss": 17.2566, + "step": 9440 + }, + { + "epoch": 0.17087024032582876, + "grad_norm": 41.3125, + "learning_rate": 9.973301570206701e-06, + "loss": 17.311, + "step": 9450 + }, + { + "epoch": 0.1710510553949566, + "grad_norm": 41.25, + "learning_rate": 9.973273317900041e-06, + "loss": 17.0837, + "step": 9460 + }, + { + "epoch": 0.17123187046408447, + "grad_norm": 37.28125, + "learning_rate": 9.97324506559338e-06, + "loss": 17.1674, + "step": 9470 + }, + { + "epoch": 0.17141268553321234, + "grad_norm": 41.40625, + "learning_rate": 9.973216813286721e-06, + "loss": 17.3728, + "step": 9480 + }, + { + "epoch": 0.1715935006023402, + "grad_norm": 42.125, + "learning_rate": 9.973188560980062e-06, + "loss": 17.5858, + "step": 9490 + }, + { + "epoch": 0.17177431567146806, + "grad_norm": 37.5625, + "learning_rate": 9.973160308673402e-06, + "loss": 17.2968, + "step": 9500 + }, + { + "epoch": 0.17195513074059593, + "grad_norm": 41.84375, + "learning_rate": 9.973132056366743e-06, + "loss": 17.3369, + "step": 9510 + }, + { + "epoch": 0.17213594580972377, + "grad_norm": 42.03125, + "learning_rate": 9.973103804060084e-06, + "loss": 17.0527, + "step": 9520 + }, + { + "epoch": 0.17231676087885164, + "grad_norm": 40.75, + "learning_rate": 9.973075551753424e-06, + "loss": 17.159, + "step": 9530 + }, + { + "epoch": 0.17249757594797951, + "grad_norm": 41.03125, + "learning_rate": 9.973047299446765e-06, + "loss": 17.5794, + "step": 9540 + }, + { + "epoch": 0.17267839101710736, + "grad_norm": 42.0, + "learning_rate": 9.973019047140104e-06, + "loss": 17.6003, + "step": 9550 + }, + { + "epoch": 0.17285920608623523, + "grad_norm": 42.1875, + "learning_rate": 9.972990794833444e-06, + "loss": 17.372, + "step": 9560 + }, + { + "epoch": 0.1730400211553631, + "grad_norm": 40.84375, + "learning_rate": 9.972962542526785e-06, + "loss": 16.9928, + "step": 9570 + }, + { + "epoch": 0.17322083622449094, + "grad_norm": 39.65625, + "learning_rate": 9.972934290220126e-06, + "loss": 17.6939, + "step": 9580 + }, + { + "epoch": 0.1734016512936188, + "grad_norm": 44.5625, + "learning_rate": 9.972906037913466e-06, + "loss": 17.4905, + "step": 9590 + }, + { + "epoch": 0.17358246636274668, + "grad_norm": 38.6875, + "learning_rate": 9.972877785606807e-06, + "loss": 17.1858, + "step": 9600 + }, + { + "epoch": 0.17376328143187453, + "grad_norm": 39.125, + "learning_rate": 9.972849533300147e-06, + "loss": 17.3789, + "step": 9610 + }, + { + "epoch": 0.1739440965010024, + "grad_norm": 38.4375, + "learning_rate": 9.972821280993488e-06, + "loss": 17.4596, + "step": 9620 + }, + { + "epoch": 0.17412491157013024, + "grad_norm": 41.25, + "learning_rate": 9.972793028686829e-06, + "loss": 17.162, + "step": 9630 + }, + { + "epoch": 0.1743057266392581, + "grad_norm": 42.25, + "learning_rate": 9.972764776380168e-06, + "loss": 17.4351, + "step": 9640 + }, + { + "epoch": 0.17448654170838598, + "grad_norm": 40.625, + "learning_rate": 9.972736524073508e-06, + "loss": 17.6255, + "step": 9650 + }, + { + "epoch": 0.17466735677751383, + "grad_norm": 40.34375, + "learning_rate": 9.972708271766849e-06, + "loss": 17.2791, + "step": 9660 + }, + { + "epoch": 0.1748481718466417, + "grad_norm": 43.0, + "learning_rate": 9.97268001946019e-06, + "loss": 17.5448, + "step": 9670 + }, + { + "epoch": 0.17502898691576957, + "grad_norm": 40.65625, + "learning_rate": 9.97265176715353e-06, + "loss": 17.4885, + "step": 9680 + }, + { + "epoch": 0.1752098019848974, + "grad_norm": 38.9375, + "learning_rate": 9.97262351484687e-06, + "loss": 17.652, + "step": 9690 + }, + { + "epoch": 0.17539061705402528, + "grad_norm": 41.09375, + "learning_rate": 9.972595262540211e-06, + "loss": 17.7164, + "step": 9700 + }, + { + "epoch": 0.17557143212315315, + "grad_norm": 39.71875, + "learning_rate": 9.972567010233552e-06, + "loss": 17.4173, + "step": 9710 + }, + { + "epoch": 0.175752247192281, + "grad_norm": 40.34375, + "learning_rate": 9.972538757926891e-06, + "loss": 17.8984, + "step": 9720 + }, + { + "epoch": 0.17593306226140887, + "grad_norm": 40.53125, + "learning_rate": 9.972510505620232e-06, + "loss": 17.6743, + "step": 9730 + }, + { + "epoch": 0.17611387733053674, + "grad_norm": 41.875, + "learning_rate": 9.972482253313572e-06, + "loss": 17.1897, + "step": 9740 + }, + { + "epoch": 0.17629469239966458, + "grad_norm": 41.90625, + "learning_rate": 9.972454001006913e-06, + "loss": 17.4059, + "step": 9750 + }, + { + "epoch": 0.17647550746879245, + "grad_norm": 40.84375, + "learning_rate": 9.972425748700253e-06, + "loss": 17.2298, + "step": 9760 + }, + { + "epoch": 0.17665632253792032, + "grad_norm": 39.25, + "learning_rate": 9.972397496393594e-06, + "loss": 17.4787, + "step": 9770 + }, + { + "epoch": 0.17683713760704817, + "grad_norm": 39.0, + "learning_rate": 9.972369244086935e-06, + "loss": 16.8661, + "step": 9780 + }, + { + "epoch": 0.17701795267617604, + "grad_norm": 39.09375, + "learning_rate": 9.972340991780275e-06, + "loss": 16.9897, + "step": 9790 + }, + { + "epoch": 0.17719876774530388, + "grad_norm": 40.625, + "learning_rate": 9.972312739473616e-06, + "loss": 17.3103, + "step": 9800 + }, + { + "epoch": 0.17737958281443175, + "grad_norm": 41.75, + "learning_rate": 9.972284487166955e-06, + "loss": 17.4173, + "step": 9810 + }, + { + "epoch": 0.17756039788355962, + "grad_norm": 42.34375, + "learning_rate": 9.972256234860295e-06, + "loss": 17.5176, + "step": 9820 + }, + { + "epoch": 0.17774121295268747, + "grad_norm": 40.59375, + "learning_rate": 9.972227982553636e-06, + "loss": 17.8892, + "step": 9830 + }, + { + "epoch": 0.17792202802181534, + "grad_norm": 40.15625, + "learning_rate": 9.972199730246977e-06, + "loss": 17.3229, + "step": 9840 + }, + { + "epoch": 0.1781028430909432, + "grad_norm": 39.40625, + "learning_rate": 9.972171477940317e-06, + "loss": 17.4932, + "step": 9850 + }, + { + "epoch": 0.17828365816007105, + "grad_norm": 39.40625, + "learning_rate": 9.972143225633658e-06, + "loss": 17.1002, + "step": 9860 + }, + { + "epoch": 0.17846447322919892, + "grad_norm": 41.46875, + "learning_rate": 9.972114973326999e-06, + "loss": 17.3818, + "step": 9870 + }, + { + "epoch": 0.1786452882983268, + "grad_norm": 40.25, + "learning_rate": 9.97208672102034e-06, + "loss": 17.3064, + "step": 9880 + }, + { + "epoch": 0.17882610336745464, + "grad_norm": 41.71875, + "learning_rate": 9.97205846871368e-06, + "loss": 17.0175, + "step": 9890 + }, + { + "epoch": 0.1790069184365825, + "grad_norm": 40.25, + "learning_rate": 9.972030216407019e-06, + "loss": 17.5675, + "step": 9900 + }, + { + "epoch": 0.17918773350571038, + "grad_norm": 40.75, + "learning_rate": 9.97200196410036e-06, + "loss": 17.2624, + "step": 9910 + }, + { + "epoch": 0.17936854857483822, + "grad_norm": 42.09375, + "learning_rate": 9.9719737117937e-06, + "loss": 16.7743, + "step": 9920 + }, + { + "epoch": 0.1795493636439661, + "grad_norm": 41.21875, + "learning_rate": 9.97194545948704e-06, + "loss": 17.372, + "step": 9930 + }, + { + "epoch": 0.17973017871309394, + "grad_norm": 41.09375, + "learning_rate": 9.971917207180381e-06, + "loss": 17.2744, + "step": 9940 + }, + { + "epoch": 0.1799109937822218, + "grad_norm": 39.28125, + "learning_rate": 9.971888954873722e-06, + "loss": 17.3897, + "step": 9950 + }, + { + "epoch": 0.18009180885134968, + "grad_norm": 38.6875, + "learning_rate": 9.971860702567062e-06, + "loss": 17.1437, + "step": 9960 + }, + { + "epoch": 0.18027262392047752, + "grad_norm": 41.0, + "learning_rate": 9.971832450260403e-06, + "loss": 17.4114, + "step": 9970 + }, + { + "epoch": 0.1804534389896054, + "grad_norm": 41.15625, + "learning_rate": 9.971804197953742e-06, + "loss": 17.2309, + "step": 9980 + }, + { + "epoch": 0.18063425405873326, + "grad_norm": 38.46875, + "learning_rate": 9.971775945647083e-06, + "loss": 17.4542, + "step": 9990 + }, + { + "epoch": 0.1808150691278611, + "grad_norm": 40.1875, + "learning_rate": 9.971747693340423e-06, + "loss": 17.4604, + "step": 10000 + }, + { + "epoch": 0.1808150691278611, + "eval_loss": 2.173656463623047, + "eval_runtime": 230.3398, + "eval_samples_per_second": 3152.122, + "eval_steps_per_second": 49.253, + "step": 10000 + }, + { + "epoch": 0.18099588419698898, + "grad_norm": 38.78125, + "learning_rate": 9.971719441033764e-06, + "loss": 17.6182, + "step": 10010 + }, + { + "epoch": 0.18117669926611685, + "grad_norm": 42.90625, + "learning_rate": 9.971691188727104e-06, + "loss": 17.3686, + "step": 10020 + }, + { + "epoch": 0.1813575143352447, + "grad_norm": 41.15625, + "learning_rate": 9.971662936420445e-06, + "loss": 16.8379, + "step": 10030 + }, + { + "epoch": 0.18153832940437256, + "grad_norm": 37.84375, + "learning_rate": 9.971634684113786e-06, + "loss": 17.3498, + "step": 10040 + }, + { + "epoch": 0.18171914447350043, + "grad_norm": 38.84375, + "learning_rate": 9.971606431807126e-06, + "loss": 17.3983, + "step": 10050 + }, + { + "epoch": 0.18189995954262828, + "grad_norm": 43.8125, + "learning_rate": 9.971578179500467e-06, + "loss": 17.4824, + "step": 10060 + }, + { + "epoch": 0.18208077461175615, + "grad_norm": 39.46875, + "learning_rate": 9.971549927193806e-06, + "loss": 17.3409, + "step": 10070 + }, + { + "epoch": 0.18226158968088402, + "grad_norm": 41.15625, + "learning_rate": 9.971521674887147e-06, + "loss": 17.2479, + "step": 10080 + }, + { + "epoch": 0.18244240475001186, + "grad_norm": 43.90625, + "learning_rate": 9.971493422580487e-06, + "loss": 17.4112, + "step": 10090 + }, + { + "epoch": 0.18262321981913973, + "grad_norm": 38.71875, + "learning_rate": 9.971465170273828e-06, + "loss": 17.4876, + "step": 10100 + }, + { + "epoch": 0.18280403488826757, + "grad_norm": 39.125, + "learning_rate": 9.971436917967168e-06, + "loss": 17.2352, + "step": 10110 + }, + { + "epoch": 0.18298484995739545, + "grad_norm": 40.34375, + "learning_rate": 9.971408665660509e-06, + "loss": 17.4973, + "step": 10120 + }, + { + "epoch": 0.18316566502652332, + "grad_norm": 38.875, + "learning_rate": 9.97138041335385e-06, + "loss": 17.0968, + "step": 10130 + }, + { + "epoch": 0.18334648009565116, + "grad_norm": 40.5625, + "learning_rate": 9.97135216104719e-06, + "loss": 17.0191, + "step": 10140 + }, + { + "epoch": 0.18352729516477903, + "grad_norm": 42.21875, + "learning_rate": 9.97132390874053e-06, + "loss": 17.3471, + "step": 10150 + }, + { + "epoch": 0.1837081102339069, + "grad_norm": 40.28125, + "learning_rate": 9.97129565643387e-06, + "loss": 17.1329, + "step": 10160 + }, + { + "epoch": 0.18388892530303474, + "grad_norm": 39.4375, + "learning_rate": 9.97126740412721e-06, + "loss": 17.4335, + "step": 10170 + }, + { + "epoch": 0.18406974037216262, + "grad_norm": 38.75, + "learning_rate": 9.971239151820551e-06, + "loss": 17.201, + "step": 10180 + }, + { + "epoch": 0.1842505554412905, + "grad_norm": 41.125, + "learning_rate": 9.971210899513892e-06, + "loss": 17.3499, + "step": 10190 + }, + { + "epoch": 0.18443137051041833, + "grad_norm": 39.90625, + "learning_rate": 9.971182647207232e-06, + "loss": 17.1541, + "step": 10200 + }, + { + "epoch": 0.1846121855795462, + "grad_norm": 41.90625, + "learning_rate": 9.971154394900573e-06, + "loss": 17.4216, + "step": 10210 + }, + { + "epoch": 0.18479300064867407, + "grad_norm": 42.4375, + "learning_rate": 9.971126142593914e-06, + "loss": 17.228, + "step": 10220 + }, + { + "epoch": 0.18497381571780191, + "grad_norm": 42.0, + "learning_rate": 9.971097890287254e-06, + "loss": 17.3865, + "step": 10230 + }, + { + "epoch": 0.18515463078692979, + "grad_norm": 40.4375, + "learning_rate": 9.971069637980593e-06, + "loss": 17.1773, + "step": 10240 + }, + { + "epoch": 0.18533544585605766, + "grad_norm": 39.78125, + "learning_rate": 9.971041385673934e-06, + "loss": 17.2779, + "step": 10250 + }, + { + "epoch": 0.1855162609251855, + "grad_norm": 42.59375, + "learning_rate": 9.971013133367274e-06, + "loss": 17.4225, + "step": 10260 + }, + { + "epoch": 0.18569707599431337, + "grad_norm": 39.03125, + "learning_rate": 9.970984881060615e-06, + "loss": 17.1712, + "step": 10270 + }, + { + "epoch": 0.18587789106344121, + "grad_norm": 42.34375, + "learning_rate": 9.970956628753956e-06, + "loss": 17.1064, + "step": 10280 + }, + { + "epoch": 0.18605870613256908, + "grad_norm": 38.875, + "learning_rate": 9.970928376447296e-06, + "loss": 17.4685, + "step": 10290 + }, + { + "epoch": 0.18623952120169696, + "grad_norm": 43.15625, + "learning_rate": 9.970900124140637e-06, + "loss": 17.152, + "step": 10300 + }, + { + "epoch": 0.1864203362708248, + "grad_norm": 40.875, + "learning_rate": 9.970871871833977e-06, + "loss": 17.4354, + "step": 10310 + }, + { + "epoch": 0.18660115133995267, + "grad_norm": 40.6875, + "learning_rate": 9.970843619527318e-06, + "loss": 17.0856, + "step": 10320 + }, + { + "epoch": 0.18678196640908054, + "grad_norm": 38.21875, + "learning_rate": 9.970815367220657e-06, + "loss": 17.2967, + "step": 10330 + }, + { + "epoch": 0.18696278147820838, + "grad_norm": 41.21875, + "learning_rate": 9.970787114913998e-06, + "loss": 17.6692, + "step": 10340 + }, + { + "epoch": 0.18714359654733626, + "grad_norm": 41.71875, + "learning_rate": 9.970758862607338e-06, + "loss": 17.5644, + "step": 10350 + }, + { + "epoch": 0.18732441161646413, + "grad_norm": 39.09375, + "learning_rate": 9.970730610300679e-06, + "loss": 17.5649, + "step": 10360 + }, + { + "epoch": 0.18750522668559197, + "grad_norm": 40.59375, + "learning_rate": 9.97070235799402e-06, + "loss": 17.1662, + "step": 10370 + }, + { + "epoch": 0.18768604175471984, + "grad_norm": 41.0625, + "learning_rate": 9.97067410568736e-06, + "loss": 17.7755, + "step": 10380 + }, + { + "epoch": 0.1878668568238477, + "grad_norm": 39.40625, + "learning_rate": 9.9706458533807e-06, + "loss": 17.0753, + "step": 10390 + }, + { + "epoch": 0.18804767189297555, + "grad_norm": 41.53125, + "learning_rate": 9.970617601074041e-06, + "loss": 17.3117, + "step": 10400 + }, + { + "epoch": 0.18822848696210343, + "grad_norm": 41.34375, + "learning_rate": 9.97058934876738e-06, + "loss": 17.3471, + "step": 10410 + }, + { + "epoch": 0.1884093020312313, + "grad_norm": 37.625, + "learning_rate": 9.970561096460721e-06, + "loss": 17.3984, + "step": 10420 + }, + { + "epoch": 0.18859011710035914, + "grad_norm": 39.6875, + "learning_rate": 9.970532844154062e-06, + "loss": 17.0128, + "step": 10430 + }, + { + "epoch": 0.188770932169487, + "grad_norm": 41.0625, + "learning_rate": 9.970504591847402e-06, + "loss": 17.2128, + "step": 10440 + }, + { + "epoch": 0.18895174723861485, + "grad_norm": 41.21875, + "learning_rate": 9.970476339540743e-06, + "loss": 17.5359, + "step": 10450 + }, + { + "epoch": 0.18913256230774272, + "grad_norm": 39.6875, + "learning_rate": 9.970448087234082e-06, + "loss": 16.8257, + "step": 10460 + }, + { + "epoch": 0.1893133773768706, + "grad_norm": 40.3125, + "learning_rate": 9.970419834927424e-06, + "loss": 17.617, + "step": 10470 + }, + { + "epoch": 0.18949419244599844, + "grad_norm": 40.0, + "learning_rate": 9.970391582620765e-06, + "loss": 18.0481, + "step": 10480 + }, + { + "epoch": 0.1896750075151263, + "grad_norm": 42.375, + "learning_rate": 9.970363330314105e-06, + "loss": 17.4429, + "step": 10490 + }, + { + "epoch": 0.18985582258425418, + "grad_norm": 39.1875, + "learning_rate": 9.970335078007444e-06, + "loss": 17.5725, + "step": 10500 + }, + { + "epoch": 0.19003663765338202, + "grad_norm": 41.21875, + "learning_rate": 9.970306825700785e-06, + "loss": 17.3861, + "step": 10510 + }, + { + "epoch": 0.1902174527225099, + "grad_norm": 38.875, + "learning_rate": 9.970278573394125e-06, + "loss": 17.8106, + "step": 10520 + }, + { + "epoch": 0.19039826779163777, + "grad_norm": 40.46875, + "learning_rate": 9.970250321087466e-06, + "loss": 17.1235, + "step": 10530 + }, + { + "epoch": 0.1905790828607656, + "grad_norm": 38.8125, + "learning_rate": 9.970222068780807e-06, + "loss": 17.1585, + "step": 10540 + }, + { + "epoch": 0.19075989792989348, + "grad_norm": 39.65625, + "learning_rate": 9.970193816474146e-06, + "loss": 17.3798, + "step": 10550 + }, + { + "epoch": 0.19094071299902135, + "grad_norm": 42.15625, + "learning_rate": 9.970165564167488e-06, + "loss": 16.9925, + "step": 10560 + }, + { + "epoch": 0.1911215280681492, + "grad_norm": 41.09375, + "learning_rate": 9.970137311860829e-06, + "loss": 17.6112, + "step": 10570 + }, + { + "epoch": 0.19130234313727706, + "grad_norm": 41.5, + "learning_rate": 9.970109059554167e-06, + "loss": 17.0915, + "step": 10580 + }, + { + "epoch": 0.19148315820640494, + "grad_norm": 37.09375, + "learning_rate": 9.970080807247508e-06, + "loss": 17.5033, + "step": 10590 + }, + { + "epoch": 0.19166397327553278, + "grad_norm": 40.15625, + "learning_rate": 9.970052554940849e-06, + "loss": 17.2897, + "step": 10600 + }, + { + "epoch": 0.19184478834466065, + "grad_norm": 41.375, + "learning_rate": 9.97002430263419e-06, + "loss": 17.5192, + "step": 10610 + }, + { + "epoch": 0.1920256034137885, + "grad_norm": 39.125, + "learning_rate": 9.96999605032753e-06, + "loss": 17.6841, + "step": 10620 + }, + { + "epoch": 0.19220641848291636, + "grad_norm": 37.90625, + "learning_rate": 9.96996779802087e-06, + "loss": 17.2809, + "step": 10630 + }, + { + "epoch": 0.19238723355204423, + "grad_norm": 40.53125, + "learning_rate": 9.969939545714211e-06, + "loss": 17.3026, + "step": 10640 + }, + { + "epoch": 0.19256804862117208, + "grad_norm": 39.96875, + "learning_rate": 9.969911293407552e-06, + "loss": 17.5597, + "step": 10650 + }, + { + "epoch": 0.19274886369029995, + "grad_norm": 40.90625, + "learning_rate": 9.969883041100892e-06, + "loss": 17.0397, + "step": 10660 + }, + { + "epoch": 0.19292967875942782, + "grad_norm": 39.9375, + "learning_rate": 9.969854788794231e-06, + "loss": 17.7761, + "step": 10670 + }, + { + "epoch": 0.19311049382855566, + "grad_norm": 42.3125, + "learning_rate": 9.969826536487572e-06, + "loss": 17.5117, + "step": 10680 + }, + { + "epoch": 0.19329130889768353, + "grad_norm": 38.0, + "learning_rate": 9.969798284180913e-06, + "loss": 17.26, + "step": 10690 + }, + { + "epoch": 0.1934721239668114, + "grad_norm": 40.40625, + "learning_rate": 9.969770031874253e-06, + "loss": 17.8356, + "step": 10700 + }, + { + "epoch": 0.19365293903593925, + "grad_norm": 39.9375, + "learning_rate": 9.969741779567594e-06, + "loss": 17.7687, + "step": 10710 + }, + { + "epoch": 0.19383375410506712, + "grad_norm": 40.84375, + "learning_rate": 9.969713527260933e-06, + "loss": 17.4783, + "step": 10720 + }, + { + "epoch": 0.194014569174195, + "grad_norm": 43.03125, + "learning_rate": 9.969685274954275e-06, + "loss": 17.7937, + "step": 10730 + }, + { + "epoch": 0.19419538424332283, + "grad_norm": 43.15625, + "learning_rate": 9.969657022647616e-06, + "loss": 17.655, + "step": 10740 + }, + { + "epoch": 0.1943761993124507, + "grad_norm": 41.28125, + "learning_rate": 9.969628770340956e-06, + "loss": 17.4106, + "step": 10750 + }, + { + "epoch": 0.19455701438157855, + "grad_norm": 40.75, + "learning_rate": 9.969600518034295e-06, + "loss": 17.2935, + "step": 10760 + }, + { + "epoch": 0.19473782945070642, + "grad_norm": 38.75, + "learning_rate": 9.969572265727636e-06, + "loss": 17.3053, + "step": 10770 + }, + { + "epoch": 0.1949186445198343, + "grad_norm": 39.78125, + "learning_rate": 9.969544013420977e-06, + "loss": 17.6152, + "step": 10780 + }, + { + "epoch": 0.19509945958896213, + "grad_norm": 43.40625, + "learning_rate": 9.969515761114317e-06, + "loss": 17.6451, + "step": 10790 + }, + { + "epoch": 0.19528027465809, + "grad_norm": 40.9375, + "learning_rate": 9.969487508807658e-06, + "loss": 17.323, + "step": 10800 + }, + { + "epoch": 0.19546108972721787, + "grad_norm": 39.1875, + "learning_rate": 9.969459256500997e-06, + "loss": 17.0142, + "step": 10810 + }, + { + "epoch": 0.19564190479634572, + "grad_norm": 41.34375, + "learning_rate": 9.969431004194339e-06, + "loss": 17.7659, + "step": 10820 + }, + { + "epoch": 0.1958227198654736, + "grad_norm": 43.375, + "learning_rate": 9.96940275188768e-06, + "loss": 17.0923, + "step": 10830 + }, + { + "epoch": 0.19600353493460146, + "grad_norm": 37.625, + "learning_rate": 9.969374499581019e-06, + "loss": 17.2729, + "step": 10840 + }, + { + "epoch": 0.1961843500037293, + "grad_norm": 41.40625, + "learning_rate": 9.96934624727436e-06, + "loss": 17.4684, + "step": 10850 + }, + { + "epoch": 0.19636516507285717, + "grad_norm": 38.5, + "learning_rate": 9.9693179949677e-06, + "loss": 17.3878, + "step": 10860 + }, + { + "epoch": 0.19654598014198504, + "grad_norm": 40.71875, + "learning_rate": 9.96928974266104e-06, + "loss": 17.1617, + "step": 10870 + }, + { + "epoch": 0.1967267952111129, + "grad_norm": 40.8125, + "learning_rate": 9.969261490354381e-06, + "loss": 17.1731, + "step": 10880 + }, + { + "epoch": 0.19690761028024076, + "grad_norm": 41.03125, + "learning_rate": 9.96923323804772e-06, + "loss": 17.5819, + "step": 10890 + }, + { + "epoch": 0.19708842534936863, + "grad_norm": 39.5, + "learning_rate": 9.96920498574106e-06, + "loss": 17.3935, + "step": 10900 + }, + { + "epoch": 0.19726924041849647, + "grad_norm": 40.96875, + "learning_rate": 9.969176733434403e-06, + "loss": 17.4576, + "step": 10910 + }, + { + "epoch": 0.19745005548762434, + "grad_norm": 38.1875, + "learning_rate": 9.969148481127744e-06, + "loss": 17.3355, + "step": 10920 + }, + { + "epoch": 0.1976308705567522, + "grad_norm": 39.1875, + "learning_rate": 9.969120228821082e-06, + "loss": 17.1319, + "step": 10930 + }, + { + "epoch": 0.19781168562588006, + "grad_norm": 38.9375, + "learning_rate": 9.969091976514423e-06, + "loss": 17.4143, + "step": 10940 + }, + { + "epoch": 0.19799250069500793, + "grad_norm": 41.46875, + "learning_rate": 9.969063724207764e-06, + "loss": 17.4249, + "step": 10950 + }, + { + "epoch": 0.19817331576413577, + "grad_norm": 41.5, + "learning_rate": 9.969035471901104e-06, + "loss": 17.1293, + "step": 10960 + }, + { + "epoch": 0.19835413083326364, + "grad_norm": 41.6875, + "learning_rate": 9.969007219594445e-06, + "loss": 17.4503, + "step": 10970 + }, + { + "epoch": 0.1985349459023915, + "grad_norm": 38.4375, + "learning_rate": 9.968978967287784e-06, + "loss": 17.0614, + "step": 10980 + }, + { + "epoch": 0.19871576097151936, + "grad_norm": 39.09375, + "learning_rate": 9.968950714981126e-06, + "loss": 17.3571, + "step": 10990 + }, + { + "epoch": 0.19889657604064723, + "grad_norm": 39.1875, + "learning_rate": 9.968922462674467e-06, + "loss": 17.5243, + "step": 11000 + }, + { + "epoch": 0.1990773911097751, + "grad_norm": 38.9375, + "learning_rate": 9.968894210367806e-06, + "loss": 17.303, + "step": 11010 + }, + { + "epoch": 0.19925820617890294, + "grad_norm": 40.375, + "learning_rate": 9.968865958061146e-06, + "loss": 17.3339, + "step": 11020 + }, + { + "epoch": 0.1994390212480308, + "grad_norm": 40.75, + "learning_rate": 9.968837705754487e-06, + "loss": 17.4386, + "step": 11030 + }, + { + "epoch": 0.19961983631715868, + "grad_norm": 38.96875, + "learning_rate": 9.968809453447828e-06, + "loss": 17.4172, + "step": 11040 + }, + { + "epoch": 0.19980065138628653, + "grad_norm": 40.59375, + "learning_rate": 9.968781201141168e-06, + "loss": 17.1305, + "step": 11050 + }, + { + "epoch": 0.1999814664554144, + "grad_norm": 39.625, + "learning_rate": 9.968752948834509e-06, + "loss": 17.4369, + "step": 11060 + }, + { + "epoch": 0.20016228152454227, + "grad_norm": 41.59375, + "learning_rate": 9.968724696527848e-06, + "loss": 16.9403, + "step": 11070 + }, + { + "epoch": 0.2003430965936701, + "grad_norm": 40.21875, + "learning_rate": 9.96869644422119e-06, + "loss": 16.5315, + "step": 11080 + }, + { + "epoch": 0.20052391166279798, + "grad_norm": 41.40625, + "learning_rate": 9.96866819191453e-06, + "loss": 17.3027, + "step": 11090 + }, + { + "epoch": 0.20070472673192583, + "grad_norm": 39.3125, + "learning_rate": 9.96863993960787e-06, + "loss": 17.1912, + "step": 11100 + }, + { + "epoch": 0.2008855418010537, + "grad_norm": 43.53125, + "learning_rate": 9.96861168730121e-06, + "loss": 17.6024, + "step": 11110 + }, + { + "epoch": 0.20106635687018157, + "grad_norm": 38.6875, + "learning_rate": 9.968583434994551e-06, + "loss": 17.4576, + "step": 11120 + }, + { + "epoch": 0.2012471719393094, + "grad_norm": 39.84375, + "learning_rate": 9.968555182687892e-06, + "loss": 17.3747, + "step": 11130 + }, + { + "epoch": 0.20142798700843728, + "grad_norm": 40.15625, + "learning_rate": 9.968526930381232e-06, + "loss": 17.0922, + "step": 11140 + }, + { + "epoch": 0.20160880207756515, + "grad_norm": 39.5625, + "learning_rate": 9.968498678074571e-06, + "loss": 17.3724, + "step": 11150 + }, + { + "epoch": 0.201789617146693, + "grad_norm": 39.96875, + "learning_rate": 9.968470425767912e-06, + "loss": 17.3961, + "step": 11160 + }, + { + "epoch": 0.20197043221582087, + "grad_norm": 42.9375, + "learning_rate": 9.968442173461254e-06, + "loss": 17.4708, + "step": 11170 + }, + { + "epoch": 0.20215124728494874, + "grad_norm": 41.28125, + "learning_rate": 9.968413921154595e-06, + "loss": 17.5796, + "step": 11180 + }, + { + "epoch": 0.20233206235407658, + "grad_norm": 41.15625, + "learning_rate": 9.968385668847934e-06, + "loss": 17.803, + "step": 11190 + }, + { + "epoch": 0.20251287742320445, + "grad_norm": 40.0625, + "learning_rate": 9.968357416541274e-06, + "loss": 17.0907, + "step": 11200 + }, + { + "epoch": 0.20269369249233232, + "grad_norm": 38.6875, + "learning_rate": 9.968329164234615e-06, + "loss": 17.4098, + "step": 11210 + }, + { + "epoch": 0.20287450756146017, + "grad_norm": 41.3125, + "learning_rate": 9.968300911927955e-06, + "loss": 17.3981, + "step": 11220 + }, + { + "epoch": 0.20305532263058804, + "grad_norm": 42.40625, + "learning_rate": 9.968272659621296e-06, + "loss": 17.505, + "step": 11230 + }, + { + "epoch": 0.2032361376997159, + "grad_norm": 40.03125, + "learning_rate": 9.968244407314635e-06, + "loss": 17.1375, + "step": 11240 + }, + { + "epoch": 0.20341695276884375, + "grad_norm": 42.09375, + "learning_rate": 9.968216155007976e-06, + "loss": 17.1601, + "step": 11250 + }, + { + "epoch": 0.20359776783797162, + "grad_norm": 39.0625, + "learning_rate": 9.968187902701318e-06, + "loss": 17.2334, + "step": 11260 + }, + { + "epoch": 0.20377858290709946, + "grad_norm": 39.90625, + "learning_rate": 9.968159650394657e-06, + "loss": 16.8655, + "step": 11270 + }, + { + "epoch": 0.20395939797622734, + "grad_norm": 43.40625, + "learning_rate": 9.968131398087997e-06, + "loss": 17.5414, + "step": 11280 + }, + { + "epoch": 0.2041402130453552, + "grad_norm": 38.0, + "learning_rate": 9.968103145781338e-06, + "loss": 17.5637, + "step": 11290 + }, + { + "epoch": 0.20432102811448305, + "grad_norm": 40.625, + "learning_rate": 9.968074893474679e-06, + "loss": 17.0316, + "step": 11300 + }, + { + "epoch": 0.20450184318361092, + "grad_norm": 42.59375, + "learning_rate": 9.96804664116802e-06, + "loss": 17.5458, + "step": 11310 + }, + { + "epoch": 0.2046826582527388, + "grad_norm": 38.9375, + "learning_rate": 9.968018388861358e-06, + "loss": 17.4147, + "step": 11320 + }, + { + "epoch": 0.20486347332186663, + "grad_norm": 41.78125, + "learning_rate": 9.967990136554699e-06, + "loss": 17.158, + "step": 11330 + }, + { + "epoch": 0.2050442883909945, + "grad_norm": 42.21875, + "learning_rate": 9.967961884248041e-06, + "loss": 17.5764, + "step": 11340 + }, + { + "epoch": 0.20522510346012238, + "grad_norm": 39.40625, + "learning_rate": 9.967933631941382e-06, + "loss": 17.6255, + "step": 11350 + }, + { + "epoch": 0.20540591852925022, + "grad_norm": 40.6875, + "learning_rate": 9.96790537963472e-06, + "loss": 17.2257, + "step": 11360 + }, + { + "epoch": 0.2055867335983781, + "grad_norm": 40.625, + "learning_rate": 9.967877127328061e-06, + "loss": 17.3469, + "step": 11370 + }, + { + "epoch": 0.20576754866750596, + "grad_norm": 41.9375, + "learning_rate": 9.967848875021402e-06, + "loss": 17.1934, + "step": 11380 + }, + { + "epoch": 0.2059483637366338, + "grad_norm": 39.625, + "learning_rate": 9.967820622714743e-06, + "loss": 17.1529, + "step": 11390 + }, + { + "epoch": 0.20612917880576168, + "grad_norm": 42.0, + "learning_rate": 9.967792370408083e-06, + "loss": 17.0926, + "step": 11400 + }, + { + "epoch": 0.20630999387488955, + "grad_norm": 41.96875, + "learning_rate": 9.967764118101422e-06, + "loss": 17.0699, + "step": 11410 + }, + { + "epoch": 0.2064908089440174, + "grad_norm": 41.6875, + "learning_rate": 9.967735865794763e-06, + "loss": 17.3271, + "step": 11420 + }, + { + "epoch": 0.20667162401314526, + "grad_norm": 42.78125, + "learning_rate": 9.967707613488105e-06, + "loss": 17.2057, + "step": 11430 + }, + { + "epoch": 0.2068524390822731, + "grad_norm": 38.71875, + "learning_rate": 9.967679361181444e-06, + "loss": 17.5981, + "step": 11440 + }, + { + "epoch": 0.20703325415140097, + "grad_norm": 39.65625, + "learning_rate": 9.967651108874785e-06, + "loss": 16.9635, + "step": 11450 + }, + { + "epoch": 0.20721406922052885, + "grad_norm": 40.375, + "learning_rate": 9.967622856568125e-06, + "loss": 17.6198, + "step": 11460 + }, + { + "epoch": 0.2073948842896567, + "grad_norm": 38.75, + "learning_rate": 9.967594604261466e-06, + "loss": 17.2423, + "step": 11470 + }, + { + "epoch": 0.20757569935878456, + "grad_norm": 43.15625, + "learning_rate": 9.967566351954807e-06, + "loss": 17.2828, + "step": 11480 + }, + { + "epoch": 0.20775651442791243, + "grad_norm": 41.34375, + "learning_rate": 9.967538099648147e-06, + "loss": 17.5125, + "step": 11490 + }, + { + "epoch": 0.20793732949704027, + "grad_norm": 42.625, + "learning_rate": 9.967509847341486e-06, + "loss": 17.0669, + "step": 11500 + }, + { + "epoch": 0.20811814456616815, + "grad_norm": 41.84375, + "learning_rate": 9.967481595034827e-06, + "loss": 17.4618, + "step": 11510 + }, + { + "epoch": 0.20829895963529602, + "grad_norm": 40.5625, + "learning_rate": 9.967453342728169e-06, + "loss": 17.1914, + "step": 11520 + }, + { + "epoch": 0.20847977470442386, + "grad_norm": 39.09375, + "learning_rate": 9.967425090421508e-06, + "loss": 17.3149, + "step": 11530 + }, + { + "epoch": 0.20866058977355173, + "grad_norm": 43.78125, + "learning_rate": 9.967396838114849e-06, + "loss": 17.5775, + "step": 11540 + }, + { + "epoch": 0.2088414048426796, + "grad_norm": 41.28125, + "learning_rate": 9.96736858580819e-06, + "loss": 17.443, + "step": 11550 + }, + { + "epoch": 0.20902221991180744, + "grad_norm": 40.40625, + "learning_rate": 9.96734033350153e-06, + "loss": 17.4273, + "step": 11560 + }, + { + "epoch": 0.20920303498093532, + "grad_norm": 41.75, + "learning_rate": 9.96731208119487e-06, + "loss": 17.0814, + "step": 11570 + }, + { + "epoch": 0.20938385005006319, + "grad_norm": 41.625, + "learning_rate": 9.96728382888821e-06, + "loss": 17.515, + "step": 11580 + }, + { + "epoch": 0.20956466511919103, + "grad_norm": 38.03125, + "learning_rate": 9.96725557658155e-06, + "loss": 17.6116, + "step": 11590 + }, + { + "epoch": 0.2097454801883189, + "grad_norm": 38.9375, + "learning_rate": 9.96722732427489e-06, + "loss": 17.3167, + "step": 11600 + }, + { + "epoch": 0.20992629525744674, + "grad_norm": 40.8125, + "learning_rate": 9.967199071968233e-06, + "loss": 17.2356, + "step": 11610 + }, + { + "epoch": 0.21010711032657461, + "grad_norm": 41.375, + "learning_rate": 9.967170819661572e-06, + "loss": 17.4695, + "step": 11620 + }, + { + "epoch": 0.21028792539570249, + "grad_norm": 39.78125, + "learning_rate": 9.967142567354912e-06, + "loss": 17.5472, + "step": 11630 + }, + { + "epoch": 0.21046874046483033, + "grad_norm": 42.46875, + "learning_rate": 9.967114315048253e-06, + "loss": 16.7093, + "step": 11640 + }, + { + "epoch": 0.2106495555339582, + "grad_norm": 44.0625, + "learning_rate": 9.967086062741594e-06, + "loss": 17.725, + "step": 11650 + }, + { + "epoch": 0.21083037060308607, + "grad_norm": 42.6875, + "learning_rate": 9.967057810434934e-06, + "loss": 17.4257, + "step": 11660 + }, + { + "epoch": 0.2110111856722139, + "grad_norm": 38.25, + "learning_rate": 9.967029558128273e-06, + "loss": 17.2519, + "step": 11670 + }, + { + "epoch": 0.21119200074134178, + "grad_norm": 38.46875, + "learning_rate": 9.967001305821614e-06, + "loss": 17.6406, + "step": 11680 + }, + { + "epoch": 0.21137281581046966, + "grad_norm": 38.28125, + "learning_rate": 9.966973053514956e-06, + "loss": 17.1831, + "step": 11690 + }, + { + "epoch": 0.2115536308795975, + "grad_norm": 42.40625, + "learning_rate": 9.966944801208295e-06, + "loss": 17.2882, + "step": 11700 + }, + { + "epoch": 0.21173444594872537, + "grad_norm": 38.40625, + "learning_rate": 9.966916548901636e-06, + "loss": 17.6014, + "step": 11710 + }, + { + "epoch": 0.21191526101785324, + "grad_norm": 42.71875, + "learning_rate": 9.966888296594976e-06, + "loss": 17.5026, + "step": 11720 + }, + { + "epoch": 0.21209607608698108, + "grad_norm": 41.3125, + "learning_rate": 9.966860044288317e-06, + "loss": 17.4043, + "step": 11730 + }, + { + "epoch": 0.21227689115610895, + "grad_norm": 40.40625, + "learning_rate": 9.966831791981658e-06, + "loss": 16.6975, + "step": 11740 + }, + { + "epoch": 0.2124577062252368, + "grad_norm": 42.96875, + "learning_rate": 9.966803539674997e-06, + "loss": 17.4932, + "step": 11750 + }, + { + "epoch": 0.21263852129436467, + "grad_norm": 41.8125, + "learning_rate": 9.966775287368337e-06, + "loss": 17.5085, + "step": 11760 + }, + { + "epoch": 0.21281933636349254, + "grad_norm": 41.0, + "learning_rate": 9.966747035061678e-06, + "loss": 17.6466, + "step": 11770 + }, + { + "epoch": 0.21300015143262038, + "grad_norm": 41.15625, + "learning_rate": 9.96671878275502e-06, + "loss": 17.0325, + "step": 11780 + }, + { + "epoch": 0.21318096650174825, + "grad_norm": 39.46875, + "learning_rate": 9.966690530448359e-06, + "loss": 17.2715, + "step": 11790 + }, + { + "epoch": 0.21336178157087612, + "grad_norm": 40.65625, + "learning_rate": 9.9666622781417e-06, + "loss": 17.3877, + "step": 11800 + }, + { + "epoch": 0.21354259664000397, + "grad_norm": 41.3125, + "learning_rate": 9.96663402583504e-06, + "loss": 17.3444, + "step": 11810 + }, + { + "epoch": 0.21372341170913184, + "grad_norm": 41.5625, + "learning_rate": 9.966605773528381e-06, + "loss": 17.11, + "step": 11820 + }, + { + "epoch": 0.2139042267782597, + "grad_norm": 38.65625, + "learning_rate": 9.966577521221722e-06, + "loss": 17.4604, + "step": 11830 + }, + { + "epoch": 0.21408504184738755, + "grad_norm": 40.90625, + "learning_rate": 9.96654926891506e-06, + "loss": 17.2247, + "step": 11840 + }, + { + "epoch": 0.21426585691651542, + "grad_norm": 38.875, + "learning_rate": 9.966521016608401e-06, + "loss": 17.1911, + "step": 11850 + }, + { + "epoch": 0.2144466719856433, + "grad_norm": 41.90625, + "learning_rate": 9.966492764301742e-06, + "loss": 17.3012, + "step": 11860 + }, + { + "epoch": 0.21462748705477114, + "grad_norm": 42.0, + "learning_rate": 9.966464511995082e-06, + "loss": 16.8401, + "step": 11870 + }, + { + "epoch": 0.214808302123899, + "grad_norm": 40.8125, + "learning_rate": 9.966436259688423e-06, + "loss": 17.2553, + "step": 11880 + }, + { + "epoch": 0.21498911719302688, + "grad_norm": 40.625, + "learning_rate": 9.966408007381764e-06, + "loss": 17.3049, + "step": 11890 + }, + { + "epoch": 0.21516993226215472, + "grad_norm": 40.875, + "learning_rate": 9.966379755075104e-06, + "loss": 17.5202, + "step": 11900 + }, + { + "epoch": 0.2153507473312826, + "grad_norm": 39.34375, + "learning_rate": 9.966351502768445e-06, + "loss": 17.5647, + "step": 11910 + }, + { + "epoch": 0.21553156240041044, + "grad_norm": 40.71875, + "learning_rate": 9.966323250461785e-06, + "loss": 17.4072, + "step": 11920 + }, + { + "epoch": 0.2157123774695383, + "grad_norm": 41.0625, + "learning_rate": 9.966294998155124e-06, + "loss": 17.3908, + "step": 11930 + }, + { + "epoch": 0.21589319253866618, + "grad_norm": 39.78125, + "learning_rate": 9.966266745848465e-06, + "loss": 17.5294, + "step": 11940 + }, + { + "epoch": 0.21607400760779402, + "grad_norm": 37.75, + "learning_rate": 9.966238493541806e-06, + "loss": 17.1815, + "step": 11950 + }, + { + "epoch": 0.2162548226769219, + "grad_norm": 37.875, + "learning_rate": 9.966210241235146e-06, + "loss": 17.3322, + "step": 11960 + }, + { + "epoch": 0.21643563774604976, + "grad_norm": 36.75, + "learning_rate": 9.966181988928487e-06, + "loss": 16.9328, + "step": 11970 + }, + { + "epoch": 0.2166164528151776, + "grad_norm": 40.5625, + "learning_rate": 9.966153736621828e-06, + "loss": 16.6897, + "step": 11980 + }, + { + "epoch": 0.21679726788430548, + "grad_norm": 39.78125, + "learning_rate": 9.966125484315168e-06, + "loss": 17.0212, + "step": 11990 + }, + { + "epoch": 0.21697808295343335, + "grad_norm": 41.71875, + "learning_rate": 9.966097232008509e-06, + "loss": 17.2172, + "step": 12000 + }, + { + "epoch": 0.2171588980225612, + "grad_norm": 42.125, + "learning_rate": 9.966068979701848e-06, + "loss": 16.9982, + "step": 12010 + }, + { + "epoch": 0.21733971309168906, + "grad_norm": 42.125, + "learning_rate": 9.966040727395188e-06, + "loss": 17.1451, + "step": 12020 + }, + { + "epoch": 0.21752052816081693, + "grad_norm": 39.96875, + "learning_rate": 9.966012475088529e-06, + "loss": 17.435, + "step": 12030 + }, + { + "epoch": 0.21770134322994478, + "grad_norm": 38.625, + "learning_rate": 9.96598422278187e-06, + "loss": 17.0784, + "step": 12040 + }, + { + "epoch": 0.21788215829907265, + "grad_norm": 39.0, + "learning_rate": 9.96595597047521e-06, + "loss": 17.4566, + "step": 12050 + }, + { + "epoch": 0.21806297336820052, + "grad_norm": 40.09375, + "learning_rate": 9.96592771816855e-06, + "loss": 17.1917, + "step": 12060 + }, + { + "epoch": 0.21824378843732836, + "grad_norm": 41.21875, + "learning_rate": 9.965899465861891e-06, + "loss": 17.5761, + "step": 12070 + }, + { + "epoch": 0.21842460350645623, + "grad_norm": 41.59375, + "learning_rate": 9.965871213555232e-06, + "loss": 17.2526, + "step": 12080 + }, + { + "epoch": 0.21860541857558408, + "grad_norm": 40.34375, + "learning_rate": 9.965842961248573e-06, + "loss": 17.4308, + "step": 12090 + }, + { + "epoch": 0.21878623364471195, + "grad_norm": 42.28125, + "learning_rate": 9.965814708941912e-06, + "loss": 17.687, + "step": 12100 + }, + { + "epoch": 0.21896704871383982, + "grad_norm": 40.78125, + "learning_rate": 9.965786456635252e-06, + "loss": 17.2235, + "step": 12110 + }, + { + "epoch": 0.21914786378296766, + "grad_norm": 40.78125, + "learning_rate": 9.965758204328593e-06, + "loss": 17.2053, + "step": 12120 + }, + { + "epoch": 0.21932867885209553, + "grad_norm": 41.46875, + "learning_rate": 9.965729952021933e-06, + "loss": 16.9105, + "step": 12130 + }, + { + "epoch": 0.2195094939212234, + "grad_norm": 39.875, + "learning_rate": 9.965701699715274e-06, + "loss": 17.7573, + "step": 12140 + }, + { + "epoch": 0.21969030899035125, + "grad_norm": 44.09375, + "learning_rate": 9.965673447408615e-06, + "loss": 18.0731, + "step": 12150 + }, + { + "epoch": 0.21987112405947912, + "grad_norm": 41.03125, + "learning_rate": 9.965645195101955e-06, + "loss": 16.8835, + "step": 12160 + }, + { + "epoch": 0.220051939128607, + "grad_norm": 41.0625, + "learning_rate": 9.965616942795296e-06, + "loss": 17.0859, + "step": 12170 + }, + { + "epoch": 0.22023275419773483, + "grad_norm": 40.25, + "learning_rate": 9.965588690488635e-06, + "loss": 17.2099, + "step": 12180 + }, + { + "epoch": 0.2204135692668627, + "grad_norm": 43.40625, + "learning_rate": 9.965560438181975e-06, + "loss": 17.5276, + "step": 12190 + }, + { + "epoch": 0.22059438433599057, + "grad_norm": 40.1875, + "learning_rate": 9.965532185875316e-06, + "loss": 17.3187, + "step": 12200 + }, + { + "epoch": 0.22077519940511842, + "grad_norm": 42.46875, + "learning_rate": 9.965503933568657e-06, + "loss": 17.2944, + "step": 12210 + }, + { + "epoch": 0.2209560144742463, + "grad_norm": 42.9375, + "learning_rate": 9.965475681261997e-06, + "loss": 17.1889, + "step": 12220 + }, + { + "epoch": 0.22113682954337416, + "grad_norm": 40.4375, + "learning_rate": 9.965447428955338e-06, + "loss": 16.8959, + "step": 12230 + }, + { + "epoch": 0.221317644612502, + "grad_norm": 45.375, + "learning_rate": 9.965419176648679e-06, + "loss": 17.3403, + "step": 12240 + }, + { + "epoch": 0.22149845968162987, + "grad_norm": 37.5625, + "learning_rate": 9.96539092434202e-06, + "loss": 17.3305, + "step": 12250 + }, + { + "epoch": 0.22167927475075772, + "grad_norm": 45.03125, + "learning_rate": 9.96536267203536e-06, + "loss": 17.6105, + "step": 12260 + }, + { + "epoch": 0.2218600898198856, + "grad_norm": 41.875, + "learning_rate": 9.965334419728699e-06, + "loss": 17.2321, + "step": 12270 + }, + { + "epoch": 0.22204090488901346, + "grad_norm": 39.4375, + "learning_rate": 9.96530616742204e-06, + "loss": 17.4887, + "step": 12280 + }, + { + "epoch": 0.2222217199581413, + "grad_norm": 39.78125, + "learning_rate": 9.96527791511538e-06, + "loss": 17.6234, + "step": 12290 + }, + { + "epoch": 0.22240253502726917, + "grad_norm": 40.28125, + "learning_rate": 9.96524966280872e-06, + "loss": 17.3018, + "step": 12300 + }, + { + "epoch": 0.22258335009639704, + "grad_norm": 38.28125, + "learning_rate": 9.965221410502061e-06, + "loss": 17.2957, + "step": 12310 + }, + { + "epoch": 0.22276416516552489, + "grad_norm": 41.625, + "learning_rate": 9.965193158195402e-06, + "loss": 17.1843, + "step": 12320 + }, + { + "epoch": 0.22294498023465276, + "grad_norm": 38.96875, + "learning_rate": 9.965164905888743e-06, + "loss": 17.5083, + "step": 12330 + }, + { + "epoch": 0.22312579530378063, + "grad_norm": 40.21875, + "learning_rate": 9.965136653582083e-06, + "loss": 17.018, + "step": 12340 + }, + { + "epoch": 0.22330661037290847, + "grad_norm": 41.8125, + "learning_rate": 9.965108401275422e-06, + "loss": 17.7748, + "step": 12350 + }, + { + "epoch": 0.22348742544203634, + "grad_norm": 41.46875, + "learning_rate": 9.965080148968763e-06, + "loss": 17.2274, + "step": 12360 + }, + { + "epoch": 0.2236682405111642, + "grad_norm": 39.5625, + "learning_rate": 9.965051896662103e-06, + "loss": 17.2712, + "step": 12370 + }, + { + "epoch": 0.22384905558029206, + "grad_norm": 41.15625, + "learning_rate": 9.965023644355444e-06, + "loss": 17.5226, + "step": 12380 + }, + { + "epoch": 0.22402987064941993, + "grad_norm": 39.09375, + "learning_rate": 9.964995392048785e-06, + "loss": 17.2146, + "step": 12390 + }, + { + "epoch": 0.2242106857185478, + "grad_norm": 41.59375, + "learning_rate": 9.964967139742125e-06, + "loss": 17.4363, + "step": 12400 + }, + { + "epoch": 0.22439150078767564, + "grad_norm": 38.40625, + "learning_rate": 9.964938887435466e-06, + "loss": 17.2412, + "step": 12410 + }, + { + "epoch": 0.2245723158568035, + "grad_norm": 37.0, + "learning_rate": 9.964910635128806e-06, + "loss": 17.4035, + "step": 12420 + }, + { + "epoch": 0.22475313092593135, + "grad_norm": 41.03125, + "learning_rate": 9.964882382822147e-06, + "loss": 17.4648, + "step": 12430 + }, + { + "epoch": 0.22493394599505923, + "grad_norm": 39.84375, + "learning_rate": 9.964854130515486e-06, + "loss": 17.2871, + "step": 12440 + }, + { + "epoch": 0.2251147610641871, + "grad_norm": 41.40625, + "learning_rate": 9.964825878208827e-06, + "loss": 17.3342, + "step": 12450 + }, + { + "epoch": 0.22529557613331494, + "grad_norm": 40.78125, + "learning_rate": 9.964797625902167e-06, + "loss": 17.5347, + "step": 12460 + }, + { + "epoch": 0.2254763912024428, + "grad_norm": 37.28125, + "learning_rate": 9.964769373595508e-06, + "loss": 17.2245, + "step": 12470 + }, + { + "epoch": 0.22565720627157068, + "grad_norm": 36.9375, + "learning_rate": 9.964741121288848e-06, + "loss": 16.9847, + "step": 12480 + }, + { + "epoch": 0.22583802134069852, + "grad_norm": 40.65625, + "learning_rate": 9.964712868982189e-06, + "loss": 16.98, + "step": 12490 + }, + { + "epoch": 0.2260188364098264, + "grad_norm": 38.8125, + "learning_rate": 9.96468461667553e-06, + "loss": 17.0928, + "step": 12500 + }, + { + "epoch": 0.22619965147895427, + "grad_norm": 37.59375, + "learning_rate": 9.96465636436887e-06, + "loss": 17.3024, + "step": 12510 + }, + { + "epoch": 0.2263804665480821, + "grad_norm": 41.75, + "learning_rate": 9.964628112062211e-06, + "loss": 17.6225, + "step": 12520 + }, + { + "epoch": 0.22656128161720998, + "grad_norm": 41.625, + "learning_rate": 9.96459985975555e-06, + "loss": 17.4294, + "step": 12530 + }, + { + "epoch": 0.22674209668633785, + "grad_norm": 40.78125, + "learning_rate": 9.96457160744889e-06, + "loss": 17.0309, + "step": 12540 + }, + { + "epoch": 0.2269229117554657, + "grad_norm": 40.25, + "learning_rate": 9.964543355142231e-06, + "loss": 16.9067, + "step": 12550 + }, + { + "epoch": 0.22710372682459357, + "grad_norm": 41.65625, + "learning_rate": 9.964515102835572e-06, + "loss": 17.5133, + "step": 12560 + }, + { + "epoch": 0.2272845418937214, + "grad_norm": 40.71875, + "learning_rate": 9.964486850528912e-06, + "loss": 17.1679, + "step": 12570 + }, + { + "epoch": 0.22746535696284928, + "grad_norm": 40.5, + "learning_rate": 9.964458598222253e-06, + "loss": 17.345, + "step": 12580 + }, + { + "epoch": 0.22764617203197715, + "grad_norm": 41.53125, + "learning_rate": 9.964430345915594e-06, + "loss": 17.2944, + "step": 12590 + }, + { + "epoch": 0.227826987101105, + "grad_norm": 42.375, + "learning_rate": 9.964402093608934e-06, + "loss": 16.8557, + "step": 12600 + }, + { + "epoch": 0.22800780217023287, + "grad_norm": 40.25, + "learning_rate": 9.964373841302273e-06, + "loss": 17.3471, + "step": 12610 + }, + { + "epoch": 0.22818861723936074, + "grad_norm": 42.28125, + "learning_rate": 9.964345588995614e-06, + "loss": 16.9808, + "step": 12620 + }, + { + "epoch": 0.22836943230848858, + "grad_norm": 39.78125, + "learning_rate": 9.964317336688954e-06, + "loss": 17.1642, + "step": 12630 + }, + { + "epoch": 0.22855024737761645, + "grad_norm": 40.84375, + "learning_rate": 9.964289084382295e-06, + "loss": 17.1546, + "step": 12640 + }, + { + "epoch": 0.22873106244674432, + "grad_norm": 40.53125, + "learning_rate": 9.964260832075636e-06, + "loss": 17.4697, + "step": 12650 + }, + { + "epoch": 0.22891187751587216, + "grad_norm": 41.40625, + "learning_rate": 9.964232579768976e-06, + "loss": 17.0869, + "step": 12660 + }, + { + "epoch": 0.22909269258500004, + "grad_norm": 38.15625, + "learning_rate": 9.964204327462317e-06, + "loss": 17.1933, + "step": 12670 + }, + { + "epoch": 0.2292735076541279, + "grad_norm": 41.71875, + "learning_rate": 9.964176075155658e-06, + "loss": 17.1562, + "step": 12680 + }, + { + "epoch": 0.22945432272325575, + "grad_norm": 43.15625, + "learning_rate": 9.964147822848998e-06, + "loss": 17.2442, + "step": 12690 + }, + { + "epoch": 0.22963513779238362, + "grad_norm": 42.46875, + "learning_rate": 9.964119570542337e-06, + "loss": 17.0829, + "step": 12700 + }, + { + "epoch": 0.2298159528615115, + "grad_norm": 39.75, + "learning_rate": 9.964091318235678e-06, + "loss": 17.4495, + "step": 12710 + }, + { + "epoch": 0.22999676793063933, + "grad_norm": 41.8125, + "learning_rate": 9.964063065929018e-06, + "loss": 17.1042, + "step": 12720 + }, + { + "epoch": 0.2301775829997672, + "grad_norm": 41.28125, + "learning_rate": 9.964034813622359e-06, + "loss": 17.2907, + "step": 12730 + }, + { + "epoch": 0.23035839806889505, + "grad_norm": 39.84375, + "learning_rate": 9.9640065613157e-06, + "loss": 17.3119, + "step": 12740 + }, + { + "epoch": 0.23053921313802292, + "grad_norm": 42.0625, + "learning_rate": 9.96397830900904e-06, + "loss": 17.4088, + "step": 12750 + }, + { + "epoch": 0.2307200282071508, + "grad_norm": 39.65625, + "learning_rate": 9.96395005670238e-06, + "loss": 17.0552, + "step": 12760 + }, + { + "epoch": 0.23090084327627863, + "grad_norm": 37.9375, + "learning_rate": 9.963921804395721e-06, + "loss": 17.3551, + "step": 12770 + }, + { + "epoch": 0.2310816583454065, + "grad_norm": 39.03125, + "learning_rate": 9.96389355208906e-06, + "loss": 17.0639, + "step": 12780 + }, + { + "epoch": 0.23126247341453438, + "grad_norm": 37.71875, + "learning_rate": 9.963865299782401e-06, + "loss": 17.2184, + "step": 12790 + }, + { + "epoch": 0.23144328848366222, + "grad_norm": 41.59375, + "learning_rate": 9.963837047475742e-06, + "loss": 17.7574, + "step": 12800 + }, + { + "epoch": 0.2316241035527901, + "grad_norm": 37.375, + "learning_rate": 9.963808795169082e-06, + "loss": 17.421, + "step": 12810 + }, + { + "epoch": 0.23180491862191796, + "grad_norm": 39.0625, + "learning_rate": 9.963780542862423e-06, + "loss": 17.1152, + "step": 12820 + }, + { + "epoch": 0.2319857336910458, + "grad_norm": 42.28125, + "learning_rate": 9.963752290555763e-06, + "loss": 17.2219, + "step": 12830 + }, + { + "epoch": 0.23216654876017367, + "grad_norm": 39.96875, + "learning_rate": 9.963724038249104e-06, + "loss": 17.1605, + "step": 12840 + }, + { + "epoch": 0.23234736382930155, + "grad_norm": 40.96875, + "learning_rate": 9.963695785942445e-06, + "loss": 17.1662, + "step": 12850 + }, + { + "epoch": 0.2325281788984294, + "grad_norm": 39.84375, + "learning_rate": 9.963667533635785e-06, + "loss": 17.369, + "step": 12860 + }, + { + "epoch": 0.23270899396755726, + "grad_norm": 43.375, + "learning_rate": 9.963639281329124e-06, + "loss": 17.798, + "step": 12870 + }, + { + "epoch": 0.23288980903668513, + "grad_norm": 41.5625, + "learning_rate": 9.963611029022465e-06, + "loss": 17.1347, + "step": 12880 + }, + { + "epoch": 0.23307062410581297, + "grad_norm": 38.8125, + "learning_rate": 9.963582776715805e-06, + "loss": 17.398, + "step": 12890 + }, + { + "epoch": 0.23325143917494084, + "grad_norm": 38.4375, + "learning_rate": 9.963554524409146e-06, + "loss": 17.3474, + "step": 12900 + }, + { + "epoch": 0.2334322542440687, + "grad_norm": 39.59375, + "learning_rate": 9.963526272102487e-06, + "loss": 17.1073, + "step": 12910 + }, + { + "epoch": 0.23361306931319656, + "grad_norm": 41.625, + "learning_rate": 9.963498019795827e-06, + "loss": 17.2598, + "step": 12920 + }, + { + "epoch": 0.23379388438232443, + "grad_norm": 43.96875, + "learning_rate": 9.963469767489168e-06, + "loss": 17.114, + "step": 12930 + }, + { + "epoch": 0.23397469945145227, + "grad_norm": 39.46875, + "learning_rate": 9.963441515182509e-06, + "loss": 17.214, + "step": 12940 + }, + { + "epoch": 0.23415551452058014, + "grad_norm": 39.1875, + "learning_rate": 9.96341326287585e-06, + "loss": 17.0908, + "step": 12950 + }, + { + "epoch": 0.23433632958970801, + "grad_norm": 39.21875, + "learning_rate": 9.963385010569188e-06, + "loss": 16.9238, + "step": 12960 + }, + { + "epoch": 0.23451714465883586, + "grad_norm": 42.71875, + "learning_rate": 9.963356758262529e-06, + "loss": 17.0022, + "step": 12970 + }, + { + "epoch": 0.23469795972796373, + "grad_norm": 38.84375, + "learning_rate": 9.96332850595587e-06, + "loss": 17.0961, + "step": 12980 + }, + { + "epoch": 0.2348787747970916, + "grad_norm": 38.53125, + "learning_rate": 9.96330025364921e-06, + "loss": 17.5029, + "step": 12990 + }, + { + "epoch": 0.23505958986621944, + "grad_norm": 38.3125, + "learning_rate": 9.96327200134255e-06, + "loss": 17.3472, + "step": 13000 + }, + { + "epoch": 0.2352404049353473, + "grad_norm": 41.0625, + "learning_rate": 9.963243749035891e-06, + "loss": 17.238, + "step": 13010 + }, + { + "epoch": 0.23542122000447518, + "grad_norm": 39.21875, + "learning_rate": 9.963215496729232e-06, + "loss": 17.2826, + "step": 13020 + }, + { + "epoch": 0.23560203507360303, + "grad_norm": 40.15625, + "learning_rate": 9.963187244422573e-06, + "loss": 16.9839, + "step": 13030 + }, + { + "epoch": 0.2357828501427309, + "grad_norm": 41.28125, + "learning_rate": 9.963158992115911e-06, + "loss": 17.0687, + "step": 13040 + }, + { + "epoch": 0.23596366521185877, + "grad_norm": 40.6875, + "learning_rate": 9.963130739809252e-06, + "loss": 17.3794, + "step": 13050 + }, + { + "epoch": 0.2361444802809866, + "grad_norm": 40.53125, + "learning_rate": 9.963102487502593e-06, + "loss": 17.0712, + "step": 13060 + }, + { + "epoch": 0.23632529535011448, + "grad_norm": 40.3125, + "learning_rate": 9.963074235195933e-06, + "loss": 16.9269, + "step": 13070 + }, + { + "epoch": 0.23650611041924233, + "grad_norm": 39.6875, + "learning_rate": 9.963045982889274e-06, + "loss": 17.0576, + "step": 13080 + }, + { + "epoch": 0.2366869254883702, + "grad_norm": 40.4375, + "learning_rate": 9.963017730582613e-06, + "loss": 16.9084, + "step": 13090 + }, + { + "epoch": 0.23686774055749807, + "grad_norm": 38.71875, + "learning_rate": 9.962989478275955e-06, + "loss": 17.7958, + "step": 13100 + }, + { + "epoch": 0.2370485556266259, + "grad_norm": 40.125, + "learning_rate": 9.962961225969296e-06, + "loss": 17.2817, + "step": 13110 + }, + { + "epoch": 0.23722937069575378, + "grad_norm": 41.40625, + "learning_rate": 9.962932973662636e-06, + "loss": 17.777, + "step": 13120 + }, + { + "epoch": 0.23741018576488165, + "grad_norm": 39.3125, + "learning_rate": 9.962904721355975e-06, + "loss": 17.2679, + "step": 13130 + }, + { + "epoch": 0.2375910008340095, + "grad_norm": 42.375, + "learning_rate": 9.962876469049316e-06, + "loss": 17.0529, + "step": 13140 + }, + { + "epoch": 0.23777181590313737, + "grad_norm": 41.84375, + "learning_rate": 9.962848216742657e-06, + "loss": 17.0093, + "step": 13150 + }, + { + "epoch": 0.23795263097226524, + "grad_norm": 41.46875, + "learning_rate": 9.962819964435997e-06, + "loss": 17.5174, + "step": 13160 + }, + { + "epoch": 0.23813344604139308, + "grad_norm": 42.46875, + "learning_rate": 9.962791712129338e-06, + "loss": 17.5967, + "step": 13170 + }, + { + "epoch": 0.23831426111052095, + "grad_norm": 38.90625, + "learning_rate": 9.962763459822678e-06, + "loss": 17.1299, + "step": 13180 + }, + { + "epoch": 0.23849507617964882, + "grad_norm": 42.21875, + "learning_rate": 9.962735207516019e-06, + "loss": 17.0755, + "step": 13190 + }, + { + "epoch": 0.23867589124877667, + "grad_norm": 39.375, + "learning_rate": 9.96270695520936e-06, + "loss": 17.0905, + "step": 13200 + }, + { + "epoch": 0.23885670631790454, + "grad_norm": 41.25, + "learning_rate": 9.962678702902699e-06, + "loss": 17.5971, + "step": 13210 + }, + { + "epoch": 0.2390375213870324, + "grad_norm": 41.28125, + "learning_rate": 9.96265045059604e-06, + "loss": 17.156, + "step": 13220 + }, + { + "epoch": 0.23921833645616025, + "grad_norm": 38.21875, + "learning_rate": 9.96262219828938e-06, + "loss": 17.2076, + "step": 13230 + }, + { + "epoch": 0.23939915152528812, + "grad_norm": 40.28125, + "learning_rate": 9.96259394598272e-06, + "loss": 17.1039, + "step": 13240 + }, + { + "epoch": 0.23957996659441597, + "grad_norm": 41.21875, + "learning_rate": 9.962565693676061e-06, + "loss": 17.3425, + "step": 13250 + }, + { + "epoch": 0.23976078166354384, + "grad_norm": 40.09375, + "learning_rate": 9.962537441369402e-06, + "loss": 17.5018, + "step": 13260 + }, + { + "epoch": 0.2399415967326717, + "grad_norm": 41.03125, + "learning_rate": 9.962509189062742e-06, + "loss": 17.3076, + "step": 13270 + }, + { + "epoch": 0.24012241180179955, + "grad_norm": 38.34375, + "learning_rate": 9.962480936756083e-06, + "loss": 17.1876, + "step": 13280 + }, + { + "epoch": 0.24030322687092742, + "grad_norm": 36.90625, + "learning_rate": 9.962452684449424e-06, + "loss": 16.7979, + "step": 13290 + }, + { + "epoch": 0.2404840419400553, + "grad_norm": 40.4375, + "learning_rate": 9.962424432142763e-06, + "loss": 17.3982, + "step": 13300 + }, + { + "epoch": 0.24066485700918314, + "grad_norm": 41.53125, + "learning_rate": 9.962396179836103e-06, + "loss": 17.331, + "step": 13310 + }, + { + "epoch": 0.240845672078311, + "grad_norm": 39.5, + "learning_rate": 9.962367927529444e-06, + "loss": 16.9671, + "step": 13320 + }, + { + "epoch": 0.24102648714743888, + "grad_norm": 39.0, + "learning_rate": 9.962339675222784e-06, + "loss": 17.5099, + "step": 13330 + }, + { + "epoch": 0.24120730221656672, + "grad_norm": 41.0, + "learning_rate": 9.962311422916125e-06, + "loss": 17.1498, + "step": 13340 + }, + { + "epoch": 0.2413881172856946, + "grad_norm": 39.3125, + "learning_rate": 9.962283170609464e-06, + "loss": 17.5852, + "step": 13350 + }, + { + "epoch": 0.24156893235482246, + "grad_norm": 43.8125, + "learning_rate": 9.962254918302806e-06, + "loss": 17.0536, + "step": 13360 + }, + { + "epoch": 0.2417497474239503, + "grad_norm": 40.875, + "learning_rate": 9.962226665996147e-06, + "loss": 17.2196, + "step": 13370 + }, + { + "epoch": 0.24193056249307818, + "grad_norm": 42.96875, + "learning_rate": 9.962198413689488e-06, + "loss": 17.5202, + "step": 13380 + }, + { + "epoch": 0.24211137756220602, + "grad_norm": 41.625, + "learning_rate": 9.962170161382826e-06, + "loss": 17.5732, + "step": 13390 + }, + { + "epoch": 0.2422921926313339, + "grad_norm": 40.59375, + "learning_rate": 9.962141909076167e-06, + "loss": 17.365, + "step": 13400 + }, + { + "epoch": 0.24247300770046176, + "grad_norm": 41.09375, + "learning_rate": 9.962113656769508e-06, + "loss": 17.4699, + "step": 13410 + }, + { + "epoch": 0.2426538227695896, + "grad_norm": 37.46875, + "learning_rate": 9.962085404462848e-06, + "loss": 17.4083, + "step": 13420 + }, + { + "epoch": 0.24283463783871748, + "grad_norm": 39.0625, + "learning_rate": 9.962057152156189e-06, + "loss": 16.9631, + "step": 13430 + }, + { + "epoch": 0.24301545290784535, + "grad_norm": 40.90625, + "learning_rate": 9.962028899849528e-06, + "loss": 17.1987, + "step": 13440 + }, + { + "epoch": 0.2431962679769732, + "grad_norm": 42.125, + "learning_rate": 9.96200064754287e-06, + "loss": 17.1089, + "step": 13450 + }, + { + "epoch": 0.24337708304610106, + "grad_norm": 39.875, + "learning_rate": 9.96197239523621e-06, + "loss": 17.1529, + "step": 13460 + }, + { + "epoch": 0.24355789811522893, + "grad_norm": 41.8125, + "learning_rate": 9.96194414292955e-06, + "loss": 17.3545, + "step": 13470 + }, + { + "epoch": 0.24373871318435678, + "grad_norm": 41.28125, + "learning_rate": 9.96191589062289e-06, + "loss": 17.4552, + "step": 13480 + }, + { + "epoch": 0.24391952825348465, + "grad_norm": 40.46875, + "learning_rate": 9.961887638316231e-06, + "loss": 17.1754, + "step": 13490 + }, + { + "epoch": 0.24410034332261252, + "grad_norm": 37.53125, + "learning_rate": 9.961859386009572e-06, + "loss": 16.8383, + "step": 13500 + }, + { + "epoch": 0.24428115839174036, + "grad_norm": 41.90625, + "learning_rate": 9.961831133702912e-06, + "loss": 17.8071, + "step": 13510 + }, + { + "epoch": 0.24446197346086823, + "grad_norm": 38.78125, + "learning_rate": 9.961802881396251e-06, + "loss": 17.6229, + "step": 13520 + }, + { + "epoch": 0.2446427885299961, + "grad_norm": 40.34375, + "learning_rate": 9.961774629089593e-06, + "loss": 17.1821, + "step": 13530 + }, + { + "epoch": 0.24482360359912395, + "grad_norm": 40.53125, + "learning_rate": 9.961746376782934e-06, + "loss": 17.358, + "step": 13540 + }, + { + "epoch": 0.24500441866825182, + "grad_norm": 39.09375, + "learning_rate": 9.961718124476275e-06, + "loss": 17.3703, + "step": 13550 + }, + { + "epoch": 0.24518523373737966, + "grad_norm": 40.0625, + "learning_rate": 9.961689872169614e-06, + "loss": 17.2847, + "step": 13560 + }, + { + "epoch": 0.24536604880650753, + "grad_norm": 41.28125, + "learning_rate": 9.961661619862954e-06, + "loss": 16.8204, + "step": 13570 + }, + { + "epoch": 0.2455468638756354, + "grad_norm": 40.96875, + "learning_rate": 9.961633367556295e-06, + "loss": 17.361, + "step": 13580 + }, + { + "epoch": 0.24572767894476324, + "grad_norm": 42.96875, + "learning_rate": 9.961605115249636e-06, + "loss": 17.4287, + "step": 13590 + }, + { + "epoch": 0.24590849401389112, + "grad_norm": 41.21875, + "learning_rate": 9.961576862942976e-06, + "loss": 17.5482, + "step": 13600 + }, + { + "epoch": 0.246089309083019, + "grad_norm": 39.65625, + "learning_rate": 9.961548610636315e-06, + "loss": 17.2304, + "step": 13610 + }, + { + "epoch": 0.24627012415214683, + "grad_norm": 38.84375, + "learning_rate": 9.961520358329657e-06, + "loss": 17.3173, + "step": 13620 + }, + { + "epoch": 0.2464509392212747, + "grad_norm": 40.59375, + "learning_rate": 9.961492106022998e-06, + "loss": 17.6542, + "step": 13630 + }, + { + "epoch": 0.24663175429040257, + "grad_norm": 40.3125, + "learning_rate": 9.961463853716337e-06, + "loss": 17.1626, + "step": 13640 + }, + { + "epoch": 0.24681256935953041, + "grad_norm": 40.5, + "learning_rate": 9.961435601409678e-06, + "loss": 17.2235, + "step": 13650 + }, + { + "epoch": 0.24699338442865829, + "grad_norm": 38.9375, + "learning_rate": 9.961407349103018e-06, + "loss": 17.0925, + "step": 13660 + }, + { + "epoch": 0.24717419949778616, + "grad_norm": 38.71875, + "learning_rate": 9.961379096796359e-06, + "loss": 16.9831, + "step": 13670 + }, + { + "epoch": 0.247355014566914, + "grad_norm": 36.78125, + "learning_rate": 9.9613508444897e-06, + "loss": 16.8113, + "step": 13680 + }, + { + "epoch": 0.24753582963604187, + "grad_norm": 42.65625, + "learning_rate": 9.96132259218304e-06, + "loss": 17.2586, + "step": 13690 + }, + { + "epoch": 0.24771664470516974, + "grad_norm": 39.625, + "learning_rate": 9.961294339876379e-06, + "loss": 17.7591, + "step": 13700 + }, + { + "epoch": 0.24789745977429758, + "grad_norm": 38.84375, + "learning_rate": 9.961266087569721e-06, + "loss": 17.3166, + "step": 13710 + }, + { + "epoch": 0.24807827484342546, + "grad_norm": 39.375, + "learning_rate": 9.961237835263062e-06, + "loss": 17.3201, + "step": 13720 + }, + { + "epoch": 0.2482590899125533, + "grad_norm": 39.5625, + "learning_rate": 9.9612095829564e-06, + "loss": 16.998, + "step": 13730 + }, + { + "epoch": 0.24843990498168117, + "grad_norm": 40.34375, + "learning_rate": 9.961181330649741e-06, + "loss": 17.3526, + "step": 13740 + }, + { + "epoch": 0.24862072005080904, + "grad_norm": 40.59375, + "learning_rate": 9.961153078343082e-06, + "loss": 17.0846, + "step": 13750 + }, + { + "epoch": 0.24880153511993688, + "grad_norm": 40.59375, + "learning_rate": 9.961124826036423e-06, + "loss": 17.3245, + "step": 13760 + }, + { + "epoch": 0.24898235018906476, + "grad_norm": 39.09375, + "learning_rate": 9.961096573729763e-06, + "loss": 17.525, + "step": 13770 + }, + { + "epoch": 0.24916316525819263, + "grad_norm": 40.46875, + "learning_rate": 9.961068321423102e-06, + "loss": 16.7753, + "step": 13780 + }, + { + "epoch": 0.24934398032732047, + "grad_norm": 41.65625, + "learning_rate": 9.961040069116443e-06, + "loss": 17.5064, + "step": 13790 + }, + { + "epoch": 0.24952479539644834, + "grad_norm": 41.0, + "learning_rate": 9.961011816809785e-06, + "loss": 16.8966, + "step": 13800 + }, + { + "epoch": 0.2497056104655762, + "grad_norm": 42.15625, + "learning_rate": 9.960983564503126e-06, + "loss": 17.5192, + "step": 13810 + }, + { + "epoch": 0.24988642553470405, + "grad_norm": 41.875, + "learning_rate": 9.960955312196465e-06, + "loss": 17.1112, + "step": 13820 + }, + { + "epoch": 0.2500672406038319, + "grad_norm": 40.8125, + "learning_rate": 9.960927059889805e-06, + "loss": 17.2937, + "step": 13830 + }, + { + "epoch": 0.25024805567295977, + "grad_norm": 42.8125, + "learning_rate": 9.960898807583146e-06, + "loss": 17.4666, + "step": 13840 + }, + { + "epoch": 0.25042887074208764, + "grad_norm": 38.875, + "learning_rate": 9.960870555276487e-06, + "loss": 17.4754, + "step": 13850 + }, + { + "epoch": 0.2506096858112155, + "grad_norm": 43.8125, + "learning_rate": 9.960842302969827e-06, + "loss": 17.234, + "step": 13860 + }, + { + "epoch": 0.2507905008803434, + "grad_norm": 40.65625, + "learning_rate": 9.960814050663166e-06, + "loss": 17.5303, + "step": 13870 + }, + { + "epoch": 0.25097131594947125, + "grad_norm": 41.71875, + "learning_rate": 9.960785798356508e-06, + "loss": 17.6465, + "step": 13880 + }, + { + "epoch": 0.25115213101859907, + "grad_norm": 39.15625, + "learning_rate": 9.960757546049849e-06, + "loss": 17.1546, + "step": 13890 + }, + { + "epoch": 0.25133294608772694, + "grad_norm": 41.28125, + "learning_rate": 9.960729293743188e-06, + "loss": 17.2995, + "step": 13900 + }, + { + "epoch": 0.2515137611568548, + "grad_norm": 41.8125, + "learning_rate": 9.960701041436529e-06, + "loss": 17.6509, + "step": 13910 + }, + { + "epoch": 0.2516945762259827, + "grad_norm": 39.90625, + "learning_rate": 9.96067278912987e-06, + "loss": 17.4684, + "step": 13920 + }, + { + "epoch": 0.25187539129511055, + "grad_norm": 41.96875, + "learning_rate": 9.96064453682321e-06, + "loss": 17.4557, + "step": 13930 + }, + { + "epoch": 0.2520562063642384, + "grad_norm": 40.84375, + "learning_rate": 9.96061628451655e-06, + "loss": 17.2042, + "step": 13940 + }, + { + "epoch": 0.25223702143336624, + "grad_norm": 42.34375, + "learning_rate": 9.96058803220989e-06, + "loss": 17.3159, + "step": 13950 + }, + { + "epoch": 0.2524178365024941, + "grad_norm": 40.65625, + "learning_rate": 9.96055977990323e-06, + "loss": 17.0903, + "step": 13960 + }, + { + "epoch": 0.252598651571622, + "grad_norm": 38.28125, + "learning_rate": 9.960531527596572e-06, + "loss": 17.1521, + "step": 13970 + }, + { + "epoch": 0.25277946664074985, + "grad_norm": 40.875, + "learning_rate": 9.960503275289913e-06, + "loss": 16.9971, + "step": 13980 + }, + { + "epoch": 0.2529602817098777, + "grad_norm": 41.15625, + "learning_rate": 9.960475022983252e-06, + "loss": 17.4463, + "step": 13990 + }, + { + "epoch": 0.25314109677900554, + "grad_norm": 41.46875, + "learning_rate": 9.960446770676593e-06, + "loss": 17.1802, + "step": 14000 + }, + { + "epoch": 0.2533219118481334, + "grad_norm": 41.75, + "learning_rate": 9.960418518369933e-06, + "loss": 17.2515, + "step": 14010 + }, + { + "epoch": 0.2535027269172613, + "grad_norm": 40.5625, + "learning_rate": 9.960390266063274e-06, + "loss": 17.5549, + "step": 14020 + }, + { + "epoch": 0.25368354198638915, + "grad_norm": 39.8125, + "learning_rate": 9.960362013756614e-06, + "loss": 17.5882, + "step": 14030 + }, + { + "epoch": 0.253864357055517, + "grad_norm": 40.65625, + "learning_rate": 9.960333761449953e-06, + "loss": 17.462, + "step": 14040 + }, + { + "epoch": 0.2540451721246449, + "grad_norm": 39.78125, + "learning_rate": 9.960305509143294e-06, + "loss": 17.2972, + "step": 14050 + }, + { + "epoch": 0.2542259871937727, + "grad_norm": 41.875, + "learning_rate": 9.960277256836636e-06, + "loss": 17.5263, + "step": 14060 + }, + { + "epoch": 0.2544068022629006, + "grad_norm": 40.3125, + "learning_rate": 9.960249004529975e-06, + "loss": 17.6281, + "step": 14070 + }, + { + "epoch": 0.25458761733202845, + "grad_norm": 37.40625, + "learning_rate": 9.960220752223316e-06, + "loss": 17.341, + "step": 14080 + }, + { + "epoch": 0.2547684324011563, + "grad_norm": 39.8125, + "learning_rate": 9.960192499916656e-06, + "loss": 17.308, + "step": 14090 + }, + { + "epoch": 0.2549492474702842, + "grad_norm": 37.71875, + "learning_rate": 9.960164247609997e-06, + "loss": 17.3255, + "step": 14100 + }, + { + "epoch": 0.25513006253941206, + "grad_norm": 40.09375, + "learning_rate": 9.960135995303338e-06, + "loss": 17.5287, + "step": 14110 + }, + { + "epoch": 0.2553108776085399, + "grad_norm": 43.15625, + "learning_rate": 9.960107742996678e-06, + "loss": 17.3311, + "step": 14120 + }, + { + "epoch": 0.25549169267766775, + "grad_norm": 39.46875, + "learning_rate": 9.960079490690017e-06, + "loss": 16.9866, + "step": 14130 + }, + { + "epoch": 0.2556725077467956, + "grad_norm": 41.25, + "learning_rate": 9.960051238383358e-06, + "loss": 17.3274, + "step": 14140 + }, + { + "epoch": 0.2558533228159235, + "grad_norm": 40.0625, + "learning_rate": 9.9600229860767e-06, + "loss": 17.3333, + "step": 14150 + }, + { + "epoch": 0.25603413788505136, + "grad_norm": 37.75, + "learning_rate": 9.959994733770039e-06, + "loss": 17.3042, + "step": 14160 + }, + { + "epoch": 0.2562149529541792, + "grad_norm": 40.875, + "learning_rate": 9.95996648146338e-06, + "loss": 17.2533, + "step": 14170 + }, + { + "epoch": 0.25639576802330705, + "grad_norm": 36.25, + "learning_rate": 9.95993822915672e-06, + "loss": 17.3347, + "step": 14180 + }, + { + "epoch": 0.2565765830924349, + "grad_norm": 39.46875, + "learning_rate": 9.959909976850061e-06, + "loss": 17.7013, + "step": 14190 + }, + { + "epoch": 0.2567573981615628, + "grad_norm": 41.25, + "learning_rate": 9.959881724543402e-06, + "loss": 17.4491, + "step": 14200 + }, + { + "epoch": 0.25693821323069066, + "grad_norm": 41.4375, + "learning_rate": 9.95985347223674e-06, + "loss": 16.9943, + "step": 14210 + }, + { + "epoch": 0.25711902829981853, + "grad_norm": 39.25, + "learning_rate": 9.959825219930081e-06, + "loss": 17.5549, + "step": 14220 + }, + { + "epoch": 0.25729984336894635, + "grad_norm": 40.09375, + "learning_rate": 9.959796967623423e-06, + "loss": 17.2006, + "step": 14230 + }, + { + "epoch": 0.2574806584380742, + "grad_norm": 39.1875, + "learning_rate": 9.959768715316764e-06, + "loss": 17.0657, + "step": 14240 + }, + { + "epoch": 0.2576614735072021, + "grad_norm": 39.03125, + "learning_rate": 9.959740463010103e-06, + "loss": 16.9687, + "step": 14250 + }, + { + "epoch": 0.25784228857632996, + "grad_norm": 39.8125, + "learning_rate": 9.959712210703444e-06, + "loss": 17.2125, + "step": 14260 + }, + { + "epoch": 0.25802310364545783, + "grad_norm": 42.6875, + "learning_rate": 9.959683958396784e-06, + "loss": 17.1497, + "step": 14270 + }, + { + "epoch": 0.2582039187145857, + "grad_norm": 42.0625, + "learning_rate": 9.959655706090125e-06, + "loss": 16.7245, + "step": 14280 + }, + { + "epoch": 0.2583847337837135, + "grad_norm": 40.6875, + "learning_rate": 9.959627453783466e-06, + "loss": 17.2191, + "step": 14290 + }, + { + "epoch": 0.2585655488528414, + "grad_norm": 38.09375, + "learning_rate": 9.959599201476804e-06, + "loss": 17.0521, + "step": 14300 + }, + { + "epoch": 0.25874636392196926, + "grad_norm": 41.84375, + "learning_rate": 9.959570949170145e-06, + "loss": 17.1253, + "step": 14310 + }, + { + "epoch": 0.25892717899109713, + "grad_norm": 39.125, + "learning_rate": 9.959542696863487e-06, + "loss": 16.9786, + "step": 14320 + }, + { + "epoch": 0.259107994060225, + "grad_norm": 39.53125, + "learning_rate": 9.959514444556826e-06, + "loss": 17.5312, + "step": 14330 + }, + { + "epoch": 0.2592888091293528, + "grad_norm": 36.75, + "learning_rate": 9.959486192250167e-06, + "loss": 17.2473, + "step": 14340 + }, + { + "epoch": 0.2594696241984807, + "grad_norm": 40.0625, + "learning_rate": 9.959457939943508e-06, + "loss": 16.8098, + "step": 14350 + }, + { + "epoch": 0.25965043926760856, + "grad_norm": 40.65625, + "learning_rate": 9.959429687636848e-06, + "loss": 17.3783, + "step": 14360 + }, + { + "epoch": 0.25983125433673643, + "grad_norm": 39.625, + "learning_rate": 9.959401435330189e-06, + "loss": 17.593, + "step": 14370 + }, + { + "epoch": 0.2600120694058643, + "grad_norm": 39.1875, + "learning_rate": 9.959373183023528e-06, + "loss": 17.406, + "step": 14380 + }, + { + "epoch": 0.26019288447499217, + "grad_norm": 39.96875, + "learning_rate": 9.959344930716868e-06, + "loss": 17.3751, + "step": 14390 + }, + { + "epoch": 0.26037369954412, + "grad_norm": 40.40625, + "learning_rate": 9.959316678410209e-06, + "loss": 17.2173, + "step": 14400 + }, + { + "epoch": 0.26055451461324786, + "grad_norm": 40.53125, + "learning_rate": 9.959288426103551e-06, + "loss": 17.1151, + "step": 14410 + }, + { + "epoch": 0.2607353296823757, + "grad_norm": 42.3125, + "learning_rate": 9.95926017379689e-06, + "loss": 17.3042, + "step": 14420 + }, + { + "epoch": 0.2609161447515036, + "grad_norm": 42.4375, + "learning_rate": 9.95923192149023e-06, + "loss": 17.3013, + "step": 14430 + }, + { + "epoch": 0.26109695982063147, + "grad_norm": 41.71875, + "learning_rate": 9.959203669183571e-06, + "loss": 17.1391, + "step": 14440 + }, + { + "epoch": 0.26127777488975934, + "grad_norm": 41.71875, + "learning_rate": 9.959175416876912e-06, + "loss": 17.412, + "step": 14450 + }, + { + "epoch": 0.26145858995888716, + "grad_norm": 41.1875, + "learning_rate": 9.959147164570253e-06, + "loss": 17.0796, + "step": 14460 + }, + { + "epoch": 0.261639405028015, + "grad_norm": 41.65625, + "learning_rate": 9.959118912263592e-06, + "loss": 17.1053, + "step": 14470 + }, + { + "epoch": 0.2618202200971429, + "grad_norm": 42.03125, + "learning_rate": 9.959090659956932e-06, + "loss": 17.2328, + "step": 14480 + }, + { + "epoch": 0.26200103516627077, + "grad_norm": 39.59375, + "learning_rate": 9.959062407650273e-06, + "loss": 17.2747, + "step": 14490 + }, + { + "epoch": 0.26218185023539864, + "grad_norm": 40.1875, + "learning_rate": 9.959034155343614e-06, + "loss": 17.1025, + "step": 14500 + }, + { + "epoch": 0.26236266530452645, + "grad_norm": 40.25, + "learning_rate": 9.959005903036954e-06, + "loss": 17.8269, + "step": 14510 + }, + { + "epoch": 0.2625434803736543, + "grad_norm": 39.65625, + "learning_rate": 9.958977650730295e-06, + "loss": 17.1326, + "step": 14520 + }, + { + "epoch": 0.2627242954427822, + "grad_norm": 38.96875, + "learning_rate": 9.958949398423635e-06, + "loss": 17.4717, + "step": 14530 + }, + { + "epoch": 0.26290511051191007, + "grad_norm": 40.96875, + "learning_rate": 9.958921146116976e-06, + "loss": 16.7587, + "step": 14540 + }, + { + "epoch": 0.26308592558103794, + "grad_norm": 44.09375, + "learning_rate": 9.958892893810315e-06, + "loss": 17.5066, + "step": 14550 + }, + { + "epoch": 0.2632667406501658, + "grad_norm": 40.53125, + "learning_rate": 9.958864641503656e-06, + "loss": 17.0883, + "step": 14560 + }, + { + "epoch": 0.2634475557192936, + "grad_norm": 41.8125, + "learning_rate": 9.958836389196996e-06, + "loss": 17.5933, + "step": 14570 + }, + { + "epoch": 0.2636283707884215, + "grad_norm": 42.0, + "learning_rate": 9.958808136890338e-06, + "loss": 17.4965, + "step": 14580 + }, + { + "epoch": 0.26380918585754937, + "grad_norm": 38.625, + "learning_rate": 9.958779884583677e-06, + "loss": 17.28, + "step": 14590 + }, + { + "epoch": 0.26399000092667724, + "grad_norm": 40.5, + "learning_rate": 9.958751632277018e-06, + "loss": 17.4132, + "step": 14600 + }, + { + "epoch": 0.2641708159958051, + "grad_norm": 41.28125, + "learning_rate": 9.958723379970359e-06, + "loss": 17.2562, + "step": 14610 + }, + { + "epoch": 0.264351631064933, + "grad_norm": 40.21875, + "learning_rate": 9.9586951276637e-06, + "loss": 17.5402, + "step": 14620 + }, + { + "epoch": 0.2645324461340608, + "grad_norm": 41.6875, + "learning_rate": 9.95866687535704e-06, + "loss": 17.2135, + "step": 14630 + }, + { + "epoch": 0.26471326120318867, + "grad_norm": 40.21875, + "learning_rate": 9.958638623050379e-06, + "loss": 17.0918, + "step": 14640 + }, + { + "epoch": 0.26489407627231654, + "grad_norm": 40.0625, + "learning_rate": 9.95861037074372e-06, + "loss": 17.3941, + "step": 14650 + }, + { + "epoch": 0.2650748913414444, + "grad_norm": 42.9375, + "learning_rate": 9.95858211843706e-06, + "loss": 17.3198, + "step": 14660 + }, + { + "epoch": 0.2652557064105723, + "grad_norm": 38.8125, + "learning_rate": 9.9585538661304e-06, + "loss": 16.8643, + "step": 14670 + }, + { + "epoch": 0.2654365214797001, + "grad_norm": 41.0, + "learning_rate": 9.958525613823741e-06, + "loss": 17.2676, + "step": 14680 + }, + { + "epoch": 0.26561733654882796, + "grad_norm": 39.71875, + "learning_rate": 9.958497361517082e-06, + "loss": 17.462, + "step": 14690 + }, + { + "epoch": 0.26579815161795584, + "grad_norm": 40.5625, + "learning_rate": 9.958469109210423e-06, + "loss": 17.0876, + "step": 14700 + }, + { + "epoch": 0.2659789666870837, + "grad_norm": 37.8125, + "learning_rate": 9.958440856903763e-06, + "loss": 17.2175, + "step": 14710 + }, + { + "epoch": 0.2661597817562116, + "grad_norm": 40.09375, + "learning_rate": 9.958412604597104e-06, + "loss": 17.4817, + "step": 14720 + }, + { + "epoch": 0.26634059682533945, + "grad_norm": 40.09375, + "learning_rate": 9.958384352290443e-06, + "loss": 17.4135, + "step": 14730 + }, + { + "epoch": 0.26652141189446726, + "grad_norm": 40.375, + "learning_rate": 9.958356099983783e-06, + "loss": 17.1609, + "step": 14740 + }, + { + "epoch": 0.26670222696359513, + "grad_norm": 40.03125, + "learning_rate": 9.958327847677124e-06, + "loss": 17.1004, + "step": 14750 + }, + { + "epoch": 0.266883042032723, + "grad_norm": 39.90625, + "learning_rate": 9.958299595370465e-06, + "loss": 17.1141, + "step": 14760 + }, + { + "epoch": 0.2670638571018509, + "grad_norm": 40.6875, + "learning_rate": 9.958271343063805e-06, + "loss": 17.2939, + "step": 14770 + }, + { + "epoch": 0.26724467217097875, + "grad_norm": 41.625, + "learning_rate": 9.958243090757146e-06, + "loss": 17.0918, + "step": 14780 + }, + { + "epoch": 0.26742548724010656, + "grad_norm": 41.46875, + "learning_rate": 9.958214838450486e-06, + "loss": 17.1987, + "step": 14790 + }, + { + "epoch": 0.26760630230923443, + "grad_norm": 41.3125, + "learning_rate": 9.958186586143827e-06, + "loss": 16.9342, + "step": 14800 + }, + { + "epoch": 0.2677871173783623, + "grad_norm": 41.15625, + "learning_rate": 9.958158333837166e-06, + "loss": 17.0782, + "step": 14810 + }, + { + "epoch": 0.2679679324474902, + "grad_norm": 42.9375, + "learning_rate": 9.958130081530507e-06, + "loss": 17.7101, + "step": 14820 + }, + { + "epoch": 0.26814874751661805, + "grad_norm": 40.6875, + "learning_rate": 9.958101829223847e-06, + "loss": 17.6526, + "step": 14830 + }, + { + "epoch": 0.2683295625857459, + "grad_norm": 43.46875, + "learning_rate": 9.958073576917188e-06, + "loss": 17.1204, + "step": 14840 + }, + { + "epoch": 0.26851037765487373, + "grad_norm": 41.53125, + "learning_rate": 9.958045324610529e-06, + "loss": 17.2844, + "step": 14850 + }, + { + "epoch": 0.2686911927240016, + "grad_norm": 39.28125, + "learning_rate": 9.958017072303869e-06, + "loss": 16.8951, + "step": 14860 + }, + { + "epoch": 0.2688720077931295, + "grad_norm": 39.25, + "learning_rate": 9.95798881999721e-06, + "loss": 17.281, + "step": 14870 + }, + { + "epoch": 0.26905282286225735, + "grad_norm": 42.03125, + "learning_rate": 9.95796056769055e-06, + "loss": 17.4636, + "step": 14880 + }, + { + "epoch": 0.2692336379313852, + "grad_norm": 38.4375, + "learning_rate": 9.957932315383891e-06, + "loss": 17.1833, + "step": 14890 + }, + { + "epoch": 0.2694144530005131, + "grad_norm": 41.375, + "learning_rate": 9.95790406307723e-06, + "loss": 17.3453, + "step": 14900 + }, + { + "epoch": 0.2695952680696409, + "grad_norm": 42.125, + "learning_rate": 9.95787581077057e-06, + "loss": 17.3075, + "step": 14910 + }, + { + "epoch": 0.2697760831387688, + "grad_norm": 41.1875, + "learning_rate": 9.957847558463911e-06, + "loss": 17.2296, + "step": 14920 + }, + { + "epoch": 0.26995689820789665, + "grad_norm": 37.8125, + "learning_rate": 9.957819306157252e-06, + "loss": 17.2295, + "step": 14930 + }, + { + "epoch": 0.2701377132770245, + "grad_norm": 42.84375, + "learning_rate": 9.957791053850592e-06, + "loss": 17.1168, + "step": 14940 + }, + { + "epoch": 0.2703185283461524, + "grad_norm": 41.9375, + "learning_rate": 9.957762801543933e-06, + "loss": 16.8162, + "step": 14950 + }, + { + "epoch": 0.2704993434152802, + "grad_norm": 44.0625, + "learning_rate": 9.957734549237274e-06, + "loss": 17.2402, + "step": 14960 + }, + { + "epoch": 0.2706801584844081, + "grad_norm": 42.46875, + "learning_rate": 9.957706296930614e-06, + "loss": 17.6101, + "step": 14970 + }, + { + "epoch": 0.27086097355353594, + "grad_norm": 42.6875, + "learning_rate": 9.957678044623953e-06, + "loss": 17.2303, + "step": 14980 + }, + { + "epoch": 0.2710417886226638, + "grad_norm": 40.53125, + "learning_rate": 9.957649792317294e-06, + "loss": 17.1591, + "step": 14990 + }, + { + "epoch": 0.2712226036917917, + "grad_norm": 39.25, + "learning_rate": 9.957621540010634e-06, + "loss": 17.2613, + "step": 15000 + }, + { + "epoch": 0.2712226036917917, + "eval_loss": 2.1613612174987793, + "eval_runtime": 229.1402, + "eval_samples_per_second": 3168.623, + "eval_steps_per_second": 49.511, + "step": 15000 + }, + { + "epoch": 0.27140341876091956, + "grad_norm": 39.84375, + "learning_rate": 9.957593287703975e-06, + "loss": 17.1637, + "step": 15010 + }, + { + "epoch": 0.2715842338300474, + "grad_norm": 38.71875, + "learning_rate": 9.957565035397316e-06, + "loss": 17.2246, + "step": 15020 + }, + { + "epoch": 0.27176504889917524, + "grad_norm": 39.34375, + "learning_rate": 9.957536783090656e-06, + "loss": 17.1489, + "step": 15030 + }, + { + "epoch": 0.2719458639683031, + "grad_norm": 40.78125, + "learning_rate": 9.957508530783997e-06, + "loss": 17.6059, + "step": 15040 + }, + { + "epoch": 0.272126679037431, + "grad_norm": 39.53125, + "learning_rate": 9.957480278477338e-06, + "loss": 17.1587, + "step": 15050 + }, + { + "epoch": 0.27230749410655886, + "grad_norm": 40.84375, + "learning_rate": 9.957452026170678e-06, + "loss": 17.5717, + "step": 15060 + }, + { + "epoch": 0.2724883091756867, + "grad_norm": 40.15625, + "learning_rate": 9.957423773864017e-06, + "loss": 17.1256, + "step": 15070 + }, + { + "epoch": 0.27266912424481454, + "grad_norm": 42.25, + "learning_rate": 9.957395521557358e-06, + "loss": 17.153, + "step": 15080 + }, + { + "epoch": 0.2728499393139424, + "grad_norm": 39.5, + "learning_rate": 9.957367269250698e-06, + "loss": 16.9621, + "step": 15090 + }, + { + "epoch": 0.2730307543830703, + "grad_norm": 40.8125, + "learning_rate": 9.957339016944039e-06, + "loss": 17.2758, + "step": 15100 + }, + { + "epoch": 0.27321156945219816, + "grad_norm": 41.5, + "learning_rate": 9.95731076463738e-06, + "loss": 17.0574, + "step": 15110 + }, + { + "epoch": 0.273392384521326, + "grad_norm": 41.625, + "learning_rate": 9.95728251233072e-06, + "loss": 17.3584, + "step": 15120 + }, + { + "epoch": 0.27357319959045384, + "grad_norm": 41.90625, + "learning_rate": 9.957254260024061e-06, + "loss": 17.1221, + "step": 15130 + }, + { + "epoch": 0.2737540146595817, + "grad_norm": 41.40625, + "learning_rate": 9.957226007717401e-06, + "loss": 17.4213, + "step": 15140 + }, + { + "epoch": 0.2739348297287096, + "grad_norm": 38.59375, + "learning_rate": 9.957197755410742e-06, + "loss": 17.5993, + "step": 15150 + }, + { + "epoch": 0.27411564479783745, + "grad_norm": 41.90625, + "learning_rate": 9.957169503104081e-06, + "loss": 17.2159, + "step": 15160 + }, + { + "epoch": 0.2742964598669653, + "grad_norm": 41.25, + "learning_rate": 9.957141250797422e-06, + "loss": 17.6288, + "step": 15170 + }, + { + "epoch": 0.2744772749360932, + "grad_norm": 40.3125, + "learning_rate": 9.957112998490762e-06, + "loss": 17.0387, + "step": 15180 + }, + { + "epoch": 0.274658090005221, + "grad_norm": 41.1875, + "learning_rate": 9.957084746184103e-06, + "loss": 17.3209, + "step": 15190 + }, + { + "epoch": 0.2748389050743489, + "grad_norm": 40.78125, + "learning_rate": 9.957056493877444e-06, + "loss": 16.9922, + "step": 15200 + }, + { + "epoch": 0.27501972014347675, + "grad_norm": 40.15625, + "learning_rate": 9.957028241570784e-06, + "loss": 17.4538, + "step": 15210 + }, + { + "epoch": 0.2752005352126046, + "grad_norm": 41.875, + "learning_rate": 9.956999989264125e-06, + "loss": 17.3762, + "step": 15220 + }, + { + "epoch": 0.2753813502817325, + "grad_norm": 38.875, + "learning_rate": 9.956971736957465e-06, + "loss": 17.343, + "step": 15230 + }, + { + "epoch": 0.27556216535086037, + "grad_norm": 40.90625, + "learning_rate": 9.956943484650804e-06, + "loss": 17.5539, + "step": 15240 + }, + { + "epoch": 0.2757429804199882, + "grad_norm": 38.90625, + "learning_rate": 9.956915232344145e-06, + "loss": 16.9692, + "step": 15250 + }, + { + "epoch": 0.27592379548911605, + "grad_norm": 40.8125, + "learning_rate": 9.956886980037486e-06, + "loss": 17.3419, + "step": 15260 + }, + { + "epoch": 0.2761046105582439, + "grad_norm": 42.5, + "learning_rate": 9.956858727730826e-06, + "loss": 16.8046, + "step": 15270 + }, + { + "epoch": 0.2762854256273718, + "grad_norm": 38.53125, + "learning_rate": 9.956830475424167e-06, + "loss": 17.2915, + "step": 15280 + }, + { + "epoch": 0.27646624069649967, + "grad_norm": 41.78125, + "learning_rate": 9.956802223117507e-06, + "loss": 17.1039, + "step": 15290 + }, + { + "epoch": 0.2766470557656275, + "grad_norm": 40.0, + "learning_rate": 9.956773970810848e-06, + "loss": 17.0082, + "step": 15300 + }, + { + "epoch": 0.27682787083475535, + "grad_norm": 41.25, + "learning_rate": 9.956745718504189e-06, + "loss": 17.4862, + "step": 15310 + }, + { + "epoch": 0.2770086859038832, + "grad_norm": 41.40625, + "learning_rate": 9.95671746619753e-06, + "loss": 17.2742, + "step": 15320 + }, + { + "epoch": 0.2771895009730111, + "grad_norm": 42.0, + "learning_rate": 9.956689213890868e-06, + "loss": 17.3996, + "step": 15330 + }, + { + "epoch": 0.27737031604213896, + "grad_norm": 41.625, + "learning_rate": 9.956660961584209e-06, + "loss": 17.3848, + "step": 15340 + }, + { + "epoch": 0.27755113111126684, + "grad_norm": 39.75, + "learning_rate": 9.95663270927755e-06, + "loss": 17.3582, + "step": 15350 + }, + { + "epoch": 0.27773194618039465, + "grad_norm": 44.0, + "learning_rate": 9.95660445697089e-06, + "loss": 17.5821, + "step": 15360 + }, + { + "epoch": 0.2779127612495225, + "grad_norm": 41.3125, + "learning_rate": 9.95657620466423e-06, + "loss": 17.4158, + "step": 15370 + }, + { + "epoch": 0.2780935763186504, + "grad_norm": 41.78125, + "learning_rate": 9.956547952357571e-06, + "loss": 17.2659, + "step": 15380 + }, + { + "epoch": 0.27827439138777826, + "grad_norm": 41.90625, + "learning_rate": 9.956519700050912e-06, + "loss": 17.218, + "step": 15390 + }, + { + "epoch": 0.27845520645690613, + "grad_norm": 40.15625, + "learning_rate": 9.956491447744253e-06, + "loss": 17.1191, + "step": 15400 + }, + { + "epoch": 0.278636021526034, + "grad_norm": 43.625, + "learning_rate": 9.956463195437592e-06, + "loss": 17.2365, + "step": 15410 + }, + { + "epoch": 0.2788168365951618, + "grad_norm": 41.8125, + "learning_rate": 9.956434943130932e-06, + "loss": 17.5236, + "step": 15420 + }, + { + "epoch": 0.2789976516642897, + "grad_norm": 42.625, + "learning_rate": 9.956406690824273e-06, + "loss": 17.0254, + "step": 15430 + }, + { + "epoch": 0.27917846673341756, + "grad_norm": 41.40625, + "learning_rate": 9.956378438517613e-06, + "loss": 17.2221, + "step": 15440 + }, + { + "epoch": 0.27935928180254543, + "grad_norm": 39.84375, + "learning_rate": 9.956350186210954e-06, + "loss": 17.5745, + "step": 15450 + }, + { + "epoch": 0.2795400968716733, + "grad_norm": 40.9375, + "learning_rate": 9.956321933904295e-06, + "loss": 16.9634, + "step": 15460 + }, + { + "epoch": 0.2797209119408011, + "grad_norm": 40.0625, + "learning_rate": 9.956293681597635e-06, + "loss": 16.9116, + "step": 15470 + }, + { + "epoch": 0.279901727009929, + "grad_norm": 39.34375, + "learning_rate": 9.956265429290976e-06, + "loss": 17.1836, + "step": 15480 + }, + { + "epoch": 0.28008254207905686, + "grad_norm": 40.5, + "learning_rate": 9.956237176984316e-06, + "loss": 17.068, + "step": 15490 + }, + { + "epoch": 0.28026335714818473, + "grad_norm": 41.0, + "learning_rate": 9.956208924677655e-06, + "loss": 17.2735, + "step": 15500 + }, + { + "epoch": 0.2804441722173126, + "grad_norm": 40.09375, + "learning_rate": 9.956180672370996e-06, + "loss": 17.0922, + "step": 15510 + }, + { + "epoch": 0.2806249872864405, + "grad_norm": 40.1875, + "learning_rate": 9.956152420064337e-06, + "loss": 16.9675, + "step": 15520 + }, + { + "epoch": 0.2808058023555683, + "grad_norm": 40.28125, + "learning_rate": 9.956124167757677e-06, + "loss": 17.1198, + "step": 15530 + }, + { + "epoch": 0.28098661742469616, + "grad_norm": 42.5, + "learning_rate": 9.956095915451018e-06, + "loss": 16.9641, + "step": 15540 + }, + { + "epoch": 0.28116743249382403, + "grad_norm": 41.78125, + "learning_rate": 9.956067663144359e-06, + "loss": 17.2347, + "step": 15550 + }, + { + "epoch": 0.2813482475629519, + "grad_norm": 39.0625, + "learning_rate": 9.956039410837699e-06, + "loss": 16.6774, + "step": 15560 + }, + { + "epoch": 0.2815290626320798, + "grad_norm": 37.3125, + "learning_rate": 9.95601115853104e-06, + "loss": 17.3081, + "step": 15570 + }, + { + "epoch": 0.28170987770120764, + "grad_norm": 40.8125, + "learning_rate": 9.95598290622438e-06, + "loss": 17.3257, + "step": 15580 + }, + { + "epoch": 0.28189069277033546, + "grad_norm": 40.5, + "learning_rate": 9.95595465391772e-06, + "loss": 16.8696, + "step": 15590 + }, + { + "epoch": 0.28207150783946333, + "grad_norm": 40.25, + "learning_rate": 9.95592640161106e-06, + "loss": 17.0273, + "step": 15600 + }, + { + "epoch": 0.2822523229085912, + "grad_norm": 39.78125, + "learning_rate": 9.9558981493044e-06, + "loss": 17.3795, + "step": 15610 + }, + { + "epoch": 0.2824331379777191, + "grad_norm": 41.8125, + "learning_rate": 9.955869896997741e-06, + "loss": 17.113, + "step": 15620 + }, + { + "epoch": 0.28261395304684694, + "grad_norm": 39.65625, + "learning_rate": 9.955841644691082e-06, + "loss": 17.3427, + "step": 15630 + }, + { + "epoch": 0.28279476811597476, + "grad_norm": 40.59375, + "learning_rate": 9.955813392384422e-06, + "loss": 17.609, + "step": 15640 + }, + { + "epoch": 0.28297558318510263, + "grad_norm": 39.5, + "learning_rate": 9.955785140077763e-06, + "loss": 17.6259, + "step": 15650 + }, + { + "epoch": 0.2831563982542305, + "grad_norm": 40.28125, + "learning_rate": 9.955756887771104e-06, + "loss": 17.0857, + "step": 15660 + }, + { + "epoch": 0.2833372133233584, + "grad_norm": 42.125, + "learning_rate": 9.955728635464443e-06, + "loss": 17.0976, + "step": 15670 + }, + { + "epoch": 0.28351802839248624, + "grad_norm": 42.71875, + "learning_rate": 9.955700383157783e-06, + "loss": 17.3903, + "step": 15680 + }, + { + "epoch": 0.2836988434616141, + "grad_norm": 40.53125, + "learning_rate": 9.955672130851124e-06, + "loss": 17.3015, + "step": 15690 + }, + { + "epoch": 0.28387965853074193, + "grad_norm": 38.8125, + "learning_rate": 9.955643878544464e-06, + "loss": 16.9424, + "step": 15700 + }, + { + "epoch": 0.2840604735998698, + "grad_norm": 40.34375, + "learning_rate": 9.955615626237805e-06, + "loss": 17.3459, + "step": 15710 + }, + { + "epoch": 0.28424128866899767, + "grad_norm": 44.75, + "learning_rate": 9.955587373931146e-06, + "loss": 17.1487, + "step": 15720 + }, + { + "epoch": 0.28442210373812554, + "grad_norm": 40.53125, + "learning_rate": 9.955559121624486e-06, + "loss": 17.3224, + "step": 15730 + }, + { + "epoch": 0.2846029188072534, + "grad_norm": 40.15625, + "learning_rate": 9.955530869317827e-06, + "loss": 17.1909, + "step": 15740 + }, + { + "epoch": 0.2847837338763813, + "grad_norm": 41.0, + "learning_rate": 9.955502617011168e-06, + "loss": 16.998, + "step": 15750 + }, + { + "epoch": 0.2849645489455091, + "grad_norm": 41.46875, + "learning_rate": 9.955474364704507e-06, + "loss": 17.4409, + "step": 15760 + }, + { + "epoch": 0.28514536401463697, + "grad_norm": 38.0625, + "learning_rate": 9.955446112397847e-06, + "loss": 17.0526, + "step": 15770 + }, + { + "epoch": 0.28532617908376484, + "grad_norm": 40.4375, + "learning_rate": 9.955417860091188e-06, + "loss": 16.7261, + "step": 15780 + }, + { + "epoch": 0.2855069941528927, + "grad_norm": 41.375, + "learning_rate": 9.955389607784528e-06, + "loss": 17.4373, + "step": 15790 + }, + { + "epoch": 0.2856878092220206, + "grad_norm": 39.65625, + "learning_rate": 9.955361355477869e-06, + "loss": 17.2315, + "step": 15800 + }, + { + "epoch": 0.2858686242911484, + "grad_norm": 40.84375, + "learning_rate": 9.95533310317121e-06, + "loss": 17.264, + "step": 15810 + }, + { + "epoch": 0.28604943936027627, + "grad_norm": 40.9375, + "learning_rate": 9.95530485086455e-06, + "loss": 17.5945, + "step": 15820 + }, + { + "epoch": 0.28623025442940414, + "grad_norm": 40.0, + "learning_rate": 9.955276598557891e-06, + "loss": 17.2339, + "step": 15830 + }, + { + "epoch": 0.286411069498532, + "grad_norm": 37.9375, + "learning_rate": 9.95524834625123e-06, + "loss": 17.3516, + "step": 15840 + }, + { + "epoch": 0.2865918845676599, + "grad_norm": 39.125, + "learning_rate": 9.95522009394457e-06, + "loss": 17.5479, + "step": 15850 + }, + { + "epoch": 0.28677269963678775, + "grad_norm": 39.96875, + "learning_rate": 9.955191841637911e-06, + "loss": 17.3622, + "step": 15860 + }, + { + "epoch": 0.28695351470591557, + "grad_norm": 40.3125, + "learning_rate": 9.955163589331252e-06, + "loss": 17.4275, + "step": 15870 + }, + { + "epoch": 0.28713432977504344, + "grad_norm": 41.46875, + "learning_rate": 9.955135337024592e-06, + "loss": 17.2381, + "step": 15880 + }, + { + "epoch": 0.2873151448441713, + "grad_norm": 40.84375, + "learning_rate": 9.955107084717933e-06, + "loss": 17.163, + "step": 15890 + }, + { + "epoch": 0.2874959599132992, + "grad_norm": 40.875, + "learning_rate": 9.955078832411274e-06, + "loss": 17.093, + "step": 15900 + }, + { + "epoch": 0.28767677498242705, + "grad_norm": 39.25, + "learning_rate": 9.955050580104614e-06, + "loss": 17.1249, + "step": 15910 + }, + { + "epoch": 0.2878575900515549, + "grad_norm": 39.625, + "learning_rate": 9.955022327797955e-06, + "loss": 17.3523, + "step": 15920 + }, + { + "epoch": 0.28803840512068274, + "grad_norm": 41.875, + "learning_rate": 9.954994075491294e-06, + "loss": 17.2994, + "step": 15930 + }, + { + "epoch": 0.2882192201898106, + "grad_norm": 39.65625, + "learning_rate": 9.954965823184634e-06, + "loss": 17.3658, + "step": 15940 + }, + { + "epoch": 0.2884000352589385, + "grad_norm": 39.96875, + "learning_rate": 9.954937570877975e-06, + "loss": 17.1942, + "step": 15950 + }, + { + "epoch": 0.28858085032806635, + "grad_norm": 40.53125, + "learning_rate": 9.954909318571316e-06, + "loss": 17.3323, + "step": 15960 + }, + { + "epoch": 0.2887616653971942, + "grad_norm": 41.15625, + "learning_rate": 9.954881066264656e-06, + "loss": 17.148, + "step": 15970 + }, + { + "epoch": 0.28894248046632204, + "grad_norm": 38.375, + "learning_rate": 9.954852813957997e-06, + "loss": 17.149, + "step": 15980 + }, + { + "epoch": 0.2891232955354499, + "grad_norm": 40.8125, + "learning_rate": 9.954824561651337e-06, + "loss": 17.4588, + "step": 15990 + }, + { + "epoch": 0.2893041106045778, + "grad_norm": 40.75, + "learning_rate": 9.954796309344678e-06, + "loss": 17.116, + "step": 16000 + }, + { + "epoch": 0.28948492567370565, + "grad_norm": 38.5625, + "learning_rate": 9.954768057038019e-06, + "loss": 16.7949, + "step": 16010 + }, + { + "epoch": 0.2896657407428335, + "grad_norm": 40.1875, + "learning_rate": 9.954739804731358e-06, + "loss": 17.0813, + "step": 16020 + }, + { + "epoch": 0.2898465558119614, + "grad_norm": 39.625, + "learning_rate": 9.954711552424698e-06, + "loss": 17.6624, + "step": 16030 + }, + { + "epoch": 0.2900273708810892, + "grad_norm": 40.0, + "learning_rate": 9.954683300118039e-06, + "loss": 17.3748, + "step": 16040 + }, + { + "epoch": 0.2902081859502171, + "grad_norm": 39.71875, + "learning_rate": 9.95465504781138e-06, + "loss": 17.3348, + "step": 16050 + }, + { + "epoch": 0.29038900101934495, + "grad_norm": 38.28125, + "learning_rate": 9.95462679550472e-06, + "loss": 16.9131, + "step": 16060 + }, + { + "epoch": 0.2905698160884728, + "grad_norm": 39.03125, + "learning_rate": 9.95459854319806e-06, + "loss": 17.5051, + "step": 16070 + }, + { + "epoch": 0.2907506311576007, + "grad_norm": 41.0625, + "learning_rate": 9.954570290891401e-06, + "loss": 17.3849, + "step": 16080 + }, + { + "epoch": 0.29093144622672856, + "grad_norm": 40.09375, + "learning_rate": 9.954542038584742e-06, + "loss": 17.4643, + "step": 16090 + }, + { + "epoch": 0.2911122612958564, + "grad_norm": 42.21875, + "learning_rate": 9.954513786278081e-06, + "loss": 17.4884, + "step": 16100 + }, + { + "epoch": 0.29129307636498425, + "grad_norm": 39.875, + "learning_rate": 9.954485533971422e-06, + "loss": 17.3067, + "step": 16110 + }, + { + "epoch": 0.2914738914341121, + "grad_norm": 38.53125, + "learning_rate": 9.954457281664762e-06, + "loss": 17.2178, + "step": 16120 + }, + { + "epoch": 0.29165470650324, + "grad_norm": 42.875, + "learning_rate": 9.954429029358103e-06, + "loss": 17.2724, + "step": 16130 + }, + { + "epoch": 0.29183552157236786, + "grad_norm": 42.96875, + "learning_rate": 9.954400777051443e-06, + "loss": 17.2351, + "step": 16140 + }, + { + "epoch": 0.2920163366414957, + "grad_norm": 38.59375, + "learning_rate": 9.954372524744782e-06, + "loss": 17.3964, + "step": 16150 + }, + { + "epoch": 0.29219715171062355, + "grad_norm": 41.0625, + "learning_rate": 9.954344272438125e-06, + "loss": 17.1551, + "step": 16160 + }, + { + "epoch": 0.2923779667797514, + "grad_norm": 38.1875, + "learning_rate": 9.954316020131465e-06, + "loss": 17.0422, + "step": 16170 + }, + { + "epoch": 0.2925587818488793, + "grad_norm": 40.5625, + "learning_rate": 9.954287767824806e-06, + "loss": 17.5605, + "step": 16180 + }, + { + "epoch": 0.29273959691800716, + "grad_norm": 42.9375, + "learning_rate": 9.954259515518145e-06, + "loss": 17.3039, + "step": 16190 + }, + { + "epoch": 0.29292041198713503, + "grad_norm": 39.9375, + "learning_rate": 9.954231263211485e-06, + "loss": 17.0426, + "step": 16200 + }, + { + "epoch": 0.29310122705626285, + "grad_norm": 42.0, + "learning_rate": 9.954203010904826e-06, + "loss": 17.1135, + "step": 16210 + }, + { + "epoch": 0.2932820421253907, + "grad_norm": 39.96875, + "learning_rate": 9.954174758598167e-06, + "loss": 17.2362, + "step": 16220 + }, + { + "epoch": 0.2934628571945186, + "grad_norm": 42.84375, + "learning_rate": 9.954146506291507e-06, + "loss": 17.3939, + "step": 16230 + }, + { + "epoch": 0.29364367226364646, + "grad_norm": 41.34375, + "learning_rate": 9.954118253984846e-06, + "loss": 17.2928, + "step": 16240 + }, + { + "epoch": 0.29382448733277433, + "grad_norm": 42.125, + "learning_rate": 9.954090001678189e-06, + "loss": 17.3053, + "step": 16250 + }, + { + "epoch": 0.2940053024019022, + "grad_norm": 39.9375, + "learning_rate": 9.95406174937153e-06, + "loss": 17.1759, + "step": 16260 + }, + { + "epoch": 0.29418611747103, + "grad_norm": 42.5625, + "learning_rate": 9.954033497064868e-06, + "loss": 17.2078, + "step": 16270 + }, + { + "epoch": 0.2943669325401579, + "grad_norm": 41.46875, + "learning_rate": 9.954005244758209e-06, + "loss": 17.2217, + "step": 16280 + }, + { + "epoch": 0.29454774760928576, + "grad_norm": 41.21875, + "learning_rate": 9.95397699245155e-06, + "loss": 17.5906, + "step": 16290 + }, + { + "epoch": 0.29472856267841363, + "grad_norm": 40.28125, + "learning_rate": 9.95394874014489e-06, + "loss": 17.3592, + "step": 16300 + }, + { + "epoch": 0.2949093777475415, + "grad_norm": 43.0625, + "learning_rate": 9.95392048783823e-06, + "loss": 17.7649, + "step": 16310 + }, + { + "epoch": 0.2950901928166693, + "grad_norm": 40.0625, + "learning_rate": 9.953892235531571e-06, + "loss": 17.0514, + "step": 16320 + }, + { + "epoch": 0.2952710078857972, + "grad_norm": 38.96875, + "learning_rate": 9.953863983224912e-06, + "loss": 17.0892, + "step": 16330 + }, + { + "epoch": 0.29545182295492506, + "grad_norm": 38.75, + "learning_rate": 9.953835730918252e-06, + "loss": 17.6396, + "step": 16340 + }, + { + "epoch": 0.29563263802405293, + "grad_norm": 40.75, + "learning_rate": 9.953807478611593e-06, + "loss": 17.1078, + "step": 16350 + }, + { + "epoch": 0.2958134530931808, + "grad_norm": 40.5625, + "learning_rate": 9.953779226304932e-06, + "loss": 17.0602, + "step": 16360 + }, + { + "epoch": 0.29599426816230867, + "grad_norm": 40.9375, + "learning_rate": 9.953750973998273e-06, + "loss": 17.2024, + "step": 16370 + }, + { + "epoch": 0.2961750832314365, + "grad_norm": 44.0, + "learning_rate": 9.953722721691613e-06, + "loss": 17.265, + "step": 16380 + }, + { + "epoch": 0.29635589830056436, + "grad_norm": 40.53125, + "learning_rate": 9.953694469384954e-06, + "loss": 16.996, + "step": 16390 + }, + { + "epoch": 0.29653671336969223, + "grad_norm": 41.15625, + "learning_rate": 9.953666217078294e-06, + "loss": 16.7257, + "step": 16400 + }, + { + "epoch": 0.2967175284388201, + "grad_norm": 40.65625, + "learning_rate": 9.953637964771633e-06, + "loss": 17.1792, + "step": 16410 + }, + { + "epoch": 0.29689834350794797, + "grad_norm": 42.90625, + "learning_rate": 9.953609712464976e-06, + "loss": 17.4675, + "step": 16420 + }, + { + "epoch": 0.2970791585770758, + "grad_norm": 42.03125, + "learning_rate": 9.953581460158316e-06, + "loss": 17.298, + "step": 16430 + }, + { + "epoch": 0.29725997364620366, + "grad_norm": 38.09375, + "learning_rate": 9.953553207851657e-06, + "loss": 17.1823, + "step": 16440 + }, + { + "epoch": 0.2974407887153315, + "grad_norm": 40.90625, + "learning_rate": 9.953524955544996e-06, + "loss": 17.3904, + "step": 16450 + }, + { + "epoch": 0.2976216037844594, + "grad_norm": 40.5, + "learning_rate": 9.953496703238337e-06, + "loss": 17.1084, + "step": 16460 + }, + { + "epoch": 0.29780241885358727, + "grad_norm": 40.40625, + "learning_rate": 9.953468450931677e-06, + "loss": 17.1752, + "step": 16470 + }, + { + "epoch": 0.29798323392271514, + "grad_norm": 38.0, + "learning_rate": 9.953440198625018e-06, + "loss": 17.1169, + "step": 16480 + }, + { + "epoch": 0.29816404899184296, + "grad_norm": 40.03125, + "learning_rate": 9.953411946318358e-06, + "loss": 17.6988, + "step": 16490 + }, + { + "epoch": 0.2983448640609708, + "grad_norm": 42.09375, + "learning_rate": 9.953383694011697e-06, + "loss": 17.0947, + "step": 16500 + }, + { + "epoch": 0.2985256791300987, + "grad_norm": 40.90625, + "learning_rate": 9.95335544170504e-06, + "loss": 17.1475, + "step": 16510 + }, + { + "epoch": 0.29870649419922657, + "grad_norm": 40.25, + "learning_rate": 9.95332718939838e-06, + "loss": 17.1444, + "step": 16520 + }, + { + "epoch": 0.29888730926835444, + "grad_norm": 40.3125, + "learning_rate": 9.95329893709172e-06, + "loss": 17.1142, + "step": 16530 + }, + { + "epoch": 0.2990681243374823, + "grad_norm": 38.78125, + "learning_rate": 9.95327068478506e-06, + "loss": 16.9027, + "step": 16540 + }, + { + "epoch": 0.2992489394066101, + "grad_norm": 44.03125, + "learning_rate": 9.9532424324784e-06, + "loss": 17.0472, + "step": 16550 + }, + { + "epoch": 0.299429754475738, + "grad_norm": 40.5625, + "learning_rate": 9.953214180171741e-06, + "loss": 16.765, + "step": 16560 + }, + { + "epoch": 0.29961056954486587, + "grad_norm": 42.15625, + "learning_rate": 9.953185927865082e-06, + "loss": 16.9042, + "step": 16570 + }, + { + "epoch": 0.29979138461399374, + "grad_norm": 38.90625, + "learning_rate": 9.95315767555842e-06, + "loss": 16.9619, + "step": 16580 + }, + { + "epoch": 0.2999721996831216, + "grad_norm": 42.25, + "learning_rate": 9.953129423251761e-06, + "loss": 17.0627, + "step": 16590 + }, + { + "epoch": 0.3001530147522494, + "grad_norm": 37.75, + "learning_rate": 9.953101170945104e-06, + "loss": 17.3962, + "step": 16600 + }, + { + "epoch": 0.3003338298213773, + "grad_norm": 40.5, + "learning_rate": 9.953072918638444e-06, + "loss": 17.0685, + "step": 16610 + }, + { + "epoch": 0.30051464489050517, + "grad_norm": 38.6875, + "learning_rate": 9.953044666331783e-06, + "loss": 17.456, + "step": 16620 + }, + { + "epoch": 0.30069545995963304, + "grad_norm": 41.53125, + "learning_rate": 9.953016414025124e-06, + "loss": 17.1949, + "step": 16630 + }, + { + "epoch": 0.3008762750287609, + "grad_norm": 39.09375, + "learning_rate": 9.952988161718464e-06, + "loss": 17.527, + "step": 16640 + }, + { + "epoch": 0.3010570900978888, + "grad_norm": 39.8125, + "learning_rate": 9.952959909411805e-06, + "loss": 17.1713, + "step": 16650 + }, + { + "epoch": 0.3012379051670166, + "grad_norm": 38.90625, + "learning_rate": 9.952931657105146e-06, + "loss": 16.9757, + "step": 16660 + }, + { + "epoch": 0.30141872023614447, + "grad_norm": 41.78125, + "learning_rate": 9.952903404798485e-06, + "loss": 17.2477, + "step": 16670 + }, + { + "epoch": 0.30159953530527234, + "grad_norm": 39.78125, + "learning_rate": 9.952875152491827e-06, + "loss": 17.256, + "step": 16680 + }, + { + "epoch": 0.3017803503744002, + "grad_norm": 40.8125, + "learning_rate": 9.952846900185167e-06, + "loss": 17.1216, + "step": 16690 + }, + { + "epoch": 0.3019611654435281, + "grad_norm": 38.625, + "learning_rate": 9.952818647878506e-06, + "loss": 17.2615, + "step": 16700 + }, + { + "epoch": 0.30214198051265595, + "grad_norm": 39.625, + "learning_rate": 9.952790395571847e-06, + "loss": 17.2899, + "step": 16710 + }, + { + "epoch": 0.30232279558178377, + "grad_norm": 41.375, + "learning_rate": 9.952762143265188e-06, + "loss": 17.8263, + "step": 16720 + }, + { + "epoch": 0.30250361065091164, + "grad_norm": 42.5, + "learning_rate": 9.952733890958528e-06, + "loss": 17.4072, + "step": 16730 + }, + { + "epoch": 0.3026844257200395, + "grad_norm": 44.25, + "learning_rate": 9.952705638651869e-06, + "loss": 17.9111, + "step": 16740 + }, + { + "epoch": 0.3028652407891674, + "grad_norm": 40.40625, + "learning_rate": 9.952677386345208e-06, + "loss": 17.2915, + "step": 16750 + }, + { + "epoch": 0.30304605585829525, + "grad_norm": 41.9375, + "learning_rate": 9.952649134038548e-06, + "loss": 17.3275, + "step": 16760 + }, + { + "epoch": 0.30322687092742306, + "grad_norm": 39.125, + "learning_rate": 9.95262088173189e-06, + "loss": 17.4011, + "step": 16770 + }, + { + "epoch": 0.30340768599655094, + "grad_norm": 41.25, + "learning_rate": 9.952592629425231e-06, + "loss": 17.0106, + "step": 16780 + }, + { + "epoch": 0.3035885010656788, + "grad_norm": 41.53125, + "learning_rate": 9.95256437711857e-06, + "loss": 17.0403, + "step": 16790 + }, + { + "epoch": 0.3037693161348067, + "grad_norm": 39.78125, + "learning_rate": 9.952536124811911e-06, + "loss": 17.1771, + "step": 16800 + }, + { + "epoch": 0.30395013120393455, + "grad_norm": 43.78125, + "learning_rate": 9.952507872505252e-06, + "loss": 17.4583, + "step": 16810 + }, + { + "epoch": 0.3041309462730624, + "grad_norm": 38.09375, + "learning_rate": 9.952479620198592e-06, + "loss": 17.1125, + "step": 16820 + }, + { + "epoch": 0.30431176134219023, + "grad_norm": 40.25, + "learning_rate": 9.952451367891933e-06, + "loss": 17.0927, + "step": 16830 + }, + { + "epoch": 0.3044925764113181, + "grad_norm": 41.09375, + "learning_rate": 9.952423115585272e-06, + "loss": 17.3882, + "step": 16840 + }, + { + "epoch": 0.304673391480446, + "grad_norm": 40.71875, + "learning_rate": 9.952394863278612e-06, + "loss": 16.8834, + "step": 16850 + }, + { + "epoch": 0.30485420654957385, + "grad_norm": 43.40625, + "learning_rate": 9.952366610971955e-06, + "loss": 17.6202, + "step": 16860 + }, + { + "epoch": 0.3050350216187017, + "grad_norm": 39.28125, + "learning_rate": 9.952338358665294e-06, + "loss": 17.3385, + "step": 16870 + }, + { + "epoch": 0.3052158366878296, + "grad_norm": 41.03125, + "learning_rate": 9.952310106358634e-06, + "loss": 17.5984, + "step": 16880 + }, + { + "epoch": 0.3053966517569574, + "grad_norm": 40.59375, + "learning_rate": 9.952281854051975e-06, + "loss": 17.3033, + "step": 16890 + }, + { + "epoch": 0.3055774668260853, + "grad_norm": 43.0, + "learning_rate": 9.952253601745315e-06, + "loss": 17.1927, + "step": 16900 + }, + { + "epoch": 0.30575828189521315, + "grad_norm": 40.1875, + "learning_rate": 9.952225349438656e-06, + "loss": 17.7446, + "step": 16910 + }, + { + "epoch": 0.305939096964341, + "grad_norm": 42.34375, + "learning_rate": 9.952197097131997e-06, + "loss": 17.2445, + "step": 16920 + }, + { + "epoch": 0.3061199120334689, + "grad_norm": 43.65625, + "learning_rate": 9.952168844825336e-06, + "loss": 17.4012, + "step": 16930 + }, + { + "epoch": 0.3063007271025967, + "grad_norm": 41.5, + "learning_rate": 9.952140592518676e-06, + "loss": 17.2224, + "step": 16940 + }, + { + "epoch": 0.3064815421717246, + "grad_norm": 44.75, + "learning_rate": 9.952112340212019e-06, + "loss": 17.4899, + "step": 16950 + }, + { + "epoch": 0.30666235724085245, + "grad_norm": 41.03125, + "learning_rate": 9.952084087905357e-06, + "loss": 16.8993, + "step": 16960 + }, + { + "epoch": 0.3068431723099803, + "grad_norm": 41.4375, + "learning_rate": 9.952055835598698e-06, + "loss": 17.5821, + "step": 16970 + }, + { + "epoch": 0.3070239873791082, + "grad_norm": 40.40625, + "learning_rate": 9.952027583292039e-06, + "loss": 17.31, + "step": 16980 + }, + { + "epoch": 0.30720480244823606, + "grad_norm": 43.0, + "learning_rate": 9.95199933098538e-06, + "loss": 17.4335, + "step": 16990 + }, + { + "epoch": 0.3073856175173639, + "grad_norm": 44.5, + "learning_rate": 9.95197107867872e-06, + "loss": 17.3602, + "step": 17000 + }, + { + "epoch": 0.30756643258649174, + "grad_norm": 39.09375, + "learning_rate": 9.951942826372059e-06, + "loss": 16.9662, + "step": 17010 + }, + { + "epoch": 0.3077472476556196, + "grad_norm": 39.625, + "learning_rate": 9.9519145740654e-06, + "loss": 17.0458, + "step": 17020 + }, + { + "epoch": 0.3079280627247475, + "grad_norm": 41.0625, + "learning_rate": 9.95188632175874e-06, + "loss": 17.7801, + "step": 17030 + }, + { + "epoch": 0.30810887779387536, + "grad_norm": 43.1875, + "learning_rate": 9.951858069452082e-06, + "loss": 17.4092, + "step": 17040 + }, + { + "epoch": 0.30828969286300323, + "grad_norm": 42.03125, + "learning_rate": 9.951829817145421e-06, + "loss": 17.5656, + "step": 17050 + }, + { + "epoch": 0.30847050793213104, + "grad_norm": 41.375, + "learning_rate": 9.951801564838762e-06, + "loss": 17.5283, + "step": 17060 + }, + { + "epoch": 0.3086513230012589, + "grad_norm": 40.90625, + "learning_rate": 9.951773312532103e-06, + "loss": 17.3261, + "step": 17070 + }, + { + "epoch": 0.3088321380703868, + "grad_norm": 40.46875, + "learning_rate": 9.951745060225443e-06, + "loss": 17.403, + "step": 17080 + }, + { + "epoch": 0.30901295313951466, + "grad_norm": 39.71875, + "learning_rate": 9.951716807918784e-06, + "loss": 17.1059, + "step": 17090 + }, + { + "epoch": 0.3091937682086425, + "grad_norm": 42.0625, + "learning_rate": 9.951688555612123e-06, + "loss": 17.2102, + "step": 17100 + }, + { + "epoch": 0.30937458327777034, + "grad_norm": 40.6875, + "learning_rate": 9.951660303305463e-06, + "loss": 16.8268, + "step": 17110 + }, + { + "epoch": 0.3095553983468982, + "grad_norm": 40.71875, + "learning_rate": 9.951632050998806e-06, + "loss": 16.661, + "step": 17120 + }, + { + "epoch": 0.3097362134160261, + "grad_norm": 40.03125, + "learning_rate": 9.951603798692145e-06, + "loss": 17.2557, + "step": 17130 + }, + { + "epoch": 0.30991702848515396, + "grad_norm": 38.5, + "learning_rate": 9.951575546385485e-06, + "loss": 17.03, + "step": 17140 + }, + { + "epoch": 0.3100978435542818, + "grad_norm": 40.1875, + "learning_rate": 9.951547294078826e-06, + "loss": 16.9144, + "step": 17150 + }, + { + "epoch": 0.3102786586234097, + "grad_norm": 41.03125, + "learning_rate": 9.951519041772167e-06, + "loss": 16.892, + "step": 17160 + }, + { + "epoch": 0.3104594736925375, + "grad_norm": 43.9375, + "learning_rate": 9.951490789465507e-06, + "loss": 17.3718, + "step": 17170 + }, + { + "epoch": 0.3106402887616654, + "grad_norm": 43.625, + "learning_rate": 9.951462537158846e-06, + "loss": 16.9441, + "step": 17180 + }, + { + "epoch": 0.31082110383079325, + "grad_norm": 38.96875, + "learning_rate": 9.951434284852187e-06, + "loss": 16.8048, + "step": 17190 + }, + { + "epoch": 0.3110019188999211, + "grad_norm": 40.5, + "learning_rate": 9.951406032545527e-06, + "loss": 17.1232, + "step": 17200 + }, + { + "epoch": 0.311182733969049, + "grad_norm": 42.8125, + "learning_rate": 9.95137778023887e-06, + "loss": 16.9251, + "step": 17210 + }, + { + "epoch": 0.31136354903817687, + "grad_norm": 39.5, + "learning_rate": 9.951349527932209e-06, + "loss": 17.2344, + "step": 17220 + }, + { + "epoch": 0.3115443641073047, + "grad_norm": 42.46875, + "learning_rate": 9.95132127562555e-06, + "loss": 16.9758, + "step": 17230 + }, + { + "epoch": 0.31172517917643255, + "grad_norm": 41.28125, + "learning_rate": 9.95129302331889e-06, + "loss": 17.0863, + "step": 17240 + }, + { + "epoch": 0.3119059942455604, + "grad_norm": 36.375, + "learning_rate": 9.95126477101223e-06, + "loss": 17.3485, + "step": 17250 + }, + { + "epoch": 0.3120868093146883, + "grad_norm": 39.875, + "learning_rate": 9.951236518705571e-06, + "loss": 17.0591, + "step": 17260 + }, + { + "epoch": 0.31226762438381617, + "grad_norm": 44.5625, + "learning_rate": 9.95120826639891e-06, + "loss": 17.2459, + "step": 17270 + }, + { + "epoch": 0.312448439452944, + "grad_norm": 41.5625, + "learning_rate": 9.95118001409225e-06, + "loss": 17.2446, + "step": 17280 + }, + { + "epoch": 0.31262925452207185, + "grad_norm": 40.0, + "learning_rate": 9.951151761785591e-06, + "loss": 16.8575, + "step": 17290 + }, + { + "epoch": 0.3128100695911997, + "grad_norm": 39.875, + "learning_rate": 9.951123509478932e-06, + "loss": 17.4136, + "step": 17300 + }, + { + "epoch": 0.3129908846603276, + "grad_norm": 41.8125, + "learning_rate": 9.951095257172272e-06, + "loss": 17.2629, + "step": 17310 + }, + { + "epoch": 0.31317169972945547, + "grad_norm": 37.375, + "learning_rate": 9.951067004865613e-06, + "loss": 17.0746, + "step": 17320 + }, + { + "epoch": 0.31335251479858334, + "grad_norm": 40.84375, + "learning_rate": 9.951038752558954e-06, + "loss": 17.2394, + "step": 17330 + }, + { + "epoch": 0.31353332986771115, + "grad_norm": 41.53125, + "learning_rate": 9.951010500252294e-06, + "loss": 17.4258, + "step": 17340 + }, + { + "epoch": 0.313714144936839, + "grad_norm": 39.75, + "learning_rate": 9.950982247945635e-06, + "loss": 17.1237, + "step": 17350 + }, + { + "epoch": 0.3138949600059669, + "grad_norm": 38.8125, + "learning_rate": 9.950953995638974e-06, + "loss": 16.8779, + "step": 17360 + }, + { + "epoch": 0.31407577507509477, + "grad_norm": 38.59375, + "learning_rate": 9.950925743332315e-06, + "loss": 17.5397, + "step": 17370 + }, + { + "epoch": 0.31425659014422264, + "grad_norm": 41.75, + "learning_rate": 9.950897491025655e-06, + "loss": 17.3225, + "step": 17380 + }, + { + "epoch": 0.3144374052133505, + "grad_norm": 40.1875, + "learning_rate": 9.950869238718996e-06, + "loss": 17.1693, + "step": 17390 + }, + { + "epoch": 0.3146182202824783, + "grad_norm": 38.5625, + "learning_rate": 9.950840986412336e-06, + "loss": 17.3698, + "step": 17400 + }, + { + "epoch": 0.3147990353516062, + "grad_norm": 39.875, + "learning_rate": 9.950812734105677e-06, + "loss": 17.0733, + "step": 17410 + }, + { + "epoch": 0.31497985042073406, + "grad_norm": 39.46875, + "learning_rate": 9.950784481799018e-06, + "loss": 16.9899, + "step": 17420 + }, + { + "epoch": 0.31516066548986194, + "grad_norm": 40.0625, + "learning_rate": 9.950756229492358e-06, + "loss": 17.282, + "step": 17430 + }, + { + "epoch": 0.3153414805589898, + "grad_norm": 39.96875, + "learning_rate": 9.950727977185697e-06, + "loss": 17.2829, + "step": 17440 + }, + { + "epoch": 0.3155222956281176, + "grad_norm": 41.5625, + "learning_rate": 9.950699724879038e-06, + "loss": 17.432, + "step": 17450 + }, + { + "epoch": 0.3157031106972455, + "grad_norm": 36.0, + "learning_rate": 9.950671472572378e-06, + "loss": 17.3085, + "step": 17460 + }, + { + "epoch": 0.31588392576637336, + "grad_norm": 39.34375, + "learning_rate": 9.95064322026572e-06, + "loss": 17.0396, + "step": 17470 + }, + { + "epoch": 0.31606474083550123, + "grad_norm": 41.15625, + "learning_rate": 9.95061496795906e-06, + "loss": 17.3051, + "step": 17480 + }, + { + "epoch": 0.3162455559046291, + "grad_norm": 40.1875, + "learning_rate": 9.9505867156524e-06, + "loss": 17.087, + "step": 17490 + }, + { + "epoch": 0.316426370973757, + "grad_norm": 38.1875, + "learning_rate": 9.950558463345741e-06, + "loss": 17.0963, + "step": 17500 + }, + { + "epoch": 0.3166071860428848, + "grad_norm": 41.59375, + "learning_rate": 9.950530211039082e-06, + "loss": 17.3186, + "step": 17510 + }, + { + "epoch": 0.31678800111201266, + "grad_norm": 40.71875, + "learning_rate": 9.950501958732422e-06, + "loss": 17.0498, + "step": 17520 + }, + { + "epoch": 0.31696881618114053, + "grad_norm": 40.375, + "learning_rate": 9.950473706425761e-06, + "loss": 17.2796, + "step": 17530 + }, + { + "epoch": 0.3171496312502684, + "grad_norm": 40.5625, + "learning_rate": 9.950445454119102e-06, + "loss": 16.9422, + "step": 17540 + }, + { + "epoch": 0.3173304463193963, + "grad_norm": 40.4375, + "learning_rate": 9.950417201812442e-06, + "loss": 17.1075, + "step": 17550 + }, + { + "epoch": 0.31751126138852415, + "grad_norm": 41.0, + "learning_rate": 9.950388949505783e-06, + "loss": 17.3609, + "step": 17560 + }, + { + "epoch": 0.31769207645765196, + "grad_norm": 41.46875, + "learning_rate": 9.950360697199124e-06, + "loss": 17.1011, + "step": 17570 + }, + { + "epoch": 0.31787289152677983, + "grad_norm": 41.34375, + "learning_rate": 9.950332444892464e-06, + "loss": 17.5317, + "step": 17580 + }, + { + "epoch": 0.3180537065959077, + "grad_norm": 37.0, + "learning_rate": 9.950304192585805e-06, + "loss": 17.2281, + "step": 17590 + }, + { + "epoch": 0.3182345216650356, + "grad_norm": 41.28125, + "learning_rate": 9.950275940279145e-06, + "loss": 16.8669, + "step": 17600 + }, + { + "epoch": 0.31841533673416345, + "grad_norm": 42.1875, + "learning_rate": 9.950247687972484e-06, + "loss": 16.9876, + "step": 17610 + }, + { + "epoch": 0.31859615180329126, + "grad_norm": 41.3125, + "learning_rate": 9.950219435665825e-06, + "loss": 17.7563, + "step": 17620 + }, + { + "epoch": 0.31877696687241913, + "grad_norm": 39.59375, + "learning_rate": 9.950191183359166e-06, + "loss": 17.0608, + "step": 17630 + }, + { + "epoch": 0.318957781941547, + "grad_norm": 40.09375, + "learning_rate": 9.950162931052506e-06, + "loss": 17.4674, + "step": 17640 + }, + { + "epoch": 0.3191385970106749, + "grad_norm": 40.5, + "learning_rate": 9.950134678745847e-06, + "loss": 16.9903, + "step": 17650 + }, + { + "epoch": 0.31931941207980274, + "grad_norm": 41.28125, + "learning_rate": 9.950106426439187e-06, + "loss": 17.3048, + "step": 17660 + }, + { + "epoch": 0.3195002271489306, + "grad_norm": 39.25, + "learning_rate": 9.950078174132528e-06, + "loss": 17.2867, + "step": 17670 + }, + { + "epoch": 0.31968104221805843, + "grad_norm": 42.09375, + "learning_rate": 9.950049921825869e-06, + "loss": 17.6517, + "step": 17680 + }, + { + "epoch": 0.3198618572871863, + "grad_norm": 39.03125, + "learning_rate": 9.95002166951921e-06, + "loss": 17.5702, + "step": 17690 + }, + { + "epoch": 0.3200426723563142, + "grad_norm": 41.4375, + "learning_rate": 9.949993417212548e-06, + "loss": 17.4709, + "step": 17700 + }, + { + "epoch": 0.32022348742544204, + "grad_norm": 43.1875, + "learning_rate": 9.949965164905889e-06, + "loss": 17.372, + "step": 17710 + }, + { + "epoch": 0.3204043024945699, + "grad_norm": 40.0625, + "learning_rate": 9.94993691259923e-06, + "loss": 17.1449, + "step": 17720 + }, + { + "epoch": 0.3205851175636978, + "grad_norm": 39.875, + "learning_rate": 9.94990866029257e-06, + "loss": 16.9721, + "step": 17730 + }, + { + "epoch": 0.3207659326328256, + "grad_norm": 40.34375, + "learning_rate": 9.94988040798591e-06, + "loss": 17.058, + "step": 17740 + }, + { + "epoch": 0.32094674770195347, + "grad_norm": 40.59375, + "learning_rate": 9.949852155679251e-06, + "loss": 17.1496, + "step": 17750 + }, + { + "epoch": 0.32112756277108134, + "grad_norm": 41.4375, + "learning_rate": 9.949823903372592e-06, + "loss": 17.3002, + "step": 17760 + }, + { + "epoch": 0.3213083778402092, + "grad_norm": 41.03125, + "learning_rate": 9.949795651065933e-06, + "loss": 17.1647, + "step": 17770 + }, + { + "epoch": 0.3214891929093371, + "grad_norm": 41.625, + "learning_rate": 9.949767398759273e-06, + "loss": 17.1988, + "step": 17780 + }, + { + "epoch": 0.3216700079784649, + "grad_norm": 43.875, + "learning_rate": 9.949739146452612e-06, + "loss": 17.3825, + "step": 17790 + }, + { + "epoch": 0.32185082304759277, + "grad_norm": 41.4375, + "learning_rate": 9.949710894145953e-06, + "loss": 17.05, + "step": 17800 + }, + { + "epoch": 0.32203163811672064, + "grad_norm": 43.59375, + "learning_rate": 9.949682641839293e-06, + "loss": 17.0607, + "step": 17810 + }, + { + "epoch": 0.3222124531858485, + "grad_norm": 40.6875, + "learning_rate": 9.949654389532634e-06, + "loss": 16.5678, + "step": 17820 + }, + { + "epoch": 0.3223932682549764, + "grad_norm": 42.0, + "learning_rate": 9.949626137225975e-06, + "loss": 17.0074, + "step": 17830 + }, + { + "epoch": 0.32257408332410425, + "grad_norm": 39.1875, + "learning_rate": 9.949597884919315e-06, + "loss": 17.5844, + "step": 17840 + }, + { + "epoch": 0.32275489839323207, + "grad_norm": 39.3125, + "learning_rate": 9.949569632612656e-06, + "loss": 17.0044, + "step": 17850 + }, + { + "epoch": 0.32293571346235994, + "grad_norm": 40.375, + "learning_rate": 9.949541380305997e-06, + "loss": 17.272, + "step": 17860 + }, + { + "epoch": 0.3231165285314878, + "grad_norm": 41.75, + "learning_rate": 9.949513127999335e-06, + "loss": 17.2767, + "step": 17870 + }, + { + "epoch": 0.3232973436006157, + "grad_norm": 40.125, + "learning_rate": 9.949484875692676e-06, + "loss": 16.9778, + "step": 17880 + }, + { + "epoch": 0.32347815866974355, + "grad_norm": 40.84375, + "learning_rate": 9.949456623386017e-06, + "loss": 17.1859, + "step": 17890 + }, + { + "epoch": 0.3236589737388714, + "grad_norm": 41.6875, + "learning_rate": 9.949428371079357e-06, + "loss": 16.9518, + "step": 17900 + }, + { + "epoch": 0.32383978880799924, + "grad_norm": 43.09375, + "learning_rate": 9.949400118772698e-06, + "loss": 17.2181, + "step": 17910 + }, + { + "epoch": 0.3240206038771271, + "grad_norm": 40.8125, + "learning_rate": 9.949371866466039e-06, + "loss": 17.0839, + "step": 17920 + }, + { + "epoch": 0.324201418946255, + "grad_norm": 41.0, + "learning_rate": 9.94934361415938e-06, + "loss": 16.9002, + "step": 17930 + }, + { + "epoch": 0.32438223401538285, + "grad_norm": 37.875, + "learning_rate": 9.94931536185272e-06, + "loss": 17.3268, + "step": 17940 + }, + { + "epoch": 0.3245630490845107, + "grad_norm": 40.09375, + "learning_rate": 9.94928710954606e-06, + "loss": 17.4407, + "step": 17950 + }, + { + "epoch": 0.32474386415363854, + "grad_norm": 41.40625, + "learning_rate": 9.9492588572394e-06, + "loss": 17.2294, + "step": 17960 + }, + { + "epoch": 0.3249246792227664, + "grad_norm": 41.625, + "learning_rate": 9.94923060493274e-06, + "loss": 17.0065, + "step": 17970 + }, + { + "epoch": 0.3251054942918943, + "grad_norm": 41.28125, + "learning_rate": 9.94920235262608e-06, + "loss": 17.3333, + "step": 17980 + }, + { + "epoch": 0.32528630936102215, + "grad_norm": 41.09375, + "learning_rate": 9.949174100319421e-06, + "loss": 17.2219, + "step": 17990 + }, + { + "epoch": 0.32546712443015, + "grad_norm": 41.09375, + "learning_rate": 9.949145848012762e-06, + "loss": 17.4031, + "step": 18000 + }, + { + "epoch": 0.3256479394992779, + "grad_norm": 36.53125, + "learning_rate": 9.949117595706102e-06, + "loss": 17.2132, + "step": 18010 + }, + { + "epoch": 0.3258287545684057, + "grad_norm": 37.28125, + "learning_rate": 9.949089343399443e-06, + "loss": 17.2204, + "step": 18020 + }, + { + "epoch": 0.3260095696375336, + "grad_norm": 42.375, + "learning_rate": 9.949061091092784e-06, + "loss": 17.3199, + "step": 18030 + }, + { + "epoch": 0.32619038470666145, + "grad_norm": 38.5, + "learning_rate": 9.949032838786123e-06, + "loss": 17.6373, + "step": 18040 + }, + { + "epoch": 0.3263711997757893, + "grad_norm": 42.375, + "learning_rate": 9.949004586479463e-06, + "loss": 17.1415, + "step": 18050 + }, + { + "epoch": 0.3265520148449172, + "grad_norm": 41.0, + "learning_rate": 9.948976334172804e-06, + "loss": 17.1106, + "step": 18060 + }, + { + "epoch": 0.32673282991404506, + "grad_norm": 41.5625, + "learning_rate": 9.948948081866145e-06, + "loss": 17.5328, + "step": 18070 + }, + { + "epoch": 0.3269136449831729, + "grad_norm": 40.53125, + "learning_rate": 9.948919829559485e-06, + "loss": 17.2637, + "step": 18080 + }, + { + "epoch": 0.32709446005230075, + "grad_norm": 41.375, + "learning_rate": 9.948891577252826e-06, + "loss": 16.6566, + "step": 18090 + }, + { + "epoch": 0.3272752751214286, + "grad_norm": 41.40625, + "learning_rate": 9.948863324946166e-06, + "loss": 17.204, + "step": 18100 + }, + { + "epoch": 0.3274560901905565, + "grad_norm": 40.6875, + "learning_rate": 9.948835072639507e-06, + "loss": 17.4579, + "step": 18110 + }, + { + "epoch": 0.32763690525968436, + "grad_norm": 42.15625, + "learning_rate": 9.948806820332848e-06, + "loss": 17.6162, + "step": 18120 + }, + { + "epoch": 0.3278177203288122, + "grad_norm": 42.71875, + "learning_rate": 9.948778568026187e-06, + "loss": 17.0227, + "step": 18130 + }, + { + "epoch": 0.32799853539794005, + "grad_norm": 39.09375, + "learning_rate": 9.948750315719527e-06, + "loss": 17.1382, + "step": 18140 + }, + { + "epoch": 0.3281793504670679, + "grad_norm": 40.0625, + "learning_rate": 9.948722063412868e-06, + "loss": 17.3412, + "step": 18150 + }, + { + "epoch": 0.3283601655361958, + "grad_norm": 39.21875, + "learning_rate": 9.948693811106208e-06, + "loss": 17.1184, + "step": 18160 + }, + { + "epoch": 0.32854098060532366, + "grad_norm": 42.5, + "learning_rate": 9.948665558799549e-06, + "loss": 16.8094, + "step": 18170 + }, + { + "epoch": 0.32872179567445153, + "grad_norm": 39.5, + "learning_rate": 9.94863730649289e-06, + "loss": 16.9699, + "step": 18180 + }, + { + "epoch": 0.32890261074357935, + "grad_norm": 39.53125, + "learning_rate": 9.94860905418623e-06, + "loss": 17.0162, + "step": 18190 + }, + { + "epoch": 0.3290834258127072, + "grad_norm": 39.875, + "learning_rate": 9.948580801879571e-06, + "loss": 17.3723, + "step": 18200 + }, + { + "epoch": 0.3292642408818351, + "grad_norm": 39.78125, + "learning_rate": 9.948552549572912e-06, + "loss": 17.3077, + "step": 18210 + }, + { + "epoch": 0.32944505595096296, + "grad_norm": 41.4375, + "learning_rate": 9.94852429726625e-06, + "loss": 16.9637, + "step": 18220 + }, + { + "epoch": 0.32962587102009083, + "grad_norm": 40.96875, + "learning_rate": 9.948496044959591e-06, + "loss": 17.1387, + "step": 18230 + }, + { + "epoch": 0.32980668608921865, + "grad_norm": 40.8125, + "learning_rate": 9.948467792652932e-06, + "loss": 17.3337, + "step": 18240 + }, + { + "epoch": 0.3299875011583465, + "grad_norm": 44.0625, + "learning_rate": 9.948439540346272e-06, + "loss": 17.097, + "step": 18250 + }, + { + "epoch": 0.3301683162274744, + "grad_norm": 43.46875, + "learning_rate": 9.948411288039613e-06, + "loss": 17.0158, + "step": 18260 + }, + { + "epoch": 0.33034913129660226, + "grad_norm": 43.84375, + "learning_rate": 9.948383035732954e-06, + "loss": 17.2627, + "step": 18270 + }, + { + "epoch": 0.33052994636573013, + "grad_norm": 41.65625, + "learning_rate": 9.948354783426294e-06, + "loss": 16.8991, + "step": 18280 + }, + { + "epoch": 0.330710761434858, + "grad_norm": 40.9375, + "learning_rate": 9.948326531119635e-06, + "loss": 17.0619, + "step": 18290 + }, + { + "epoch": 0.3308915765039858, + "grad_norm": 42.15625, + "learning_rate": 9.948298278812974e-06, + "loss": 17.3769, + "step": 18300 + }, + { + "epoch": 0.3310723915731137, + "grad_norm": 43.15625, + "learning_rate": 9.948270026506314e-06, + "loss": 17.4126, + "step": 18310 + }, + { + "epoch": 0.33125320664224156, + "grad_norm": 39.71875, + "learning_rate": 9.948241774199655e-06, + "loss": 17.0779, + "step": 18320 + }, + { + "epoch": 0.33143402171136943, + "grad_norm": 39.1875, + "learning_rate": 9.948213521892996e-06, + "loss": 16.984, + "step": 18330 + }, + { + "epoch": 0.3316148367804973, + "grad_norm": 38.96875, + "learning_rate": 9.948185269586336e-06, + "loss": 16.7094, + "step": 18340 + }, + { + "epoch": 0.3317956518496252, + "grad_norm": 41.46875, + "learning_rate": 9.948157017279677e-06, + "loss": 17.0435, + "step": 18350 + }, + { + "epoch": 0.331976466918753, + "grad_norm": 39.96875, + "learning_rate": 9.948128764973017e-06, + "loss": 17.2927, + "step": 18360 + }, + { + "epoch": 0.33215728198788086, + "grad_norm": 43.46875, + "learning_rate": 9.948100512666358e-06, + "loss": 17.579, + "step": 18370 + }, + { + "epoch": 0.33233809705700873, + "grad_norm": 42.9375, + "learning_rate": 9.948072260359699e-06, + "loss": 17.1167, + "step": 18380 + }, + { + "epoch": 0.3325189121261366, + "grad_norm": 40.875, + "learning_rate": 9.948044008053038e-06, + "loss": 17.3039, + "step": 18390 + }, + { + "epoch": 0.33269972719526447, + "grad_norm": 42.15625, + "learning_rate": 9.948015755746378e-06, + "loss": 17.0618, + "step": 18400 + }, + { + "epoch": 0.3328805422643923, + "grad_norm": 38.0625, + "learning_rate": 9.947987503439719e-06, + "loss": 17.0005, + "step": 18410 + }, + { + "epoch": 0.33306135733352016, + "grad_norm": 39.90625, + "learning_rate": 9.94795925113306e-06, + "loss": 17.2511, + "step": 18420 + }, + { + "epoch": 0.33324217240264803, + "grad_norm": 37.9375, + "learning_rate": 9.9479309988264e-06, + "loss": 16.8679, + "step": 18430 + }, + { + "epoch": 0.3334229874717759, + "grad_norm": 39.0625, + "learning_rate": 9.94790274651974e-06, + "loss": 17.434, + "step": 18440 + }, + { + "epoch": 0.33360380254090377, + "grad_norm": 41.03125, + "learning_rate": 9.947874494213081e-06, + "loss": 17.287, + "step": 18450 + }, + { + "epoch": 0.33378461761003164, + "grad_norm": 37.5625, + "learning_rate": 9.947846241906422e-06, + "loss": 17.7589, + "step": 18460 + }, + { + "epoch": 0.33396543267915946, + "grad_norm": 41.09375, + "learning_rate": 9.947817989599761e-06, + "loss": 16.7847, + "step": 18470 + }, + { + "epoch": 0.33414624774828733, + "grad_norm": 41.3125, + "learning_rate": 9.947789737293102e-06, + "loss": 16.9083, + "step": 18480 + }, + { + "epoch": 0.3343270628174152, + "grad_norm": 39.8125, + "learning_rate": 9.947761484986442e-06, + "loss": 17.2558, + "step": 18490 + }, + { + "epoch": 0.33450787788654307, + "grad_norm": 39.75, + "learning_rate": 9.947733232679783e-06, + "loss": 16.9924, + "step": 18500 + }, + { + "epoch": 0.33468869295567094, + "grad_norm": 38.65625, + "learning_rate": 9.947704980373123e-06, + "loss": 17.0651, + "step": 18510 + }, + { + "epoch": 0.3348695080247988, + "grad_norm": 42.875, + "learning_rate": 9.947676728066464e-06, + "loss": 16.8699, + "step": 18520 + }, + { + "epoch": 0.3350503230939266, + "grad_norm": 37.375, + "learning_rate": 9.947648475759805e-06, + "loss": 17.208, + "step": 18530 + }, + { + "epoch": 0.3352311381630545, + "grad_norm": 40.4375, + "learning_rate": 9.947620223453145e-06, + "loss": 17.3355, + "step": 18540 + }, + { + "epoch": 0.33541195323218237, + "grad_norm": 40.6875, + "learning_rate": 9.947591971146486e-06, + "loss": 16.9956, + "step": 18550 + }, + { + "epoch": 0.33559276830131024, + "grad_norm": 42.53125, + "learning_rate": 9.947563718839825e-06, + "loss": 16.8433, + "step": 18560 + }, + { + "epoch": 0.3357735833704381, + "grad_norm": 38.96875, + "learning_rate": 9.947535466533165e-06, + "loss": 17.1173, + "step": 18570 + }, + { + "epoch": 0.3359543984395659, + "grad_norm": 41.4375, + "learning_rate": 9.947507214226506e-06, + "loss": 17.3725, + "step": 18580 + }, + { + "epoch": 0.3361352135086938, + "grad_norm": 38.9375, + "learning_rate": 9.947478961919847e-06, + "loss": 17.1542, + "step": 18590 + }, + { + "epoch": 0.33631602857782167, + "grad_norm": 43.0625, + "learning_rate": 9.947450709613187e-06, + "loss": 17.6042, + "step": 18600 + }, + { + "epoch": 0.33649684364694954, + "grad_norm": 43.5, + "learning_rate": 9.947422457306528e-06, + "loss": 17.1111, + "step": 18610 + }, + { + "epoch": 0.3366776587160774, + "grad_norm": 39.6875, + "learning_rate": 9.947394204999869e-06, + "loss": 16.7568, + "step": 18620 + }, + { + "epoch": 0.3368584737852053, + "grad_norm": 41.0625, + "learning_rate": 9.94736595269321e-06, + "loss": 16.7065, + "step": 18630 + }, + { + "epoch": 0.3370392888543331, + "grad_norm": 39.90625, + "learning_rate": 9.94733770038655e-06, + "loss": 16.9225, + "step": 18640 + }, + { + "epoch": 0.33722010392346097, + "grad_norm": 43.15625, + "learning_rate": 9.947309448079889e-06, + "loss": 17.0526, + "step": 18650 + }, + { + "epoch": 0.33740091899258884, + "grad_norm": 38.625, + "learning_rate": 9.94728119577323e-06, + "loss": 17.6703, + "step": 18660 + }, + { + "epoch": 0.3375817340617167, + "grad_norm": 40.3125, + "learning_rate": 9.94725294346657e-06, + "loss": 17.1176, + "step": 18670 + }, + { + "epoch": 0.3377625491308446, + "grad_norm": 41.1875, + "learning_rate": 9.94722469115991e-06, + "loss": 17.3024, + "step": 18680 + }, + { + "epoch": 0.33794336419997245, + "grad_norm": 40.25, + "learning_rate": 9.947196438853251e-06, + "loss": 17.4143, + "step": 18690 + }, + { + "epoch": 0.33812417926910027, + "grad_norm": 38.5, + "learning_rate": 9.947168186546592e-06, + "loss": 17.0131, + "step": 18700 + }, + { + "epoch": 0.33830499433822814, + "grad_norm": 40.5625, + "learning_rate": 9.947139934239932e-06, + "loss": 17.4532, + "step": 18710 + }, + { + "epoch": 0.338485809407356, + "grad_norm": 42.84375, + "learning_rate": 9.947111681933273e-06, + "loss": 17.7063, + "step": 18720 + }, + { + "epoch": 0.3386666244764839, + "grad_norm": 44.6875, + "learning_rate": 9.947083429626612e-06, + "loss": 17.6791, + "step": 18730 + }, + { + "epoch": 0.33884743954561175, + "grad_norm": 37.96875, + "learning_rate": 9.947055177319953e-06, + "loss": 17.2725, + "step": 18740 + }, + { + "epoch": 0.33902825461473957, + "grad_norm": 41.34375, + "learning_rate": 9.947026925013293e-06, + "loss": 17.5307, + "step": 18750 + }, + { + "epoch": 0.33920906968386744, + "grad_norm": 38.25, + "learning_rate": 9.946998672706634e-06, + "loss": 17.2355, + "step": 18760 + }, + { + "epoch": 0.3393898847529953, + "grad_norm": 40.34375, + "learning_rate": 9.946970420399975e-06, + "loss": 17.0688, + "step": 18770 + }, + { + "epoch": 0.3395706998221232, + "grad_norm": 40.53125, + "learning_rate": 9.946942168093313e-06, + "loss": 17.016, + "step": 18780 + }, + { + "epoch": 0.33975151489125105, + "grad_norm": 41.125, + "learning_rate": 9.946913915786656e-06, + "loss": 17.0653, + "step": 18790 + }, + { + "epoch": 0.3399323299603789, + "grad_norm": 40.71875, + "learning_rate": 9.946885663479996e-06, + "loss": 17.0459, + "step": 18800 + }, + { + "epoch": 0.34011314502950674, + "grad_norm": 41.03125, + "learning_rate": 9.946857411173337e-06, + "loss": 17.3077, + "step": 18810 + }, + { + "epoch": 0.3402939600986346, + "grad_norm": 41.78125, + "learning_rate": 9.946829158866676e-06, + "loss": 17.0437, + "step": 18820 + }, + { + "epoch": 0.3404747751677625, + "grad_norm": 40.25, + "learning_rate": 9.946800906560017e-06, + "loss": 17.9247, + "step": 18830 + }, + { + "epoch": 0.34065559023689035, + "grad_norm": 41.625, + "learning_rate": 9.946772654253357e-06, + "loss": 17.3083, + "step": 18840 + }, + { + "epoch": 0.3408364053060182, + "grad_norm": 42.3125, + "learning_rate": 9.946744401946698e-06, + "loss": 16.8552, + "step": 18850 + }, + { + "epoch": 0.3410172203751461, + "grad_norm": 42.875, + "learning_rate": 9.946716149640038e-06, + "loss": 17.5379, + "step": 18860 + }, + { + "epoch": 0.3411980354442739, + "grad_norm": 38.0625, + "learning_rate": 9.946687897333379e-06, + "loss": 17.1698, + "step": 18870 + }, + { + "epoch": 0.3413788505134018, + "grad_norm": 41.53125, + "learning_rate": 9.94665964502672e-06, + "loss": 17.2349, + "step": 18880 + }, + { + "epoch": 0.34155966558252965, + "grad_norm": 39.84375, + "learning_rate": 9.94663139272006e-06, + "loss": 17.123, + "step": 18890 + }, + { + "epoch": 0.3417404806516575, + "grad_norm": 43.28125, + "learning_rate": 9.9466031404134e-06, + "loss": 17.0436, + "step": 18900 + }, + { + "epoch": 0.3419212957207854, + "grad_norm": 41.40625, + "learning_rate": 9.94657488810674e-06, + "loss": 17.3219, + "step": 18910 + }, + { + "epoch": 0.3421021107899132, + "grad_norm": 39.21875, + "learning_rate": 9.94654663580008e-06, + "loss": 17.252, + "step": 18920 + }, + { + "epoch": 0.3422829258590411, + "grad_norm": 41.0625, + "learning_rate": 9.946518383493421e-06, + "loss": 17.3506, + "step": 18930 + }, + { + "epoch": 0.34246374092816895, + "grad_norm": 39.78125, + "learning_rate": 9.946490131186762e-06, + "loss": 17.1539, + "step": 18940 + }, + { + "epoch": 0.3426445559972968, + "grad_norm": 38.84375, + "learning_rate": 9.946461878880102e-06, + "loss": 17.2454, + "step": 18950 + }, + { + "epoch": 0.3428253710664247, + "grad_norm": 43.6875, + "learning_rate": 9.946433626573443e-06, + "loss": 17.1604, + "step": 18960 + }, + { + "epoch": 0.34300618613555256, + "grad_norm": 40.90625, + "learning_rate": 9.946405374266784e-06, + "loss": 17.216, + "step": 18970 + }, + { + "epoch": 0.3431870012046804, + "grad_norm": 39.8125, + "learning_rate": 9.946377121960124e-06, + "loss": 17.0927, + "step": 18980 + }, + { + "epoch": 0.34336781627380825, + "grad_norm": 39.65625, + "learning_rate": 9.946348869653463e-06, + "loss": 17.1234, + "step": 18990 + }, + { + "epoch": 0.3435486313429361, + "grad_norm": 40.90625, + "learning_rate": 9.946320617346804e-06, + "loss": 17.0432, + "step": 19000 + }, + { + "epoch": 0.343729446412064, + "grad_norm": 39.75, + "learning_rate": 9.946292365040144e-06, + "loss": 17.5622, + "step": 19010 + }, + { + "epoch": 0.34391026148119186, + "grad_norm": 41.21875, + "learning_rate": 9.946264112733485e-06, + "loss": 17.1439, + "step": 19020 + }, + { + "epoch": 0.34409107655031973, + "grad_norm": 42.125, + "learning_rate": 9.946235860426826e-06, + "loss": 16.7243, + "step": 19030 + }, + { + "epoch": 0.34427189161944755, + "grad_norm": 36.53125, + "learning_rate": 9.946207608120165e-06, + "loss": 17.1458, + "step": 19040 + }, + { + "epoch": 0.3444527066885754, + "grad_norm": 41.25, + "learning_rate": 9.946179355813507e-06, + "loss": 17.2407, + "step": 19050 + }, + { + "epoch": 0.3446335217577033, + "grad_norm": 39.125, + "learning_rate": 9.946151103506848e-06, + "loss": 17.1062, + "step": 19060 + }, + { + "epoch": 0.34481433682683116, + "grad_norm": 38.71875, + "learning_rate": 9.946122851200188e-06, + "loss": 17.1634, + "step": 19070 + }, + { + "epoch": 0.34499515189595903, + "grad_norm": 41.125, + "learning_rate": 9.946094598893527e-06, + "loss": 17.13, + "step": 19080 + }, + { + "epoch": 0.34517596696508684, + "grad_norm": 38.65625, + "learning_rate": 9.946066346586868e-06, + "loss": 17.0695, + "step": 19090 + }, + { + "epoch": 0.3453567820342147, + "grad_norm": 39.625, + "learning_rate": 9.946038094280208e-06, + "loss": 16.5874, + "step": 19100 + }, + { + "epoch": 0.3455375971033426, + "grad_norm": 42.8125, + "learning_rate": 9.946009841973549e-06, + "loss": 17.1617, + "step": 19110 + }, + { + "epoch": 0.34571841217247046, + "grad_norm": 41.1875, + "learning_rate": 9.94598158966689e-06, + "loss": 17.3538, + "step": 19120 + }, + { + "epoch": 0.34589922724159833, + "grad_norm": 40.6875, + "learning_rate": 9.945953337360228e-06, + "loss": 17.2054, + "step": 19130 + }, + { + "epoch": 0.3460800423107262, + "grad_norm": 42.25, + "learning_rate": 9.94592508505357e-06, + "loss": 17.5536, + "step": 19140 + }, + { + "epoch": 0.346260857379854, + "grad_norm": 43.9375, + "learning_rate": 9.945896832746911e-06, + "loss": 17.1485, + "step": 19150 + }, + { + "epoch": 0.3464416724489819, + "grad_norm": 43.6875, + "learning_rate": 9.94586858044025e-06, + "loss": 17.3381, + "step": 19160 + }, + { + "epoch": 0.34662248751810976, + "grad_norm": 39.53125, + "learning_rate": 9.945840328133591e-06, + "loss": 16.9011, + "step": 19170 + }, + { + "epoch": 0.3468033025872376, + "grad_norm": 40.90625, + "learning_rate": 9.945812075826932e-06, + "loss": 16.9682, + "step": 19180 + }, + { + "epoch": 0.3469841176563655, + "grad_norm": 39.65625, + "learning_rate": 9.945783823520272e-06, + "loss": 16.9983, + "step": 19190 + }, + { + "epoch": 0.34716493272549337, + "grad_norm": 43.4375, + "learning_rate": 9.945755571213613e-06, + "loss": 17.5587, + "step": 19200 + }, + { + "epoch": 0.3473457477946212, + "grad_norm": 41.8125, + "learning_rate": 9.945727318906952e-06, + "loss": 16.8602, + "step": 19210 + }, + { + "epoch": 0.34752656286374906, + "grad_norm": 41.96875, + "learning_rate": 9.945699066600294e-06, + "loss": 17.4146, + "step": 19220 + }, + { + "epoch": 0.3477073779328769, + "grad_norm": 42.375, + "learning_rate": 9.945670814293635e-06, + "loss": 16.7323, + "step": 19230 + }, + { + "epoch": 0.3478881930020048, + "grad_norm": 38.6875, + "learning_rate": 9.945642561986975e-06, + "loss": 17.0783, + "step": 19240 + }, + { + "epoch": 0.34806900807113267, + "grad_norm": 39.40625, + "learning_rate": 9.945614309680314e-06, + "loss": 16.9396, + "step": 19250 + }, + { + "epoch": 0.3482498231402605, + "grad_norm": 40.78125, + "learning_rate": 9.945586057373655e-06, + "loss": 17.1608, + "step": 19260 + }, + { + "epoch": 0.34843063820938835, + "grad_norm": 39.21875, + "learning_rate": 9.945557805066995e-06, + "loss": 17.3588, + "step": 19270 + }, + { + "epoch": 0.3486114532785162, + "grad_norm": 39.15625, + "learning_rate": 9.945529552760336e-06, + "loss": 17.2244, + "step": 19280 + }, + { + "epoch": 0.3487922683476441, + "grad_norm": 43.6875, + "learning_rate": 9.945501300453677e-06, + "loss": 17.6382, + "step": 19290 + }, + { + "epoch": 0.34897308341677197, + "grad_norm": 41.21875, + "learning_rate": 9.945473048147016e-06, + "loss": 17.0822, + "step": 19300 + }, + { + "epoch": 0.34915389848589984, + "grad_norm": 40.03125, + "learning_rate": 9.945444795840358e-06, + "loss": 17.418, + "step": 19310 + }, + { + "epoch": 0.34933471355502765, + "grad_norm": 44.03125, + "learning_rate": 9.945416543533699e-06, + "loss": 17.2986, + "step": 19320 + }, + { + "epoch": 0.3495155286241555, + "grad_norm": 41.28125, + "learning_rate": 9.945388291227038e-06, + "loss": 16.9621, + "step": 19330 + }, + { + "epoch": 0.3496963436932834, + "grad_norm": 41.5, + "learning_rate": 9.945360038920378e-06, + "loss": 17.1927, + "step": 19340 + }, + { + "epoch": 0.34987715876241127, + "grad_norm": 40.75, + "learning_rate": 9.945331786613719e-06, + "loss": 17.5507, + "step": 19350 + }, + { + "epoch": 0.35005797383153914, + "grad_norm": 41.375, + "learning_rate": 9.94530353430706e-06, + "loss": 17.3923, + "step": 19360 + }, + { + "epoch": 0.350238788900667, + "grad_norm": 40.6875, + "learning_rate": 9.9452752820004e-06, + "loss": 17.129, + "step": 19370 + }, + { + "epoch": 0.3504196039697948, + "grad_norm": 39.375, + "learning_rate": 9.945247029693739e-06, + "loss": 17.591, + "step": 19380 + }, + { + "epoch": 0.3506004190389227, + "grad_norm": 40.03125, + "learning_rate": 9.94521877738708e-06, + "loss": 17.3433, + "step": 19390 + }, + { + "epoch": 0.35078123410805057, + "grad_norm": 38.09375, + "learning_rate": 9.945190525080422e-06, + "loss": 17.116, + "step": 19400 + }, + { + "epoch": 0.35096204917717844, + "grad_norm": 41.65625, + "learning_rate": 9.945162272773763e-06, + "loss": 17.407, + "step": 19410 + }, + { + "epoch": 0.3511428642463063, + "grad_norm": 38.21875, + "learning_rate": 9.945134020467101e-06, + "loss": 16.9265, + "step": 19420 + }, + { + "epoch": 0.3513236793154341, + "grad_norm": 41.8125, + "learning_rate": 9.945105768160442e-06, + "loss": 16.8851, + "step": 19430 + }, + { + "epoch": 0.351504494384562, + "grad_norm": 39.5, + "learning_rate": 9.945077515853783e-06, + "loss": 17.0815, + "step": 19440 + }, + { + "epoch": 0.35168530945368986, + "grad_norm": 39.78125, + "learning_rate": 9.945049263547123e-06, + "loss": 17.43, + "step": 19450 + }, + { + "epoch": 0.35186612452281774, + "grad_norm": 40.84375, + "learning_rate": 9.945021011240464e-06, + "loss": 16.987, + "step": 19460 + }, + { + "epoch": 0.3520469395919456, + "grad_norm": 40.53125, + "learning_rate": 9.944992758933803e-06, + "loss": 16.6883, + "step": 19470 + }, + { + "epoch": 0.3522277546610735, + "grad_norm": 41.8125, + "learning_rate": 9.944964506627143e-06, + "loss": 17.3894, + "step": 19480 + }, + { + "epoch": 0.3524085697302013, + "grad_norm": 42.0625, + "learning_rate": 9.944936254320486e-06, + "loss": 17.1256, + "step": 19490 + }, + { + "epoch": 0.35258938479932916, + "grad_norm": 41.34375, + "learning_rate": 9.944908002013825e-06, + "loss": 17.2746, + "step": 19500 + }, + { + "epoch": 0.35277019986845704, + "grad_norm": 39.40625, + "learning_rate": 9.944879749707165e-06, + "loss": 17.3932, + "step": 19510 + }, + { + "epoch": 0.3529510149375849, + "grad_norm": 38.0625, + "learning_rate": 9.944851497400506e-06, + "loss": 16.6536, + "step": 19520 + }, + { + "epoch": 0.3531318300067128, + "grad_norm": 41.6875, + "learning_rate": 9.944823245093847e-06, + "loss": 17.1047, + "step": 19530 + }, + { + "epoch": 0.35331264507584065, + "grad_norm": 40.21875, + "learning_rate": 9.944794992787187e-06, + "loss": 17.5294, + "step": 19540 + }, + { + "epoch": 0.35349346014496846, + "grad_norm": 43.1875, + "learning_rate": 9.944766740480528e-06, + "loss": 16.927, + "step": 19550 + }, + { + "epoch": 0.35367427521409633, + "grad_norm": 40.84375, + "learning_rate": 9.944738488173867e-06, + "loss": 17.1587, + "step": 19560 + }, + { + "epoch": 0.3538550902832242, + "grad_norm": 41.8125, + "learning_rate": 9.944710235867209e-06, + "loss": 17.4372, + "step": 19570 + }, + { + "epoch": 0.3540359053523521, + "grad_norm": 41.0, + "learning_rate": 9.94468198356055e-06, + "loss": 17.637, + "step": 19580 + }, + { + "epoch": 0.35421672042147995, + "grad_norm": 42.125, + "learning_rate": 9.944653731253889e-06, + "loss": 17.1499, + "step": 19590 + }, + { + "epoch": 0.35439753549060776, + "grad_norm": 37.25, + "learning_rate": 9.94462547894723e-06, + "loss": 17.0127, + "step": 19600 + }, + { + "epoch": 0.35457835055973563, + "grad_norm": 39.53125, + "learning_rate": 9.94459722664057e-06, + "loss": 17.0926, + "step": 19610 + }, + { + "epoch": 0.3547591656288635, + "grad_norm": 40.1875, + "learning_rate": 9.94456897433391e-06, + "loss": 17.227, + "step": 19620 + }, + { + "epoch": 0.3549399806979914, + "grad_norm": 44.28125, + "learning_rate": 9.944540722027251e-06, + "loss": 16.7545, + "step": 19630 + }, + { + "epoch": 0.35512079576711925, + "grad_norm": 40.625, + "learning_rate": 9.94451246972059e-06, + "loss": 17.1527, + "step": 19640 + }, + { + "epoch": 0.3553016108362471, + "grad_norm": 39.5, + "learning_rate": 9.94448421741393e-06, + "loss": 17.1649, + "step": 19650 + }, + { + "epoch": 0.35548242590537493, + "grad_norm": 39.9375, + "learning_rate": 9.944455965107273e-06, + "loss": 17.5156, + "step": 19660 + }, + { + "epoch": 0.3556632409745028, + "grad_norm": 40.84375, + "learning_rate": 9.944427712800614e-06, + "loss": 16.9445, + "step": 19670 + }, + { + "epoch": 0.3558440560436307, + "grad_norm": 42.1875, + "learning_rate": 9.944399460493953e-06, + "loss": 16.997, + "step": 19680 + }, + { + "epoch": 0.35602487111275855, + "grad_norm": 40.15625, + "learning_rate": 9.944371208187293e-06, + "loss": 17.3185, + "step": 19690 + }, + { + "epoch": 0.3562056861818864, + "grad_norm": 39.0625, + "learning_rate": 9.944342955880634e-06, + "loss": 17.1195, + "step": 19700 + }, + { + "epoch": 0.3563865012510143, + "grad_norm": 40.53125, + "learning_rate": 9.944314703573974e-06, + "loss": 17.2379, + "step": 19710 + }, + { + "epoch": 0.3565673163201421, + "grad_norm": 39.78125, + "learning_rate": 9.944286451267315e-06, + "loss": 17.1858, + "step": 19720 + }, + { + "epoch": 0.35674813138927, + "grad_norm": 39.84375, + "learning_rate": 9.944258198960654e-06, + "loss": 17.3016, + "step": 19730 + }, + { + "epoch": 0.35692894645839784, + "grad_norm": 43.34375, + "learning_rate": 9.944229946653995e-06, + "loss": 17.0505, + "step": 19740 + }, + { + "epoch": 0.3571097615275257, + "grad_norm": 41.46875, + "learning_rate": 9.944201694347337e-06, + "loss": 16.6689, + "step": 19750 + }, + { + "epoch": 0.3572905765966536, + "grad_norm": 43.46875, + "learning_rate": 9.944173442040676e-06, + "loss": 16.7929, + "step": 19760 + }, + { + "epoch": 0.3574713916657814, + "grad_norm": 41.0625, + "learning_rate": 9.944145189734016e-06, + "loss": 17.0547, + "step": 19770 + }, + { + "epoch": 0.3576522067349093, + "grad_norm": 39.34375, + "learning_rate": 9.944116937427357e-06, + "loss": 17.0405, + "step": 19780 + }, + { + "epoch": 0.35783302180403714, + "grad_norm": 41.96875, + "learning_rate": 9.944088685120698e-06, + "loss": 17.3822, + "step": 19790 + }, + { + "epoch": 0.358013836873165, + "grad_norm": 43.125, + "learning_rate": 9.944060432814038e-06, + "loss": 17.0451, + "step": 19800 + }, + { + "epoch": 0.3581946519422929, + "grad_norm": 39.9375, + "learning_rate": 9.944032180507377e-06, + "loss": 17.159, + "step": 19810 + }, + { + "epoch": 0.35837546701142076, + "grad_norm": 42.09375, + "learning_rate": 9.944003928200718e-06, + "loss": 16.8105, + "step": 19820 + }, + { + "epoch": 0.35855628208054857, + "grad_norm": 40.46875, + "learning_rate": 9.943975675894058e-06, + "loss": 17.3335, + "step": 19830 + }, + { + "epoch": 0.35873709714967644, + "grad_norm": 39.25, + "learning_rate": 9.9439474235874e-06, + "loss": 17.1033, + "step": 19840 + }, + { + "epoch": 0.3589179122188043, + "grad_norm": 38.1875, + "learning_rate": 9.94391917128074e-06, + "loss": 17.2213, + "step": 19850 + }, + { + "epoch": 0.3590987272879322, + "grad_norm": 42.09375, + "learning_rate": 9.94389091897408e-06, + "loss": 16.6604, + "step": 19860 + }, + { + "epoch": 0.35927954235706006, + "grad_norm": 44.65625, + "learning_rate": 9.943862666667421e-06, + "loss": 17.556, + "step": 19870 + }, + { + "epoch": 0.35946035742618787, + "grad_norm": 40.1875, + "learning_rate": 9.943834414360762e-06, + "loss": 16.9048, + "step": 19880 + }, + { + "epoch": 0.35964117249531574, + "grad_norm": 41.625, + "learning_rate": 9.943806162054102e-06, + "loss": 17.0686, + "step": 19890 + }, + { + "epoch": 0.3598219875644436, + "grad_norm": 42.21875, + "learning_rate": 9.943777909747441e-06, + "loss": 17.6033, + "step": 19900 + }, + { + "epoch": 0.3600028026335715, + "grad_norm": 40.90625, + "learning_rate": 9.943749657440782e-06, + "loss": 16.8213, + "step": 19910 + }, + { + "epoch": 0.36018361770269935, + "grad_norm": 43.375, + "learning_rate": 9.943721405134124e-06, + "loss": 17.6674, + "step": 19920 + }, + { + "epoch": 0.3603644327718272, + "grad_norm": 39.0, + "learning_rate": 9.943693152827463e-06, + "loss": 17.1387, + "step": 19930 + }, + { + "epoch": 0.36054524784095504, + "grad_norm": 40.21875, + "learning_rate": 9.943664900520804e-06, + "loss": 17.6029, + "step": 19940 + }, + { + "epoch": 0.3607260629100829, + "grad_norm": 40.5, + "learning_rate": 9.943636648214144e-06, + "loss": 17.2961, + "step": 19950 + }, + { + "epoch": 0.3609068779792108, + "grad_norm": 42.78125, + "learning_rate": 9.943608395907485e-06, + "loss": 17.1464, + "step": 19960 + }, + { + "epoch": 0.36108769304833865, + "grad_norm": 41.625, + "learning_rate": 9.943580143600825e-06, + "loss": 17.2045, + "step": 19970 + }, + { + "epoch": 0.3612685081174665, + "grad_norm": 39.4375, + "learning_rate": 9.943551891294166e-06, + "loss": 17.0232, + "step": 19980 + }, + { + "epoch": 0.3614493231865944, + "grad_norm": 42.5625, + "learning_rate": 9.943523638987505e-06, + "loss": 16.7419, + "step": 19990 + }, + { + "epoch": 0.3616301382557222, + "grad_norm": 39.1875, + "learning_rate": 9.943495386680846e-06, + "loss": 17.3945, + "step": 20000 + }, + { + "epoch": 0.3616301382557222, + "eval_loss": 2.1501667499542236, + "eval_runtime": 228.7367, + "eval_samples_per_second": 3174.213, + "eval_steps_per_second": 49.599, + "step": 20000 + }, + { + "epoch": 0.3618109533248501, + "grad_norm": 40.1875, + "learning_rate": 9.943467134374188e-06, + "loss": 17.2655, + "step": 20010 + }, + { + "epoch": 0.36199176839397795, + "grad_norm": 40.21875, + "learning_rate": 9.943438882067527e-06, + "loss": 17.4808, + "step": 20020 + }, + { + "epoch": 0.3621725834631058, + "grad_norm": 42.1875, + "learning_rate": 9.943410629760868e-06, + "loss": 17.0917, + "step": 20030 + }, + { + "epoch": 0.3623533985322337, + "grad_norm": 41.71875, + "learning_rate": 9.943382377454208e-06, + "loss": 16.8377, + "step": 20040 + }, + { + "epoch": 0.3625342136013615, + "grad_norm": 44.3125, + "learning_rate": 9.943354125147549e-06, + "loss": 17.6013, + "step": 20050 + }, + { + "epoch": 0.3627150286704894, + "grad_norm": 41.59375, + "learning_rate": 9.94332587284089e-06, + "loss": 17.57, + "step": 20060 + }, + { + "epoch": 0.36289584373961725, + "grad_norm": 41.09375, + "learning_rate": 9.943297620534228e-06, + "loss": 17.0332, + "step": 20070 + }, + { + "epoch": 0.3630766588087451, + "grad_norm": 40.9375, + "learning_rate": 9.943269368227569e-06, + "loss": 17.2701, + "step": 20080 + }, + { + "epoch": 0.363257473877873, + "grad_norm": 40.21875, + "learning_rate": 9.94324111592091e-06, + "loss": 17.0229, + "step": 20090 + }, + { + "epoch": 0.36343828894700086, + "grad_norm": 42.28125, + "learning_rate": 9.943212863614252e-06, + "loss": 17.3704, + "step": 20100 + }, + { + "epoch": 0.3636191040161287, + "grad_norm": 42.84375, + "learning_rate": 9.94318461130759e-06, + "loss": 16.8609, + "step": 20110 + }, + { + "epoch": 0.36379991908525655, + "grad_norm": 43.625, + "learning_rate": 9.943156359000931e-06, + "loss": 17.259, + "step": 20120 + }, + { + "epoch": 0.3639807341543844, + "grad_norm": 41.4375, + "learning_rate": 9.943128106694272e-06, + "loss": 17.0671, + "step": 20130 + }, + { + "epoch": 0.3641615492235123, + "grad_norm": 40.78125, + "learning_rate": 9.943099854387613e-06, + "loss": 17.6245, + "step": 20140 + }, + { + "epoch": 0.36434236429264016, + "grad_norm": 43.84375, + "learning_rate": 9.943071602080953e-06, + "loss": 17.1875, + "step": 20150 + }, + { + "epoch": 0.36452317936176803, + "grad_norm": 44.40625, + "learning_rate": 9.943043349774292e-06, + "loss": 17.0023, + "step": 20160 + }, + { + "epoch": 0.36470399443089585, + "grad_norm": 40.0, + "learning_rate": 9.943015097467633e-06, + "loss": 16.8232, + "step": 20170 + }, + { + "epoch": 0.3648848095000237, + "grad_norm": 38.3125, + "learning_rate": 9.942986845160973e-06, + "loss": 17.224, + "step": 20180 + }, + { + "epoch": 0.3650656245691516, + "grad_norm": 41.9375, + "learning_rate": 9.942958592854314e-06, + "loss": 17.2585, + "step": 20190 + }, + { + "epoch": 0.36524643963827946, + "grad_norm": 41.1875, + "learning_rate": 9.942930340547655e-06, + "loss": 17.0163, + "step": 20200 + }, + { + "epoch": 0.36542725470740733, + "grad_norm": 44.5, + "learning_rate": 9.942902088240995e-06, + "loss": 16.949, + "step": 20210 + }, + { + "epoch": 0.36560806977653515, + "grad_norm": 37.96875, + "learning_rate": 9.942873835934336e-06, + "loss": 17.4766, + "step": 20220 + }, + { + "epoch": 0.365788884845663, + "grad_norm": 40.4375, + "learning_rate": 9.942845583627677e-06, + "loss": 17.3368, + "step": 20230 + }, + { + "epoch": 0.3659696999147909, + "grad_norm": 43.8125, + "learning_rate": 9.942817331321016e-06, + "loss": 16.8237, + "step": 20240 + }, + { + "epoch": 0.36615051498391876, + "grad_norm": 40.125, + "learning_rate": 9.942789079014356e-06, + "loss": 17.1638, + "step": 20250 + }, + { + "epoch": 0.36633133005304663, + "grad_norm": 41.03125, + "learning_rate": 9.942760826707697e-06, + "loss": 17.4119, + "step": 20260 + }, + { + "epoch": 0.3665121451221745, + "grad_norm": 39.5, + "learning_rate": 9.942732574401039e-06, + "loss": 16.9666, + "step": 20270 + }, + { + "epoch": 0.3666929601913023, + "grad_norm": 38.9375, + "learning_rate": 9.942704322094378e-06, + "loss": 17.1098, + "step": 20280 + }, + { + "epoch": 0.3668737752604302, + "grad_norm": 39.71875, + "learning_rate": 9.942676069787719e-06, + "loss": 17.1769, + "step": 20290 + }, + { + "epoch": 0.36705459032955806, + "grad_norm": 39.3125, + "learning_rate": 9.94264781748106e-06, + "loss": 16.8182, + "step": 20300 + }, + { + "epoch": 0.36723540539868593, + "grad_norm": 40.84375, + "learning_rate": 9.9426195651744e-06, + "loss": 17.4049, + "step": 20310 + }, + { + "epoch": 0.3674162204678138, + "grad_norm": 42.21875, + "learning_rate": 9.94259131286774e-06, + "loss": 17.2833, + "step": 20320 + }, + { + "epoch": 0.3675970355369417, + "grad_norm": 38.71875, + "learning_rate": 9.94256306056108e-06, + "loss": 17.001, + "step": 20330 + }, + { + "epoch": 0.3677778506060695, + "grad_norm": 40.03125, + "learning_rate": 9.94253480825442e-06, + "loss": 17.0131, + "step": 20340 + }, + { + "epoch": 0.36795866567519736, + "grad_norm": 40.53125, + "learning_rate": 9.94250655594776e-06, + "loss": 16.411, + "step": 20350 + }, + { + "epoch": 0.36813948074432523, + "grad_norm": 41.875, + "learning_rate": 9.942478303641101e-06, + "loss": 17.2263, + "step": 20360 + }, + { + "epoch": 0.3683202958134531, + "grad_norm": 42.0625, + "learning_rate": 9.942450051334442e-06, + "loss": 16.7313, + "step": 20370 + }, + { + "epoch": 0.368501110882581, + "grad_norm": 40.9375, + "learning_rate": 9.942421799027783e-06, + "loss": 16.9421, + "step": 20380 + }, + { + "epoch": 0.3686819259517088, + "grad_norm": 40.4375, + "learning_rate": 9.942393546721123e-06, + "loss": 17.3382, + "step": 20390 + }, + { + "epoch": 0.36886274102083666, + "grad_norm": 42.75, + "learning_rate": 9.942365294414464e-06, + "loss": 17.1704, + "step": 20400 + }, + { + "epoch": 0.36904355608996453, + "grad_norm": 39.84375, + "learning_rate": 9.942337042107804e-06, + "loss": 17.117, + "step": 20410 + }, + { + "epoch": 0.3692243711590924, + "grad_norm": 40.03125, + "learning_rate": 9.942308789801143e-06, + "loss": 16.4973, + "step": 20420 + }, + { + "epoch": 0.3694051862282203, + "grad_norm": 41.8125, + "learning_rate": 9.942280537494484e-06, + "loss": 17.1972, + "step": 20430 + }, + { + "epoch": 0.36958600129734814, + "grad_norm": 43.1875, + "learning_rate": 9.942252285187825e-06, + "loss": 17.1071, + "step": 20440 + }, + { + "epoch": 0.36976681636647596, + "grad_norm": 43.84375, + "learning_rate": 9.942224032881165e-06, + "loss": 16.8985, + "step": 20450 + }, + { + "epoch": 0.36994763143560383, + "grad_norm": 41.15625, + "learning_rate": 9.942195780574506e-06, + "loss": 17.5709, + "step": 20460 + }, + { + "epoch": 0.3701284465047317, + "grad_norm": 39.8125, + "learning_rate": 9.942167528267846e-06, + "loss": 17.0988, + "step": 20470 + }, + { + "epoch": 0.37030926157385957, + "grad_norm": 44.625, + "learning_rate": 9.942139275961187e-06, + "loss": 17.2837, + "step": 20480 + }, + { + "epoch": 0.37049007664298744, + "grad_norm": 41.03125, + "learning_rate": 9.942111023654528e-06, + "loss": 17.2057, + "step": 20490 + }, + { + "epoch": 0.3706708917121153, + "grad_norm": 41.6875, + "learning_rate": 9.942082771347867e-06, + "loss": 16.946, + "step": 20500 + }, + { + "epoch": 0.37085170678124313, + "grad_norm": 41.15625, + "learning_rate": 9.942054519041207e-06, + "loss": 17.1524, + "step": 20510 + }, + { + "epoch": 0.371032521850371, + "grad_norm": 42.21875, + "learning_rate": 9.942026266734548e-06, + "loss": 16.8513, + "step": 20520 + }, + { + "epoch": 0.37121333691949887, + "grad_norm": 42.65625, + "learning_rate": 9.941998014427888e-06, + "loss": 16.7765, + "step": 20530 + }, + { + "epoch": 0.37139415198862674, + "grad_norm": 36.78125, + "learning_rate": 9.941969762121229e-06, + "loss": 17.0085, + "step": 20540 + }, + { + "epoch": 0.3715749670577546, + "grad_norm": 37.75, + "learning_rate": 9.94194150981457e-06, + "loss": 16.9942, + "step": 20550 + }, + { + "epoch": 0.37175578212688243, + "grad_norm": 42.90625, + "learning_rate": 9.94191325750791e-06, + "loss": 17.1815, + "step": 20560 + }, + { + "epoch": 0.3719365971960103, + "grad_norm": 41.28125, + "learning_rate": 9.941885005201251e-06, + "loss": 16.9027, + "step": 20570 + }, + { + "epoch": 0.37211741226513817, + "grad_norm": 40.96875, + "learning_rate": 9.941856752894592e-06, + "loss": 17.4756, + "step": 20580 + }, + { + "epoch": 0.37229822733426604, + "grad_norm": 43.1875, + "learning_rate": 9.94182850058793e-06, + "loss": 17.3078, + "step": 20590 + }, + { + "epoch": 0.3724790424033939, + "grad_norm": 44.0625, + "learning_rate": 9.941800248281271e-06, + "loss": 16.9457, + "step": 20600 + }, + { + "epoch": 0.3726598574725218, + "grad_norm": 40.34375, + "learning_rate": 9.941771995974612e-06, + "loss": 16.969, + "step": 20610 + }, + { + "epoch": 0.3728406725416496, + "grad_norm": 41.40625, + "learning_rate": 9.941743743667952e-06, + "loss": 16.8207, + "step": 20620 + }, + { + "epoch": 0.37302148761077747, + "grad_norm": 42.53125, + "learning_rate": 9.941715491361293e-06, + "loss": 16.9767, + "step": 20630 + }, + { + "epoch": 0.37320230267990534, + "grad_norm": 40.59375, + "learning_rate": 9.941687239054634e-06, + "loss": 16.9558, + "step": 20640 + }, + { + "epoch": 0.3733831177490332, + "grad_norm": 38.375, + "learning_rate": 9.941658986747974e-06, + "loss": 16.9039, + "step": 20650 + }, + { + "epoch": 0.3735639328181611, + "grad_norm": 44.9375, + "learning_rate": 9.941630734441315e-06, + "loss": 17.105, + "step": 20660 + }, + { + "epoch": 0.37374474788728895, + "grad_norm": 41.0, + "learning_rate": 9.941602482134654e-06, + "loss": 16.8125, + "step": 20670 + }, + { + "epoch": 0.37392556295641677, + "grad_norm": 42.84375, + "learning_rate": 9.941574229827994e-06, + "loss": 17.5724, + "step": 20680 + }, + { + "epoch": 0.37410637802554464, + "grad_norm": 43.3125, + "learning_rate": 9.941545977521335e-06, + "loss": 17.3323, + "step": 20690 + }, + { + "epoch": 0.3742871930946725, + "grad_norm": 39.96875, + "learning_rate": 9.941517725214676e-06, + "loss": 17.2595, + "step": 20700 + }, + { + "epoch": 0.3744680081638004, + "grad_norm": 39.6875, + "learning_rate": 9.941489472908016e-06, + "loss": 16.9378, + "step": 20710 + }, + { + "epoch": 0.37464882323292825, + "grad_norm": 40.71875, + "learning_rate": 9.941461220601357e-06, + "loss": 17.1729, + "step": 20720 + }, + { + "epoch": 0.37482963830205607, + "grad_norm": 38.5625, + "learning_rate": 9.941432968294698e-06, + "loss": 17.1281, + "step": 20730 + }, + { + "epoch": 0.37501045337118394, + "grad_norm": 42.4375, + "learning_rate": 9.941404715988038e-06, + "loss": 17.3943, + "step": 20740 + }, + { + "epoch": 0.3751912684403118, + "grad_norm": 41.9375, + "learning_rate": 9.941376463681379e-06, + "loss": 17.0422, + "step": 20750 + }, + { + "epoch": 0.3753720835094397, + "grad_norm": 42.90625, + "learning_rate": 9.941348211374718e-06, + "loss": 17.6543, + "step": 20760 + }, + { + "epoch": 0.37555289857856755, + "grad_norm": 39.46875, + "learning_rate": 9.941319959068058e-06, + "loss": 17.1982, + "step": 20770 + }, + { + "epoch": 0.3757337136476954, + "grad_norm": 43.0625, + "learning_rate": 9.941291706761399e-06, + "loss": 17.2206, + "step": 20780 + }, + { + "epoch": 0.37591452871682324, + "grad_norm": 41.4375, + "learning_rate": 9.94126345445474e-06, + "loss": 16.8636, + "step": 20790 + }, + { + "epoch": 0.3760953437859511, + "grad_norm": 41.1875, + "learning_rate": 9.94123520214808e-06, + "loss": 16.6194, + "step": 20800 + }, + { + "epoch": 0.376276158855079, + "grad_norm": 42.25, + "learning_rate": 9.94120694984142e-06, + "loss": 17.0831, + "step": 20810 + }, + { + "epoch": 0.37645697392420685, + "grad_norm": 39.96875, + "learning_rate": 9.941178697534761e-06, + "loss": 17.2419, + "step": 20820 + }, + { + "epoch": 0.3766377889933347, + "grad_norm": 41.1875, + "learning_rate": 9.941150445228102e-06, + "loss": 17.0365, + "step": 20830 + }, + { + "epoch": 0.3768186040624626, + "grad_norm": 39.90625, + "learning_rate": 9.941122192921443e-06, + "loss": 17.113, + "step": 20840 + }, + { + "epoch": 0.3769994191315904, + "grad_norm": 41.03125, + "learning_rate": 9.941093940614782e-06, + "loss": 17.5729, + "step": 20850 + }, + { + "epoch": 0.3771802342007183, + "grad_norm": 44.09375, + "learning_rate": 9.941065688308122e-06, + "loss": 17.2333, + "step": 20860 + }, + { + "epoch": 0.37736104926984615, + "grad_norm": 40.96875, + "learning_rate": 9.941037436001463e-06, + "loss": 17.4125, + "step": 20870 + }, + { + "epoch": 0.377541864338974, + "grad_norm": 40.53125, + "learning_rate": 9.941009183694803e-06, + "loss": 17.2689, + "step": 20880 + }, + { + "epoch": 0.3777226794081019, + "grad_norm": 41.375, + "learning_rate": 9.940980931388144e-06, + "loss": 17.0753, + "step": 20890 + }, + { + "epoch": 0.3779034944772297, + "grad_norm": 41.96875, + "learning_rate": 9.940952679081485e-06, + "loss": 17.0969, + "step": 20900 + }, + { + "epoch": 0.3780843095463576, + "grad_norm": 41.5625, + "learning_rate": 9.940924426774825e-06, + "loss": 16.9405, + "step": 20910 + }, + { + "epoch": 0.37826512461548545, + "grad_norm": 41.8125, + "learning_rate": 9.940896174468166e-06, + "loss": 16.7523, + "step": 20920 + }, + { + "epoch": 0.3784459396846133, + "grad_norm": 38.96875, + "learning_rate": 9.940867922161505e-06, + "loss": 16.8263, + "step": 20930 + }, + { + "epoch": 0.3786267547537412, + "grad_norm": 45.3125, + "learning_rate": 9.940839669854846e-06, + "loss": 17.2799, + "step": 20940 + }, + { + "epoch": 0.37880756982286906, + "grad_norm": 40.25, + "learning_rate": 9.940811417548186e-06, + "loss": 16.8069, + "step": 20950 + }, + { + "epoch": 0.3789883848919969, + "grad_norm": 40.875, + "learning_rate": 9.940783165241527e-06, + "loss": 17.2283, + "step": 20960 + }, + { + "epoch": 0.37916919996112475, + "grad_norm": 39.28125, + "learning_rate": 9.940754912934867e-06, + "loss": 16.9905, + "step": 20970 + }, + { + "epoch": 0.3793500150302526, + "grad_norm": 40.65625, + "learning_rate": 9.940726660628208e-06, + "loss": 17.0025, + "step": 20980 + }, + { + "epoch": 0.3795308300993805, + "grad_norm": 42.28125, + "learning_rate": 9.940698408321549e-06, + "loss": 17.2225, + "step": 20990 + }, + { + "epoch": 0.37971164516850836, + "grad_norm": 43.90625, + "learning_rate": 9.94067015601489e-06, + "loss": 17.4368, + "step": 21000 + }, + { + "epoch": 0.37989246023763623, + "grad_norm": 40.375, + "learning_rate": 9.94064190370823e-06, + "loss": 17.1831, + "step": 21010 + }, + { + "epoch": 0.38007327530676405, + "grad_norm": 39.40625, + "learning_rate": 9.940613651401569e-06, + "loss": 17.1583, + "step": 21020 + }, + { + "epoch": 0.3802540903758919, + "grad_norm": 38.65625, + "learning_rate": 9.94058539909491e-06, + "loss": 17.0701, + "step": 21030 + }, + { + "epoch": 0.3804349054450198, + "grad_norm": 39.4375, + "learning_rate": 9.94055714678825e-06, + "loss": 17.1009, + "step": 21040 + }, + { + "epoch": 0.38061572051414766, + "grad_norm": 40.5, + "learning_rate": 9.94052889448159e-06, + "loss": 17.0964, + "step": 21050 + }, + { + "epoch": 0.38079653558327553, + "grad_norm": 38.21875, + "learning_rate": 9.940500642174931e-06, + "loss": 17.2598, + "step": 21060 + }, + { + "epoch": 0.38097735065240335, + "grad_norm": 43.15625, + "learning_rate": 9.940472389868272e-06, + "loss": 17.3123, + "step": 21070 + }, + { + "epoch": 0.3811581657215312, + "grad_norm": 40.4375, + "learning_rate": 9.940444137561613e-06, + "loss": 17.112, + "step": 21080 + }, + { + "epoch": 0.3813389807906591, + "grad_norm": 41.5, + "learning_rate": 9.940415885254953e-06, + "loss": 17.1988, + "step": 21090 + }, + { + "epoch": 0.38151979585978696, + "grad_norm": 40.3125, + "learning_rate": 9.940387632948292e-06, + "loss": 17.2093, + "step": 21100 + }, + { + "epoch": 0.38170061092891483, + "grad_norm": 40.09375, + "learning_rate": 9.940359380641633e-06, + "loss": 17.2864, + "step": 21110 + }, + { + "epoch": 0.3818814259980427, + "grad_norm": 40.21875, + "learning_rate": 9.940331128334973e-06, + "loss": 16.7385, + "step": 21120 + }, + { + "epoch": 0.3820622410671705, + "grad_norm": 40.0, + "learning_rate": 9.940302876028314e-06, + "loss": 17.4242, + "step": 21130 + }, + { + "epoch": 0.3822430561362984, + "grad_norm": 41.8125, + "learning_rate": 9.940274623721655e-06, + "loss": 17.2833, + "step": 21140 + }, + { + "epoch": 0.38242387120542626, + "grad_norm": 42.0, + "learning_rate": 9.940246371414995e-06, + "loss": 17.4265, + "step": 21150 + }, + { + "epoch": 0.38260468627455413, + "grad_norm": 43.75, + "learning_rate": 9.940218119108336e-06, + "loss": 17.0023, + "step": 21160 + }, + { + "epoch": 0.382785501343682, + "grad_norm": 38.40625, + "learning_rate": 9.940189866801676e-06, + "loss": 17.121, + "step": 21170 + }, + { + "epoch": 0.38296631641280987, + "grad_norm": 40.09375, + "learning_rate": 9.940161614495017e-06, + "loss": 17.3334, + "step": 21180 + }, + { + "epoch": 0.3831471314819377, + "grad_norm": 44.5, + "learning_rate": 9.940133362188356e-06, + "loss": 16.8784, + "step": 21190 + }, + { + "epoch": 0.38332794655106556, + "grad_norm": 42.34375, + "learning_rate": 9.940105109881697e-06, + "loss": 17.4054, + "step": 21200 + }, + { + "epoch": 0.38350876162019343, + "grad_norm": 42.8125, + "learning_rate": 9.940076857575037e-06, + "loss": 17.2301, + "step": 21210 + }, + { + "epoch": 0.3836895766893213, + "grad_norm": 41.71875, + "learning_rate": 9.940048605268378e-06, + "loss": 17.1037, + "step": 21220 + }, + { + "epoch": 0.38387039175844917, + "grad_norm": 40.375, + "learning_rate": 9.940020352961719e-06, + "loss": 17.263, + "step": 21230 + }, + { + "epoch": 0.384051206827577, + "grad_norm": 38.4375, + "learning_rate": 9.939992100655059e-06, + "loss": 17.4868, + "step": 21240 + }, + { + "epoch": 0.38423202189670486, + "grad_norm": 45.40625, + "learning_rate": 9.9399638483484e-06, + "loss": 17.1353, + "step": 21250 + }, + { + "epoch": 0.3844128369658327, + "grad_norm": 40.15625, + "learning_rate": 9.93993559604174e-06, + "loss": 17.1201, + "step": 21260 + }, + { + "epoch": 0.3845936520349606, + "grad_norm": 38.65625, + "learning_rate": 9.939907343735081e-06, + "loss": 17.1401, + "step": 21270 + }, + { + "epoch": 0.38477446710408847, + "grad_norm": 40.03125, + "learning_rate": 9.93987909142842e-06, + "loss": 17.2789, + "step": 21280 + }, + { + "epoch": 0.38495528217321634, + "grad_norm": 40.71875, + "learning_rate": 9.93985083912176e-06, + "loss": 16.8673, + "step": 21290 + }, + { + "epoch": 0.38513609724234416, + "grad_norm": 41.375, + "learning_rate": 9.939822586815101e-06, + "loss": 16.7684, + "step": 21300 + }, + { + "epoch": 0.385316912311472, + "grad_norm": 41.53125, + "learning_rate": 9.939794334508442e-06, + "loss": 17.5884, + "step": 21310 + }, + { + "epoch": 0.3854977273805999, + "grad_norm": 44.71875, + "learning_rate": 9.939766082201782e-06, + "loss": 17.2755, + "step": 21320 + }, + { + "epoch": 0.38567854244972777, + "grad_norm": 38.78125, + "learning_rate": 9.939737829895123e-06, + "loss": 17.0057, + "step": 21330 + }, + { + "epoch": 0.38585935751885564, + "grad_norm": 41.6875, + "learning_rate": 9.939709577588464e-06, + "loss": 17.2807, + "step": 21340 + }, + { + "epoch": 0.3860401725879835, + "grad_norm": 41.90625, + "learning_rate": 9.939681325281804e-06, + "loss": 17.0537, + "step": 21350 + }, + { + "epoch": 0.3862209876571113, + "grad_norm": 41.96875, + "learning_rate": 9.939653072975143e-06, + "loss": 16.9537, + "step": 21360 + }, + { + "epoch": 0.3864018027262392, + "grad_norm": 42.96875, + "learning_rate": 9.939624820668484e-06, + "loss": 16.8987, + "step": 21370 + }, + { + "epoch": 0.38658261779536707, + "grad_norm": 40.34375, + "learning_rate": 9.939596568361824e-06, + "loss": 17.2109, + "step": 21380 + }, + { + "epoch": 0.38676343286449494, + "grad_norm": 39.15625, + "learning_rate": 9.939568316055165e-06, + "loss": 17.2425, + "step": 21390 + }, + { + "epoch": 0.3869442479336228, + "grad_norm": 39.8125, + "learning_rate": 9.939540063748506e-06, + "loss": 17.6312, + "step": 21400 + }, + { + "epoch": 0.3871250630027506, + "grad_norm": 41.78125, + "learning_rate": 9.939511811441846e-06, + "loss": 16.8594, + "step": 21410 + }, + { + "epoch": 0.3873058780718785, + "grad_norm": 39.65625, + "learning_rate": 9.939483559135187e-06, + "loss": 17.2082, + "step": 21420 + }, + { + "epoch": 0.38748669314100637, + "grad_norm": 40.0625, + "learning_rate": 9.939455306828528e-06, + "loss": 16.9624, + "step": 21430 + }, + { + "epoch": 0.38766750821013424, + "grad_norm": 41.625, + "learning_rate": 9.939427054521868e-06, + "loss": 16.6994, + "step": 21440 + }, + { + "epoch": 0.3878483232792621, + "grad_norm": 41.09375, + "learning_rate": 9.939398802215207e-06, + "loss": 17.3257, + "step": 21450 + }, + { + "epoch": 0.38802913834839, + "grad_norm": 43.0, + "learning_rate": 9.939370549908548e-06, + "loss": 17.1406, + "step": 21460 + }, + { + "epoch": 0.3882099534175178, + "grad_norm": 37.6875, + "learning_rate": 9.939342297601888e-06, + "loss": 17.3857, + "step": 21470 + }, + { + "epoch": 0.38839076848664567, + "grad_norm": 41.90625, + "learning_rate": 9.939314045295229e-06, + "loss": 16.8819, + "step": 21480 + }, + { + "epoch": 0.38857158355577354, + "grad_norm": 42.34375, + "learning_rate": 9.93928579298857e-06, + "loss": 17.316, + "step": 21490 + }, + { + "epoch": 0.3887523986249014, + "grad_norm": 40.21875, + "learning_rate": 9.93925754068191e-06, + "loss": 17.0337, + "step": 21500 + }, + { + "epoch": 0.3889332136940293, + "grad_norm": 39.9375, + "learning_rate": 9.93922928837525e-06, + "loss": 17.1861, + "step": 21510 + }, + { + "epoch": 0.3891140287631571, + "grad_norm": 41.8125, + "learning_rate": 9.939201036068591e-06, + "loss": 16.8454, + "step": 21520 + }, + { + "epoch": 0.38929484383228496, + "grad_norm": 44.28125, + "learning_rate": 9.93917278376193e-06, + "loss": 17.6789, + "step": 21530 + }, + { + "epoch": 0.38947565890141284, + "grad_norm": 39.78125, + "learning_rate": 9.939144531455271e-06, + "loss": 16.8952, + "step": 21540 + }, + { + "epoch": 0.3896564739705407, + "grad_norm": 40.78125, + "learning_rate": 9.939116279148612e-06, + "loss": 17.1093, + "step": 21550 + }, + { + "epoch": 0.3898372890396686, + "grad_norm": 42.25, + "learning_rate": 9.939088026841952e-06, + "loss": 17.1814, + "step": 21560 + }, + { + "epoch": 0.39001810410879645, + "grad_norm": 42.4375, + "learning_rate": 9.939059774535293e-06, + "loss": 17.0269, + "step": 21570 + }, + { + "epoch": 0.39019891917792426, + "grad_norm": 41.0625, + "learning_rate": 9.939031522228632e-06, + "loss": 17.3362, + "step": 21580 + }, + { + "epoch": 0.39037973424705213, + "grad_norm": 40.125, + "learning_rate": 9.939003269921974e-06, + "loss": 16.7692, + "step": 21590 + }, + { + "epoch": 0.39056054931618, + "grad_norm": 40.40625, + "learning_rate": 9.938975017615315e-06, + "loss": 17.1616, + "step": 21600 + }, + { + "epoch": 0.3907413643853079, + "grad_norm": 44.03125, + "learning_rate": 9.938946765308655e-06, + "loss": 17.3161, + "step": 21610 + }, + { + "epoch": 0.39092217945443575, + "grad_norm": 39.75, + "learning_rate": 9.938918513001994e-06, + "loss": 17.4791, + "step": 21620 + }, + { + "epoch": 0.3911029945235636, + "grad_norm": 41.9375, + "learning_rate": 9.938890260695335e-06, + "loss": 17.2422, + "step": 21630 + }, + { + "epoch": 0.39128380959269143, + "grad_norm": 42.125, + "learning_rate": 9.938862008388676e-06, + "loss": 17.1619, + "step": 21640 + }, + { + "epoch": 0.3914646246618193, + "grad_norm": 40.9375, + "learning_rate": 9.938833756082016e-06, + "loss": 17.3982, + "step": 21650 + }, + { + "epoch": 0.3916454397309472, + "grad_norm": 38.71875, + "learning_rate": 9.938805503775357e-06, + "loss": 17.2333, + "step": 21660 + }, + { + "epoch": 0.39182625480007505, + "grad_norm": 41.6875, + "learning_rate": 9.938777251468696e-06, + "loss": 17.1, + "step": 21670 + }, + { + "epoch": 0.3920070698692029, + "grad_norm": 39.4375, + "learning_rate": 9.938748999162038e-06, + "loss": 17.1815, + "step": 21680 + }, + { + "epoch": 0.39218788493833073, + "grad_norm": 39.40625, + "learning_rate": 9.938720746855379e-06, + "loss": 16.9364, + "step": 21690 + }, + { + "epoch": 0.3923687000074586, + "grad_norm": 42.46875, + "learning_rate": 9.938692494548718e-06, + "loss": 17.2628, + "step": 21700 + }, + { + "epoch": 0.3925495150765865, + "grad_norm": 43.53125, + "learning_rate": 9.938664242242058e-06, + "loss": 17.1232, + "step": 21710 + }, + { + "epoch": 0.39273033014571435, + "grad_norm": 39.6875, + "learning_rate": 9.938635989935399e-06, + "loss": 16.7432, + "step": 21720 + }, + { + "epoch": 0.3929111452148422, + "grad_norm": 39.1875, + "learning_rate": 9.93860773762874e-06, + "loss": 17.1859, + "step": 21730 + }, + { + "epoch": 0.3930919602839701, + "grad_norm": 43.5, + "learning_rate": 9.93857948532208e-06, + "loss": 17.0688, + "step": 21740 + }, + { + "epoch": 0.3932727753530979, + "grad_norm": 41.875, + "learning_rate": 9.93855123301542e-06, + "loss": 17.1453, + "step": 21750 + }, + { + "epoch": 0.3934535904222258, + "grad_norm": 43.71875, + "learning_rate": 9.938522980708761e-06, + "loss": 17.4998, + "step": 21760 + }, + { + "epoch": 0.39363440549135364, + "grad_norm": 44.71875, + "learning_rate": 9.938494728402102e-06, + "loss": 17.1017, + "step": 21770 + }, + { + "epoch": 0.3938152205604815, + "grad_norm": 42.6875, + "learning_rate": 9.938466476095443e-06, + "loss": 16.8154, + "step": 21780 + }, + { + "epoch": 0.3939960356296094, + "grad_norm": 41.8125, + "learning_rate": 9.938438223788781e-06, + "loss": 17.0335, + "step": 21790 + }, + { + "epoch": 0.39417685069873726, + "grad_norm": 41.375, + "learning_rate": 9.938409971482122e-06, + "loss": 17.3227, + "step": 21800 + }, + { + "epoch": 0.3943576657678651, + "grad_norm": 40.3125, + "learning_rate": 9.938381719175463e-06, + "loss": 16.984, + "step": 21810 + }, + { + "epoch": 0.39453848083699294, + "grad_norm": 40.96875, + "learning_rate": 9.938353466868803e-06, + "loss": 17.2131, + "step": 21820 + }, + { + "epoch": 0.3947192959061208, + "grad_norm": 42.375, + "learning_rate": 9.938325214562144e-06, + "loss": 17.0346, + "step": 21830 + }, + { + "epoch": 0.3949001109752487, + "grad_norm": 41.09375, + "learning_rate": 9.938296962255483e-06, + "loss": 17.0261, + "step": 21840 + }, + { + "epoch": 0.39508092604437656, + "grad_norm": 42.0, + "learning_rate": 9.938268709948825e-06, + "loss": 17.3454, + "step": 21850 + }, + { + "epoch": 0.3952617411135044, + "grad_norm": 39.8125, + "learning_rate": 9.938240457642166e-06, + "loss": 17.0362, + "step": 21860 + }, + { + "epoch": 0.39544255618263224, + "grad_norm": 40.78125, + "learning_rate": 9.938212205335506e-06, + "loss": 17.1867, + "step": 21870 + }, + { + "epoch": 0.3956233712517601, + "grad_norm": 43.84375, + "learning_rate": 9.938183953028845e-06, + "loss": 17.3699, + "step": 21880 + }, + { + "epoch": 0.395804186320888, + "grad_norm": 40.03125, + "learning_rate": 9.938155700722186e-06, + "loss": 17.1582, + "step": 21890 + }, + { + "epoch": 0.39598500139001586, + "grad_norm": 38.84375, + "learning_rate": 9.938127448415527e-06, + "loss": 17.2781, + "step": 21900 + }, + { + "epoch": 0.3961658164591437, + "grad_norm": 43.09375, + "learning_rate": 9.938099196108867e-06, + "loss": 17.2928, + "step": 21910 + }, + { + "epoch": 0.39634663152827154, + "grad_norm": 44.125, + "learning_rate": 9.938070943802208e-06, + "loss": 17.1416, + "step": 21920 + }, + { + "epoch": 0.3965274465973994, + "grad_norm": 40.25, + "learning_rate": 9.938042691495547e-06, + "loss": 17.1793, + "step": 21930 + }, + { + "epoch": 0.3967082616665273, + "grad_norm": 41.28125, + "learning_rate": 9.938014439188889e-06, + "loss": 17.5213, + "step": 21940 + }, + { + "epoch": 0.39688907673565516, + "grad_norm": 40.4375, + "learning_rate": 9.93798618688223e-06, + "loss": 17.0786, + "step": 21950 + }, + { + "epoch": 0.397069891804783, + "grad_norm": 40.625, + "learning_rate": 9.937957934575569e-06, + "loss": 17.1238, + "step": 21960 + }, + { + "epoch": 0.3972507068739109, + "grad_norm": 40.5, + "learning_rate": 9.93792968226891e-06, + "loss": 17.5342, + "step": 21970 + }, + { + "epoch": 0.3974315219430387, + "grad_norm": 43.0, + "learning_rate": 9.93790142996225e-06, + "loss": 17.2814, + "step": 21980 + }, + { + "epoch": 0.3976123370121666, + "grad_norm": 40.75, + "learning_rate": 9.93787317765559e-06, + "loss": 17.0391, + "step": 21990 + }, + { + "epoch": 0.39779315208129445, + "grad_norm": 42.125, + "learning_rate": 9.937844925348931e-06, + "loss": 17.3825, + "step": 22000 + }, + { + "epoch": 0.3979739671504223, + "grad_norm": 40.875, + "learning_rate": 9.93781667304227e-06, + "loss": 17.0485, + "step": 22010 + }, + { + "epoch": 0.3981547822195502, + "grad_norm": 37.71875, + "learning_rate": 9.93778842073561e-06, + "loss": 16.8926, + "step": 22020 + }, + { + "epoch": 0.398335597288678, + "grad_norm": 40.71875, + "learning_rate": 9.937760168428953e-06, + "loss": 17.0525, + "step": 22030 + }, + { + "epoch": 0.3985164123578059, + "grad_norm": 40.03125, + "learning_rate": 9.937731916122294e-06, + "loss": 16.8813, + "step": 22040 + }, + { + "epoch": 0.39869722742693375, + "grad_norm": 41.5, + "learning_rate": 9.937703663815633e-06, + "loss": 16.917, + "step": 22050 + }, + { + "epoch": 0.3988780424960616, + "grad_norm": 41.8125, + "learning_rate": 9.937675411508973e-06, + "loss": 17.3316, + "step": 22060 + }, + { + "epoch": 0.3990588575651895, + "grad_norm": 43.0625, + "learning_rate": 9.937647159202314e-06, + "loss": 17.4144, + "step": 22070 + }, + { + "epoch": 0.39923967263431737, + "grad_norm": 42.25, + "learning_rate": 9.937618906895654e-06, + "loss": 16.9716, + "step": 22080 + }, + { + "epoch": 0.3994204877034452, + "grad_norm": 38.46875, + "learning_rate": 9.937590654588995e-06, + "loss": 17.2706, + "step": 22090 + }, + { + "epoch": 0.39960130277257305, + "grad_norm": 42.09375, + "learning_rate": 9.937562402282334e-06, + "loss": 17.2526, + "step": 22100 + }, + { + "epoch": 0.3997821178417009, + "grad_norm": 38.21875, + "learning_rate": 9.937534149975676e-06, + "loss": 16.7109, + "step": 22110 + }, + { + "epoch": 0.3999629329108288, + "grad_norm": 40.0, + "learning_rate": 9.937505897669017e-06, + "loss": 16.9083, + "step": 22120 + }, + { + "epoch": 0.40014374797995667, + "grad_norm": 41.59375, + "learning_rate": 9.937477645362356e-06, + "loss": 17.1417, + "step": 22130 + }, + { + "epoch": 0.40032456304908454, + "grad_norm": 41.96875, + "learning_rate": 9.937449393055696e-06, + "loss": 16.9667, + "step": 22140 + }, + { + "epoch": 0.40050537811821235, + "grad_norm": 41.375, + "learning_rate": 9.937421140749037e-06, + "loss": 16.957, + "step": 22150 + }, + { + "epoch": 0.4006861931873402, + "grad_norm": 39.625, + "learning_rate": 9.937392888442378e-06, + "loss": 17.6841, + "step": 22160 + }, + { + "epoch": 0.4008670082564681, + "grad_norm": 41.53125, + "learning_rate": 9.937364636135718e-06, + "loss": 16.9138, + "step": 22170 + }, + { + "epoch": 0.40104782332559596, + "grad_norm": 40.625, + "learning_rate": 9.937336383829059e-06, + "loss": 17.0477, + "step": 22180 + }, + { + "epoch": 0.40122863839472384, + "grad_norm": 41.625, + "learning_rate": 9.937308131522398e-06, + "loss": 17.344, + "step": 22190 + }, + { + "epoch": 0.40140945346385165, + "grad_norm": 41.125, + "learning_rate": 9.93727987921574e-06, + "loss": 16.9657, + "step": 22200 + }, + { + "epoch": 0.4015902685329795, + "grad_norm": 39.53125, + "learning_rate": 9.937251626909081e-06, + "loss": 16.5823, + "step": 22210 + }, + { + "epoch": 0.4017710836021074, + "grad_norm": 39.78125, + "learning_rate": 9.93722337460242e-06, + "loss": 17.1726, + "step": 22220 + }, + { + "epoch": 0.40195189867123526, + "grad_norm": 41.375, + "learning_rate": 9.93719512229576e-06, + "loss": 17.4138, + "step": 22230 + }, + { + "epoch": 0.40213271374036313, + "grad_norm": 39.0, + "learning_rate": 9.937166869989101e-06, + "loss": 17.2267, + "step": 22240 + }, + { + "epoch": 0.402313528809491, + "grad_norm": 40.0, + "learning_rate": 9.937138617682442e-06, + "loss": 17.1238, + "step": 22250 + }, + { + "epoch": 0.4024943438786188, + "grad_norm": 46.125, + "learning_rate": 9.937110365375782e-06, + "loss": 16.9259, + "step": 22260 + }, + { + "epoch": 0.4026751589477467, + "grad_norm": 39.28125, + "learning_rate": 9.937082113069121e-06, + "loss": 17.2861, + "step": 22270 + }, + { + "epoch": 0.40285597401687456, + "grad_norm": 43.3125, + "learning_rate": 9.937053860762462e-06, + "loss": 17.508, + "step": 22280 + }, + { + "epoch": 0.40303678908600243, + "grad_norm": 44.125, + "learning_rate": 9.937025608455804e-06, + "loss": 16.9261, + "step": 22290 + }, + { + "epoch": 0.4032176041551303, + "grad_norm": 40.71875, + "learning_rate": 9.936997356149145e-06, + "loss": 17.3141, + "step": 22300 + }, + { + "epoch": 0.4033984192242582, + "grad_norm": 42.34375, + "learning_rate": 9.936969103842484e-06, + "loss": 17.0033, + "step": 22310 + }, + { + "epoch": 0.403579234293386, + "grad_norm": 38.65625, + "learning_rate": 9.936940851535824e-06, + "loss": 17.1212, + "step": 22320 + }, + { + "epoch": 0.40376004936251386, + "grad_norm": 40.5, + "learning_rate": 9.936912599229165e-06, + "loss": 16.7634, + "step": 22330 + }, + { + "epoch": 0.40394086443164173, + "grad_norm": 42.03125, + "learning_rate": 9.936884346922506e-06, + "loss": 17.2485, + "step": 22340 + }, + { + "epoch": 0.4041216795007696, + "grad_norm": 42.0625, + "learning_rate": 9.936856094615846e-06, + "loss": 17.0581, + "step": 22350 + }, + { + "epoch": 0.4043024945698975, + "grad_norm": 43.625, + "learning_rate": 9.936827842309185e-06, + "loss": 17.339, + "step": 22360 + }, + { + "epoch": 0.4044833096390253, + "grad_norm": 38.9375, + "learning_rate": 9.936799590002526e-06, + "loss": 17.0071, + "step": 22370 + }, + { + "epoch": 0.40466412470815316, + "grad_norm": 41.5, + "learning_rate": 9.936771337695868e-06, + "loss": 16.8362, + "step": 22380 + }, + { + "epoch": 0.40484493977728103, + "grad_norm": 41.9375, + "learning_rate": 9.936743085389207e-06, + "loss": 17.1963, + "step": 22390 + }, + { + "epoch": 0.4050257548464089, + "grad_norm": 39.75, + "learning_rate": 9.936714833082548e-06, + "loss": 16.811, + "step": 22400 + }, + { + "epoch": 0.4052065699155368, + "grad_norm": 43.65625, + "learning_rate": 9.936686580775888e-06, + "loss": 17.4384, + "step": 22410 + }, + { + "epoch": 0.40538738498466464, + "grad_norm": 40.53125, + "learning_rate": 9.936658328469229e-06, + "loss": 17.1916, + "step": 22420 + }, + { + "epoch": 0.40556820005379246, + "grad_norm": 42.03125, + "learning_rate": 9.93663007616257e-06, + "loss": 17.2659, + "step": 22430 + }, + { + "epoch": 0.40574901512292033, + "grad_norm": 42.25, + "learning_rate": 9.936601823855908e-06, + "loss": 17.0808, + "step": 22440 + }, + { + "epoch": 0.4059298301920482, + "grad_norm": 41.53125, + "learning_rate": 9.936573571549249e-06, + "loss": 17.25, + "step": 22450 + }, + { + "epoch": 0.4061106452611761, + "grad_norm": 42.4375, + "learning_rate": 9.936545319242591e-06, + "loss": 17.3886, + "step": 22460 + }, + { + "epoch": 0.40629146033030394, + "grad_norm": 42.4375, + "learning_rate": 9.936517066935932e-06, + "loss": 17.0031, + "step": 22470 + }, + { + "epoch": 0.4064722753994318, + "grad_norm": 46.25, + "learning_rate": 9.936488814629271e-06, + "loss": 17.1316, + "step": 22480 + }, + { + "epoch": 0.40665309046855963, + "grad_norm": 43.34375, + "learning_rate": 9.936460562322612e-06, + "loss": 17.3826, + "step": 22490 + }, + { + "epoch": 0.4068339055376875, + "grad_norm": 38.9375, + "learning_rate": 9.936432310015952e-06, + "loss": 16.9754, + "step": 22500 + }, + { + "epoch": 0.40701472060681537, + "grad_norm": 41.15625, + "learning_rate": 9.936404057709293e-06, + "loss": 17.3494, + "step": 22510 + }, + { + "epoch": 0.40719553567594324, + "grad_norm": 41.875, + "learning_rate": 9.936375805402633e-06, + "loss": 16.3543, + "step": 22520 + }, + { + "epoch": 0.4073763507450711, + "grad_norm": 40.15625, + "learning_rate": 9.936347553095972e-06, + "loss": 17.2515, + "step": 22530 + }, + { + "epoch": 0.40755716581419893, + "grad_norm": 43.6875, + "learning_rate": 9.936319300789313e-06, + "loss": 17.1219, + "step": 22540 + }, + { + "epoch": 0.4077379808833268, + "grad_norm": 42.1875, + "learning_rate": 9.936291048482655e-06, + "loss": 17.0418, + "step": 22550 + }, + { + "epoch": 0.40791879595245467, + "grad_norm": 41.5625, + "learning_rate": 9.936262796175994e-06, + "loss": 17.096, + "step": 22560 + }, + { + "epoch": 0.40809961102158254, + "grad_norm": 41.25, + "learning_rate": 9.936234543869335e-06, + "loss": 17.0234, + "step": 22570 + }, + { + "epoch": 0.4082804260907104, + "grad_norm": 43.71875, + "learning_rate": 9.936206291562675e-06, + "loss": 17.0583, + "step": 22580 + }, + { + "epoch": 0.4084612411598383, + "grad_norm": 42.34375, + "learning_rate": 9.936178039256016e-06, + "loss": 17.0878, + "step": 22590 + }, + { + "epoch": 0.4086420562289661, + "grad_norm": 41.8125, + "learning_rate": 9.936149786949357e-06, + "loss": 16.7648, + "step": 22600 + }, + { + "epoch": 0.40882287129809397, + "grad_norm": 42.90625, + "learning_rate": 9.936121534642697e-06, + "loss": 17.1874, + "step": 22610 + }, + { + "epoch": 0.40900368636722184, + "grad_norm": 43.6875, + "learning_rate": 9.936093282336036e-06, + "loss": 17.5475, + "step": 22620 + }, + { + "epoch": 0.4091845014363497, + "grad_norm": 42.59375, + "learning_rate": 9.936065030029377e-06, + "loss": 17.4482, + "step": 22630 + }, + { + "epoch": 0.4093653165054776, + "grad_norm": 38.4375, + "learning_rate": 9.936036777722719e-06, + "loss": 17.099, + "step": 22640 + }, + { + "epoch": 0.40954613157460545, + "grad_norm": 41.0, + "learning_rate": 9.936008525416058e-06, + "loss": 17.336, + "step": 22650 + }, + { + "epoch": 0.40972694664373327, + "grad_norm": 41.90625, + "learning_rate": 9.935980273109399e-06, + "loss": 17.0403, + "step": 22660 + }, + { + "epoch": 0.40990776171286114, + "grad_norm": 42.09375, + "learning_rate": 9.93595202080274e-06, + "loss": 17.0826, + "step": 22670 + }, + { + "epoch": 0.410088576781989, + "grad_norm": 41.625, + "learning_rate": 9.93592376849608e-06, + "loss": 17.2808, + "step": 22680 + }, + { + "epoch": 0.4102693918511169, + "grad_norm": 41.25, + "learning_rate": 9.93589551618942e-06, + "loss": 16.7892, + "step": 22690 + }, + { + "epoch": 0.41045020692024475, + "grad_norm": 43.28125, + "learning_rate": 9.93586726388276e-06, + "loss": 16.9339, + "step": 22700 + }, + { + "epoch": 0.41063102198937257, + "grad_norm": 39.125, + "learning_rate": 9.9358390115761e-06, + "loss": 16.9299, + "step": 22710 + }, + { + "epoch": 0.41081183705850044, + "grad_norm": 42.0625, + "learning_rate": 9.93581075926944e-06, + "loss": 17.1152, + "step": 22720 + }, + { + "epoch": 0.4109926521276283, + "grad_norm": 42.53125, + "learning_rate": 9.935782506962783e-06, + "loss": 17.313, + "step": 22730 + }, + { + "epoch": 0.4111734671967562, + "grad_norm": 43.0, + "learning_rate": 9.935754254656122e-06, + "loss": 17.5031, + "step": 22740 + }, + { + "epoch": 0.41135428226588405, + "grad_norm": 41.40625, + "learning_rate": 9.935726002349463e-06, + "loss": 17.04, + "step": 22750 + }, + { + "epoch": 0.4115350973350119, + "grad_norm": 40.40625, + "learning_rate": 9.935697750042803e-06, + "loss": 17.2801, + "step": 22760 + }, + { + "epoch": 0.41171591240413974, + "grad_norm": 41.53125, + "learning_rate": 9.935669497736144e-06, + "loss": 17.1183, + "step": 22770 + }, + { + "epoch": 0.4118967274732676, + "grad_norm": 38.28125, + "learning_rate": 9.935641245429484e-06, + "loss": 17.0931, + "step": 22780 + }, + { + "epoch": 0.4120775425423955, + "grad_norm": 40.59375, + "learning_rate": 9.935612993122823e-06, + "loss": 17.5523, + "step": 22790 + }, + { + "epoch": 0.41225835761152335, + "grad_norm": 42.5, + "learning_rate": 9.935584740816164e-06, + "loss": 16.6863, + "step": 22800 + }, + { + "epoch": 0.4124391726806512, + "grad_norm": 42.84375, + "learning_rate": 9.935556488509506e-06, + "loss": 16.9507, + "step": 22810 + }, + { + "epoch": 0.4126199877497791, + "grad_norm": 43.8125, + "learning_rate": 9.935528236202845e-06, + "loss": 16.8283, + "step": 22820 + }, + { + "epoch": 0.4128008028189069, + "grad_norm": 43.28125, + "learning_rate": 9.935499983896186e-06, + "loss": 17.189, + "step": 22830 + }, + { + "epoch": 0.4129816178880348, + "grad_norm": 41.21875, + "learning_rate": 9.935471731589527e-06, + "loss": 17.2389, + "step": 22840 + }, + { + "epoch": 0.41316243295716265, + "grad_norm": 40.65625, + "learning_rate": 9.935443479282867e-06, + "loss": 16.7641, + "step": 22850 + }, + { + "epoch": 0.4133432480262905, + "grad_norm": 40.28125, + "learning_rate": 9.935415226976208e-06, + "loss": 17.1253, + "step": 22860 + }, + { + "epoch": 0.4135240630954184, + "grad_norm": 44.28125, + "learning_rate": 9.935386974669547e-06, + "loss": 17.3765, + "step": 22870 + }, + { + "epoch": 0.4137048781645462, + "grad_norm": 44.28125, + "learning_rate": 9.935358722362887e-06, + "loss": 17.4771, + "step": 22880 + }, + { + "epoch": 0.4138856932336741, + "grad_norm": 39.625, + "learning_rate": 9.935330470056228e-06, + "loss": 16.9489, + "step": 22890 + }, + { + "epoch": 0.41406650830280195, + "grad_norm": 41.8125, + "learning_rate": 9.93530221774957e-06, + "loss": 17.4015, + "step": 22900 + }, + { + "epoch": 0.4142473233719298, + "grad_norm": 39.28125, + "learning_rate": 9.93527396544291e-06, + "loss": 17.2297, + "step": 22910 + }, + { + "epoch": 0.4144281384410577, + "grad_norm": 41.5625, + "learning_rate": 9.93524571313625e-06, + "loss": 17.3229, + "step": 22920 + }, + { + "epoch": 0.41460895351018556, + "grad_norm": 42.6875, + "learning_rate": 9.93521746082959e-06, + "loss": 17.0622, + "step": 22930 + }, + { + "epoch": 0.4147897685793134, + "grad_norm": 37.96875, + "learning_rate": 9.935189208522931e-06, + "loss": 17.4614, + "step": 22940 + }, + { + "epoch": 0.41497058364844125, + "grad_norm": 41.15625, + "learning_rate": 9.935160956216272e-06, + "loss": 17.0732, + "step": 22950 + }, + { + "epoch": 0.4151513987175691, + "grad_norm": 40.15625, + "learning_rate": 9.93513270390961e-06, + "loss": 17.2508, + "step": 22960 + }, + { + "epoch": 0.415332213786697, + "grad_norm": 40.40625, + "learning_rate": 9.935104451602951e-06, + "loss": 16.7658, + "step": 22970 + }, + { + "epoch": 0.41551302885582486, + "grad_norm": 41.1875, + "learning_rate": 9.935076199296292e-06, + "loss": 16.942, + "step": 22980 + }, + { + "epoch": 0.41569384392495273, + "grad_norm": 41.46875, + "learning_rate": 9.935047946989632e-06, + "loss": 17.2235, + "step": 22990 + }, + { + "epoch": 0.41587465899408055, + "grad_norm": 41.96875, + "learning_rate": 9.935019694682973e-06, + "loss": 16.4738, + "step": 23000 + }, + { + "epoch": 0.4160554740632084, + "grad_norm": 42.5, + "learning_rate": 9.934991442376314e-06, + "loss": 17.0574, + "step": 23010 + }, + { + "epoch": 0.4162362891323363, + "grad_norm": 42.46875, + "learning_rate": 9.934963190069654e-06, + "loss": 17.2055, + "step": 23020 + }, + { + "epoch": 0.41641710420146416, + "grad_norm": 38.15625, + "learning_rate": 9.934934937762995e-06, + "loss": 16.9899, + "step": 23030 + }, + { + "epoch": 0.41659791927059203, + "grad_norm": 43.375, + "learning_rate": 9.934906685456336e-06, + "loss": 17.4972, + "step": 23040 + }, + { + "epoch": 0.41677873433971985, + "grad_norm": 39.09375, + "learning_rate": 9.934878433149674e-06, + "loss": 17.2599, + "step": 23050 + }, + { + "epoch": 0.4169595494088477, + "grad_norm": 39.78125, + "learning_rate": 9.934850180843015e-06, + "loss": 17.2226, + "step": 23060 + }, + { + "epoch": 0.4171403644779756, + "grad_norm": 40.8125, + "learning_rate": 9.934821928536356e-06, + "loss": 17.1681, + "step": 23070 + }, + { + "epoch": 0.41732117954710346, + "grad_norm": 44.0, + "learning_rate": 9.934793676229696e-06, + "loss": 17.1996, + "step": 23080 + }, + { + "epoch": 0.41750199461623133, + "grad_norm": 42.625, + "learning_rate": 9.934765423923037e-06, + "loss": 17.3154, + "step": 23090 + }, + { + "epoch": 0.4176828096853592, + "grad_norm": 41.125, + "learning_rate": 9.934737171616378e-06, + "loss": 16.4046, + "step": 23100 + }, + { + "epoch": 0.417863624754487, + "grad_norm": 42.15625, + "learning_rate": 9.934708919309718e-06, + "loss": 17.0241, + "step": 23110 + }, + { + "epoch": 0.4180444398236149, + "grad_norm": 39.3125, + "learning_rate": 9.934680667003059e-06, + "loss": 16.9767, + "step": 23120 + }, + { + "epoch": 0.41822525489274276, + "grad_norm": 38.71875, + "learning_rate": 9.934652414696398e-06, + "loss": 17.1326, + "step": 23130 + }, + { + "epoch": 0.41840606996187063, + "grad_norm": 42.625, + "learning_rate": 9.934624162389738e-06, + "loss": 17.2872, + "step": 23140 + }, + { + "epoch": 0.4185868850309985, + "grad_norm": 40.5, + "learning_rate": 9.934595910083079e-06, + "loss": 16.9717, + "step": 23150 + }, + { + "epoch": 0.41876770010012637, + "grad_norm": 39.3125, + "learning_rate": 9.934567657776421e-06, + "loss": 17.3705, + "step": 23160 + }, + { + "epoch": 0.4189485151692542, + "grad_norm": 40.84375, + "learning_rate": 9.93453940546976e-06, + "loss": 17.2579, + "step": 23170 + }, + { + "epoch": 0.41912933023838206, + "grad_norm": 40.875, + "learning_rate": 9.934511153163101e-06, + "loss": 17.1618, + "step": 23180 + }, + { + "epoch": 0.41931014530750993, + "grad_norm": 40.59375, + "learning_rate": 9.934482900856442e-06, + "loss": 17.0195, + "step": 23190 + }, + { + "epoch": 0.4194909603766378, + "grad_norm": 38.8125, + "learning_rate": 9.934454648549782e-06, + "loss": 17.0593, + "step": 23200 + }, + { + "epoch": 0.41967177544576567, + "grad_norm": 42.375, + "learning_rate": 9.934426396243123e-06, + "loss": 17.0742, + "step": 23210 + }, + { + "epoch": 0.4198525905148935, + "grad_norm": 40.875, + "learning_rate": 9.934398143936462e-06, + "loss": 17.3838, + "step": 23220 + }, + { + "epoch": 0.42003340558402136, + "grad_norm": 44.09375, + "learning_rate": 9.934369891629802e-06, + "loss": 17.1919, + "step": 23230 + }, + { + "epoch": 0.42021422065314923, + "grad_norm": 42.75, + "learning_rate": 9.934341639323143e-06, + "loss": 16.9229, + "step": 23240 + }, + { + "epoch": 0.4203950357222771, + "grad_norm": 40.21875, + "learning_rate": 9.934313387016484e-06, + "loss": 17.1713, + "step": 23250 + }, + { + "epoch": 0.42057585079140497, + "grad_norm": 43.125, + "learning_rate": 9.934285134709824e-06, + "loss": 17.0081, + "step": 23260 + }, + { + "epoch": 0.42075666586053284, + "grad_norm": 43.0625, + "learning_rate": 9.934256882403165e-06, + "loss": 17.02, + "step": 23270 + }, + { + "epoch": 0.42093748092966066, + "grad_norm": 40.0625, + "learning_rate": 9.934228630096505e-06, + "loss": 17.4053, + "step": 23280 + }, + { + "epoch": 0.4211182959987885, + "grad_norm": 42.46875, + "learning_rate": 9.934200377789846e-06, + "loss": 16.9431, + "step": 23290 + }, + { + "epoch": 0.4212991110679164, + "grad_norm": 41.28125, + "learning_rate": 9.934172125483185e-06, + "loss": 16.5166, + "step": 23300 + }, + { + "epoch": 0.42147992613704427, + "grad_norm": 37.84375, + "learning_rate": 9.934143873176526e-06, + "loss": 17.1852, + "step": 23310 + }, + { + "epoch": 0.42166074120617214, + "grad_norm": 43.0625, + "learning_rate": 9.934115620869866e-06, + "loss": 16.831, + "step": 23320 + }, + { + "epoch": 0.42184155627529996, + "grad_norm": 45.15625, + "learning_rate": 9.934087368563207e-06, + "loss": 17.0062, + "step": 23330 + }, + { + "epoch": 0.4220223713444278, + "grad_norm": 41.9375, + "learning_rate": 9.934059116256547e-06, + "loss": 17.0704, + "step": 23340 + }, + { + "epoch": 0.4222031864135557, + "grad_norm": 40.8125, + "learning_rate": 9.934030863949888e-06, + "loss": 17.3631, + "step": 23350 + }, + { + "epoch": 0.42238400148268357, + "grad_norm": 40.53125, + "learning_rate": 9.934002611643229e-06, + "loss": 17.2416, + "step": 23360 + }, + { + "epoch": 0.42256481655181144, + "grad_norm": 41.03125, + "learning_rate": 9.93397435933657e-06, + "loss": 16.8875, + "step": 23370 + }, + { + "epoch": 0.4227456316209393, + "grad_norm": 42.09375, + "learning_rate": 9.93394610702991e-06, + "loss": 17.089, + "step": 23380 + }, + { + "epoch": 0.4229264466900671, + "grad_norm": 41.09375, + "learning_rate": 9.933917854723249e-06, + "loss": 17.174, + "step": 23390 + }, + { + "epoch": 0.423107261759195, + "grad_norm": 41.59375, + "learning_rate": 9.93388960241659e-06, + "loss": 16.8249, + "step": 23400 + }, + { + "epoch": 0.42328807682832287, + "grad_norm": 42.4375, + "learning_rate": 9.93386135010993e-06, + "loss": 16.6637, + "step": 23410 + }, + { + "epoch": 0.42346889189745074, + "grad_norm": 41.40625, + "learning_rate": 9.93383309780327e-06, + "loss": 17.1979, + "step": 23420 + }, + { + "epoch": 0.4236497069665786, + "grad_norm": 43.125, + "learning_rate": 9.933804845496611e-06, + "loss": 16.7769, + "step": 23430 + }, + { + "epoch": 0.4238305220357065, + "grad_norm": 41.1875, + "learning_rate": 9.933776593189952e-06, + "loss": 17.3441, + "step": 23440 + }, + { + "epoch": 0.4240113371048343, + "grad_norm": 39.5, + "learning_rate": 9.933748340883293e-06, + "loss": 17.4445, + "step": 23450 + }, + { + "epoch": 0.42419215217396217, + "grad_norm": 41.84375, + "learning_rate": 9.933720088576633e-06, + "loss": 16.9229, + "step": 23460 + }, + { + "epoch": 0.42437296724309004, + "grad_norm": 41.28125, + "learning_rate": 9.933691836269974e-06, + "loss": 17.0078, + "step": 23470 + }, + { + "epoch": 0.4245537823122179, + "grad_norm": 42.78125, + "learning_rate": 9.933663583963313e-06, + "loss": 16.8134, + "step": 23480 + }, + { + "epoch": 0.4247345973813458, + "grad_norm": 42.5, + "learning_rate": 9.933635331656653e-06, + "loss": 16.873, + "step": 23490 + }, + { + "epoch": 0.4249154124504736, + "grad_norm": 41.84375, + "learning_rate": 9.933607079349994e-06, + "loss": 16.9522, + "step": 23500 + }, + { + "epoch": 0.42509622751960147, + "grad_norm": 42.78125, + "learning_rate": 9.933578827043335e-06, + "loss": 17.1385, + "step": 23510 + }, + { + "epoch": 0.42527704258872934, + "grad_norm": 42.4375, + "learning_rate": 9.933550574736675e-06, + "loss": 17.2821, + "step": 23520 + }, + { + "epoch": 0.4254578576578572, + "grad_norm": 41.09375, + "learning_rate": 9.933522322430016e-06, + "loss": 17.0284, + "step": 23530 + }, + { + "epoch": 0.4256386727269851, + "grad_norm": 40.875, + "learning_rate": 9.933494070123357e-06, + "loss": 17.3555, + "step": 23540 + }, + { + "epoch": 0.42581948779611295, + "grad_norm": 44.1875, + "learning_rate": 9.933465817816697e-06, + "loss": 16.8159, + "step": 23550 + }, + { + "epoch": 0.42600030286524077, + "grad_norm": 41.6875, + "learning_rate": 9.933437565510036e-06, + "loss": 17.1709, + "step": 23560 + }, + { + "epoch": 0.42618111793436864, + "grad_norm": 39.78125, + "learning_rate": 9.933409313203377e-06, + "loss": 17.1132, + "step": 23570 + }, + { + "epoch": 0.4263619330034965, + "grad_norm": 40.875, + "learning_rate": 9.933381060896717e-06, + "loss": 16.997, + "step": 23580 + }, + { + "epoch": 0.4265427480726244, + "grad_norm": 42.53125, + "learning_rate": 9.933352808590058e-06, + "loss": 17.2656, + "step": 23590 + }, + { + "epoch": 0.42672356314175225, + "grad_norm": 38.875, + "learning_rate": 9.933324556283399e-06, + "loss": 17.3424, + "step": 23600 + }, + { + "epoch": 0.4269043782108801, + "grad_norm": 42.9375, + "learning_rate": 9.93329630397674e-06, + "loss": 17.2155, + "step": 23610 + }, + { + "epoch": 0.42708519328000794, + "grad_norm": 40.1875, + "learning_rate": 9.93326805167008e-06, + "loss": 17.0139, + "step": 23620 + }, + { + "epoch": 0.4272660083491358, + "grad_norm": 42.125, + "learning_rate": 9.93323979936342e-06, + "loss": 17.278, + "step": 23630 + }, + { + "epoch": 0.4274468234182637, + "grad_norm": 40.53125, + "learning_rate": 9.933211547056761e-06, + "loss": 17.0852, + "step": 23640 + }, + { + "epoch": 0.42762763848739155, + "grad_norm": 40.28125, + "learning_rate": 9.9331832947501e-06, + "loss": 17.0743, + "step": 23650 + }, + { + "epoch": 0.4278084535565194, + "grad_norm": 41.9375, + "learning_rate": 9.93315504244344e-06, + "loss": 17.3539, + "step": 23660 + }, + { + "epoch": 0.42798926862564723, + "grad_norm": 39.5, + "learning_rate": 9.933126790136781e-06, + "loss": 16.9718, + "step": 23670 + }, + { + "epoch": 0.4281700836947751, + "grad_norm": 44.4375, + "learning_rate": 9.933098537830122e-06, + "loss": 17.266, + "step": 23680 + }, + { + "epoch": 0.428350898763903, + "grad_norm": 42.03125, + "learning_rate": 9.933070285523462e-06, + "loss": 17.0662, + "step": 23690 + }, + { + "epoch": 0.42853171383303085, + "grad_norm": 46.09375, + "learning_rate": 9.933042033216803e-06, + "loss": 16.9782, + "step": 23700 + }, + { + "epoch": 0.4287125289021587, + "grad_norm": 40.625, + "learning_rate": 9.933013780910144e-06, + "loss": 17.1737, + "step": 23710 + }, + { + "epoch": 0.4288933439712866, + "grad_norm": 44.09375, + "learning_rate": 9.932985528603484e-06, + "loss": 17.0811, + "step": 23720 + }, + { + "epoch": 0.4290741590404144, + "grad_norm": 41.78125, + "learning_rate": 9.932957276296823e-06, + "loss": 16.8908, + "step": 23730 + }, + { + "epoch": 0.4292549741095423, + "grad_norm": 39.34375, + "learning_rate": 9.932929023990164e-06, + "loss": 17.0634, + "step": 23740 + }, + { + "epoch": 0.42943578917867015, + "grad_norm": 39.28125, + "learning_rate": 9.932900771683505e-06, + "loss": 17.1466, + "step": 23750 + }, + { + "epoch": 0.429616604247798, + "grad_norm": 41.53125, + "learning_rate": 9.932872519376845e-06, + "loss": 16.7681, + "step": 23760 + }, + { + "epoch": 0.4297974193169259, + "grad_norm": 38.34375, + "learning_rate": 9.932844267070186e-06, + "loss": 17.4063, + "step": 23770 + }, + { + "epoch": 0.42997823438605376, + "grad_norm": 42.46875, + "learning_rate": 9.932816014763526e-06, + "loss": 17.1721, + "step": 23780 + }, + { + "epoch": 0.4301590494551816, + "grad_norm": 41.65625, + "learning_rate": 9.932787762456867e-06, + "loss": 17.095, + "step": 23790 + }, + { + "epoch": 0.43033986452430945, + "grad_norm": 40.9375, + "learning_rate": 9.932759510150208e-06, + "loss": 17.3986, + "step": 23800 + }, + { + "epoch": 0.4305206795934373, + "grad_norm": 44.25, + "learning_rate": 9.932731257843548e-06, + "loss": 17.5244, + "step": 23810 + }, + { + "epoch": 0.4307014946625652, + "grad_norm": 41.65625, + "learning_rate": 9.932703005536887e-06, + "loss": 17.2414, + "step": 23820 + }, + { + "epoch": 0.43088230973169306, + "grad_norm": 41.90625, + "learning_rate": 9.932674753230228e-06, + "loss": 16.83, + "step": 23830 + }, + { + "epoch": 0.4310631248008209, + "grad_norm": 38.875, + "learning_rate": 9.932646500923568e-06, + "loss": 17.2673, + "step": 23840 + }, + { + "epoch": 0.43124393986994874, + "grad_norm": 41.40625, + "learning_rate": 9.932618248616909e-06, + "loss": 17.1928, + "step": 23850 + }, + { + "epoch": 0.4314247549390766, + "grad_norm": 40.15625, + "learning_rate": 9.93258999631025e-06, + "loss": 17.1712, + "step": 23860 + }, + { + "epoch": 0.4316055700082045, + "grad_norm": 40.21875, + "learning_rate": 9.93256174400359e-06, + "loss": 16.9979, + "step": 23870 + }, + { + "epoch": 0.43178638507733236, + "grad_norm": 40.78125, + "learning_rate": 9.932533491696931e-06, + "loss": 17.0087, + "step": 23880 + }, + { + "epoch": 0.43196720014646023, + "grad_norm": 40.1875, + "learning_rate": 9.932505239390272e-06, + "loss": 16.8195, + "step": 23890 + }, + { + "epoch": 0.43214801521558804, + "grad_norm": 38.15625, + "learning_rate": 9.932476987083612e-06, + "loss": 16.6816, + "step": 23900 + }, + { + "epoch": 0.4323288302847159, + "grad_norm": 39.8125, + "learning_rate": 9.932448734776951e-06, + "loss": 16.6358, + "step": 23910 + }, + { + "epoch": 0.4325096453538438, + "grad_norm": 40.84375, + "learning_rate": 9.932420482470292e-06, + "loss": 16.9947, + "step": 23920 + }, + { + "epoch": 0.43269046042297166, + "grad_norm": 39.96875, + "learning_rate": 9.932392230163632e-06, + "loss": 17.3373, + "step": 23930 + }, + { + "epoch": 0.4328712754920995, + "grad_norm": 41.09375, + "learning_rate": 9.932363977856973e-06, + "loss": 17.1195, + "step": 23940 + }, + { + "epoch": 0.4330520905612274, + "grad_norm": 38.96875, + "learning_rate": 9.932335725550314e-06, + "loss": 17.2644, + "step": 23950 + }, + { + "epoch": 0.4332329056303552, + "grad_norm": 40.5, + "learning_rate": 9.932307473243654e-06, + "loss": 16.8849, + "step": 23960 + }, + { + "epoch": 0.4334137206994831, + "grad_norm": 41.40625, + "learning_rate": 9.932279220936995e-06, + "loss": 17.2072, + "step": 23970 + }, + { + "epoch": 0.43359453576861096, + "grad_norm": 40.3125, + "learning_rate": 9.932250968630335e-06, + "loss": 16.8773, + "step": 23980 + }, + { + "epoch": 0.4337753508377388, + "grad_norm": 42.375, + "learning_rate": 9.932222716323674e-06, + "loss": 17.1238, + "step": 23990 + }, + { + "epoch": 0.4339561659068667, + "grad_norm": 41.5625, + "learning_rate": 9.932194464017015e-06, + "loss": 16.9992, + "step": 24000 + }, + { + "epoch": 0.4341369809759945, + "grad_norm": 39.96875, + "learning_rate": 9.932166211710356e-06, + "loss": 17.4453, + "step": 24010 + }, + { + "epoch": 0.4343177960451224, + "grad_norm": 39.875, + "learning_rate": 9.932137959403696e-06, + "loss": 16.7997, + "step": 24020 + }, + { + "epoch": 0.43449861111425025, + "grad_norm": 39.28125, + "learning_rate": 9.932109707097037e-06, + "loss": 17.2229, + "step": 24030 + }, + { + "epoch": 0.4346794261833781, + "grad_norm": 42.6875, + "learning_rate": 9.932081454790377e-06, + "loss": 17.3416, + "step": 24040 + }, + { + "epoch": 0.434860241252506, + "grad_norm": 41.8125, + "learning_rate": 9.932053202483718e-06, + "loss": 17.0329, + "step": 24050 + }, + { + "epoch": 0.43504105632163387, + "grad_norm": 41.96875, + "learning_rate": 9.932024950177059e-06, + "loss": 16.8602, + "step": 24060 + }, + { + "epoch": 0.4352218713907617, + "grad_norm": 41.71875, + "learning_rate": 9.9319966978704e-06, + "loss": 17.5934, + "step": 24070 + }, + { + "epoch": 0.43540268645988955, + "grad_norm": 41.4375, + "learning_rate": 9.931968445563738e-06, + "loss": 16.8142, + "step": 24080 + }, + { + "epoch": 0.4355835015290174, + "grad_norm": 42.09375, + "learning_rate": 9.931940193257079e-06, + "loss": 17.3197, + "step": 24090 + }, + { + "epoch": 0.4357643165981453, + "grad_norm": 41.5625, + "learning_rate": 9.93191194095042e-06, + "loss": 17.4021, + "step": 24100 + }, + { + "epoch": 0.43594513166727317, + "grad_norm": 41.875, + "learning_rate": 9.93188368864376e-06, + "loss": 16.9198, + "step": 24110 + }, + { + "epoch": 0.43612594673640104, + "grad_norm": 40.34375, + "learning_rate": 9.9318554363371e-06, + "loss": 17.3164, + "step": 24120 + }, + { + "epoch": 0.43630676180552885, + "grad_norm": 42.25, + "learning_rate": 9.931827184030441e-06, + "loss": 17.3602, + "step": 24130 + }, + { + "epoch": 0.4364875768746567, + "grad_norm": 43.96875, + "learning_rate": 9.931798931723782e-06, + "loss": 17.1367, + "step": 24140 + }, + { + "epoch": 0.4366683919437846, + "grad_norm": 40.96875, + "learning_rate": 9.931770679417123e-06, + "loss": 17.2549, + "step": 24150 + }, + { + "epoch": 0.43684920701291247, + "grad_norm": 40.8125, + "learning_rate": 9.931742427110462e-06, + "loss": 17.0768, + "step": 24160 + }, + { + "epoch": 0.43703002208204034, + "grad_norm": 39.5625, + "learning_rate": 9.931714174803802e-06, + "loss": 16.9547, + "step": 24170 + }, + { + "epoch": 0.43721083715116815, + "grad_norm": 42.15625, + "learning_rate": 9.931685922497143e-06, + "loss": 16.8944, + "step": 24180 + }, + { + "epoch": 0.437391652220296, + "grad_norm": 38.4375, + "learning_rate": 9.931657670190483e-06, + "loss": 17.1201, + "step": 24190 + }, + { + "epoch": 0.4375724672894239, + "grad_norm": 40.71875, + "learning_rate": 9.931629417883824e-06, + "loss": 16.9747, + "step": 24200 + }, + { + "epoch": 0.43775328235855177, + "grad_norm": 42.3125, + "learning_rate": 9.931601165577163e-06, + "loss": 16.6857, + "step": 24210 + }, + { + "epoch": 0.43793409742767964, + "grad_norm": 39.21875, + "learning_rate": 9.931572913270505e-06, + "loss": 17.3643, + "step": 24220 + }, + { + "epoch": 0.4381149124968075, + "grad_norm": 43.375, + "learning_rate": 9.931544660963846e-06, + "loss": 17.2106, + "step": 24230 + }, + { + "epoch": 0.4382957275659353, + "grad_norm": 38.8125, + "learning_rate": 9.931516408657187e-06, + "loss": 16.833, + "step": 24240 + }, + { + "epoch": 0.4384765426350632, + "grad_norm": 43.90625, + "learning_rate": 9.931488156350525e-06, + "loss": 17.0473, + "step": 24250 + }, + { + "epoch": 0.43865735770419106, + "grad_norm": 40.3125, + "learning_rate": 9.931459904043866e-06, + "loss": 16.9058, + "step": 24260 + }, + { + "epoch": 0.43883817277331894, + "grad_norm": 42.65625, + "learning_rate": 9.931431651737207e-06, + "loss": 16.9209, + "step": 24270 + }, + { + "epoch": 0.4390189878424468, + "grad_norm": 43.28125, + "learning_rate": 9.931403399430547e-06, + "loss": 17.0468, + "step": 24280 + }, + { + "epoch": 0.4391998029115747, + "grad_norm": 40.25, + "learning_rate": 9.931375147123888e-06, + "loss": 17.075, + "step": 24290 + }, + { + "epoch": 0.4393806179807025, + "grad_norm": 39.25, + "learning_rate": 9.931346894817229e-06, + "loss": 16.9849, + "step": 24300 + }, + { + "epoch": 0.43956143304983036, + "grad_norm": 39.53125, + "learning_rate": 9.93131864251057e-06, + "loss": 17.4202, + "step": 24310 + }, + { + "epoch": 0.43974224811895823, + "grad_norm": 42.21875, + "learning_rate": 9.93129039020391e-06, + "loss": 17.1817, + "step": 24320 + }, + { + "epoch": 0.4399230631880861, + "grad_norm": 40.59375, + "learning_rate": 9.931262137897249e-06, + "loss": 17.1151, + "step": 24330 + }, + { + "epoch": 0.440103878257214, + "grad_norm": 38.375, + "learning_rate": 9.93123388559059e-06, + "loss": 16.8332, + "step": 24340 + }, + { + "epoch": 0.4402846933263418, + "grad_norm": 42.5625, + "learning_rate": 9.93120563328393e-06, + "loss": 17.2657, + "step": 24350 + }, + { + "epoch": 0.44046550839546966, + "grad_norm": 45.21875, + "learning_rate": 9.93117738097727e-06, + "loss": 17.5942, + "step": 24360 + }, + { + "epoch": 0.44064632346459753, + "grad_norm": 39.375, + "learning_rate": 9.931149128670611e-06, + "loss": 16.6795, + "step": 24370 + }, + { + "epoch": 0.4408271385337254, + "grad_norm": 42.25, + "learning_rate": 9.931120876363952e-06, + "loss": 16.5306, + "step": 24380 + }, + { + "epoch": 0.4410079536028533, + "grad_norm": 39.78125, + "learning_rate": 9.931092624057292e-06, + "loss": 16.996, + "step": 24390 + }, + { + "epoch": 0.44118876867198115, + "grad_norm": 39.53125, + "learning_rate": 9.931064371750633e-06, + "loss": 17.0845, + "step": 24400 + }, + { + "epoch": 0.44136958374110896, + "grad_norm": 40.5, + "learning_rate": 9.931036119443974e-06, + "loss": 16.9601, + "step": 24410 + }, + { + "epoch": 0.44155039881023683, + "grad_norm": 41.78125, + "learning_rate": 9.931007867137313e-06, + "loss": 17.1927, + "step": 24420 + }, + { + "epoch": 0.4417312138793647, + "grad_norm": 43.84375, + "learning_rate": 9.930979614830653e-06, + "loss": 17.0799, + "step": 24430 + }, + { + "epoch": 0.4419120289484926, + "grad_norm": 39.3125, + "learning_rate": 9.930951362523994e-06, + "loss": 17.2743, + "step": 24440 + }, + { + "epoch": 0.44209284401762045, + "grad_norm": 40.6875, + "learning_rate": 9.930923110217335e-06, + "loss": 17.1115, + "step": 24450 + }, + { + "epoch": 0.4422736590867483, + "grad_norm": 42.09375, + "learning_rate": 9.930894857910675e-06, + "loss": 17.0342, + "step": 24460 + }, + { + "epoch": 0.44245447415587613, + "grad_norm": 39.78125, + "learning_rate": 9.930866605604014e-06, + "loss": 17.1333, + "step": 24470 + }, + { + "epoch": 0.442635289225004, + "grad_norm": 41.8125, + "learning_rate": 9.930838353297356e-06, + "loss": 17.1999, + "step": 24480 + }, + { + "epoch": 0.4428161042941319, + "grad_norm": 38.75, + "learning_rate": 9.930810100990697e-06, + "loss": 17.1589, + "step": 24490 + }, + { + "epoch": 0.44299691936325974, + "grad_norm": 43.4375, + "learning_rate": 9.930781848684038e-06, + "loss": 17.0211, + "step": 24500 + }, + { + "epoch": 0.4431777344323876, + "grad_norm": 40.84375, + "learning_rate": 9.930753596377377e-06, + "loss": 16.4922, + "step": 24510 + }, + { + "epoch": 0.44335854950151543, + "grad_norm": 39.8125, + "learning_rate": 9.930725344070717e-06, + "loss": 16.8577, + "step": 24520 + }, + { + "epoch": 0.4435393645706433, + "grad_norm": 38.875, + "learning_rate": 9.930697091764058e-06, + "loss": 17.4463, + "step": 24530 + }, + { + "epoch": 0.4437201796397712, + "grad_norm": 41.09375, + "learning_rate": 9.930668839457398e-06, + "loss": 17.0105, + "step": 24540 + }, + { + "epoch": 0.44390099470889904, + "grad_norm": 38.8125, + "learning_rate": 9.930640587150739e-06, + "loss": 17.0146, + "step": 24550 + }, + { + "epoch": 0.4440818097780269, + "grad_norm": 39.25, + "learning_rate": 9.930612334844078e-06, + "loss": 17.0366, + "step": 24560 + }, + { + "epoch": 0.4442626248471548, + "grad_norm": 42.21875, + "learning_rate": 9.93058408253742e-06, + "loss": 16.7516, + "step": 24570 + }, + { + "epoch": 0.4444434399162826, + "grad_norm": 40.125, + "learning_rate": 9.930555830230761e-06, + "loss": 16.9998, + "step": 24580 + }, + { + "epoch": 0.44462425498541047, + "grad_norm": 39.5625, + "learning_rate": 9.9305275779241e-06, + "loss": 16.992, + "step": 24590 + }, + { + "epoch": 0.44480507005453834, + "grad_norm": 42.34375, + "learning_rate": 9.93049932561744e-06, + "loss": 17.2084, + "step": 24600 + }, + { + "epoch": 0.4449858851236662, + "grad_norm": 41.15625, + "learning_rate": 9.930471073310781e-06, + "loss": 17.1132, + "step": 24610 + }, + { + "epoch": 0.4451667001927941, + "grad_norm": 40.78125, + "learning_rate": 9.930442821004122e-06, + "loss": 17.2638, + "step": 24620 + }, + { + "epoch": 0.44534751526192196, + "grad_norm": 42.09375, + "learning_rate": 9.930414568697462e-06, + "loss": 17.0395, + "step": 24630 + }, + { + "epoch": 0.44552833033104977, + "grad_norm": 40.0, + "learning_rate": 9.930386316390801e-06, + "loss": 17.0863, + "step": 24640 + }, + { + "epoch": 0.44570914540017764, + "grad_norm": 43.78125, + "learning_rate": 9.930358064084144e-06, + "loss": 16.9601, + "step": 24650 + }, + { + "epoch": 0.4458899604693055, + "grad_norm": 43.9375, + "learning_rate": 9.930329811777484e-06, + "loss": 17.36, + "step": 24660 + }, + { + "epoch": 0.4460707755384334, + "grad_norm": 41.6875, + "learning_rate": 9.930301559470825e-06, + "loss": 17.5581, + "step": 24670 + }, + { + "epoch": 0.44625159060756125, + "grad_norm": 39.40625, + "learning_rate": 9.930273307164164e-06, + "loss": 17.0764, + "step": 24680 + }, + { + "epoch": 0.44643240567668907, + "grad_norm": 37.71875, + "learning_rate": 9.930245054857504e-06, + "loss": 17.2964, + "step": 24690 + }, + { + "epoch": 0.44661322074581694, + "grad_norm": 41.09375, + "learning_rate": 9.930216802550845e-06, + "loss": 16.8406, + "step": 24700 + }, + { + "epoch": 0.4467940358149448, + "grad_norm": 41.34375, + "learning_rate": 9.930188550244186e-06, + "loss": 16.8859, + "step": 24710 + }, + { + "epoch": 0.4469748508840727, + "grad_norm": 39.375, + "learning_rate": 9.930160297937526e-06, + "loss": 17.282, + "step": 24720 + }, + { + "epoch": 0.44715566595320055, + "grad_norm": 41.0625, + "learning_rate": 9.930132045630865e-06, + "loss": 16.5238, + "step": 24730 + }, + { + "epoch": 0.4473364810223284, + "grad_norm": 40.0625, + "learning_rate": 9.930103793324207e-06, + "loss": 17.1698, + "step": 24740 + }, + { + "epoch": 0.44751729609145624, + "grad_norm": 38.5625, + "learning_rate": 9.930075541017548e-06, + "loss": 17.1585, + "step": 24750 + }, + { + "epoch": 0.4476981111605841, + "grad_norm": 41.46875, + "learning_rate": 9.930047288710887e-06, + "loss": 17.5576, + "step": 24760 + }, + { + "epoch": 0.447878926229712, + "grad_norm": 39.875, + "learning_rate": 9.930019036404228e-06, + "loss": 17.3201, + "step": 24770 + }, + { + "epoch": 0.44805974129883985, + "grad_norm": 43.0625, + "learning_rate": 9.929990784097568e-06, + "loss": 16.9252, + "step": 24780 + }, + { + "epoch": 0.4482405563679677, + "grad_norm": 42.125, + "learning_rate": 9.929962531790909e-06, + "loss": 17.2519, + "step": 24790 + }, + { + "epoch": 0.4484213714370956, + "grad_norm": 38.40625, + "learning_rate": 9.92993427948425e-06, + "loss": 17.0709, + "step": 24800 + }, + { + "epoch": 0.4486021865062234, + "grad_norm": 42.59375, + "learning_rate": 9.92990602717759e-06, + "loss": 17.0579, + "step": 24810 + }, + { + "epoch": 0.4487830015753513, + "grad_norm": 44.40625, + "learning_rate": 9.929877774870929e-06, + "loss": 16.8308, + "step": 24820 + }, + { + "epoch": 0.44896381664447915, + "grad_norm": 39.15625, + "learning_rate": 9.929849522564271e-06, + "loss": 17.0908, + "step": 24830 + }, + { + "epoch": 0.449144631713607, + "grad_norm": 40.53125, + "learning_rate": 9.929821270257612e-06, + "loss": 16.7871, + "step": 24840 + }, + { + "epoch": 0.4493254467827349, + "grad_norm": 43.03125, + "learning_rate": 9.929793017950951e-06, + "loss": 17.3479, + "step": 24850 + }, + { + "epoch": 0.4495062618518627, + "grad_norm": 40.65625, + "learning_rate": 9.929764765644292e-06, + "loss": 17.52, + "step": 24860 + }, + { + "epoch": 0.4496870769209906, + "grad_norm": 41.875, + "learning_rate": 9.929736513337632e-06, + "loss": 17.0901, + "step": 24870 + }, + { + "epoch": 0.44986789199011845, + "grad_norm": 42.40625, + "learning_rate": 9.929708261030973e-06, + "loss": 17.2132, + "step": 24880 + }, + { + "epoch": 0.4500487070592463, + "grad_norm": 39.78125, + "learning_rate": 9.929680008724313e-06, + "loss": 17.1365, + "step": 24890 + }, + { + "epoch": 0.4502295221283742, + "grad_norm": 38.59375, + "learning_rate": 9.929651756417652e-06, + "loss": 17.0028, + "step": 24900 + }, + { + "epoch": 0.45041033719750206, + "grad_norm": 42.96875, + "learning_rate": 9.929623504110993e-06, + "loss": 17.0601, + "step": 24910 + }, + { + "epoch": 0.4505911522666299, + "grad_norm": 39.6875, + "learning_rate": 9.929595251804335e-06, + "loss": 16.7189, + "step": 24920 + }, + { + "epoch": 0.45077196733575775, + "grad_norm": 39.0, + "learning_rate": 9.929566999497676e-06, + "loss": 16.7454, + "step": 24930 + }, + { + "epoch": 0.4509527824048856, + "grad_norm": 40.4375, + "learning_rate": 9.929538747191015e-06, + "loss": 17.6337, + "step": 24940 + }, + { + "epoch": 0.4511335974740135, + "grad_norm": 42.34375, + "learning_rate": 9.929510494884355e-06, + "loss": 16.8392, + "step": 24950 + }, + { + "epoch": 0.45131441254314136, + "grad_norm": 43.78125, + "learning_rate": 9.929482242577696e-06, + "loss": 16.8768, + "step": 24960 + }, + { + "epoch": 0.4514952276122692, + "grad_norm": 39.84375, + "learning_rate": 9.929453990271037e-06, + "loss": 17.0484, + "step": 24970 + }, + { + "epoch": 0.45167604268139705, + "grad_norm": 41.0625, + "learning_rate": 9.929425737964377e-06, + "loss": 17.2024, + "step": 24980 + }, + { + "epoch": 0.4518568577505249, + "grad_norm": 39.90625, + "learning_rate": 9.929397485657716e-06, + "loss": 17.5434, + "step": 24990 + }, + { + "epoch": 0.4520376728196528, + "grad_norm": 40.40625, + "learning_rate": 9.929369233351059e-06, + "loss": 16.9544, + "step": 25000 + }, + { + "epoch": 0.4520376728196528, + "eval_loss": 2.1385836601257324, + "eval_runtime": 229.8952, + "eval_samples_per_second": 3158.217, + "eval_steps_per_second": 49.349, + "step": 25000 + }, + { + "epoch": 0.45221848788878066, + "grad_norm": 42.25, + "learning_rate": 9.9293409810444e-06, + "loss": 17.2312, + "step": 25010 + }, + { + "epoch": 0.45239930295790853, + "grad_norm": 43.40625, + "learning_rate": 9.929312728737738e-06, + "loss": 16.9233, + "step": 25020 + }, + { + "epoch": 0.45258011802703635, + "grad_norm": 42.34375, + "learning_rate": 9.929284476431079e-06, + "loss": 17.1353, + "step": 25030 + }, + { + "epoch": 0.4527609330961642, + "grad_norm": 41.40625, + "learning_rate": 9.92925622412442e-06, + "loss": 17.2442, + "step": 25040 + }, + { + "epoch": 0.4529417481652921, + "grad_norm": 42.09375, + "learning_rate": 9.92922797181776e-06, + "loss": 17.064, + "step": 25050 + }, + { + "epoch": 0.45312256323441996, + "grad_norm": 39.34375, + "learning_rate": 9.9291997195111e-06, + "loss": 17.032, + "step": 25060 + }, + { + "epoch": 0.45330337830354783, + "grad_norm": 43.34375, + "learning_rate": 9.92917146720444e-06, + "loss": 17.1714, + "step": 25070 + }, + { + "epoch": 0.4534841933726757, + "grad_norm": 42.59375, + "learning_rate": 9.92914321489778e-06, + "loss": 17.0163, + "step": 25080 + }, + { + "epoch": 0.4536650084418035, + "grad_norm": 41.78125, + "learning_rate": 9.929114962591122e-06, + "loss": 17.1397, + "step": 25090 + }, + { + "epoch": 0.4538458235109314, + "grad_norm": 41.6875, + "learning_rate": 9.929086710284463e-06, + "loss": 16.8583, + "step": 25100 + }, + { + "epoch": 0.45402663858005926, + "grad_norm": 41.0625, + "learning_rate": 9.929058457977802e-06, + "loss": 16.8965, + "step": 25110 + }, + { + "epoch": 0.45420745364918713, + "grad_norm": 42.09375, + "learning_rate": 9.929030205671143e-06, + "loss": 17.3544, + "step": 25120 + }, + { + "epoch": 0.454388268718315, + "grad_norm": 38.84375, + "learning_rate": 9.929001953364483e-06, + "loss": 17.2578, + "step": 25130 + }, + { + "epoch": 0.4545690837874428, + "grad_norm": 38.90625, + "learning_rate": 9.928973701057824e-06, + "loss": 17.2073, + "step": 25140 + }, + { + "epoch": 0.4547498988565707, + "grad_norm": 40.8125, + "learning_rate": 9.928945448751165e-06, + "loss": 16.996, + "step": 25150 + }, + { + "epoch": 0.45493071392569856, + "grad_norm": 39.5625, + "learning_rate": 9.928917196444503e-06, + "loss": 17.1514, + "step": 25160 + }, + { + "epoch": 0.45511152899482643, + "grad_norm": 39.5625, + "learning_rate": 9.928888944137844e-06, + "loss": 17.1125, + "step": 25170 + }, + { + "epoch": 0.4552923440639543, + "grad_norm": 40.34375, + "learning_rate": 9.928860691831186e-06, + "loss": 16.7267, + "step": 25180 + }, + { + "epoch": 0.4554731591330822, + "grad_norm": 40.75, + "learning_rate": 9.928832439524525e-06, + "loss": 17.3858, + "step": 25190 + }, + { + "epoch": 0.45565397420221, + "grad_norm": 41.125, + "learning_rate": 9.928804187217866e-06, + "loss": 17.0973, + "step": 25200 + }, + { + "epoch": 0.45583478927133786, + "grad_norm": 41.25, + "learning_rate": 9.928775934911207e-06, + "loss": 16.7575, + "step": 25210 + }, + { + "epoch": 0.45601560434046573, + "grad_norm": 41.28125, + "learning_rate": 9.928747682604547e-06, + "loss": 16.9319, + "step": 25220 + }, + { + "epoch": 0.4561964194095936, + "grad_norm": 40.0, + "learning_rate": 9.928719430297888e-06, + "loss": 16.9408, + "step": 25230 + }, + { + "epoch": 0.45637723447872147, + "grad_norm": 40.8125, + "learning_rate": 9.928691177991228e-06, + "loss": 17.4637, + "step": 25240 + }, + { + "epoch": 0.45655804954784934, + "grad_norm": 38.78125, + "learning_rate": 9.928662925684567e-06, + "loss": 17.4776, + "step": 25250 + }, + { + "epoch": 0.45673886461697716, + "grad_norm": 41.125, + "learning_rate": 9.928634673377908e-06, + "loss": 16.9808, + "step": 25260 + }, + { + "epoch": 0.45691967968610503, + "grad_norm": 41.40625, + "learning_rate": 9.92860642107125e-06, + "loss": 16.7948, + "step": 25270 + }, + { + "epoch": 0.4571004947552329, + "grad_norm": 41.59375, + "learning_rate": 9.92857816876459e-06, + "loss": 17.1491, + "step": 25280 + }, + { + "epoch": 0.45728130982436077, + "grad_norm": 41.78125, + "learning_rate": 9.92854991645793e-06, + "loss": 17.0664, + "step": 25290 + }, + { + "epoch": 0.45746212489348864, + "grad_norm": 43.46875, + "learning_rate": 9.92852166415127e-06, + "loss": 17.0108, + "step": 25300 + }, + { + "epoch": 0.45764293996261646, + "grad_norm": 43.03125, + "learning_rate": 9.928493411844611e-06, + "loss": 16.851, + "step": 25310 + }, + { + "epoch": 0.45782375503174433, + "grad_norm": 41.34375, + "learning_rate": 9.928465159537952e-06, + "loss": 16.9911, + "step": 25320 + }, + { + "epoch": 0.4580045701008722, + "grad_norm": 39.1875, + "learning_rate": 9.92843690723129e-06, + "loss": 17.1102, + "step": 25330 + }, + { + "epoch": 0.45818538517000007, + "grad_norm": 39.75, + "learning_rate": 9.928408654924631e-06, + "loss": 16.7065, + "step": 25340 + }, + { + "epoch": 0.45836620023912794, + "grad_norm": 41.59375, + "learning_rate": 9.928380402617974e-06, + "loss": 17.1513, + "step": 25350 + }, + { + "epoch": 0.4585470153082558, + "grad_norm": 39.46875, + "learning_rate": 9.928352150311314e-06, + "loss": 17.343, + "step": 25360 + }, + { + "epoch": 0.4587278303773836, + "grad_norm": 44.4375, + "learning_rate": 9.928323898004653e-06, + "loss": 16.954, + "step": 25370 + }, + { + "epoch": 0.4589086454465115, + "grad_norm": 39.6875, + "learning_rate": 9.928295645697994e-06, + "loss": 16.6668, + "step": 25380 + }, + { + "epoch": 0.45908946051563937, + "grad_norm": 41.0625, + "learning_rate": 9.928267393391334e-06, + "loss": 17.0833, + "step": 25390 + }, + { + "epoch": 0.45927027558476724, + "grad_norm": 42.625, + "learning_rate": 9.928239141084675e-06, + "loss": 17.1839, + "step": 25400 + }, + { + "epoch": 0.4594510906538951, + "grad_norm": 42.90625, + "learning_rate": 9.928210888778016e-06, + "loss": 17.6705, + "step": 25410 + }, + { + "epoch": 0.459631905723023, + "grad_norm": 43.0625, + "learning_rate": 9.928182636471355e-06, + "loss": 17.2035, + "step": 25420 + }, + { + "epoch": 0.4598127207921508, + "grad_norm": 39.96875, + "learning_rate": 9.928154384164695e-06, + "loss": 17.355, + "step": 25430 + }, + { + "epoch": 0.45999353586127867, + "grad_norm": 42.0, + "learning_rate": 9.928126131858037e-06, + "loss": 17.029, + "step": 25440 + }, + { + "epoch": 0.46017435093040654, + "grad_norm": 45.03125, + "learning_rate": 9.928097879551376e-06, + "loss": 16.6355, + "step": 25450 + }, + { + "epoch": 0.4603551659995344, + "grad_norm": 42.40625, + "learning_rate": 9.928069627244717e-06, + "loss": 17.3171, + "step": 25460 + }, + { + "epoch": 0.4605359810686623, + "grad_norm": 41.8125, + "learning_rate": 9.928041374938058e-06, + "loss": 16.8692, + "step": 25470 + }, + { + "epoch": 0.4607167961377901, + "grad_norm": 41.21875, + "learning_rate": 9.928013122631398e-06, + "loss": 17.0069, + "step": 25480 + }, + { + "epoch": 0.46089761120691797, + "grad_norm": 43.25, + "learning_rate": 9.927984870324739e-06, + "loss": 17.3876, + "step": 25490 + }, + { + "epoch": 0.46107842627604584, + "grad_norm": 42.375, + "learning_rate": 9.927956618018078e-06, + "loss": 16.6849, + "step": 25500 + }, + { + "epoch": 0.4612592413451737, + "grad_norm": 43.125, + "learning_rate": 9.927928365711418e-06, + "loss": 17.0812, + "step": 25510 + }, + { + "epoch": 0.4614400564143016, + "grad_norm": 41.09375, + "learning_rate": 9.927900113404759e-06, + "loss": 17.2766, + "step": 25520 + }, + { + "epoch": 0.46162087148342945, + "grad_norm": 41.9375, + "learning_rate": 9.927871861098101e-06, + "loss": 17.084, + "step": 25530 + }, + { + "epoch": 0.46180168655255727, + "grad_norm": 38.625, + "learning_rate": 9.92784360879144e-06, + "loss": 17.2581, + "step": 25540 + }, + { + "epoch": 0.46198250162168514, + "grad_norm": 41.28125, + "learning_rate": 9.927815356484781e-06, + "loss": 17.2412, + "step": 25550 + }, + { + "epoch": 0.462163316690813, + "grad_norm": 41.28125, + "learning_rate": 9.927787104178122e-06, + "loss": 16.8625, + "step": 25560 + }, + { + "epoch": 0.4623441317599409, + "grad_norm": 40.6875, + "learning_rate": 9.927758851871462e-06, + "loss": 17.4807, + "step": 25570 + }, + { + "epoch": 0.46252494682906875, + "grad_norm": 43.40625, + "learning_rate": 9.927730599564803e-06, + "loss": 16.9297, + "step": 25580 + }, + { + "epoch": 0.4627057618981966, + "grad_norm": 42.5, + "learning_rate": 9.927702347258142e-06, + "loss": 17.1685, + "step": 25590 + }, + { + "epoch": 0.46288657696732444, + "grad_norm": 40.09375, + "learning_rate": 9.927674094951482e-06, + "loss": 17.0162, + "step": 25600 + }, + { + "epoch": 0.4630673920364523, + "grad_norm": 42.0625, + "learning_rate": 9.927645842644823e-06, + "loss": 17.2933, + "step": 25610 + }, + { + "epoch": 0.4632482071055802, + "grad_norm": 44.71875, + "learning_rate": 9.927617590338164e-06, + "loss": 16.9681, + "step": 25620 + }, + { + "epoch": 0.46342902217470805, + "grad_norm": 39.5625, + "learning_rate": 9.927589338031504e-06, + "loss": 17.1248, + "step": 25630 + }, + { + "epoch": 0.4636098372438359, + "grad_norm": 43.9375, + "learning_rate": 9.927561085724845e-06, + "loss": 17.0732, + "step": 25640 + }, + { + "epoch": 0.46379065231296374, + "grad_norm": 42.6875, + "learning_rate": 9.927532833418185e-06, + "loss": 17.0435, + "step": 25650 + }, + { + "epoch": 0.4639714673820916, + "grad_norm": 43.34375, + "learning_rate": 9.927504581111526e-06, + "loss": 17.0115, + "step": 25660 + }, + { + "epoch": 0.4641522824512195, + "grad_norm": 39.28125, + "learning_rate": 9.927476328804867e-06, + "loss": 16.9102, + "step": 25670 + }, + { + "epoch": 0.46433309752034735, + "grad_norm": 41.4375, + "learning_rate": 9.927448076498206e-06, + "loss": 16.9313, + "step": 25680 + }, + { + "epoch": 0.4645139125894752, + "grad_norm": 41.15625, + "learning_rate": 9.927419824191546e-06, + "loss": 17.2224, + "step": 25690 + }, + { + "epoch": 0.4646947276586031, + "grad_norm": 43.75, + "learning_rate": 9.927391571884889e-06, + "loss": 17.1843, + "step": 25700 + }, + { + "epoch": 0.4648755427277309, + "grad_norm": 39.625, + "learning_rate": 9.927363319578228e-06, + "loss": 16.6983, + "step": 25710 + }, + { + "epoch": 0.4650563577968588, + "grad_norm": 43.5, + "learning_rate": 9.927335067271568e-06, + "loss": 16.8113, + "step": 25720 + }, + { + "epoch": 0.46523717286598665, + "grad_norm": 42.0, + "learning_rate": 9.927306814964909e-06, + "loss": 16.901, + "step": 25730 + }, + { + "epoch": 0.4654179879351145, + "grad_norm": 39.875, + "learning_rate": 9.92727856265825e-06, + "loss": 17.2378, + "step": 25740 + }, + { + "epoch": 0.4655988030042424, + "grad_norm": 40.125, + "learning_rate": 9.92725031035159e-06, + "loss": 17.2241, + "step": 25750 + }, + { + "epoch": 0.46577961807337026, + "grad_norm": 40.5625, + "learning_rate": 9.927222058044929e-06, + "loss": 16.9758, + "step": 25760 + }, + { + "epoch": 0.4659604331424981, + "grad_norm": 38.53125, + "learning_rate": 9.92719380573827e-06, + "loss": 16.8353, + "step": 25770 + }, + { + "epoch": 0.46614124821162595, + "grad_norm": 40.625, + "learning_rate": 9.92716555343161e-06, + "loss": 16.9377, + "step": 25780 + }, + { + "epoch": 0.4663220632807538, + "grad_norm": 41.5625, + "learning_rate": 9.927137301124952e-06, + "loss": 17.0842, + "step": 25790 + }, + { + "epoch": 0.4665028783498817, + "grad_norm": 41.25, + "learning_rate": 9.927109048818291e-06, + "loss": 17.0646, + "step": 25800 + }, + { + "epoch": 0.46668369341900956, + "grad_norm": 42.59375, + "learning_rate": 9.927080796511632e-06, + "loss": 17.0465, + "step": 25810 + }, + { + "epoch": 0.4668645084881374, + "grad_norm": 38.1875, + "learning_rate": 9.927052544204973e-06, + "loss": 16.8757, + "step": 25820 + }, + { + "epoch": 0.46704532355726525, + "grad_norm": 41.34375, + "learning_rate": 9.927024291898313e-06, + "loss": 17.617, + "step": 25830 + }, + { + "epoch": 0.4672261386263931, + "grad_norm": 39.65625, + "learning_rate": 9.926996039591654e-06, + "loss": 16.8398, + "step": 25840 + }, + { + "epoch": 0.467406953695521, + "grad_norm": 42.875, + "learning_rate": 9.926967787284993e-06, + "loss": 16.779, + "step": 25850 + }, + { + "epoch": 0.46758776876464886, + "grad_norm": 41.75, + "learning_rate": 9.926939534978333e-06, + "loss": 16.9525, + "step": 25860 + }, + { + "epoch": 0.46776858383377673, + "grad_norm": 41.3125, + "learning_rate": 9.926911282671674e-06, + "loss": 16.375, + "step": 25870 + }, + { + "epoch": 0.46794939890290455, + "grad_norm": 40.71875, + "learning_rate": 9.926883030365015e-06, + "loss": 17.3035, + "step": 25880 + }, + { + "epoch": 0.4681302139720324, + "grad_norm": 41.9375, + "learning_rate": 9.926854778058355e-06, + "loss": 17.2107, + "step": 25890 + }, + { + "epoch": 0.4683110290411603, + "grad_norm": 45.1875, + "learning_rate": 9.926826525751696e-06, + "loss": 16.8853, + "step": 25900 + }, + { + "epoch": 0.46849184411028816, + "grad_norm": 41.125, + "learning_rate": 9.926798273445037e-06, + "loss": 16.858, + "step": 25910 + }, + { + "epoch": 0.46867265917941603, + "grad_norm": 40.21875, + "learning_rate": 9.926770021138377e-06, + "loss": 17.2101, + "step": 25920 + }, + { + "epoch": 0.4688534742485439, + "grad_norm": 37.15625, + "learning_rate": 9.926741768831716e-06, + "loss": 16.781, + "step": 25930 + }, + { + "epoch": 0.4690342893176717, + "grad_norm": 41.28125, + "learning_rate": 9.926713516525057e-06, + "loss": 16.9944, + "step": 25940 + }, + { + "epoch": 0.4692151043867996, + "grad_norm": 42.0625, + "learning_rate": 9.926685264218397e-06, + "loss": 17.245, + "step": 25950 + }, + { + "epoch": 0.46939591945592746, + "grad_norm": 42.6875, + "learning_rate": 9.926657011911738e-06, + "loss": 16.9312, + "step": 25960 + }, + { + "epoch": 0.46957673452505533, + "grad_norm": 42.28125, + "learning_rate": 9.926628759605079e-06, + "loss": 17.4238, + "step": 25970 + }, + { + "epoch": 0.4697575495941832, + "grad_norm": 42.34375, + "learning_rate": 9.92660050729842e-06, + "loss": 16.9905, + "step": 25980 + }, + { + "epoch": 0.469938364663311, + "grad_norm": 39.40625, + "learning_rate": 9.92657225499176e-06, + "loss": 16.793, + "step": 25990 + }, + { + "epoch": 0.4701191797324389, + "grad_norm": 41.15625, + "learning_rate": 9.9265440026851e-06, + "loss": 17.3917, + "step": 26000 + }, + { + "epoch": 0.47029999480156676, + "grad_norm": 42.5625, + "learning_rate": 9.926515750378441e-06, + "loss": 17.1517, + "step": 26010 + }, + { + "epoch": 0.4704808098706946, + "grad_norm": 39.21875, + "learning_rate": 9.92648749807178e-06, + "loss": 16.935, + "step": 26020 + }, + { + "epoch": 0.4706616249398225, + "grad_norm": 41.90625, + "learning_rate": 9.92645924576512e-06, + "loss": 16.5918, + "step": 26030 + }, + { + "epoch": 0.47084244000895037, + "grad_norm": 40.25, + "learning_rate": 9.926430993458461e-06, + "loss": 17.144, + "step": 26040 + }, + { + "epoch": 0.4710232550780782, + "grad_norm": 39.34375, + "learning_rate": 9.926402741151802e-06, + "loss": 16.9929, + "step": 26050 + }, + { + "epoch": 0.47120407014720606, + "grad_norm": 41.75, + "learning_rate": 9.926374488845143e-06, + "loss": 16.8202, + "step": 26060 + }, + { + "epoch": 0.4713848852163339, + "grad_norm": 41.8125, + "learning_rate": 9.926346236538483e-06, + "loss": 16.9627, + "step": 26070 + }, + { + "epoch": 0.4715657002854618, + "grad_norm": 39.46875, + "learning_rate": 9.926317984231824e-06, + "loss": 16.9011, + "step": 26080 + }, + { + "epoch": 0.47174651535458967, + "grad_norm": 41.4375, + "learning_rate": 9.926289731925164e-06, + "loss": 17.0805, + "step": 26090 + }, + { + "epoch": 0.47192733042371754, + "grad_norm": 43.375, + "learning_rate": 9.926261479618505e-06, + "loss": 17.2723, + "step": 26100 + }, + { + "epoch": 0.47210814549284535, + "grad_norm": 41.875, + "learning_rate": 9.926233227311844e-06, + "loss": 17.1467, + "step": 26110 + }, + { + "epoch": 0.4722889605619732, + "grad_norm": 43.25, + "learning_rate": 9.926204975005185e-06, + "loss": 16.8147, + "step": 26120 + }, + { + "epoch": 0.4724697756311011, + "grad_norm": 42.03125, + "learning_rate": 9.926176722698525e-06, + "loss": 16.6152, + "step": 26130 + }, + { + "epoch": 0.47265059070022897, + "grad_norm": 39.03125, + "learning_rate": 9.926148470391866e-06, + "loss": 16.9996, + "step": 26140 + }, + { + "epoch": 0.47283140576935684, + "grad_norm": 41.875, + "learning_rate": 9.926120218085206e-06, + "loss": 16.8885, + "step": 26150 + }, + { + "epoch": 0.47301222083848465, + "grad_norm": 40.9375, + "learning_rate": 9.926091965778547e-06, + "loss": 16.686, + "step": 26160 + }, + { + "epoch": 0.4731930359076125, + "grad_norm": 40.34375, + "learning_rate": 9.926063713471888e-06, + "loss": 17.0671, + "step": 26170 + }, + { + "epoch": 0.4733738509767404, + "grad_norm": 43.71875, + "learning_rate": 9.926035461165228e-06, + "loss": 16.7281, + "step": 26180 + }, + { + "epoch": 0.47355466604586827, + "grad_norm": 41.8125, + "learning_rate": 9.926007208858567e-06, + "loss": 17.1318, + "step": 26190 + }, + { + "epoch": 0.47373548111499614, + "grad_norm": 40.375, + "learning_rate": 9.925978956551908e-06, + "loss": 17.2749, + "step": 26200 + }, + { + "epoch": 0.473916296184124, + "grad_norm": 44.0, + "learning_rate": 9.925950704245248e-06, + "loss": 17.1022, + "step": 26210 + }, + { + "epoch": 0.4740971112532518, + "grad_norm": 43.9375, + "learning_rate": 9.925922451938589e-06, + "loss": 17.0693, + "step": 26220 + }, + { + "epoch": 0.4742779263223797, + "grad_norm": 41.3125, + "learning_rate": 9.92589419963193e-06, + "loss": 16.8296, + "step": 26230 + }, + { + "epoch": 0.47445874139150757, + "grad_norm": 42.03125, + "learning_rate": 9.92586594732527e-06, + "loss": 17.0046, + "step": 26240 + }, + { + "epoch": 0.47463955646063544, + "grad_norm": 41.34375, + "learning_rate": 9.925837695018611e-06, + "loss": 17.1826, + "step": 26250 + }, + { + "epoch": 0.4748203715297633, + "grad_norm": 41.59375, + "learning_rate": 9.925809442711952e-06, + "loss": 17.0491, + "step": 26260 + }, + { + "epoch": 0.4750011865988912, + "grad_norm": 40.875, + "learning_rate": 9.925781190405292e-06, + "loss": 16.8475, + "step": 26270 + }, + { + "epoch": 0.475182001668019, + "grad_norm": 41.78125, + "learning_rate": 9.925752938098631e-06, + "loss": 17.4165, + "step": 26280 + }, + { + "epoch": 0.47536281673714686, + "grad_norm": 40.59375, + "learning_rate": 9.925724685791972e-06, + "loss": 17.1112, + "step": 26290 + }, + { + "epoch": 0.47554363180627474, + "grad_norm": 40.0, + "learning_rate": 9.925696433485312e-06, + "loss": 17.0229, + "step": 26300 + }, + { + "epoch": 0.4757244468754026, + "grad_norm": 38.0625, + "learning_rate": 9.925668181178653e-06, + "loss": 17.1572, + "step": 26310 + }, + { + "epoch": 0.4759052619445305, + "grad_norm": 39.28125, + "learning_rate": 9.925639928871994e-06, + "loss": 17.0653, + "step": 26320 + }, + { + "epoch": 0.4760860770136583, + "grad_norm": 41.5625, + "learning_rate": 9.925611676565334e-06, + "loss": 16.9044, + "step": 26330 + }, + { + "epoch": 0.47626689208278616, + "grad_norm": 42.4375, + "learning_rate": 9.925583424258675e-06, + "loss": 16.9558, + "step": 26340 + }, + { + "epoch": 0.47644770715191403, + "grad_norm": 40.625, + "learning_rate": 9.925555171952015e-06, + "loss": 17.1931, + "step": 26350 + }, + { + "epoch": 0.4766285222210419, + "grad_norm": 39.59375, + "learning_rate": 9.925526919645354e-06, + "loss": 16.6998, + "step": 26360 + }, + { + "epoch": 0.4768093372901698, + "grad_norm": 41.0, + "learning_rate": 9.925498667338695e-06, + "loss": 16.9934, + "step": 26370 + }, + { + "epoch": 0.47699015235929765, + "grad_norm": 43.125, + "learning_rate": 9.925470415032036e-06, + "loss": 17.419, + "step": 26380 + }, + { + "epoch": 0.47717096742842546, + "grad_norm": 41.875, + "learning_rate": 9.925442162725376e-06, + "loss": 17.275, + "step": 26390 + }, + { + "epoch": 0.47735178249755333, + "grad_norm": 42.15625, + "learning_rate": 9.925413910418717e-06, + "loss": 17.2304, + "step": 26400 + }, + { + "epoch": 0.4775325975666812, + "grad_norm": 39.96875, + "learning_rate": 9.925385658112058e-06, + "loss": 16.9184, + "step": 26410 + }, + { + "epoch": 0.4777134126358091, + "grad_norm": 41.96875, + "learning_rate": 9.925357405805398e-06, + "loss": 16.8243, + "step": 26420 + }, + { + "epoch": 0.47789422770493695, + "grad_norm": 42.375, + "learning_rate": 9.925329153498739e-06, + "loss": 17.2043, + "step": 26430 + }, + { + "epoch": 0.4780750427740648, + "grad_norm": 42.0, + "learning_rate": 9.92530090119208e-06, + "loss": 16.9562, + "step": 26440 + }, + { + "epoch": 0.47825585784319263, + "grad_norm": 43.40625, + "learning_rate": 9.925272648885418e-06, + "loss": 16.9889, + "step": 26450 + }, + { + "epoch": 0.4784366729123205, + "grad_norm": 38.78125, + "learning_rate": 9.925244396578759e-06, + "loss": 17.1232, + "step": 26460 + }, + { + "epoch": 0.4786174879814484, + "grad_norm": 41.96875, + "learning_rate": 9.9252161442721e-06, + "loss": 17.097, + "step": 26470 + }, + { + "epoch": 0.47879830305057625, + "grad_norm": 39.59375, + "learning_rate": 9.92518789196544e-06, + "loss": 16.9876, + "step": 26480 + }, + { + "epoch": 0.4789791181197041, + "grad_norm": 42.34375, + "learning_rate": 9.92515963965878e-06, + "loss": 17.3516, + "step": 26490 + }, + { + "epoch": 0.47915993318883193, + "grad_norm": 40.3125, + "learning_rate": 9.925131387352121e-06, + "loss": 17.1503, + "step": 26500 + }, + { + "epoch": 0.4793407482579598, + "grad_norm": 42.375, + "learning_rate": 9.925103135045462e-06, + "loss": 17.4379, + "step": 26510 + }, + { + "epoch": 0.4795215633270877, + "grad_norm": 40.9375, + "learning_rate": 9.925074882738803e-06, + "loss": 16.7705, + "step": 26520 + }, + { + "epoch": 0.47970237839621555, + "grad_norm": 40.3125, + "learning_rate": 9.925046630432142e-06, + "loss": 16.8241, + "step": 26530 + }, + { + "epoch": 0.4798831934653434, + "grad_norm": 45.15625, + "learning_rate": 9.925018378125482e-06, + "loss": 17.3142, + "step": 26540 + }, + { + "epoch": 0.4800640085344713, + "grad_norm": 42.59375, + "learning_rate": 9.924990125818823e-06, + "loss": 16.7404, + "step": 26550 + }, + { + "epoch": 0.4802448236035991, + "grad_norm": 44.78125, + "learning_rate": 9.924961873512163e-06, + "loss": 17.6105, + "step": 26560 + }, + { + "epoch": 0.480425638672727, + "grad_norm": 40.71875, + "learning_rate": 9.924933621205504e-06, + "loss": 16.9162, + "step": 26570 + }, + { + "epoch": 0.48060645374185484, + "grad_norm": 42.03125, + "learning_rate": 9.924905368898845e-06, + "loss": 16.7582, + "step": 26580 + }, + { + "epoch": 0.4807872688109827, + "grad_norm": 42.40625, + "learning_rate": 9.924877116592185e-06, + "loss": 17.1185, + "step": 26590 + }, + { + "epoch": 0.4809680838801106, + "grad_norm": 43.03125, + "learning_rate": 9.924848864285526e-06, + "loss": 17.1527, + "step": 26600 + }, + { + "epoch": 0.48114889894923846, + "grad_norm": 43.78125, + "learning_rate": 9.924820611978867e-06, + "loss": 17.2085, + "step": 26610 + }, + { + "epoch": 0.4813297140183663, + "grad_norm": 42.09375, + "learning_rate": 9.924792359672206e-06, + "loss": 17.3011, + "step": 26620 + }, + { + "epoch": 0.48151052908749414, + "grad_norm": 42.9375, + "learning_rate": 9.924764107365546e-06, + "loss": 17.2475, + "step": 26630 + }, + { + "epoch": 0.481691344156622, + "grad_norm": 41.125, + "learning_rate": 9.924735855058887e-06, + "loss": 17.0385, + "step": 26640 + }, + { + "epoch": 0.4818721592257499, + "grad_norm": 40.40625, + "learning_rate": 9.924707602752227e-06, + "loss": 16.8667, + "step": 26650 + }, + { + "epoch": 0.48205297429487776, + "grad_norm": 42.09375, + "learning_rate": 9.924679350445568e-06, + "loss": 17.118, + "step": 26660 + }, + { + "epoch": 0.48223378936400557, + "grad_norm": 39.8125, + "learning_rate": 9.924651098138909e-06, + "loss": 17.058, + "step": 26670 + }, + { + "epoch": 0.48241460443313344, + "grad_norm": 41.34375, + "learning_rate": 9.92462284583225e-06, + "loss": 17.0949, + "step": 26680 + }, + { + "epoch": 0.4825954195022613, + "grad_norm": 41.5, + "learning_rate": 9.92459459352559e-06, + "loss": 17.0175, + "step": 26690 + }, + { + "epoch": 0.4827762345713892, + "grad_norm": 43.53125, + "learning_rate": 9.92456634121893e-06, + "loss": 16.6859, + "step": 26700 + }, + { + "epoch": 0.48295704964051706, + "grad_norm": 40.1875, + "learning_rate": 9.92453808891227e-06, + "loss": 16.7084, + "step": 26710 + }, + { + "epoch": 0.4831378647096449, + "grad_norm": 40.28125, + "learning_rate": 9.92450983660561e-06, + "loss": 16.9311, + "step": 26720 + }, + { + "epoch": 0.48331867977877274, + "grad_norm": 39.1875, + "learning_rate": 9.92448158429895e-06, + "loss": 17.2466, + "step": 26730 + }, + { + "epoch": 0.4834994948479006, + "grad_norm": 38.46875, + "learning_rate": 9.924453331992291e-06, + "loss": 16.9546, + "step": 26740 + }, + { + "epoch": 0.4836803099170285, + "grad_norm": 40.65625, + "learning_rate": 9.924425079685632e-06, + "loss": 17.0234, + "step": 26750 + }, + { + "epoch": 0.48386112498615635, + "grad_norm": 42.78125, + "learning_rate": 9.924396827378973e-06, + "loss": 17.4127, + "step": 26760 + }, + { + "epoch": 0.4840419400552842, + "grad_norm": 42.78125, + "learning_rate": 9.924368575072313e-06, + "loss": 17.1631, + "step": 26770 + }, + { + "epoch": 0.48422275512441204, + "grad_norm": 42.9375, + "learning_rate": 9.924340322765654e-06, + "loss": 16.9457, + "step": 26780 + }, + { + "epoch": 0.4844035701935399, + "grad_norm": 41.4375, + "learning_rate": 9.924312070458993e-06, + "loss": 17.0132, + "step": 26790 + }, + { + "epoch": 0.4845843852626678, + "grad_norm": 43.3125, + "learning_rate": 9.924283818152333e-06, + "loss": 17.3709, + "step": 26800 + }, + { + "epoch": 0.48476520033179565, + "grad_norm": 41.25, + "learning_rate": 9.924255565845674e-06, + "loss": 17.4508, + "step": 26810 + }, + { + "epoch": 0.4849460154009235, + "grad_norm": 42.9375, + "learning_rate": 9.924227313539015e-06, + "loss": 17.0839, + "step": 26820 + }, + { + "epoch": 0.4851268304700514, + "grad_norm": 40.5625, + "learning_rate": 9.924199061232355e-06, + "loss": 17.0203, + "step": 26830 + }, + { + "epoch": 0.4853076455391792, + "grad_norm": 43.46875, + "learning_rate": 9.924170808925696e-06, + "loss": 17.0355, + "step": 26840 + }, + { + "epoch": 0.4854884606083071, + "grad_norm": 42.21875, + "learning_rate": 9.924142556619036e-06, + "loss": 17.1141, + "step": 26850 + }, + { + "epoch": 0.48566927567743495, + "grad_norm": 41.03125, + "learning_rate": 9.924114304312377e-06, + "loss": 17.0769, + "step": 26860 + }, + { + "epoch": 0.4858500907465628, + "grad_norm": 38.59375, + "learning_rate": 9.924086052005718e-06, + "loss": 16.6634, + "step": 26870 + }, + { + "epoch": 0.4860309058156907, + "grad_norm": 44.1875, + "learning_rate": 9.924057799699057e-06, + "loss": 16.7654, + "step": 26880 + }, + { + "epoch": 0.48621172088481857, + "grad_norm": 42.90625, + "learning_rate": 9.924029547392397e-06, + "loss": 17.1133, + "step": 26890 + }, + { + "epoch": 0.4863925359539464, + "grad_norm": 44.125, + "learning_rate": 9.924001295085738e-06, + "loss": 17.0373, + "step": 26900 + }, + { + "epoch": 0.48657335102307425, + "grad_norm": 40.78125, + "learning_rate": 9.923973042779078e-06, + "loss": 17.2041, + "step": 26910 + }, + { + "epoch": 0.4867541660922021, + "grad_norm": 44.1875, + "learning_rate": 9.923944790472419e-06, + "loss": 16.9118, + "step": 26920 + }, + { + "epoch": 0.48693498116133, + "grad_norm": 43.125, + "learning_rate": 9.92391653816576e-06, + "loss": 17.4285, + "step": 26930 + }, + { + "epoch": 0.48711579623045786, + "grad_norm": 41.6875, + "learning_rate": 9.9238882858591e-06, + "loss": 17.1475, + "step": 26940 + }, + { + "epoch": 0.4872966112995857, + "grad_norm": 40.625, + "learning_rate": 9.923860033552441e-06, + "loss": 17.7247, + "step": 26950 + }, + { + "epoch": 0.48747742636871355, + "grad_norm": 42.71875, + "learning_rate": 9.92383178124578e-06, + "loss": 17.3309, + "step": 26960 + }, + { + "epoch": 0.4876582414378414, + "grad_norm": 46.21875, + "learning_rate": 9.92380352893912e-06, + "loss": 17.1113, + "step": 26970 + }, + { + "epoch": 0.4878390565069693, + "grad_norm": 41.15625, + "learning_rate": 9.923775276632461e-06, + "loss": 17.2221, + "step": 26980 + }, + { + "epoch": 0.48801987157609716, + "grad_norm": 42.15625, + "learning_rate": 9.923747024325802e-06, + "loss": 17.3459, + "step": 26990 + }, + { + "epoch": 0.48820068664522503, + "grad_norm": 42.40625, + "learning_rate": 9.923718772019142e-06, + "loss": 17.1826, + "step": 27000 + }, + { + "epoch": 0.48838150171435285, + "grad_norm": 42.6875, + "learning_rate": 9.923690519712483e-06, + "loss": 16.9989, + "step": 27010 + }, + { + "epoch": 0.4885623167834807, + "grad_norm": 44.6875, + "learning_rate": 9.923662267405824e-06, + "loss": 17.1884, + "step": 27020 + }, + { + "epoch": 0.4887431318526086, + "grad_norm": 41.21875, + "learning_rate": 9.923634015099164e-06, + "loss": 17.0027, + "step": 27030 + }, + { + "epoch": 0.48892394692173646, + "grad_norm": 43.625, + "learning_rate": 9.923605762792505e-06, + "loss": 16.6061, + "step": 27040 + }, + { + "epoch": 0.48910476199086433, + "grad_norm": 42.875, + "learning_rate": 9.923577510485844e-06, + "loss": 17.0401, + "step": 27050 + }, + { + "epoch": 0.4892855770599922, + "grad_norm": 41.28125, + "learning_rate": 9.923549258179184e-06, + "loss": 17.1379, + "step": 27060 + }, + { + "epoch": 0.48946639212912, + "grad_norm": 40.84375, + "learning_rate": 9.923521005872525e-06, + "loss": 16.6934, + "step": 27070 + }, + { + "epoch": 0.4896472071982479, + "grad_norm": 39.03125, + "learning_rate": 9.923492753565866e-06, + "loss": 16.9134, + "step": 27080 + }, + { + "epoch": 0.48982802226737576, + "grad_norm": 40.59375, + "learning_rate": 9.923464501259206e-06, + "loss": 16.6742, + "step": 27090 + }, + { + "epoch": 0.49000883733650363, + "grad_norm": 40.0625, + "learning_rate": 9.923436248952547e-06, + "loss": 17.0472, + "step": 27100 + }, + { + "epoch": 0.4901896524056315, + "grad_norm": 43.65625, + "learning_rate": 9.923407996645888e-06, + "loss": 17.0729, + "step": 27110 + }, + { + "epoch": 0.4903704674747593, + "grad_norm": 40.34375, + "learning_rate": 9.923379744339228e-06, + "loss": 17.1721, + "step": 27120 + }, + { + "epoch": 0.4905512825438872, + "grad_norm": 41.5, + "learning_rate": 9.923351492032569e-06, + "loss": 16.6988, + "step": 27130 + }, + { + "epoch": 0.49073209761301506, + "grad_norm": 38.90625, + "learning_rate": 9.923323239725908e-06, + "loss": 16.7262, + "step": 27140 + }, + { + "epoch": 0.49091291268214293, + "grad_norm": 39.6875, + "learning_rate": 9.923294987419248e-06, + "loss": 17.3932, + "step": 27150 + }, + { + "epoch": 0.4910937277512708, + "grad_norm": 40.34375, + "learning_rate": 9.923266735112589e-06, + "loss": 17.2486, + "step": 27160 + }, + { + "epoch": 0.4912745428203987, + "grad_norm": 41.3125, + "learning_rate": 9.92323848280593e-06, + "loss": 16.8497, + "step": 27170 + }, + { + "epoch": 0.4914553578895265, + "grad_norm": 40.625, + "learning_rate": 9.92321023049927e-06, + "loss": 16.9637, + "step": 27180 + }, + { + "epoch": 0.49163617295865436, + "grad_norm": 40.125, + "learning_rate": 9.92318197819261e-06, + "loss": 16.8488, + "step": 27190 + }, + { + "epoch": 0.49181698802778223, + "grad_norm": 42.0, + "learning_rate": 9.923153725885951e-06, + "loss": 17.0431, + "step": 27200 + }, + { + "epoch": 0.4919978030969101, + "grad_norm": 42.21875, + "learning_rate": 9.923125473579292e-06, + "loss": 16.9398, + "step": 27210 + }, + { + "epoch": 0.492178618166038, + "grad_norm": 42.71875, + "learning_rate": 9.923097221272631e-06, + "loss": 17.0839, + "step": 27220 + }, + { + "epoch": 0.49235943323516584, + "grad_norm": 39.96875, + "learning_rate": 9.923068968965972e-06, + "loss": 17.1533, + "step": 27230 + }, + { + "epoch": 0.49254024830429366, + "grad_norm": 41.5625, + "learning_rate": 9.923040716659312e-06, + "loss": 17.4095, + "step": 27240 + }, + { + "epoch": 0.49272106337342153, + "grad_norm": 44.03125, + "learning_rate": 9.923012464352653e-06, + "loss": 16.9864, + "step": 27250 + }, + { + "epoch": 0.4929018784425494, + "grad_norm": 41.78125, + "learning_rate": 9.922984212045993e-06, + "loss": 16.8673, + "step": 27260 + }, + { + "epoch": 0.4930826935116773, + "grad_norm": 44.34375, + "learning_rate": 9.922955959739332e-06, + "loss": 17.4023, + "step": 27270 + }, + { + "epoch": 0.49326350858080514, + "grad_norm": 42.40625, + "learning_rate": 9.922927707432675e-06, + "loss": 17.256, + "step": 27280 + }, + { + "epoch": 0.49344432364993296, + "grad_norm": 41.9375, + "learning_rate": 9.922899455126015e-06, + "loss": 16.9397, + "step": 27290 + }, + { + "epoch": 0.49362513871906083, + "grad_norm": 41.09375, + "learning_rate": 9.922871202819356e-06, + "loss": 16.7331, + "step": 27300 + }, + { + "epoch": 0.4938059537881887, + "grad_norm": 43.65625, + "learning_rate": 9.922842950512695e-06, + "loss": 16.9543, + "step": 27310 + }, + { + "epoch": 0.49398676885731657, + "grad_norm": 42.15625, + "learning_rate": 9.922814698206036e-06, + "loss": 17.0799, + "step": 27320 + }, + { + "epoch": 0.49416758392644444, + "grad_norm": 40.78125, + "learning_rate": 9.922786445899376e-06, + "loss": 17.436, + "step": 27330 + }, + { + "epoch": 0.4943483989955723, + "grad_norm": 41.75, + "learning_rate": 9.922758193592717e-06, + "loss": 17.0082, + "step": 27340 + }, + { + "epoch": 0.49452921406470013, + "grad_norm": 45.0, + "learning_rate": 9.922729941286057e-06, + "loss": 16.635, + "step": 27350 + }, + { + "epoch": 0.494710029133828, + "grad_norm": 43.3125, + "learning_rate": 9.922701688979396e-06, + "loss": 16.8168, + "step": 27360 + }, + { + "epoch": 0.49489084420295587, + "grad_norm": 42.03125, + "learning_rate": 9.922673436672739e-06, + "loss": 17.2225, + "step": 27370 + }, + { + "epoch": 0.49507165927208374, + "grad_norm": 39.46875, + "learning_rate": 9.92264518436608e-06, + "loss": 16.5935, + "step": 27380 + }, + { + "epoch": 0.4952524743412116, + "grad_norm": 41.5625, + "learning_rate": 9.922616932059418e-06, + "loss": 16.63, + "step": 27390 + }, + { + "epoch": 0.4954332894103395, + "grad_norm": 39.3125, + "learning_rate": 9.922588679752759e-06, + "loss": 16.9653, + "step": 27400 + }, + { + "epoch": 0.4956141044794673, + "grad_norm": 40.1875, + "learning_rate": 9.9225604274461e-06, + "loss": 17.6395, + "step": 27410 + }, + { + "epoch": 0.49579491954859517, + "grad_norm": 40.875, + "learning_rate": 9.92253217513944e-06, + "loss": 17.131, + "step": 27420 + }, + { + "epoch": 0.49597573461772304, + "grad_norm": 43.71875, + "learning_rate": 9.92250392283278e-06, + "loss": 17.4824, + "step": 27430 + }, + { + "epoch": 0.4961565496868509, + "grad_norm": 42.46875, + "learning_rate": 9.922475670526121e-06, + "loss": 17.3884, + "step": 27440 + }, + { + "epoch": 0.4963373647559788, + "grad_norm": 39.84375, + "learning_rate": 9.922447418219462e-06, + "loss": 17.2447, + "step": 27450 + }, + { + "epoch": 0.4965181798251066, + "grad_norm": 41.96875, + "learning_rate": 9.922419165912803e-06, + "loss": 16.9101, + "step": 27460 + }, + { + "epoch": 0.49669899489423447, + "grad_norm": 43.84375, + "learning_rate": 9.922390913606143e-06, + "loss": 17.1153, + "step": 27470 + }, + { + "epoch": 0.49687980996336234, + "grad_norm": 42.46875, + "learning_rate": 9.922362661299482e-06, + "loss": 17.0791, + "step": 27480 + }, + { + "epoch": 0.4970606250324902, + "grad_norm": 39.0, + "learning_rate": 9.922334408992823e-06, + "loss": 17.3504, + "step": 27490 + }, + { + "epoch": 0.4972414401016181, + "grad_norm": 43.40625, + "learning_rate": 9.922306156686163e-06, + "loss": 17.0333, + "step": 27500 + }, + { + "epoch": 0.49742225517074595, + "grad_norm": 42.625, + "learning_rate": 9.922277904379504e-06, + "loss": 16.9336, + "step": 27510 + }, + { + "epoch": 0.49760307023987377, + "grad_norm": 39.875, + "learning_rate": 9.922249652072845e-06, + "loss": 16.8369, + "step": 27520 + }, + { + "epoch": 0.49778388530900164, + "grad_norm": 43.4375, + "learning_rate": 9.922221399766184e-06, + "loss": 17.2567, + "step": 27530 + }, + { + "epoch": 0.4979647003781295, + "grad_norm": 39.71875, + "learning_rate": 9.922193147459526e-06, + "loss": 16.7613, + "step": 27540 + }, + { + "epoch": 0.4981455154472574, + "grad_norm": 40.59375, + "learning_rate": 9.922164895152866e-06, + "loss": 17.1967, + "step": 27550 + }, + { + "epoch": 0.49832633051638525, + "grad_norm": 41.625, + "learning_rate": 9.922136642846207e-06, + "loss": 17.1377, + "step": 27560 + }, + { + "epoch": 0.4985071455855131, + "grad_norm": 41.25, + "learning_rate": 9.922108390539546e-06, + "loss": 16.5317, + "step": 27570 + }, + { + "epoch": 0.49868796065464094, + "grad_norm": 39.75, + "learning_rate": 9.922080138232887e-06, + "loss": 17.3832, + "step": 27580 + }, + { + "epoch": 0.4988687757237688, + "grad_norm": 41.53125, + "learning_rate": 9.922051885926227e-06, + "loss": 16.9755, + "step": 27590 + }, + { + "epoch": 0.4990495907928967, + "grad_norm": 44.28125, + "learning_rate": 9.922023633619568e-06, + "loss": 16.8368, + "step": 27600 + }, + { + "epoch": 0.49923040586202455, + "grad_norm": 42.4375, + "learning_rate": 9.921995381312908e-06, + "loss": 16.3951, + "step": 27610 + }, + { + "epoch": 0.4994112209311524, + "grad_norm": 43.375, + "learning_rate": 9.921967129006247e-06, + "loss": 17.0727, + "step": 27620 + }, + { + "epoch": 0.49959203600028024, + "grad_norm": 39.375, + "learning_rate": 9.92193887669959e-06, + "loss": 16.7404, + "step": 27630 + }, + { + "epoch": 0.4997728510694081, + "grad_norm": 42.5625, + "learning_rate": 9.92191062439293e-06, + "loss": 16.76, + "step": 27640 + }, + { + "epoch": 0.499953666138536, + "grad_norm": 42.21875, + "learning_rate": 9.92188237208627e-06, + "loss": 17.0706, + "step": 27650 + }, + { + "epoch": 0.5001344812076638, + "grad_norm": 41.40625, + "learning_rate": 9.92185411977961e-06, + "loss": 16.7611, + "step": 27660 + }, + { + "epoch": 0.5003152962767917, + "grad_norm": 39.84375, + "learning_rate": 9.92182586747295e-06, + "loss": 17.0832, + "step": 27670 + }, + { + "epoch": 0.5004961113459195, + "grad_norm": 40.1875, + "learning_rate": 9.921797615166291e-06, + "loss": 16.9609, + "step": 27680 + }, + { + "epoch": 0.5006769264150475, + "grad_norm": 40.0, + "learning_rate": 9.921769362859632e-06, + "loss": 17.1072, + "step": 27690 + }, + { + "epoch": 0.5008577414841753, + "grad_norm": 39.40625, + "learning_rate": 9.92174111055297e-06, + "loss": 16.9236, + "step": 27700 + }, + { + "epoch": 0.5010385565533032, + "grad_norm": 40.75, + "learning_rate": 9.921712858246311e-06, + "loss": 16.9665, + "step": 27710 + }, + { + "epoch": 0.501219371622431, + "grad_norm": 42.8125, + "learning_rate": 9.921684605939654e-06, + "loss": 16.732, + "step": 27720 + }, + { + "epoch": 0.5014001866915588, + "grad_norm": 39.90625, + "learning_rate": 9.921656353632994e-06, + "loss": 16.7196, + "step": 27730 + }, + { + "epoch": 0.5015810017606868, + "grad_norm": 42.0625, + "learning_rate": 9.921628101326333e-06, + "loss": 17.0108, + "step": 27740 + }, + { + "epoch": 0.5017618168298146, + "grad_norm": 40.25, + "learning_rate": 9.921599849019674e-06, + "loss": 16.9423, + "step": 27750 + }, + { + "epoch": 0.5019426318989425, + "grad_norm": 39.46875, + "learning_rate": 9.921571596713014e-06, + "loss": 17.2849, + "step": 27760 + }, + { + "epoch": 0.5021234469680703, + "grad_norm": 41.4375, + "learning_rate": 9.921543344406355e-06, + "loss": 17.3861, + "step": 27770 + }, + { + "epoch": 0.5023042620371981, + "grad_norm": 43.09375, + "learning_rate": 9.921515092099696e-06, + "loss": 16.6662, + "step": 27780 + }, + { + "epoch": 0.5024850771063261, + "grad_norm": 41.21875, + "learning_rate": 9.921486839793035e-06, + "loss": 16.7922, + "step": 27790 + }, + { + "epoch": 0.5026658921754539, + "grad_norm": 41.9375, + "learning_rate": 9.921458587486377e-06, + "loss": 16.8959, + "step": 27800 + }, + { + "epoch": 0.5028467072445818, + "grad_norm": 40.15625, + "learning_rate": 9.921430335179718e-06, + "loss": 17.2021, + "step": 27810 + }, + { + "epoch": 0.5030275223137096, + "grad_norm": 40.6875, + "learning_rate": 9.921402082873056e-06, + "loss": 16.8186, + "step": 27820 + }, + { + "epoch": 0.5032083373828374, + "grad_norm": 42.03125, + "learning_rate": 9.921373830566397e-06, + "loss": 16.8509, + "step": 27830 + }, + { + "epoch": 0.5033891524519654, + "grad_norm": 41.09375, + "learning_rate": 9.921345578259738e-06, + "loss": 16.794, + "step": 27840 + }, + { + "epoch": 0.5035699675210932, + "grad_norm": 40.5, + "learning_rate": 9.921317325953078e-06, + "loss": 17.2108, + "step": 27850 + }, + { + "epoch": 0.5037507825902211, + "grad_norm": 41.84375, + "learning_rate": 9.921289073646419e-06, + "loss": 16.9348, + "step": 27860 + }, + { + "epoch": 0.5039315976593489, + "grad_norm": 41.875, + "learning_rate": 9.92126082133976e-06, + "loss": 17.0763, + "step": 27870 + }, + { + "epoch": 0.5041124127284768, + "grad_norm": 42.28125, + "learning_rate": 9.921232569033099e-06, + "loss": 16.7722, + "step": 27880 + }, + { + "epoch": 0.5042932277976047, + "grad_norm": 40.53125, + "learning_rate": 9.92120431672644e-06, + "loss": 17.0283, + "step": 27890 + }, + { + "epoch": 0.5044740428667325, + "grad_norm": 41.6875, + "learning_rate": 9.921176064419781e-06, + "loss": 17.3908, + "step": 27900 + }, + { + "epoch": 0.5046548579358604, + "grad_norm": 44.21875, + "learning_rate": 9.92114781211312e-06, + "loss": 17.0255, + "step": 27910 + }, + { + "epoch": 0.5048356730049882, + "grad_norm": 41.59375, + "learning_rate": 9.921119559806461e-06, + "loss": 16.8385, + "step": 27920 + }, + { + "epoch": 0.5050164880741161, + "grad_norm": 40.25, + "learning_rate": 9.921091307499802e-06, + "loss": 17.0742, + "step": 27930 + }, + { + "epoch": 0.505197303143244, + "grad_norm": 45.1875, + "learning_rate": 9.921063055193142e-06, + "loss": 17.5109, + "step": 27940 + }, + { + "epoch": 0.5053781182123718, + "grad_norm": 44.15625, + "learning_rate": 9.921034802886483e-06, + "loss": 16.946, + "step": 27950 + }, + { + "epoch": 0.5055589332814997, + "grad_norm": 42.1875, + "learning_rate": 9.921006550579822e-06, + "loss": 16.8611, + "step": 27960 + }, + { + "epoch": 0.5057397483506275, + "grad_norm": 42.59375, + "learning_rate": 9.920978298273162e-06, + "loss": 16.8588, + "step": 27970 + }, + { + "epoch": 0.5059205634197554, + "grad_norm": 41.0625, + "learning_rate": 9.920950045966505e-06, + "loss": 17.1637, + "step": 27980 + }, + { + "epoch": 0.5061013784888833, + "grad_norm": 42.53125, + "learning_rate": 9.920921793659845e-06, + "loss": 16.828, + "step": 27990 + }, + { + "epoch": 0.5062821935580111, + "grad_norm": 39.46875, + "learning_rate": 9.920893541353184e-06, + "loss": 17.0082, + "step": 28000 + }, + { + "epoch": 0.506463008627139, + "grad_norm": 43.0625, + "learning_rate": 9.920865289046525e-06, + "loss": 17.4609, + "step": 28010 + }, + { + "epoch": 0.5066438236962668, + "grad_norm": 44.125, + "learning_rate": 9.920837036739866e-06, + "loss": 17.0481, + "step": 28020 + }, + { + "epoch": 0.5068246387653947, + "grad_norm": 41.65625, + "learning_rate": 9.920808784433206e-06, + "loss": 17.2568, + "step": 28030 + }, + { + "epoch": 0.5070054538345226, + "grad_norm": 38.78125, + "learning_rate": 9.920780532126547e-06, + "loss": 17.1309, + "step": 28040 + }, + { + "epoch": 0.5071862689036505, + "grad_norm": 40.90625, + "learning_rate": 9.920752279819886e-06, + "loss": 16.5908, + "step": 28050 + }, + { + "epoch": 0.5073670839727783, + "grad_norm": 42.375, + "learning_rate": 9.920724027513226e-06, + "loss": 16.8779, + "step": 28060 + }, + { + "epoch": 0.5075478990419061, + "grad_norm": 41.28125, + "learning_rate": 9.920695775206569e-06, + "loss": 17.1186, + "step": 28070 + }, + { + "epoch": 0.507728714111034, + "grad_norm": 40.0, + "learning_rate": 9.920667522899908e-06, + "loss": 16.7213, + "step": 28080 + }, + { + "epoch": 0.5079095291801619, + "grad_norm": 42.9375, + "learning_rate": 9.920639270593248e-06, + "loss": 17.0304, + "step": 28090 + }, + { + "epoch": 0.5080903442492898, + "grad_norm": 44.5, + "learning_rate": 9.920611018286589e-06, + "loss": 17.2263, + "step": 28100 + }, + { + "epoch": 0.5082711593184176, + "grad_norm": 39.625, + "learning_rate": 9.92058276597993e-06, + "loss": 17.4506, + "step": 28110 + }, + { + "epoch": 0.5084519743875454, + "grad_norm": 46.53125, + "learning_rate": 9.92055451367327e-06, + "loss": 17.0361, + "step": 28120 + }, + { + "epoch": 0.5086327894566733, + "grad_norm": 43.03125, + "learning_rate": 9.920526261366609e-06, + "loss": 17.1712, + "step": 28130 + }, + { + "epoch": 0.5088136045258012, + "grad_norm": 44.75, + "learning_rate": 9.92049800905995e-06, + "loss": 17.1423, + "step": 28140 + }, + { + "epoch": 0.5089944195949291, + "grad_norm": 42.34375, + "learning_rate": 9.92046975675329e-06, + "loss": 16.952, + "step": 28150 + }, + { + "epoch": 0.5091752346640569, + "grad_norm": 43.34375, + "learning_rate": 9.920441504446633e-06, + "loss": 17.1141, + "step": 28160 + }, + { + "epoch": 0.5093560497331847, + "grad_norm": 37.75, + "learning_rate": 9.920413252139971e-06, + "loss": 17.0306, + "step": 28170 + }, + { + "epoch": 0.5095368648023126, + "grad_norm": 42.21875, + "learning_rate": 9.920384999833312e-06, + "loss": 17.3226, + "step": 28180 + }, + { + "epoch": 0.5097176798714405, + "grad_norm": 42.4375, + "learning_rate": 9.920356747526653e-06, + "loss": 17.2329, + "step": 28190 + }, + { + "epoch": 0.5098984949405684, + "grad_norm": 39.40625, + "learning_rate": 9.920328495219993e-06, + "loss": 17.0066, + "step": 28200 + }, + { + "epoch": 0.5100793100096962, + "grad_norm": 41.0625, + "learning_rate": 9.920300242913334e-06, + "loss": 17.5045, + "step": 28210 + }, + { + "epoch": 0.5102601250788241, + "grad_norm": 41.375, + "learning_rate": 9.920271990606673e-06, + "loss": 16.7508, + "step": 28220 + }, + { + "epoch": 0.5104409401479519, + "grad_norm": 41.71875, + "learning_rate": 9.920243738300014e-06, + "loss": 17.258, + "step": 28230 + }, + { + "epoch": 0.5106217552170798, + "grad_norm": 42.625, + "learning_rate": 9.920215485993356e-06, + "loss": 17.1619, + "step": 28240 + }, + { + "epoch": 0.5108025702862077, + "grad_norm": 40.21875, + "learning_rate": 9.920187233686695e-06, + "loss": 16.9134, + "step": 28250 + }, + { + "epoch": 0.5109833853553355, + "grad_norm": 41.75, + "learning_rate": 9.920158981380035e-06, + "loss": 17.2607, + "step": 28260 + }, + { + "epoch": 0.5111642004244634, + "grad_norm": 45.40625, + "learning_rate": 9.920130729073376e-06, + "loss": 16.8008, + "step": 28270 + }, + { + "epoch": 0.5113450154935912, + "grad_norm": 41.15625, + "learning_rate": 9.920102476766717e-06, + "loss": 17.2442, + "step": 28280 + }, + { + "epoch": 0.511525830562719, + "grad_norm": 41.28125, + "learning_rate": 9.920074224460057e-06, + "loss": 16.9537, + "step": 28290 + }, + { + "epoch": 0.511706645631847, + "grad_norm": 41.75, + "learning_rate": 9.920045972153398e-06, + "loss": 16.9558, + "step": 28300 + }, + { + "epoch": 0.5118874607009748, + "grad_norm": 41.09375, + "learning_rate": 9.920017719846737e-06, + "loss": 16.932, + "step": 28310 + }, + { + "epoch": 0.5120682757701027, + "grad_norm": 40.90625, + "learning_rate": 9.919989467540077e-06, + "loss": 17.1713, + "step": 28320 + }, + { + "epoch": 0.5122490908392305, + "grad_norm": 41.0625, + "learning_rate": 9.91996121523342e-06, + "loss": 17.1424, + "step": 28330 + }, + { + "epoch": 0.5124299059083584, + "grad_norm": 40.3125, + "learning_rate": 9.919932962926759e-06, + "loss": 16.8539, + "step": 28340 + }, + { + "epoch": 0.5126107209774863, + "grad_norm": 38.71875, + "learning_rate": 9.9199047106201e-06, + "loss": 17.0396, + "step": 28350 + }, + { + "epoch": 0.5127915360466141, + "grad_norm": 42.96875, + "learning_rate": 9.91987645831344e-06, + "loss": 16.9008, + "step": 28360 + }, + { + "epoch": 0.512972351115742, + "grad_norm": 43.1875, + "learning_rate": 9.91984820600678e-06, + "loss": 16.977, + "step": 28370 + }, + { + "epoch": 0.5131531661848698, + "grad_norm": 40.5, + "learning_rate": 9.919819953700121e-06, + "loss": 16.8402, + "step": 28380 + }, + { + "epoch": 0.5133339812539978, + "grad_norm": 41.4375, + "learning_rate": 9.91979170139346e-06, + "loss": 17.1193, + "step": 28390 + }, + { + "epoch": 0.5135147963231256, + "grad_norm": 40.40625, + "learning_rate": 9.9197634490868e-06, + "loss": 17.002, + "step": 28400 + }, + { + "epoch": 0.5136956113922534, + "grad_norm": 43.6875, + "learning_rate": 9.919735196780141e-06, + "loss": 17.2712, + "step": 28410 + }, + { + "epoch": 0.5138764264613813, + "grad_norm": 41.75, + "learning_rate": 9.919706944473484e-06, + "loss": 17.0316, + "step": 28420 + }, + { + "epoch": 0.5140572415305091, + "grad_norm": 40.3125, + "learning_rate": 9.919678692166823e-06, + "loss": 17.0029, + "step": 28430 + }, + { + "epoch": 0.5142380565996371, + "grad_norm": 39.40625, + "learning_rate": 9.919650439860163e-06, + "loss": 16.7383, + "step": 28440 + }, + { + "epoch": 0.5144188716687649, + "grad_norm": 42.25, + "learning_rate": 9.919622187553504e-06, + "loss": 16.9989, + "step": 28450 + }, + { + "epoch": 0.5145996867378927, + "grad_norm": 41.84375, + "learning_rate": 9.919593935246844e-06, + "loss": 17.0851, + "step": 28460 + }, + { + "epoch": 0.5147805018070206, + "grad_norm": 40.75, + "learning_rate": 9.919565682940185e-06, + "loss": 16.9262, + "step": 28470 + }, + { + "epoch": 0.5149613168761484, + "grad_norm": 39.3125, + "learning_rate": 9.919537430633524e-06, + "loss": 16.9286, + "step": 28480 + }, + { + "epoch": 0.5151421319452764, + "grad_norm": 40.75, + "learning_rate": 9.919509178326865e-06, + "loss": 16.9974, + "step": 28490 + }, + { + "epoch": 0.5153229470144042, + "grad_norm": 43.125, + "learning_rate": 9.919480926020205e-06, + "loss": 17.1487, + "step": 28500 + }, + { + "epoch": 0.515503762083532, + "grad_norm": 38.4375, + "learning_rate": 9.919452673713546e-06, + "loss": 16.7718, + "step": 28510 + }, + { + "epoch": 0.5156845771526599, + "grad_norm": 41.71875, + "learning_rate": 9.919424421406886e-06, + "loss": 16.9833, + "step": 28520 + }, + { + "epoch": 0.5158653922217877, + "grad_norm": 42.40625, + "learning_rate": 9.919396169100227e-06, + "loss": 17.0211, + "step": 28530 + }, + { + "epoch": 0.5160462072909157, + "grad_norm": 43.3125, + "learning_rate": 9.919367916793568e-06, + "loss": 17.1867, + "step": 28540 + }, + { + "epoch": 0.5162270223600435, + "grad_norm": 42.90625, + "learning_rate": 9.919339664486908e-06, + "loss": 16.7948, + "step": 28550 + }, + { + "epoch": 0.5164078374291714, + "grad_norm": 41.75, + "learning_rate": 9.919311412180247e-06, + "loss": 17.1321, + "step": 28560 + }, + { + "epoch": 0.5165886524982992, + "grad_norm": 41.4375, + "learning_rate": 9.919283159873588e-06, + "loss": 17.0192, + "step": 28570 + }, + { + "epoch": 0.516769467567427, + "grad_norm": 42.25, + "learning_rate": 9.919254907566929e-06, + "loss": 16.7417, + "step": 28580 + }, + { + "epoch": 0.516950282636555, + "grad_norm": 41.5625, + "learning_rate": 9.91922665526027e-06, + "loss": 17.0249, + "step": 28590 + }, + { + "epoch": 0.5171310977056828, + "grad_norm": 38.75, + "learning_rate": 9.91919840295361e-06, + "loss": 16.8818, + "step": 28600 + }, + { + "epoch": 0.5173119127748107, + "grad_norm": 43.875, + "learning_rate": 9.91917015064695e-06, + "loss": 17.1732, + "step": 28610 + }, + { + "epoch": 0.5174927278439385, + "grad_norm": 41.8125, + "learning_rate": 9.919141898340291e-06, + "loss": 17.4947, + "step": 28620 + }, + { + "epoch": 0.5176735429130663, + "grad_norm": 40.1875, + "learning_rate": 9.919113646033632e-06, + "loss": 17.4457, + "step": 28630 + }, + { + "epoch": 0.5178543579821943, + "grad_norm": 42.3125, + "learning_rate": 9.919085393726972e-06, + "loss": 16.9908, + "step": 28640 + }, + { + "epoch": 0.5180351730513221, + "grad_norm": 40.46875, + "learning_rate": 9.919057141420311e-06, + "loss": 16.6074, + "step": 28650 + }, + { + "epoch": 0.51821598812045, + "grad_norm": 42.625, + "learning_rate": 9.919028889113652e-06, + "loss": 16.7887, + "step": 28660 + }, + { + "epoch": 0.5183968031895778, + "grad_norm": 42.1875, + "learning_rate": 9.919000636806992e-06, + "loss": 17.326, + "step": 28670 + }, + { + "epoch": 0.5185776182587056, + "grad_norm": 40.25, + "learning_rate": 9.918972384500333e-06, + "loss": 16.7892, + "step": 28680 + }, + { + "epoch": 0.5187584333278336, + "grad_norm": 40.25, + "learning_rate": 9.918944132193674e-06, + "loss": 17.2271, + "step": 28690 + }, + { + "epoch": 0.5189392483969614, + "grad_norm": 41.90625, + "learning_rate": 9.918915879887014e-06, + "loss": 17.1251, + "step": 28700 + }, + { + "epoch": 0.5191200634660893, + "grad_norm": 41.625, + "learning_rate": 9.918887627580355e-06, + "loss": 16.8376, + "step": 28710 + }, + { + "epoch": 0.5193008785352171, + "grad_norm": 40.90625, + "learning_rate": 9.918859375273696e-06, + "loss": 17.0451, + "step": 28720 + }, + { + "epoch": 0.519481693604345, + "grad_norm": 41.71875, + "learning_rate": 9.918831122967034e-06, + "loss": 17.2237, + "step": 28730 + }, + { + "epoch": 0.5196625086734729, + "grad_norm": 40.65625, + "learning_rate": 9.918802870660375e-06, + "loss": 17.1952, + "step": 28740 + }, + { + "epoch": 0.5198433237426007, + "grad_norm": 41.53125, + "learning_rate": 9.918774618353716e-06, + "loss": 16.9982, + "step": 28750 + }, + { + "epoch": 0.5200241388117286, + "grad_norm": 40.125, + "learning_rate": 9.918746366047056e-06, + "loss": 17.1002, + "step": 28760 + }, + { + "epoch": 0.5202049538808564, + "grad_norm": 43.25, + "learning_rate": 9.918718113740397e-06, + "loss": 16.8707, + "step": 28770 + }, + { + "epoch": 0.5203857689499843, + "grad_norm": 41.3125, + "learning_rate": 9.918689861433738e-06, + "loss": 17.2035, + "step": 28780 + }, + { + "epoch": 0.5205665840191122, + "grad_norm": 45.375, + "learning_rate": 9.918661609127078e-06, + "loss": 16.9605, + "step": 28790 + }, + { + "epoch": 0.52074739908824, + "grad_norm": 41.53125, + "learning_rate": 9.918633356820419e-06, + "loss": 17.3515, + "step": 28800 + }, + { + "epoch": 0.5209282141573679, + "grad_norm": 43.5625, + "learning_rate": 9.91860510451376e-06, + "loss": 16.8148, + "step": 28810 + }, + { + "epoch": 0.5211090292264957, + "grad_norm": 42.75, + "learning_rate": 9.918576852207098e-06, + "loss": 16.5191, + "step": 28820 + }, + { + "epoch": 0.5212898442956236, + "grad_norm": 42.96875, + "learning_rate": 9.918548599900439e-06, + "loss": 17.1934, + "step": 28830 + }, + { + "epoch": 0.5214706593647515, + "grad_norm": 41.5, + "learning_rate": 9.91852034759378e-06, + "loss": 17.5222, + "step": 28840 + }, + { + "epoch": 0.5216514744338793, + "grad_norm": 43.09375, + "learning_rate": 9.91849209528712e-06, + "loss": 17.0393, + "step": 28850 + }, + { + "epoch": 0.5218322895030072, + "grad_norm": 44.875, + "learning_rate": 9.918463842980461e-06, + "loss": 17.2174, + "step": 28860 + }, + { + "epoch": 0.522013104572135, + "grad_norm": 40.6875, + "learning_rate": 9.918435590673801e-06, + "loss": 17.1666, + "step": 28870 + }, + { + "epoch": 0.5221939196412629, + "grad_norm": 40.65625, + "learning_rate": 9.918407338367142e-06, + "loss": 16.7025, + "step": 28880 + }, + { + "epoch": 0.5223747347103908, + "grad_norm": 40.5, + "learning_rate": 9.918379086060483e-06, + "loss": 16.7877, + "step": 28890 + }, + { + "epoch": 0.5225555497795187, + "grad_norm": 43.71875, + "learning_rate": 9.918350833753823e-06, + "loss": 17.1532, + "step": 28900 + }, + { + "epoch": 0.5227363648486465, + "grad_norm": 40.96875, + "learning_rate": 9.918322581447162e-06, + "loss": 17.0564, + "step": 28910 + }, + { + "epoch": 0.5229171799177743, + "grad_norm": 42.96875, + "learning_rate": 9.918294329140503e-06, + "loss": 16.9364, + "step": 28920 + }, + { + "epoch": 0.5230979949869022, + "grad_norm": 43.40625, + "learning_rate": 9.918266076833844e-06, + "loss": 17.3431, + "step": 28930 + }, + { + "epoch": 0.52327881005603, + "grad_norm": 43.53125, + "learning_rate": 9.918237824527184e-06, + "loss": 16.791, + "step": 28940 + }, + { + "epoch": 0.523459625125158, + "grad_norm": 40.6875, + "learning_rate": 9.918209572220525e-06, + "loss": 17.3021, + "step": 28950 + }, + { + "epoch": 0.5236404401942858, + "grad_norm": 41.40625, + "learning_rate": 9.918181319913865e-06, + "loss": 16.802, + "step": 28960 + }, + { + "epoch": 0.5238212552634136, + "grad_norm": 43.5625, + "learning_rate": 9.918153067607206e-06, + "loss": 17.1349, + "step": 28970 + }, + { + "epoch": 0.5240020703325415, + "grad_norm": 44.0625, + "learning_rate": 9.918124815300547e-06, + "loss": 16.9321, + "step": 28980 + }, + { + "epoch": 0.5241828854016694, + "grad_norm": 41.25, + "learning_rate": 9.918096562993886e-06, + "loss": 17.3189, + "step": 28990 + }, + { + "epoch": 0.5243637004707973, + "grad_norm": 42.96875, + "learning_rate": 9.918068310687226e-06, + "loss": 16.9273, + "step": 29000 + }, + { + "epoch": 0.5245445155399251, + "grad_norm": 40.875, + "learning_rate": 9.918040058380567e-06, + "loss": 17.2656, + "step": 29010 + }, + { + "epoch": 0.5247253306090529, + "grad_norm": 41.90625, + "learning_rate": 9.918011806073907e-06, + "loss": 17.2891, + "step": 29020 + }, + { + "epoch": 0.5249061456781808, + "grad_norm": 43.78125, + "learning_rate": 9.917983553767248e-06, + "loss": 16.7634, + "step": 29030 + }, + { + "epoch": 0.5250869607473087, + "grad_norm": 44.40625, + "learning_rate": 9.917955301460589e-06, + "loss": 16.9445, + "step": 29040 + }, + { + "epoch": 0.5252677758164366, + "grad_norm": 41.09375, + "learning_rate": 9.91792704915393e-06, + "loss": 17.0999, + "step": 29050 + }, + { + "epoch": 0.5254485908855644, + "grad_norm": 42.0625, + "learning_rate": 9.91789879684727e-06, + "loss": 17.1245, + "step": 29060 + }, + { + "epoch": 0.5256294059546923, + "grad_norm": 42.09375, + "learning_rate": 9.91787054454061e-06, + "loss": 16.812, + "step": 29070 + }, + { + "epoch": 0.5258102210238201, + "grad_norm": 38.9375, + "learning_rate": 9.91784229223395e-06, + "loss": 17.0611, + "step": 29080 + }, + { + "epoch": 0.525991036092948, + "grad_norm": 40.53125, + "learning_rate": 9.91781403992729e-06, + "loss": 17.081, + "step": 29090 + }, + { + "epoch": 0.5261718511620759, + "grad_norm": 40.90625, + "learning_rate": 9.91778578762063e-06, + "loss": 16.5785, + "step": 29100 + }, + { + "epoch": 0.5263526662312037, + "grad_norm": 40.1875, + "learning_rate": 9.917757535313971e-06, + "loss": 16.8719, + "step": 29110 + }, + { + "epoch": 0.5265334813003316, + "grad_norm": 42.1875, + "learning_rate": 9.917729283007312e-06, + "loss": 16.9369, + "step": 29120 + }, + { + "epoch": 0.5267142963694594, + "grad_norm": 39.15625, + "learning_rate": 9.917701030700653e-06, + "loss": 17.2821, + "step": 29130 + }, + { + "epoch": 0.5268951114385872, + "grad_norm": 38.15625, + "learning_rate": 9.917672778393993e-06, + "loss": 16.6713, + "step": 29140 + }, + { + "epoch": 0.5270759265077152, + "grad_norm": 41.75, + "learning_rate": 9.917644526087334e-06, + "loss": 16.9598, + "step": 29150 + }, + { + "epoch": 0.527256741576843, + "grad_norm": 38.96875, + "learning_rate": 9.917616273780673e-06, + "loss": 17.0933, + "step": 29160 + }, + { + "epoch": 0.5274375566459709, + "grad_norm": 38.3125, + "learning_rate": 9.917588021474013e-06, + "loss": 17.3192, + "step": 29170 + }, + { + "epoch": 0.5276183717150987, + "grad_norm": 42.65625, + "learning_rate": 9.917559769167354e-06, + "loss": 17.0815, + "step": 29180 + }, + { + "epoch": 0.5277991867842265, + "grad_norm": 42.65625, + "learning_rate": 9.917531516860695e-06, + "loss": 17.0196, + "step": 29190 + }, + { + "epoch": 0.5279800018533545, + "grad_norm": 39.875, + "learning_rate": 9.917503264554035e-06, + "loss": 16.7482, + "step": 29200 + }, + { + "epoch": 0.5281608169224823, + "grad_norm": 43.65625, + "learning_rate": 9.917475012247376e-06, + "loss": 16.6207, + "step": 29210 + }, + { + "epoch": 0.5283416319916102, + "grad_norm": 41.65625, + "learning_rate": 9.917446759940716e-06, + "loss": 17.0133, + "step": 29220 + }, + { + "epoch": 0.528522447060738, + "grad_norm": 43.28125, + "learning_rate": 9.917418507634057e-06, + "loss": 17.2084, + "step": 29230 + }, + { + "epoch": 0.528703262129866, + "grad_norm": 42.84375, + "learning_rate": 9.917390255327398e-06, + "loss": 17.0293, + "step": 29240 + }, + { + "epoch": 0.5288840771989938, + "grad_norm": 41.03125, + "learning_rate": 9.917362003020737e-06, + "loss": 16.935, + "step": 29250 + }, + { + "epoch": 0.5290648922681216, + "grad_norm": 41.9375, + "learning_rate": 9.917333750714077e-06, + "loss": 16.9455, + "step": 29260 + }, + { + "epoch": 0.5292457073372495, + "grad_norm": 41.65625, + "learning_rate": 9.917305498407418e-06, + "loss": 17.1935, + "step": 29270 + }, + { + "epoch": 0.5294265224063773, + "grad_norm": 40.625, + "learning_rate": 9.917277246100759e-06, + "loss": 17.0945, + "step": 29280 + }, + { + "epoch": 0.5296073374755053, + "grad_norm": 40.59375, + "learning_rate": 9.9172489937941e-06, + "loss": 17.1142, + "step": 29290 + }, + { + "epoch": 0.5297881525446331, + "grad_norm": 40.5625, + "learning_rate": 9.91722074148744e-06, + "loss": 17.0181, + "step": 29300 + }, + { + "epoch": 0.5299689676137609, + "grad_norm": 41.40625, + "learning_rate": 9.91719248918078e-06, + "loss": 17.0982, + "step": 29310 + }, + { + "epoch": 0.5301497826828888, + "grad_norm": 42.0625, + "learning_rate": 9.917164236874121e-06, + "loss": 17.1721, + "step": 29320 + }, + { + "epoch": 0.5303305977520166, + "grad_norm": 40.5625, + "learning_rate": 9.917135984567462e-06, + "loss": 17.2262, + "step": 29330 + }, + { + "epoch": 0.5305114128211446, + "grad_norm": 42.40625, + "learning_rate": 9.9171077322608e-06, + "loss": 17.0821, + "step": 29340 + }, + { + "epoch": 0.5306922278902724, + "grad_norm": 41.96875, + "learning_rate": 9.917079479954141e-06, + "loss": 17.1578, + "step": 29350 + }, + { + "epoch": 0.5308730429594002, + "grad_norm": 42.25, + "learning_rate": 9.917051227647482e-06, + "loss": 17.4814, + "step": 29360 + }, + { + "epoch": 0.5310538580285281, + "grad_norm": 41.40625, + "learning_rate": 9.917022975340822e-06, + "loss": 17.022, + "step": 29370 + }, + { + "epoch": 0.5312346730976559, + "grad_norm": 41.5, + "learning_rate": 9.916994723034163e-06, + "loss": 17.2217, + "step": 29380 + }, + { + "epoch": 0.5314154881667839, + "grad_norm": 42.28125, + "learning_rate": 9.916966470727504e-06, + "loss": 17.1552, + "step": 29390 + }, + { + "epoch": 0.5315963032359117, + "grad_norm": 44.09375, + "learning_rate": 9.916938218420844e-06, + "loss": 16.6741, + "step": 29400 + }, + { + "epoch": 0.5317771183050395, + "grad_norm": 44.03125, + "learning_rate": 9.916909966114185e-06, + "loss": 17.2433, + "step": 29410 + }, + { + "epoch": 0.5319579333741674, + "grad_norm": 46.09375, + "learning_rate": 9.916881713807524e-06, + "loss": 16.9512, + "step": 29420 + }, + { + "epoch": 0.5321387484432952, + "grad_norm": 45.71875, + "learning_rate": 9.916853461500864e-06, + "loss": 17.1226, + "step": 29430 + }, + { + "epoch": 0.5323195635124232, + "grad_norm": 41.3125, + "learning_rate": 9.916825209194205e-06, + "loss": 16.9795, + "step": 29440 + }, + { + "epoch": 0.532500378581551, + "grad_norm": 41.625, + "learning_rate": 9.916796956887546e-06, + "loss": 16.7698, + "step": 29450 + }, + { + "epoch": 0.5326811936506789, + "grad_norm": 41.84375, + "learning_rate": 9.916768704580886e-06, + "loss": 17.3518, + "step": 29460 + }, + { + "epoch": 0.5328620087198067, + "grad_norm": 41.84375, + "learning_rate": 9.916740452274227e-06, + "loss": 16.6971, + "step": 29470 + }, + { + "epoch": 0.5330428237889345, + "grad_norm": 42.34375, + "learning_rate": 9.916712199967568e-06, + "loss": 16.7716, + "step": 29480 + }, + { + "epoch": 0.5332236388580625, + "grad_norm": 43.21875, + "learning_rate": 9.916683947660908e-06, + "loss": 17.3883, + "step": 29490 + }, + { + "epoch": 0.5334044539271903, + "grad_norm": 41.03125, + "learning_rate": 9.916655695354249e-06, + "loss": 17.0945, + "step": 29500 + }, + { + "epoch": 0.5335852689963182, + "grad_norm": 42.0, + "learning_rate": 9.916627443047588e-06, + "loss": 16.6286, + "step": 29510 + }, + { + "epoch": 0.533766084065446, + "grad_norm": 39.6875, + "learning_rate": 9.916599190740928e-06, + "loss": 16.7957, + "step": 29520 + }, + { + "epoch": 0.5339468991345738, + "grad_norm": 37.21875, + "learning_rate": 9.916570938434269e-06, + "loss": 16.9727, + "step": 29530 + }, + { + "epoch": 0.5341277142037018, + "grad_norm": 40.6875, + "learning_rate": 9.91654268612761e-06, + "loss": 17.2793, + "step": 29540 + }, + { + "epoch": 0.5343085292728296, + "grad_norm": 42.9375, + "learning_rate": 9.91651443382095e-06, + "loss": 16.8023, + "step": 29550 + }, + { + "epoch": 0.5344893443419575, + "grad_norm": 42.09375, + "learning_rate": 9.916486181514291e-06, + "loss": 16.812, + "step": 29560 + }, + { + "epoch": 0.5346701594110853, + "grad_norm": 41.75, + "learning_rate": 9.916457929207632e-06, + "loss": 17.0138, + "step": 29570 + }, + { + "epoch": 0.5348509744802131, + "grad_norm": 40.65625, + "learning_rate": 9.916429676900972e-06, + "loss": 17.0572, + "step": 29580 + }, + { + "epoch": 0.535031789549341, + "grad_norm": 40.90625, + "learning_rate": 9.916401424594311e-06, + "loss": 16.8175, + "step": 29590 + }, + { + "epoch": 0.5352126046184689, + "grad_norm": 41.9375, + "learning_rate": 9.916373172287652e-06, + "loss": 17.1891, + "step": 29600 + }, + { + "epoch": 0.5353934196875968, + "grad_norm": 41.21875, + "learning_rate": 9.916344919980992e-06, + "loss": 17.1858, + "step": 29610 + }, + { + "epoch": 0.5355742347567246, + "grad_norm": 42.875, + "learning_rate": 9.916316667674333e-06, + "loss": 17.156, + "step": 29620 + }, + { + "epoch": 0.5357550498258525, + "grad_norm": 44.15625, + "learning_rate": 9.916288415367674e-06, + "loss": 16.9915, + "step": 29630 + }, + { + "epoch": 0.5359358648949804, + "grad_norm": 41.25, + "learning_rate": 9.916260163061014e-06, + "loss": 17.008, + "step": 29640 + }, + { + "epoch": 0.5361166799641082, + "grad_norm": 41.0625, + "learning_rate": 9.916231910754355e-06, + "loss": 17.0597, + "step": 29650 + }, + { + "epoch": 0.5362974950332361, + "grad_norm": 41.59375, + "learning_rate": 9.916203658447695e-06, + "loss": 17.3012, + "step": 29660 + }, + { + "epoch": 0.5364783101023639, + "grad_norm": 39.8125, + "learning_rate": 9.916175406141036e-06, + "loss": 16.7412, + "step": 29670 + }, + { + "epoch": 0.5366591251714918, + "grad_norm": 41.71875, + "learning_rate": 9.916147153834375e-06, + "loss": 16.666, + "step": 29680 + }, + { + "epoch": 0.5368399402406197, + "grad_norm": 40.6875, + "learning_rate": 9.916118901527716e-06, + "loss": 17.0808, + "step": 29690 + }, + { + "epoch": 0.5370207553097475, + "grad_norm": 41.84375, + "learning_rate": 9.916090649221056e-06, + "loss": 16.8892, + "step": 29700 + }, + { + "epoch": 0.5372015703788754, + "grad_norm": 42.84375, + "learning_rate": 9.916062396914397e-06, + "loss": 16.88, + "step": 29710 + }, + { + "epoch": 0.5373823854480032, + "grad_norm": 44.09375, + "learning_rate": 9.916034144607737e-06, + "loss": 16.8642, + "step": 29720 + }, + { + "epoch": 0.5375632005171311, + "grad_norm": 43.59375, + "learning_rate": 9.916005892301078e-06, + "loss": 17.3012, + "step": 29730 + }, + { + "epoch": 0.537744015586259, + "grad_norm": 40.28125, + "learning_rate": 9.915977639994419e-06, + "loss": 17.0513, + "step": 29740 + }, + { + "epoch": 0.5379248306553868, + "grad_norm": 40.6875, + "learning_rate": 9.91594938768776e-06, + "loss": 17.2723, + "step": 29750 + }, + { + "epoch": 0.5381056457245147, + "grad_norm": 41.625, + "learning_rate": 9.9159211353811e-06, + "loss": 17.2688, + "step": 29760 + }, + { + "epoch": 0.5382864607936425, + "grad_norm": 38.59375, + "learning_rate": 9.915892883074439e-06, + "loss": 17.1386, + "step": 29770 + }, + { + "epoch": 0.5384672758627704, + "grad_norm": 41.5625, + "learning_rate": 9.91586463076778e-06, + "loss": 16.8733, + "step": 29780 + }, + { + "epoch": 0.5386480909318982, + "grad_norm": 40.375, + "learning_rate": 9.91583637846112e-06, + "loss": 17.0199, + "step": 29790 + }, + { + "epoch": 0.5388289060010262, + "grad_norm": 44.59375, + "learning_rate": 9.91580812615446e-06, + "loss": 17.2014, + "step": 29800 + }, + { + "epoch": 0.539009721070154, + "grad_norm": 42.0625, + "learning_rate": 9.915779873847801e-06, + "loss": 16.9838, + "step": 29810 + }, + { + "epoch": 0.5391905361392818, + "grad_norm": 43.15625, + "learning_rate": 9.915751621541142e-06, + "loss": 17.3729, + "step": 29820 + }, + { + "epoch": 0.5393713512084097, + "grad_norm": 41.875, + "learning_rate": 9.915723369234483e-06, + "loss": 17.075, + "step": 29830 + }, + { + "epoch": 0.5395521662775375, + "grad_norm": 38.125, + "learning_rate": 9.915695116927823e-06, + "loss": 17.0875, + "step": 29840 + }, + { + "epoch": 0.5397329813466655, + "grad_norm": 39.53125, + "learning_rate": 9.915666864621162e-06, + "loss": 16.6237, + "step": 29850 + }, + { + "epoch": 0.5399137964157933, + "grad_norm": 41.84375, + "learning_rate": 9.915638612314503e-06, + "loss": 17.0236, + "step": 29860 + }, + { + "epoch": 0.5400946114849211, + "grad_norm": 42.65625, + "learning_rate": 9.915610360007843e-06, + "loss": 17.3734, + "step": 29870 + }, + { + "epoch": 0.540275426554049, + "grad_norm": 42.09375, + "learning_rate": 9.915582107701184e-06, + "loss": 16.5518, + "step": 29880 + }, + { + "epoch": 0.5404562416231768, + "grad_norm": 41.1875, + "learning_rate": 9.915553855394525e-06, + "loss": 16.4954, + "step": 29890 + }, + { + "epoch": 0.5406370566923048, + "grad_norm": 40.5, + "learning_rate": 9.915525603087864e-06, + "loss": 16.9942, + "step": 29900 + }, + { + "epoch": 0.5408178717614326, + "grad_norm": 40.25, + "learning_rate": 9.915497350781206e-06, + "loss": 16.9093, + "step": 29910 + }, + { + "epoch": 0.5409986868305604, + "grad_norm": 40.875, + "learning_rate": 9.915469098474547e-06, + "loss": 17.0349, + "step": 29920 + }, + { + "epoch": 0.5411795018996883, + "grad_norm": 41.625, + "learning_rate": 9.915440846167887e-06, + "loss": 16.9415, + "step": 29930 + }, + { + "epoch": 0.5413603169688161, + "grad_norm": 41.53125, + "learning_rate": 9.915412593861226e-06, + "loss": 16.9677, + "step": 29940 + }, + { + "epoch": 0.5415411320379441, + "grad_norm": 39.9375, + "learning_rate": 9.915384341554567e-06, + "loss": 17.1662, + "step": 29950 + }, + { + "epoch": 0.5417219471070719, + "grad_norm": 46.5, + "learning_rate": 9.915356089247907e-06, + "loss": 17.0126, + "step": 29960 + }, + { + "epoch": 0.5419027621761998, + "grad_norm": 37.96875, + "learning_rate": 9.915327836941248e-06, + "loss": 17.1567, + "step": 29970 + }, + { + "epoch": 0.5420835772453276, + "grad_norm": 42.28125, + "learning_rate": 9.915299584634589e-06, + "loss": 17.0694, + "step": 29980 + }, + { + "epoch": 0.5422643923144554, + "grad_norm": 40.21875, + "learning_rate": 9.91527133232793e-06, + "loss": 16.7261, + "step": 29990 + }, + { + "epoch": 0.5424452073835834, + "grad_norm": 41.6875, + "learning_rate": 9.91524308002127e-06, + "loss": 16.8142, + "step": 30000 + }, + { + "epoch": 0.5424452073835834, + "eval_loss": 2.1271157264709473, + "eval_runtime": 229.7564, + "eval_samples_per_second": 3160.125, + "eval_steps_per_second": 49.378, + "step": 30000 + }, + { + "epoch": 0.5426260224527112, + "grad_norm": 44.59375, + "learning_rate": 9.91521482771461e-06, + "loss": 16.946, + "step": 30010 + }, + { + "epoch": 0.5428068375218391, + "grad_norm": 42.71875, + "learning_rate": 9.91518657540795e-06, + "loss": 17.0405, + "step": 30020 + }, + { + "epoch": 0.5429876525909669, + "grad_norm": 40.875, + "learning_rate": 9.91515832310129e-06, + "loss": 16.5716, + "step": 30030 + }, + { + "epoch": 0.5431684676600947, + "grad_norm": 40.25, + "learning_rate": 9.91513007079463e-06, + "loss": 16.8354, + "step": 30040 + }, + { + "epoch": 0.5433492827292227, + "grad_norm": 41.40625, + "learning_rate": 9.915101818487971e-06, + "loss": 17.1805, + "step": 30050 + }, + { + "epoch": 0.5435300977983505, + "grad_norm": 42.5625, + "learning_rate": 9.915073566181312e-06, + "loss": 17.2318, + "step": 30060 + }, + { + "epoch": 0.5437109128674784, + "grad_norm": 44.8125, + "learning_rate": 9.915045313874652e-06, + "loss": 16.813, + "step": 30070 + }, + { + "epoch": 0.5438917279366062, + "grad_norm": 39.4375, + "learning_rate": 9.915017061567993e-06, + "loss": 16.9205, + "step": 30080 + }, + { + "epoch": 0.544072543005734, + "grad_norm": 42.5625, + "learning_rate": 9.914988809261334e-06, + "loss": 16.9416, + "step": 30090 + }, + { + "epoch": 0.544253358074862, + "grad_norm": 42.75, + "learning_rate": 9.914960556954674e-06, + "loss": 17.3216, + "step": 30100 + }, + { + "epoch": 0.5444341731439898, + "grad_norm": 38.4375, + "learning_rate": 9.914932304648013e-06, + "loss": 17.1828, + "step": 30110 + }, + { + "epoch": 0.5446149882131177, + "grad_norm": 44.34375, + "learning_rate": 9.914904052341354e-06, + "loss": 16.7072, + "step": 30120 + }, + { + "epoch": 0.5447958032822455, + "grad_norm": 39.0625, + "learning_rate": 9.914875800034694e-06, + "loss": 17.0456, + "step": 30130 + }, + { + "epoch": 0.5449766183513735, + "grad_norm": 42.1875, + "learning_rate": 9.914847547728035e-06, + "loss": 16.8821, + "step": 30140 + }, + { + "epoch": 0.5451574334205013, + "grad_norm": 41.5, + "learning_rate": 9.914819295421376e-06, + "loss": 16.8177, + "step": 30150 + }, + { + "epoch": 0.5453382484896291, + "grad_norm": 39.59375, + "learning_rate": 9.914791043114715e-06, + "loss": 16.8663, + "step": 30160 + }, + { + "epoch": 0.545519063558757, + "grad_norm": 42.28125, + "learning_rate": 9.914762790808057e-06, + "loss": 17.1785, + "step": 30170 + }, + { + "epoch": 0.5456998786278848, + "grad_norm": 42.25, + "learning_rate": 9.914734538501398e-06, + "loss": 16.9884, + "step": 30180 + }, + { + "epoch": 0.5458806936970128, + "grad_norm": 42.75, + "learning_rate": 9.914706286194738e-06, + "loss": 16.6989, + "step": 30190 + }, + { + "epoch": 0.5460615087661406, + "grad_norm": 42.65625, + "learning_rate": 9.914678033888077e-06, + "loss": 17.1729, + "step": 30200 + }, + { + "epoch": 0.5462423238352684, + "grad_norm": 40.8125, + "learning_rate": 9.914649781581418e-06, + "loss": 17.0265, + "step": 30210 + }, + { + "epoch": 0.5464231389043963, + "grad_norm": 40.34375, + "learning_rate": 9.914621529274758e-06, + "loss": 17.4852, + "step": 30220 + }, + { + "epoch": 0.5466039539735241, + "grad_norm": 41.625, + "learning_rate": 9.914593276968099e-06, + "loss": 17.1923, + "step": 30230 + }, + { + "epoch": 0.546784769042652, + "grad_norm": 41.375, + "learning_rate": 9.91456502466144e-06, + "loss": 17.2628, + "step": 30240 + }, + { + "epoch": 0.5469655841117799, + "grad_norm": 40.6875, + "learning_rate": 9.914536772354779e-06, + "loss": 17.0136, + "step": 30250 + }, + { + "epoch": 0.5471463991809077, + "grad_norm": 42.6875, + "learning_rate": 9.914508520048121e-06, + "loss": 16.633, + "step": 30260 + }, + { + "epoch": 0.5473272142500356, + "grad_norm": 42.1875, + "learning_rate": 9.914480267741462e-06, + "loss": 16.9213, + "step": 30270 + }, + { + "epoch": 0.5475080293191634, + "grad_norm": 44.15625, + "learning_rate": 9.9144520154348e-06, + "loss": 16.8474, + "step": 30280 + }, + { + "epoch": 0.5476888443882914, + "grad_norm": 43.5625, + "learning_rate": 9.914423763128141e-06, + "loss": 16.8602, + "step": 30290 + }, + { + "epoch": 0.5478696594574192, + "grad_norm": 39.46875, + "learning_rate": 9.914395510821482e-06, + "loss": 17.1963, + "step": 30300 + }, + { + "epoch": 0.5480504745265471, + "grad_norm": 39.59375, + "learning_rate": 9.914367258514822e-06, + "loss": 17.1725, + "step": 30310 + }, + { + "epoch": 0.5482312895956749, + "grad_norm": 40.40625, + "learning_rate": 9.914339006208163e-06, + "loss": 17.3516, + "step": 30320 + }, + { + "epoch": 0.5484121046648027, + "grad_norm": 40.46875, + "learning_rate": 9.914310753901502e-06, + "loss": 17.3431, + "step": 30330 + }, + { + "epoch": 0.5485929197339307, + "grad_norm": 38.40625, + "learning_rate": 9.914282501594844e-06, + "loss": 16.9414, + "step": 30340 + }, + { + "epoch": 0.5487737348030585, + "grad_norm": 41.3125, + "learning_rate": 9.914254249288185e-06, + "loss": 16.793, + "step": 30350 + }, + { + "epoch": 0.5489545498721864, + "grad_norm": 43.75, + "learning_rate": 9.914225996981525e-06, + "loss": 16.7673, + "step": 30360 + }, + { + "epoch": 0.5491353649413142, + "grad_norm": 41.625, + "learning_rate": 9.914197744674864e-06, + "loss": 17.0738, + "step": 30370 + }, + { + "epoch": 0.549316180010442, + "grad_norm": 39.75, + "learning_rate": 9.914169492368205e-06, + "loss": 17.0317, + "step": 30380 + }, + { + "epoch": 0.54949699507957, + "grad_norm": 39.90625, + "learning_rate": 9.914141240061546e-06, + "loss": 17.0928, + "step": 30390 + }, + { + "epoch": 0.5496778101486978, + "grad_norm": 41.1875, + "learning_rate": 9.914112987754886e-06, + "loss": 17.2458, + "step": 30400 + }, + { + "epoch": 0.5498586252178257, + "grad_norm": 40.09375, + "learning_rate": 9.914084735448227e-06, + "loss": 17.1515, + "step": 30410 + }, + { + "epoch": 0.5500394402869535, + "grad_norm": 43.03125, + "learning_rate": 9.914056483141566e-06, + "loss": 17.6208, + "step": 30420 + }, + { + "epoch": 0.5502202553560813, + "grad_norm": 42.0, + "learning_rate": 9.914028230834908e-06, + "loss": 17.0189, + "step": 30430 + }, + { + "epoch": 0.5504010704252092, + "grad_norm": 41.3125, + "learning_rate": 9.913999978528249e-06, + "loss": 16.9412, + "step": 30440 + }, + { + "epoch": 0.5505818854943371, + "grad_norm": 44.5, + "learning_rate": 9.913971726221588e-06, + "loss": 16.8629, + "step": 30450 + }, + { + "epoch": 0.550762700563465, + "grad_norm": 42.3125, + "learning_rate": 9.913943473914928e-06, + "loss": 17.4288, + "step": 30460 + }, + { + "epoch": 0.5509435156325928, + "grad_norm": 39.1875, + "learning_rate": 9.913915221608269e-06, + "loss": 16.7881, + "step": 30470 + }, + { + "epoch": 0.5511243307017207, + "grad_norm": 44.125, + "learning_rate": 9.91388696930161e-06, + "loss": 17.1656, + "step": 30480 + }, + { + "epoch": 0.5513051457708485, + "grad_norm": 40.125, + "learning_rate": 9.91385871699495e-06, + "loss": 16.8119, + "step": 30490 + }, + { + "epoch": 0.5514859608399764, + "grad_norm": 41.65625, + "learning_rate": 9.91383046468829e-06, + "loss": 16.8665, + "step": 30500 + }, + { + "epoch": 0.5516667759091043, + "grad_norm": 40.21875, + "learning_rate": 9.91380221238163e-06, + "loss": 17.2542, + "step": 30510 + }, + { + "epoch": 0.5518475909782321, + "grad_norm": 42.0625, + "learning_rate": 9.913773960074972e-06, + "loss": 16.7592, + "step": 30520 + }, + { + "epoch": 0.55202840604736, + "grad_norm": 42.625, + "learning_rate": 9.913745707768313e-06, + "loss": 16.7518, + "step": 30530 + }, + { + "epoch": 0.5522092211164878, + "grad_norm": 43.78125, + "learning_rate": 9.913717455461652e-06, + "loss": 16.7956, + "step": 30540 + }, + { + "epoch": 0.5523900361856157, + "grad_norm": 38.28125, + "learning_rate": 9.913689203154992e-06, + "loss": 17.1653, + "step": 30550 + }, + { + "epoch": 0.5525708512547436, + "grad_norm": 42.75, + "learning_rate": 9.913660950848333e-06, + "loss": 17.4375, + "step": 30560 + }, + { + "epoch": 0.5527516663238714, + "grad_norm": 41.0, + "learning_rate": 9.913632698541673e-06, + "loss": 16.7577, + "step": 30570 + }, + { + "epoch": 0.5529324813929993, + "grad_norm": 44.0, + "learning_rate": 9.913604446235014e-06, + "loss": 16.8749, + "step": 30580 + }, + { + "epoch": 0.5531132964621271, + "grad_norm": 43.90625, + "learning_rate": 9.913576193928353e-06, + "loss": 17.3017, + "step": 30590 + }, + { + "epoch": 0.553294111531255, + "grad_norm": 45.0625, + "learning_rate": 9.913547941621694e-06, + "loss": 16.8087, + "step": 30600 + }, + { + "epoch": 0.5534749266003829, + "grad_norm": 45.84375, + "learning_rate": 9.913519689315036e-06, + "loss": 16.8531, + "step": 30610 + }, + { + "epoch": 0.5536557416695107, + "grad_norm": 38.78125, + "learning_rate": 9.913491437008377e-06, + "loss": 16.864, + "step": 30620 + }, + { + "epoch": 0.5538365567386386, + "grad_norm": 43.1875, + "learning_rate": 9.913463184701715e-06, + "loss": 16.9866, + "step": 30630 + }, + { + "epoch": 0.5540173718077664, + "grad_norm": 42.25, + "learning_rate": 9.913434932395056e-06, + "loss": 17.0031, + "step": 30640 + }, + { + "epoch": 0.5541981868768944, + "grad_norm": 39.34375, + "learning_rate": 9.913406680088397e-06, + "loss": 16.7895, + "step": 30650 + }, + { + "epoch": 0.5543790019460222, + "grad_norm": 43.5, + "learning_rate": 9.913378427781737e-06, + "loss": 17.0026, + "step": 30660 + }, + { + "epoch": 0.55455981701515, + "grad_norm": 40.25, + "learning_rate": 9.913350175475078e-06, + "loss": 17.2128, + "step": 30670 + }, + { + "epoch": 0.5547406320842779, + "grad_norm": 43.96875, + "learning_rate": 9.913321923168417e-06, + "loss": 17.0078, + "step": 30680 + }, + { + "epoch": 0.5549214471534057, + "grad_norm": 43.84375, + "learning_rate": 9.91329367086176e-06, + "loss": 17.1819, + "step": 30690 + }, + { + "epoch": 0.5551022622225337, + "grad_norm": 43.5625, + "learning_rate": 9.9132654185551e-06, + "loss": 16.6908, + "step": 30700 + }, + { + "epoch": 0.5552830772916615, + "grad_norm": 43.5, + "learning_rate": 9.913237166248439e-06, + "loss": 17.0772, + "step": 30710 + }, + { + "epoch": 0.5554638923607893, + "grad_norm": 41.84375, + "learning_rate": 9.91320891394178e-06, + "loss": 17.0964, + "step": 30720 + }, + { + "epoch": 0.5556447074299172, + "grad_norm": 43.875, + "learning_rate": 9.91318066163512e-06, + "loss": 16.7243, + "step": 30730 + }, + { + "epoch": 0.555825522499045, + "grad_norm": 41.53125, + "learning_rate": 9.91315240932846e-06, + "loss": 17.2681, + "step": 30740 + }, + { + "epoch": 0.556006337568173, + "grad_norm": 42.6875, + "learning_rate": 9.913124157021801e-06, + "loss": 17.031, + "step": 30750 + }, + { + "epoch": 0.5561871526373008, + "grad_norm": 42.3125, + "learning_rate": 9.91309590471514e-06, + "loss": 16.7159, + "step": 30760 + }, + { + "epoch": 0.5563679677064286, + "grad_norm": 43.84375, + "learning_rate": 9.91306765240848e-06, + "loss": 16.9925, + "step": 30770 + }, + { + "epoch": 0.5565487827755565, + "grad_norm": 40.71875, + "learning_rate": 9.913039400101823e-06, + "loss": 16.9813, + "step": 30780 + }, + { + "epoch": 0.5567295978446843, + "grad_norm": 40.40625, + "learning_rate": 9.913011147795164e-06, + "loss": 16.9735, + "step": 30790 + }, + { + "epoch": 0.5569104129138123, + "grad_norm": 42.4375, + "learning_rate": 9.912982895488503e-06, + "loss": 17.1974, + "step": 30800 + }, + { + "epoch": 0.5570912279829401, + "grad_norm": 42.96875, + "learning_rate": 9.912954643181843e-06, + "loss": 17.0436, + "step": 30810 + }, + { + "epoch": 0.557272043052068, + "grad_norm": 43.375, + "learning_rate": 9.912926390875184e-06, + "loss": 17.1123, + "step": 30820 + }, + { + "epoch": 0.5574528581211958, + "grad_norm": 43.21875, + "learning_rate": 9.912898138568525e-06, + "loss": 16.9163, + "step": 30830 + }, + { + "epoch": 0.5576336731903236, + "grad_norm": 39.90625, + "learning_rate": 9.912869886261865e-06, + "loss": 17.1751, + "step": 30840 + }, + { + "epoch": 0.5578144882594516, + "grad_norm": 42.90625, + "learning_rate": 9.912841633955204e-06, + "loss": 16.9831, + "step": 30850 + }, + { + "epoch": 0.5579953033285794, + "grad_norm": 43.9375, + "learning_rate": 9.912813381648545e-06, + "loss": 16.887, + "step": 30860 + }, + { + "epoch": 0.5581761183977073, + "grad_norm": 41.15625, + "learning_rate": 9.912785129341887e-06, + "loss": 16.9941, + "step": 30870 + }, + { + "epoch": 0.5583569334668351, + "grad_norm": 44.25, + "learning_rate": 9.912756877035226e-06, + "loss": 16.9951, + "step": 30880 + }, + { + "epoch": 0.5585377485359629, + "grad_norm": 43.5, + "learning_rate": 9.912728624728567e-06, + "loss": 17.0885, + "step": 30890 + }, + { + "epoch": 0.5587185636050909, + "grad_norm": 38.4375, + "learning_rate": 9.912700372421907e-06, + "loss": 16.9381, + "step": 30900 + }, + { + "epoch": 0.5588993786742187, + "grad_norm": 45.90625, + "learning_rate": 9.912672120115248e-06, + "loss": 16.8079, + "step": 30910 + }, + { + "epoch": 0.5590801937433466, + "grad_norm": 45.1875, + "learning_rate": 9.912643867808588e-06, + "loss": 17.1373, + "step": 30920 + }, + { + "epoch": 0.5592610088124744, + "grad_norm": 41.65625, + "learning_rate": 9.912615615501929e-06, + "loss": 17.3978, + "step": 30930 + }, + { + "epoch": 0.5594418238816022, + "grad_norm": 40.0, + "learning_rate": 9.912587363195268e-06, + "loss": 17.0463, + "step": 30940 + }, + { + "epoch": 0.5596226389507302, + "grad_norm": 40.25, + "learning_rate": 9.912559110888609e-06, + "loss": 17.0721, + "step": 30950 + }, + { + "epoch": 0.559803454019858, + "grad_norm": 43.40625, + "learning_rate": 9.912530858581951e-06, + "loss": 17.0719, + "step": 30960 + }, + { + "epoch": 0.5599842690889859, + "grad_norm": 39.9375, + "learning_rate": 9.91250260627529e-06, + "loss": 17.2505, + "step": 30970 + }, + { + "epoch": 0.5601650841581137, + "grad_norm": 41.71875, + "learning_rate": 9.91247435396863e-06, + "loss": 17.0649, + "step": 30980 + }, + { + "epoch": 0.5603458992272417, + "grad_norm": 42.21875, + "learning_rate": 9.912446101661971e-06, + "loss": 17.0292, + "step": 30990 + }, + { + "epoch": 0.5605267142963695, + "grad_norm": 45.75, + "learning_rate": 9.912417849355312e-06, + "loss": 16.998, + "step": 31000 + }, + { + "epoch": 0.5607075293654973, + "grad_norm": 39.71875, + "learning_rate": 9.912389597048652e-06, + "loss": 16.8837, + "step": 31010 + }, + { + "epoch": 0.5608883444346252, + "grad_norm": 39.78125, + "learning_rate": 9.912361344741991e-06, + "loss": 17.1694, + "step": 31020 + }, + { + "epoch": 0.561069159503753, + "grad_norm": 40.5, + "learning_rate": 9.912333092435332e-06, + "loss": 16.8809, + "step": 31030 + }, + { + "epoch": 0.561249974572881, + "grad_norm": 42.125, + "learning_rate": 9.912304840128674e-06, + "loss": 17.1144, + "step": 31040 + }, + { + "epoch": 0.5614307896420088, + "grad_norm": 40.25, + "learning_rate": 9.912276587822015e-06, + "loss": 17.2917, + "step": 31050 + }, + { + "epoch": 0.5616116047111366, + "grad_norm": 44.6875, + "learning_rate": 9.912248335515354e-06, + "loss": 16.9495, + "step": 31060 + }, + { + "epoch": 0.5617924197802645, + "grad_norm": 43.96875, + "learning_rate": 9.912220083208694e-06, + "loss": 17.3728, + "step": 31070 + }, + { + "epoch": 0.5619732348493923, + "grad_norm": 41.6875, + "learning_rate": 9.912191830902035e-06, + "loss": 16.7284, + "step": 31080 + }, + { + "epoch": 0.5621540499185202, + "grad_norm": 40.65625, + "learning_rate": 9.912163578595376e-06, + "loss": 16.6271, + "step": 31090 + }, + { + "epoch": 0.5623348649876481, + "grad_norm": 40.375, + "learning_rate": 9.912135326288716e-06, + "loss": 16.8732, + "step": 31100 + }, + { + "epoch": 0.5625156800567759, + "grad_norm": 42.21875, + "learning_rate": 9.912107073982055e-06, + "loss": 16.6208, + "step": 31110 + }, + { + "epoch": 0.5626964951259038, + "grad_norm": 40.71875, + "learning_rate": 9.912078821675396e-06, + "loss": 17.2431, + "step": 31120 + }, + { + "epoch": 0.5628773101950316, + "grad_norm": 40.15625, + "learning_rate": 9.912050569368738e-06, + "loss": 16.7951, + "step": 31130 + }, + { + "epoch": 0.5630581252641595, + "grad_norm": 40.21875, + "learning_rate": 9.912022317062077e-06, + "loss": 17.2329, + "step": 31140 + }, + { + "epoch": 0.5632389403332874, + "grad_norm": 39.90625, + "learning_rate": 9.911994064755418e-06, + "loss": 16.9604, + "step": 31150 + }, + { + "epoch": 0.5634197554024153, + "grad_norm": 43.0, + "learning_rate": 9.911965812448758e-06, + "loss": 16.9863, + "step": 31160 + }, + { + "epoch": 0.5636005704715431, + "grad_norm": 41.71875, + "learning_rate": 9.911937560142099e-06, + "loss": 16.8509, + "step": 31170 + }, + { + "epoch": 0.5637813855406709, + "grad_norm": 39.96875, + "learning_rate": 9.91190930783544e-06, + "loss": 16.956, + "step": 31180 + }, + { + "epoch": 0.5639622006097988, + "grad_norm": 41.53125, + "learning_rate": 9.911881055528778e-06, + "loss": 16.8709, + "step": 31190 + }, + { + "epoch": 0.5641430156789267, + "grad_norm": 42.71875, + "learning_rate": 9.911852803222119e-06, + "loss": 16.7424, + "step": 31200 + }, + { + "epoch": 0.5643238307480546, + "grad_norm": 43.0, + "learning_rate": 9.91182455091546e-06, + "loss": 16.9176, + "step": 31210 + }, + { + "epoch": 0.5645046458171824, + "grad_norm": 43.34375, + "learning_rate": 9.911796298608802e-06, + "loss": 17.2313, + "step": 31220 + }, + { + "epoch": 0.5646854608863102, + "grad_norm": 41.28125, + "learning_rate": 9.911768046302141e-06, + "loss": 17.1907, + "step": 31230 + }, + { + "epoch": 0.5648662759554381, + "grad_norm": 39.9375, + "learning_rate": 9.911739793995482e-06, + "loss": 16.7637, + "step": 31240 + }, + { + "epoch": 0.565047091024566, + "grad_norm": 41.0, + "learning_rate": 9.911711541688822e-06, + "loss": 17.0071, + "step": 31250 + }, + { + "epoch": 0.5652279060936939, + "grad_norm": 41.53125, + "learning_rate": 9.911683289382163e-06, + "loss": 17.097, + "step": 31260 + }, + { + "epoch": 0.5654087211628217, + "grad_norm": 45.09375, + "learning_rate": 9.911655037075503e-06, + "loss": 17.3105, + "step": 31270 + }, + { + "epoch": 0.5655895362319495, + "grad_norm": 42.59375, + "learning_rate": 9.911626784768842e-06, + "loss": 17.0906, + "step": 31280 + }, + { + "epoch": 0.5657703513010774, + "grad_norm": 39.96875, + "learning_rate": 9.911598532462183e-06, + "loss": 17.2088, + "step": 31290 + }, + { + "epoch": 0.5659511663702053, + "grad_norm": 38.9375, + "learning_rate": 9.911570280155524e-06, + "loss": 16.8181, + "step": 31300 + }, + { + "epoch": 0.5661319814393332, + "grad_norm": 38.9375, + "learning_rate": 9.911542027848864e-06, + "loss": 17.0384, + "step": 31310 + }, + { + "epoch": 0.566312796508461, + "grad_norm": 42.4375, + "learning_rate": 9.911513775542205e-06, + "loss": 16.741, + "step": 31320 + }, + { + "epoch": 0.5664936115775889, + "grad_norm": 39.3125, + "learning_rate": 9.911485523235545e-06, + "loss": 16.7638, + "step": 31330 + }, + { + "epoch": 0.5666744266467167, + "grad_norm": 43.75, + "learning_rate": 9.911457270928886e-06, + "loss": 16.7134, + "step": 31340 + }, + { + "epoch": 0.5668552417158446, + "grad_norm": 40.8125, + "learning_rate": 9.911429018622227e-06, + "loss": 16.9839, + "step": 31350 + }, + { + "epoch": 0.5670360567849725, + "grad_norm": 42.96875, + "learning_rate": 9.911400766315566e-06, + "loss": 16.8958, + "step": 31360 + }, + { + "epoch": 0.5672168718541003, + "grad_norm": 41.875, + "learning_rate": 9.911372514008906e-06, + "loss": 16.3046, + "step": 31370 + }, + { + "epoch": 0.5673976869232282, + "grad_norm": 42.46875, + "learning_rate": 9.911344261702247e-06, + "loss": 17.342, + "step": 31380 + }, + { + "epoch": 0.567578501992356, + "grad_norm": 41.65625, + "learning_rate": 9.91131600939559e-06, + "loss": 17.1161, + "step": 31390 + }, + { + "epoch": 0.5677593170614839, + "grad_norm": 41.40625, + "learning_rate": 9.911287757088928e-06, + "loss": 16.7149, + "step": 31400 + }, + { + "epoch": 0.5679401321306118, + "grad_norm": 40.59375, + "learning_rate": 9.911259504782269e-06, + "loss": 16.841, + "step": 31410 + }, + { + "epoch": 0.5681209471997396, + "grad_norm": 40.625, + "learning_rate": 9.91123125247561e-06, + "loss": 16.8366, + "step": 31420 + }, + { + "epoch": 0.5683017622688675, + "grad_norm": 40.125, + "learning_rate": 9.91120300016895e-06, + "loss": 17.2759, + "step": 31430 + }, + { + "epoch": 0.5684825773379953, + "grad_norm": 42.09375, + "learning_rate": 9.91117474786229e-06, + "loss": 16.9346, + "step": 31440 + }, + { + "epoch": 0.5686633924071232, + "grad_norm": 40.5, + "learning_rate": 9.91114649555563e-06, + "loss": 16.5734, + "step": 31450 + }, + { + "epoch": 0.5688442074762511, + "grad_norm": 41.96875, + "learning_rate": 9.91111824324897e-06, + "loss": 16.8365, + "step": 31460 + }, + { + "epoch": 0.5690250225453789, + "grad_norm": 43.65625, + "learning_rate": 9.91108999094231e-06, + "loss": 17.1106, + "step": 31470 + }, + { + "epoch": 0.5692058376145068, + "grad_norm": 41.59375, + "learning_rate": 9.911061738635651e-06, + "loss": 16.7902, + "step": 31480 + }, + { + "epoch": 0.5693866526836346, + "grad_norm": 42.59375, + "learning_rate": 9.911033486328992e-06, + "loss": 16.4987, + "step": 31490 + }, + { + "epoch": 0.5695674677527626, + "grad_norm": 40.6875, + "learning_rate": 9.911005234022333e-06, + "loss": 16.7677, + "step": 31500 + }, + { + "epoch": 0.5697482828218904, + "grad_norm": 41.46875, + "learning_rate": 9.910976981715673e-06, + "loss": 16.971, + "step": 31510 + }, + { + "epoch": 0.5699290978910182, + "grad_norm": 40.9375, + "learning_rate": 9.910948729409014e-06, + "loss": 16.8891, + "step": 31520 + }, + { + "epoch": 0.5701099129601461, + "grad_norm": 43.78125, + "learning_rate": 9.910920477102355e-06, + "loss": 16.5515, + "step": 31530 + }, + { + "epoch": 0.5702907280292739, + "grad_norm": 41.34375, + "learning_rate": 9.910892224795693e-06, + "loss": 17.0996, + "step": 31540 + }, + { + "epoch": 0.5704715430984019, + "grad_norm": 41.40625, + "learning_rate": 9.910863972489034e-06, + "loss": 17.3, + "step": 31550 + }, + { + "epoch": 0.5706523581675297, + "grad_norm": 41.875, + "learning_rate": 9.910835720182375e-06, + "loss": 16.7294, + "step": 31560 + }, + { + "epoch": 0.5708331732366575, + "grad_norm": 41.25, + "learning_rate": 9.910807467875715e-06, + "loss": 17.0507, + "step": 31570 + }, + { + "epoch": 0.5710139883057854, + "grad_norm": 44.46875, + "learning_rate": 9.910779215569056e-06, + "loss": 16.7798, + "step": 31580 + }, + { + "epoch": 0.5711948033749132, + "grad_norm": 40.875, + "learning_rate": 9.910750963262397e-06, + "loss": 16.729, + "step": 31590 + }, + { + "epoch": 0.5713756184440412, + "grad_norm": 40.9375, + "learning_rate": 9.910722710955737e-06, + "loss": 16.7486, + "step": 31600 + }, + { + "epoch": 0.571556433513169, + "grad_norm": 45.0, + "learning_rate": 9.910694458649078e-06, + "loss": 17.0809, + "step": 31610 + }, + { + "epoch": 0.5717372485822968, + "grad_norm": 41.09375, + "learning_rate": 9.910666206342417e-06, + "loss": 17.0275, + "step": 31620 + }, + { + "epoch": 0.5719180636514247, + "grad_norm": 40.5, + "learning_rate": 9.910637954035757e-06, + "loss": 16.9974, + "step": 31630 + }, + { + "epoch": 0.5720988787205525, + "grad_norm": 40.84375, + "learning_rate": 9.910609701729098e-06, + "loss": 16.9911, + "step": 31640 + }, + { + "epoch": 0.5722796937896805, + "grad_norm": 43.40625, + "learning_rate": 9.910581449422439e-06, + "loss": 16.7812, + "step": 31650 + }, + { + "epoch": 0.5724605088588083, + "grad_norm": 44.40625, + "learning_rate": 9.91055319711578e-06, + "loss": 16.8301, + "step": 31660 + }, + { + "epoch": 0.5726413239279362, + "grad_norm": 40.03125, + "learning_rate": 9.91052494480912e-06, + "loss": 16.7667, + "step": 31670 + }, + { + "epoch": 0.572822138997064, + "grad_norm": 43.40625, + "learning_rate": 9.91049669250246e-06, + "loss": 16.5191, + "step": 31680 + }, + { + "epoch": 0.5730029540661918, + "grad_norm": 39.8125, + "learning_rate": 9.910468440195801e-06, + "loss": 17.1354, + "step": 31690 + }, + { + "epoch": 0.5731837691353198, + "grad_norm": 41.28125, + "learning_rate": 9.910440187889142e-06, + "loss": 16.8287, + "step": 31700 + }, + { + "epoch": 0.5733645842044476, + "grad_norm": 41.84375, + "learning_rate": 9.91041193558248e-06, + "loss": 17.183, + "step": 31710 + }, + { + "epoch": 0.5735453992735755, + "grad_norm": 42.9375, + "learning_rate": 9.910383683275821e-06, + "loss": 17.053, + "step": 31720 + }, + { + "epoch": 0.5737262143427033, + "grad_norm": 41.96875, + "learning_rate": 9.910355430969162e-06, + "loss": 17.1543, + "step": 31730 + }, + { + "epoch": 0.5739070294118311, + "grad_norm": 42.09375, + "learning_rate": 9.910327178662503e-06, + "loss": 16.8542, + "step": 31740 + }, + { + "epoch": 0.5740878444809591, + "grad_norm": 39.15625, + "learning_rate": 9.910298926355843e-06, + "loss": 16.7303, + "step": 31750 + }, + { + "epoch": 0.5742686595500869, + "grad_norm": 40.15625, + "learning_rate": 9.910270674049184e-06, + "loss": 16.62, + "step": 31760 + }, + { + "epoch": 0.5744494746192148, + "grad_norm": 43.4375, + "learning_rate": 9.910242421742524e-06, + "loss": 17.0608, + "step": 31770 + }, + { + "epoch": 0.5746302896883426, + "grad_norm": 39.90625, + "learning_rate": 9.910214169435865e-06, + "loss": 16.7006, + "step": 31780 + }, + { + "epoch": 0.5748111047574704, + "grad_norm": 41.53125, + "learning_rate": 9.910185917129204e-06, + "loss": 17.0885, + "step": 31790 + }, + { + "epoch": 0.5749919198265984, + "grad_norm": 43.0625, + "learning_rate": 9.910157664822545e-06, + "loss": 16.7022, + "step": 31800 + }, + { + "epoch": 0.5751727348957262, + "grad_norm": 44.1875, + "learning_rate": 9.910129412515885e-06, + "loss": 16.9512, + "step": 31810 + }, + { + "epoch": 0.5753535499648541, + "grad_norm": 42.375, + "learning_rate": 9.910101160209226e-06, + "loss": 16.9256, + "step": 31820 + }, + { + "epoch": 0.5755343650339819, + "grad_norm": 42.34375, + "learning_rate": 9.910072907902566e-06, + "loss": 16.6284, + "step": 31830 + }, + { + "epoch": 0.5757151801031098, + "grad_norm": 41.25, + "learning_rate": 9.910044655595907e-06, + "loss": 16.9051, + "step": 31840 + }, + { + "epoch": 0.5758959951722377, + "grad_norm": 39.9375, + "learning_rate": 9.910016403289248e-06, + "loss": 16.7478, + "step": 31850 + }, + { + "epoch": 0.5760768102413655, + "grad_norm": 39.9375, + "learning_rate": 9.909988150982588e-06, + "loss": 16.8593, + "step": 31860 + }, + { + "epoch": 0.5762576253104934, + "grad_norm": 44.09375, + "learning_rate": 9.909959898675929e-06, + "loss": 16.7016, + "step": 31870 + }, + { + "epoch": 0.5764384403796212, + "grad_norm": 43.1875, + "learning_rate": 9.909931646369268e-06, + "loss": 17.125, + "step": 31880 + }, + { + "epoch": 0.5766192554487491, + "grad_norm": 41.53125, + "learning_rate": 9.909903394062608e-06, + "loss": 16.9871, + "step": 31890 + }, + { + "epoch": 0.576800070517877, + "grad_norm": 40.96875, + "learning_rate": 9.909875141755949e-06, + "loss": 17.0037, + "step": 31900 + }, + { + "epoch": 0.5769808855870048, + "grad_norm": 43.90625, + "learning_rate": 9.90984688944929e-06, + "loss": 17.2558, + "step": 31910 + }, + { + "epoch": 0.5771617006561327, + "grad_norm": 42.0625, + "learning_rate": 9.90981863714263e-06, + "loss": 17.2594, + "step": 31920 + }, + { + "epoch": 0.5773425157252605, + "grad_norm": 43.53125, + "learning_rate": 9.909790384835971e-06, + "loss": 16.8757, + "step": 31930 + }, + { + "epoch": 0.5775233307943884, + "grad_norm": 39.40625, + "learning_rate": 9.909762132529312e-06, + "loss": 16.9364, + "step": 31940 + }, + { + "epoch": 0.5777041458635163, + "grad_norm": 41.875, + "learning_rate": 9.909733880222652e-06, + "loss": 16.6703, + "step": 31950 + }, + { + "epoch": 0.5778849609326441, + "grad_norm": 40.65625, + "learning_rate": 9.909705627915993e-06, + "loss": 16.8876, + "step": 31960 + }, + { + "epoch": 0.578065776001772, + "grad_norm": 44.96875, + "learning_rate": 9.909677375609332e-06, + "loss": 16.8418, + "step": 31970 + }, + { + "epoch": 0.5782465910708998, + "grad_norm": 43.34375, + "learning_rate": 9.909649123302672e-06, + "loss": 16.9372, + "step": 31980 + }, + { + "epoch": 0.5784274061400277, + "grad_norm": 44.1875, + "learning_rate": 9.909620870996013e-06, + "loss": 16.7588, + "step": 31990 + }, + { + "epoch": 0.5786082212091556, + "grad_norm": 42.09375, + "learning_rate": 9.909592618689354e-06, + "loss": 17.0603, + "step": 32000 + }, + { + "epoch": 0.5787890362782835, + "grad_norm": 45.75, + "learning_rate": 9.909564366382694e-06, + "loss": 17.18, + "step": 32010 + }, + { + "epoch": 0.5789698513474113, + "grad_norm": 43.34375, + "learning_rate": 9.909536114076035e-06, + "loss": 16.7754, + "step": 32020 + }, + { + "epoch": 0.5791506664165391, + "grad_norm": 42.4375, + "learning_rate": 9.909507861769375e-06, + "loss": 17.1534, + "step": 32030 + }, + { + "epoch": 0.579331481485667, + "grad_norm": 42.40625, + "learning_rate": 9.909479609462716e-06, + "loss": 17.2115, + "step": 32040 + }, + { + "epoch": 0.5795122965547949, + "grad_norm": 43.1875, + "learning_rate": 9.909451357156055e-06, + "loss": 17.0768, + "step": 32050 + }, + { + "epoch": 0.5796931116239228, + "grad_norm": 43.46875, + "learning_rate": 9.909423104849396e-06, + "loss": 17.2024, + "step": 32060 + }, + { + "epoch": 0.5798739266930506, + "grad_norm": 45.84375, + "learning_rate": 9.909394852542736e-06, + "loss": 16.6366, + "step": 32070 + }, + { + "epoch": 0.5800547417621784, + "grad_norm": 39.8125, + "learning_rate": 9.909366600236077e-06, + "loss": 17.1224, + "step": 32080 + }, + { + "epoch": 0.5802355568313063, + "grad_norm": 40.15625, + "learning_rate": 9.909338347929418e-06, + "loss": 16.9536, + "step": 32090 + }, + { + "epoch": 0.5804163719004342, + "grad_norm": 41.34375, + "learning_rate": 9.909310095622758e-06, + "loss": 16.8179, + "step": 32100 + }, + { + "epoch": 0.5805971869695621, + "grad_norm": 42.28125, + "learning_rate": 9.909281843316099e-06, + "loss": 17.0561, + "step": 32110 + }, + { + "epoch": 0.5807780020386899, + "grad_norm": 41.9375, + "learning_rate": 9.90925359100944e-06, + "loss": 16.8479, + "step": 32120 + }, + { + "epoch": 0.5809588171078177, + "grad_norm": 41.34375, + "learning_rate": 9.90922533870278e-06, + "loss": 17.2642, + "step": 32130 + }, + { + "epoch": 0.5811396321769456, + "grad_norm": 41.5, + "learning_rate": 9.909197086396119e-06, + "loss": 17.1699, + "step": 32140 + }, + { + "epoch": 0.5813204472460735, + "grad_norm": 42.71875, + "learning_rate": 9.90916883408946e-06, + "loss": 16.8568, + "step": 32150 + }, + { + "epoch": 0.5815012623152014, + "grad_norm": 39.0, + "learning_rate": 9.9091405817828e-06, + "loss": 16.9422, + "step": 32160 + }, + { + "epoch": 0.5816820773843292, + "grad_norm": 39.3125, + "learning_rate": 9.90911232947614e-06, + "loss": 16.7037, + "step": 32170 + }, + { + "epoch": 0.5818628924534571, + "grad_norm": 42.125, + "learning_rate": 9.909084077169481e-06, + "loss": 17.1012, + "step": 32180 + }, + { + "epoch": 0.5820437075225849, + "grad_norm": 43.4375, + "learning_rate": 9.909055824862822e-06, + "loss": 16.9235, + "step": 32190 + }, + { + "epoch": 0.5822245225917128, + "grad_norm": 40.90625, + "learning_rate": 9.909027572556163e-06, + "loss": 17.0028, + "step": 32200 + }, + { + "epoch": 0.5824053376608407, + "grad_norm": 42.40625, + "learning_rate": 9.908999320249503e-06, + "loss": 17.3045, + "step": 32210 + }, + { + "epoch": 0.5825861527299685, + "grad_norm": 41.5625, + "learning_rate": 9.908971067942842e-06, + "loss": 16.6335, + "step": 32220 + }, + { + "epoch": 0.5827669677990964, + "grad_norm": 41.6875, + "learning_rate": 9.908942815636183e-06, + "loss": 17.434, + "step": 32230 + }, + { + "epoch": 0.5829477828682242, + "grad_norm": 41.40625, + "learning_rate": 9.908914563329523e-06, + "loss": 16.849, + "step": 32240 + }, + { + "epoch": 0.5831285979373521, + "grad_norm": 41.71875, + "learning_rate": 9.908886311022864e-06, + "loss": 17.0407, + "step": 32250 + }, + { + "epoch": 0.58330941300648, + "grad_norm": 43.0, + "learning_rate": 9.908858058716205e-06, + "loss": 16.7928, + "step": 32260 + }, + { + "epoch": 0.5834902280756078, + "grad_norm": 41.59375, + "learning_rate": 9.908829806409545e-06, + "loss": 16.5376, + "step": 32270 + }, + { + "epoch": 0.5836710431447357, + "grad_norm": 40.84375, + "learning_rate": 9.908801554102886e-06, + "loss": 16.9865, + "step": 32280 + }, + { + "epoch": 0.5838518582138635, + "grad_norm": 45.21875, + "learning_rate": 9.908773301796227e-06, + "loss": 16.6478, + "step": 32290 + }, + { + "epoch": 0.5840326732829914, + "grad_norm": 38.8125, + "learning_rate": 9.908745049489567e-06, + "loss": 17.0553, + "step": 32300 + }, + { + "epoch": 0.5842134883521193, + "grad_norm": 45.0625, + "learning_rate": 9.908716797182906e-06, + "loss": 16.9444, + "step": 32310 + }, + { + "epoch": 0.5843943034212471, + "grad_norm": 45.0, + "learning_rate": 9.908688544876247e-06, + "loss": 17.1495, + "step": 32320 + }, + { + "epoch": 0.584575118490375, + "grad_norm": 40.90625, + "learning_rate": 9.908660292569587e-06, + "loss": 17.0832, + "step": 32330 + }, + { + "epoch": 0.5847559335595028, + "grad_norm": 38.375, + "learning_rate": 9.908632040262928e-06, + "loss": 16.2737, + "step": 32340 + }, + { + "epoch": 0.5849367486286308, + "grad_norm": 44.0, + "learning_rate": 9.908603787956269e-06, + "loss": 16.8861, + "step": 32350 + }, + { + "epoch": 0.5851175636977586, + "grad_norm": 41.28125, + "learning_rate": 9.90857553564961e-06, + "loss": 16.8836, + "step": 32360 + }, + { + "epoch": 0.5852983787668864, + "grad_norm": 42.90625, + "learning_rate": 9.90854728334295e-06, + "loss": 17.1306, + "step": 32370 + }, + { + "epoch": 0.5854791938360143, + "grad_norm": 43.375, + "learning_rate": 9.90851903103629e-06, + "loss": 17.097, + "step": 32380 + }, + { + "epoch": 0.5856600089051421, + "grad_norm": 41.3125, + "learning_rate": 9.908490778729631e-06, + "loss": 16.9224, + "step": 32390 + }, + { + "epoch": 0.5858408239742701, + "grad_norm": 45.375, + "learning_rate": 9.90846252642297e-06, + "loss": 16.8028, + "step": 32400 + }, + { + "epoch": 0.5860216390433979, + "grad_norm": 39.15625, + "learning_rate": 9.90843427411631e-06, + "loss": 16.8687, + "step": 32410 + }, + { + "epoch": 0.5862024541125257, + "grad_norm": 39.375, + "learning_rate": 9.908406021809651e-06, + "loss": 16.8831, + "step": 32420 + }, + { + "epoch": 0.5863832691816536, + "grad_norm": 42.78125, + "learning_rate": 9.908377769502992e-06, + "loss": 16.6693, + "step": 32430 + }, + { + "epoch": 0.5865640842507814, + "grad_norm": 41.125, + "learning_rate": 9.908349517196333e-06, + "loss": 16.9407, + "step": 32440 + }, + { + "epoch": 0.5867448993199094, + "grad_norm": 46.59375, + "learning_rate": 9.908321264889673e-06, + "loss": 17.4105, + "step": 32450 + }, + { + "epoch": 0.5869257143890372, + "grad_norm": 41.90625, + "learning_rate": 9.908293012583014e-06, + "loss": 16.7872, + "step": 32460 + }, + { + "epoch": 0.587106529458165, + "grad_norm": 40.0, + "learning_rate": 9.908264760276354e-06, + "loss": 17.1423, + "step": 32470 + }, + { + "epoch": 0.5872873445272929, + "grad_norm": 43.21875, + "learning_rate": 9.908236507969693e-06, + "loss": 17.2344, + "step": 32480 + }, + { + "epoch": 0.5874681595964207, + "grad_norm": 40.5, + "learning_rate": 9.908208255663034e-06, + "loss": 17.0029, + "step": 32490 + }, + { + "epoch": 0.5876489746655487, + "grad_norm": 46.3125, + "learning_rate": 9.908180003356375e-06, + "loss": 17.0556, + "step": 32500 + }, + { + "epoch": 0.5878297897346765, + "grad_norm": 39.3125, + "learning_rate": 9.908151751049715e-06, + "loss": 16.9346, + "step": 32510 + }, + { + "epoch": 0.5880106048038044, + "grad_norm": 45.0625, + "learning_rate": 9.908123498743056e-06, + "loss": 16.9969, + "step": 32520 + }, + { + "epoch": 0.5881914198729322, + "grad_norm": 45.1875, + "learning_rate": 9.908095246436396e-06, + "loss": 17.1584, + "step": 32530 + }, + { + "epoch": 0.58837223494206, + "grad_norm": 44.4375, + "learning_rate": 9.908066994129737e-06, + "loss": 16.9199, + "step": 32540 + }, + { + "epoch": 0.588553050011188, + "grad_norm": 44.21875, + "learning_rate": 9.908038741823078e-06, + "loss": 16.8951, + "step": 32550 + }, + { + "epoch": 0.5887338650803158, + "grad_norm": 41.53125, + "learning_rate": 9.908010489516418e-06, + "loss": 17.4404, + "step": 32560 + }, + { + "epoch": 0.5889146801494437, + "grad_norm": 45.5625, + "learning_rate": 9.907982237209757e-06, + "loss": 17.1576, + "step": 32570 + }, + { + "epoch": 0.5890954952185715, + "grad_norm": 42.625, + "learning_rate": 9.907953984903098e-06, + "loss": 16.656, + "step": 32580 + }, + { + "epoch": 0.5892763102876993, + "grad_norm": 42.28125, + "learning_rate": 9.907925732596438e-06, + "loss": 16.8173, + "step": 32590 + }, + { + "epoch": 0.5894571253568273, + "grad_norm": 41.375, + "learning_rate": 9.907897480289779e-06, + "loss": 16.9153, + "step": 32600 + }, + { + "epoch": 0.5896379404259551, + "grad_norm": 40.78125, + "learning_rate": 9.90786922798312e-06, + "loss": 17.1193, + "step": 32610 + }, + { + "epoch": 0.589818755495083, + "grad_norm": 43.59375, + "learning_rate": 9.90784097567646e-06, + "loss": 17.2083, + "step": 32620 + }, + { + "epoch": 0.5899995705642108, + "grad_norm": 44.75, + "learning_rate": 9.907812723369801e-06, + "loss": 17.1789, + "step": 32630 + }, + { + "epoch": 0.5901803856333386, + "grad_norm": 40.78125, + "learning_rate": 9.907784471063142e-06, + "loss": 16.5824, + "step": 32640 + }, + { + "epoch": 0.5903612007024666, + "grad_norm": 39.375, + "learning_rate": 9.90775621875648e-06, + "loss": 16.6448, + "step": 32650 + }, + { + "epoch": 0.5905420157715944, + "grad_norm": 44.125, + "learning_rate": 9.907727966449821e-06, + "loss": 17.0394, + "step": 32660 + }, + { + "epoch": 0.5907228308407223, + "grad_norm": 43.6875, + "learning_rate": 9.907699714143162e-06, + "loss": 16.9122, + "step": 32670 + }, + { + "epoch": 0.5909036459098501, + "grad_norm": 44.6875, + "learning_rate": 9.907671461836502e-06, + "loss": 16.9744, + "step": 32680 + }, + { + "epoch": 0.591084460978978, + "grad_norm": 44.15625, + "learning_rate": 9.907643209529843e-06, + "loss": 16.7347, + "step": 32690 + }, + { + "epoch": 0.5912652760481059, + "grad_norm": 44.84375, + "learning_rate": 9.907614957223184e-06, + "loss": 17.1523, + "step": 32700 + }, + { + "epoch": 0.5914460911172337, + "grad_norm": 41.40625, + "learning_rate": 9.907586704916524e-06, + "loss": 16.64, + "step": 32710 + }, + { + "epoch": 0.5916269061863616, + "grad_norm": 41.53125, + "learning_rate": 9.907558452609865e-06, + "loss": 16.8095, + "step": 32720 + }, + { + "epoch": 0.5918077212554894, + "grad_norm": 43.71875, + "learning_rate": 9.907530200303205e-06, + "loss": 17.1435, + "step": 32730 + }, + { + "epoch": 0.5919885363246173, + "grad_norm": 41.21875, + "learning_rate": 9.907501947996544e-06, + "loss": 17.3664, + "step": 32740 + }, + { + "epoch": 0.5921693513937452, + "grad_norm": 42.25, + "learning_rate": 9.907473695689885e-06, + "loss": 17.0529, + "step": 32750 + }, + { + "epoch": 0.592350166462873, + "grad_norm": 42.625, + "learning_rate": 9.907445443383226e-06, + "loss": 16.9959, + "step": 32760 + }, + { + "epoch": 0.5925309815320009, + "grad_norm": 41.71875, + "learning_rate": 9.907417191076566e-06, + "loss": 16.3658, + "step": 32770 + }, + { + "epoch": 0.5927117966011287, + "grad_norm": 42.90625, + "learning_rate": 9.907388938769907e-06, + "loss": 16.7799, + "step": 32780 + }, + { + "epoch": 0.5928926116702566, + "grad_norm": 40.875, + "learning_rate": 9.907360686463246e-06, + "loss": 17.0065, + "step": 32790 + }, + { + "epoch": 0.5930734267393845, + "grad_norm": 42.5625, + "learning_rate": 9.907332434156588e-06, + "loss": 16.8945, + "step": 32800 + }, + { + "epoch": 0.5932542418085123, + "grad_norm": 41.875, + "learning_rate": 9.907304181849929e-06, + "loss": 16.9943, + "step": 32810 + }, + { + "epoch": 0.5934350568776402, + "grad_norm": 38.78125, + "learning_rate": 9.90727592954327e-06, + "loss": 16.6761, + "step": 32820 + }, + { + "epoch": 0.593615871946768, + "grad_norm": 42.96875, + "learning_rate": 9.907247677236608e-06, + "loss": 16.9836, + "step": 32830 + }, + { + "epoch": 0.5937966870158959, + "grad_norm": 42.0, + "learning_rate": 9.907219424929949e-06, + "loss": 17.4699, + "step": 32840 + }, + { + "epoch": 0.5939775020850238, + "grad_norm": 41.28125, + "learning_rate": 9.90719117262329e-06, + "loss": 17.0958, + "step": 32850 + }, + { + "epoch": 0.5941583171541516, + "grad_norm": 41.34375, + "learning_rate": 9.90716292031663e-06, + "loss": 17.0023, + "step": 32860 + }, + { + "epoch": 0.5943391322232795, + "grad_norm": 41.21875, + "learning_rate": 9.90713466800997e-06, + "loss": 16.894, + "step": 32870 + }, + { + "epoch": 0.5945199472924073, + "grad_norm": 41.1875, + "learning_rate": 9.907106415703311e-06, + "loss": 17.3038, + "step": 32880 + }, + { + "epoch": 0.5947007623615352, + "grad_norm": 41.5, + "learning_rate": 9.907078163396652e-06, + "loss": 16.8994, + "step": 32890 + }, + { + "epoch": 0.594881577430663, + "grad_norm": 38.78125, + "learning_rate": 9.907049911089993e-06, + "loss": 17.6324, + "step": 32900 + }, + { + "epoch": 0.595062392499791, + "grad_norm": 40.78125, + "learning_rate": 9.907021658783332e-06, + "loss": 16.7333, + "step": 32910 + }, + { + "epoch": 0.5952432075689188, + "grad_norm": 43.0, + "learning_rate": 9.906993406476672e-06, + "loss": 16.869, + "step": 32920 + }, + { + "epoch": 0.5954240226380466, + "grad_norm": 41.34375, + "learning_rate": 9.906965154170013e-06, + "loss": 16.7851, + "step": 32930 + }, + { + "epoch": 0.5956048377071745, + "grad_norm": 40.8125, + "learning_rate": 9.906936901863353e-06, + "loss": 16.8765, + "step": 32940 + }, + { + "epoch": 0.5957856527763024, + "grad_norm": 38.3125, + "learning_rate": 9.906908649556694e-06, + "loss": 17.119, + "step": 32950 + }, + { + "epoch": 0.5959664678454303, + "grad_norm": 44.8125, + "learning_rate": 9.906880397250033e-06, + "loss": 16.8493, + "step": 32960 + }, + { + "epoch": 0.5961472829145581, + "grad_norm": 42.625, + "learning_rate": 9.906852144943375e-06, + "loss": 17.293, + "step": 32970 + }, + { + "epoch": 0.5963280979836859, + "grad_norm": 41.15625, + "learning_rate": 9.906823892636716e-06, + "loss": 16.4654, + "step": 32980 + }, + { + "epoch": 0.5965089130528138, + "grad_norm": 46.65625, + "learning_rate": 9.906795640330057e-06, + "loss": 16.9846, + "step": 32990 + }, + { + "epoch": 0.5966897281219417, + "grad_norm": 38.6875, + "learning_rate": 9.906767388023396e-06, + "loss": 17.0765, + "step": 33000 + }, + { + "epoch": 0.5968705431910696, + "grad_norm": 45.40625, + "learning_rate": 9.906739135716736e-06, + "loss": 17.217, + "step": 33010 + }, + { + "epoch": 0.5970513582601974, + "grad_norm": 41.4375, + "learning_rate": 9.906710883410077e-06, + "loss": 16.6237, + "step": 33020 + }, + { + "epoch": 0.5972321733293252, + "grad_norm": 44.125, + "learning_rate": 9.906682631103417e-06, + "loss": 16.7351, + "step": 33030 + }, + { + "epoch": 0.5974129883984531, + "grad_norm": 42.6875, + "learning_rate": 9.906654378796758e-06, + "loss": 16.7977, + "step": 33040 + }, + { + "epoch": 0.597593803467581, + "grad_norm": 44.34375, + "learning_rate": 9.906626126490097e-06, + "loss": 17.443, + "step": 33050 + }, + { + "epoch": 0.5977746185367089, + "grad_norm": 42.5625, + "learning_rate": 9.90659787418344e-06, + "loss": 16.9387, + "step": 33060 + }, + { + "epoch": 0.5979554336058367, + "grad_norm": 38.90625, + "learning_rate": 9.90656962187678e-06, + "loss": 16.9259, + "step": 33070 + }, + { + "epoch": 0.5981362486749646, + "grad_norm": 39.4375, + "learning_rate": 9.906541369570119e-06, + "loss": 16.7404, + "step": 33080 + }, + { + "epoch": 0.5983170637440924, + "grad_norm": 42.65625, + "learning_rate": 9.90651311726346e-06, + "loss": 16.6768, + "step": 33090 + }, + { + "epoch": 0.5984978788132203, + "grad_norm": 42.71875, + "learning_rate": 9.9064848649568e-06, + "loss": 16.6058, + "step": 33100 + }, + { + "epoch": 0.5986786938823482, + "grad_norm": 42.65625, + "learning_rate": 9.90645661265014e-06, + "loss": 16.9653, + "step": 33110 + }, + { + "epoch": 0.598859508951476, + "grad_norm": 41.34375, + "learning_rate": 9.906428360343481e-06, + "loss": 16.6989, + "step": 33120 + }, + { + "epoch": 0.5990403240206039, + "grad_norm": 43.03125, + "learning_rate": 9.906400108036822e-06, + "loss": 16.8293, + "step": 33130 + }, + { + "epoch": 0.5992211390897317, + "grad_norm": 42.25, + "learning_rate": 9.90637185573016e-06, + "loss": 16.9864, + "step": 33140 + }, + { + "epoch": 0.5994019541588596, + "grad_norm": 40.78125, + "learning_rate": 9.906343603423503e-06, + "loss": 17.4234, + "step": 33150 + }, + { + "epoch": 0.5995827692279875, + "grad_norm": 40.34375, + "learning_rate": 9.906315351116844e-06, + "loss": 17.622, + "step": 33160 + }, + { + "epoch": 0.5997635842971153, + "grad_norm": 44.0625, + "learning_rate": 9.906287098810183e-06, + "loss": 17.2536, + "step": 33170 + }, + { + "epoch": 0.5999443993662432, + "grad_norm": 43.9375, + "learning_rate": 9.906258846503523e-06, + "loss": 17.4132, + "step": 33180 + }, + { + "epoch": 0.600125214435371, + "grad_norm": 41.90625, + "learning_rate": 9.906230594196864e-06, + "loss": 17.1735, + "step": 33190 + }, + { + "epoch": 0.6003060295044989, + "grad_norm": 40.09375, + "learning_rate": 9.906202341890205e-06, + "loss": 16.8919, + "step": 33200 + }, + { + "epoch": 0.6004868445736268, + "grad_norm": 41.6875, + "learning_rate": 9.906174089583545e-06, + "loss": 16.8507, + "step": 33210 + }, + { + "epoch": 0.6006676596427546, + "grad_norm": 42.21875, + "learning_rate": 9.906145837276884e-06, + "loss": 16.8082, + "step": 33220 + }, + { + "epoch": 0.6008484747118825, + "grad_norm": 42.34375, + "learning_rate": 9.906117584970226e-06, + "loss": 17.3458, + "step": 33230 + }, + { + "epoch": 0.6010292897810103, + "grad_norm": 41.0, + "learning_rate": 9.906089332663567e-06, + "loss": 17.2312, + "step": 33240 + }, + { + "epoch": 0.6012101048501383, + "grad_norm": 39.375, + "learning_rate": 9.906061080356908e-06, + "loss": 16.5374, + "step": 33250 + }, + { + "epoch": 0.6013909199192661, + "grad_norm": 41.375, + "learning_rate": 9.906032828050247e-06, + "loss": 16.3421, + "step": 33260 + }, + { + "epoch": 0.6015717349883939, + "grad_norm": 45.34375, + "learning_rate": 9.906004575743587e-06, + "loss": 16.4727, + "step": 33270 + }, + { + "epoch": 0.6017525500575218, + "grad_norm": 43.03125, + "learning_rate": 9.905976323436928e-06, + "loss": 16.9591, + "step": 33280 + }, + { + "epoch": 0.6019333651266496, + "grad_norm": 41.875, + "learning_rate": 9.905948071130268e-06, + "loss": 17.0089, + "step": 33290 + }, + { + "epoch": 0.6021141801957776, + "grad_norm": 44.4375, + "learning_rate": 9.905919818823609e-06, + "loss": 17.0707, + "step": 33300 + }, + { + "epoch": 0.6022949952649054, + "grad_norm": 42.1875, + "learning_rate": 9.905891566516948e-06, + "loss": 17.0987, + "step": 33310 + }, + { + "epoch": 0.6024758103340332, + "grad_norm": 43.5, + "learning_rate": 9.90586331421029e-06, + "loss": 17.0295, + "step": 33320 + }, + { + "epoch": 0.6026566254031611, + "grad_norm": 44.21875, + "learning_rate": 9.905835061903631e-06, + "loss": 16.9252, + "step": 33330 + }, + { + "epoch": 0.6028374404722889, + "grad_norm": 42.78125, + "learning_rate": 9.90580680959697e-06, + "loss": 16.6911, + "step": 33340 + }, + { + "epoch": 0.6030182555414169, + "grad_norm": 44.1875, + "learning_rate": 9.90577855729031e-06, + "loss": 17.1652, + "step": 33350 + }, + { + "epoch": 0.6031990706105447, + "grad_norm": 42.03125, + "learning_rate": 9.905750304983651e-06, + "loss": 16.8341, + "step": 33360 + }, + { + "epoch": 0.6033798856796725, + "grad_norm": 44.09375, + "learning_rate": 9.905722052676992e-06, + "loss": 16.6755, + "step": 33370 + }, + { + "epoch": 0.6035607007488004, + "grad_norm": 44.25, + "learning_rate": 9.905693800370332e-06, + "loss": 16.725, + "step": 33380 + }, + { + "epoch": 0.6037415158179282, + "grad_norm": 41.96875, + "learning_rate": 9.905665548063671e-06, + "loss": 16.9334, + "step": 33390 + }, + { + "epoch": 0.6039223308870562, + "grad_norm": 42.71875, + "learning_rate": 9.905637295757012e-06, + "loss": 16.5482, + "step": 33400 + }, + { + "epoch": 0.604103145956184, + "grad_norm": 40.34375, + "learning_rate": 9.905609043450354e-06, + "loss": 16.8465, + "step": 33410 + }, + { + "epoch": 0.6042839610253119, + "grad_norm": 41.59375, + "learning_rate": 9.905580791143695e-06, + "loss": 16.6368, + "step": 33420 + }, + { + "epoch": 0.6044647760944397, + "grad_norm": 41.75, + "learning_rate": 9.905552538837034e-06, + "loss": 16.757, + "step": 33430 + }, + { + "epoch": 0.6046455911635675, + "grad_norm": 42.8125, + "learning_rate": 9.905524286530374e-06, + "loss": 17.0386, + "step": 33440 + }, + { + "epoch": 0.6048264062326955, + "grad_norm": 41.84375, + "learning_rate": 9.905496034223715e-06, + "loss": 16.8365, + "step": 33450 + }, + { + "epoch": 0.6050072213018233, + "grad_norm": 40.75, + "learning_rate": 9.905467781917056e-06, + "loss": 17.0499, + "step": 33460 + }, + { + "epoch": 0.6051880363709512, + "grad_norm": 41.4375, + "learning_rate": 9.905439529610396e-06, + "loss": 17.1388, + "step": 33470 + }, + { + "epoch": 0.605368851440079, + "grad_norm": 42.3125, + "learning_rate": 9.905411277303735e-06, + "loss": 17.1348, + "step": 33480 + }, + { + "epoch": 0.6055496665092068, + "grad_norm": 39.34375, + "learning_rate": 9.905383024997076e-06, + "loss": 16.5983, + "step": 33490 + }, + { + "epoch": 0.6057304815783348, + "grad_norm": 40.09375, + "learning_rate": 9.905354772690418e-06, + "loss": 17.5614, + "step": 33500 + }, + { + "epoch": 0.6059112966474626, + "grad_norm": 41.4375, + "learning_rate": 9.905326520383757e-06, + "loss": 16.7848, + "step": 33510 + }, + { + "epoch": 0.6060921117165905, + "grad_norm": 40.0, + "learning_rate": 9.905298268077098e-06, + "loss": 16.9091, + "step": 33520 + }, + { + "epoch": 0.6062729267857183, + "grad_norm": 43.6875, + "learning_rate": 9.905270015770438e-06, + "loss": 16.7644, + "step": 33530 + }, + { + "epoch": 0.6064537418548461, + "grad_norm": 43.25, + "learning_rate": 9.905241763463779e-06, + "loss": 16.7903, + "step": 33540 + }, + { + "epoch": 0.606634556923974, + "grad_norm": 44.875, + "learning_rate": 9.90521351115712e-06, + "loss": 16.7235, + "step": 33550 + }, + { + "epoch": 0.6068153719931019, + "grad_norm": 42.03125, + "learning_rate": 9.905185258850458e-06, + "loss": 16.9318, + "step": 33560 + }, + { + "epoch": 0.6069961870622298, + "grad_norm": 43.1875, + "learning_rate": 9.905157006543799e-06, + "loss": 16.8678, + "step": 33570 + }, + { + "epoch": 0.6071770021313576, + "grad_norm": 40.375, + "learning_rate": 9.905128754237141e-06, + "loss": 16.9585, + "step": 33580 + }, + { + "epoch": 0.6073578172004855, + "grad_norm": 42.28125, + "learning_rate": 9.905100501930482e-06, + "loss": 16.8712, + "step": 33590 + }, + { + "epoch": 0.6075386322696134, + "grad_norm": 42.46875, + "learning_rate": 9.905072249623821e-06, + "loss": 16.741, + "step": 33600 + }, + { + "epoch": 0.6077194473387412, + "grad_norm": 42.40625, + "learning_rate": 9.905043997317162e-06, + "loss": 16.962, + "step": 33610 + }, + { + "epoch": 0.6079002624078691, + "grad_norm": 41.25, + "learning_rate": 9.905015745010502e-06, + "loss": 16.7619, + "step": 33620 + }, + { + "epoch": 0.6080810774769969, + "grad_norm": 42.375, + "learning_rate": 9.904987492703843e-06, + "loss": 16.7792, + "step": 33630 + }, + { + "epoch": 0.6082618925461248, + "grad_norm": 44.28125, + "learning_rate": 9.904959240397183e-06, + "loss": 17.2013, + "step": 33640 + }, + { + "epoch": 0.6084427076152527, + "grad_norm": 42.90625, + "learning_rate": 9.904930988090522e-06, + "loss": 17.1005, + "step": 33650 + }, + { + "epoch": 0.6086235226843805, + "grad_norm": 42.90625, + "learning_rate": 9.904902735783863e-06, + "loss": 17.0336, + "step": 33660 + }, + { + "epoch": 0.6088043377535084, + "grad_norm": 41.75, + "learning_rate": 9.904874483477205e-06, + "loss": 16.9484, + "step": 33670 + }, + { + "epoch": 0.6089851528226362, + "grad_norm": 42.59375, + "learning_rate": 9.904846231170544e-06, + "loss": 16.7472, + "step": 33680 + }, + { + "epoch": 0.6091659678917641, + "grad_norm": 41.34375, + "learning_rate": 9.904817978863885e-06, + "loss": 16.9834, + "step": 33690 + }, + { + "epoch": 0.609346782960892, + "grad_norm": 43.96875, + "learning_rate": 9.904789726557226e-06, + "loss": 16.5895, + "step": 33700 + }, + { + "epoch": 0.6095275980300198, + "grad_norm": 42.34375, + "learning_rate": 9.904761474250566e-06, + "loss": 16.9442, + "step": 33710 + }, + { + "epoch": 0.6097084130991477, + "grad_norm": 43.75, + "learning_rate": 9.904733221943907e-06, + "loss": 17.1539, + "step": 33720 + }, + { + "epoch": 0.6098892281682755, + "grad_norm": 39.625, + "learning_rate": 9.904704969637247e-06, + "loss": 17.1007, + "step": 33730 + }, + { + "epoch": 0.6100700432374034, + "grad_norm": 42.1875, + "learning_rate": 9.904676717330586e-06, + "loss": 16.9644, + "step": 33740 + }, + { + "epoch": 0.6102508583065313, + "grad_norm": 42.09375, + "learning_rate": 9.904648465023927e-06, + "loss": 16.9215, + "step": 33750 + }, + { + "epoch": 0.6104316733756592, + "grad_norm": 43.15625, + "learning_rate": 9.90462021271727e-06, + "loss": 17.2342, + "step": 33760 + }, + { + "epoch": 0.610612488444787, + "grad_norm": 42.3125, + "learning_rate": 9.904591960410608e-06, + "loss": 17.427, + "step": 33770 + }, + { + "epoch": 0.6107933035139148, + "grad_norm": 39.9375, + "learning_rate": 9.904563708103949e-06, + "loss": 17.1716, + "step": 33780 + }, + { + "epoch": 0.6109741185830427, + "grad_norm": 43.0625, + "learning_rate": 9.90453545579729e-06, + "loss": 17.059, + "step": 33790 + }, + { + "epoch": 0.6111549336521706, + "grad_norm": 39.6875, + "learning_rate": 9.90450720349063e-06, + "loss": 17.2268, + "step": 33800 + }, + { + "epoch": 0.6113357487212985, + "grad_norm": 43.15625, + "learning_rate": 9.90447895118397e-06, + "loss": 16.9237, + "step": 33810 + }, + { + "epoch": 0.6115165637904263, + "grad_norm": 41.78125, + "learning_rate": 9.90445069887731e-06, + "loss": 17.0366, + "step": 33820 + }, + { + "epoch": 0.6116973788595541, + "grad_norm": 41.8125, + "learning_rate": 9.90442244657065e-06, + "loss": 16.9747, + "step": 33830 + }, + { + "epoch": 0.611878193928682, + "grad_norm": 43.90625, + "learning_rate": 9.90439419426399e-06, + "loss": 16.7878, + "step": 33840 + }, + { + "epoch": 0.6120590089978099, + "grad_norm": 44.96875, + "learning_rate": 9.904365941957333e-06, + "loss": 16.8532, + "step": 33850 + }, + { + "epoch": 0.6122398240669378, + "grad_norm": 41.6875, + "learning_rate": 9.904337689650672e-06, + "loss": 16.7214, + "step": 33860 + }, + { + "epoch": 0.6124206391360656, + "grad_norm": 43.15625, + "learning_rate": 9.904309437344013e-06, + "loss": 16.9797, + "step": 33870 + }, + { + "epoch": 0.6126014542051934, + "grad_norm": 41.75, + "learning_rate": 9.904281185037353e-06, + "loss": 16.8734, + "step": 33880 + }, + { + "epoch": 0.6127822692743213, + "grad_norm": 42.34375, + "learning_rate": 9.904252932730694e-06, + "loss": 17.0882, + "step": 33890 + }, + { + "epoch": 0.6129630843434491, + "grad_norm": 42.34375, + "learning_rate": 9.904224680424035e-06, + "loss": 16.4828, + "step": 33900 + }, + { + "epoch": 0.6131438994125771, + "grad_norm": 41.59375, + "learning_rate": 9.904196428117374e-06, + "loss": 16.8991, + "step": 33910 + }, + { + "epoch": 0.6133247144817049, + "grad_norm": 42.90625, + "learning_rate": 9.904168175810714e-06, + "loss": 16.8868, + "step": 33920 + }, + { + "epoch": 0.6135055295508328, + "grad_norm": 40.625, + "learning_rate": 9.904139923504056e-06, + "loss": 16.8543, + "step": 33930 + }, + { + "epoch": 0.6136863446199606, + "grad_norm": 42.9375, + "learning_rate": 9.904111671197395e-06, + "loss": 17.1237, + "step": 33940 + }, + { + "epoch": 0.6138671596890884, + "grad_norm": 41.25, + "learning_rate": 9.904083418890736e-06, + "loss": 17.0901, + "step": 33950 + }, + { + "epoch": 0.6140479747582164, + "grad_norm": 43.125, + "learning_rate": 9.904055166584077e-06, + "loss": 17.1514, + "step": 33960 + }, + { + "epoch": 0.6142287898273442, + "grad_norm": 43.875, + "learning_rate": 9.904026914277417e-06, + "loss": 17.3739, + "step": 33970 + }, + { + "epoch": 0.6144096048964721, + "grad_norm": 40.78125, + "learning_rate": 9.903998661970758e-06, + "loss": 17.0267, + "step": 33980 + }, + { + "epoch": 0.6145904199655999, + "grad_norm": 41.0625, + "learning_rate": 9.903970409664097e-06, + "loss": 16.9841, + "step": 33990 + }, + { + "epoch": 0.6147712350347277, + "grad_norm": 40.90625, + "learning_rate": 9.903942157357437e-06, + "loss": 16.8561, + "step": 34000 + }, + { + "epoch": 0.6149520501038557, + "grad_norm": 39.78125, + "learning_rate": 9.903913905050778e-06, + "loss": 17.2727, + "step": 34010 + }, + { + "epoch": 0.6151328651729835, + "grad_norm": 42.1875, + "learning_rate": 9.90388565274412e-06, + "loss": 16.6296, + "step": 34020 + }, + { + "epoch": 0.6153136802421114, + "grad_norm": 41.71875, + "learning_rate": 9.90385740043746e-06, + "loss": 16.8987, + "step": 34030 + }, + { + "epoch": 0.6154944953112392, + "grad_norm": 40.25, + "learning_rate": 9.9038291481308e-06, + "loss": 17.0696, + "step": 34040 + }, + { + "epoch": 0.615675310380367, + "grad_norm": 42.96875, + "learning_rate": 9.90380089582414e-06, + "loss": 17.1479, + "step": 34050 + }, + { + "epoch": 0.615856125449495, + "grad_norm": 41.21875, + "learning_rate": 9.903772643517481e-06, + "loss": 17.0279, + "step": 34060 + }, + { + "epoch": 0.6160369405186228, + "grad_norm": 41.65625, + "learning_rate": 9.903744391210822e-06, + "loss": 17.0861, + "step": 34070 + }, + { + "epoch": 0.6162177555877507, + "grad_norm": 41.25, + "learning_rate": 9.90371613890416e-06, + "loss": 17.1457, + "step": 34080 + }, + { + "epoch": 0.6163985706568785, + "grad_norm": 42.21875, + "learning_rate": 9.903687886597501e-06, + "loss": 16.8054, + "step": 34090 + }, + { + "epoch": 0.6165793857260065, + "grad_norm": 39.71875, + "learning_rate": 9.903659634290842e-06, + "loss": 16.8803, + "step": 34100 + }, + { + "epoch": 0.6167602007951343, + "grad_norm": 40.0, + "learning_rate": 9.903631381984183e-06, + "loss": 17.2851, + "step": 34110 + }, + { + "epoch": 0.6169410158642621, + "grad_norm": 42.15625, + "learning_rate": 9.903603129677523e-06, + "loss": 17.0829, + "step": 34120 + }, + { + "epoch": 0.61712183093339, + "grad_norm": 43.625, + "learning_rate": 9.903574877370864e-06, + "loss": 16.758, + "step": 34130 + }, + { + "epoch": 0.6173026460025178, + "grad_norm": 39.09375, + "learning_rate": 9.903546625064204e-06, + "loss": 17.0564, + "step": 34140 + }, + { + "epoch": 0.6174834610716458, + "grad_norm": 41.625, + "learning_rate": 9.903518372757545e-06, + "loss": 16.8258, + "step": 34150 + }, + { + "epoch": 0.6176642761407736, + "grad_norm": 44.84375, + "learning_rate": 9.903490120450886e-06, + "loss": 17.0703, + "step": 34160 + }, + { + "epoch": 0.6178450912099014, + "grad_norm": 43.8125, + "learning_rate": 9.903461868144225e-06, + "loss": 16.8211, + "step": 34170 + }, + { + "epoch": 0.6180259062790293, + "grad_norm": 40.15625, + "learning_rate": 9.903433615837565e-06, + "loss": 16.6875, + "step": 34180 + }, + { + "epoch": 0.6182067213481571, + "grad_norm": 41.90625, + "learning_rate": 9.903405363530906e-06, + "loss": 16.5914, + "step": 34190 + }, + { + "epoch": 0.618387536417285, + "grad_norm": 39.875, + "learning_rate": 9.903377111224246e-06, + "loss": 16.8584, + "step": 34200 + }, + { + "epoch": 0.6185683514864129, + "grad_norm": 41.65625, + "learning_rate": 9.903348858917587e-06, + "loss": 16.7128, + "step": 34210 + }, + { + "epoch": 0.6187491665555407, + "grad_norm": 41.875, + "learning_rate": 9.903320606610928e-06, + "loss": 16.6644, + "step": 34220 + }, + { + "epoch": 0.6189299816246686, + "grad_norm": 40.96875, + "learning_rate": 9.903292354304268e-06, + "loss": 16.4386, + "step": 34230 + }, + { + "epoch": 0.6191107966937964, + "grad_norm": 40.71875, + "learning_rate": 9.903264101997609e-06, + "loss": 16.9011, + "step": 34240 + }, + { + "epoch": 0.6192916117629244, + "grad_norm": 41.0625, + "learning_rate": 9.903235849690948e-06, + "loss": 16.8479, + "step": 34250 + }, + { + "epoch": 0.6194724268320522, + "grad_norm": 42.53125, + "learning_rate": 9.903207597384289e-06, + "loss": 17.3777, + "step": 34260 + }, + { + "epoch": 0.6196532419011801, + "grad_norm": 45.84375, + "learning_rate": 9.903179345077629e-06, + "loss": 17.1973, + "step": 34270 + }, + { + "epoch": 0.6198340569703079, + "grad_norm": 41.6875, + "learning_rate": 9.903151092770971e-06, + "loss": 17.3628, + "step": 34280 + }, + { + "epoch": 0.6200148720394357, + "grad_norm": 41.46875, + "learning_rate": 9.90312284046431e-06, + "loss": 17.0504, + "step": 34290 + }, + { + "epoch": 0.6201956871085637, + "grad_norm": 42.21875, + "learning_rate": 9.903094588157651e-06, + "loss": 17.6827, + "step": 34300 + }, + { + "epoch": 0.6203765021776915, + "grad_norm": 42.96875, + "learning_rate": 9.903066335850992e-06, + "loss": 16.3788, + "step": 34310 + }, + { + "epoch": 0.6205573172468194, + "grad_norm": 42.0625, + "learning_rate": 9.903038083544332e-06, + "loss": 16.5896, + "step": 34320 + }, + { + "epoch": 0.6207381323159472, + "grad_norm": 41.46875, + "learning_rate": 9.903009831237673e-06, + "loss": 16.5937, + "step": 34330 + }, + { + "epoch": 0.620918947385075, + "grad_norm": 39.8125, + "learning_rate": 9.902981578931012e-06, + "loss": 16.8482, + "step": 34340 + }, + { + "epoch": 0.621099762454203, + "grad_norm": 42.21875, + "learning_rate": 9.902953326624352e-06, + "loss": 16.9854, + "step": 34350 + }, + { + "epoch": 0.6212805775233308, + "grad_norm": 44.625, + "learning_rate": 9.902925074317693e-06, + "loss": 16.7246, + "step": 34360 + }, + { + "epoch": 0.6214613925924587, + "grad_norm": 43.78125, + "learning_rate": 9.902896822011034e-06, + "loss": 16.9704, + "step": 34370 + }, + { + "epoch": 0.6216422076615865, + "grad_norm": 38.78125, + "learning_rate": 9.902868569704374e-06, + "loss": 16.8478, + "step": 34380 + }, + { + "epoch": 0.6218230227307143, + "grad_norm": 44.34375, + "learning_rate": 9.902840317397715e-06, + "loss": 16.8959, + "step": 34390 + }, + { + "epoch": 0.6220038377998423, + "grad_norm": 42.25, + "learning_rate": 9.902812065091056e-06, + "loss": 16.4676, + "step": 34400 + }, + { + "epoch": 0.6221846528689701, + "grad_norm": 40.59375, + "learning_rate": 9.902783812784396e-06, + "loss": 16.6884, + "step": 34410 + }, + { + "epoch": 0.622365467938098, + "grad_norm": 44.34375, + "learning_rate": 9.902755560477735e-06, + "loss": 17.0956, + "step": 34420 + }, + { + "epoch": 0.6225462830072258, + "grad_norm": 43.15625, + "learning_rate": 9.902727308171076e-06, + "loss": 16.7984, + "step": 34430 + }, + { + "epoch": 0.6227270980763537, + "grad_norm": 40.53125, + "learning_rate": 9.902699055864416e-06, + "loss": 16.6023, + "step": 34440 + }, + { + "epoch": 0.6229079131454816, + "grad_norm": 44.53125, + "learning_rate": 9.902670803557757e-06, + "loss": 16.9605, + "step": 34450 + }, + { + "epoch": 0.6230887282146094, + "grad_norm": 41.0625, + "learning_rate": 9.902642551251098e-06, + "loss": 17.1605, + "step": 34460 + }, + { + "epoch": 0.6232695432837373, + "grad_norm": 42.03125, + "learning_rate": 9.902614298944438e-06, + "loss": 16.8769, + "step": 34470 + }, + { + "epoch": 0.6234503583528651, + "grad_norm": 40.8125, + "learning_rate": 9.902586046637779e-06, + "loss": 17.3661, + "step": 34480 + }, + { + "epoch": 0.623631173421993, + "grad_norm": 41.59375, + "learning_rate": 9.90255779433112e-06, + "loss": 16.8391, + "step": 34490 + }, + { + "epoch": 0.6238119884911209, + "grad_norm": 41.625, + "learning_rate": 9.90252954202446e-06, + "loss": 17.2878, + "step": 34500 + }, + { + "epoch": 0.6239928035602487, + "grad_norm": 39.0625, + "learning_rate": 9.902501289717799e-06, + "loss": 17.1672, + "step": 34510 + }, + { + "epoch": 0.6241736186293766, + "grad_norm": 41.21875, + "learning_rate": 9.90247303741114e-06, + "loss": 17.1609, + "step": 34520 + }, + { + "epoch": 0.6243544336985044, + "grad_norm": 42.125, + "learning_rate": 9.90244478510448e-06, + "loss": 17.036, + "step": 34530 + }, + { + "epoch": 0.6245352487676323, + "grad_norm": 42.40625, + "learning_rate": 9.902416532797821e-06, + "loss": 16.9945, + "step": 34540 + }, + { + "epoch": 0.6247160638367601, + "grad_norm": 40.09375, + "learning_rate": 9.902388280491161e-06, + "loss": 16.446, + "step": 34550 + }, + { + "epoch": 0.624896878905888, + "grad_norm": 42.6875, + "learning_rate": 9.902360028184502e-06, + "loss": 16.5309, + "step": 34560 + }, + { + "epoch": 0.6250776939750159, + "grad_norm": 41.4375, + "learning_rate": 9.902331775877843e-06, + "loss": 17.0712, + "step": 34570 + }, + { + "epoch": 0.6252585090441437, + "grad_norm": 40.9375, + "learning_rate": 9.902303523571183e-06, + "loss": 16.878, + "step": 34580 + }, + { + "epoch": 0.6254393241132716, + "grad_norm": 43.28125, + "learning_rate": 9.902275271264524e-06, + "loss": 16.8346, + "step": 34590 + }, + { + "epoch": 0.6256201391823994, + "grad_norm": 40.96875, + "learning_rate": 9.902247018957863e-06, + "loss": 16.9437, + "step": 34600 + }, + { + "epoch": 0.6258009542515274, + "grad_norm": 43.0625, + "learning_rate": 9.902218766651204e-06, + "loss": 17.3732, + "step": 34610 + }, + { + "epoch": 0.6259817693206552, + "grad_norm": 42.03125, + "learning_rate": 9.902190514344544e-06, + "loss": 17.4373, + "step": 34620 + }, + { + "epoch": 0.626162584389783, + "grad_norm": 40.25, + "learning_rate": 9.902162262037885e-06, + "loss": 16.7355, + "step": 34630 + }, + { + "epoch": 0.6263433994589109, + "grad_norm": 43.03125, + "learning_rate": 9.902134009731225e-06, + "loss": 16.743, + "step": 34640 + }, + { + "epoch": 0.6265242145280387, + "grad_norm": 42.5625, + "learning_rate": 9.902105757424566e-06, + "loss": 16.8201, + "step": 34650 + }, + { + "epoch": 0.6267050295971667, + "grad_norm": 43.625, + "learning_rate": 9.902077505117907e-06, + "loss": 17.0255, + "step": 34660 + }, + { + "epoch": 0.6268858446662945, + "grad_norm": 41.6875, + "learning_rate": 9.902049252811247e-06, + "loss": 16.6278, + "step": 34670 + }, + { + "epoch": 0.6270666597354223, + "grad_norm": 43.15625, + "learning_rate": 9.902021000504586e-06, + "loss": 16.7172, + "step": 34680 + }, + { + "epoch": 0.6272474748045502, + "grad_norm": 42.90625, + "learning_rate": 9.901992748197927e-06, + "loss": 17.1172, + "step": 34690 + }, + { + "epoch": 0.627428289873678, + "grad_norm": 38.4375, + "learning_rate": 9.901964495891267e-06, + "loss": 17.0091, + "step": 34700 + }, + { + "epoch": 0.627609104942806, + "grad_norm": 43.625, + "learning_rate": 9.901936243584608e-06, + "loss": 16.8104, + "step": 34710 + }, + { + "epoch": 0.6277899200119338, + "grad_norm": 40.1875, + "learning_rate": 9.901907991277949e-06, + "loss": 16.8872, + "step": 34720 + }, + { + "epoch": 0.6279707350810616, + "grad_norm": 41.78125, + "learning_rate": 9.90187973897129e-06, + "loss": 17.1016, + "step": 34730 + }, + { + "epoch": 0.6281515501501895, + "grad_norm": 41.59375, + "learning_rate": 9.90185148666463e-06, + "loss": 17.1182, + "step": 34740 + }, + { + "epoch": 0.6283323652193173, + "grad_norm": 43.78125, + "learning_rate": 9.90182323435797e-06, + "loss": 16.9325, + "step": 34750 + }, + { + "epoch": 0.6285131802884453, + "grad_norm": 43.8125, + "learning_rate": 9.901794982051311e-06, + "loss": 16.7678, + "step": 34760 + }, + { + "epoch": 0.6286939953575731, + "grad_norm": 44.0, + "learning_rate": 9.90176672974465e-06, + "loss": 16.7194, + "step": 34770 + }, + { + "epoch": 0.628874810426701, + "grad_norm": 41.4375, + "learning_rate": 9.90173847743799e-06, + "loss": 16.9852, + "step": 34780 + }, + { + "epoch": 0.6290556254958288, + "grad_norm": 40.34375, + "learning_rate": 9.901710225131331e-06, + "loss": 17.3694, + "step": 34790 + }, + { + "epoch": 0.6292364405649566, + "grad_norm": 45.625, + "learning_rate": 9.901681972824672e-06, + "loss": 16.7443, + "step": 34800 + }, + { + "epoch": 0.6294172556340846, + "grad_norm": 42.53125, + "learning_rate": 9.901653720518013e-06, + "loss": 16.5887, + "step": 34810 + }, + { + "epoch": 0.6295980707032124, + "grad_norm": 43.125, + "learning_rate": 9.901625468211353e-06, + "loss": 16.9845, + "step": 34820 + }, + { + "epoch": 0.6297788857723403, + "grad_norm": 39.9375, + "learning_rate": 9.901597215904694e-06, + "loss": 16.8, + "step": 34830 + }, + { + "epoch": 0.6299597008414681, + "grad_norm": 41.6875, + "learning_rate": 9.901568963598034e-06, + "loss": 17.1302, + "step": 34840 + }, + { + "epoch": 0.6301405159105959, + "grad_norm": 43.90625, + "learning_rate": 9.901540711291373e-06, + "loss": 16.5467, + "step": 34850 + }, + { + "epoch": 0.6303213309797239, + "grad_norm": 43.84375, + "learning_rate": 9.901512458984714e-06, + "loss": 16.4088, + "step": 34860 + }, + { + "epoch": 0.6305021460488517, + "grad_norm": 42.09375, + "learning_rate": 9.901484206678055e-06, + "loss": 16.5629, + "step": 34870 + }, + { + "epoch": 0.6306829611179796, + "grad_norm": 43.9375, + "learning_rate": 9.901455954371395e-06, + "loss": 16.8379, + "step": 34880 + }, + { + "epoch": 0.6308637761871074, + "grad_norm": 43.03125, + "learning_rate": 9.901427702064736e-06, + "loss": 16.6736, + "step": 34890 + }, + { + "epoch": 0.6310445912562352, + "grad_norm": 42.5, + "learning_rate": 9.901399449758076e-06, + "loss": 16.9239, + "step": 34900 + }, + { + "epoch": 0.6312254063253632, + "grad_norm": 43.96875, + "learning_rate": 9.901371197451417e-06, + "loss": 17.1634, + "step": 34910 + }, + { + "epoch": 0.631406221394491, + "grad_norm": 40.78125, + "learning_rate": 9.901342945144758e-06, + "loss": 17.1069, + "step": 34920 + }, + { + "epoch": 0.6315870364636189, + "grad_norm": 42.875, + "learning_rate": 9.901314692838098e-06, + "loss": 16.9312, + "step": 34930 + }, + { + "epoch": 0.6317678515327467, + "grad_norm": 41.09375, + "learning_rate": 9.901286440531437e-06, + "loss": 16.988, + "step": 34940 + }, + { + "epoch": 0.6319486666018747, + "grad_norm": 43.1875, + "learning_rate": 9.901258188224778e-06, + "loss": 16.9073, + "step": 34950 + }, + { + "epoch": 0.6321294816710025, + "grad_norm": 41.84375, + "learning_rate": 9.901229935918119e-06, + "loss": 17.0261, + "step": 34960 + }, + { + "epoch": 0.6323102967401303, + "grad_norm": 41.8125, + "learning_rate": 9.901201683611459e-06, + "loss": 16.8868, + "step": 34970 + }, + { + "epoch": 0.6324911118092582, + "grad_norm": 42.375, + "learning_rate": 9.9011734313048e-06, + "loss": 16.634, + "step": 34980 + }, + { + "epoch": 0.632671926878386, + "grad_norm": 42.78125, + "learning_rate": 9.90114517899814e-06, + "loss": 16.5571, + "step": 34990 + }, + { + "epoch": 0.632852741947514, + "grad_norm": 41.625, + "learning_rate": 9.901116926691481e-06, + "loss": 16.7899, + "step": 35000 + }, + { + "epoch": 0.632852741947514, + "eval_loss": 2.1153111457824707, + "eval_runtime": 229.0986, + "eval_samples_per_second": 3169.199, + "eval_steps_per_second": 49.52, + "step": 35000 + }, + { + "epoch": 0.6330335570166418, + "grad_norm": 40.875, + "learning_rate": 9.901088674384822e-06, + "loss": 17.0118, + "step": 35010 + }, + { + "epoch": 0.6332143720857696, + "grad_norm": 42.5, + "learning_rate": 9.901060422078162e-06, + "loss": 17.3374, + "step": 35020 + }, + { + "epoch": 0.6333951871548975, + "grad_norm": 44.21875, + "learning_rate": 9.901032169771501e-06, + "loss": 17.1585, + "step": 35030 + }, + { + "epoch": 0.6335760022240253, + "grad_norm": 43.28125, + "learning_rate": 9.901003917464842e-06, + "loss": 16.9927, + "step": 35040 + }, + { + "epoch": 0.6337568172931533, + "grad_norm": 41.90625, + "learning_rate": 9.900975665158182e-06, + "loss": 16.7819, + "step": 35050 + }, + { + "epoch": 0.6339376323622811, + "grad_norm": 42.65625, + "learning_rate": 9.900947412851523e-06, + "loss": 16.6033, + "step": 35060 + }, + { + "epoch": 0.6341184474314089, + "grad_norm": 45.15625, + "learning_rate": 9.900919160544864e-06, + "loss": 16.9377, + "step": 35070 + }, + { + "epoch": 0.6342992625005368, + "grad_norm": 41.34375, + "learning_rate": 9.900890908238204e-06, + "loss": 16.9437, + "step": 35080 + }, + { + "epoch": 0.6344800775696646, + "grad_norm": 41.5625, + "learning_rate": 9.900862655931545e-06, + "loss": 16.9132, + "step": 35090 + }, + { + "epoch": 0.6346608926387926, + "grad_norm": 39.625, + "learning_rate": 9.900834403624886e-06, + "loss": 16.9262, + "step": 35100 + }, + { + "epoch": 0.6348417077079204, + "grad_norm": 43.03125, + "learning_rate": 9.900806151318224e-06, + "loss": 16.8212, + "step": 35110 + }, + { + "epoch": 0.6350225227770483, + "grad_norm": 43.5625, + "learning_rate": 9.900777899011565e-06, + "loss": 16.7947, + "step": 35120 + }, + { + "epoch": 0.6352033378461761, + "grad_norm": 39.59375, + "learning_rate": 9.900749646704906e-06, + "loss": 16.9076, + "step": 35130 + }, + { + "epoch": 0.6353841529153039, + "grad_norm": 42.25, + "learning_rate": 9.900721394398246e-06, + "loss": 16.4483, + "step": 35140 + }, + { + "epoch": 0.6355649679844319, + "grad_norm": 43.625, + "learning_rate": 9.900693142091587e-06, + "loss": 17.4467, + "step": 35150 + }, + { + "epoch": 0.6357457830535597, + "grad_norm": 39.375, + "learning_rate": 9.900664889784928e-06, + "loss": 16.6712, + "step": 35160 + }, + { + "epoch": 0.6359265981226876, + "grad_norm": 43.5625, + "learning_rate": 9.900636637478268e-06, + "loss": 16.6812, + "step": 35170 + }, + { + "epoch": 0.6361074131918154, + "grad_norm": 43.25, + "learning_rate": 9.900608385171609e-06, + "loss": 16.7277, + "step": 35180 + }, + { + "epoch": 0.6362882282609432, + "grad_norm": 42.75, + "learning_rate": 9.90058013286495e-06, + "loss": 17.0572, + "step": 35190 + }, + { + "epoch": 0.6364690433300711, + "grad_norm": 39.90625, + "learning_rate": 9.900551880558288e-06, + "loss": 17.105, + "step": 35200 + }, + { + "epoch": 0.636649858399199, + "grad_norm": 41.1875, + "learning_rate": 9.900523628251629e-06, + "loss": 16.6574, + "step": 35210 + }, + { + "epoch": 0.6368306734683269, + "grad_norm": 42.03125, + "learning_rate": 9.90049537594497e-06, + "loss": 16.8217, + "step": 35220 + }, + { + "epoch": 0.6370114885374547, + "grad_norm": 43.21875, + "learning_rate": 9.90046712363831e-06, + "loss": 16.7569, + "step": 35230 + }, + { + "epoch": 0.6371923036065825, + "grad_norm": 42.21875, + "learning_rate": 9.900438871331651e-06, + "loss": 16.4877, + "step": 35240 + }, + { + "epoch": 0.6373731186757104, + "grad_norm": 42.28125, + "learning_rate": 9.900410619024991e-06, + "loss": 16.7122, + "step": 35250 + }, + { + "epoch": 0.6375539337448383, + "grad_norm": 40.65625, + "learning_rate": 9.900382366718332e-06, + "loss": 16.9515, + "step": 35260 + }, + { + "epoch": 0.6377347488139662, + "grad_norm": 42.25, + "learning_rate": 9.900354114411673e-06, + "loss": 16.655, + "step": 35270 + }, + { + "epoch": 0.637915563883094, + "grad_norm": 42.5625, + "learning_rate": 9.900325862105012e-06, + "loss": 16.847, + "step": 35280 + }, + { + "epoch": 0.6380963789522219, + "grad_norm": 38.375, + "learning_rate": 9.900297609798352e-06, + "loss": 16.9473, + "step": 35290 + }, + { + "epoch": 0.6382771940213497, + "grad_norm": 42.625, + "learning_rate": 9.900269357491693e-06, + "loss": 16.8541, + "step": 35300 + }, + { + "epoch": 0.6384580090904776, + "grad_norm": 40.28125, + "learning_rate": 9.900241105185034e-06, + "loss": 16.9603, + "step": 35310 + }, + { + "epoch": 0.6386388241596055, + "grad_norm": 42.21875, + "learning_rate": 9.900212852878374e-06, + "loss": 16.9738, + "step": 35320 + }, + { + "epoch": 0.6388196392287333, + "grad_norm": 41.28125, + "learning_rate": 9.900184600571715e-06, + "loss": 17.0404, + "step": 35330 + }, + { + "epoch": 0.6390004542978612, + "grad_norm": 43.46875, + "learning_rate": 9.900156348265055e-06, + "loss": 17.3818, + "step": 35340 + }, + { + "epoch": 0.639181269366989, + "grad_norm": 40.90625, + "learning_rate": 9.900128095958396e-06, + "loss": 16.7372, + "step": 35350 + }, + { + "epoch": 0.6393620844361169, + "grad_norm": 42.65625, + "learning_rate": 9.900099843651737e-06, + "loss": 16.9702, + "step": 35360 + }, + { + "epoch": 0.6395428995052448, + "grad_norm": 45.375, + "learning_rate": 9.900071591345076e-06, + "loss": 17.1497, + "step": 35370 + }, + { + "epoch": 0.6397237145743726, + "grad_norm": 43.3125, + "learning_rate": 9.900043339038416e-06, + "loss": 16.9083, + "step": 35380 + }, + { + "epoch": 0.6399045296435005, + "grad_norm": 45.34375, + "learning_rate": 9.900015086731757e-06, + "loss": 16.762, + "step": 35390 + }, + { + "epoch": 0.6400853447126283, + "grad_norm": 40.6875, + "learning_rate": 9.899986834425097e-06, + "loss": 17.2308, + "step": 35400 + }, + { + "epoch": 0.6402661597817562, + "grad_norm": 43.96875, + "learning_rate": 9.899958582118438e-06, + "loss": 16.7875, + "step": 35410 + }, + { + "epoch": 0.6404469748508841, + "grad_norm": 41.90625, + "learning_rate": 9.899930329811779e-06, + "loss": 16.6921, + "step": 35420 + }, + { + "epoch": 0.6406277899200119, + "grad_norm": 43.15625, + "learning_rate": 9.89990207750512e-06, + "loss": 16.8197, + "step": 35430 + }, + { + "epoch": 0.6408086049891398, + "grad_norm": 43.65625, + "learning_rate": 9.89987382519846e-06, + "loss": 16.9357, + "step": 35440 + }, + { + "epoch": 0.6409894200582676, + "grad_norm": 43.40625, + "learning_rate": 9.8998455728918e-06, + "loss": 16.8973, + "step": 35450 + }, + { + "epoch": 0.6411702351273956, + "grad_norm": 43.21875, + "learning_rate": 9.89981732058514e-06, + "loss": 16.7642, + "step": 35460 + }, + { + "epoch": 0.6413510501965234, + "grad_norm": 46.84375, + "learning_rate": 9.89978906827848e-06, + "loss": 16.9361, + "step": 35470 + }, + { + "epoch": 0.6415318652656512, + "grad_norm": 43.15625, + "learning_rate": 9.89976081597182e-06, + "loss": 17.1652, + "step": 35480 + }, + { + "epoch": 0.6417126803347791, + "grad_norm": 43.71875, + "learning_rate": 9.899732563665161e-06, + "loss": 16.9174, + "step": 35490 + }, + { + "epoch": 0.6418934954039069, + "grad_norm": 41.8125, + "learning_rate": 9.899704311358502e-06, + "loss": 16.7135, + "step": 35500 + }, + { + "epoch": 0.6420743104730349, + "grad_norm": 38.71875, + "learning_rate": 9.899676059051843e-06, + "loss": 16.6854, + "step": 35510 + }, + { + "epoch": 0.6422551255421627, + "grad_norm": 42.375, + "learning_rate": 9.899647806745183e-06, + "loss": 17.3483, + "step": 35520 + }, + { + "epoch": 0.6424359406112905, + "grad_norm": 40.625, + "learning_rate": 9.899619554438524e-06, + "loss": 16.9488, + "step": 35530 + }, + { + "epoch": 0.6426167556804184, + "grad_norm": 42.4375, + "learning_rate": 9.899591302131863e-06, + "loss": 17.0364, + "step": 35540 + }, + { + "epoch": 0.6427975707495462, + "grad_norm": 44.1875, + "learning_rate": 9.899563049825203e-06, + "loss": 17.0453, + "step": 35550 + }, + { + "epoch": 0.6429783858186742, + "grad_norm": 42.0, + "learning_rate": 9.899534797518544e-06, + "loss": 17.2179, + "step": 35560 + }, + { + "epoch": 0.643159200887802, + "grad_norm": 42.40625, + "learning_rate": 9.899506545211885e-06, + "loss": 16.6237, + "step": 35570 + }, + { + "epoch": 0.6433400159569298, + "grad_norm": 42.34375, + "learning_rate": 9.899478292905225e-06, + "loss": 16.5776, + "step": 35580 + }, + { + "epoch": 0.6435208310260577, + "grad_norm": 43.28125, + "learning_rate": 9.899450040598564e-06, + "loss": 17.0083, + "step": 35590 + }, + { + "epoch": 0.6437016460951855, + "grad_norm": 42.09375, + "learning_rate": 9.899421788291906e-06, + "loss": 16.8802, + "step": 35600 + }, + { + "epoch": 0.6438824611643135, + "grad_norm": 43.15625, + "learning_rate": 9.899393535985247e-06, + "loss": 16.8426, + "step": 35610 + }, + { + "epoch": 0.6440632762334413, + "grad_norm": 40.5, + "learning_rate": 9.899365283678588e-06, + "loss": 16.6048, + "step": 35620 + }, + { + "epoch": 0.6442440913025692, + "grad_norm": 44.21875, + "learning_rate": 9.899337031371927e-06, + "loss": 16.8645, + "step": 35630 + }, + { + "epoch": 0.644424906371697, + "grad_norm": 42.5, + "learning_rate": 9.899308779065267e-06, + "loss": 17.0983, + "step": 35640 + }, + { + "epoch": 0.6446057214408248, + "grad_norm": 40.625, + "learning_rate": 9.899280526758608e-06, + "loss": 16.9962, + "step": 35650 + }, + { + "epoch": 0.6447865365099528, + "grad_norm": 41.34375, + "learning_rate": 9.899252274451949e-06, + "loss": 16.5154, + "step": 35660 + }, + { + "epoch": 0.6449673515790806, + "grad_norm": 41.53125, + "learning_rate": 9.899224022145289e-06, + "loss": 17.1147, + "step": 35670 + }, + { + "epoch": 0.6451481666482085, + "grad_norm": 43.75, + "learning_rate": 9.899195769838628e-06, + "loss": 17.1466, + "step": 35680 + }, + { + "epoch": 0.6453289817173363, + "grad_norm": 43.875, + "learning_rate": 9.89916751753197e-06, + "loss": 16.6516, + "step": 35690 + }, + { + "epoch": 0.6455097967864641, + "grad_norm": 44.375, + "learning_rate": 9.899139265225311e-06, + "loss": 16.9173, + "step": 35700 + }, + { + "epoch": 0.6456906118555921, + "grad_norm": 43.0625, + "learning_rate": 9.89911101291865e-06, + "loss": 16.9888, + "step": 35710 + }, + { + "epoch": 0.6458714269247199, + "grad_norm": 40.125, + "learning_rate": 9.89908276061199e-06, + "loss": 16.8233, + "step": 35720 + }, + { + "epoch": 0.6460522419938478, + "grad_norm": 44.1875, + "learning_rate": 9.899054508305331e-06, + "loss": 16.9166, + "step": 35730 + }, + { + "epoch": 0.6462330570629756, + "grad_norm": 38.09375, + "learning_rate": 9.899026255998672e-06, + "loss": 16.7543, + "step": 35740 + }, + { + "epoch": 0.6464138721321034, + "grad_norm": 41.96875, + "learning_rate": 9.898998003692012e-06, + "loss": 17.1185, + "step": 35750 + }, + { + "epoch": 0.6465946872012314, + "grad_norm": 42.40625, + "learning_rate": 9.898969751385353e-06, + "loss": 16.4558, + "step": 35760 + }, + { + "epoch": 0.6467755022703592, + "grad_norm": 44.28125, + "learning_rate": 9.898941499078694e-06, + "loss": 16.7642, + "step": 35770 + }, + { + "epoch": 0.6469563173394871, + "grad_norm": 41.59375, + "learning_rate": 9.898913246772034e-06, + "loss": 16.9298, + "step": 35780 + }, + { + "epoch": 0.6471371324086149, + "grad_norm": 42.5625, + "learning_rate": 9.898884994465375e-06, + "loss": 17.1463, + "step": 35790 + }, + { + "epoch": 0.6473179474777428, + "grad_norm": 40.84375, + "learning_rate": 9.898856742158714e-06, + "loss": 17.0144, + "step": 35800 + }, + { + "epoch": 0.6474987625468707, + "grad_norm": 42.59375, + "learning_rate": 9.898828489852054e-06, + "loss": 16.8369, + "step": 35810 + }, + { + "epoch": 0.6476795776159985, + "grad_norm": 42.03125, + "learning_rate": 9.898800237545395e-06, + "loss": 16.9437, + "step": 35820 + }, + { + "epoch": 0.6478603926851264, + "grad_norm": 42.28125, + "learning_rate": 9.898771985238736e-06, + "loss": 16.7855, + "step": 35830 + }, + { + "epoch": 0.6480412077542542, + "grad_norm": 43.03125, + "learning_rate": 9.898743732932076e-06, + "loss": 16.9263, + "step": 35840 + }, + { + "epoch": 0.6482220228233821, + "grad_norm": 45.625, + "learning_rate": 9.898715480625415e-06, + "loss": 16.818, + "step": 35850 + }, + { + "epoch": 0.64840283789251, + "grad_norm": 39.59375, + "learning_rate": 9.898687228318758e-06, + "loss": 16.6835, + "step": 35860 + }, + { + "epoch": 0.6485836529616378, + "grad_norm": 41.21875, + "learning_rate": 9.898658976012098e-06, + "loss": 16.8036, + "step": 35870 + }, + { + "epoch": 0.6487644680307657, + "grad_norm": 41.5625, + "learning_rate": 9.898630723705439e-06, + "loss": 17.0221, + "step": 35880 + }, + { + "epoch": 0.6489452830998935, + "grad_norm": 44.25, + "learning_rate": 9.898602471398778e-06, + "loss": 16.639, + "step": 35890 + }, + { + "epoch": 0.6491260981690214, + "grad_norm": 47.71875, + "learning_rate": 9.898574219092118e-06, + "loss": 16.8011, + "step": 35900 + }, + { + "epoch": 0.6493069132381493, + "grad_norm": 41.6875, + "learning_rate": 9.898545966785459e-06, + "loss": 16.8954, + "step": 35910 + }, + { + "epoch": 0.6494877283072771, + "grad_norm": 41.6875, + "learning_rate": 9.8985177144788e-06, + "loss": 16.8521, + "step": 35920 + }, + { + "epoch": 0.649668543376405, + "grad_norm": 42.84375, + "learning_rate": 9.89848946217214e-06, + "loss": 17.0667, + "step": 35930 + }, + { + "epoch": 0.6498493584455328, + "grad_norm": 41.75, + "learning_rate": 9.89846120986548e-06, + "loss": 17.2669, + "step": 35940 + }, + { + "epoch": 0.6500301735146607, + "grad_norm": 40.4375, + "learning_rate": 9.898432957558821e-06, + "loss": 16.4704, + "step": 35950 + }, + { + "epoch": 0.6502109885837886, + "grad_norm": 45.40625, + "learning_rate": 9.898404705252162e-06, + "loss": 16.8687, + "step": 35960 + }, + { + "epoch": 0.6503918036529165, + "grad_norm": 42.59375, + "learning_rate": 9.898376452945501e-06, + "loss": 17.2093, + "step": 35970 + }, + { + "epoch": 0.6505726187220443, + "grad_norm": 43.125, + "learning_rate": 9.898348200638842e-06, + "loss": 17.1447, + "step": 35980 + }, + { + "epoch": 0.6507534337911721, + "grad_norm": 46.625, + "learning_rate": 9.898319948332182e-06, + "loss": 16.7466, + "step": 35990 + }, + { + "epoch": 0.6509342488603, + "grad_norm": 43.875, + "learning_rate": 9.898291696025523e-06, + "loss": 16.8183, + "step": 36000 + }, + { + "epoch": 0.6511150639294279, + "grad_norm": 42.15625, + "learning_rate": 9.898263443718864e-06, + "loss": 17.1838, + "step": 36010 + }, + { + "epoch": 0.6512958789985558, + "grad_norm": 42.6875, + "learning_rate": 9.898235191412202e-06, + "loss": 16.6245, + "step": 36020 + }, + { + "epoch": 0.6514766940676836, + "grad_norm": 42.25, + "learning_rate": 9.898206939105543e-06, + "loss": 16.658, + "step": 36030 + }, + { + "epoch": 0.6516575091368114, + "grad_norm": 45.125, + "learning_rate": 9.898178686798885e-06, + "loss": 17.396, + "step": 36040 + }, + { + "epoch": 0.6518383242059393, + "grad_norm": 41.875, + "learning_rate": 9.898150434492226e-06, + "loss": 16.9268, + "step": 36050 + }, + { + "epoch": 0.6520191392750672, + "grad_norm": 42.21875, + "learning_rate": 9.898122182185565e-06, + "loss": 16.686, + "step": 36060 + }, + { + "epoch": 0.6521999543441951, + "grad_norm": 42.1875, + "learning_rate": 9.898093929878906e-06, + "loss": 16.705, + "step": 36070 + }, + { + "epoch": 0.6523807694133229, + "grad_norm": 43.90625, + "learning_rate": 9.898065677572246e-06, + "loss": 16.6551, + "step": 36080 + }, + { + "epoch": 0.6525615844824507, + "grad_norm": 43.28125, + "learning_rate": 9.898037425265587e-06, + "loss": 16.6608, + "step": 36090 + }, + { + "epoch": 0.6527423995515786, + "grad_norm": 44.21875, + "learning_rate": 9.898009172958927e-06, + "loss": 17.0485, + "step": 36100 + }, + { + "epoch": 0.6529232146207065, + "grad_norm": 42.0625, + "learning_rate": 9.897980920652266e-06, + "loss": 17.0704, + "step": 36110 + }, + { + "epoch": 0.6531040296898344, + "grad_norm": 43.15625, + "learning_rate": 9.897952668345609e-06, + "loss": 16.8169, + "step": 36120 + }, + { + "epoch": 0.6532848447589622, + "grad_norm": 42.5, + "learning_rate": 9.89792441603895e-06, + "loss": 16.8907, + "step": 36130 + }, + { + "epoch": 0.6534656598280901, + "grad_norm": 42.8125, + "learning_rate": 9.897896163732288e-06, + "loss": 16.8032, + "step": 36140 + }, + { + "epoch": 0.6536464748972179, + "grad_norm": 40.96875, + "learning_rate": 9.897867911425629e-06, + "loss": 16.6132, + "step": 36150 + }, + { + "epoch": 0.6538272899663458, + "grad_norm": 41.53125, + "learning_rate": 9.89783965911897e-06, + "loss": 16.8102, + "step": 36160 + }, + { + "epoch": 0.6540081050354737, + "grad_norm": 41.96875, + "learning_rate": 9.89781140681231e-06, + "loss": 17.1489, + "step": 36170 + }, + { + "epoch": 0.6541889201046015, + "grad_norm": 44.90625, + "learning_rate": 9.89778315450565e-06, + "loss": 16.9509, + "step": 36180 + }, + { + "epoch": 0.6543697351737294, + "grad_norm": 42.59375, + "learning_rate": 9.89775490219899e-06, + "loss": 17.1862, + "step": 36190 + }, + { + "epoch": 0.6545505502428572, + "grad_norm": 42.78125, + "learning_rate": 9.89772664989233e-06, + "loss": 17.2136, + "step": 36200 + }, + { + "epoch": 0.6547313653119851, + "grad_norm": 42.8125, + "learning_rate": 9.897698397585673e-06, + "loss": 16.7174, + "step": 36210 + }, + { + "epoch": 0.654912180381113, + "grad_norm": 41.1875, + "learning_rate": 9.897670145279013e-06, + "loss": 16.752, + "step": 36220 + }, + { + "epoch": 0.6550929954502408, + "grad_norm": 44.28125, + "learning_rate": 9.897641892972352e-06, + "loss": 16.7826, + "step": 36230 + }, + { + "epoch": 0.6552738105193687, + "grad_norm": 42.875, + "learning_rate": 9.897613640665693e-06, + "loss": 16.906, + "step": 36240 + }, + { + "epoch": 0.6554546255884965, + "grad_norm": 42.625, + "learning_rate": 9.897585388359033e-06, + "loss": 16.7243, + "step": 36250 + }, + { + "epoch": 0.6556354406576244, + "grad_norm": 44.21875, + "learning_rate": 9.897557136052374e-06, + "loss": 17.1138, + "step": 36260 + }, + { + "epoch": 0.6558162557267523, + "grad_norm": 43.8125, + "learning_rate": 9.897528883745715e-06, + "loss": 17.3506, + "step": 36270 + }, + { + "epoch": 0.6559970707958801, + "grad_norm": 39.53125, + "learning_rate": 9.897500631439054e-06, + "loss": 17.1032, + "step": 36280 + }, + { + "epoch": 0.656177885865008, + "grad_norm": 42.34375, + "learning_rate": 9.897472379132394e-06, + "loss": 16.862, + "step": 36290 + }, + { + "epoch": 0.6563587009341358, + "grad_norm": 43.8125, + "learning_rate": 9.897444126825736e-06, + "loss": 17.4687, + "step": 36300 + }, + { + "epoch": 0.6565395160032637, + "grad_norm": 42.71875, + "learning_rate": 9.897415874519075e-06, + "loss": 16.8392, + "step": 36310 + }, + { + "epoch": 0.6567203310723916, + "grad_norm": 44.40625, + "learning_rate": 9.897387622212416e-06, + "loss": 17.142, + "step": 36320 + }, + { + "epoch": 0.6569011461415194, + "grad_norm": 42.125, + "learning_rate": 9.897359369905757e-06, + "loss": 16.773, + "step": 36330 + }, + { + "epoch": 0.6570819612106473, + "grad_norm": 41.375, + "learning_rate": 9.897331117599097e-06, + "loss": 16.9312, + "step": 36340 + }, + { + "epoch": 0.6572627762797751, + "grad_norm": 44.90625, + "learning_rate": 9.897302865292438e-06, + "loss": 17.084, + "step": 36350 + }, + { + "epoch": 0.6574435913489031, + "grad_norm": 46.78125, + "learning_rate": 9.897274612985779e-06, + "loss": 17.0247, + "step": 36360 + }, + { + "epoch": 0.6576244064180309, + "grad_norm": 43.4375, + "learning_rate": 9.897246360679117e-06, + "loss": 16.9223, + "step": 36370 + }, + { + "epoch": 0.6578052214871587, + "grad_norm": 40.59375, + "learning_rate": 9.897218108372458e-06, + "loss": 16.8791, + "step": 36380 + }, + { + "epoch": 0.6579860365562866, + "grad_norm": 42.6875, + "learning_rate": 9.8971898560658e-06, + "loss": 17.133, + "step": 36390 + }, + { + "epoch": 0.6581668516254144, + "grad_norm": 44.25, + "learning_rate": 9.89716160375914e-06, + "loss": 17.3585, + "step": 36400 + }, + { + "epoch": 0.6583476666945424, + "grad_norm": 41.09375, + "learning_rate": 9.89713335145248e-06, + "loss": 16.954, + "step": 36410 + }, + { + "epoch": 0.6585284817636702, + "grad_norm": 41.40625, + "learning_rate": 9.89710509914582e-06, + "loss": 16.6315, + "step": 36420 + }, + { + "epoch": 0.658709296832798, + "grad_norm": 42.59375, + "learning_rate": 9.897076846839161e-06, + "loss": 17.1309, + "step": 36430 + }, + { + "epoch": 0.6588901119019259, + "grad_norm": 43.75, + "learning_rate": 9.897048594532502e-06, + "loss": 16.5603, + "step": 36440 + }, + { + "epoch": 0.6590709269710537, + "grad_norm": 43.3125, + "learning_rate": 9.89702034222584e-06, + "loss": 17.2705, + "step": 36450 + }, + { + "epoch": 0.6592517420401817, + "grad_norm": 43.125, + "learning_rate": 9.896992089919181e-06, + "loss": 17.4744, + "step": 36460 + }, + { + "epoch": 0.6594325571093095, + "grad_norm": 41.15625, + "learning_rate": 9.896963837612524e-06, + "loss": 16.9674, + "step": 36470 + }, + { + "epoch": 0.6596133721784373, + "grad_norm": 44.5, + "learning_rate": 9.896935585305864e-06, + "loss": 17.0317, + "step": 36480 + }, + { + "epoch": 0.6597941872475652, + "grad_norm": 42.71875, + "learning_rate": 9.896907332999203e-06, + "loss": 17.1149, + "step": 36490 + }, + { + "epoch": 0.659975002316693, + "grad_norm": 43.75, + "learning_rate": 9.896879080692544e-06, + "loss": 16.7619, + "step": 36500 + }, + { + "epoch": 0.660155817385821, + "grad_norm": 40.4375, + "learning_rate": 9.896850828385884e-06, + "loss": 16.9837, + "step": 36510 + }, + { + "epoch": 0.6603366324549488, + "grad_norm": 43.4375, + "learning_rate": 9.896822576079225e-06, + "loss": 16.7384, + "step": 36520 + }, + { + "epoch": 0.6605174475240767, + "grad_norm": 42.375, + "learning_rate": 9.896794323772566e-06, + "loss": 16.8103, + "step": 36530 + }, + { + "epoch": 0.6606982625932045, + "grad_norm": 48.125, + "learning_rate": 9.896766071465905e-06, + "loss": 16.7246, + "step": 36540 + }, + { + "epoch": 0.6608790776623323, + "grad_norm": 41.125, + "learning_rate": 9.896737819159245e-06, + "loss": 16.5504, + "step": 36550 + }, + { + "epoch": 0.6610598927314603, + "grad_norm": 44.46875, + "learning_rate": 9.896709566852588e-06, + "loss": 17.1036, + "step": 36560 + }, + { + "epoch": 0.6612407078005881, + "grad_norm": 42.0, + "learning_rate": 9.896681314545927e-06, + "loss": 16.7741, + "step": 36570 + }, + { + "epoch": 0.661421522869716, + "grad_norm": 47.5, + "learning_rate": 9.896653062239267e-06, + "loss": 16.8923, + "step": 36580 + }, + { + "epoch": 0.6616023379388438, + "grad_norm": 40.0625, + "learning_rate": 9.896624809932608e-06, + "loss": 16.797, + "step": 36590 + }, + { + "epoch": 0.6617831530079716, + "grad_norm": 43.65625, + "learning_rate": 9.896596557625948e-06, + "loss": 16.9909, + "step": 36600 + }, + { + "epoch": 0.6619639680770996, + "grad_norm": 41.40625, + "learning_rate": 9.896568305319289e-06, + "loss": 16.6533, + "step": 36610 + }, + { + "epoch": 0.6621447831462274, + "grad_norm": 43.75, + "learning_rate": 9.896540053012628e-06, + "loss": 16.85, + "step": 36620 + }, + { + "epoch": 0.6623255982153553, + "grad_norm": 45.125, + "learning_rate": 9.896511800705969e-06, + "loss": 16.742, + "step": 36630 + }, + { + "epoch": 0.6625064132844831, + "grad_norm": 43.09375, + "learning_rate": 9.89648354839931e-06, + "loss": 16.9515, + "step": 36640 + }, + { + "epoch": 0.6626872283536109, + "grad_norm": 42.75, + "learning_rate": 9.896455296092652e-06, + "loss": 17.0241, + "step": 36650 + }, + { + "epoch": 0.6628680434227389, + "grad_norm": 43.65625, + "learning_rate": 9.89642704378599e-06, + "loss": 17.213, + "step": 36660 + }, + { + "epoch": 0.6630488584918667, + "grad_norm": 44.09375, + "learning_rate": 9.896398791479331e-06, + "loss": 16.7815, + "step": 36670 + }, + { + "epoch": 0.6632296735609946, + "grad_norm": 43.65625, + "learning_rate": 9.896370539172672e-06, + "loss": 17.3715, + "step": 36680 + }, + { + "epoch": 0.6634104886301224, + "grad_norm": 45.6875, + "learning_rate": 9.896342286866012e-06, + "loss": 17.1651, + "step": 36690 + }, + { + "epoch": 0.6635913036992503, + "grad_norm": 44.21875, + "learning_rate": 9.896314034559353e-06, + "loss": 17.0294, + "step": 36700 + }, + { + "epoch": 0.6637721187683782, + "grad_norm": 42.03125, + "learning_rate": 9.896285782252692e-06, + "loss": 16.8683, + "step": 36710 + }, + { + "epoch": 0.663952933837506, + "grad_norm": 43.6875, + "learning_rate": 9.896257529946032e-06, + "loss": 17.0183, + "step": 36720 + }, + { + "epoch": 0.6641337489066339, + "grad_norm": 40.625, + "learning_rate": 9.896229277639373e-06, + "loss": 16.8197, + "step": 36730 + }, + { + "epoch": 0.6643145639757617, + "grad_norm": 44.21875, + "learning_rate": 9.896201025332714e-06, + "loss": 16.8733, + "step": 36740 + }, + { + "epoch": 0.6644953790448896, + "grad_norm": 40.78125, + "learning_rate": 9.896172773026054e-06, + "loss": 16.7025, + "step": 36750 + }, + { + "epoch": 0.6646761941140175, + "grad_norm": 44.5625, + "learning_rate": 9.896144520719395e-06, + "loss": 16.9805, + "step": 36760 + }, + { + "epoch": 0.6648570091831453, + "grad_norm": 42.28125, + "learning_rate": 9.896116268412736e-06, + "loss": 17.3065, + "step": 36770 + }, + { + "epoch": 0.6650378242522732, + "grad_norm": 41.5625, + "learning_rate": 9.896088016106076e-06, + "loss": 17.0735, + "step": 36780 + }, + { + "epoch": 0.665218639321401, + "grad_norm": 42.0625, + "learning_rate": 9.896059763799417e-06, + "loss": 17.0477, + "step": 36790 + }, + { + "epoch": 0.6653994543905289, + "grad_norm": 43.09375, + "learning_rate": 9.896031511492756e-06, + "loss": 16.4885, + "step": 36800 + }, + { + "epoch": 0.6655802694596568, + "grad_norm": 42.0, + "learning_rate": 9.896003259186096e-06, + "loss": 17.0502, + "step": 36810 + }, + { + "epoch": 0.6657610845287846, + "grad_norm": 40.9375, + "learning_rate": 9.895975006879439e-06, + "loss": 16.9869, + "step": 36820 + }, + { + "epoch": 0.6659418995979125, + "grad_norm": 42.53125, + "learning_rate": 9.895946754572778e-06, + "loss": 16.7416, + "step": 36830 + }, + { + "epoch": 0.6661227146670403, + "grad_norm": 44.53125, + "learning_rate": 9.895918502266118e-06, + "loss": 16.9338, + "step": 36840 + }, + { + "epoch": 0.6663035297361682, + "grad_norm": 42.8125, + "learning_rate": 9.895890249959459e-06, + "loss": 16.6768, + "step": 36850 + }, + { + "epoch": 0.6664843448052961, + "grad_norm": 43.0625, + "learning_rate": 9.8958619976528e-06, + "loss": 16.521, + "step": 36860 + }, + { + "epoch": 0.666665159874424, + "grad_norm": 38.9375, + "learning_rate": 9.89583374534614e-06, + "loss": 16.6117, + "step": 36870 + }, + { + "epoch": 0.6668459749435518, + "grad_norm": 43.09375, + "learning_rate": 9.895805493039479e-06, + "loss": 16.6247, + "step": 36880 + }, + { + "epoch": 0.6670267900126796, + "grad_norm": 41.25, + "learning_rate": 9.89577724073282e-06, + "loss": 16.7686, + "step": 36890 + }, + { + "epoch": 0.6672076050818075, + "grad_norm": 43.40625, + "learning_rate": 9.89574898842616e-06, + "loss": 16.936, + "step": 36900 + }, + { + "epoch": 0.6673884201509354, + "grad_norm": 39.03125, + "learning_rate": 9.895720736119503e-06, + "loss": 17.2432, + "step": 36910 + }, + { + "epoch": 0.6675692352200633, + "grad_norm": 41.75, + "learning_rate": 9.895692483812842e-06, + "loss": 17.0423, + "step": 36920 + }, + { + "epoch": 0.6677500502891911, + "grad_norm": 41.78125, + "learning_rate": 9.895664231506182e-06, + "loss": 16.7086, + "step": 36930 + }, + { + "epoch": 0.6679308653583189, + "grad_norm": 40.09375, + "learning_rate": 9.895635979199523e-06, + "loss": 17.0203, + "step": 36940 + }, + { + "epoch": 0.6681116804274468, + "grad_norm": 43.15625, + "learning_rate": 9.895607726892863e-06, + "loss": 16.739, + "step": 36950 + }, + { + "epoch": 0.6682924954965747, + "grad_norm": 40.375, + "learning_rate": 9.895579474586204e-06, + "loss": 16.8941, + "step": 36960 + }, + { + "epoch": 0.6684733105657026, + "grad_norm": 44.625, + "learning_rate": 9.895551222279543e-06, + "loss": 17.0405, + "step": 36970 + }, + { + "epoch": 0.6686541256348304, + "grad_norm": 43.5, + "learning_rate": 9.895522969972884e-06, + "loss": 17.3071, + "step": 36980 + }, + { + "epoch": 0.6688349407039582, + "grad_norm": 41.5, + "learning_rate": 9.895494717666224e-06, + "loss": 16.9758, + "step": 36990 + }, + { + "epoch": 0.6690157557730861, + "grad_norm": 40.28125, + "learning_rate": 9.895466465359565e-06, + "loss": 17.0213, + "step": 37000 + }, + { + "epoch": 0.669196570842214, + "grad_norm": 41.4375, + "learning_rate": 9.895438213052905e-06, + "loss": 16.3765, + "step": 37010 + }, + { + "epoch": 0.6693773859113419, + "grad_norm": 43.9375, + "learning_rate": 9.895409960746246e-06, + "loss": 16.96, + "step": 37020 + }, + { + "epoch": 0.6695582009804697, + "grad_norm": 44.1875, + "learning_rate": 9.895381708439587e-06, + "loss": 16.0563, + "step": 37030 + }, + { + "epoch": 0.6697390160495976, + "grad_norm": 40.40625, + "learning_rate": 9.895353456132927e-06, + "loss": 16.8258, + "step": 37040 + }, + { + "epoch": 0.6699198311187254, + "grad_norm": 41.625, + "learning_rate": 9.895325203826266e-06, + "loss": 16.9237, + "step": 37050 + }, + { + "epoch": 0.6701006461878533, + "grad_norm": 43.15625, + "learning_rate": 9.895296951519607e-06, + "loss": 16.9691, + "step": 37060 + }, + { + "epoch": 0.6702814612569812, + "grad_norm": 45.4375, + "learning_rate": 9.895268699212947e-06, + "loss": 16.9079, + "step": 37070 + }, + { + "epoch": 0.670462276326109, + "grad_norm": 43.84375, + "learning_rate": 9.895240446906288e-06, + "loss": 16.7044, + "step": 37080 + }, + { + "epoch": 0.6706430913952369, + "grad_norm": 42.5, + "learning_rate": 9.895212194599629e-06, + "loss": 16.7271, + "step": 37090 + }, + { + "epoch": 0.6708239064643647, + "grad_norm": 41.75, + "learning_rate": 9.89518394229297e-06, + "loss": 16.8627, + "step": 37100 + }, + { + "epoch": 0.6710047215334926, + "grad_norm": 43.5625, + "learning_rate": 9.89515568998631e-06, + "loss": 16.9436, + "step": 37110 + }, + { + "epoch": 0.6711855366026205, + "grad_norm": 39.53125, + "learning_rate": 9.89512743767965e-06, + "loss": 16.9911, + "step": 37120 + }, + { + "epoch": 0.6713663516717483, + "grad_norm": 41.0625, + "learning_rate": 9.895099185372991e-06, + "loss": 16.8614, + "step": 37130 + }, + { + "epoch": 0.6715471667408762, + "grad_norm": 39.46875, + "learning_rate": 9.89507093306633e-06, + "loss": 16.749, + "step": 37140 + }, + { + "epoch": 0.671727981810004, + "grad_norm": 41.96875, + "learning_rate": 9.89504268075967e-06, + "loss": 17.2616, + "step": 37150 + }, + { + "epoch": 0.6719087968791319, + "grad_norm": 45.3125, + "learning_rate": 9.895014428453011e-06, + "loss": 17.2514, + "step": 37160 + }, + { + "epoch": 0.6720896119482598, + "grad_norm": 41.96875, + "learning_rate": 9.894986176146352e-06, + "loss": 16.9123, + "step": 37170 + }, + { + "epoch": 0.6722704270173876, + "grad_norm": 45.4375, + "learning_rate": 9.894957923839693e-06, + "loss": 16.9001, + "step": 37180 + }, + { + "epoch": 0.6724512420865155, + "grad_norm": 42.75, + "learning_rate": 9.894929671533033e-06, + "loss": 16.7307, + "step": 37190 + }, + { + "epoch": 0.6726320571556433, + "grad_norm": 43.5625, + "learning_rate": 9.894901419226374e-06, + "loss": 16.4719, + "step": 37200 + }, + { + "epoch": 0.6728128722247713, + "grad_norm": 41.375, + "learning_rate": 9.894873166919714e-06, + "loss": 16.7455, + "step": 37210 + }, + { + "epoch": 0.6729936872938991, + "grad_norm": 43.09375, + "learning_rate": 9.894844914613055e-06, + "loss": 17.014, + "step": 37220 + }, + { + "epoch": 0.6731745023630269, + "grad_norm": 42.4375, + "learning_rate": 9.894816662306394e-06, + "loss": 16.5502, + "step": 37230 + }, + { + "epoch": 0.6733553174321548, + "grad_norm": 41.34375, + "learning_rate": 9.894788409999735e-06, + "loss": 16.7081, + "step": 37240 + }, + { + "epoch": 0.6735361325012826, + "grad_norm": 43.0, + "learning_rate": 9.894760157693075e-06, + "loss": 16.4107, + "step": 37250 + }, + { + "epoch": 0.6737169475704106, + "grad_norm": 42.3125, + "learning_rate": 9.894731905386416e-06, + "loss": 16.7003, + "step": 37260 + }, + { + "epoch": 0.6738977626395384, + "grad_norm": 42.4375, + "learning_rate": 9.894703653079757e-06, + "loss": 17.0678, + "step": 37270 + }, + { + "epoch": 0.6740785777086662, + "grad_norm": 43.75, + "learning_rate": 9.894675400773097e-06, + "loss": 17.2967, + "step": 37280 + }, + { + "epoch": 0.6742593927777941, + "grad_norm": 41.59375, + "learning_rate": 9.894647148466438e-06, + "loss": 16.9273, + "step": 37290 + }, + { + "epoch": 0.6744402078469219, + "grad_norm": 40.59375, + "learning_rate": 9.894618896159778e-06, + "loss": 16.7979, + "step": 37300 + }, + { + "epoch": 0.6746210229160499, + "grad_norm": 40.5625, + "learning_rate": 9.894590643853117e-06, + "loss": 16.8169, + "step": 37310 + }, + { + "epoch": 0.6748018379851777, + "grad_norm": 39.21875, + "learning_rate": 9.894562391546458e-06, + "loss": 16.4699, + "step": 37320 + }, + { + "epoch": 0.6749826530543055, + "grad_norm": 43.3125, + "learning_rate": 9.894534139239799e-06, + "loss": 17.2835, + "step": 37330 + }, + { + "epoch": 0.6751634681234334, + "grad_norm": 44.0625, + "learning_rate": 9.89450588693314e-06, + "loss": 17.0986, + "step": 37340 + }, + { + "epoch": 0.6753442831925612, + "grad_norm": 41.71875, + "learning_rate": 9.89447763462648e-06, + "loss": 16.5451, + "step": 37350 + }, + { + "epoch": 0.6755250982616892, + "grad_norm": 38.75, + "learning_rate": 9.89444938231982e-06, + "loss": 16.5853, + "step": 37360 + }, + { + "epoch": 0.675705913330817, + "grad_norm": 41.5625, + "learning_rate": 9.894421130013161e-06, + "loss": 16.5989, + "step": 37370 + }, + { + "epoch": 0.6758867283999449, + "grad_norm": 43.78125, + "learning_rate": 9.894392877706502e-06, + "loss": 16.3647, + "step": 37380 + }, + { + "epoch": 0.6760675434690727, + "grad_norm": 43.3125, + "learning_rate": 9.894364625399842e-06, + "loss": 16.8922, + "step": 37390 + }, + { + "epoch": 0.6762483585382005, + "grad_norm": 42.53125, + "learning_rate": 9.894336373093181e-06, + "loss": 16.6042, + "step": 37400 + }, + { + "epoch": 0.6764291736073285, + "grad_norm": 44.375, + "learning_rate": 9.894308120786522e-06, + "loss": 16.704, + "step": 37410 + }, + { + "epoch": 0.6766099886764563, + "grad_norm": 43.21875, + "learning_rate": 9.894279868479862e-06, + "loss": 16.9412, + "step": 37420 + }, + { + "epoch": 0.6767908037455842, + "grad_norm": 42.3125, + "learning_rate": 9.894251616173203e-06, + "loss": 16.9898, + "step": 37430 + }, + { + "epoch": 0.676971618814712, + "grad_norm": 44.78125, + "learning_rate": 9.894223363866544e-06, + "loss": 17.0491, + "step": 37440 + }, + { + "epoch": 0.6771524338838398, + "grad_norm": 44.1875, + "learning_rate": 9.894195111559884e-06, + "loss": 16.946, + "step": 37450 + }, + { + "epoch": 0.6773332489529678, + "grad_norm": 42.53125, + "learning_rate": 9.894166859253225e-06, + "loss": 16.8964, + "step": 37460 + }, + { + "epoch": 0.6775140640220956, + "grad_norm": 42.125, + "learning_rate": 9.894138606946566e-06, + "loss": 16.7398, + "step": 37470 + }, + { + "epoch": 0.6776948790912235, + "grad_norm": 42.71875, + "learning_rate": 9.894110354639905e-06, + "loss": 17.4216, + "step": 37480 + }, + { + "epoch": 0.6778756941603513, + "grad_norm": 42.03125, + "learning_rate": 9.894082102333245e-06, + "loss": 16.6143, + "step": 37490 + }, + { + "epoch": 0.6780565092294791, + "grad_norm": 42.78125, + "learning_rate": 9.894053850026586e-06, + "loss": 16.975, + "step": 37500 + }, + { + "epoch": 0.6782373242986071, + "grad_norm": 41.0, + "learning_rate": 9.894025597719926e-06, + "loss": 17.1048, + "step": 37510 + }, + { + "epoch": 0.6784181393677349, + "grad_norm": 41.53125, + "learning_rate": 9.893997345413267e-06, + "loss": 16.8411, + "step": 37520 + }, + { + "epoch": 0.6785989544368628, + "grad_norm": 43.34375, + "learning_rate": 9.893969093106608e-06, + "loss": 16.628, + "step": 37530 + }, + { + "epoch": 0.6787797695059906, + "grad_norm": 42.875, + "learning_rate": 9.893940840799948e-06, + "loss": 16.8008, + "step": 37540 + }, + { + "epoch": 0.6789605845751185, + "grad_norm": 42.71875, + "learning_rate": 9.893912588493289e-06, + "loss": 17.0204, + "step": 37550 + }, + { + "epoch": 0.6791413996442464, + "grad_norm": 44.65625, + "learning_rate": 9.89388433618663e-06, + "loss": 16.8503, + "step": 37560 + }, + { + "epoch": 0.6793222147133742, + "grad_norm": 42.65625, + "learning_rate": 9.893856083879968e-06, + "loss": 17.1805, + "step": 37570 + }, + { + "epoch": 0.6795030297825021, + "grad_norm": 45.84375, + "learning_rate": 9.893827831573309e-06, + "loss": 17.1783, + "step": 37580 + }, + { + "epoch": 0.6796838448516299, + "grad_norm": 42.65625, + "learning_rate": 9.89379957926665e-06, + "loss": 16.4853, + "step": 37590 + }, + { + "epoch": 0.6798646599207578, + "grad_norm": 45.4375, + "learning_rate": 9.89377132695999e-06, + "loss": 16.8635, + "step": 37600 + }, + { + "epoch": 0.6800454749898857, + "grad_norm": 40.90625, + "learning_rate": 9.893743074653331e-06, + "loss": 16.7238, + "step": 37610 + }, + { + "epoch": 0.6802262900590135, + "grad_norm": 44.0, + "learning_rate": 9.893714822346672e-06, + "loss": 16.7515, + "step": 37620 + }, + { + "epoch": 0.6804071051281414, + "grad_norm": 42.1875, + "learning_rate": 9.893686570040012e-06, + "loss": 16.5855, + "step": 37630 + }, + { + "epoch": 0.6805879201972692, + "grad_norm": 45.28125, + "learning_rate": 9.893658317733353e-06, + "loss": 16.6717, + "step": 37640 + }, + { + "epoch": 0.6807687352663971, + "grad_norm": 41.75, + "learning_rate": 9.893630065426693e-06, + "loss": 16.5964, + "step": 37650 + }, + { + "epoch": 0.680949550335525, + "grad_norm": 41.9375, + "learning_rate": 9.893601813120032e-06, + "loss": 17.0906, + "step": 37660 + }, + { + "epoch": 0.6811303654046528, + "grad_norm": 40.65625, + "learning_rate": 9.893573560813373e-06, + "loss": 16.9528, + "step": 37670 + }, + { + "epoch": 0.6813111804737807, + "grad_norm": 40.46875, + "learning_rate": 9.893545308506714e-06, + "loss": 16.7316, + "step": 37680 + }, + { + "epoch": 0.6814919955429085, + "grad_norm": 41.03125, + "learning_rate": 9.893517056200054e-06, + "loss": 17.0295, + "step": 37690 + }, + { + "epoch": 0.6816728106120364, + "grad_norm": 42.71875, + "learning_rate": 9.893488803893395e-06, + "loss": 16.8099, + "step": 37700 + }, + { + "epoch": 0.6818536256811643, + "grad_norm": 42.40625, + "learning_rate": 9.893460551586735e-06, + "loss": 17.2334, + "step": 37710 + }, + { + "epoch": 0.6820344407502922, + "grad_norm": 43.90625, + "learning_rate": 9.893432299280076e-06, + "loss": 17.0922, + "step": 37720 + }, + { + "epoch": 0.68221525581942, + "grad_norm": 42.84375, + "learning_rate": 9.893404046973417e-06, + "loss": 16.5968, + "step": 37730 + }, + { + "epoch": 0.6823960708885478, + "grad_norm": 42.78125, + "learning_rate": 9.893375794666756e-06, + "loss": 16.5034, + "step": 37740 + }, + { + "epoch": 0.6825768859576757, + "grad_norm": 41.03125, + "learning_rate": 9.893347542360096e-06, + "loss": 16.4676, + "step": 37750 + }, + { + "epoch": 0.6827577010268036, + "grad_norm": 44.15625, + "learning_rate": 9.893319290053437e-06, + "loss": 17.131, + "step": 37760 + }, + { + "epoch": 0.6829385160959315, + "grad_norm": 43.78125, + "learning_rate": 9.893291037746777e-06, + "loss": 16.6227, + "step": 37770 + }, + { + "epoch": 0.6831193311650593, + "grad_norm": 39.53125, + "learning_rate": 9.893262785440118e-06, + "loss": 17.1105, + "step": 37780 + }, + { + "epoch": 0.6833001462341871, + "grad_norm": 46.03125, + "learning_rate": 9.893234533133459e-06, + "loss": 16.6947, + "step": 37790 + }, + { + "epoch": 0.683480961303315, + "grad_norm": 41.4375, + "learning_rate": 9.8932062808268e-06, + "loss": 16.7799, + "step": 37800 + }, + { + "epoch": 0.6836617763724429, + "grad_norm": 42.59375, + "learning_rate": 9.89317802852014e-06, + "loss": 16.9629, + "step": 37810 + }, + { + "epoch": 0.6838425914415708, + "grad_norm": 43.34375, + "learning_rate": 9.89314977621348e-06, + "loss": 16.9785, + "step": 37820 + }, + { + "epoch": 0.6840234065106986, + "grad_norm": 47.46875, + "learning_rate": 9.89312152390682e-06, + "loss": 17.1893, + "step": 37830 + }, + { + "epoch": 0.6842042215798264, + "grad_norm": 45.0625, + "learning_rate": 9.89309327160016e-06, + "loss": 16.745, + "step": 37840 + }, + { + "epoch": 0.6843850366489543, + "grad_norm": 41.03125, + "learning_rate": 9.8930650192935e-06, + "loss": 16.7386, + "step": 37850 + }, + { + "epoch": 0.6845658517180822, + "grad_norm": 44.5625, + "learning_rate": 9.893036766986841e-06, + "loss": 17.1872, + "step": 37860 + }, + { + "epoch": 0.6847466667872101, + "grad_norm": 45.09375, + "learning_rate": 9.893008514680182e-06, + "loss": 17.1807, + "step": 37870 + }, + { + "epoch": 0.6849274818563379, + "grad_norm": 40.8125, + "learning_rate": 9.892980262373523e-06, + "loss": 16.3967, + "step": 37880 + }, + { + "epoch": 0.6851082969254658, + "grad_norm": 43.3125, + "learning_rate": 9.892952010066863e-06, + "loss": 17.071, + "step": 37890 + }, + { + "epoch": 0.6852891119945936, + "grad_norm": 40.75, + "learning_rate": 9.892923757760204e-06, + "loss": 16.4375, + "step": 37900 + }, + { + "epoch": 0.6854699270637215, + "grad_norm": 42.3125, + "learning_rate": 9.892895505453543e-06, + "loss": 16.9052, + "step": 37910 + }, + { + "epoch": 0.6856507421328494, + "grad_norm": 41.1875, + "learning_rate": 9.892867253146883e-06, + "loss": 17.2202, + "step": 37920 + }, + { + "epoch": 0.6858315572019772, + "grad_norm": 41.28125, + "learning_rate": 9.892839000840224e-06, + "loss": 16.2978, + "step": 37930 + }, + { + "epoch": 0.6860123722711051, + "grad_norm": 42.3125, + "learning_rate": 9.892810748533565e-06, + "loss": 16.8237, + "step": 37940 + }, + { + "epoch": 0.6861931873402329, + "grad_norm": 40.4375, + "learning_rate": 9.892782496226905e-06, + "loss": 16.7933, + "step": 37950 + }, + { + "epoch": 0.6863740024093608, + "grad_norm": 41.09375, + "learning_rate": 9.892754243920246e-06, + "loss": 16.5888, + "step": 37960 + }, + { + "epoch": 0.6865548174784887, + "grad_norm": 42.625, + "learning_rate": 9.892725991613587e-06, + "loss": 17.1759, + "step": 37970 + }, + { + "epoch": 0.6867356325476165, + "grad_norm": 44.28125, + "learning_rate": 9.892697739306927e-06, + "loss": 16.5377, + "step": 37980 + }, + { + "epoch": 0.6869164476167444, + "grad_norm": 41.9375, + "learning_rate": 9.892669487000268e-06, + "loss": 16.3997, + "step": 37990 + }, + { + "epoch": 0.6870972626858722, + "grad_norm": 43.1875, + "learning_rate": 9.892641234693607e-06, + "loss": 16.5976, + "step": 38000 + }, + { + "epoch": 0.687278077755, + "grad_norm": 45.53125, + "learning_rate": 9.892612982386947e-06, + "loss": 16.7352, + "step": 38010 + }, + { + "epoch": 0.687458892824128, + "grad_norm": 44.0, + "learning_rate": 9.892584730080288e-06, + "loss": 17.0146, + "step": 38020 + }, + { + "epoch": 0.6876397078932558, + "grad_norm": 41.5, + "learning_rate": 9.892556477773629e-06, + "loss": 16.9501, + "step": 38030 + }, + { + "epoch": 0.6878205229623837, + "grad_norm": 40.03125, + "learning_rate": 9.89252822546697e-06, + "loss": 16.6916, + "step": 38040 + }, + { + "epoch": 0.6880013380315115, + "grad_norm": 41.59375, + "learning_rate": 9.89249997316031e-06, + "loss": 16.8109, + "step": 38050 + }, + { + "epoch": 0.6881821531006395, + "grad_norm": 43.09375, + "learning_rate": 9.89247172085365e-06, + "loss": 16.1281, + "step": 38060 + }, + { + "epoch": 0.6883629681697673, + "grad_norm": 39.875, + "learning_rate": 9.892443468546991e-06, + "loss": 16.5465, + "step": 38070 + }, + { + "epoch": 0.6885437832388951, + "grad_norm": 41.1875, + "learning_rate": 9.892415216240332e-06, + "loss": 16.8856, + "step": 38080 + }, + { + "epoch": 0.688724598308023, + "grad_norm": 44.875, + "learning_rate": 9.89238696393367e-06, + "loss": 17.1266, + "step": 38090 + }, + { + "epoch": 0.6889054133771508, + "grad_norm": 48.3125, + "learning_rate": 9.892358711627011e-06, + "loss": 17.2518, + "step": 38100 + }, + { + "epoch": 0.6890862284462788, + "grad_norm": 45.1875, + "learning_rate": 9.892330459320352e-06, + "loss": 16.6606, + "step": 38110 + }, + { + "epoch": 0.6892670435154066, + "grad_norm": 44.1875, + "learning_rate": 9.892302207013692e-06, + "loss": 17.1191, + "step": 38120 + }, + { + "epoch": 0.6894478585845344, + "grad_norm": 42.03125, + "learning_rate": 9.892273954707033e-06, + "loss": 16.8843, + "step": 38130 + }, + { + "epoch": 0.6896286736536623, + "grad_norm": 43.40625, + "learning_rate": 9.892245702400374e-06, + "loss": 17.0698, + "step": 38140 + }, + { + "epoch": 0.6898094887227901, + "grad_norm": 42.8125, + "learning_rate": 9.892217450093714e-06, + "loss": 16.8898, + "step": 38150 + }, + { + "epoch": 0.6899903037919181, + "grad_norm": 42.1875, + "learning_rate": 9.892189197787055e-06, + "loss": 16.6293, + "step": 38160 + }, + { + "epoch": 0.6901711188610459, + "grad_norm": 40.1875, + "learning_rate": 9.892160945480394e-06, + "loss": 16.8833, + "step": 38170 + }, + { + "epoch": 0.6903519339301737, + "grad_norm": 43.1875, + "learning_rate": 9.892132693173735e-06, + "loss": 16.8337, + "step": 38180 + }, + { + "epoch": 0.6905327489993016, + "grad_norm": 43.46875, + "learning_rate": 9.892104440867075e-06, + "loss": 16.8272, + "step": 38190 + }, + { + "epoch": 0.6907135640684294, + "grad_norm": 42.53125, + "learning_rate": 9.892076188560416e-06, + "loss": 16.7386, + "step": 38200 + }, + { + "epoch": 0.6908943791375574, + "grad_norm": 41.09375, + "learning_rate": 9.892047936253756e-06, + "loss": 17.0917, + "step": 38210 + }, + { + "epoch": 0.6910751942066852, + "grad_norm": 44.03125, + "learning_rate": 9.892019683947097e-06, + "loss": 16.8651, + "step": 38220 + }, + { + "epoch": 0.6912560092758131, + "grad_norm": 43.9375, + "learning_rate": 9.891991431640438e-06, + "loss": 17.1524, + "step": 38230 + }, + { + "epoch": 0.6914368243449409, + "grad_norm": 42.25, + "learning_rate": 9.891963179333778e-06, + "loss": 16.482, + "step": 38240 + }, + { + "epoch": 0.6916176394140687, + "grad_norm": 40.8125, + "learning_rate": 9.891934927027119e-06, + "loss": 16.8363, + "step": 38250 + }, + { + "epoch": 0.6917984544831967, + "grad_norm": 39.59375, + "learning_rate": 9.891906674720458e-06, + "loss": 16.6902, + "step": 38260 + }, + { + "epoch": 0.6919792695523245, + "grad_norm": 43.1875, + "learning_rate": 9.891878422413798e-06, + "loss": 17.1459, + "step": 38270 + }, + { + "epoch": 0.6921600846214524, + "grad_norm": 41.75, + "learning_rate": 9.891850170107139e-06, + "loss": 16.5241, + "step": 38280 + }, + { + "epoch": 0.6923408996905802, + "grad_norm": 43.375, + "learning_rate": 9.89182191780048e-06, + "loss": 16.95, + "step": 38290 + }, + { + "epoch": 0.692521714759708, + "grad_norm": 44.4375, + "learning_rate": 9.89179366549382e-06, + "loss": 16.9044, + "step": 38300 + }, + { + "epoch": 0.692702529828836, + "grad_norm": 41.375, + "learning_rate": 9.891765413187161e-06, + "loss": 17.0597, + "step": 38310 + }, + { + "epoch": 0.6928833448979638, + "grad_norm": 39.875, + "learning_rate": 9.891737160880502e-06, + "loss": 16.8813, + "step": 38320 + }, + { + "epoch": 0.6930641599670917, + "grad_norm": 40.96875, + "learning_rate": 9.891708908573842e-06, + "loss": 17.1459, + "step": 38330 + }, + { + "epoch": 0.6932449750362195, + "grad_norm": 42.5, + "learning_rate": 9.891680656267181e-06, + "loss": 16.6943, + "step": 38340 + }, + { + "epoch": 0.6934257901053473, + "grad_norm": 42.625, + "learning_rate": 9.891652403960522e-06, + "loss": 16.8816, + "step": 38350 + }, + { + "epoch": 0.6936066051744753, + "grad_norm": 42.3125, + "learning_rate": 9.891624151653862e-06, + "loss": 16.8835, + "step": 38360 + }, + { + "epoch": 0.6937874202436031, + "grad_norm": 44.59375, + "learning_rate": 9.891595899347203e-06, + "loss": 17.0272, + "step": 38370 + }, + { + "epoch": 0.693968235312731, + "grad_norm": 40.53125, + "learning_rate": 9.891567647040544e-06, + "loss": 16.7614, + "step": 38380 + }, + { + "epoch": 0.6941490503818588, + "grad_norm": 42.125, + "learning_rate": 9.891539394733883e-06, + "loss": 16.4766, + "step": 38390 + }, + { + "epoch": 0.6943298654509867, + "grad_norm": 40.4375, + "learning_rate": 9.891511142427225e-06, + "loss": 17.3731, + "step": 38400 + }, + { + "epoch": 0.6945106805201146, + "grad_norm": 40.78125, + "learning_rate": 9.891482890120565e-06, + "loss": 16.6749, + "step": 38410 + }, + { + "epoch": 0.6946914955892424, + "grad_norm": 43.84375, + "learning_rate": 9.891454637813906e-06, + "loss": 16.6835, + "step": 38420 + }, + { + "epoch": 0.6948723106583703, + "grad_norm": 44.875, + "learning_rate": 9.891426385507245e-06, + "loss": 16.9625, + "step": 38430 + }, + { + "epoch": 0.6950531257274981, + "grad_norm": 43.0, + "learning_rate": 9.891398133200586e-06, + "loss": 16.6631, + "step": 38440 + }, + { + "epoch": 0.695233940796626, + "grad_norm": 41.0625, + "learning_rate": 9.891369880893926e-06, + "loss": 16.7724, + "step": 38450 + }, + { + "epoch": 0.6954147558657539, + "grad_norm": 43.96875, + "learning_rate": 9.891341628587267e-06, + "loss": 16.9028, + "step": 38460 + }, + { + "epoch": 0.6955955709348817, + "grad_norm": 42.5, + "learning_rate": 9.891313376280607e-06, + "loss": 17.0701, + "step": 38470 + }, + { + "epoch": 0.6957763860040096, + "grad_norm": 42.34375, + "learning_rate": 9.891285123973946e-06, + "loss": 16.3494, + "step": 38480 + }, + { + "epoch": 0.6959572010731374, + "grad_norm": 43.4375, + "learning_rate": 9.891256871667289e-06, + "loss": 16.4693, + "step": 38490 + }, + { + "epoch": 0.6961380161422653, + "grad_norm": 43.25, + "learning_rate": 9.89122861936063e-06, + "loss": 16.8401, + "step": 38500 + }, + { + "epoch": 0.6963188312113932, + "grad_norm": 42.8125, + "learning_rate": 9.891200367053968e-06, + "loss": 16.548, + "step": 38510 + }, + { + "epoch": 0.696499646280521, + "grad_norm": 42.40625, + "learning_rate": 9.891172114747309e-06, + "loss": 17.2683, + "step": 38520 + }, + { + "epoch": 0.6966804613496489, + "grad_norm": 43.71875, + "learning_rate": 9.89114386244065e-06, + "loss": 17.3941, + "step": 38530 + }, + { + "epoch": 0.6968612764187767, + "grad_norm": 42.15625, + "learning_rate": 9.89111561013399e-06, + "loss": 17.3078, + "step": 38540 + }, + { + "epoch": 0.6970420914879046, + "grad_norm": 41.0625, + "learning_rate": 9.89108735782733e-06, + "loss": 16.5073, + "step": 38550 + }, + { + "epoch": 0.6972229065570325, + "grad_norm": 44.9375, + "learning_rate": 9.891059105520671e-06, + "loss": 16.5629, + "step": 38560 + }, + { + "epoch": 0.6974037216261604, + "grad_norm": 42.96875, + "learning_rate": 9.891030853214012e-06, + "loss": 16.6385, + "step": 38570 + }, + { + "epoch": 0.6975845366952882, + "grad_norm": 43.21875, + "learning_rate": 9.891002600907353e-06, + "loss": 16.8539, + "step": 38580 + }, + { + "epoch": 0.697765351764416, + "grad_norm": 43.65625, + "learning_rate": 9.890974348600693e-06, + "loss": 16.9434, + "step": 38590 + }, + { + "epoch": 0.6979461668335439, + "grad_norm": 43.3125, + "learning_rate": 9.890946096294032e-06, + "loss": 16.9127, + "step": 38600 + }, + { + "epoch": 0.6981269819026718, + "grad_norm": 44.65625, + "learning_rate": 9.890917843987373e-06, + "loss": 16.9669, + "step": 38610 + }, + { + "epoch": 0.6983077969717997, + "grad_norm": 45.53125, + "learning_rate": 9.890889591680713e-06, + "loss": 16.6195, + "step": 38620 + }, + { + "epoch": 0.6984886120409275, + "grad_norm": 43.03125, + "learning_rate": 9.890861339374054e-06, + "loss": 16.8106, + "step": 38630 + }, + { + "epoch": 0.6986694271100553, + "grad_norm": 41.15625, + "learning_rate": 9.890833087067395e-06, + "loss": 16.8928, + "step": 38640 + }, + { + "epoch": 0.6988502421791832, + "grad_norm": 44.59375, + "learning_rate": 9.890804834760734e-06, + "loss": 16.7577, + "step": 38650 + }, + { + "epoch": 0.699031057248311, + "grad_norm": 42.3125, + "learning_rate": 9.890776582454076e-06, + "loss": 16.6293, + "step": 38660 + }, + { + "epoch": 0.699211872317439, + "grad_norm": 42.9375, + "learning_rate": 9.890748330147417e-06, + "loss": 16.8936, + "step": 38670 + }, + { + "epoch": 0.6993926873865668, + "grad_norm": 43.125, + "learning_rate": 9.890720077840757e-06, + "loss": 16.863, + "step": 38680 + }, + { + "epoch": 0.6995735024556946, + "grad_norm": 44.125, + "learning_rate": 9.890691825534096e-06, + "loss": 16.9209, + "step": 38690 + }, + { + "epoch": 0.6997543175248225, + "grad_norm": 47.375, + "learning_rate": 9.890663573227437e-06, + "loss": 16.7577, + "step": 38700 + }, + { + "epoch": 0.6999351325939503, + "grad_norm": 44.46875, + "learning_rate": 9.890635320920777e-06, + "loss": 16.7093, + "step": 38710 + }, + { + "epoch": 0.7001159476630783, + "grad_norm": 43.75, + "learning_rate": 9.890607068614118e-06, + "loss": 16.6643, + "step": 38720 + }, + { + "epoch": 0.7002967627322061, + "grad_norm": 42.28125, + "learning_rate": 9.890578816307459e-06, + "loss": 16.9856, + "step": 38730 + }, + { + "epoch": 0.700477577801334, + "grad_norm": 41.59375, + "learning_rate": 9.890550564000798e-06, + "loss": 16.7979, + "step": 38740 + }, + { + "epoch": 0.7006583928704618, + "grad_norm": 41.3125, + "learning_rate": 9.89052231169414e-06, + "loss": 17.2538, + "step": 38750 + }, + { + "epoch": 0.7008392079395896, + "grad_norm": 46.84375, + "learning_rate": 9.89049405938748e-06, + "loss": 16.9212, + "step": 38760 + }, + { + "epoch": 0.7010200230087176, + "grad_norm": 46.71875, + "learning_rate": 9.89046580708082e-06, + "loss": 17.0221, + "step": 38770 + }, + { + "epoch": 0.7012008380778454, + "grad_norm": 43.25, + "learning_rate": 9.89043755477416e-06, + "loss": 17.1704, + "step": 38780 + }, + { + "epoch": 0.7013816531469733, + "grad_norm": 41.65625, + "learning_rate": 9.8904093024675e-06, + "loss": 17.0322, + "step": 38790 + }, + { + "epoch": 0.7015624682161011, + "grad_norm": 42.75, + "learning_rate": 9.890381050160841e-06, + "loss": 16.8613, + "step": 38800 + }, + { + "epoch": 0.701743283285229, + "grad_norm": 40.375, + "learning_rate": 9.890352797854182e-06, + "loss": 16.7168, + "step": 38810 + }, + { + "epoch": 0.7019240983543569, + "grad_norm": 40.125, + "learning_rate": 9.89032454554752e-06, + "loss": 16.8869, + "step": 38820 + }, + { + "epoch": 0.7021049134234847, + "grad_norm": 42.0, + "learning_rate": 9.890296293240861e-06, + "loss": 17.2882, + "step": 38830 + }, + { + "epoch": 0.7022857284926126, + "grad_norm": 40.0, + "learning_rate": 9.890268040934204e-06, + "loss": 16.2854, + "step": 38840 + }, + { + "epoch": 0.7024665435617404, + "grad_norm": 41.4375, + "learning_rate": 9.890239788627544e-06, + "loss": 16.8362, + "step": 38850 + }, + { + "epoch": 0.7026473586308682, + "grad_norm": 46.09375, + "learning_rate": 9.890211536320883e-06, + "loss": 16.914, + "step": 38860 + }, + { + "epoch": 0.7028281736999962, + "grad_norm": 41.96875, + "learning_rate": 9.890183284014224e-06, + "loss": 16.5401, + "step": 38870 + }, + { + "epoch": 0.703008988769124, + "grad_norm": 40.9375, + "learning_rate": 9.890155031707565e-06, + "loss": 16.6829, + "step": 38880 + }, + { + "epoch": 0.7031898038382519, + "grad_norm": 40.0625, + "learning_rate": 9.890126779400905e-06, + "loss": 16.6298, + "step": 38890 + }, + { + "epoch": 0.7033706189073797, + "grad_norm": 40.875, + "learning_rate": 9.890098527094246e-06, + "loss": 16.8991, + "step": 38900 + }, + { + "epoch": 0.7035514339765077, + "grad_norm": 42.34375, + "learning_rate": 9.890070274787585e-06, + "loss": 16.7942, + "step": 38910 + }, + { + "epoch": 0.7037322490456355, + "grad_norm": 43.28125, + "learning_rate": 9.890042022480927e-06, + "loss": 16.8306, + "step": 38920 + }, + { + "epoch": 0.7039130641147633, + "grad_norm": 43.78125, + "learning_rate": 9.890013770174268e-06, + "loss": 16.7857, + "step": 38930 + }, + { + "epoch": 0.7040938791838912, + "grad_norm": 41.8125, + "learning_rate": 9.889985517867607e-06, + "loss": 17.0068, + "step": 38940 + }, + { + "epoch": 0.704274694253019, + "grad_norm": 45.0, + "learning_rate": 9.889957265560947e-06, + "loss": 17.0016, + "step": 38950 + }, + { + "epoch": 0.704455509322147, + "grad_norm": 44.0, + "learning_rate": 9.889929013254288e-06, + "loss": 16.9943, + "step": 38960 + }, + { + "epoch": 0.7046363243912748, + "grad_norm": 42.25, + "learning_rate": 9.889900760947628e-06, + "loss": 17.1728, + "step": 38970 + }, + { + "epoch": 0.7048171394604026, + "grad_norm": 41.28125, + "learning_rate": 9.889872508640969e-06, + "loss": 16.7221, + "step": 38980 + }, + { + "epoch": 0.7049979545295305, + "grad_norm": 42.125, + "learning_rate": 9.88984425633431e-06, + "loss": 17.306, + "step": 38990 + }, + { + "epoch": 0.7051787695986583, + "grad_norm": 46.5625, + "learning_rate": 9.889816004027649e-06, + "loss": 16.8126, + "step": 39000 + }, + { + "epoch": 0.7053595846677863, + "grad_norm": 47.84375, + "learning_rate": 9.889787751720991e-06, + "loss": 17.1228, + "step": 39010 + }, + { + "epoch": 0.7055403997369141, + "grad_norm": 44.34375, + "learning_rate": 9.889759499414332e-06, + "loss": 16.9815, + "step": 39020 + }, + { + "epoch": 0.7057212148060419, + "grad_norm": 46.75, + "learning_rate": 9.88973124710767e-06, + "loss": 16.8795, + "step": 39030 + }, + { + "epoch": 0.7059020298751698, + "grad_norm": 41.25, + "learning_rate": 9.889702994801011e-06, + "loss": 16.5388, + "step": 39040 + }, + { + "epoch": 0.7060828449442976, + "grad_norm": 38.8125, + "learning_rate": 9.889674742494352e-06, + "loss": 17.3997, + "step": 39050 + }, + { + "epoch": 0.7062636600134256, + "grad_norm": 41.96875, + "learning_rate": 9.889646490187692e-06, + "loss": 17.334, + "step": 39060 + }, + { + "epoch": 0.7064444750825534, + "grad_norm": 42.5625, + "learning_rate": 9.889618237881033e-06, + "loss": 16.7021, + "step": 39070 + }, + { + "epoch": 0.7066252901516813, + "grad_norm": 42.75, + "learning_rate": 9.889589985574372e-06, + "loss": 16.7919, + "step": 39080 + }, + { + "epoch": 0.7068061052208091, + "grad_norm": 42.375, + "learning_rate": 9.889561733267713e-06, + "loss": 16.9483, + "step": 39090 + }, + { + "epoch": 0.7069869202899369, + "grad_norm": 44.0625, + "learning_rate": 9.889533480961055e-06, + "loss": 17.155, + "step": 39100 + }, + { + "epoch": 0.7071677353590649, + "grad_norm": 43.46875, + "learning_rate": 9.889505228654395e-06, + "loss": 16.7376, + "step": 39110 + }, + { + "epoch": 0.7073485504281927, + "grad_norm": 44.15625, + "learning_rate": 9.889476976347734e-06, + "loss": 16.9099, + "step": 39120 + }, + { + "epoch": 0.7075293654973206, + "grad_norm": 41.03125, + "learning_rate": 9.889448724041075e-06, + "loss": 16.8503, + "step": 39130 + }, + { + "epoch": 0.7077101805664484, + "grad_norm": 41.125, + "learning_rate": 9.889420471734416e-06, + "loss": 17.4304, + "step": 39140 + }, + { + "epoch": 0.7078909956355762, + "grad_norm": 41.4375, + "learning_rate": 9.889392219427756e-06, + "loss": 16.8346, + "step": 39150 + }, + { + "epoch": 0.7080718107047042, + "grad_norm": 40.125, + "learning_rate": 9.889363967121097e-06, + "loss": 17.1053, + "step": 39160 + }, + { + "epoch": 0.708252625773832, + "grad_norm": 40.59375, + "learning_rate": 9.889335714814436e-06, + "loss": 17.1629, + "step": 39170 + }, + { + "epoch": 0.7084334408429599, + "grad_norm": 40.5, + "learning_rate": 9.889307462507776e-06, + "loss": 16.7123, + "step": 39180 + }, + { + "epoch": 0.7086142559120877, + "grad_norm": 41.75, + "learning_rate": 9.889279210201119e-06, + "loss": 16.7231, + "step": 39190 + }, + { + "epoch": 0.7087950709812155, + "grad_norm": 42.6875, + "learning_rate": 9.889250957894458e-06, + "loss": 16.8257, + "step": 39200 + }, + { + "epoch": 0.7089758860503435, + "grad_norm": 41.625, + "learning_rate": 9.889222705587798e-06, + "loss": 16.6544, + "step": 39210 + }, + { + "epoch": 0.7091567011194713, + "grad_norm": 42.28125, + "learning_rate": 9.889194453281139e-06, + "loss": 16.5631, + "step": 39220 + }, + { + "epoch": 0.7093375161885992, + "grad_norm": 45.625, + "learning_rate": 9.88916620097448e-06, + "loss": 16.7361, + "step": 39230 + }, + { + "epoch": 0.709518331257727, + "grad_norm": 41.5625, + "learning_rate": 9.88913794866782e-06, + "loss": 16.61, + "step": 39240 + }, + { + "epoch": 0.7096991463268549, + "grad_norm": 43.9375, + "learning_rate": 9.889109696361159e-06, + "loss": 16.7179, + "step": 39250 + }, + { + "epoch": 0.7098799613959828, + "grad_norm": 43.4375, + "learning_rate": 9.8890814440545e-06, + "loss": 16.5795, + "step": 39260 + }, + { + "epoch": 0.7100607764651106, + "grad_norm": 42.25, + "learning_rate": 9.88905319174784e-06, + "loss": 16.9953, + "step": 39270 + }, + { + "epoch": 0.7102415915342385, + "grad_norm": 42.96875, + "learning_rate": 9.889024939441183e-06, + "loss": 17.0232, + "step": 39280 + }, + { + "epoch": 0.7104224066033663, + "grad_norm": 42.03125, + "learning_rate": 9.888996687134522e-06, + "loss": 16.6105, + "step": 39290 + }, + { + "epoch": 0.7106032216724942, + "grad_norm": 42.90625, + "learning_rate": 9.888968434827862e-06, + "loss": 16.4751, + "step": 39300 + }, + { + "epoch": 0.710784036741622, + "grad_norm": 44.875, + "learning_rate": 9.888940182521203e-06, + "loss": 16.6902, + "step": 39310 + }, + { + "epoch": 0.7109648518107499, + "grad_norm": 42.53125, + "learning_rate": 9.888911930214543e-06, + "loss": 16.8739, + "step": 39320 + }, + { + "epoch": 0.7111456668798778, + "grad_norm": 42.34375, + "learning_rate": 9.888883677907884e-06, + "loss": 16.8996, + "step": 39330 + }, + { + "epoch": 0.7113264819490056, + "grad_norm": 40.40625, + "learning_rate": 9.888855425601223e-06, + "loss": 17.1528, + "step": 39340 + }, + { + "epoch": 0.7115072970181335, + "grad_norm": 43.8125, + "learning_rate": 9.888827173294564e-06, + "loss": 16.411, + "step": 39350 + }, + { + "epoch": 0.7116881120872613, + "grad_norm": 42.90625, + "learning_rate": 9.888798920987906e-06, + "loss": 17.1764, + "step": 39360 + }, + { + "epoch": 0.7118689271563892, + "grad_norm": 41.65625, + "learning_rate": 9.888770668681245e-06, + "loss": 16.745, + "step": 39370 + }, + { + "epoch": 0.7120497422255171, + "grad_norm": 42.875, + "learning_rate": 9.888742416374585e-06, + "loss": 16.6895, + "step": 39380 + }, + { + "epoch": 0.7122305572946449, + "grad_norm": 41.8125, + "learning_rate": 9.888714164067926e-06, + "loss": 16.6713, + "step": 39390 + }, + { + "epoch": 0.7124113723637728, + "grad_norm": 42.6875, + "learning_rate": 9.888685911761267e-06, + "loss": 16.8277, + "step": 39400 + }, + { + "epoch": 0.7125921874329006, + "grad_norm": 44.25, + "learning_rate": 9.888657659454607e-06, + "loss": 16.4465, + "step": 39410 + }, + { + "epoch": 0.7127730025020286, + "grad_norm": 43.03125, + "learning_rate": 9.888629407147948e-06, + "loss": 17.0582, + "step": 39420 + }, + { + "epoch": 0.7129538175711564, + "grad_norm": 43.21875, + "learning_rate": 9.888601154841287e-06, + "loss": 17.0685, + "step": 39430 + }, + { + "epoch": 0.7131346326402842, + "grad_norm": 43.5, + "learning_rate": 9.888572902534628e-06, + "loss": 16.3677, + "step": 39440 + }, + { + "epoch": 0.7133154477094121, + "grad_norm": 43.25, + "learning_rate": 9.88854465022797e-06, + "loss": 16.704, + "step": 39450 + }, + { + "epoch": 0.71349626277854, + "grad_norm": 43.3125, + "learning_rate": 9.888516397921309e-06, + "loss": 17.0324, + "step": 39460 + }, + { + "epoch": 0.7136770778476679, + "grad_norm": 42.125, + "learning_rate": 9.88848814561465e-06, + "loss": 17.0155, + "step": 39470 + }, + { + "epoch": 0.7138578929167957, + "grad_norm": 41.34375, + "learning_rate": 9.88845989330799e-06, + "loss": 17.1789, + "step": 39480 + }, + { + "epoch": 0.7140387079859235, + "grad_norm": 45.875, + "learning_rate": 9.88843164100133e-06, + "loss": 16.8495, + "step": 39490 + }, + { + "epoch": 0.7142195230550514, + "grad_norm": 44.125, + "learning_rate": 9.888403388694671e-06, + "loss": 16.6137, + "step": 39500 + }, + { + "epoch": 0.7144003381241792, + "grad_norm": 42.53125, + "learning_rate": 9.88837513638801e-06, + "loss": 16.8421, + "step": 39510 + }, + { + "epoch": 0.7145811531933072, + "grad_norm": 45.21875, + "learning_rate": 9.88834688408135e-06, + "loss": 16.8688, + "step": 39520 + }, + { + "epoch": 0.714761968262435, + "grad_norm": 42.75, + "learning_rate": 9.888318631774691e-06, + "loss": 16.5882, + "step": 39530 + }, + { + "epoch": 0.7149427833315628, + "grad_norm": 43.46875, + "learning_rate": 9.888290379468034e-06, + "loss": 16.5181, + "step": 39540 + }, + { + "epoch": 0.7151235984006907, + "grad_norm": 43.15625, + "learning_rate": 9.888262127161373e-06, + "loss": 17.2262, + "step": 39550 + }, + { + "epoch": 0.7153044134698185, + "grad_norm": 44.9375, + "learning_rate": 9.888233874854713e-06, + "loss": 16.6752, + "step": 39560 + }, + { + "epoch": 0.7154852285389465, + "grad_norm": 46.625, + "learning_rate": 9.888205622548054e-06, + "loss": 17.0998, + "step": 39570 + }, + { + "epoch": 0.7156660436080743, + "grad_norm": 42.21875, + "learning_rate": 9.888177370241395e-06, + "loss": 16.7745, + "step": 39580 + }, + { + "epoch": 0.7158468586772022, + "grad_norm": 42.6875, + "learning_rate": 9.888149117934735e-06, + "loss": 16.4374, + "step": 39590 + }, + { + "epoch": 0.71602767374633, + "grad_norm": 41.71875, + "learning_rate": 9.888120865628074e-06, + "loss": 16.4207, + "step": 39600 + }, + { + "epoch": 0.7162084888154578, + "grad_norm": 45.21875, + "learning_rate": 9.888092613321415e-06, + "loss": 16.7451, + "step": 39610 + }, + { + "epoch": 0.7163893038845858, + "grad_norm": 39.90625, + "learning_rate": 9.888064361014755e-06, + "loss": 17.1233, + "step": 39620 + }, + { + "epoch": 0.7165701189537136, + "grad_norm": 44.5, + "learning_rate": 9.888036108708096e-06, + "loss": 16.855, + "step": 39630 + }, + { + "epoch": 0.7167509340228415, + "grad_norm": 40.84375, + "learning_rate": 9.888007856401437e-06, + "loss": 16.5099, + "step": 39640 + }, + { + "epoch": 0.7169317490919693, + "grad_norm": 41.1875, + "learning_rate": 9.887979604094777e-06, + "loss": 17.1891, + "step": 39650 + }, + { + "epoch": 0.7171125641610971, + "grad_norm": 43.21875, + "learning_rate": 9.887951351788118e-06, + "loss": 16.7949, + "step": 39660 + }, + { + "epoch": 0.7172933792302251, + "grad_norm": 40.03125, + "learning_rate": 9.887923099481458e-06, + "loss": 16.9468, + "step": 39670 + }, + { + "epoch": 0.7174741942993529, + "grad_norm": 42.0, + "learning_rate": 9.887894847174797e-06, + "loss": 16.891, + "step": 39680 + }, + { + "epoch": 0.7176550093684808, + "grad_norm": 43.6875, + "learning_rate": 9.887866594868138e-06, + "loss": 17.1045, + "step": 39690 + }, + { + "epoch": 0.7178358244376086, + "grad_norm": 43.125, + "learning_rate": 9.887838342561479e-06, + "loss": 17.314, + "step": 39700 + }, + { + "epoch": 0.7180166395067364, + "grad_norm": 43.5625, + "learning_rate": 9.887810090254821e-06, + "loss": 16.633, + "step": 39710 + }, + { + "epoch": 0.7181974545758644, + "grad_norm": 45.9375, + "learning_rate": 9.88778183794816e-06, + "loss": 16.8595, + "step": 39720 + }, + { + "epoch": 0.7183782696449922, + "grad_norm": 42.78125, + "learning_rate": 9.8877535856415e-06, + "loss": 16.9203, + "step": 39730 + }, + { + "epoch": 0.7185590847141201, + "grad_norm": 42.90625, + "learning_rate": 9.887725333334841e-06, + "loss": 16.9117, + "step": 39740 + }, + { + "epoch": 0.7187398997832479, + "grad_norm": 44.5625, + "learning_rate": 9.887697081028182e-06, + "loss": 17.2357, + "step": 39750 + }, + { + "epoch": 0.7189207148523757, + "grad_norm": 40.5625, + "learning_rate": 9.887668828721522e-06, + "loss": 16.8096, + "step": 39760 + }, + { + "epoch": 0.7191015299215037, + "grad_norm": 42.625, + "learning_rate": 9.887640576414861e-06, + "loss": 16.8199, + "step": 39770 + }, + { + "epoch": 0.7192823449906315, + "grad_norm": 44.71875, + "learning_rate": 9.887612324108202e-06, + "loss": 16.8323, + "step": 39780 + }, + { + "epoch": 0.7194631600597594, + "grad_norm": 41.90625, + "learning_rate": 9.887584071801543e-06, + "loss": 16.9276, + "step": 39790 + }, + { + "epoch": 0.7196439751288872, + "grad_norm": 41.65625, + "learning_rate": 9.887555819494883e-06, + "loss": 16.6748, + "step": 39800 + }, + { + "epoch": 0.7198247901980152, + "grad_norm": 40.84375, + "learning_rate": 9.887527567188224e-06, + "loss": 16.8066, + "step": 39810 + }, + { + "epoch": 0.720005605267143, + "grad_norm": 40.40625, + "learning_rate": 9.887499314881564e-06, + "loss": 17.2768, + "step": 39820 + }, + { + "epoch": 0.7201864203362708, + "grad_norm": 44.8125, + "learning_rate": 9.887471062574905e-06, + "loss": 16.4379, + "step": 39830 + }, + { + "epoch": 0.7203672354053987, + "grad_norm": 41.3125, + "learning_rate": 9.887442810268246e-06, + "loss": 16.8064, + "step": 39840 + }, + { + "epoch": 0.7205480504745265, + "grad_norm": 41.875, + "learning_rate": 9.887414557961586e-06, + "loss": 16.7328, + "step": 39850 + }, + { + "epoch": 0.7207288655436545, + "grad_norm": 42.8125, + "learning_rate": 9.887386305654925e-06, + "loss": 16.8048, + "step": 39860 + }, + { + "epoch": 0.7209096806127823, + "grad_norm": 44.0, + "learning_rate": 9.887358053348266e-06, + "loss": 16.5774, + "step": 39870 + }, + { + "epoch": 0.7210904956819101, + "grad_norm": 44.5625, + "learning_rate": 9.887329801041606e-06, + "loss": 16.6305, + "step": 39880 + }, + { + "epoch": 0.721271310751038, + "grad_norm": 40.125, + "learning_rate": 9.887301548734947e-06, + "loss": 16.7518, + "step": 39890 + }, + { + "epoch": 0.7214521258201658, + "grad_norm": 42.25, + "learning_rate": 9.887273296428288e-06, + "loss": 17.4451, + "step": 39900 + }, + { + "epoch": 0.7216329408892938, + "grad_norm": 43.4375, + "learning_rate": 9.887245044121628e-06, + "loss": 16.9063, + "step": 39910 + }, + { + "epoch": 0.7218137559584216, + "grad_norm": 43.40625, + "learning_rate": 9.887216791814969e-06, + "loss": 16.5798, + "step": 39920 + }, + { + "epoch": 0.7219945710275494, + "grad_norm": 42.125, + "learning_rate": 9.88718853950831e-06, + "loss": 16.8749, + "step": 39930 + }, + { + "epoch": 0.7221753860966773, + "grad_norm": 43.03125, + "learning_rate": 9.887160287201648e-06, + "loss": 16.9119, + "step": 39940 + }, + { + "epoch": 0.7223562011658051, + "grad_norm": 42.5, + "learning_rate": 9.887132034894989e-06, + "loss": 16.7371, + "step": 39950 + }, + { + "epoch": 0.722537016234933, + "grad_norm": 41.25, + "learning_rate": 9.88710378258833e-06, + "loss": 17.1396, + "step": 39960 + }, + { + "epoch": 0.7227178313040609, + "grad_norm": 43.75, + "learning_rate": 9.88707553028167e-06, + "loss": 16.7851, + "step": 39970 + }, + { + "epoch": 0.7228986463731888, + "grad_norm": 41.5625, + "learning_rate": 9.887047277975011e-06, + "loss": 17.002, + "step": 39980 + }, + { + "epoch": 0.7230794614423166, + "grad_norm": 42.8125, + "learning_rate": 9.887019025668352e-06, + "loss": 17.0829, + "step": 39990 + }, + { + "epoch": 0.7232602765114444, + "grad_norm": 39.84375, + "learning_rate": 9.886990773361692e-06, + "loss": 16.9125, + "step": 40000 + }, + { + "epoch": 0.7232602765114444, + "eval_loss": 2.107973098754883, + "eval_runtime": 229.6504, + "eval_samples_per_second": 3161.584, + "eval_steps_per_second": 49.401, + "step": 40000 + }, + { + "epoch": 0.7234410915805723, + "grad_norm": 41.5, + "learning_rate": 9.886962521055033e-06, + "loss": 16.4671, + "step": 40010 + }, + { + "epoch": 0.7236219066497002, + "grad_norm": 43.71875, + "learning_rate": 9.886934268748373e-06, + "loss": 16.8505, + "step": 40020 + }, + { + "epoch": 0.7238027217188281, + "grad_norm": 40.4375, + "learning_rate": 9.886906016441712e-06, + "loss": 16.8149, + "step": 40030 + }, + { + "epoch": 0.7239835367879559, + "grad_norm": 43.125, + "learning_rate": 9.886877764135053e-06, + "loss": 16.7169, + "step": 40040 + }, + { + "epoch": 0.7241643518570837, + "grad_norm": 42.40625, + "learning_rate": 9.886849511828394e-06, + "loss": 16.8032, + "step": 40050 + }, + { + "epoch": 0.7243451669262116, + "grad_norm": 43.6875, + "learning_rate": 9.886821259521734e-06, + "loss": 16.7357, + "step": 40060 + }, + { + "epoch": 0.7245259819953395, + "grad_norm": 43.5, + "learning_rate": 9.886793007215075e-06, + "loss": 16.8738, + "step": 40070 + }, + { + "epoch": 0.7247067970644674, + "grad_norm": 42.96875, + "learning_rate": 9.886764754908416e-06, + "loss": 16.9363, + "step": 40080 + }, + { + "epoch": 0.7248876121335952, + "grad_norm": 44.875, + "learning_rate": 9.886736502601756e-06, + "loss": 16.6958, + "step": 40090 + }, + { + "epoch": 0.725068427202723, + "grad_norm": 41.625, + "learning_rate": 9.886708250295097e-06, + "loss": 16.8281, + "step": 40100 + }, + { + "epoch": 0.725249242271851, + "grad_norm": 43.09375, + "learning_rate": 9.886679997988436e-06, + "loss": 16.6294, + "step": 40110 + }, + { + "epoch": 0.7254300573409788, + "grad_norm": 41.40625, + "learning_rate": 9.886651745681776e-06, + "loss": 17.3219, + "step": 40120 + }, + { + "epoch": 0.7256108724101067, + "grad_norm": 44.75, + "learning_rate": 9.886623493375117e-06, + "loss": 17.1288, + "step": 40130 + }, + { + "epoch": 0.7257916874792345, + "grad_norm": 43.0625, + "learning_rate": 9.886595241068458e-06, + "loss": 16.3775, + "step": 40140 + }, + { + "epoch": 0.7259725025483624, + "grad_norm": 42.0625, + "learning_rate": 9.886566988761798e-06, + "loss": 16.2942, + "step": 40150 + }, + { + "epoch": 0.7261533176174902, + "grad_norm": 42.1875, + "learning_rate": 9.886538736455139e-06, + "loss": 16.8152, + "step": 40160 + }, + { + "epoch": 0.7263341326866181, + "grad_norm": 43.15625, + "learning_rate": 9.88651048414848e-06, + "loss": 17.041, + "step": 40170 + }, + { + "epoch": 0.726514947755746, + "grad_norm": 43.0, + "learning_rate": 9.88648223184182e-06, + "loss": 17.1616, + "step": 40180 + }, + { + "epoch": 0.7266957628248738, + "grad_norm": 43.90625, + "learning_rate": 9.88645397953516e-06, + "loss": 16.3568, + "step": 40190 + }, + { + "epoch": 0.7268765778940017, + "grad_norm": 43.4375, + "learning_rate": 9.8864257272285e-06, + "loss": 16.744, + "step": 40200 + }, + { + "epoch": 0.7270573929631295, + "grad_norm": 43.03125, + "learning_rate": 9.88639747492184e-06, + "loss": 16.7687, + "step": 40210 + }, + { + "epoch": 0.7272382080322574, + "grad_norm": 42.75, + "learning_rate": 9.88636922261518e-06, + "loss": 16.7254, + "step": 40220 + }, + { + "epoch": 0.7274190231013853, + "grad_norm": 47.21875, + "learning_rate": 9.886340970308521e-06, + "loss": 16.8152, + "step": 40230 + }, + { + "epoch": 0.7275998381705131, + "grad_norm": 42.71875, + "learning_rate": 9.886312718001862e-06, + "loss": 16.6617, + "step": 40240 + }, + { + "epoch": 0.727780653239641, + "grad_norm": 42.78125, + "learning_rate": 9.886284465695203e-06, + "loss": 16.8171, + "step": 40250 + }, + { + "epoch": 0.7279614683087688, + "grad_norm": 41.09375, + "learning_rate": 9.886256213388543e-06, + "loss": 16.6807, + "step": 40260 + }, + { + "epoch": 0.7281422833778967, + "grad_norm": 44.65625, + "learning_rate": 9.886227961081884e-06, + "loss": 16.8896, + "step": 40270 + }, + { + "epoch": 0.7283230984470246, + "grad_norm": 44.75, + "learning_rate": 9.886199708775225e-06, + "loss": 17.2417, + "step": 40280 + }, + { + "epoch": 0.7285039135161524, + "grad_norm": 40.6875, + "learning_rate": 9.886171456468563e-06, + "loss": 16.7568, + "step": 40290 + }, + { + "epoch": 0.7286847285852803, + "grad_norm": 46.21875, + "learning_rate": 9.886143204161904e-06, + "loss": 16.9543, + "step": 40300 + }, + { + "epoch": 0.7288655436544081, + "grad_norm": 41.09375, + "learning_rate": 9.886114951855245e-06, + "loss": 16.9995, + "step": 40310 + }, + { + "epoch": 0.7290463587235361, + "grad_norm": 41.03125, + "learning_rate": 9.886086699548585e-06, + "loss": 16.5499, + "step": 40320 + }, + { + "epoch": 0.7292271737926639, + "grad_norm": 45.96875, + "learning_rate": 9.886058447241926e-06, + "loss": 16.7233, + "step": 40330 + }, + { + "epoch": 0.7294079888617917, + "grad_norm": 44.90625, + "learning_rate": 9.886030194935267e-06, + "loss": 17.1275, + "step": 40340 + }, + { + "epoch": 0.7295888039309196, + "grad_norm": 44.4375, + "learning_rate": 9.886001942628607e-06, + "loss": 16.9403, + "step": 40350 + }, + { + "epoch": 0.7297696190000474, + "grad_norm": 42.34375, + "learning_rate": 9.885973690321948e-06, + "loss": 16.5147, + "step": 40360 + }, + { + "epoch": 0.7299504340691754, + "grad_norm": 40.28125, + "learning_rate": 9.885945438015287e-06, + "loss": 16.7906, + "step": 40370 + }, + { + "epoch": 0.7301312491383032, + "grad_norm": 42.8125, + "learning_rate": 9.885917185708627e-06, + "loss": 17.0495, + "step": 40380 + }, + { + "epoch": 0.730312064207431, + "grad_norm": 42.4375, + "learning_rate": 9.885888933401968e-06, + "loss": 17.0379, + "step": 40390 + }, + { + "epoch": 0.7304928792765589, + "grad_norm": 43.09375, + "learning_rate": 9.885860681095309e-06, + "loss": 16.7473, + "step": 40400 + }, + { + "epoch": 0.7306736943456867, + "grad_norm": 40.375, + "learning_rate": 9.88583242878865e-06, + "loss": 16.9922, + "step": 40410 + }, + { + "epoch": 0.7308545094148147, + "grad_norm": 41.53125, + "learning_rate": 9.88580417648199e-06, + "loss": 16.7063, + "step": 40420 + }, + { + "epoch": 0.7310353244839425, + "grad_norm": 40.5625, + "learning_rate": 9.88577592417533e-06, + "loss": 17.0254, + "step": 40430 + }, + { + "epoch": 0.7312161395530703, + "grad_norm": 42.5, + "learning_rate": 9.885747671868671e-06, + "loss": 16.7434, + "step": 40440 + }, + { + "epoch": 0.7313969546221982, + "grad_norm": 44.5, + "learning_rate": 9.885719419562012e-06, + "loss": 16.8146, + "step": 40450 + }, + { + "epoch": 0.731577769691326, + "grad_norm": 41.625, + "learning_rate": 9.88569116725535e-06, + "loss": 17.4253, + "step": 40460 + }, + { + "epoch": 0.731758584760454, + "grad_norm": 45.53125, + "learning_rate": 9.885662914948691e-06, + "loss": 16.398, + "step": 40470 + }, + { + "epoch": 0.7319393998295818, + "grad_norm": 42.09375, + "learning_rate": 9.885634662642032e-06, + "loss": 16.5052, + "step": 40480 + }, + { + "epoch": 0.7321202148987097, + "grad_norm": 42.75, + "learning_rate": 9.885606410335373e-06, + "loss": 16.7789, + "step": 40490 + }, + { + "epoch": 0.7323010299678375, + "grad_norm": 44.78125, + "learning_rate": 9.885578158028713e-06, + "loss": 16.5454, + "step": 40500 + }, + { + "epoch": 0.7324818450369653, + "grad_norm": 42.78125, + "learning_rate": 9.885549905722054e-06, + "loss": 16.8163, + "step": 40510 + }, + { + "epoch": 0.7326626601060933, + "grad_norm": 42.125, + "learning_rate": 9.885521653415394e-06, + "loss": 17.143, + "step": 40520 + }, + { + "epoch": 0.7328434751752211, + "grad_norm": 39.1875, + "learning_rate": 9.885493401108735e-06, + "loss": 17.0652, + "step": 40530 + }, + { + "epoch": 0.733024290244349, + "grad_norm": 45.71875, + "learning_rate": 9.885465148802074e-06, + "loss": 17.1131, + "step": 40540 + }, + { + "epoch": 0.7332051053134768, + "grad_norm": 44.09375, + "learning_rate": 9.885436896495415e-06, + "loss": 17.2092, + "step": 40550 + }, + { + "epoch": 0.7333859203826046, + "grad_norm": 44.8125, + "learning_rate": 9.885408644188755e-06, + "loss": 17.1851, + "step": 40560 + }, + { + "epoch": 0.7335667354517326, + "grad_norm": 41.4375, + "learning_rate": 9.885380391882096e-06, + "loss": 17.2057, + "step": 40570 + }, + { + "epoch": 0.7337475505208604, + "grad_norm": 45.0625, + "learning_rate": 9.885352139575436e-06, + "loss": 16.8595, + "step": 40580 + }, + { + "epoch": 0.7339283655899883, + "grad_norm": 42.71875, + "learning_rate": 9.885323887268777e-06, + "loss": 16.6233, + "step": 40590 + }, + { + "epoch": 0.7341091806591161, + "grad_norm": 41.0625, + "learning_rate": 9.885295634962118e-06, + "loss": 16.6906, + "step": 40600 + }, + { + "epoch": 0.7342899957282439, + "grad_norm": 41.75, + "learning_rate": 9.885267382655458e-06, + "loss": 16.5386, + "step": 40610 + }, + { + "epoch": 0.7344708107973719, + "grad_norm": 42.46875, + "learning_rate": 9.885239130348799e-06, + "loss": 16.8199, + "step": 40620 + }, + { + "epoch": 0.7346516258664997, + "grad_norm": 42.8125, + "learning_rate": 9.885210878042138e-06, + "loss": 16.6419, + "step": 40630 + }, + { + "epoch": 0.7348324409356276, + "grad_norm": 44.34375, + "learning_rate": 9.885182625735478e-06, + "loss": 16.9355, + "step": 40640 + }, + { + "epoch": 0.7350132560047554, + "grad_norm": 42.46875, + "learning_rate": 9.885154373428819e-06, + "loss": 16.7165, + "step": 40650 + }, + { + "epoch": 0.7351940710738833, + "grad_norm": 44.8125, + "learning_rate": 9.88512612112216e-06, + "loss": 16.981, + "step": 40660 + }, + { + "epoch": 0.7353748861430112, + "grad_norm": 40.3125, + "learning_rate": 9.8850978688155e-06, + "loss": 16.8327, + "step": 40670 + }, + { + "epoch": 0.735555701212139, + "grad_norm": 42.3125, + "learning_rate": 9.885069616508841e-06, + "loss": 16.9216, + "step": 40680 + }, + { + "epoch": 0.7357365162812669, + "grad_norm": 42.53125, + "learning_rate": 9.885041364202182e-06, + "loss": 17.251, + "step": 40690 + }, + { + "epoch": 0.7359173313503947, + "grad_norm": 42.1875, + "learning_rate": 9.885013111895522e-06, + "loss": 16.3896, + "step": 40700 + }, + { + "epoch": 0.7360981464195226, + "grad_norm": 42.90625, + "learning_rate": 9.884984859588861e-06, + "loss": 16.8334, + "step": 40710 + }, + { + "epoch": 0.7362789614886505, + "grad_norm": 43.78125, + "learning_rate": 9.884956607282202e-06, + "loss": 16.6666, + "step": 40720 + }, + { + "epoch": 0.7364597765577783, + "grad_norm": 45.0, + "learning_rate": 9.884928354975542e-06, + "loss": 17.068, + "step": 40730 + }, + { + "epoch": 0.7366405916269062, + "grad_norm": 41.5, + "learning_rate": 9.884900102668883e-06, + "loss": 16.5892, + "step": 40740 + }, + { + "epoch": 0.736821406696034, + "grad_norm": 41.8125, + "learning_rate": 9.884871850362224e-06, + "loss": 16.977, + "step": 40750 + }, + { + "epoch": 0.737002221765162, + "grad_norm": 44.0625, + "learning_rate": 9.884843598055564e-06, + "loss": 17.0296, + "step": 40760 + }, + { + "epoch": 0.7371830368342898, + "grad_norm": 41.75, + "learning_rate": 9.884815345748905e-06, + "loss": 16.9444, + "step": 40770 + }, + { + "epoch": 0.7373638519034176, + "grad_norm": 43.15625, + "learning_rate": 9.884787093442246e-06, + "loss": 16.7186, + "step": 40780 + }, + { + "epoch": 0.7375446669725455, + "grad_norm": 41.71875, + "learning_rate": 9.884758841135586e-06, + "loss": 16.5658, + "step": 40790 + }, + { + "epoch": 0.7377254820416733, + "grad_norm": 41.21875, + "learning_rate": 9.884730588828925e-06, + "loss": 16.9532, + "step": 40800 + }, + { + "epoch": 0.7379062971108012, + "grad_norm": 43.5, + "learning_rate": 9.884702336522266e-06, + "loss": 16.7144, + "step": 40810 + }, + { + "epoch": 0.7380871121799291, + "grad_norm": 43.9375, + "learning_rate": 9.884674084215606e-06, + "loss": 16.8067, + "step": 40820 + }, + { + "epoch": 0.738267927249057, + "grad_norm": 41.84375, + "learning_rate": 9.884645831908947e-06, + "loss": 16.7855, + "step": 40830 + }, + { + "epoch": 0.7384487423181848, + "grad_norm": 40.875, + "learning_rate": 9.884617579602288e-06, + "loss": 16.8532, + "step": 40840 + }, + { + "epoch": 0.7386295573873126, + "grad_norm": 43.84375, + "learning_rate": 9.884589327295628e-06, + "loss": 16.8848, + "step": 40850 + }, + { + "epoch": 0.7388103724564405, + "grad_norm": 43.90625, + "learning_rate": 9.884561074988969e-06, + "loss": 16.41, + "step": 40860 + }, + { + "epoch": 0.7389911875255684, + "grad_norm": 42.9375, + "learning_rate": 9.88453282268231e-06, + "loss": 16.931, + "step": 40870 + }, + { + "epoch": 0.7391720025946963, + "grad_norm": 44.3125, + "learning_rate": 9.88450457037565e-06, + "loss": 16.4792, + "step": 40880 + }, + { + "epoch": 0.7393528176638241, + "grad_norm": 41.6875, + "learning_rate": 9.884476318068989e-06, + "loss": 16.5623, + "step": 40890 + }, + { + "epoch": 0.7395336327329519, + "grad_norm": 44.3125, + "learning_rate": 9.88444806576233e-06, + "loss": 16.9786, + "step": 40900 + }, + { + "epoch": 0.7397144478020798, + "grad_norm": 44.46875, + "learning_rate": 9.88441981345567e-06, + "loss": 16.7459, + "step": 40910 + }, + { + "epoch": 0.7398952628712077, + "grad_norm": 43.78125, + "learning_rate": 9.88439156114901e-06, + "loss": 17.09, + "step": 40920 + }, + { + "epoch": 0.7400760779403356, + "grad_norm": 43.34375, + "learning_rate": 9.884363308842351e-06, + "loss": 17.1282, + "step": 40930 + }, + { + "epoch": 0.7402568930094634, + "grad_norm": 40.875, + "learning_rate": 9.884335056535692e-06, + "loss": 16.7603, + "step": 40940 + }, + { + "epoch": 0.7404377080785912, + "grad_norm": 44.4375, + "learning_rate": 9.884306804229033e-06, + "loss": 16.8682, + "step": 40950 + }, + { + "epoch": 0.7406185231477191, + "grad_norm": 40.40625, + "learning_rate": 9.884278551922373e-06, + "loss": 17.0364, + "step": 40960 + }, + { + "epoch": 0.740799338216847, + "grad_norm": 42.53125, + "learning_rate": 9.884250299615712e-06, + "loss": 17.0327, + "step": 40970 + }, + { + "epoch": 0.7409801532859749, + "grad_norm": 43.15625, + "learning_rate": 9.884222047309053e-06, + "loss": 16.9583, + "step": 40980 + }, + { + "epoch": 0.7411609683551027, + "grad_norm": 41.90625, + "learning_rate": 9.884193795002394e-06, + "loss": 16.6937, + "step": 40990 + }, + { + "epoch": 0.7413417834242306, + "grad_norm": 43.65625, + "learning_rate": 9.884165542695734e-06, + "loss": 16.5979, + "step": 41000 + }, + { + "epoch": 0.7415225984933584, + "grad_norm": 42.875, + "learning_rate": 9.884137290389075e-06, + "loss": 16.5445, + "step": 41010 + }, + { + "epoch": 0.7417034135624863, + "grad_norm": 44.21875, + "learning_rate": 9.884109038082414e-06, + "loss": 16.9713, + "step": 41020 + }, + { + "epoch": 0.7418842286316142, + "grad_norm": 42.8125, + "learning_rate": 9.884080785775756e-06, + "loss": 16.3581, + "step": 41030 + }, + { + "epoch": 0.742065043700742, + "grad_norm": 42.3125, + "learning_rate": 9.884052533469097e-06, + "loss": 16.7363, + "step": 41040 + }, + { + "epoch": 0.7422458587698699, + "grad_norm": 39.625, + "learning_rate": 9.884024281162437e-06, + "loss": 16.7999, + "step": 41050 + }, + { + "epoch": 0.7424266738389977, + "grad_norm": 39.53125, + "learning_rate": 9.883996028855776e-06, + "loss": 16.8929, + "step": 41060 + }, + { + "epoch": 0.7426074889081256, + "grad_norm": 45.46875, + "learning_rate": 9.883967776549117e-06, + "loss": 16.6393, + "step": 41070 + }, + { + "epoch": 0.7427883039772535, + "grad_norm": 43.375, + "learning_rate": 9.883939524242457e-06, + "loss": 16.7282, + "step": 41080 + }, + { + "epoch": 0.7429691190463813, + "grad_norm": 40.78125, + "learning_rate": 9.883911271935798e-06, + "loss": 17.0336, + "step": 41090 + }, + { + "epoch": 0.7431499341155092, + "grad_norm": 45.5625, + "learning_rate": 9.883883019629139e-06, + "loss": 16.9114, + "step": 41100 + }, + { + "epoch": 0.743330749184637, + "grad_norm": 45.5625, + "learning_rate": 9.88385476732248e-06, + "loss": 17.1745, + "step": 41110 + }, + { + "epoch": 0.7435115642537649, + "grad_norm": 41.9375, + "learning_rate": 9.88382651501582e-06, + "loss": 16.4395, + "step": 41120 + }, + { + "epoch": 0.7436923793228928, + "grad_norm": 44.25, + "learning_rate": 9.88379826270916e-06, + "loss": 16.8133, + "step": 41130 + }, + { + "epoch": 0.7438731943920206, + "grad_norm": 42.6875, + "learning_rate": 9.8837700104025e-06, + "loss": 16.6626, + "step": 41140 + }, + { + "epoch": 0.7440540094611485, + "grad_norm": 43.40625, + "learning_rate": 9.88374175809584e-06, + "loss": 16.7532, + "step": 41150 + }, + { + "epoch": 0.7442348245302763, + "grad_norm": 41.0625, + "learning_rate": 9.88371350578918e-06, + "loss": 16.8214, + "step": 41160 + }, + { + "epoch": 0.7444156395994043, + "grad_norm": 42.21875, + "learning_rate": 9.883685253482521e-06, + "loss": 16.7015, + "step": 41170 + }, + { + "epoch": 0.7445964546685321, + "grad_norm": 43.96875, + "learning_rate": 9.883657001175862e-06, + "loss": 17.0512, + "step": 41180 + }, + { + "epoch": 0.7447772697376599, + "grad_norm": 44.5, + "learning_rate": 9.883628748869203e-06, + "loss": 17.1258, + "step": 41190 + }, + { + "epoch": 0.7449580848067878, + "grad_norm": 44.21875, + "learning_rate": 9.883600496562543e-06, + "loss": 16.8723, + "step": 41200 + }, + { + "epoch": 0.7451388998759156, + "grad_norm": 42.09375, + "learning_rate": 9.883572244255884e-06, + "loss": 16.7487, + "step": 41210 + }, + { + "epoch": 0.7453197149450436, + "grad_norm": 43.84375, + "learning_rate": 9.883543991949224e-06, + "loss": 16.6667, + "step": 41220 + }, + { + "epoch": 0.7455005300141714, + "grad_norm": 41.90625, + "learning_rate": 9.883515739642563e-06, + "loss": 16.8356, + "step": 41230 + }, + { + "epoch": 0.7456813450832992, + "grad_norm": 44.1875, + "learning_rate": 9.883487487335904e-06, + "loss": 16.8372, + "step": 41240 + }, + { + "epoch": 0.7458621601524271, + "grad_norm": 44.84375, + "learning_rate": 9.883459235029245e-06, + "loss": 16.2635, + "step": 41250 + }, + { + "epoch": 0.7460429752215549, + "grad_norm": 44.375, + "learning_rate": 9.883430982722585e-06, + "loss": 16.6561, + "step": 41260 + }, + { + "epoch": 0.7462237902906829, + "grad_norm": 44.34375, + "learning_rate": 9.883402730415926e-06, + "loss": 16.6502, + "step": 41270 + }, + { + "epoch": 0.7464046053598107, + "grad_norm": 42.09375, + "learning_rate": 9.883374478109265e-06, + "loss": 16.8927, + "step": 41280 + }, + { + "epoch": 0.7465854204289385, + "grad_norm": 44.28125, + "learning_rate": 9.883346225802607e-06, + "loss": 16.7709, + "step": 41290 + }, + { + "epoch": 0.7467662354980664, + "grad_norm": 43.71875, + "learning_rate": 9.883317973495948e-06, + "loss": 16.8387, + "step": 41300 + }, + { + "epoch": 0.7469470505671942, + "grad_norm": 43.28125, + "learning_rate": 9.883289721189288e-06, + "loss": 16.9989, + "step": 41310 + }, + { + "epoch": 0.7471278656363222, + "grad_norm": 40.96875, + "learning_rate": 9.883261468882627e-06, + "loss": 16.9145, + "step": 41320 + }, + { + "epoch": 0.74730868070545, + "grad_norm": 43.6875, + "learning_rate": 9.883233216575968e-06, + "loss": 16.7206, + "step": 41330 + }, + { + "epoch": 0.7474894957745779, + "grad_norm": 42.625, + "learning_rate": 9.883204964269309e-06, + "loss": 16.3599, + "step": 41340 + }, + { + "epoch": 0.7476703108437057, + "grad_norm": 42.84375, + "learning_rate": 9.883176711962649e-06, + "loss": 16.4318, + "step": 41350 + }, + { + "epoch": 0.7478511259128335, + "grad_norm": 43.0625, + "learning_rate": 9.88314845965599e-06, + "loss": 16.9316, + "step": 41360 + }, + { + "epoch": 0.7480319409819615, + "grad_norm": 44.09375, + "learning_rate": 9.883120207349329e-06, + "loss": 16.7205, + "step": 41370 + }, + { + "epoch": 0.7482127560510893, + "grad_norm": 40.25, + "learning_rate": 9.883091955042671e-06, + "loss": 16.3273, + "step": 41380 + }, + { + "epoch": 0.7483935711202172, + "grad_norm": 42.40625, + "learning_rate": 9.883063702736012e-06, + "loss": 17.1665, + "step": 41390 + }, + { + "epoch": 0.748574386189345, + "grad_norm": 43.59375, + "learning_rate": 9.88303545042935e-06, + "loss": 16.7555, + "step": 41400 + }, + { + "epoch": 0.7487552012584728, + "grad_norm": 43.71875, + "learning_rate": 9.883007198122691e-06, + "loss": 16.8557, + "step": 41410 + }, + { + "epoch": 0.7489360163276008, + "grad_norm": 40.375, + "learning_rate": 9.882978945816032e-06, + "loss": 16.3214, + "step": 41420 + }, + { + "epoch": 0.7491168313967286, + "grad_norm": 43.6875, + "learning_rate": 9.882950693509372e-06, + "loss": 17.0458, + "step": 41430 + }, + { + "epoch": 0.7492976464658565, + "grad_norm": 44.21875, + "learning_rate": 9.882922441202713e-06, + "loss": 17.0682, + "step": 41440 + }, + { + "epoch": 0.7494784615349843, + "grad_norm": 42.78125, + "learning_rate": 9.882894188896052e-06, + "loss": 16.79, + "step": 41450 + }, + { + "epoch": 0.7496592766041121, + "grad_norm": 44.40625, + "learning_rate": 9.882865936589394e-06, + "loss": 16.8722, + "step": 41460 + }, + { + "epoch": 0.7498400916732401, + "grad_norm": 41.3125, + "learning_rate": 9.882837684282735e-06, + "loss": 16.351, + "step": 41470 + }, + { + "epoch": 0.7500209067423679, + "grad_norm": 45.6875, + "learning_rate": 9.882809431976076e-06, + "loss": 16.8484, + "step": 41480 + }, + { + "epoch": 0.7502017218114958, + "grad_norm": 43.125, + "learning_rate": 9.882781179669414e-06, + "loss": 16.6952, + "step": 41490 + }, + { + "epoch": 0.7503825368806236, + "grad_norm": 46.5, + "learning_rate": 9.882752927362755e-06, + "loss": 16.8167, + "step": 41500 + }, + { + "epoch": 0.7505633519497515, + "grad_norm": 40.0, + "learning_rate": 9.882724675056096e-06, + "loss": 16.6396, + "step": 41510 + }, + { + "epoch": 0.7507441670188794, + "grad_norm": 43.9375, + "learning_rate": 9.882696422749436e-06, + "loss": 16.5496, + "step": 41520 + }, + { + "epoch": 0.7509249820880072, + "grad_norm": 44.34375, + "learning_rate": 9.882668170442777e-06, + "loss": 17.0983, + "step": 41530 + }, + { + "epoch": 0.7511057971571351, + "grad_norm": 40.84375, + "learning_rate": 9.882639918136116e-06, + "loss": 16.601, + "step": 41540 + }, + { + "epoch": 0.7512866122262629, + "grad_norm": 41.75, + "learning_rate": 9.882611665829458e-06, + "loss": 16.7391, + "step": 41550 + }, + { + "epoch": 0.7514674272953908, + "grad_norm": 41.71875, + "learning_rate": 9.882583413522799e-06, + "loss": 17.1427, + "step": 41560 + }, + { + "epoch": 0.7516482423645187, + "grad_norm": 42.25, + "learning_rate": 9.882555161216138e-06, + "loss": 16.784, + "step": 41570 + }, + { + "epoch": 0.7518290574336465, + "grad_norm": 43.65625, + "learning_rate": 9.882526908909478e-06, + "loss": 16.6273, + "step": 41580 + }, + { + "epoch": 0.7520098725027744, + "grad_norm": 39.6875, + "learning_rate": 9.882498656602819e-06, + "loss": 16.5671, + "step": 41590 + }, + { + "epoch": 0.7521906875719022, + "grad_norm": 44.03125, + "learning_rate": 9.88247040429616e-06, + "loss": 16.9705, + "step": 41600 + }, + { + "epoch": 0.7523715026410301, + "grad_norm": 42.15625, + "learning_rate": 9.8824421519895e-06, + "loss": 16.6402, + "step": 41610 + }, + { + "epoch": 0.752552317710158, + "grad_norm": 41.71875, + "learning_rate": 9.882413899682841e-06, + "loss": 16.7703, + "step": 41620 + }, + { + "epoch": 0.7527331327792858, + "grad_norm": 42.34375, + "learning_rate": 9.88238564737618e-06, + "loss": 17.2249, + "step": 41630 + }, + { + "epoch": 0.7529139478484137, + "grad_norm": 41.03125, + "learning_rate": 9.882357395069522e-06, + "loss": 16.6053, + "step": 41640 + }, + { + "epoch": 0.7530947629175415, + "grad_norm": 46.1875, + "learning_rate": 9.882329142762863e-06, + "loss": 17.1259, + "step": 41650 + }, + { + "epoch": 0.7532755779866694, + "grad_norm": 42.78125, + "learning_rate": 9.882300890456202e-06, + "loss": 16.6679, + "step": 41660 + }, + { + "epoch": 0.7534563930557973, + "grad_norm": 44.9375, + "learning_rate": 9.882272638149542e-06, + "loss": 17.3633, + "step": 41670 + }, + { + "epoch": 0.7536372081249252, + "grad_norm": 41.5625, + "learning_rate": 9.882244385842883e-06, + "loss": 16.7871, + "step": 41680 + }, + { + "epoch": 0.753818023194053, + "grad_norm": 43.46875, + "learning_rate": 9.882216133536224e-06, + "loss": 16.3583, + "step": 41690 + }, + { + "epoch": 0.7539988382631808, + "grad_norm": 41.84375, + "learning_rate": 9.882187881229564e-06, + "loss": 16.692, + "step": 41700 + }, + { + "epoch": 0.7541796533323087, + "grad_norm": 41.09375, + "learning_rate": 9.882159628922903e-06, + "loss": 16.7707, + "step": 41710 + }, + { + "epoch": 0.7543604684014366, + "grad_norm": 42.03125, + "learning_rate": 9.882131376616244e-06, + "loss": 17.0561, + "step": 41720 + }, + { + "epoch": 0.7545412834705645, + "grad_norm": 44.3125, + "learning_rate": 9.882103124309586e-06, + "loss": 17.1578, + "step": 41730 + }, + { + "epoch": 0.7547220985396923, + "grad_norm": 45.0, + "learning_rate": 9.882074872002927e-06, + "loss": 16.8074, + "step": 41740 + }, + { + "epoch": 0.7549029136088201, + "grad_norm": 41.15625, + "learning_rate": 9.882046619696266e-06, + "loss": 16.7429, + "step": 41750 + }, + { + "epoch": 0.755083728677948, + "grad_norm": 43.09375, + "learning_rate": 9.882018367389606e-06, + "loss": 16.8894, + "step": 41760 + }, + { + "epoch": 0.7552645437470759, + "grad_norm": 42.6875, + "learning_rate": 9.881990115082947e-06, + "loss": 16.8988, + "step": 41770 + }, + { + "epoch": 0.7554453588162038, + "grad_norm": 43.21875, + "learning_rate": 9.881961862776287e-06, + "loss": 16.8764, + "step": 41780 + }, + { + "epoch": 0.7556261738853316, + "grad_norm": 44.1875, + "learning_rate": 9.881933610469628e-06, + "loss": 17.0994, + "step": 41790 + }, + { + "epoch": 0.7558069889544594, + "grad_norm": 44.4375, + "learning_rate": 9.881905358162967e-06, + "loss": 16.8171, + "step": 41800 + }, + { + "epoch": 0.7559878040235873, + "grad_norm": 42.90625, + "learning_rate": 9.88187710585631e-06, + "loss": 17.0823, + "step": 41810 + }, + { + "epoch": 0.7561686190927152, + "grad_norm": 42.1875, + "learning_rate": 9.88184885354965e-06, + "loss": 16.7577, + "step": 41820 + }, + { + "epoch": 0.7563494341618431, + "grad_norm": 41.0, + "learning_rate": 9.881820601242989e-06, + "loss": 17.2433, + "step": 41830 + }, + { + "epoch": 0.7565302492309709, + "grad_norm": 43.90625, + "learning_rate": 9.88179234893633e-06, + "loss": 17.117, + "step": 41840 + }, + { + "epoch": 0.7567110643000988, + "grad_norm": 42.625, + "learning_rate": 9.88176409662967e-06, + "loss": 16.6246, + "step": 41850 + }, + { + "epoch": 0.7568918793692266, + "grad_norm": 41.71875, + "learning_rate": 9.88173584432301e-06, + "loss": 17.0378, + "step": 41860 + }, + { + "epoch": 0.7570726944383545, + "grad_norm": 43.3125, + "learning_rate": 9.881707592016351e-06, + "loss": 16.559, + "step": 41870 + }, + { + "epoch": 0.7572535095074824, + "grad_norm": 46.1875, + "learning_rate": 9.88167933970969e-06, + "loss": 16.9933, + "step": 41880 + }, + { + "epoch": 0.7574343245766102, + "grad_norm": 41.65625, + "learning_rate": 9.881651087403031e-06, + "loss": 16.5829, + "step": 41890 + }, + { + "epoch": 0.7576151396457381, + "grad_norm": 40.9375, + "learning_rate": 9.881622835096373e-06, + "loss": 16.7736, + "step": 41900 + }, + { + "epoch": 0.7577959547148659, + "grad_norm": 44.59375, + "learning_rate": 9.881594582789714e-06, + "loss": 16.9979, + "step": 41910 + }, + { + "epoch": 0.7579767697839938, + "grad_norm": 42.84375, + "learning_rate": 9.881566330483053e-06, + "loss": 16.8586, + "step": 41920 + }, + { + "epoch": 0.7581575848531217, + "grad_norm": 41.96875, + "learning_rate": 9.881538078176393e-06, + "loss": 16.416, + "step": 41930 + }, + { + "epoch": 0.7583383999222495, + "grad_norm": 43.375, + "learning_rate": 9.881509825869734e-06, + "loss": 17.0207, + "step": 41940 + }, + { + "epoch": 0.7585192149913774, + "grad_norm": 45.6875, + "learning_rate": 9.881481573563075e-06, + "loss": 16.8822, + "step": 41950 + }, + { + "epoch": 0.7587000300605052, + "grad_norm": 41.375, + "learning_rate": 9.881453321256415e-06, + "loss": 16.8448, + "step": 41960 + }, + { + "epoch": 0.758880845129633, + "grad_norm": 45.3125, + "learning_rate": 9.881425068949754e-06, + "loss": 16.5718, + "step": 41970 + }, + { + "epoch": 0.759061660198761, + "grad_norm": 44.125, + "learning_rate": 9.881396816643095e-06, + "loss": 16.2509, + "step": 41980 + }, + { + "epoch": 0.7592424752678888, + "grad_norm": 43.5, + "learning_rate": 9.881368564336437e-06, + "loss": 16.4995, + "step": 41990 + }, + { + "epoch": 0.7594232903370167, + "grad_norm": 42.21875, + "learning_rate": 9.881340312029776e-06, + "loss": 16.7124, + "step": 42000 + }, + { + "epoch": 0.7596041054061445, + "grad_norm": 43.03125, + "learning_rate": 9.881312059723117e-06, + "loss": 16.6199, + "step": 42010 + }, + { + "epoch": 0.7597849204752725, + "grad_norm": 41.125, + "learning_rate": 9.881283807416457e-06, + "loss": 17.145, + "step": 42020 + }, + { + "epoch": 0.7599657355444003, + "grad_norm": 40.96875, + "learning_rate": 9.881255555109798e-06, + "loss": 16.6874, + "step": 42030 + }, + { + "epoch": 0.7601465506135281, + "grad_norm": 43.1875, + "learning_rate": 9.881227302803139e-06, + "loss": 16.8522, + "step": 42040 + }, + { + "epoch": 0.760327365682656, + "grad_norm": 41.6875, + "learning_rate": 9.881199050496479e-06, + "loss": 16.6918, + "step": 42050 + }, + { + "epoch": 0.7605081807517838, + "grad_norm": 41.65625, + "learning_rate": 9.881170798189818e-06, + "loss": 16.6511, + "step": 42060 + }, + { + "epoch": 0.7606889958209118, + "grad_norm": 45.875, + "learning_rate": 9.881142545883159e-06, + "loss": 16.3789, + "step": 42070 + }, + { + "epoch": 0.7608698108900396, + "grad_norm": 45.34375, + "learning_rate": 9.881114293576501e-06, + "loss": 16.3309, + "step": 42080 + }, + { + "epoch": 0.7610506259591674, + "grad_norm": 45.5625, + "learning_rate": 9.88108604126984e-06, + "loss": 17.0268, + "step": 42090 + }, + { + "epoch": 0.7612314410282953, + "grad_norm": 41.84375, + "learning_rate": 9.88105778896318e-06, + "loss": 16.5562, + "step": 42100 + }, + { + "epoch": 0.7614122560974231, + "grad_norm": 41.59375, + "learning_rate": 9.881029536656521e-06, + "loss": 16.671, + "step": 42110 + }, + { + "epoch": 0.7615930711665511, + "grad_norm": 42.0625, + "learning_rate": 9.881001284349862e-06, + "loss": 16.8582, + "step": 42120 + }, + { + "epoch": 0.7617738862356789, + "grad_norm": 41.65625, + "learning_rate": 9.880973032043202e-06, + "loss": 16.7876, + "step": 42130 + }, + { + "epoch": 0.7619547013048067, + "grad_norm": 39.40625, + "learning_rate": 9.880944779736541e-06, + "loss": 16.9899, + "step": 42140 + }, + { + "epoch": 0.7621355163739346, + "grad_norm": 43.4375, + "learning_rate": 9.880916527429882e-06, + "loss": 16.6254, + "step": 42150 + }, + { + "epoch": 0.7623163314430624, + "grad_norm": 45.5, + "learning_rate": 9.880888275123224e-06, + "loss": 16.5524, + "step": 42160 + }, + { + "epoch": 0.7624971465121904, + "grad_norm": 43.25, + "learning_rate": 9.880860022816565e-06, + "loss": 17.2899, + "step": 42170 + }, + { + "epoch": 0.7626779615813182, + "grad_norm": 45.09375, + "learning_rate": 9.880831770509904e-06, + "loss": 16.8959, + "step": 42180 + }, + { + "epoch": 0.7628587766504461, + "grad_norm": 43.28125, + "learning_rate": 9.880803518203244e-06, + "loss": 16.9736, + "step": 42190 + }, + { + "epoch": 0.7630395917195739, + "grad_norm": 46.625, + "learning_rate": 9.880775265896585e-06, + "loss": 17.254, + "step": 42200 + }, + { + "epoch": 0.7632204067887017, + "grad_norm": 43.28125, + "learning_rate": 9.880747013589926e-06, + "loss": 16.6953, + "step": 42210 + }, + { + "epoch": 0.7634012218578297, + "grad_norm": 45.03125, + "learning_rate": 9.880718761283266e-06, + "loss": 16.8156, + "step": 42220 + }, + { + "epoch": 0.7635820369269575, + "grad_norm": 41.25, + "learning_rate": 9.880690508976605e-06, + "loss": 16.7833, + "step": 42230 + }, + { + "epoch": 0.7637628519960854, + "grad_norm": 45.25, + "learning_rate": 9.880662256669946e-06, + "loss": 17.0329, + "step": 42240 + }, + { + "epoch": 0.7639436670652132, + "grad_norm": 42.5625, + "learning_rate": 9.880634004363288e-06, + "loss": 16.7803, + "step": 42250 + }, + { + "epoch": 0.764124482134341, + "grad_norm": 46.65625, + "learning_rate": 9.880605752056627e-06, + "loss": 16.7741, + "step": 42260 + }, + { + "epoch": 0.764305297203469, + "grad_norm": 44.46875, + "learning_rate": 9.880577499749968e-06, + "loss": 16.4529, + "step": 42270 + }, + { + "epoch": 0.7644861122725968, + "grad_norm": 43.78125, + "learning_rate": 9.880549247443308e-06, + "loss": 16.7497, + "step": 42280 + }, + { + "epoch": 0.7646669273417247, + "grad_norm": 40.59375, + "learning_rate": 9.880520995136649e-06, + "loss": 16.7111, + "step": 42290 + }, + { + "epoch": 0.7648477424108525, + "grad_norm": 43.75, + "learning_rate": 9.88049274282999e-06, + "loss": 16.9267, + "step": 42300 + }, + { + "epoch": 0.7650285574799803, + "grad_norm": 43.25, + "learning_rate": 9.880464490523329e-06, + "loss": 17.0684, + "step": 42310 + }, + { + "epoch": 0.7652093725491083, + "grad_norm": 42.96875, + "learning_rate": 9.88043623821667e-06, + "loss": 16.6041, + "step": 42320 + }, + { + "epoch": 0.7653901876182361, + "grad_norm": 41.40625, + "learning_rate": 9.88040798591001e-06, + "loss": 16.4228, + "step": 42330 + }, + { + "epoch": 0.765571002687364, + "grad_norm": 43.8125, + "learning_rate": 9.880379733603352e-06, + "loss": 16.712, + "step": 42340 + }, + { + "epoch": 0.7657518177564918, + "grad_norm": 42.375, + "learning_rate": 9.880351481296691e-06, + "loss": 16.6611, + "step": 42350 + }, + { + "epoch": 0.7659326328256197, + "grad_norm": 43.0, + "learning_rate": 9.880323228990032e-06, + "loss": 16.5457, + "step": 42360 + }, + { + "epoch": 0.7661134478947476, + "grad_norm": 44.78125, + "learning_rate": 9.880294976683372e-06, + "loss": 17.0666, + "step": 42370 + }, + { + "epoch": 0.7662942629638754, + "grad_norm": 44.03125, + "learning_rate": 9.880266724376713e-06, + "loss": 16.7831, + "step": 42380 + }, + { + "epoch": 0.7664750780330033, + "grad_norm": 44.34375, + "learning_rate": 9.880238472070054e-06, + "loss": 17.1136, + "step": 42390 + }, + { + "epoch": 0.7666558931021311, + "grad_norm": 42.71875, + "learning_rate": 9.880210219763392e-06, + "loss": 16.8953, + "step": 42400 + }, + { + "epoch": 0.766836708171259, + "grad_norm": 41.25, + "learning_rate": 9.880181967456733e-06, + "loss": 16.8162, + "step": 42410 + }, + { + "epoch": 0.7670175232403869, + "grad_norm": 42.0, + "learning_rate": 9.880153715150074e-06, + "loss": 16.8969, + "step": 42420 + }, + { + "epoch": 0.7671983383095147, + "grad_norm": 43.9375, + "learning_rate": 9.880125462843414e-06, + "loss": 16.7205, + "step": 42430 + }, + { + "epoch": 0.7673791533786426, + "grad_norm": 47.90625, + "learning_rate": 9.880097210536755e-06, + "loss": 16.8168, + "step": 42440 + }, + { + "epoch": 0.7675599684477704, + "grad_norm": 41.5, + "learning_rate": 9.880068958230096e-06, + "loss": 17.1686, + "step": 42450 + }, + { + "epoch": 0.7677407835168983, + "grad_norm": 41.46875, + "learning_rate": 9.880040705923436e-06, + "loss": 16.6928, + "step": 42460 + }, + { + "epoch": 0.7679215985860262, + "grad_norm": 40.46875, + "learning_rate": 9.880012453616777e-06, + "loss": 17.0065, + "step": 42470 + }, + { + "epoch": 0.768102413655154, + "grad_norm": 45.15625, + "learning_rate": 9.879984201310117e-06, + "loss": 16.5089, + "step": 42480 + }, + { + "epoch": 0.7682832287242819, + "grad_norm": 46.0, + "learning_rate": 9.879955949003456e-06, + "loss": 16.6789, + "step": 42490 + }, + { + "epoch": 0.7684640437934097, + "grad_norm": 43.75, + "learning_rate": 9.879927696696797e-06, + "loss": 17.1577, + "step": 42500 + }, + { + "epoch": 0.7686448588625376, + "grad_norm": 44.0625, + "learning_rate": 9.87989944439014e-06, + "loss": 16.6613, + "step": 42510 + }, + { + "epoch": 0.7688256739316655, + "grad_norm": 42.625, + "learning_rate": 9.879871192083478e-06, + "loss": 16.8114, + "step": 42520 + }, + { + "epoch": 0.7690064890007934, + "grad_norm": 41.15625, + "learning_rate": 9.879842939776819e-06, + "loss": 17.2868, + "step": 42530 + }, + { + "epoch": 0.7691873040699212, + "grad_norm": 40.78125, + "learning_rate": 9.87981468747016e-06, + "loss": 16.7327, + "step": 42540 + }, + { + "epoch": 0.769368119139049, + "grad_norm": 41.4375, + "learning_rate": 9.8797864351635e-06, + "loss": 16.9727, + "step": 42550 + }, + { + "epoch": 0.7695489342081769, + "grad_norm": 42.375, + "learning_rate": 9.87975818285684e-06, + "loss": 16.9383, + "step": 42560 + }, + { + "epoch": 0.7697297492773048, + "grad_norm": 43.375, + "learning_rate": 9.87972993055018e-06, + "loss": 16.681, + "step": 42570 + }, + { + "epoch": 0.7699105643464327, + "grad_norm": 43.6875, + "learning_rate": 9.87970167824352e-06, + "loss": 17.1889, + "step": 42580 + }, + { + "epoch": 0.7700913794155605, + "grad_norm": 42.25, + "learning_rate": 9.879673425936861e-06, + "loss": 16.4792, + "step": 42590 + }, + { + "epoch": 0.7702721944846883, + "grad_norm": 44.46875, + "learning_rate": 9.879645173630203e-06, + "loss": 16.7619, + "step": 42600 + }, + { + "epoch": 0.7704530095538162, + "grad_norm": 42.125, + "learning_rate": 9.879616921323542e-06, + "loss": 16.4408, + "step": 42610 + }, + { + "epoch": 0.770633824622944, + "grad_norm": 43.3125, + "learning_rate": 9.879588669016883e-06, + "loss": 16.6894, + "step": 42620 + }, + { + "epoch": 0.770814639692072, + "grad_norm": 42.46875, + "learning_rate": 9.879560416710223e-06, + "loss": 16.7997, + "step": 42630 + }, + { + "epoch": 0.7709954547611998, + "grad_norm": 44.28125, + "learning_rate": 9.879532164403564e-06, + "loss": 17.1275, + "step": 42640 + }, + { + "epoch": 0.7711762698303276, + "grad_norm": 41.75, + "learning_rate": 9.879503912096905e-06, + "loss": 16.7961, + "step": 42650 + }, + { + "epoch": 0.7713570848994555, + "grad_norm": 45.59375, + "learning_rate": 9.879475659790244e-06, + "loss": 16.8425, + "step": 42660 + }, + { + "epoch": 0.7715378999685834, + "grad_norm": 43.9375, + "learning_rate": 9.879447407483584e-06, + "loss": 16.9238, + "step": 42670 + }, + { + "epoch": 0.7717187150377113, + "grad_norm": 44.0, + "learning_rate": 9.879419155176925e-06, + "loss": 16.7722, + "step": 42680 + }, + { + "epoch": 0.7718995301068391, + "grad_norm": 44.96875, + "learning_rate": 9.879390902870265e-06, + "loss": 16.8247, + "step": 42690 + }, + { + "epoch": 0.772080345175967, + "grad_norm": 40.78125, + "learning_rate": 9.879362650563606e-06, + "loss": 16.495, + "step": 42700 + }, + { + "epoch": 0.7722611602450948, + "grad_norm": 41.15625, + "learning_rate": 9.879334398256947e-06, + "loss": 16.5344, + "step": 42710 + }, + { + "epoch": 0.7724419753142227, + "grad_norm": 44.625, + "learning_rate": 9.879306145950287e-06, + "loss": 16.6817, + "step": 42720 + }, + { + "epoch": 0.7726227903833506, + "grad_norm": 44.09375, + "learning_rate": 9.879277893643628e-06, + "loss": 16.883, + "step": 42730 + }, + { + "epoch": 0.7728036054524784, + "grad_norm": 44.5625, + "learning_rate": 9.879249641336967e-06, + "loss": 16.7871, + "step": 42740 + }, + { + "epoch": 0.7729844205216063, + "grad_norm": 43.625, + "learning_rate": 9.879221389030307e-06, + "loss": 16.6971, + "step": 42750 + }, + { + "epoch": 0.7731652355907341, + "grad_norm": 44.53125, + "learning_rate": 9.879193136723648e-06, + "loss": 16.6874, + "step": 42760 + }, + { + "epoch": 0.773346050659862, + "grad_norm": 43.84375, + "learning_rate": 9.879164884416989e-06, + "loss": 16.8721, + "step": 42770 + }, + { + "epoch": 0.7735268657289899, + "grad_norm": 42.65625, + "learning_rate": 9.87913663211033e-06, + "loss": 16.7498, + "step": 42780 + }, + { + "epoch": 0.7737076807981177, + "grad_norm": 41.5625, + "learning_rate": 9.87910837980367e-06, + "loss": 16.7373, + "step": 42790 + }, + { + "epoch": 0.7738884958672456, + "grad_norm": 42.65625, + "learning_rate": 9.87908012749701e-06, + "loss": 16.5815, + "step": 42800 + }, + { + "epoch": 0.7740693109363734, + "grad_norm": 40.15625, + "learning_rate": 9.879051875190351e-06, + "loss": 16.9589, + "step": 42810 + }, + { + "epoch": 0.7742501260055012, + "grad_norm": 44.78125, + "learning_rate": 9.879023622883692e-06, + "loss": 17.089, + "step": 42820 + }, + { + "epoch": 0.7744309410746292, + "grad_norm": 40.9375, + "learning_rate": 9.87899537057703e-06, + "loss": 16.7203, + "step": 42830 + }, + { + "epoch": 0.774611756143757, + "grad_norm": 43.03125, + "learning_rate": 9.878967118270371e-06, + "loss": 16.8916, + "step": 42840 + }, + { + "epoch": 0.7747925712128849, + "grad_norm": 43.4375, + "learning_rate": 9.878938865963712e-06, + "loss": 16.8227, + "step": 42850 + }, + { + "epoch": 0.7749733862820127, + "grad_norm": 44.5625, + "learning_rate": 9.878910613657053e-06, + "loss": 16.8217, + "step": 42860 + }, + { + "epoch": 0.7751542013511407, + "grad_norm": 42.375, + "learning_rate": 9.878882361350393e-06, + "loss": 16.9417, + "step": 42870 + }, + { + "epoch": 0.7753350164202685, + "grad_norm": 43.46875, + "learning_rate": 9.878854109043734e-06, + "loss": 16.5462, + "step": 42880 + }, + { + "epoch": 0.7755158314893963, + "grad_norm": 42.28125, + "learning_rate": 9.878825856737074e-06, + "loss": 16.7315, + "step": 42890 + }, + { + "epoch": 0.7756966465585242, + "grad_norm": 43.28125, + "learning_rate": 9.878797604430415e-06, + "loss": 16.9322, + "step": 42900 + }, + { + "epoch": 0.775877461627652, + "grad_norm": 41.28125, + "learning_rate": 9.878769352123756e-06, + "loss": 16.8821, + "step": 42910 + }, + { + "epoch": 0.77605827669678, + "grad_norm": 42.3125, + "learning_rate": 9.878741099817095e-06, + "loss": 17.2593, + "step": 42920 + }, + { + "epoch": 0.7762390917659078, + "grad_norm": 42.71875, + "learning_rate": 9.878712847510435e-06, + "loss": 16.8446, + "step": 42930 + }, + { + "epoch": 0.7764199068350356, + "grad_norm": 41.90625, + "learning_rate": 9.878684595203776e-06, + "loss": 16.921, + "step": 42940 + }, + { + "epoch": 0.7766007219041635, + "grad_norm": 42.4375, + "learning_rate": 9.878656342897117e-06, + "loss": 17.1003, + "step": 42950 + }, + { + "epoch": 0.7767815369732913, + "grad_norm": 40.9375, + "learning_rate": 9.878628090590457e-06, + "loss": 16.8286, + "step": 42960 + }, + { + "epoch": 0.7769623520424193, + "grad_norm": 43.5625, + "learning_rate": 9.878599838283798e-06, + "loss": 17.0732, + "step": 42970 + }, + { + "epoch": 0.7771431671115471, + "grad_norm": 41.71875, + "learning_rate": 9.878571585977138e-06, + "loss": 16.7076, + "step": 42980 + }, + { + "epoch": 0.7773239821806749, + "grad_norm": 42.5, + "learning_rate": 9.878543333670479e-06, + "loss": 16.9669, + "step": 42990 + }, + { + "epoch": 0.7775047972498028, + "grad_norm": 46.78125, + "learning_rate": 9.878515081363818e-06, + "loss": 16.9687, + "step": 43000 + }, + { + "epoch": 0.7776856123189306, + "grad_norm": 43.84375, + "learning_rate": 9.878486829057159e-06, + "loss": 16.8253, + "step": 43010 + }, + { + "epoch": 0.7778664273880586, + "grad_norm": 41.25, + "learning_rate": 9.8784585767505e-06, + "loss": 16.6976, + "step": 43020 + }, + { + "epoch": 0.7780472424571864, + "grad_norm": 42.53125, + "learning_rate": 9.87843032444384e-06, + "loss": 17.3396, + "step": 43030 + }, + { + "epoch": 0.7782280575263142, + "grad_norm": 44.25, + "learning_rate": 9.87840207213718e-06, + "loss": 16.4274, + "step": 43040 + }, + { + "epoch": 0.7784088725954421, + "grad_norm": 41.4375, + "learning_rate": 9.878373819830521e-06, + "loss": 16.6991, + "step": 43050 + }, + { + "epoch": 0.7785896876645699, + "grad_norm": 43.4375, + "learning_rate": 9.878345567523862e-06, + "loss": 17.0445, + "step": 43060 + }, + { + "epoch": 0.7787705027336979, + "grad_norm": 39.5, + "learning_rate": 9.878317315217202e-06, + "loss": 16.6232, + "step": 43070 + }, + { + "epoch": 0.7789513178028257, + "grad_norm": 40.9375, + "learning_rate": 9.878289062910543e-06, + "loss": 16.6776, + "step": 43080 + }, + { + "epoch": 0.7791321328719536, + "grad_norm": 44.0625, + "learning_rate": 9.878260810603882e-06, + "loss": 16.6483, + "step": 43090 + }, + { + "epoch": 0.7793129479410814, + "grad_norm": 41.0625, + "learning_rate": 9.878232558297222e-06, + "loss": 17.0843, + "step": 43100 + }, + { + "epoch": 0.7794937630102092, + "grad_norm": 42.4375, + "learning_rate": 9.878204305990563e-06, + "loss": 16.4456, + "step": 43110 + }, + { + "epoch": 0.7796745780793372, + "grad_norm": 44.90625, + "learning_rate": 9.878176053683904e-06, + "loss": 16.9825, + "step": 43120 + }, + { + "epoch": 0.779855393148465, + "grad_norm": 43.9375, + "learning_rate": 9.878147801377244e-06, + "loss": 17.033, + "step": 43130 + }, + { + "epoch": 0.7800362082175929, + "grad_norm": 42.0, + "learning_rate": 9.878119549070585e-06, + "loss": 16.6665, + "step": 43140 + }, + { + "epoch": 0.7802170232867207, + "grad_norm": 45.46875, + "learning_rate": 9.878091296763926e-06, + "loss": 16.7691, + "step": 43150 + }, + { + "epoch": 0.7803978383558485, + "grad_norm": 42.3125, + "learning_rate": 9.878063044457266e-06, + "loss": 16.9256, + "step": 43160 + }, + { + "epoch": 0.7805786534249765, + "grad_norm": 44.9375, + "learning_rate": 9.878034792150605e-06, + "loss": 16.8972, + "step": 43170 + }, + { + "epoch": 0.7807594684941043, + "grad_norm": 43.15625, + "learning_rate": 9.878006539843946e-06, + "loss": 16.2868, + "step": 43180 + }, + { + "epoch": 0.7809402835632322, + "grad_norm": 43.375, + "learning_rate": 9.877978287537286e-06, + "loss": 16.5691, + "step": 43190 + }, + { + "epoch": 0.78112109863236, + "grad_norm": 44.09375, + "learning_rate": 9.877950035230627e-06, + "loss": 16.9558, + "step": 43200 + }, + { + "epoch": 0.7813019137014878, + "grad_norm": 42.78125, + "learning_rate": 9.877921782923968e-06, + "loss": 16.5978, + "step": 43210 + }, + { + "epoch": 0.7814827287706158, + "grad_norm": 43.5, + "learning_rate": 9.877893530617308e-06, + "loss": 16.3195, + "step": 43220 + }, + { + "epoch": 0.7816635438397436, + "grad_norm": 42.78125, + "learning_rate": 9.877865278310649e-06, + "loss": 16.8766, + "step": 43230 + }, + { + "epoch": 0.7818443589088715, + "grad_norm": 42.71875, + "learning_rate": 9.87783702600399e-06, + "loss": 16.7194, + "step": 43240 + }, + { + "epoch": 0.7820251739779993, + "grad_norm": 43.4375, + "learning_rate": 9.87780877369733e-06, + "loss": 16.5889, + "step": 43250 + }, + { + "epoch": 0.7822059890471272, + "grad_norm": 41.75, + "learning_rate": 9.877780521390669e-06, + "loss": 16.5907, + "step": 43260 + }, + { + "epoch": 0.782386804116255, + "grad_norm": 44.21875, + "learning_rate": 9.87775226908401e-06, + "loss": 16.5148, + "step": 43270 + }, + { + "epoch": 0.7825676191853829, + "grad_norm": 42.53125, + "learning_rate": 9.87772401677735e-06, + "loss": 17.0262, + "step": 43280 + }, + { + "epoch": 0.7827484342545108, + "grad_norm": 40.875, + "learning_rate": 9.877695764470691e-06, + "loss": 16.7053, + "step": 43290 + }, + { + "epoch": 0.7829292493236386, + "grad_norm": 42.65625, + "learning_rate": 9.877667512164032e-06, + "loss": 16.8643, + "step": 43300 + }, + { + "epoch": 0.7831100643927665, + "grad_norm": 46.46875, + "learning_rate": 9.877639259857372e-06, + "loss": 16.8704, + "step": 43310 + }, + { + "epoch": 0.7832908794618944, + "grad_norm": 45.46875, + "learning_rate": 9.877611007550713e-06, + "loss": 16.2131, + "step": 43320 + }, + { + "epoch": 0.7834716945310222, + "grad_norm": 41.90625, + "learning_rate": 9.877582755244053e-06, + "loss": 17.0028, + "step": 43330 + }, + { + "epoch": 0.7836525096001501, + "grad_norm": 43.5, + "learning_rate": 9.877554502937392e-06, + "loss": 17.1055, + "step": 43340 + }, + { + "epoch": 0.7838333246692779, + "grad_norm": 42.8125, + "learning_rate": 9.877526250630733e-06, + "loss": 16.8805, + "step": 43350 + }, + { + "epoch": 0.7840141397384058, + "grad_norm": 44.78125, + "learning_rate": 9.877497998324074e-06, + "loss": 16.7048, + "step": 43360 + }, + { + "epoch": 0.7841949548075337, + "grad_norm": 40.875, + "learning_rate": 9.877469746017414e-06, + "loss": 16.7925, + "step": 43370 + }, + { + "epoch": 0.7843757698766615, + "grad_norm": 43.75, + "learning_rate": 9.877441493710755e-06, + "loss": 16.9311, + "step": 43380 + }, + { + "epoch": 0.7845565849457894, + "grad_norm": 42.15625, + "learning_rate": 9.877413241404095e-06, + "loss": 16.5517, + "step": 43390 + }, + { + "epoch": 0.7847374000149172, + "grad_norm": 43.71875, + "learning_rate": 9.877384989097436e-06, + "loss": 16.7184, + "step": 43400 + }, + { + "epoch": 0.7849182150840451, + "grad_norm": 43.625, + "learning_rate": 9.877356736790777e-06, + "loss": 16.433, + "step": 43410 + }, + { + "epoch": 0.785099030153173, + "grad_norm": 43.84375, + "learning_rate": 9.877328484484117e-06, + "loss": 16.5557, + "step": 43420 + }, + { + "epoch": 0.7852798452223009, + "grad_norm": 43.5625, + "learning_rate": 9.877300232177456e-06, + "loss": 17.1738, + "step": 43430 + }, + { + "epoch": 0.7854606602914287, + "grad_norm": 43.09375, + "learning_rate": 9.877271979870797e-06, + "loss": 16.7636, + "step": 43440 + }, + { + "epoch": 0.7856414753605565, + "grad_norm": 41.4375, + "learning_rate": 9.877243727564137e-06, + "loss": 16.6188, + "step": 43450 + }, + { + "epoch": 0.7858222904296844, + "grad_norm": 46.0, + "learning_rate": 9.877215475257478e-06, + "loss": 16.8037, + "step": 43460 + }, + { + "epoch": 0.7860031054988122, + "grad_norm": 42.34375, + "learning_rate": 9.877187222950819e-06, + "loss": 16.84, + "step": 43470 + }, + { + "epoch": 0.7861839205679402, + "grad_norm": 40.71875, + "learning_rate": 9.87715897064416e-06, + "loss": 16.7468, + "step": 43480 + }, + { + "epoch": 0.786364735637068, + "grad_norm": 42.71875, + "learning_rate": 9.8771307183375e-06, + "loss": 16.8372, + "step": 43490 + }, + { + "epoch": 0.7865455507061958, + "grad_norm": 41.9375, + "learning_rate": 9.87710246603084e-06, + "loss": 16.5974, + "step": 43500 + }, + { + "epoch": 0.7867263657753237, + "grad_norm": 41.90625, + "learning_rate": 9.877074213724181e-06, + "loss": 16.6528, + "step": 43510 + }, + { + "epoch": 0.7869071808444515, + "grad_norm": 43.8125, + "learning_rate": 9.87704596141752e-06, + "loss": 16.7555, + "step": 43520 + }, + { + "epoch": 0.7870879959135795, + "grad_norm": 41.5, + "learning_rate": 9.87701770911086e-06, + "loss": 17.2377, + "step": 43530 + }, + { + "epoch": 0.7872688109827073, + "grad_norm": 43.375, + "learning_rate": 9.876989456804201e-06, + "loss": 17.0131, + "step": 43540 + }, + { + "epoch": 0.7874496260518351, + "grad_norm": 43.8125, + "learning_rate": 9.876961204497542e-06, + "loss": 16.7978, + "step": 43550 + }, + { + "epoch": 0.787630441120963, + "grad_norm": 44.0, + "learning_rate": 9.876932952190883e-06, + "loss": 17.0134, + "step": 43560 + }, + { + "epoch": 0.7878112561900908, + "grad_norm": 45.40625, + "learning_rate": 9.876904699884223e-06, + "loss": 16.7749, + "step": 43570 + }, + { + "epoch": 0.7879920712592188, + "grad_norm": 43.9375, + "learning_rate": 9.876876447577564e-06, + "loss": 16.8705, + "step": 43580 + }, + { + "epoch": 0.7881728863283466, + "grad_norm": 42.8125, + "learning_rate": 9.876848195270904e-06, + "loss": 16.4949, + "step": 43590 + }, + { + "epoch": 0.7883537013974745, + "grad_norm": 46.65625, + "learning_rate": 9.876819942964243e-06, + "loss": 17.022, + "step": 43600 + }, + { + "epoch": 0.7885345164666023, + "grad_norm": 42.15625, + "learning_rate": 9.876791690657584e-06, + "loss": 16.8378, + "step": 43610 + }, + { + "epoch": 0.7887153315357301, + "grad_norm": 42.53125, + "learning_rate": 9.876763438350925e-06, + "loss": 16.9004, + "step": 43620 + }, + { + "epoch": 0.7888961466048581, + "grad_norm": 44.09375, + "learning_rate": 9.876735186044265e-06, + "loss": 16.8231, + "step": 43630 + }, + { + "epoch": 0.7890769616739859, + "grad_norm": 44.375, + "learning_rate": 9.876706933737606e-06, + "loss": 16.7193, + "step": 43640 + }, + { + "epoch": 0.7892577767431138, + "grad_norm": 44.1875, + "learning_rate": 9.876678681430947e-06, + "loss": 16.8568, + "step": 43650 + }, + { + "epoch": 0.7894385918122416, + "grad_norm": 43.84375, + "learning_rate": 9.876650429124287e-06, + "loss": 16.5435, + "step": 43660 + }, + { + "epoch": 0.7896194068813694, + "grad_norm": 43.5, + "learning_rate": 9.876622176817628e-06, + "loss": 16.9701, + "step": 43670 + }, + { + "epoch": 0.7898002219504974, + "grad_norm": 44.15625, + "learning_rate": 9.876593924510968e-06, + "loss": 16.5421, + "step": 43680 + }, + { + "epoch": 0.7899810370196252, + "grad_norm": 41.40625, + "learning_rate": 9.876565672204307e-06, + "loss": 16.8361, + "step": 43690 + }, + { + "epoch": 0.7901618520887531, + "grad_norm": 43.8125, + "learning_rate": 9.876537419897648e-06, + "loss": 17.0753, + "step": 43700 + }, + { + "epoch": 0.7903426671578809, + "grad_norm": 43.1875, + "learning_rate": 9.876509167590989e-06, + "loss": 16.8627, + "step": 43710 + }, + { + "epoch": 0.7905234822270087, + "grad_norm": 43.40625, + "learning_rate": 9.87648091528433e-06, + "loss": 16.7451, + "step": 43720 + }, + { + "epoch": 0.7907042972961367, + "grad_norm": 42.59375, + "learning_rate": 9.87645266297767e-06, + "loss": 16.8212, + "step": 43730 + }, + { + "epoch": 0.7908851123652645, + "grad_norm": 41.8125, + "learning_rate": 9.87642441067101e-06, + "loss": 16.9218, + "step": 43740 + }, + { + "epoch": 0.7910659274343924, + "grad_norm": 43.96875, + "learning_rate": 9.876396158364351e-06, + "loss": 16.8543, + "step": 43750 + }, + { + "epoch": 0.7912467425035202, + "grad_norm": 42.3125, + "learning_rate": 9.876367906057692e-06, + "loss": 16.5328, + "step": 43760 + }, + { + "epoch": 0.7914275575726482, + "grad_norm": 43.8125, + "learning_rate": 9.87633965375103e-06, + "loss": 16.5861, + "step": 43770 + }, + { + "epoch": 0.791608372641776, + "grad_norm": 43.1875, + "learning_rate": 9.876311401444371e-06, + "loss": 16.7647, + "step": 43780 + }, + { + "epoch": 0.7917891877109038, + "grad_norm": 41.5, + "learning_rate": 9.876283149137712e-06, + "loss": 16.6035, + "step": 43790 + }, + { + "epoch": 0.7919700027800317, + "grad_norm": 44.40625, + "learning_rate": 9.876254896831052e-06, + "loss": 16.502, + "step": 43800 + }, + { + "epoch": 0.7921508178491595, + "grad_norm": 42.5, + "learning_rate": 9.876226644524393e-06, + "loss": 16.6553, + "step": 43810 + }, + { + "epoch": 0.7923316329182875, + "grad_norm": 43.21875, + "learning_rate": 9.876198392217734e-06, + "loss": 16.2923, + "step": 43820 + }, + { + "epoch": 0.7925124479874153, + "grad_norm": 47.375, + "learning_rate": 9.876170139911074e-06, + "loss": 16.8679, + "step": 43830 + }, + { + "epoch": 0.7926932630565431, + "grad_norm": 46.5, + "learning_rate": 9.876141887604415e-06, + "loss": 17.1426, + "step": 43840 + }, + { + "epoch": 0.792874078125671, + "grad_norm": 43.5625, + "learning_rate": 9.876113635297756e-06, + "loss": 16.7616, + "step": 43850 + }, + { + "epoch": 0.7930548931947988, + "grad_norm": 41.1875, + "learning_rate": 9.876085382991095e-06, + "loss": 16.7291, + "step": 43860 + }, + { + "epoch": 0.7932357082639268, + "grad_norm": 41.75, + "learning_rate": 9.876057130684435e-06, + "loss": 16.9794, + "step": 43870 + }, + { + "epoch": 0.7934165233330546, + "grad_norm": 45.90625, + "learning_rate": 9.876028878377776e-06, + "loss": 16.7668, + "step": 43880 + }, + { + "epoch": 0.7935973384021824, + "grad_norm": 44.6875, + "learning_rate": 9.876000626071116e-06, + "loss": 17.1568, + "step": 43890 + }, + { + "epoch": 0.7937781534713103, + "grad_norm": 42.34375, + "learning_rate": 9.875972373764457e-06, + "loss": 16.3285, + "step": 43900 + }, + { + "epoch": 0.7939589685404381, + "grad_norm": 45.6875, + "learning_rate": 9.875944121457796e-06, + "loss": 16.5321, + "step": 43910 + }, + { + "epoch": 0.794139783609566, + "grad_norm": 43.75, + "learning_rate": 9.875915869151138e-06, + "loss": 16.8059, + "step": 43920 + }, + { + "epoch": 0.7943205986786939, + "grad_norm": 41.78125, + "learning_rate": 9.875887616844479e-06, + "loss": 16.7762, + "step": 43930 + }, + { + "epoch": 0.7945014137478218, + "grad_norm": 39.8125, + "learning_rate": 9.87585936453782e-06, + "loss": 16.8121, + "step": 43940 + }, + { + "epoch": 0.7946822288169496, + "grad_norm": 43.1875, + "learning_rate": 9.875831112231158e-06, + "loss": 16.7487, + "step": 43950 + }, + { + "epoch": 0.7948630438860774, + "grad_norm": 46.40625, + "learning_rate": 9.875802859924499e-06, + "loss": 16.681, + "step": 43960 + }, + { + "epoch": 0.7950438589552054, + "grad_norm": 44.375, + "learning_rate": 9.87577460761784e-06, + "loss": 16.903, + "step": 43970 + }, + { + "epoch": 0.7952246740243332, + "grad_norm": 46.59375, + "learning_rate": 9.87574635531118e-06, + "loss": 17.0115, + "step": 43980 + }, + { + "epoch": 0.7954054890934611, + "grad_norm": 43.5625, + "learning_rate": 9.875718103004521e-06, + "loss": 16.7663, + "step": 43990 + }, + { + "epoch": 0.7955863041625889, + "grad_norm": 41.90625, + "learning_rate": 9.875689850697862e-06, + "loss": 16.5117, + "step": 44000 + }, + { + "epoch": 0.7957671192317167, + "grad_norm": 42.5625, + "learning_rate": 9.875661598391202e-06, + "loss": 16.799, + "step": 44010 + }, + { + "epoch": 0.7959479343008447, + "grad_norm": 43.96875, + "learning_rate": 9.875633346084543e-06, + "loss": 16.8954, + "step": 44020 + }, + { + "epoch": 0.7961287493699725, + "grad_norm": 44.625, + "learning_rate": 9.875605093777882e-06, + "loss": 16.9782, + "step": 44030 + }, + { + "epoch": 0.7963095644391004, + "grad_norm": 42.75, + "learning_rate": 9.875576841471222e-06, + "loss": 16.9463, + "step": 44040 + }, + { + "epoch": 0.7964903795082282, + "grad_norm": 40.65625, + "learning_rate": 9.875548589164563e-06, + "loss": 16.6091, + "step": 44050 + }, + { + "epoch": 0.796671194577356, + "grad_norm": 44.96875, + "learning_rate": 9.875520336857904e-06, + "loss": 16.6663, + "step": 44060 + }, + { + "epoch": 0.796852009646484, + "grad_norm": 44.125, + "learning_rate": 9.875492084551244e-06, + "loss": 17.0986, + "step": 44070 + }, + { + "epoch": 0.7970328247156118, + "grad_norm": 41.46875, + "learning_rate": 9.875463832244583e-06, + "loss": 17.1912, + "step": 44080 + }, + { + "epoch": 0.7972136397847397, + "grad_norm": 42.09375, + "learning_rate": 9.875435579937925e-06, + "loss": 17.1274, + "step": 44090 + }, + { + "epoch": 0.7973944548538675, + "grad_norm": 44.375, + "learning_rate": 9.875407327631266e-06, + "loss": 16.9897, + "step": 44100 + }, + { + "epoch": 0.7975752699229954, + "grad_norm": 42.125, + "learning_rate": 9.875379075324607e-06, + "loss": 16.8096, + "step": 44110 + }, + { + "epoch": 0.7977560849921232, + "grad_norm": 47.8125, + "learning_rate": 9.875350823017946e-06, + "loss": 17.1615, + "step": 44120 + }, + { + "epoch": 0.7979369000612511, + "grad_norm": 43.75, + "learning_rate": 9.875322570711286e-06, + "loss": 16.8355, + "step": 44130 + }, + { + "epoch": 0.798117715130379, + "grad_norm": 44.09375, + "learning_rate": 9.875294318404627e-06, + "loss": 17.0292, + "step": 44140 + }, + { + "epoch": 0.7982985301995068, + "grad_norm": 43.5, + "learning_rate": 9.875266066097967e-06, + "loss": 16.3573, + "step": 44150 + }, + { + "epoch": 0.7984793452686347, + "grad_norm": 42.34375, + "learning_rate": 9.875237813791308e-06, + "loss": 16.6646, + "step": 44160 + }, + { + "epoch": 0.7986601603377625, + "grad_norm": 42.34375, + "learning_rate": 9.875209561484647e-06, + "loss": 16.396, + "step": 44170 + }, + { + "epoch": 0.7988409754068904, + "grad_norm": 42.71875, + "learning_rate": 9.87518130917799e-06, + "loss": 16.947, + "step": 44180 + }, + { + "epoch": 0.7990217904760183, + "grad_norm": 43.65625, + "learning_rate": 9.87515305687133e-06, + "loss": 16.8843, + "step": 44190 + }, + { + "epoch": 0.7992026055451461, + "grad_norm": 43.71875, + "learning_rate": 9.875124804564669e-06, + "loss": 16.7936, + "step": 44200 + }, + { + "epoch": 0.799383420614274, + "grad_norm": 41.53125, + "learning_rate": 9.87509655225801e-06, + "loss": 16.9738, + "step": 44210 + }, + { + "epoch": 0.7995642356834018, + "grad_norm": 43.78125, + "learning_rate": 9.87506829995135e-06, + "loss": 17.1478, + "step": 44220 + }, + { + "epoch": 0.7997450507525297, + "grad_norm": 42.40625, + "learning_rate": 9.87504004764469e-06, + "loss": 16.736, + "step": 44230 + }, + { + "epoch": 0.7999258658216576, + "grad_norm": 46.125, + "learning_rate": 9.875011795338031e-06, + "loss": 16.9612, + "step": 44240 + }, + { + "epoch": 0.8001066808907854, + "grad_norm": 44.625, + "learning_rate": 9.874983543031372e-06, + "loss": 17.1204, + "step": 44250 + }, + { + "epoch": 0.8002874959599133, + "grad_norm": 44.40625, + "learning_rate": 9.874955290724711e-06, + "loss": 16.6089, + "step": 44260 + }, + { + "epoch": 0.8004683110290411, + "grad_norm": 43.34375, + "learning_rate": 9.874927038418053e-06, + "loss": 16.6859, + "step": 44270 + }, + { + "epoch": 0.8006491260981691, + "grad_norm": 41.78125, + "learning_rate": 9.874898786111394e-06, + "loss": 16.8503, + "step": 44280 + }, + { + "epoch": 0.8008299411672969, + "grad_norm": 43.75, + "learning_rate": 9.874870533804733e-06, + "loss": 16.7074, + "step": 44290 + }, + { + "epoch": 0.8010107562364247, + "grad_norm": 40.625, + "learning_rate": 9.874842281498073e-06, + "loss": 16.9856, + "step": 44300 + }, + { + "epoch": 0.8011915713055526, + "grad_norm": 41.15625, + "learning_rate": 9.874814029191414e-06, + "loss": 17.1389, + "step": 44310 + }, + { + "epoch": 0.8013723863746804, + "grad_norm": 43.4375, + "learning_rate": 9.874785776884755e-06, + "loss": 16.9076, + "step": 44320 + }, + { + "epoch": 0.8015532014438084, + "grad_norm": 42.03125, + "learning_rate": 9.874757524578095e-06, + "loss": 16.6759, + "step": 44330 + }, + { + "epoch": 0.8017340165129362, + "grad_norm": 39.84375, + "learning_rate": 9.874729272271434e-06, + "loss": 16.5483, + "step": 44340 + }, + { + "epoch": 0.801914831582064, + "grad_norm": 43.125, + "learning_rate": 9.874701019964777e-06, + "loss": 16.9525, + "step": 44350 + }, + { + "epoch": 0.8020956466511919, + "grad_norm": 45.78125, + "learning_rate": 9.874672767658117e-06, + "loss": 16.641, + "step": 44360 + }, + { + "epoch": 0.8022764617203197, + "grad_norm": 42.34375, + "learning_rate": 9.874644515351458e-06, + "loss": 16.7242, + "step": 44370 + }, + { + "epoch": 0.8024572767894477, + "grad_norm": 42.4375, + "learning_rate": 9.874616263044797e-06, + "loss": 16.9407, + "step": 44380 + }, + { + "epoch": 0.8026380918585755, + "grad_norm": 44.0625, + "learning_rate": 9.874588010738137e-06, + "loss": 17.0624, + "step": 44390 + }, + { + "epoch": 0.8028189069277033, + "grad_norm": 41.59375, + "learning_rate": 9.874559758431478e-06, + "loss": 16.4757, + "step": 44400 + }, + { + "epoch": 0.8029997219968312, + "grad_norm": 43.15625, + "learning_rate": 9.874531506124819e-06, + "loss": 17.0101, + "step": 44410 + }, + { + "epoch": 0.803180537065959, + "grad_norm": 44.9375, + "learning_rate": 9.87450325381816e-06, + "loss": 16.7205, + "step": 44420 + }, + { + "epoch": 0.803361352135087, + "grad_norm": 41.90625, + "learning_rate": 9.874475001511498e-06, + "loss": 17.1587, + "step": 44430 + }, + { + "epoch": 0.8035421672042148, + "grad_norm": 42.25, + "learning_rate": 9.87444674920484e-06, + "loss": 16.4366, + "step": 44440 + }, + { + "epoch": 0.8037229822733427, + "grad_norm": 46.9375, + "learning_rate": 9.874418496898181e-06, + "loss": 17.0698, + "step": 44450 + }, + { + "epoch": 0.8039037973424705, + "grad_norm": 42.5, + "learning_rate": 9.87439024459152e-06, + "loss": 16.8598, + "step": 44460 + }, + { + "epoch": 0.8040846124115983, + "grad_norm": 43.65625, + "learning_rate": 9.87436199228486e-06, + "loss": 16.9069, + "step": 44470 + }, + { + "epoch": 0.8042654274807263, + "grad_norm": 42.78125, + "learning_rate": 9.874333739978201e-06, + "loss": 16.4355, + "step": 44480 + }, + { + "epoch": 0.8044462425498541, + "grad_norm": 47.53125, + "learning_rate": 9.874305487671542e-06, + "loss": 16.8449, + "step": 44490 + }, + { + "epoch": 0.804627057618982, + "grad_norm": 42.75, + "learning_rate": 9.874277235364882e-06, + "loss": 17.022, + "step": 44500 + }, + { + "epoch": 0.8048078726881098, + "grad_norm": 41.875, + "learning_rate": 9.874248983058221e-06, + "loss": 17.0746, + "step": 44510 + }, + { + "epoch": 0.8049886877572376, + "grad_norm": 42.3125, + "learning_rate": 9.874220730751562e-06, + "loss": 17.0565, + "step": 44520 + }, + { + "epoch": 0.8051695028263656, + "grad_norm": 40.78125, + "learning_rate": 9.874192478444904e-06, + "loss": 16.7014, + "step": 44530 + }, + { + "epoch": 0.8053503178954934, + "grad_norm": 45.09375, + "learning_rate": 9.874164226138245e-06, + "loss": 16.5769, + "step": 44540 + }, + { + "epoch": 0.8055311329646213, + "grad_norm": 45.5625, + "learning_rate": 9.874135973831584e-06, + "loss": 16.7823, + "step": 44550 + }, + { + "epoch": 0.8057119480337491, + "grad_norm": 46.71875, + "learning_rate": 9.874107721524925e-06, + "loss": 16.4381, + "step": 44560 + }, + { + "epoch": 0.8058927631028769, + "grad_norm": 48.0625, + "learning_rate": 9.874079469218265e-06, + "loss": 16.2223, + "step": 44570 + }, + { + "epoch": 0.8060735781720049, + "grad_norm": 42.71875, + "learning_rate": 9.874051216911606e-06, + "loss": 16.6863, + "step": 44580 + }, + { + "epoch": 0.8062543932411327, + "grad_norm": 41.875, + "learning_rate": 9.874022964604946e-06, + "loss": 16.6746, + "step": 44590 + }, + { + "epoch": 0.8064352083102606, + "grad_norm": 40.25, + "learning_rate": 9.873994712298285e-06, + "loss": 16.7305, + "step": 44600 + }, + { + "epoch": 0.8066160233793884, + "grad_norm": 47.125, + "learning_rate": 9.873966459991626e-06, + "loss": 16.9804, + "step": 44610 + }, + { + "epoch": 0.8067968384485164, + "grad_norm": 46.21875, + "learning_rate": 9.873938207684968e-06, + "loss": 16.7462, + "step": 44620 + }, + { + "epoch": 0.8069776535176442, + "grad_norm": 45.9375, + "learning_rate": 9.873909955378307e-06, + "loss": 16.8084, + "step": 44630 + }, + { + "epoch": 0.807158468586772, + "grad_norm": 46.125, + "learning_rate": 9.873881703071648e-06, + "loss": 16.5409, + "step": 44640 + }, + { + "epoch": 0.8073392836558999, + "grad_norm": 43.46875, + "learning_rate": 9.873853450764988e-06, + "loss": 16.5718, + "step": 44650 + }, + { + "epoch": 0.8075200987250277, + "grad_norm": 38.34375, + "learning_rate": 9.873825198458329e-06, + "loss": 16.6985, + "step": 44660 + }, + { + "epoch": 0.8077009137941557, + "grad_norm": 41.84375, + "learning_rate": 9.87379694615167e-06, + "loss": 16.5292, + "step": 44670 + }, + { + "epoch": 0.8078817288632835, + "grad_norm": 43.65625, + "learning_rate": 9.87376869384501e-06, + "loss": 16.7325, + "step": 44680 + }, + { + "epoch": 0.8080625439324113, + "grad_norm": 44.0625, + "learning_rate": 9.87374044153835e-06, + "loss": 17.2133, + "step": 44690 + }, + { + "epoch": 0.8082433590015392, + "grad_norm": 44.9375, + "learning_rate": 9.873712189231692e-06, + "loss": 16.6545, + "step": 44700 + }, + { + "epoch": 0.808424174070667, + "grad_norm": 44.5625, + "learning_rate": 9.873683936925032e-06, + "loss": 16.7823, + "step": 44710 + }, + { + "epoch": 0.808604989139795, + "grad_norm": 41.6875, + "learning_rate": 9.873655684618371e-06, + "loss": 16.7078, + "step": 44720 + }, + { + "epoch": 0.8087858042089228, + "grad_norm": 44.625, + "learning_rate": 9.873627432311712e-06, + "loss": 16.8342, + "step": 44730 + }, + { + "epoch": 0.8089666192780506, + "grad_norm": 40.3125, + "learning_rate": 9.873599180005052e-06, + "loss": 17.0291, + "step": 44740 + }, + { + "epoch": 0.8091474343471785, + "grad_norm": 42.90625, + "learning_rate": 9.873570927698393e-06, + "loss": 16.6447, + "step": 44750 + }, + { + "epoch": 0.8093282494163063, + "grad_norm": 45.3125, + "learning_rate": 9.873542675391734e-06, + "loss": 17.0995, + "step": 44760 + }, + { + "epoch": 0.8095090644854342, + "grad_norm": 44.21875, + "learning_rate": 9.873514423085073e-06, + "loss": 16.6301, + "step": 44770 + }, + { + "epoch": 0.8096898795545621, + "grad_norm": 42.5625, + "learning_rate": 9.873486170778413e-06, + "loss": 17.1241, + "step": 44780 + }, + { + "epoch": 0.80987069462369, + "grad_norm": 45.625, + "learning_rate": 9.873457918471755e-06, + "loss": 16.5711, + "step": 44790 + }, + { + "epoch": 0.8100515096928178, + "grad_norm": 43.375, + "learning_rate": 9.873429666165096e-06, + "loss": 16.494, + "step": 44800 + }, + { + "epoch": 0.8102323247619456, + "grad_norm": 43.875, + "learning_rate": 9.873401413858435e-06, + "loss": 16.9783, + "step": 44810 + }, + { + "epoch": 0.8104131398310735, + "grad_norm": 42.09375, + "learning_rate": 9.873373161551776e-06, + "loss": 16.9719, + "step": 44820 + }, + { + "epoch": 0.8105939549002014, + "grad_norm": 41.125, + "learning_rate": 9.873344909245116e-06, + "loss": 16.8303, + "step": 44830 + }, + { + "epoch": 0.8107747699693293, + "grad_norm": 43.96875, + "learning_rate": 9.873316656938457e-06, + "loss": 16.6135, + "step": 44840 + }, + { + "epoch": 0.8109555850384571, + "grad_norm": 47.21875, + "learning_rate": 9.873288404631797e-06, + "loss": 17.2661, + "step": 44850 + }, + { + "epoch": 0.8111364001075849, + "grad_norm": 43.09375, + "learning_rate": 9.873260152325136e-06, + "loss": 16.436, + "step": 44860 + }, + { + "epoch": 0.8113172151767128, + "grad_norm": 40.4375, + "learning_rate": 9.873231900018477e-06, + "loss": 16.6702, + "step": 44870 + }, + { + "epoch": 0.8114980302458407, + "grad_norm": 42.25, + "learning_rate": 9.87320364771182e-06, + "loss": 16.866, + "step": 44880 + }, + { + "epoch": 0.8116788453149686, + "grad_norm": 43.34375, + "learning_rate": 9.873175395405158e-06, + "loss": 16.8068, + "step": 44890 + }, + { + "epoch": 0.8118596603840964, + "grad_norm": 42.4375, + "learning_rate": 9.873147143098499e-06, + "loss": 16.5403, + "step": 44900 + }, + { + "epoch": 0.8120404754532242, + "grad_norm": 42.21875, + "learning_rate": 9.87311889079184e-06, + "loss": 17.0165, + "step": 44910 + }, + { + "epoch": 0.8122212905223521, + "grad_norm": 44.25, + "learning_rate": 9.87309063848518e-06, + "loss": 16.3663, + "step": 44920 + }, + { + "epoch": 0.81240210559148, + "grad_norm": 42.4375, + "learning_rate": 9.87306238617852e-06, + "loss": 16.5132, + "step": 44930 + }, + { + "epoch": 0.8125829206606079, + "grad_norm": 42.21875, + "learning_rate": 9.87303413387186e-06, + "loss": 16.7811, + "step": 44940 + }, + { + "epoch": 0.8127637357297357, + "grad_norm": 47.28125, + "learning_rate": 9.8730058815652e-06, + "loss": 17.3034, + "step": 44950 + }, + { + "epoch": 0.8129445507988636, + "grad_norm": 42.6875, + "learning_rate": 9.872977629258541e-06, + "loss": 16.4767, + "step": 44960 + }, + { + "epoch": 0.8131253658679914, + "grad_norm": 42.125, + "learning_rate": 9.872949376951883e-06, + "loss": 17.1173, + "step": 44970 + }, + { + "epoch": 0.8133061809371193, + "grad_norm": 43.46875, + "learning_rate": 9.872921124645222e-06, + "loss": 16.4008, + "step": 44980 + }, + { + "epoch": 0.8134869960062472, + "grad_norm": 43.09375, + "learning_rate": 9.872892872338563e-06, + "loss": 16.4453, + "step": 44990 + }, + { + "epoch": 0.813667811075375, + "grad_norm": 45.5, + "learning_rate": 9.872864620031903e-06, + "loss": 16.954, + "step": 45000 + }, + { + "epoch": 0.813667811075375, + "eval_loss": 2.1011931896209717, + "eval_runtime": 229.8768, + "eval_samples_per_second": 3158.471, + "eval_steps_per_second": 49.353, + "step": 45000 + }, + { + "epoch": 0.8138486261445029, + "grad_norm": 41.6875, + "learning_rate": 9.872836367725244e-06, + "loss": 16.1717, + "step": 45010 + }, + { + "epoch": 0.8140294412136307, + "grad_norm": 40.21875, + "learning_rate": 9.872808115418585e-06, + "loss": 16.7626, + "step": 45020 + }, + { + "epoch": 0.8142102562827586, + "grad_norm": 41.25, + "learning_rate": 9.872779863111924e-06, + "loss": 17.0021, + "step": 45030 + }, + { + "epoch": 0.8143910713518865, + "grad_norm": 43.40625, + "learning_rate": 9.872751610805264e-06, + "loss": 16.884, + "step": 45040 + }, + { + "epoch": 0.8145718864210143, + "grad_norm": 42.0625, + "learning_rate": 9.872723358498607e-06, + "loss": 16.9262, + "step": 45050 + }, + { + "epoch": 0.8147527014901422, + "grad_norm": 44.0625, + "learning_rate": 9.872695106191945e-06, + "loss": 17.088, + "step": 45060 + }, + { + "epoch": 0.81493351655927, + "grad_norm": 45.5625, + "learning_rate": 9.872666853885286e-06, + "loss": 16.5677, + "step": 45070 + }, + { + "epoch": 0.8151143316283979, + "grad_norm": 44.6875, + "learning_rate": 9.872638601578627e-06, + "loss": 16.9741, + "step": 45080 + }, + { + "epoch": 0.8152951466975258, + "grad_norm": 42.40625, + "learning_rate": 9.872610349271967e-06, + "loss": 16.5187, + "step": 45090 + }, + { + "epoch": 0.8154759617666536, + "grad_norm": 41.84375, + "learning_rate": 9.872582096965308e-06, + "loss": 16.83, + "step": 45100 + }, + { + "epoch": 0.8156567768357815, + "grad_norm": 45.0, + "learning_rate": 9.872553844658649e-06, + "loss": 16.7083, + "step": 45110 + }, + { + "epoch": 0.8158375919049093, + "grad_norm": 46.1875, + "learning_rate": 9.872525592351988e-06, + "loss": 17.0987, + "step": 45120 + }, + { + "epoch": 0.8160184069740373, + "grad_norm": 45.625, + "learning_rate": 9.872497340045328e-06, + "loss": 16.7546, + "step": 45130 + }, + { + "epoch": 0.8161992220431651, + "grad_norm": 45.3125, + "learning_rate": 9.87246908773867e-06, + "loss": 17.1085, + "step": 45140 + }, + { + "epoch": 0.8163800371122929, + "grad_norm": 43.5, + "learning_rate": 9.87244083543201e-06, + "loss": 16.7036, + "step": 45150 + }, + { + "epoch": 0.8165608521814208, + "grad_norm": 45.78125, + "learning_rate": 9.87241258312535e-06, + "loss": 16.5432, + "step": 45160 + }, + { + "epoch": 0.8167416672505486, + "grad_norm": 41.1875, + "learning_rate": 9.87238433081869e-06, + "loss": 16.7783, + "step": 45170 + }, + { + "epoch": 0.8169224823196766, + "grad_norm": 44.875, + "learning_rate": 9.872356078512031e-06, + "loss": 17.3186, + "step": 45180 + }, + { + "epoch": 0.8171032973888044, + "grad_norm": 45.9375, + "learning_rate": 9.872327826205372e-06, + "loss": 17.2104, + "step": 45190 + }, + { + "epoch": 0.8172841124579322, + "grad_norm": 42.28125, + "learning_rate": 9.87229957389871e-06, + "loss": 16.3376, + "step": 45200 + }, + { + "epoch": 0.8174649275270601, + "grad_norm": 42.46875, + "learning_rate": 9.872271321592051e-06, + "loss": 16.9167, + "step": 45210 + }, + { + "epoch": 0.8176457425961879, + "grad_norm": 47.34375, + "learning_rate": 9.872243069285392e-06, + "loss": 17.1726, + "step": 45220 + }, + { + "epoch": 0.8178265576653159, + "grad_norm": 43.21875, + "learning_rate": 9.872214816978734e-06, + "loss": 16.9629, + "step": 45230 + }, + { + "epoch": 0.8180073727344437, + "grad_norm": 46.375, + "learning_rate": 9.872186564672073e-06, + "loss": 16.6152, + "step": 45240 + }, + { + "epoch": 0.8181881878035715, + "grad_norm": 41.53125, + "learning_rate": 9.872158312365414e-06, + "loss": 16.5411, + "step": 45250 + }, + { + "epoch": 0.8183690028726994, + "grad_norm": 41.28125, + "learning_rate": 9.872130060058755e-06, + "loss": 16.7691, + "step": 45260 + }, + { + "epoch": 0.8185498179418272, + "grad_norm": 43.5, + "learning_rate": 9.872101807752095e-06, + "loss": 16.5727, + "step": 45270 + }, + { + "epoch": 0.8187306330109552, + "grad_norm": 42.9375, + "learning_rate": 9.872073555445436e-06, + "loss": 16.6411, + "step": 45280 + }, + { + "epoch": 0.818911448080083, + "grad_norm": 41.6875, + "learning_rate": 9.872045303138775e-06, + "loss": 16.804, + "step": 45290 + }, + { + "epoch": 0.8190922631492109, + "grad_norm": 44.71875, + "learning_rate": 9.872017050832115e-06, + "loss": 16.286, + "step": 45300 + }, + { + "epoch": 0.8192730782183387, + "grad_norm": 43.8125, + "learning_rate": 9.871988798525456e-06, + "loss": 16.8301, + "step": 45310 + }, + { + "epoch": 0.8194538932874665, + "grad_norm": 40.5625, + "learning_rate": 9.871960546218797e-06, + "loss": 16.8716, + "step": 45320 + }, + { + "epoch": 0.8196347083565945, + "grad_norm": 42.78125, + "learning_rate": 9.871932293912137e-06, + "loss": 16.4862, + "step": 45330 + }, + { + "epoch": 0.8198155234257223, + "grad_norm": 46.46875, + "learning_rate": 9.871904041605478e-06, + "loss": 16.9789, + "step": 45340 + }, + { + "epoch": 0.8199963384948502, + "grad_norm": 41.34375, + "learning_rate": 9.871875789298818e-06, + "loss": 16.7286, + "step": 45350 + }, + { + "epoch": 0.820177153563978, + "grad_norm": 43.5625, + "learning_rate": 9.871847536992159e-06, + "loss": 16.843, + "step": 45360 + }, + { + "epoch": 0.8203579686331058, + "grad_norm": 43.3125, + "learning_rate": 9.871819284685498e-06, + "loss": 16.6393, + "step": 45370 + }, + { + "epoch": 0.8205387837022338, + "grad_norm": 48.25, + "learning_rate": 9.871791032378839e-06, + "loss": 16.6034, + "step": 45380 + }, + { + "epoch": 0.8207195987713616, + "grad_norm": 43.90625, + "learning_rate": 9.87176278007218e-06, + "loss": 17.0632, + "step": 45390 + }, + { + "epoch": 0.8209004138404895, + "grad_norm": 44.375, + "learning_rate": 9.871734527765522e-06, + "loss": 16.6226, + "step": 45400 + }, + { + "epoch": 0.8210812289096173, + "grad_norm": 45.71875, + "learning_rate": 9.87170627545886e-06, + "loss": 16.8551, + "step": 45410 + }, + { + "epoch": 0.8212620439787451, + "grad_norm": 43.25, + "learning_rate": 9.871678023152201e-06, + "loss": 17.2683, + "step": 45420 + }, + { + "epoch": 0.8214428590478731, + "grad_norm": 39.3125, + "learning_rate": 9.871649770845542e-06, + "loss": 17.3095, + "step": 45430 + }, + { + "epoch": 0.8216236741170009, + "grad_norm": 42.1875, + "learning_rate": 9.871621518538882e-06, + "loss": 16.9099, + "step": 45440 + }, + { + "epoch": 0.8218044891861288, + "grad_norm": 42.25, + "learning_rate": 9.871593266232223e-06, + "loss": 16.8983, + "step": 45450 + }, + { + "epoch": 0.8219853042552566, + "grad_norm": 44.3125, + "learning_rate": 9.871565013925562e-06, + "loss": 16.3534, + "step": 45460 + }, + { + "epoch": 0.8221661193243845, + "grad_norm": 42.5, + "learning_rate": 9.871536761618903e-06, + "loss": 16.9496, + "step": 45470 + }, + { + "epoch": 0.8223469343935124, + "grad_norm": 44.21875, + "learning_rate": 9.871508509312243e-06, + "loss": 16.7329, + "step": 45480 + }, + { + "epoch": 0.8225277494626402, + "grad_norm": 43.78125, + "learning_rate": 9.871480257005584e-06, + "loss": 16.4478, + "step": 45490 + }, + { + "epoch": 0.8227085645317681, + "grad_norm": 46.59375, + "learning_rate": 9.871452004698924e-06, + "loss": 16.8804, + "step": 45500 + }, + { + "epoch": 0.8228893796008959, + "grad_norm": 41.25, + "learning_rate": 9.871423752392265e-06, + "loss": 16.4803, + "step": 45510 + }, + { + "epoch": 0.8230701946700238, + "grad_norm": 42.375, + "learning_rate": 9.871395500085606e-06, + "loss": 17.0666, + "step": 45520 + }, + { + "epoch": 0.8232510097391517, + "grad_norm": 42.03125, + "learning_rate": 9.871367247778946e-06, + "loss": 17.1151, + "step": 45530 + }, + { + "epoch": 0.8234318248082795, + "grad_norm": 41.0625, + "learning_rate": 9.871338995472285e-06, + "loss": 17.0039, + "step": 45540 + }, + { + "epoch": 0.8236126398774074, + "grad_norm": 45.0, + "learning_rate": 9.871310743165626e-06, + "loss": 16.9728, + "step": 45550 + }, + { + "epoch": 0.8237934549465352, + "grad_norm": 48.59375, + "learning_rate": 9.871282490858966e-06, + "loss": 17.0749, + "step": 45560 + }, + { + "epoch": 0.8239742700156631, + "grad_norm": 43.375, + "learning_rate": 9.871254238552307e-06, + "loss": 17.18, + "step": 45570 + }, + { + "epoch": 0.824155085084791, + "grad_norm": 41.09375, + "learning_rate": 9.871225986245648e-06, + "loss": 17.0381, + "step": 45580 + }, + { + "epoch": 0.8243359001539188, + "grad_norm": 42.9375, + "learning_rate": 9.871197733938988e-06, + "loss": 16.5253, + "step": 45590 + }, + { + "epoch": 0.8245167152230467, + "grad_norm": 42.28125, + "learning_rate": 9.871169481632329e-06, + "loss": 16.8323, + "step": 45600 + }, + { + "epoch": 0.8246975302921745, + "grad_norm": 40.5, + "learning_rate": 9.87114122932567e-06, + "loss": 16.5359, + "step": 45610 + }, + { + "epoch": 0.8248783453613024, + "grad_norm": 41.90625, + "learning_rate": 9.87111297701901e-06, + "loss": 16.7246, + "step": 45620 + }, + { + "epoch": 0.8250591604304303, + "grad_norm": 47.375, + "learning_rate": 9.871084724712349e-06, + "loss": 16.5078, + "step": 45630 + }, + { + "epoch": 0.8252399754995582, + "grad_norm": 43.8125, + "learning_rate": 9.87105647240569e-06, + "loss": 16.6933, + "step": 45640 + }, + { + "epoch": 0.825420790568686, + "grad_norm": 43.96875, + "learning_rate": 9.87102822009903e-06, + "loss": 16.3012, + "step": 45650 + }, + { + "epoch": 0.8256016056378138, + "grad_norm": 44.125, + "learning_rate": 9.870999967792371e-06, + "loss": 16.8273, + "step": 45660 + }, + { + "epoch": 0.8257824207069417, + "grad_norm": 42.65625, + "learning_rate": 9.870971715485712e-06, + "loss": 17.0442, + "step": 45670 + }, + { + "epoch": 0.8259632357760696, + "grad_norm": 43.1875, + "learning_rate": 9.870943463179052e-06, + "loss": 16.7705, + "step": 45680 + }, + { + "epoch": 0.8261440508451975, + "grad_norm": 42.0, + "learning_rate": 9.870915210872393e-06, + "loss": 16.8474, + "step": 45690 + }, + { + "epoch": 0.8263248659143253, + "grad_norm": 45.09375, + "learning_rate": 9.870886958565733e-06, + "loss": 16.8813, + "step": 45700 + }, + { + "epoch": 0.8265056809834531, + "grad_norm": 44.375, + "learning_rate": 9.870858706259074e-06, + "loss": 16.8363, + "step": 45710 + }, + { + "epoch": 0.826686496052581, + "grad_norm": 43.71875, + "learning_rate": 9.870830453952413e-06, + "loss": 16.4847, + "step": 45720 + }, + { + "epoch": 0.8268673111217089, + "grad_norm": 44.9375, + "learning_rate": 9.870802201645754e-06, + "loss": 16.3199, + "step": 45730 + }, + { + "epoch": 0.8270481261908368, + "grad_norm": 40.875, + "learning_rate": 9.870773949339094e-06, + "loss": 17.1142, + "step": 45740 + }, + { + "epoch": 0.8272289412599646, + "grad_norm": 42.46875, + "learning_rate": 9.870745697032435e-06, + "loss": 17.0783, + "step": 45750 + }, + { + "epoch": 0.8274097563290924, + "grad_norm": 45.25, + "learning_rate": 9.870717444725775e-06, + "loss": 16.8826, + "step": 45760 + }, + { + "epoch": 0.8275905713982203, + "grad_norm": 42.75, + "learning_rate": 9.870689192419116e-06, + "loss": 16.5429, + "step": 45770 + }, + { + "epoch": 0.8277713864673482, + "grad_norm": 42.28125, + "learning_rate": 9.870660940112457e-06, + "loss": 16.6929, + "step": 45780 + }, + { + "epoch": 0.8279522015364761, + "grad_norm": 43.5, + "learning_rate": 9.870632687805797e-06, + "loss": 17.2222, + "step": 45790 + }, + { + "epoch": 0.8281330166056039, + "grad_norm": 41.3125, + "learning_rate": 9.870604435499136e-06, + "loss": 17.0767, + "step": 45800 + }, + { + "epoch": 0.8283138316747318, + "grad_norm": 42.5625, + "learning_rate": 9.870576183192477e-06, + "loss": 16.9483, + "step": 45810 + }, + { + "epoch": 0.8284946467438596, + "grad_norm": 43.25, + "learning_rate": 9.870547930885818e-06, + "loss": 17.0716, + "step": 45820 + }, + { + "epoch": 0.8286754618129875, + "grad_norm": 42.375, + "learning_rate": 9.870519678579158e-06, + "loss": 16.5698, + "step": 45830 + }, + { + "epoch": 0.8288562768821154, + "grad_norm": 40.625, + "learning_rate": 9.870491426272499e-06, + "loss": 16.5512, + "step": 45840 + }, + { + "epoch": 0.8290370919512432, + "grad_norm": 43.65625, + "learning_rate": 9.87046317396584e-06, + "loss": 17.2864, + "step": 45850 + }, + { + "epoch": 0.8292179070203711, + "grad_norm": 43.125, + "learning_rate": 9.87043492165918e-06, + "loss": 17.0127, + "step": 45860 + }, + { + "epoch": 0.8293987220894989, + "grad_norm": 42.28125, + "learning_rate": 9.87040666935252e-06, + "loss": 16.7168, + "step": 45870 + }, + { + "epoch": 0.8295795371586268, + "grad_norm": 43.21875, + "learning_rate": 9.870378417045861e-06, + "loss": 16.454, + "step": 45880 + }, + { + "epoch": 0.8297603522277547, + "grad_norm": 43.78125, + "learning_rate": 9.8703501647392e-06, + "loss": 16.5775, + "step": 45890 + }, + { + "epoch": 0.8299411672968825, + "grad_norm": 41.6875, + "learning_rate": 9.87032191243254e-06, + "loss": 16.7222, + "step": 45900 + }, + { + "epoch": 0.8301219823660104, + "grad_norm": 42.4375, + "learning_rate": 9.870293660125881e-06, + "loss": 16.7991, + "step": 45910 + }, + { + "epoch": 0.8303027974351382, + "grad_norm": 42.46875, + "learning_rate": 9.870265407819222e-06, + "loss": 16.6125, + "step": 45920 + }, + { + "epoch": 0.830483612504266, + "grad_norm": 41.875, + "learning_rate": 9.870237155512563e-06, + "loss": 16.341, + "step": 45930 + }, + { + "epoch": 0.830664427573394, + "grad_norm": 42.25, + "learning_rate": 9.870208903205903e-06, + "loss": 17.3, + "step": 45940 + }, + { + "epoch": 0.8308452426425218, + "grad_norm": 43.34375, + "learning_rate": 9.870180650899244e-06, + "loss": 16.0941, + "step": 45950 + }, + { + "epoch": 0.8310260577116497, + "grad_norm": 44.5, + "learning_rate": 9.870152398592585e-06, + "loss": 16.6848, + "step": 45960 + }, + { + "epoch": 0.8312068727807775, + "grad_norm": 42.0, + "learning_rate": 9.870124146285923e-06, + "loss": 16.6799, + "step": 45970 + }, + { + "epoch": 0.8313876878499055, + "grad_norm": 42.34375, + "learning_rate": 9.870095893979264e-06, + "loss": 16.3178, + "step": 45980 + }, + { + "epoch": 0.8315685029190333, + "grad_norm": 41.96875, + "learning_rate": 9.870067641672605e-06, + "loss": 16.6089, + "step": 45990 + }, + { + "epoch": 0.8317493179881611, + "grad_norm": 44.15625, + "learning_rate": 9.870039389365945e-06, + "loss": 17.0249, + "step": 46000 + }, + { + "epoch": 0.831930133057289, + "grad_norm": 45.96875, + "learning_rate": 9.870011137059286e-06, + "loss": 16.8498, + "step": 46010 + }, + { + "epoch": 0.8321109481264168, + "grad_norm": 45.53125, + "learning_rate": 9.869982884752627e-06, + "loss": 16.647, + "step": 46020 + }, + { + "epoch": 0.8322917631955448, + "grad_norm": 43.25, + "learning_rate": 9.869954632445967e-06, + "loss": 16.6773, + "step": 46030 + }, + { + "epoch": 0.8324725782646726, + "grad_norm": 43.5625, + "learning_rate": 9.869926380139308e-06, + "loss": 16.4971, + "step": 46040 + }, + { + "epoch": 0.8326533933338004, + "grad_norm": 43.1875, + "learning_rate": 9.869898127832648e-06, + "loss": 16.3478, + "step": 46050 + }, + { + "epoch": 0.8328342084029283, + "grad_norm": 43.8125, + "learning_rate": 9.869869875525987e-06, + "loss": 16.8095, + "step": 46060 + }, + { + "epoch": 0.8330150234720561, + "grad_norm": 45.375, + "learning_rate": 9.869841623219328e-06, + "loss": 17.218, + "step": 46070 + }, + { + "epoch": 0.8331958385411841, + "grad_norm": 43.0625, + "learning_rate": 9.869813370912669e-06, + "loss": 16.6927, + "step": 46080 + }, + { + "epoch": 0.8333766536103119, + "grad_norm": 44.3125, + "learning_rate": 9.86978511860601e-06, + "loss": 16.999, + "step": 46090 + }, + { + "epoch": 0.8335574686794397, + "grad_norm": 47.84375, + "learning_rate": 9.86975686629935e-06, + "loss": 16.9362, + "step": 46100 + }, + { + "epoch": 0.8337382837485676, + "grad_norm": 47.09375, + "learning_rate": 9.86972861399269e-06, + "loss": 16.9018, + "step": 46110 + }, + { + "epoch": 0.8339190988176954, + "grad_norm": 49.65625, + "learning_rate": 9.869700361686031e-06, + "loss": 17.1598, + "step": 46120 + }, + { + "epoch": 0.8340999138868234, + "grad_norm": 44.375, + "learning_rate": 9.869672109379372e-06, + "loss": 16.3532, + "step": 46130 + }, + { + "epoch": 0.8342807289559512, + "grad_norm": 42.125, + "learning_rate": 9.869643857072712e-06, + "loss": 16.8197, + "step": 46140 + }, + { + "epoch": 0.8344615440250791, + "grad_norm": 43.375, + "learning_rate": 9.869615604766051e-06, + "loss": 16.5218, + "step": 46150 + }, + { + "epoch": 0.8346423590942069, + "grad_norm": 41.4375, + "learning_rate": 9.869587352459392e-06, + "loss": 16.7873, + "step": 46160 + }, + { + "epoch": 0.8348231741633347, + "grad_norm": 41.4375, + "learning_rate": 9.869559100152733e-06, + "loss": 16.6171, + "step": 46170 + }, + { + "epoch": 0.8350039892324627, + "grad_norm": 41.59375, + "learning_rate": 9.869530847846073e-06, + "loss": 16.6384, + "step": 46180 + }, + { + "epoch": 0.8351848043015905, + "grad_norm": 42.0, + "learning_rate": 9.869502595539414e-06, + "loss": 16.6035, + "step": 46190 + }, + { + "epoch": 0.8353656193707184, + "grad_norm": 44.96875, + "learning_rate": 9.869474343232754e-06, + "loss": 16.9676, + "step": 46200 + }, + { + "epoch": 0.8355464344398462, + "grad_norm": 42.96875, + "learning_rate": 9.869446090926095e-06, + "loss": 16.6818, + "step": 46210 + }, + { + "epoch": 0.835727249508974, + "grad_norm": 41.6875, + "learning_rate": 9.869417838619436e-06, + "loss": 16.6355, + "step": 46220 + }, + { + "epoch": 0.835908064578102, + "grad_norm": 44.5625, + "learning_rate": 9.869389586312775e-06, + "loss": 16.671, + "step": 46230 + }, + { + "epoch": 0.8360888796472298, + "grad_norm": 42.625, + "learning_rate": 9.869361334006115e-06, + "loss": 16.6258, + "step": 46240 + }, + { + "epoch": 0.8362696947163577, + "grad_norm": 43.3125, + "learning_rate": 9.869333081699456e-06, + "loss": 16.8311, + "step": 46250 + }, + { + "epoch": 0.8364505097854855, + "grad_norm": 40.5, + "learning_rate": 9.869304829392796e-06, + "loss": 16.7952, + "step": 46260 + }, + { + "epoch": 0.8366313248546133, + "grad_norm": 43.03125, + "learning_rate": 9.869276577086137e-06, + "loss": 16.7644, + "step": 46270 + }, + { + "epoch": 0.8368121399237413, + "grad_norm": 43.71875, + "learning_rate": 9.869248324779478e-06, + "loss": 16.9682, + "step": 46280 + }, + { + "epoch": 0.8369929549928691, + "grad_norm": 44.375, + "learning_rate": 9.869220072472818e-06, + "loss": 16.6565, + "step": 46290 + }, + { + "epoch": 0.837173770061997, + "grad_norm": 45.3125, + "learning_rate": 9.869191820166159e-06, + "loss": 16.4278, + "step": 46300 + }, + { + "epoch": 0.8373545851311248, + "grad_norm": 44.75, + "learning_rate": 9.8691635678595e-06, + "loss": 17.0361, + "step": 46310 + }, + { + "epoch": 0.8375354002002527, + "grad_norm": 41.8125, + "learning_rate": 9.869135315552838e-06, + "loss": 16.7528, + "step": 46320 + }, + { + "epoch": 0.8377162152693806, + "grad_norm": 43.84375, + "learning_rate": 9.869107063246179e-06, + "loss": 16.6941, + "step": 46330 + }, + { + "epoch": 0.8378970303385084, + "grad_norm": 41.625, + "learning_rate": 9.86907881093952e-06, + "loss": 16.5204, + "step": 46340 + }, + { + "epoch": 0.8380778454076363, + "grad_norm": 41.25, + "learning_rate": 9.86905055863286e-06, + "loss": 17.0408, + "step": 46350 + }, + { + "epoch": 0.8382586604767641, + "grad_norm": 40.8125, + "learning_rate": 9.869022306326201e-06, + "loss": 16.3497, + "step": 46360 + }, + { + "epoch": 0.838439475545892, + "grad_norm": 42.15625, + "learning_rate": 9.868994054019542e-06, + "loss": 16.5019, + "step": 46370 + }, + { + "epoch": 0.8386202906150199, + "grad_norm": 41.8125, + "learning_rate": 9.868965801712882e-06, + "loss": 16.9828, + "step": 46380 + }, + { + "epoch": 0.8388011056841477, + "grad_norm": 45.5, + "learning_rate": 9.868937549406223e-06, + "loss": 16.5874, + "step": 46390 + }, + { + "epoch": 0.8389819207532756, + "grad_norm": 44.9375, + "learning_rate": 9.868909297099562e-06, + "loss": 17.0737, + "step": 46400 + }, + { + "epoch": 0.8391627358224034, + "grad_norm": 40.84375, + "learning_rate": 9.868881044792902e-06, + "loss": 17.0793, + "step": 46410 + }, + { + "epoch": 0.8393435508915313, + "grad_norm": 40.0625, + "learning_rate": 9.868852792486243e-06, + "loss": 16.9732, + "step": 46420 + }, + { + "epoch": 0.8395243659606592, + "grad_norm": 41.78125, + "learning_rate": 9.868824540179584e-06, + "loss": 17.0003, + "step": 46430 + }, + { + "epoch": 0.839705181029787, + "grad_norm": 41.96875, + "learning_rate": 9.868796287872924e-06, + "loss": 16.6613, + "step": 46440 + }, + { + "epoch": 0.8398859960989149, + "grad_norm": 43.78125, + "learning_rate": 9.868768035566265e-06, + "loss": 16.735, + "step": 46450 + }, + { + "epoch": 0.8400668111680427, + "grad_norm": 44.90625, + "learning_rate": 9.868739783259605e-06, + "loss": 16.3271, + "step": 46460 + }, + { + "epoch": 0.8402476262371706, + "grad_norm": 42.40625, + "learning_rate": 9.868711530952946e-06, + "loss": 16.5614, + "step": 46470 + }, + { + "epoch": 0.8404284413062985, + "grad_norm": 43.84375, + "learning_rate": 9.868683278646287e-06, + "loss": 16.7678, + "step": 46480 + }, + { + "epoch": 0.8406092563754263, + "grad_norm": 44.21875, + "learning_rate": 9.868655026339626e-06, + "loss": 16.8397, + "step": 46490 + }, + { + "epoch": 0.8407900714445542, + "grad_norm": 41.28125, + "learning_rate": 9.868626774032966e-06, + "loss": 16.9057, + "step": 46500 + }, + { + "epoch": 0.840970886513682, + "grad_norm": 42.09375, + "learning_rate": 9.868598521726307e-06, + "loss": 16.3798, + "step": 46510 + }, + { + "epoch": 0.8411517015828099, + "grad_norm": 42.3125, + "learning_rate": 9.868570269419648e-06, + "loss": 16.7016, + "step": 46520 + }, + { + "epoch": 0.8413325166519378, + "grad_norm": 44.09375, + "learning_rate": 9.868542017112988e-06, + "loss": 16.4862, + "step": 46530 + }, + { + "epoch": 0.8415133317210657, + "grad_norm": 45.59375, + "learning_rate": 9.868513764806329e-06, + "loss": 16.9339, + "step": 46540 + }, + { + "epoch": 0.8416941467901935, + "grad_norm": 44.8125, + "learning_rate": 9.86848551249967e-06, + "loss": 17.1162, + "step": 46550 + }, + { + "epoch": 0.8418749618593213, + "grad_norm": 43.96875, + "learning_rate": 9.86845726019301e-06, + "loss": 17.1667, + "step": 46560 + }, + { + "epoch": 0.8420557769284492, + "grad_norm": 43.21875, + "learning_rate": 9.86842900788635e-06, + "loss": 16.5697, + "step": 46570 + }, + { + "epoch": 0.842236591997577, + "grad_norm": 41.125, + "learning_rate": 9.86840075557969e-06, + "loss": 16.86, + "step": 46580 + }, + { + "epoch": 0.842417407066705, + "grad_norm": 41.34375, + "learning_rate": 9.86837250327303e-06, + "loss": 16.8571, + "step": 46590 + }, + { + "epoch": 0.8425982221358328, + "grad_norm": 45.9375, + "learning_rate": 9.86834425096637e-06, + "loss": 17.0934, + "step": 46600 + }, + { + "epoch": 0.8427790372049606, + "grad_norm": 40.28125, + "learning_rate": 9.868315998659711e-06, + "loss": 17.0367, + "step": 46610 + }, + { + "epoch": 0.8429598522740885, + "grad_norm": 43.9375, + "learning_rate": 9.868287746353052e-06, + "loss": 16.933, + "step": 46620 + }, + { + "epoch": 0.8431406673432164, + "grad_norm": 45.53125, + "learning_rate": 9.868259494046393e-06, + "loss": 17.0869, + "step": 46630 + }, + { + "epoch": 0.8433214824123443, + "grad_norm": 42.625, + "learning_rate": 9.868231241739733e-06, + "loss": 16.5878, + "step": 46640 + }, + { + "epoch": 0.8435022974814721, + "grad_norm": 42.9375, + "learning_rate": 9.868202989433074e-06, + "loss": 16.6963, + "step": 46650 + }, + { + "epoch": 0.8436831125505999, + "grad_norm": 43.3125, + "learning_rate": 9.868174737126413e-06, + "loss": 17.0261, + "step": 46660 + }, + { + "epoch": 0.8438639276197278, + "grad_norm": 42.03125, + "learning_rate": 9.868146484819753e-06, + "loss": 16.5386, + "step": 46670 + }, + { + "epoch": 0.8440447426888557, + "grad_norm": 40.4375, + "learning_rate": 9.868118232513094e-06, + "loss": 16.9873, + "step": 46680 + }, + { + "epoch": 0.8442255577579836, + "grad_norm": 41.1875, + "learning_rate": 9.868089980206435e-06, + "loss": 16.8286, + "step": 46690 + }, + { + "epoch": 0.8444063728271114, + "grad_norm": 40.84375, + "learning_rate": 9.868061727899775e-06, + "loss": 16.9142, + "step": 46700 + }, + { + "epoch": 0.8445871878962393, + "grad_norm": 41.625, + "learning_rate": 9.868033475593114e-06, + "loss": 16.7071, + "step": 46710 + }, + { + "epoch": 0.8447680029653671, + "grad_norm": 41.03125, + "learning_rate": 9.868005223286457e-06, + "loss": 17.1076, + "step": 46720 + }, + { + "epoch": 0.844948818034495, + "grad_norm": 43.28125, + "learning_rate": 9.867976970979797e-06, + "loss": 16.5466, + "step": 46730 + }, + { + "epoch": 0.8451296331036229, + "grad_norm": 44.875, + "learning_rate": 9.867948718673138e-06, + "loss": 17.1935, + "step": 46740 + }, + { + "epoch": 0.8453104481727507, + "grad_norm": 42.96875, + "learning_rate": 9.867920466366477e-06, + "loss": 16.4253, + "step": 46750 + }, + { + "epoch": 0.8454912632418786, + "grad_norm": 47.0, + "learning_rate": 9.867892214059817e-06, + "loss": 16.9154, + "step": 46760 + }, + { + "epoch": 0.8456720783110064, + "grad_norm": 45.59375, + "learning_rate": 9.867863961753158e-06, + "loss": 16.6339, + "step": 46770 + }, + { + "epoch": 0.8458528933801343, + "grad_norm": 40.65625, + "learning_rate": 9.867835709446499e-06, + "loss": 16.3904, + "step": 46780 + }, + { + "epoch": 0.8460337084492622, + "grad_norm": 42.59375, + "learning_rate": 9.86780745713984e-06, + "loss": 16.8356, + "step": 46790 + }, + { + "epoch": 0.84621452351839, + "grad_norm": 45.6875, + "learning_rate": 9.867779204833178e-06, + "loss": 16.9262, + "step": 46800 + }, + { + "epoch": 0.8463953385875179, + "grad_norm": 42.8125, + "learning_rate": 9.86775095252652e-06, + "loss": 17.0825, + "step": 46810 + }, + { + "epoch": 0.8465761536566457, + "grad_norm": 42.0625, + "learning_rate": 9.867722700219861e-06, + "loss": 16.8047, + "step": 46820 + }, + { + "epoch": 0.8467569687257736, + "grad_norm": 43.9375, + "learning_rate": 9.8676944479132e-06, + "loss": 16.5422, + "step": 46830 + }, + { + "epoch": 0.8469377837949015, + "grad_norm": 44.3125, + "learning_rate": 9.86766619560654e-06, + "loss": 16.8578, + "step": 46840 + }, + { + "epoch": 0.8471185988640293, + "grad_norm": 42.9375, + "learning_rate": 9.867637943299881e-06, + "loss": 16.5696, + "step": 46850 + }, + { + "epoch": 0.8472994139331572, + "grad_norm": 47.4375, + "learning_rate": 9.867609690993222e-06, + "loss": 16.7412, + "step": 46860 + }, + { + "epoch": 0.847480229002285, + "grad_norm": 43.46875, + "learning_rate": 9.867581438686563e-06, + "loss": 16.9719, + "step": 46870 + }, + { + "epoch": 0.847661044071413, + "grad_norm": 43.75, + "learning_rate": 9.867553186379903e-06, + "loss": 17.0033, + "step": 46880 + }, + { + "epoch": 0.8478418591405408, + "grad_norm": 44.4375, + "learning_rate": 9.867524934073244e-06, + "loss": 16.7468, + "step": 46890 + }, + { + "epoch": 0.8480226742096686, + "grad_norm": 43.0625, + "learning_rate": 9.867496681766584e-06, + "loss": 16.8236, + "step": 46900 + }, + { + "epoch": 0.8482034892787965, + "grad_norm": 44.125, + "learning_rate": 9.867468429459925e-06, + "loss": 16.9039, + "step": 46910 + }, + { + "epoch": 0.8483843043479243, + "grad_norm": 42.0, + "learning_rate": 9.867440177153264e-06, + "loss": 16.8074, + "step": 46920 + }, + { + "epoch": 0.8485651194170523, + "grad_norm": 40.90625, + "learning_rate": 9.867411924846605e-06, + "loss": 16.4367, + "step": 46930 + }, + { + "epoch": 0.8487459344861801, + "grad_norm": 45.90625, + "learning_rate": 9.867383672539945e-06, + "loss": 17.0261, + "step": 46940 + }, + { + "epoch": 0.8489267495553079, + "grad_norm": 44.40625, + "learning_rate": 9.867355420233286e-06, + "loss": 16.4444, + "step": 46950 + }, + { + "epoch": 0.8491075646244358, + "grad_norm": 42.25, + "learning_rate": 9.867327167926626e-06, + "loss": 16.2641, + "step": 46960 + }, + { + "epoch": 0.8492883796935636, + "grad_norm": 42.5625, + "learning_rate": 9.867298915619965e-06, + "loss": 16.3875, + "step": 46970 + }, + { + "epoch": 0.8494691947626916, + "grad_norm": 44.8125, + "learning_rate": 9.867270663313308e-06, + "loss": 16.8093, + "step": 46980 + }, + { + "epoch": 0.8496500098318194, + "grad_norm": 43.6875, + "learning_rate": 9.867242411006648e-06, + "loss": 16.3896, + "step": 46990 + }, + { + "epoch": 0.8498308249009472, + "grad_norm": 44.40625, + "learning_rate": 9.867214158699989e-06, + "loss": 16.6276, + "step": 47000 + }, + { + "epoch": 0.8500116399700751, + "grad_norm": 41.5625, + "learning_rate": 9.867185906393328e-06, + "loss": 16.7573, + "step": 47010 + }, + { + "epoch": 0.8501924550392029, + "grad_norm": 41.5625, + "learning_rate": 9.867157654086668e-06, + "loss": 17.2952, + "step": 47020 + }, + { + "epoch": 0.8503732701083309, + "grad_norm": 42.25, + "learning_rate": 9.867129401780009e-06, + "loss": 17.0923, + "step": 47030 + }, + { + "epoch": 0.8505540851774587, + "grad_norm": 41.78125, + "learning_rate": 9.86710114947335e-06, + "loss": 16.9049, + "step": 47040 + }, + { + "epoch": 0.8507349002465866, + "grad_norm": 41.78125, + "learning_rate": 9.86707289716669e-06, + "loss": 16.667, + "step": 47050 + }, + { + "epoch": 0.8509157153157144, + "grad_norm": 43.09375, + "learning_rate": 9.86704464486003e-06, + "loss": 16.6055, + "step": 47060 + }, + { + "epoch": 0.8510965303848422, + "grad_norm": 42.40625, + "learning_rate": 9.867016392553372e-06, + "loss": 17.369, + "step": 47070 + }, + { + "epoch": 0.8512773454539702, + "grad_norm": 44.5625, + "learning_rate": 9.866988140246712e-06, + "loss": 16.3215, + "step": 47080 + }, + { + "epoch": 0.851458160523098, + "grad_norm": 43.90625, + "learning_rate": 9.866959887940051e-06, + "loss": 16.7921, + "step": 47090 + }, + { + "epoch": 0.8516389755922259, + "grad_norm": 45.53125, + "learning_rate": 9.866931635633392e-06, + "loss": 16.9453, + "step": 47100 + }, + { + "epoch": 0.8518197906613537, + "grad_norm": 44.0, + "learning_rate": 9.866903383326732e-06, + "loss": 17.1338, + "step": 47110 + }, + { + "epoch": 0.8520006057304815, + "grad_norm": 40.0625, + "learning_rate": 9.866875131020073e-06, + "loss": 16.6403, + "step": 47120 + }, + { + "epoch": 0.8521814207996095, + "grad_norm": 44.21875, + "learning_rate": 9.866846878713414e-06, + "loss": 16.5787, + "step": 47130 + }, + { + "epoch": 0.8523622358687373, + "grad_norm": 47.53125, + "learning_rate": 9.866818626406753e-06, + "loss": 16.6966, + "step": 47140 + }, + { + "epoch": 0.8525430509378652, + "grad_norm": 45.90625, + "learning_rate": 9.866790374100093e-06, + "loss": 16.2386, + "step": 47150 + }, + { + "epoch": 0.852723866006993, + "grad_norm": 42.59375, + "learning_rate": 9.866762121793436e-06, + "loss": 16.7593, + "step": 47160 + }, + { + "epoch": 0.8529046810761208, + "grad_norm": 44.3125, + "learning_rate": 9.866733869486776e-06, + "loss": 16.6689, + "step": 47170 + }, + { + "epoch": 0.8530854961452488, + "grad_norm": 46.3125, + "learning_rate": 9.866705617180115e-06, + "loss": 16.603, + "step": 47180 + }, + { + "epoch": 0.8532663112143766, + "grad_norm": 42.625, + "learning_rate": 9.866677364873456e-06, + "loss": 16.7621, + "step": 47190 + }, + { + "epoch": 0.8534471262835045, + "grad_norm": 43.25, + "learning_rate": 9.866649112566796e-06, + "loss": 16.6054, + "step": 47200 + }, + { + "epoch": 0.8536279413526323, + "grad_norm": 44.1875, + "learning_rate": 9.866620860260137e-06, + "loss": 16.8067, + "step": 47210 + }, + { + "epoch": 0.8538087564217602, + "grad_norm": 44.40625, + "learning_rate": 9.866592607953478e-06, + "loss": 16.8855, + "step": 47220 + }, + { + "epoch": 0.853989571490888, + "grad_norm": 41.1875, + "learning_rate": 9.866564355646816e-06, + "loss": 16.7962, + "step": 47230 + }, + { + "epoch": 0.8541703865600159, + "grad_norm": 40.78125, + "learning_rate": 9.866536103340159e-06, + "loss": 16.797, + "step": 47240 + }, + { + "epoch": 0.8543512016291438, + "grad_norm": 46.96875, + "learning_rate": 9.8665078510335e-06, + "loss": 16.5223, + "step": 47250 + }, + { + "epoch": 0.8545320166982716, + "grad_norm": 45.8125, + "learning_rate": 9.866479598726838e-06, + "loss": 16.5319, + "step": 47260 + }, + { + "epoch": 0.8547128317673995, + "grad_norm": 45.53125, + "learning_rate": 9.866451346420179e-06, + "loss": 16.6102, + "step": 47270 + }, + { + "epoch": 0.8548936468365274, + "grad_norm": 43.78125, + "learning_rate": 9.86642309411352e-06, + "loss": 16.6636, + "step": 47280 + }, + { + "epoch": 0.8550744619056552, + "grad_norm": 43.03125, + "learning_rate": 9.86639484180686e-06, + "loss": 16.6404, + "step": 47290 + }, + { + "epoch": 0.8552552769747831, + "grad_norm": 42.9375, + "learning_rate": 9.8663665895002e-06, + "loss": 16.7613, + "step": 47300 + }, + { + "epoch": 0.8554360920439109, + "grad_norm": 41.75, + "learning_rate": 9.866338337193541e-06, + "loss": 17.157, + "step": 47310 + }, + { + "epoch": 0.8556169071130388, + "grad_norm": 46.71875, + "learning_rate": 9.86631008488688e-06, + "loss": 17.1103, + "step": 47320 + }, + { + "epoch": 0.8557977221821667, + "grad_norm": 43.90625, + "learning_rate": 9.866281832580223e-06, + "loss": 16.653, + "step": 47330 + }, + { + "epoch": 0.8559785372512945, + "grad_norm": 43.75, + "learning_rate": 9.866253580273563e-06, + "loss": 17.0226, + "step": 47340 + }, + { + "epoch": 0.8561593523204224, + "grad_norm": 42.0625, + "learning_rate": 9.866225327966902e-06, + "loss": 16.7507, + "step": 47350 + }, + { + "epoch": 0.8563401673895502, + "grad_norm": 44.90625, + "learning_rate": 9.866197075660243e-06, + "loss": 16.1328, + "step": 47360 + }, + { + "epoch": 0.8565209824586781, + "grad_norm": 43.3125, + "learning_rate": 9.866168823353583e-06, + "loss": 16.8316, + "step": 47370 + }, + { + "epoch": 0.856701797527806, + "grad_norm": 41.5, + "learning_rate": 9.866140571046924e-06, + "loss": 16.9856, + "step": 47380 + }, + { + "epoch": 0.8568826125969339, + "grad_norm": 45.125, + "learning_rate": 9.866112318740265e-06, + "loss": 17.0456, + "step": 47390 + }, + { + "epoch": 0.8570634276660617, + "grad_norm": 43.46875, + "learning_rate": 9.866084066433604e-06, + "loss": 17.0106, + "step": 47400 + }, + { + "epoch": 0.8572442427351895, + "grad_norm": 40.4375, + "learning_rate": 9.866055814126944e-06, + "loss": 16.3447, + "step": 47410 + }, + { + "epoch": 0.8574250578043174, + "grad_norm": 45.0625, + "learning_rate": 9.866027561820287e-06, + "loss": 17.0897, + "step": 47420 + }, + { + "epoch": 0.8576058728734453, + "grad_norm": 42.21875, + "learning_rate": 9.865999309513627e-06, + "loss": 16.8414, + "step": 47430 + }, + { + "epoch": 0.8577866879425732, + "grad_norm": 44.03125, + "learning_rate": 9.865971057206966e-06, + "loss": 16.7726, + "step": 47440 + }, + { + "epoch": 0.857967503011701, + "grad_norm": 43.09375, + "learning_rate": 9.865942804900307e-06, + "loss": 16.839, + "step": 47450 + }, + { + "epoch": 0.8581483180808288, + "grad_norm": 44.1875, + "learning_rate": 9.865914552593647e-06, + "loss": 16.4288, + "step": 47460 + }, + { + "epoch": 0.8583291331499567, + "grad_norm": 42.90625, + "learning_rate": 9.865886300286988e-06, + "loss": 16.6905, + "step": 47470 + }, + { + "epoch": 0.8585099482190846, + "grad_norm": 43.34375, + "learning_rate": 9.865858047980329e-06, + "loss": 16.5618, + "step": 47480 + }, + { + "epoch": 0.8586907632882125, + "grad_norm": 46.125, + "learning_rate": 9.865829795673668e-06, + "loss": 16.7685, + "step": 47490 + }, + { + "epoch": 0.8588715783573403, + "grad_norm": 42.5, + "learning_rate": 9.865801543367008e-06, + "loss": 16.5453, + "step": 47500 + }, + { + "epoch": 0.8590523934264681, + "grad_norm": 40.78125, + "learning_rate": 9.86577329106035e-06, + "loss": 16.8157, + "step": 47510 + }, + { + "epoch": 0.859233208495596, + "grad_norm": 45.9375, + "learning_rate": 9.86574503875369e-06, + "loss": 16.924, + "step": 47520 + }, + { + "epoch": 0.8594140235647239, + "grad_norm": 42.09375, + "learning_rate": 9.86571678644703e-06, + "loss": 16.8491, + "step": 47530 + }, + { + "epoch": 0.8595948386338518, + "grad_norm": 42.875, + "learning_rate": 9.86568853414037e-06, + "loss": 16.7195, + "step": 47540 + }, + { + "epoch": 0.8597756537029796, + "grad_norm": 47.21875, + "learning_rate": 9.865660281833711e-06, + "loss": 16.9705, + "step": 47550 + }, + { + "epoch": 0.8599564687721075, + "grad_norm": 43.375, + "learning_rate": 9.865632029527052e-06, + "loss": 16.7851, + "step": 47560 + }, + { + "epoch": 0.8601372838412353, + "grad_norm": 43.625, + "learning_rate": 9.865603777220391e-06, + "loss": 17.0512, + "step": 47570 + }, + { + "epoch": 0.8603180989103631, + "grad_norm": 43.34375, + "learning_rate": 9.865575524913731e-06, + "loss": 16.9528, + "step": 47580 + }, + { + "epoch": 0.8604989139794911, + "grad_norm": 45.21875, + "learning_rate": 9.865547272607074e-06, + "loss": 16.8987, + "step": 47590 + }, + { + "epoch": 0.8606797290486189, + "grad_norm": 41.9375, + "learning_rate": 9.865519020300414e-06, + "loss": 16.7751, + "step": 47600 + }, + { + "epoch": 0.8608605441177468, + "grad_norm": 45.34375, + "learning_rate": 9.865490767993753e-06, + "loss": 16.5027, + "step": 47610 + }, + { + "epoch": 0.8610413591868746, + "grad_norm": 45.4375, + "learning_rate": 9.865462515687094e-06, + "loss": 17.264, + "step": 47620 + }, + { + "epoch": 0.8612221742560024, + "grad_norm": 44.40625, + "learning_rate": 9.865434263380435e-06, + "loss": 16.5418, + "step": 47630 + }, + { + "epoch": 0.8614029893251304, + "grad_norm": 44.875, + "learning_rate": 9.865406011073775e-06, + "loss": 16.6337, + "step": 47640 + }, + { + "epoch": 0.8615838043942582, + "grad_norm": 43.46875, + "learning_rate": 9.865377758767116e-06, + "loss": 16.822, + "step": 47650 + }, + { + "epoch": 0.8617646194633861, + "grad_norm": 43.15625, + "learning_rate": 9.865349506460455e-06, + "loss": 17.0387, + "step": 47660 + }, + { + "epoch": 0.8619454345325139, + "grad_norm": 40.4375, + "learning_rate": 9.865321254153795e-06, + "loss": 16.4247, + "step": 47670 + }, + { + "epoch": 0.8621262496016417, + "grad_norm": 40.40625, + "learning_rate": 9.865293001847138e-06, + "loss": 16.9979, + "step": 47680 + }, + { + "epoch": 0.8623070646707697, + "grad_norm": 42.25, + "learning_rate": 9.865264749540477e-06, + "loss": 16.5828, + "step": 47690 + }, + { + "epoch": 0.8624878797398975, + "grad_norm": 42.53125, + "learning_rate": 9.865236497233817e-06, + "loss": 16.8847, + "step": 47700 + }, + { + "epoch": 0.8626686948090254, + "grad_norm": 44.09375, + "learning_rate": 9.865208244927158e-06, + "loss": 16.9782, + "step": 47710 + }, + { + "epoch": 0.8628495098781532, + "grad_norm": 44.875, + "learning_rate": 9.865179992620498e-06, + "loss": 16.5684, + "step": 47720 + }, + { + "epoch": 0.8630303249472812, + "grad_norm": 44.8125, + "learning_rate": 9.865151740313839e-06, + "loss": 16.2784, + "step": 47730 + }, + { + "epoch": 0.863211140016409, + "grad_norm": 45.40625, + "learning_rate": 9.86512348800718e-06, + "loss": 17.0211, + "step": 47740 + }, + { + "epoch": 0.8633919550855368, + "grad_norm": 46.625, + "learning_rate": 9.865095235700519e-06, + "loss": 16.7793, + "step": 47750 + }, + { + "epoch": 0.8635727701546647, + "grad_norm": 38.5625, + "learning_rate": 9.86506698339386e-06, + "loss": 16.9268, + "step": 47760 + }, + { + "epoch": 0.8637535852237925, + "grad_norm": 39.9375, + "learning_rate": 9.865038731087202e-06, + "loss": 16.7704, + "step": 47770 + }, + { + "epoch": 0.8639344002929205, + "grad_norm": 43.46875, + "learning_rate": 9.86501047878054e-06, + "loss": 16.9899, + "step": 47780 + }, + { + "epoch": 0.8641152153620483, + "grad_norm": 42.09375, + "learning_rate": 9.864982226473881e-06, + "loss": 16.8435, + "step": 47790 + }, + { + "epoch": 0.8642960304311761, + "grad_norm": 43.34375, + "learning_rate": 9.864953974167222e-06, + "loss": 16.5593, + "step": 47800 + }, + { + "epoch": 0.864476845500304, + "grad_norm": 46.1875, + "learning_rate": 9.864925721860562e-06, + "loss": 16.7038, + "step": 47810 + }, + { + "epoch": 0.8646576605694318, + "grad_norm": 43.21875, + "learning_rate": 9.864897469553903e-06, + "loss": 16.8785, + "step": 47820 + }, + { + "epoch": 0.8648384756385598, + "grad_norm": 41.28125, + "learning_rate": 9.864869217247242e-06, + "loss": 16.9381, + "step": 47830 + }, + { + "epoch": 0.8650192907076876, + "grad_norm": 43.96875, + "learning_rate": 9.864840964940583e-06, + "loss": 16.3128, + "step": 47840 + }, + { + "epoch": 0.8652001057768154, + "grad_norm": 42.5625, + "learning_rate": 9.864812712633923e-06, + "loss": 16.5548, + "step": 47850 + }, + { + "epoch": 0.8653809208459433, + "grad_norm": 46.09375, + "learning_rate": 9.864784460327266e-06, + "loss": 16.6328, + "step": 47860 + }, + { + "epoch": 0.8655617359150711, + "grad_norm": 44.8125, + "learning_rate": 9.864756208020604e-06, + "loss": 16.7783, + "step": 47870 + }, + { + "epoch": 0.865742550984199, + "grad_norm": 43.375, + "learning_rate": 9.864727955713945e-06, + "loss": 16.5969, + "step": 47880 + }, + { + "epoch": 0.8659233660533269, + "grad_norm": 45.15625, + "learning_rate": 9.864699703407286e-06, + "loss": 16.7332, + "step": 47890 + }, + { + "epoch": 0.8661041811224548, + "grad_norm": 44.25, + "learning_rate": 9.864671451100626e-06, + "loss": 16.8043, + "step": 47900 + }, + { + "epoch": 0.8662849961915826, + "grad_norm": 42.4375, + "learning_rate": 9.864643198793967e-06, + "loss": 16.796, + "step": 47910 + }, + { + "epoch": 0.8664658112607104, + "grad_norm": 43.21875, + "learning_rate": 9.864614946487306e-06, + "loss": 17.0849, + "step": 47920 + }, + { + "epoch": 0.8666466263298384, + "grad_norm": 42.625, + "learning_rate": 9.864586694180646e-06, + "loss": 16.6788, + "step": 47930 + }, + { + "epoch": 0.8668274413989662, + "grad_norm": 42.40625, + "learning_rate": 9.864558441873989e-06, + "loss": 16.5263, + "step": 47940 + }, + { + "epoch": 0.8670082564680941, + "grad_norm": 45.59375, + "learning_rate": 9.864530189567328e-06, + "loss": 16.6813, + "step": 47950 + }, + { + "epoch": 0.8671890715372219, + "grad_norm": 43.15625, + "learning_rate": 9.864501937260668e-06, + "loss": 16.7066, + "step": 47960 + }, + { + "epoch": 0.8673698866063497, + "grad_norm": 43.75, + "learning_rate": 9.864473684954009e-06, + "loss": 16.4457, + "step": 47970 + }, + { + "epoch": 0.8675507016754777, + "grad_norm": 44.6875, + "learning_rate": 9.86444543264735e-06, + "loss": 16.8559, + "step": 47980 + }, + { + "epoch": 0.8677315167446055, + "grad_norm": 43.6875, + "learning_rate": 9.86441718034069e-06, + "loss": 16.6689, + "step": 47990 + }, + { + "epoch": 0.8679123318137334, + "grad_norm": 42.46875, + "learning_rate": 9.864388928034029e-06, + "loss": 17.0778, + "step": 48000 + }, + { + "epoch": 0.8680931468828612, + "grad_norm": 43.15625, + "learning_rate": 9.86436067572737e-06, + "loss": 16.6127, + "step": 48010 + }, + { + "epoch": 0.868273961951989, + "grad_norm": 42.8125, + "learning_rate": 9.86433242342071e-06, + "loss": 16.7316, + "step": 48020 + }, + { + "epoch": 0.868454777021117, + "grad_norm": 42.78125, + "learning_rate": 9.864304171114053e-06, + "loss": 17.0651, + "step": 48030 + }, + { + "epoch": 0.8686355920902448, + "grad_norm": 39.875, + "learning_rate": 9.864275918807392e-06, + "loss": 16.8044, + "step": 48040 + }, + { + "epoch": 0.8688164071593727, + "grad_norm": 46.4375, + "learning_rate": 9.864247666500732e-06, + "loss": 16.662, + "step": 48050 + }, + { + "epoch": 0.8689972222285005, + "grad_norm": 46.34375, + "learning_rate": 9.864219414194073e-06, + "loss": 16.8479, + "step": 48060 + }, + { + "epoch": 0.8691780372976284, + "grad_norm": 43.40625, + "learning_rate": 9.864191161887413e-06, + "loss": 16.6773, + "step": 48070 + }, + { + "epoch": 0.8693588523667563, + "grad_norm": 45.0625, + "learning_rate": 9.864162909580754e-06, + "loss": 16.3427, + "step": 48080 + }, + { + "epoch": 0.8695396674358841, + "grad_norm": 42.09375, + "learning_rate": 9.864134657274093e-06, + "loss": 16.5875, + "step": 48090 + }, + { + "epoch": 0.869720482505012, + "grad_norm": 43.03125, + "learning_rate": 9.864106404967434e-06, + "loss": 16.8144, + "step": 48100 + }, + { + "epoch": 0.8699012975741398, + "grad_norm": 42.1875, + "learning_rate": 9.864078152660774e-06, + "loss": 16.7123, + "step": 48110 + }, + { + "epoch": 0.8700821126432677, + "grad_norm": 46.96875, + "learning_rate": 9.864049900354115e-06, + "loss": 16.5468, + "step": 48120 + }, + { + "epoch": 0.8702629277123956, + "grad_norm": 43.3125, + "learning_rate": 9.864021648047456e-06, + "loss": 16.6123, + "step": 48130 + }, + { + "epoch": 0.8704437427815234, + "grad_norm": 43.03125, + "learning_rate": 9.863993395740796e-06, + "loss": 16.6049, + "step": 48140 + }, + { + "epoch": 0.8706245578506513, + "grad_norm": 42.03125, + "learning_rate": 9.863965143434137e-06, + "loss": 16.6403, + "step": 48150 + }, + { + "epoch": 0.8708053729197791, + "grad_norm": 43.21875, + "learning_rate": 9.863936891127477e-06, + "loss": 16.3105, + "step": 48160 + }, + { + "epoch": 0.870986187988907, + "grad_norm": 44.1875, + "learning_rate": 9.863908638820816e-06, + "loss": 16.4318, + "step": 48170 + }, + { + "epoch": 0.8711670030580349, + "grad_norm": 44.90625, + "learning_rate": 9.863880386514157e-06, + "loss": 16.7314, + "step": 48180 + }, + { + "epoch": 0.8713478181271627, + "grad_norm": 45.25, + "learning_rate": 9.863852134207498e-06, + "loss": 17.3316, + "step": 48190 + }, + { + "epoch": 0.8715286331962906, + "grad_norm": 43.28125, + "learning_rate": 9.863823881900838e-06, + "loss": 16.8433, + "step": 48200 + }, + { + "epoch": 0.8717094482654184, + "grad_norm": 42.75, + "learning_rate": 9.863795629594179e-06, + "loss": 16.7235, + "step": 48210 + }, + { + "epoch": 0.8718902633345463, + "grad_norm": 43.4375, + "learning_rate": 9.86376737728752e-06, + "loss": 16.7464, + "step": 48220 + }, + { + "epoch": 0.8720710784036741, + "grad_norm": 44.3125, + "learning_rate": 9.86373912498086e-06, + "loss": 16.5672, + "step": 48230 + }, + { + "epoch": 0.8722518934728021, + "grad_norm": 42.1875, + "learning_rate": 9.8637108726742e-06, + "loss": 16.292, + "step": 48240 + }, + { + "epoch": 0.8724327085419299, + "grad_norm": 42.5625, + "learning_rate": 9.863682620367541e-06, + "loss": 16.7786, + "step": 48250 + }, + { + "epoch": 0.8726135236110577, + "grad_norm": 43.6875, + "learning_rate": 9.86365436806088e-06, + "loss": 16.6473, + "step": 48260 + }, + { + "epoch": 0.8727943386801856, + "grad_norm": 42.53125, + "learning_rate": 9.863626115754221e-06, + "loss": 16.7557, + "step": 48270 + }, + { + "epoch": 0.8729751537493134, + "grad_norm": 43.34375, + "learning_rate": 9.863597863447561e-06, + "loss": 17.0484, + "step": 48280 + }, + { + "epoch": 0.8731559688184414, + "grad_norm": 43.125, + "learning_rate": 9.863569611140902e-06, + "loss": 16.5089, + "step": 48290 + }, + { + "epoch": 0.8733367838875692, + "grad_norm": 45.5625, + "learning_rate": 9.863541358834243e-06, + "loss": 16.8514, + "step": 48300 + }, + { + "epoch": 0.873517598956697, + "grad_norm": 42.8125, + "learning_rate": 9.863513106527583e-06, + "loss": 16.7409, + "step": 48310 + }, + { + "epoch": 0.8736984140258249, + "grad_norm": 43.71875, + "learning_rate": 9.863484854220924e-06, + "loss": 16.4951, + "step": 48320 + }, + { + "epoch": 0.8738792290949527, + "grad_norm": 42.53125, + "learning_rate": 9.863456601914265e-06, + "loss": 17.0963, + "step": 48330 + }, + { + "epoch": 0.8740600441640807, + "grad_norm": 44.78125, + "learning_rate": 9.863428349607605e-06, + "loss": 17.1169, + "step": 48340 + }, + { + "epoch": 0.8742408592332085, + "grad_norm": 44.59375, + "learning_rate": 9.863400097300944e-06, + "loss": 16.532, + "step": 48350 + }, + { + "epoch": 0.8744216743023363, + "grad_norm": 42.6875, + "learning_rate": 9.863371844994285e-06, + "loss": 16.4468, + "step": 48360 + }, + { + "epoch": 0.8746024893714642, + "grad_norm": 42.875, + "learning_rate": 9.863343592687625e-06, + "loss": 16.5803, + "step": 48370 + }, + { + "epoch": 0.874783304440592, + "grad_norm": 40.125, + "learning_rate": 9.863315340380966e-06, + "loss": 16.6159, + "step": 48380 + }, + { + "epoch": 0.87496411950972, + "grad_norm": 42.1875, + "learning_rate": 9.863287088074307e-06, + "loss": 16.6872, + "step": 48390 + }, + { + "epoch": 0.8751449345788478, + "grad_norm": 41.84375, + "learning_rate": 9.863258835767647e-06, + "loss": 16.4338, + "step": 48400 + }, + { + "epoch": 0.8753257496479757, + "grad_norm": 43.0625, + "learning_rate": 9.863230583460988e-06, + "loss": 16.7224, + "step": 48410 + }, + { + "epoch": 0.8755065647171035, + "grad_norm": 44.21875, + "learning_rate": 9.863202331154329e-06, + "loss": 16.6712, + "step": 48420 + }, + { + "epoch": 0.8756873797862313, + "grad_norm": 44.03125, + "learning_rate": 9.863174078847667e-06, + "loss": 17.0588, + "step": 48430 + }, + { + "epoch": 0.8758681948553593, + "grad_norm": 41.5625, + "learning_rate": 9.863145826541008e-06, + "loss": 16.8368, + "step": 48440 + }, + { + "epoch": 0.8760490099244871, + "grad_norm": 42.28125, + "learning_rate": 9.863117574234349e-06, + "loss": 16.9934, + "step": 48450 + }, + { + "epoch": 0.876229824993615, + "grad_norm": 45.53125, + "learning_rate": 9.86308932192769e-06, + "loss": 17.0267, + "step": 48460 + }, + { + "epoch": 0.8764106400627428, + "grad_norm": 46.5625, + "learning_rate": 9.86306106962103e-06, + "loss": 16.648, + "step": 48470 + }, + { + "epoch": 0.8765914551318706, + "grad_norm": 44.625, + "learning_rate": 9.86303281731437e-06, + "loss": 16.7814, + "step": 48480 + }, + { + "epoch": 0.8767722702009986, + "grad_norm": 43.25, + "learning_rate": 9.863004565007711e-06, + "loss": 16.8009, + "step": 48490 + }, + { + "epoch": 0.8769530852701264, + "grad_norm": 43.625, + "learning_rate": 9.862976312701052e-06, + "loss": 16.8888, + "step": 48500 + }, + { + "epoch": 0.8771339003392543, + "grad_norm": 44.46875, + "learning_rate": 9.862948060394392e-06, + "loss": 16.9263, + "step": 48510 + }, + { + "epoch": 0.8773147154083821, + "grad_norm": 43.1875, + "learning_rate": 9.862919808087731e-06, + "loss": 16.7748, + "step": 48520 + }, + { + "epoch": 0.8774955304775099, + "grad_norm": 46.65625, + "learning_rate": 9.862891555781072e-06, + "loss": 16.5806, + "step": 48530 + }, + { + "epoch": 0.8776763455466379, + "grad_norm": 41.375, + "learning_rate": 9.862863303474413e-06, + "loss": 16.629, + "step": 48540 + }, + { + "epoch": 0.8778571606157657, + "grad_norm": 44.15625, + "learning_rate": 9.862835051167753e-06, + "loss": 17.0033, + "step": 48550 + }, + { + "epoch": 0.8780379756848936, + "grad_norm": 44.46875, + "learning_rate": 9.862806798861094e-06, + "loss": 17.1855, + "step": 48560 + }, + { + "epoch": 0.8782187907540214, + "grad_norm": 43.8125, + "learning_rate": 9.862778546554434e-06, + "loss": 16.6759, + "step": 48570 + }, + { + "epoch": 0.8783996058231494, + "grad_norm": 41.90625, + "learning_rate": 9.862750294247775e-06, + "loss": 16.7433, + "step": 48580 + }, + { + "epoch": 0.8785804208922772, + "grad_norm": 44.28125, + "learning_rate": 9.862722041941116e-06, + "loss": 16.637, + "step": 48590 + }, + { + "epoch": 0.878761235961405, + "grad_norm": 42.40625, + "learning_rate": 9.862693789634455e-06, + "loss": 16.9434, + "step": 48600 + }, + { + "epoch": 0.8789420510305329, + "grad_norm": 42.78125, + "learning_rate": 9.862665537327795e-06, + "loss": 16.7419, + "step": 48610 + }, + { + "epoch": 0.8791228660996607, + "grad_norm": 44.90625, + "learning_rate": 9.862637285021136e-06, + "loss": 16.7121, + "step": 48620 + }, + { + "epoch": 0.8793036811687887, + "grad_norm": 41.40625, + "learning_rate": 9.862609032714476e-06, + "loss": 17.281, + "step": 48630 + }, + { + "epoch": 0.8794844962379165, + "grad_norm": 45.46875, + "learning_rate": 9.862580780407817e-06, + "loss": 17.0728, + "step": 48640 + }, + { + "epoch": 0.8796653113070443, + "grad_norm": 45.40625, + "learning_rate": 9.862552528101158e-06, + "loss": 17.1695, + "step": 48650 + }, + { + "epoch": 0.8798461263761722, + "grad_norm": 43.34375, + "learning_rate": 9.862524275794498e-06, + "loss": 16.7729, + "step": 48660 + }, + { + "epoch": 0.8800269414453, + "grad_norm": 42.59375, + "learning_rate": 9.862496023487839e-06, + "loss": 16.5975, + "step": 48670 + }, + { + "epoch": 0.880207756514428, + "grad_norm": 41.96875, + "learning_rate": 9.86246777118118e-06, + "loss": 17.0676, + "step": 48680 + }, + { + "epoch": 0.8803885715835558, + "grad_norm": 43.625, + "learning_rate": 9.862439518874519e-06, + "loss": 16.4909, + "step": 48690 + }, + { + "epoch": 0.8805693866526836, + "grad_norm": 45.96875, + "learning_rate": 9.86241126656786e-06, + "loss": 16.2651, + "step": 48700 + }, + { + "epoch": 0.8807502017218115, + "grad_norm": 43.59375, + "learning_rate": 9.8623830142612e-06, + "loss": 16.8944, + "step": 48710 + }, + { + "epoch": 0.8809310167909393, + "grad_norm": 43.84375, + "learning_rate": 9.86235476195454e-06, + "loss": 16.936, + "step": 48720 + }, + { + "epoch": 0.8811118318600673, + "grad_norm": 43.1875, + "learning_rate": 9.862326509647881e-06, + "loss": 16.6326, + "step": 48730 + }, + { + "epoch": 0.8812926469291951, + "grad_norm": 42.65625, + "learning_rate": 9.862298257341222e-06, + "loss": 16.9422, + "step": 48740 + }, + { + "epoch": 0.881473461998323, + "grad_norm": 41.375, + "learning_rate": 9.862270005034562e-06, + "loss": 16.3952, + "step": 48750 + }, + { + "epoch": 0.8816542770674508, + "grad_norm": 43.0, + "learning_rate": 9.862241752727903e-06, + "loss": 16.612, + "step": 48760 + }, + { + "epoch": 0.8818350921365786, + "grad_norm": 44.34375, + "learning_rate": 9.862213500421244e-06, + "loss": 17.0378, + "step": 48770 + }, + { + "epoch": 0.8820159072057066, + "grad_norm": 44.09375, + "learning_rate": 9.862185248114582e-06, + "loss": 16.5489, + "step": 48780 + }, + { + "epoch": 0.8821967222748344, + "grad_norm": 43.53125, + "learning_rate": 9.862156995807923e-06, + "loss": 16.7768, + "step": 48790 + }, + { + "epoch": 0.8823775373439623, + "grad_norm": 47.0, + "learning_rate": 9.862128743501264e-06, + "loss": 16.9206, + "step": 48800 + }, + { + "epoch": 0.8825583524130901, + "grad_norm": 39.8125, + "learning_rate": 9.862100491194604e-06, + "loss": 16.4764, + "step": 48810 + }, + { + "epoch": 0.8827391674822179, + "grad_norm": 41.875, + "learning_rate": 9.862072238887945e-06, + "loss": 16.5945, + "step": 48820 + }, + { + "epoch": 0.8829199825513458, + "grad_norm": 45.625, + "learning_rate": 9.862043986581286e-06, + "loss": 16.5602, + "step": 48830 + }, + { + "epoch": 0.8831007976204737, + "grad_norm": 42.0, + "learning_rate": 9.862015734274626e-06, + "loss": 16.9655, + "step": 48840 + }, + { + "epoch": 0.8832816126896016, + "grad_norm": 44.21875, + "learning_rate": 9.861987481967967e-06, + "loss": 16.6877, + "step": 48850 + }, + { + "epoch": 0.8834624277587294, + "grad_norm": 43.375, + "learning_rate": 9.861959229661306e-06, + "loss": 16.6978, + "step": 48860 + }, + { + "epoch": 0.8836432428278572, + "grad_norm": 44.3125, + "learning_rate": 9.861930977354646e-06, + "loss": 16.9109, + "step": 48870 + }, + { + "epoch": 0.8838240578969851, + "grad_norm": 46.0625, + "learning_rate": 9.861902725047987e-06, + "loss": 16.019, + "step": 48880 + }, + { + "epoch": 0.884004872966113, + "grad_norm": 45.90625, + "learning_rate": 9.861874472741328e-06, + "loss": 17.0793, + "step": 48890 + }, + { + "epoch": 0.8841856880352409, + "grad_norm": 44.84375, + "learning_rate": 9.861846220434668e-06, + "loss": 16.832, + "step": 48900 + }, + { + "epoch": 0.8843665031043687, + "grad_norm": 43.21875, + "learning_rate": 9.861817968128009e-06, + "loss": 16.9307, + "step": 48910 + }, + { + "epoch": 0.8845473181734966, + "grad_norm": 43.21875, + "learning_rate": 9.86178971582135e-06, + "loss": 16.1817, + "step": 48920 + }, + { + "epoch": 0.8847281332426244, + "grad_norm": 44.1875, + "learning_rate": 9.86176146351469e-06, + "loss": 17.1689, + "step": 48930 + }, + { + "epoch": 0.8849089483117523, + "grad_norm": 42.375, + "learning_rate": 9.86173321120803e-06, + "loss": 16.9108, + "step": 48940 + }, + { + "epoch": 0.8850897633808802, + "grad_norm": 42.6875, + "learning_rate": 9.86170495890137e-06, + "loss": 16.6475, + "step": 48950 + }, + { + "epoch": 0.885270578450008, + "grad_norm": 42.71875, + "learning_rate": 9.86167670659471e-06, + "loss": 16.5359, + "step": 48960 + }, + { + "epoch": 0.8854513935191359, + "grad_norm": 43.9375, + "learning_rate": 9.861648454288051e-06, + "loss": 16.8834, + "step": 48970 + }, + { + "epoch": 0.8856322085882637, + "grad_norm": 43.0625, + "learning_rate": 9.861620201981391e-06, + "loss": 16.5986, + "step": 48980 + }, + { + "epoch": 0.8858130236573916, + "grad_norm": 47.15625, + "learning_rate": 9.861591949674732e-06, + "loss": 16.5972, + "step": 48990 + }, + { + "epoch": 0.8859938387265195, + "grad_norm": 43.8125, + "learning_rate": 9.861563697368073e-06, + "loss": 16.8942, + "step": 49000 + }, + { + "epoch": 0.8861746537956473, + "grad_norm": 45.34375, + "learning_rate": 9.861535445061413e-06, + "loss": 16.5526, + "step": 49010 + }, + { + "epoch": 0.8863554688647752, + "grad_norm": 41.34375, + "learning_rate": 9.861507192754754e-06, + "loss": 16.7797, + "step": 49020 + }, + { + "epoch": 0.886536283933903, + "grad_norm": 43.28125, + "learning_rate": 9.861478940448093e-06, + "loss": 16.9787, + "step": 49030 + }, + { + "epoch": 0.8867170990030309, + "grad_norm": 42.6875, + "learning_rate": 9.861450688141434e-06, + "loss": 16.6541, + "step": 49040 + }, + { + "epoch": 0.8868979140721588, + "grad_norm": 43.90625, + "learning_rate": 9.861422435834774e-06, + "loss": 16.7543, + "step": 49050 + }, + { + "epoch": 0.8870787291412866, + "grad_norm": 43.0, + "learning_rate": 9.861394183528115e-06, + "loss": 16.7508, + "step": 49060 + }, + { + "epoch": 0.8872595442104145, + "grad_norm": 40.3125, + "learning_rate": 9.861365931221455e-06, + "loss": 16.7637, + "step": 49070 + }, + { + "epoch": 0.8874403592795423, + "grad_norm": 40.28125, + "learning_rate": 9.861337678914796e-06, + "loss": 16.0263, + "step": 49080 + }, + { + "epoch": 0.8876211743486703, + "grad_norm": 42.625, + "learning_rate": 9.861309426608137e-06, + "loss": 16.8237, + "step": 49090 + }, + { + "epoch": 0.8878019894177981, + "grad_norm": 40.6875, + "learning_rate": 9.861281174301477e-06, + "loss": 17.5541, + "step": 49100 + }, + { + "epoch": 0.8879828044869259, + "grad_norm": 44.96875, + "learning_rate": 9.861252921994818e-06, + "loss": 17.1185, + "step": 49110 + }, + { + "epoch": 0.8881636195560538, + "grad_norm": 42.9375, + "learning_rate": 9.861224669688157e-06, + "loss": 16.7611, + "step": 49120 + }, + { + "epoch": 0.8883444346251816, + "grad_norm": 42.84375, + "learning_rate": 9.861196417381497e-06, + "loss": 16.2662, + "step": 49130 + }, + { + "epoch": 0.8885252496943096, + "grad_norm": 43.03125, + "learning_rate": 9.861168165074838e-06, + "loss": 16.8431, + "step": 49140 + }, + { + "epoch": 0.8887060647634374, + "grad_norm": 43.09375, + "learning_rate": 9.861139912768179e-06, + "loss": 16.5856, + "step": 49150 + }, + { + "epoch": 0.8888868798325652, + "grad_norm": 43.0625, + "learning_rate": 9.86111166046152e-06, + "loss": 17.3069, + "step": 49160 + }, + { + "epoch": 0.8890676949016931, + "grad_norm": 43.5, + "learning_rate": 9.86108340815486e-06, + "loss": 16.948, + "step": 49170 + }, + { + "epoch": 0.8892485099708209, + "grad_norm": 44.25, + "learning_rate": 9.8610551558482e-06, + "loss": 16.3898, + "step": 49180 + }, + { + "epoch": 0.8894293250399489, + "grad_norm": 42.78125, + "learning_rate": 9.861026903541541e-06, + "loss": 16.5265, + "step": 49190 + }, + { + "epoch": 0.8896101401090767, + "grad_norm": 45.71875, + "learning_rate": 9.860998651234882e-06, + "loss": 16.7786, + "step": 49200 + }, + { + "epoch": 0.8897909551782045, + "grad_norm": 45.84375, + "learning_rate": 9.86097039892822e-06, + "loss": 16.3813, + "step": 49210 + }, + { + "epoch": 0.8899717702473324, + "grad_norm": 41.21875, + "learning_rate": 9.860942146621561e-06, + "loss": 16.3776, + "step": 49220 + }, + { + "epoch": 0.8901525853164602, + "grad_norm": 42.75, + "learning_rate": 9.860913894314902e-06, + "loss": 16.5008, + "step": 49230 + }, + { + "epoch": 0.8903334003855882, + "grad_norm": 44.0, + "learning_rate": 9.860885642008243e-06, + "loss": 16.4967, + "step": 49240 + }, + { + "epoch": 0.890514215454716, + "grad_norm": 42.09375, + "learning_rate": 9.860857389701583e-06, + "loss": 16.8697, + "step": 49250 + }, + { + "epoch": 0.8906950305238439, + "grad_norm": 43.3125, + "learning_rate": 9.860829137394924e-06, + "loss": 16.6238, + "step": 49260 + }, + { + "epoch": 0.8908758455929717, + "grad_norm": 44.28125, + "learning_rate": 9.860800885088264e-06, + "loss": 16.7561, + "step": 49270 + }, + { + "epoch": 0.8910566606620995, + "grad_norm": 44.8125, + "learning_rate": 9.860772632781605e-06, + "loss": 17.2425, + "step": 49280 + }, + { + "epoch": 0.8912374757312275, + "grad_norm": 43.90625, + "learning_rate": 9.860744380474944e-06, + "loss": 16.5298, + "step": 49290 + }, + { + "epoch": 0.8914182908003553, + "grad_norm": 44.5625, + "learning_rate": 9.860716128168285e-06, + "loss": 16.7332, + "step": 49300 + }, + { + "epoch": 0.8915991058694832, + "grad_norm": 44.5, + "learning_rate": 9.860687875861625e-06, + "loss": 17.0327, + "step": 49310 + }, + { + "epoch": 0.891779920938611, + "grad_norm": 43.3125, + "learning_rate": 9.860659623554966e-06, + "loss": 16.7635, + "step": 49320 + }, + { + "epoch": 0.8919607360077388, + "grad_norm": 42.96875, + "learning_rate": 9.860631371248307e-06, + "loss": 16.7297, + "step": 49330 + }, + { + "epoch": 0.8921415510768668, + "grad_norm": 44.96875, + "learning_rate": 9.860603118941647e-06, + "loss": 16.9263, + "step": 49340 + }, + { + "epoch": 0.8923223661459946, + "grad_norm": 43.96875, + "learning_rate": 9.860574866634988e-06, + "loss": 17.077, + "step": 49350 + }, + { + "epoch": 0.8925031812151225, + "grad_norm": 43.1875, + "learning_rate": 9.860546614328328e-06, + "loss": 16.648, + "step": 49360 + }, + { + "epoch": 0.8926839962842503, + "grad_norm": 42.0625, + "learning_rate": 9.860518362021669e-06, + "loss": 16.6563, + "step": 49370 + }, + { + "epoch": 0.8928648113533781, + "grad_norm": 42.15625, + "learning_rate": 9.860490109715008e-06, + "loss": 16.6227, + "step": 49380 + }, + { + "epoch": 0.8930456264225061, + "grad_norm": 44.0625, + "learning_rate": 9.860461857408349e-06, + "loss": 16.5707, + "step": 49390 + }, + { + "epoch": 0.8932264414916339, + "grad_norm": 39.40625, + "learning_rate": 9.86043360510169e-06, + "loss": 17.3028, + "step": 49400 + }, + { + "epoch": 0.8934072565607618, + "grad_norm": 44.25, + "learning_rate": 9.86040535279503e-06, + "loss": 16.9679, + "step": 49410 + }, + { + "epoch": 0.8935880716298896, + "grad_norm": 46.0, + "learning_rate": 9.86037710048837e-06, + "loss": 16.7102, + "step": 49420 + }, + { + "epoch": 0.8937688866990176, + "grad_norm": 44.65625, + "learning_rate": 9.860348848181711e-06, + "loss": 16.5409, + "step": 49430 + }, + { + "epoch": 0.8939497017681454, + "grad_norm": 43.84375, + "learning_rate": 9.860320595875052e-06, + "loss": 16.6154, + "step": 49440 + }, + { + "epoch": 0.8941305168372732, + "grad_norm": 44.125, + "learning_rate": 9.860292343568392e-06, + "loss": 16.6337, + "step": 49450 + }, + { + "epoch": 0.8943113319064011, + "grad_norm": 43.0, + "learning_rate": 9.860264091261731e-06, + "loss": 16.6019, + "step": 49460 + }, + { + "epoch": 0.8944921469755289, + "grad_norm": 41.84375, + "learning_rate": 9.860235838955072e-06, + "loss": 16.6053, + "step": 49470 + }, + { + "epoch": 0.8946729620446568, + "grad_norm": 44.65625, + "learning_rate": 9.860207586648412e-06, + "loss": 16.5996, + "step": 49480 + }, + { + "epoch": 0.8948537771137847, + "grad_norm": 44.25, + "learning_rate": 9.860179334341753e-06, + "loss": 16.5798, + "step": 49490 + }, + { + "epoch": 0.8950345921829125, + "grad_norm": 43.5, + "learning_rate": 9.860151082035094e-06, + "loss": 16.9582, + "step": 49500 + }, + { + "epoch": 0.8952154072520404, + "grad_norm": 41.96875, + "learning_rate": 9.860122829728434e-06, + "loss": 16.8579, + "step": 49510 + }, + { + "epoch": 0.8953962223211682, + "grad_norm": 45.125, + "learning_rate": 9.860094577421775e-06, + "loss": 16.5535, + "step": 49520 + }, + { + "epoch": 0.8955770373902961, + "grad_norm": 43.875, + "learning_rate": 9.860066325115116e-06, + "loss": 17.1519, + "step": 49530 + }, + { + "epoch": 0.895757852459424, + "grad_norm": 42.375, + "learning_rate": 9.860038072808456e-06, + "loss": 16.4699, + "step": 49540 + }, + { + "epoch": 0.8959386675285518, + "grad_norm": 42.78125, + "learning_rate": 9.860009820501795e-06, + "loss": 16.67, + "step": 49550 + }, + { + "epoch": 0.8961194825976797, + "grad_norm": 45.5625, + "learning_rate": 9.859981568195136e-06, + "loss": 17.0471, + "step": 49560 + }, + { + "epoch": 0.8963002976668075, + "grad_norm": 43.8125, + "learning_rate": 9.859953315888476e-06, + "loss": 17.0289, + "step": 49570 + }, + { + "epoch": 0.8964811127359354, + "grad_norm": 46.0625, + "learning_rate": 9.859925063581817e-06, + "loss": 16.8848, + "step": 49580 + }, + { + "epoch": 0.8966619278050633, + "grad_norm": 44.78125, + "learning_rate": 9.859896811275158e-06, + "loss": 17.0249, + "step": 49590 + }, + { + "epoch": 0.8968427428741912, + "grad_norm": 42.03125, + "learning_rate": 9.859868558968497e-06, + "loss": 16.5212, + "step": 49600 + }, + { + "epoch": 0.897023557943319, + "grad_norm": 43.34375, + "learning_rate": 9.859840306661839e-06, + "loss": 16.6439, + "step": 49610 + }, + { + "epoch": 0.8972043730124468, + "grad_norm": 42.3125, + "learning_rate": 9.85981205435518e-06, + "loss": 16.4337, + "step": 49620 + }, + { + "epoch": 0.8973851880815747, + "grad_norm": 45.1875, + "learning_rate": 9.85978380204852e-06, + "loss": 16.9367, + "step": 49630 + }, + { + "epoch": 0.8975660031507026, + "grad_norm": 45.3125, + "learning_rate": 9.859755549741859e-06, + "loss": 17.0094, + "step": 49640 + }, + { + "epoch": 0.8977468182198305, + "grad_norm": 48.46875, + "learning_rate": 9.8597272974352e-06, + "loss": 17.0798, + "step": 49650 + }, + { + "epoch": 0.8979276332889583, + "grad_norm": 46.09375, + "learning_rate": 9.85969904512854e-06, + "loss": 16.8546, + "step": 49660 + }, + { + "epoch": 0.8981084483580861, + "grad_norm": 42.65625, + "learning_rate": 9.859670792821881e-06, + "loss": 16.4153, + "step": 49670 + }, + { + "epoch": 0.898289263427214, + "grad_norm": 46.1875, + "learning_rate": 9.859642540515222e-06, + "loss": 16.5445, + "step": 49680 + }, + { + "epoch": 0.8984700784963419, + "grad_norm": 42.09375, + "learning_rate": 9.859614288208562e-06, + "loss": 16.6849, + "step": 49690 + }, + { + "epoch": 0.8986508935654698, + "grad_norm": 42.90625, + "learning_rate": 9.859586035901903e-06, + "loss": 16.7131, + "step": 49700 + }, + { + "epoch": 0.8988317086345976, + "grad_norm": 41.15625, + "learning_rate": 9.859557783595243e-06, + "loss": 16.6575, + "step": 49710 + }, + { + "epoch": 0.8990125237037254, + "grad_norm": 43.46875, + "learning_rate": 9.859529531288582e-06, + "loss": 16.6249, + "step": 49720 + }, + { + "epoch": 0.8991933387728533, + "grad_norm": 44.34375, + "learning_rate": 9.859501278981923e-06, + "loss": 16.9034, + "step": 49730 + }, + { + "epoch": 0.8993741538419812, + "grad_norm": 43.46875, + "learning_rate": 9.859473026675264e-06, + "loss": 16.7501, + "step": 49740 + }, + { + "epoch": 0.8995549689111091, + "grad_norm": 43.40625, + "learning_rate": 9.859444774368604e-06, + "loss": 16.6332, + "step": 49750 + }, + { + "epoch": 0.8997357839802369, + "grad_norm": 44.1875, + "learning_rate": 9.859416522061945e-06, + "loss": 16.6388, + "step": 49760 + }, + { + "epoch": 0.8999165990493648, + "grad_norm": 44.75, + "learning_rate": 9.859388269755284e-06, + "loss": 16.5714, + "step": 49770 + }, + { + "epoch": 0.9000974141184926, + "grad_norm": 44.65625, + "learning_rate": 9.859360017448626e-06, + "loss": 16.5571, + "step": 49780 + }, + { + "epoch": 0.9002782291876205, + "grad_norm": 45.5625, + "learning_rate": 9.859331765141967e-06, + "loss": 16.8097, + "step": 49790 + }, + { + "epoch": 0.9004590442567484, + "grad_norm": 39.1875, + "learning_rate": 9.859303512835307e-06, + "loss": 17.1475, + "step": 49800 + }, + { + "epoch": 0.9006398593258762, + "grad_norm": 43.875, + "learning_rate": 9.859275260528646e-06, + "loss": 17.1851, + "step": 49810 + }, + { + "epoch": 0.9008206743950041, + "grad_norm": 41.71875, + "learning_rate": 9.859247008221987e-06, + "loss": 16.6011, + "step": 49820 + }, + { + "epoch": 0.9010014894641319, + "grad_norm": 44.875, + "learning_rate": 9.859218755915327e-06, + "loss": 16.3234, + "step": 49830 + }, + { + "epoch": 0.9011823045332598, + "grad_norm": 41.71875, + "learning_rate": 9.859190503608668e-06, + "loss": 16.7468, + "step": 49840 + }, + { + "epoch": 0.9013631196023877, + "grad_norm": 49.09375, + "learning_rate": 9.859162251302009e-06, + "loss": 17.3395, + "step": 49850 + }, + { + "epoch": 0.9015439346715155, + "grad_norm": 44.03125, + "learning_rate": 9.859133998995348e-06, + "loss": 16.6495, + "step": 49860 + }, + { + "epoch": 0.9017247497406434, + "grad_norm": 44.53125, + "learning_rate": 9.85910574668869e-06, + "loss": 16.7057, + "step": 49870 + }, + { + "epoch": 0.9019055648097712, + "grad_norm": 44.625, + "learning_rate": 9.85907749438203e-06, + "loss": 16.5376, + "step": 49880 + }, + { + "epoch": 0.9020863798788991, + "grad_norm": 44.96875, + "learning_rate": 9.85904924207537e-06, + "loss": 17.2714, + "step": 49890 + }, + { + "epoch": 0.902267194948027, + "grad_norm": 46.78125, + "learning_rate": 9.85902098976871e-06, + "loss": 16.6724, + "step": 49900 + }, + { + "epoch": 0.9024480100171548, + "grad_norm": 43.875, + "learning_rate": 9.85899273746205e-06, + "loss": 16.9802, + "step": 49910 + }, + { + "epoch": 0.9026288250862827, + "grad_norm": 45.1875, + "learning_rate": 9.858964485155391e-06, + "loss": 17.1209, + "step": 49920 + }, + { + "epoch": 0.9028096401554105, + "grad_norm": 43.71875, + "learning_rate": 9.858936232848732e-06, + "loss": 16.8002, + "step": 49930 + }, + { + "epoch": 0.9029904552245384, + "grad_norm": 43.125, + "learning_rate": 9.858907980542073e-06, + "loss": 16.709, + "step": 49940 + }, + { + "epoch": 0.9031712702936663, + "grad_norm": 45.96875, + "learning_rate": 9.858879728235412e-06, + "loss": 16.9123, + "step": 49950 + }, + { + "epoch": 0.9033520853627941, + "grad_norm": 46.0625, + "learning_rate": 9.858851475928754e-06, + "loss": 16.4468, + "step": 49960 + }, + { + "epoch": 0.903532900431922, + "grad_norm": 43.34375, + "learning_rate": 9.858823223622094e-06, + "loss": 16.6946, + "step": 49970 + }, + { + "epoch": 0.9037137155010498, + "grad_norm": 43.71875, + "learning_rate": 9.858794971315433e-06, + "loss": 16.6168, + "step": 49980 + }, + { + "epoch": 0.9038945305701778, + "grad_norm": 43.5, + "learning_rate": 9.858766719008774e-06, + "loss": 16.8038, + "step": 49990 + }, + { + "epoch": 0.9040753456393056, + "grad_norm": 41.90625, + "learning_rate": 9.858738466702115e-06, + "loss": 16.6773, + "step": 50000 + }, + { + "epoch": 0.9040753456393056, + "eval_loss": 2.0931339263916016, + "eval_runtime": 229.7481, + "eval_samples_per_second": 3160.239, + "eval_steps_per_second": 49.38, + "step": 50000 + }, + { + "epoch": 0.9042561607084334, + "grad_norm": 42.5, + "learning_rate": 9.858710214395455e-06, + "loss": 16.8778, + "step": 50010 + }, + { + "epoch": 0.9044369757775613, + "grad_norm": 45.40625, + "learning_rate": 9.858681962088796e-06, + "loss": 16.5687, + "step": 50020 + }, + { + "epoch": 0.9046177908466891, + "grad_norm": 44.34375, + "learning_rate": 9.858653709782135e-06, + "loss": 16.6903, + "step": 50030 + }, + { + "epoch": 0.9047986059158171, + "grad_norm": 44.15625, + "learning_rate": 9.858625457475477e-06, + "loss": 16.3654, + "step": 50040 + }, + { + "epoch": 0.9049794209849449, + "grad_norm": 45.40625, + "learning_rate": 9.858597205168818e-06, + "loss": 16.9448, + "step": 50050 + }, + { + "epoch": 0.9051602360540727, + "grad_norm": 41.9375, + "learning_rate": 9.858568952862158e-06, + "loss": 17.0241, + "step": 50060 + }, + { + "epoch": 0.9053410511232006, + "grad_norm": 43.3125, + "learning_rate": 9.858540700555497e-06, + "loss": 17.1035, + "step": 50070 + }, + { + "epoch": 0.9055218661923284, + "grad_norm": 44.34375, + "learning_rate": 9.858512448248838e-06, + "loss": 17.3552, + "step": 50080 + }, + { + "epoch": 0.9057026812614564, + "grad_norm": 44.125, + "learning_rate": 9.858484195942179e-06, + "loss": 16.1245, + "step": 50090 + }, + { + "epoch": 0.9058834963305842, + "grad_norm": 41.65625, + "learning_rate": 9.85845594363552e-06, + "loss": 16.7964, + "step": 50100 + }, + { + "epoch": 0.906064311399712, + "grad_norm": 46.71875, + "learning_rate": 9.85842769132886e-06, + "loss": 16.774, + "step": 50110 + }, + { + "epoch": 0.9062451264688399, + "grad_norm": 45.4375, + "learning_rate": 9.858399439022199e-06, + "loss": 16.5967, + "step": 50120 + }, + { + "epoch": 0.9064259415379677, + "grad_norm": 45.03125, + "learning_rate": 9.858371186715541e-06, + "loss": 16.5357, + "step": 50130 + }, + { + "epoch": 0.9066067566070957, + "grad_norm": 43.90625, + "learning_rate": 9.858342934408882e-06, + "loss": 16.9263, + "step": 50140 + }, + { + "epoch": 0.9067875716762235, + "grad_norm": 45.0625, + "learning_rate": 9.85831468210222e-06, + "loss": 16.9574, + "step": 50150 + }, + { + "epoch": 0.9069683867453514, + "grad_norm": 42.34375, + "learning_rate": 9.858286429795561e-06, + "loss": 16.55, + "step": 50160 + }, + { + "epoch": 0.9071492018144792, + "grad_norm": 43.8125, + "learning_rate": 9.858258177488902e-06, + "loss": 16.7054, + "step": 50170 + }, + { + "epoch": 0.907330016883607, + "grad_norm": 43.5625, + "learning_rate": 9.858229925182242e-06, + "loss": 16.5315, + "step": 50180 + }, + { + "epoch": 0.907510831952735, + "grad_norm": 46.71875, + "learning_rate": 9.858201672875583e-06, + "loss": 17.2083, + "step": 50190 + }, + { + "epoch": 0.9076916470218628, + "grad_norm": 45.96875, + "learning_rate": 9.858173420568922e-06, + "loss": 16.9714, + "step": 50200 + }, + { + "epoch": 0.9078724620909907, + "grad_norm": 45.625, + "learning_rate": 9.858145168262263e-06, + "loss": 16.5915, + "step": 50210 + }, + { + "epoch": 0.9080532771601185, + "grad_norm": 43.3125, + "learning_rate": 9.858116915955605e-06, + "loss": 16.7036, + "step": 50220 + }, + { + "epoch": 0.9082340922292463, + "grad_norm": 46.09375, + "learning_rate": 9.858088663648946e-06, + "loss": 16.9555, + "step": 50230 + }, + { + "epoch": 0.9084149072983743, + "grad_norm": 42.9375, + "learning_rate": 9.858060411342284e-06, + "loss": 16.5951, + "step": 50240 + }, + { + "epoch": 0.9085957223675021, + "grad_norm": 43.15625, + "learning_rate": 9.858032159035625e-06, + "loss": 16.8463, + "step": 50250 + }, + { + "epoch": 0.90877653743663, + "grad_norm": 42.0, + "learning_rate": 9.858003906728966e-06, + "loss": 16.7482, + "step": 50260 + }, + { + "epoch": 0.9089573525057578, + "grad_norm": 42.90625, + "learning_rate": 9.857975654422306e-06, + "loss": 16.4641, + "step": 50270 + }, + { + "epoch": 0.9091381675748856, + "grad_norm": 43.4375, + "learning_rate": 9.857947402115647e-06, + "loss": 17.0, + "step": 50280 + }, + { + "epoch": 0.9093189826440136, + "grad_norm": 42.125, + "learning_rate": 9.857919149808986e-06, + "loss": 16.7016, + "step": 50290 + }, + { + "epoch": 0.9094997977131414, + "grad_norm": 45.3125, + "learning_rate": 9.857890897502327e-06, + "loss": 16.5641, + "step": 50300 + }, + { + "epoch": 0.9096806127822693, + "grad_norm": 41.78125, + "learning_rate": 9.857862645195669e-06, + "loss": 16.8491, + "step": 50310 + }, + { + "epoch": 0.9098614278513971, + "grad_norm": 43.75, + "learning_rate": 9.857834392889008e-06, + "loss": 16.7857, + "step": 50320 + }, + { + "epoch": 0.910042242920525, + "grad_norm": 42.21875, + "learning_rate": 9.857806140582348e-06, + "loss": 16.7927, + "step": 50330 + }, + { + "epoch": 0.9102230579896529, + "grad_norm": 42.90625, + "learning_rate": 9.857777888275689e-06, + "loss": 17.1758, + "step": 50340 + }, + { + "epoch": 0.9104038730587807, + "grad_norm": 41.0, + "learning_rate": 9.85774963596903e-06, + "loss": 16.2274, + "step": 50350 + }, + { + "epoch": 0.9105846881279086, + "grad_norm": 44.1875, + "learning_rate": 9.85772138366237e-06, + "loss": 16.6145, + "step": 50360 + }, + { + "epoch": 0.9107655031970364, + "grad_norm": 42.9375, + "learning_rate": 9.85769313135571e-06, + "loss": 16.3565, + "step": 50370 + }, + { + "epoch": 0.9109463182661643, + "grad_norm": 44.09375, + "learning_rate": 9.85766487904905e-06, + "loss": 16.6859, + "step": 50380 + }, + { + "epoch": 0.9111271333352922, + "grad_norm": 44.3125, + "learning_rate": 9.85763662674239e-06, + "loss": 16.6597, + "step": 50390 + }, + { + "epoch": 0.91130794840442, + "grad_norm": 44.75, + "learning_rate": 9.857608374435733e-06, + "loss": 17.1254, + "step": 50400 + }, + { + "epoch": 0.9114887634735479, + "grad_norm": 44.75, + "learning_rate": 9.857580122129072e-06, + "loss": 16.8251, + "step": 50410 + }, + { + "epoch": 0.9116695785426757, + "grad_norm": 42.40625, + "learning_rate": 9.857551869822412e-06, + "loss": 16.9839, + "step": 50420 + }, + { + "epoch": 0.9118503936118036, + "grad_norm": 43.875, + "learning_rate": 9.857523617515753e-06, + "loss": 16.8362, + "step": 50430 + }, + { + "epoch": 0.9120312086809315, + "grad_norm": 44.8125, + "learning_rate": 9.857495365209094e-06, + "loss": 16.8484, + "step": 50440 + }, + { + "epoch": 0.9122120237500593, + "grad_norm": 47.40625, + "learning_rate": 9.857467112902434e-06, + "loss": 16.4676, + "step": 50450 + }, + { + "epoch": 0.9123928388191872, + "grad_norm": 43.0, + "learning_rate": 9.857438860595773e-06, + "loss": 16.7908, + "step": 50460 + }, + { + "epoch": 0.912573653888315, + "grad_norm": 43.1875, + "learning_rate": 9.857410608289114e-06, + "loss": 16.5521, + "step": 50470 + }, + { + "epoch": 0.9127544689574429, + "grad_norm": 41.875, + "learning_rate": 9.857382355982456e-06, + "loss": 16.5968, + "step": 50480 + }, + { + "epoch": 0.9129352840265708, + "grad_norm": 46.65625, + "learning_rate": 9.857354103675795e-06, + "loss": 16.4031, + "step": 50490 + }, + { + "epoch": 0.9131160990956987, + "grad_norm": 44.15625, + "learning_rate": 9.857325851369136e-06, + "loss": 17.1271, + "step": 50500 + }, + { + "epoch": 0.9132969141648265, + "grad_norm": 43.0, + "learning_rate": 9.857297599062476e-06, + "loss": 16.6066, + "step": 50510 + }, + { + "epoch": 0.9134777292339543, + "grad_norm": 42.09375, + "learning_rate": 9.857269346755817e-06, + "loss": 16.6022, + "step": 50520 + }, + { + "epoch": 0.9136585443030822, + "grad_norm": 43.65625, + "learning_rate": 9.857241094449157e-06, + "loss": 16.5399, + "step": 50530 + }, + { + "epoch": 0.9138393593722101, + "grad_norm": 41.21875, + "learning_rate": 9.857212842142498e-06, + "loss": 16.75, + "step": 50540 + }, + { + "epoch": 0.914020174441338, + "grad_norm": 44.21875, + "learning_rate": 9.857184589835837e-06, + "loss": 17.1319, + "step": 50550 + }, + { + "epoch": 0.9142009895104658, + "grad_norm": 44.46875, + "learning_rate": 9.857156337529178e-06, + "loss": 16.3081, + "step": 50560 + }, + { + "epoch": 0.9143818045795936, + "grad_norm": 42.53125, + "learning_rate": 9.85712808522252e-06, + "loss": 16.4195, + "step": 50570 + }, + { + "epoch": 0.9145626196487215, + "grad_norm": 47.9375, + "learning_rate": 9.857099832915859e-06, + "loss": 16.6389, + "step": 50580 + }, + { + "epoch": 0.9147434347178494, + "grad_norm": 44.40625, + "learning_rate": 9.8570715806092e-06, + "loss": 16.4462, + "step": 50590 + }, + { + "epoch": 0.9149242497869773, + "grad_norm": 46.78125, + "learning_rate": 9.85704332830254e-06, + "loss": 17.0027, + "step": 50600 + }, + { + "epoch": 0.9151050648561051, + "grad_norm": 48.78125, + "learning_rate": 9.85701507599588e-06, + "loss": 17.1624, + "step": 50610 + }, + { + "epoch": 0.9152858799252329, + "grad_norm": 42.5625, + "learning_rate": 9.856986823689221e-06, + "loss": 17.0643, + "step": 50620 + }, + { + "epoch": 0.9154666949943608, + "grad_norm": 43.78125, + "learning_rate": 9.85695857138256e-06, + "loss": 16.2225, + "step": 50630 + }, + { + "epoch": 0.9156475100634887, + "grad_norm": 45.375, + "learning_rate": 9.856930319075901e-06, + "loss": 16.9626, + "step": 50640 + }, + { + "epoch": 0.9158283251326166, + "grad_norm": 40.28125, + "learning_rate": 9.856902066769242e-06, + "loss": 17.0447, + "step": 50650 + }, + { + "epoch": 0.9160091402017444, + "grad_norm": 48.125, + "learning_rate": 9.856873814462584e-06, + "loss": 16.5066, + "step": 50660 + }, + { + "epoch": 0.9161899552708723, + "grad_norm": 45.6875, + "learning_rate": 9.856845562155923e-06, + "loss": 16.6187, + "step": 50670 + }, + { + "epoch": 0.9163707703400001, + "grad_norm": 46.34375, + "learning_rate": 9.856817309849263e-06, + "loss": 16.5057, + "step": 50680 + }, + { + "epoch": 0.916551585409128, + "grad_norm": 45.8125, + "learning_rate": 9.856789057542604e-06, + "loss": 16.7443, + "step": 50690 + }, + { + "epoch": 0.9167324004782559, + "grad_norm": 41.09375, + "learning_rate": 9.856760805235945e-06, + "loss": 16.4325, + "step": 50700 + }, + { + "epoch": 0.9169132155473837, + "grad_norm": 44.65625, + "learning_rate": 9.856732552929285e-06, + "loss": 16.9035, + "step": 50710 + }, + { + "epoch": 0.9170940306165116, + "grad_norm": 45.9375, + "learning_rate": 9.856704300622624e-06, + "loss": 16.5906, + "step": 50720 + }, + { + "epoch": 0.9172748456856394, + "grad_norm": 46.90625, + "learning_rate": 9.856676048315965e-06, + "loss": 16.8493, + "step": 50730 + }, + { + "epoch": 0.9174556607547673, + "grad_norm": 45.84375, + "learning_rate": 9.856647796009305e-06, + "loss": 16.6537, + "step": 50740 + }, + { + "epoch": 0.9176364758238952, + "grad_norm": 43.09375, + "learning_rate": 9.856619543702646e-06, + "loss": 16.8398, + "step": 50750 + }, + { + "epoch": 0.917817290893023, + "grad_norm": 43.3125, + "learning_rate": 9.856591291395987e-06, + "loss": 16.7299, + "step": 50760 + }, + { + "epoch": 0.9179981059621509, + "grad_norm": 41.625, + "learning_rate": 9.856563039089327e-06, + "loss": 16.2667, + "step": 50770 + }, + { + "epoch": 0.9181789210312787, + "grad_norm": 43.09375, + "learning_rate": 9.856534786782668e-06, + "loss": 16.7683, + "step": 50780 + }, + { + "epoch": 0.9183597361004066, + "grad_norm": 45.15625, + "learning_rate": 9.856506534476009e-06, + "loss": 16.4613, + "step": 50790 + }, + { + "epoch": 0.9185405511695345, + "grad_norm": 45.46875, + "learning_rate": 9.856478282169347e-06, + "loss": 16.8121, + "step": 50800 + }, + { + "epoch": 0.9187213662386623, + "grad_norm": 44.5, + "learning_rate": 9.856450029862688e-06, + "loss": 16.7824, + "step": 50810 + }, + { + "epoch": 0.9189021813077902, + "grad_norm": 46.375, + "learning_rate": 9.856421777556029e-06, + "loss": 16.6512, + "step": 50820 + }, + { + "epoch": 0.919082996376918, + "grad_norm": 43.0625, + "learning_rate": 9.856393525249371e-06, + "loss": 16.9636, + "step": 50830 + }, + { + "epoch": 0.919263811446046, + "grad_norm": 44.8125, + "learning_rate": 9.85636527294271e-06, + "loss": 16.3377, + "step": 50840 + }, + { + "epoch": 0.9194446265151738, + "grad_norm": 40.8125, + "learning_rate": 9.85633702063605e-06, + "loss": 16.2992, + "step": 50850 + }, + { + "epoch": 0.9196254415843016, + "grad_norm": 46.09375, + "learning_rate": 9.856308768329391e-06, + "loss": 16.8767, + "step": 50860 + }, + { + "epoch": 0.9198062566534295, + "grad_norm": 42.8125, + "learning_rate": 9.856280516022732e-06, + "loss": 17.1975, + "step": 50870 + }, + { + "epoch": 0.9199870717225573, + "grad_norm": 46.25, + "learning_rate": 9.856252263716072e-06, + "loss": 16.8003, + "step": 50880 + }, + { + "epoch": 0.9201678867916853, + "grad_norm": 42.46875, + "learning_rate": 9.856224011409411e-06, + "loss": 16.5021, + "step": 50890 + }, + { + "epoch": 0.9203487018608131, + "grad_norm": 43.28125, + "learning_rate": 9.856195759102752e-06, + "loss": 16.8396, + "step": 50900 + }, + { + "epoch": 0.9205295169299409, + "grad_norm": 41.78125, + "learning_rate": 9.856167506796093e-06, + "loss": 16.3, + "step": 50910 + }, + { + "epoch": 0.9207103319990688, + "grad_norm": 47.40625, + "learning_rate": 9.856139254489433e-06, + "loss": 17.3092, + "step": 50920 + }, + { + "epoch": 0.9208911470681966, + "grad_norm": 43.96875, + "learning_rate": 9.856111002182774e-06, + "loss": 17.0263, + "step": 50930 + }, + { + "epoch": 0.9210719621373246, + "grad_norm": 41.5625, + "learning_rate": 9.856082749876115e-06, + "loss": 16.6091, + "step": 50940 + }, + { + "epoch": 0.9212527772064524, + "grad_norm": 46.5, + "learning_rate": 9.856054497569455e-06, + "loss": 17.2658, + "step": 50950 + }, + { + "epoch": 0.9214335922755802, + "grad_norm": 43.1875, + "learning_rate": 9.856026245262796e-06, + "loss": 16.9387, + "step": 50960 + }, + { + "epoch": 0.9216144073447081, + "grad_norm": 42.1875, + "learning_rate": 9.855997992956136e-06, + "loss": 16.5608, + "step": 50970 + }, + { + "epoch": 0.9217952224138359, + "grad_norm": 44.375, + "learning_rate": 9.855969740649475e-06, + "loss": 16.4755, + "step": 50980 + }, + { + "epoch": 0.9219760374829639, + "grad_norm": 44.625, + "learning_rate": 9.855941488342816e-06, + "loss": 16.774, + "step": 50990 + }, + { + "epoch": 0.9221568525520917, + "grad_norm": 46.0, + "learning_rate": 9.855913236036157e-06, + "loss": 16.2279, + "step": 51000 + }, + { + "epoch": 0.9223376676212196, + "grad_norm": 43.25, + "learning_rate": 9.855884983729497e-06, + "loss": 16.9542, + "step": 51010 + }, + { + "epoch": 0.9225184826903474, + "grad_norm": 46.4375, + "learning_rate": 9.855856731422838e-06, + "loss": 17.0627, + "step": 51020 + }, + { + "epoch": 0.9226992977594752, + "grad_norm": 45.15625, + "learning_rate": 9.855828479116178e-06, + "loss": 16.5087, + "step": 51030 + }, + { + "epoch": 0.9228801128286032, + "grad_norm": 45.21875, + "learning_rate": 9.855800226809519e-06, + "loss": 16.333, + "step": 51040 + }, + { + "epoch": 0.923060927897731, + "grad_norm": 44.90625, + "learning_rate": 9.85577197450286e-06, + "loss": 16.7786, + "step": 51050 + }, + { + "epoch": 0.9232417429668589, + "grad_norm": 43.59375, + "learning_rate": 9.855743722196199e-06, + "loss": 16.6454, + "step": 51060 + }, + { + "epoch": 0.9234225580359867, + "grad_norm": 44.71875, + "learning_rate": 9.85571546988954e-06, + "loss": 16.572, + "step": 51070 + }, + { + "epoch": 0.9236033731051145, + "grad_norm": 41.09375, + "learning_rate": 9.85568721758288e-06, + "loss": 16.4866, + "step": 51080 + }, + { + "epoch": 0.9237841881742425, + "grad_norm": 47.84375, + "learning_rate": 9.85565896527622e-06, + "loss": 16.9147, + "step": 51090 + }, + { + "epoch": 0.9239650032433703, + "grad_norm": 44.46875, + "learning_rate": 9.855630712969561e-06, + "loss": 16.818, + "step": 51100 + }, + { + "epoch": 0.9241458183124982, + "grad_norm": 47.375, + "learning_rate": 9.855602460662902e-06, + "loss": 16.7599, + "step": 51110 + }, + { + "epoch": 0.924326633381626, + "grad_norm": 44.21875, + "learning_rate": 9.855574208356242e-06, + "loss": 16.9495, + "step": 51120 + }, + { + "epoch": 0.9245074484507538, + "grad_norm": 44.65625, + "learning_rate": 9.855545956049583e-06, + "loss": 16.7892, + "step": 51130 + }, + { + "epoch": 0.9246882635198818, + "grad_norm": 44.75, + "learning_rate": 9.855517703742924e-06, + "loss": 16.4863, + "step": 51140 + }, + { + "epoch": 0.9248690785890096, + "grad_norm": 46.03125, + "learning_rate": 9.855489451436262e-06, + "loss": 16.3058, + "step": 51150 + }, + { + "epoch": 0.9250498936581375, + "grad_norm": 44.1875, + "learning_rate": 9.855461199129603e-06, + "loss": 16.1729, + "step": 51160 + }, + { + "epoch": 0.9252307087272653, + "grad_norm": 43.625, + "learning_rate": 9.855432946822944e-06, + "loss": 16.9115, + "step": 51170 + }, + { + "epoch": 0.9254115237963932, + "grad_norm": 44.5, + "learning_rate": 9.855404694516284e-06, + "loss": 16.577, + "step": 51180 + }, + { + "epoch": 0.9255923388655211, + "grad_norm": 46.84375, + "learning_rate": 9.855376442209625e-06, + "loss": 16.3869, + "step": 51190 + }, + { + "epoch": 0.9257731539346489, + "grad_norm": 45.53125, + "learning_rate": 9.855348189902966e-06, + "loss": 16.6404, + "step": 51200 + }, + { + "epoch": 0.9259539690037768, + "grad_norm": 45.46875, + "learning_rate": 9.855319937596306e-06, + "loss": 16.7318, + "step": 51210 + }, + { + "epoch": 0.9261347840729046, + "grad_norm": 42.0, + "learning_rate": 9.855291685289647e-06, + "loss": 16.1691, + "step": 51220 + }, + { + "epoch": 0.9263155991420325, + "grad_norm": 43.40625, + "learning_rate": 9.855263432982986e-06, + "loss": 16.7121, + "step": 51230 + }, + { + "epoch": 0.9264964142111604, + "grad_norm": 44.0625, + "learning_rate": 9.855235180676326e-06, + "loss": 16.653, + "step": 51240 + }, + { + "epoch": 0.9266772292802882, + "grad_norm": 44.875, + "learning_rate": 9.855206928369667e-06, + "loss": 16.7853, + "step": 51250 + }, + { + "epoch": 0.9268580443494161, + "grad_norm": 42.4375, + "learning_rate": 9.855178676063008e-06, + "loss": 16.5499, + "step": 51260 + }, + { + "epoch": 0.9270388594185439, + "grad_norm": 41.375, + "learning_rate": 9.855150423756348e-06, + "loss": 16.4913, + "step": 51270 + }, + { + "epoch": 0.9272196744876718, + "grad_norm": 43.40625, + "learning_rate": 9.855122171449689e-06, + "loss": 16.9152, + "step": 51280 + }, + { + "epoch": 0.9274004895567997, + "grad_norm": 43.53125, + "learning_rate": 9.85509391914303e-06, + "loss": 16.3459, + "step": 51290 + }, + { + "epoch": 0.9275813046259275, + "grad_norm": 42.75, + "learning_rate": 9.85506566683637e-06, + "loss": 16.257, + "step": 51300 + }, + { + "epoch": 0.9277621196950554, + "grad_norm": 43.0, + "learning_rate": 9.85503741452971e-06, + "loss": 16.853, + "step": 51310 + }, + { + "epoch": 0.9279429347641832, + "grad_norm": 48.90625, + "learning_rate": 9.85500916222305e-06, + "loss": 16.3804, + "step": 51320 + }, + { + "epoch": 0.9281237498333111, + "grad_norm": 40.75, + "learning_rate": 9.85498090991639e-06, + "loss": 16.1179, + "step": 51330 + }, + { + "epoch": 0.928304564902439, + "grad_norm": 45.3125, + "learning_rate": 9.854952657609731e-06, + "loss": 16.7624, + "step": 51340 + }, + { + "epoch": 0.9284853799715669, + "grad_norm": 43.78125, + "learning_rate": 9.854924405303072e-06, + "loss": 16.7898, + "step": 51350 + }, + { + "epoch": 0.9286661950406947, + "grad_norm": 45.40625, + "learning_rate": 9.854896152996412e-06, + "loss": 16.8738, + "step": 51360 + }, + { + "epoch": 0.9288470101098225, + "grad_norm": 43.25, + "learning_rate": 9.854867900689753e-06, + "loss": 16.4131, + "step": 51370 + }, + { + "epoch": 0.9290278251789504, + "grad_norm": 44.03125, + "learning_rate": 9.854839648383093e-06, + "loss": 16.6141, + "step": 51380 + }, + { + "epoch": 0.9292086402480783, + "grad_norm": 43.28125, + "learning_rate": 9.854811396076434e-06, + "loss": 16.2441, + "step": 51390 + }, + { + "epoch": 0.9293894553172062, + "grad_norm": 43.53125, + "learning_rate": 9.854783143769775e-06, + "loss": 16.764, + "step": 51400 + }, + { + "epoch": 0.929570270386334, + "grad_norm": 46.9375, + "learning_rate": 9.854754891463114e-06, + "loss": 17.3107, + "step": 51410 + }, + { + "epoch": 0.9297510854554618, + "grad_norm": 46.21875, + "learning_rate": 9.854726639156454e-06, + "loss": 16.2477, + "step": 51420 + }, + { + "epoch": 0.9299319005245897, + "grad_norm": 43.96875, + "learning_rate": 9.854698386849795e-06, + "loss": 16.8256, + "step": 51430 + }, + { + "epoch": 0.9301127155937176, + "grad_norm": 45.75, + "learning_rate": 9.854670134543135e-06, + "loss": 16.806, + "step": 51440 + }, + { + "epoch": 0.9302935306628455, + "grad_norm": 44.40625, + "learning_rate": 9.854641882236476e-06, + "loss": 16.7943, + "step": 51450 + }, + { + "epoch": 0.9304743457319733, + "grad_norm": 46.4375, + "learning_rate": 9.854613629929817e-06, + "loss": 16.4803, + "step": 51460 + }, + { + "epoch": 0.9306551608011011, + "grad_norm": 41.9375, + "learning_rate": 9.854585377623157e-06, + "loss": 17.0244, + "step": 51470 + }, + { + "epoch": 0.930835975870229, + "grad_norm": 44.0625, + "learning_rate": 9.854557125316498e-06, + "loss": 16.4517, + "step": 51480 + }, + { + "epoch": 0.9310167909393569, + "grad_norm": 44.84375, + "learning_rate": 9.854528873009837e-06, + "loss": 16.8978, + "step": 51490 + }, + { + "epoch": 0.9311976060084848, + "grad_norm": 46.6875, + "learning_rate": 9.854500620703178e-06, + "loss": 16.7435, + "step": 51500 + }, + { + "epoch": 0.9313784210776126, + "grad_norm": 44.90625, + "learning_rate": 9.854472368396518e-06, + "loss": 16.402, + "step": 51510 + }, + { + "epoch": 0.9315592361467405, + "grad_norm": 48.34375, + "learning_rate": 9.854444116089859e-06, + "loss": 16.7344, + "step": 51520 + }, + { + "epoch": 0.9317400512158683, + "grad_norm": 44.40625, + "learning_rate": 9.8544158637832e-06, + "loss": 16.4329, + "step": 51530 + }, + { + "epoch": 0.9319208662849962, + "grad_norm": 42.4375, + "learning_rate": 9.85438761147654e-06, + "loss": 16.3157, + "step": 51540 + }, + { + "epoch": 0.9321016813541241, + "grad_norm": 42.625, + "learning_rate": 9.85435935916988e-06, + "loss": 16.9422, + "step": 51550 + }, + { + "epoch": 0.9322824964232519, + "grad_norm": 44.09375, + "learning_rate": 9.854331106863221e-06, + "loss": 16.4778, + "step": 51560 + }, + { + "epoch": 0.9324633114923798, + "grad_norm": 44.8125, + "learning_rate": 9.854302854556562e-06, + "loss": 16.7903, + "step": 51570 + }, + { + "epoch": 0.9326441265615076, + "grad_norm": 48.03125, + "learning_rate": 9.8542746022499e-06, + "loss": 16.8277, + "step": 51580 + }, + { + "epoch": 0.9328249416306355, + "grad_norm": 44.15625, + "learning_rate": 9.854246349943241e-06, + "loss": 16.7422, + "step": 51590 + }, + { + "epoch": 0.9330057566997634, + "grad_norm": 42.5, + "learning_rate": 9.854218097636582e-06, + "loss": 16.8482, + "step": 51600 + }, + { + "epoch": 0.9331865717688912, + "grad_norm": 45.625, + "learning_rate": 9.854189845329923e-06, + "loss": 16.5671, + "step": 51610 + }, + { + "epoch": 0.9333673868380191, + "grad_norm": 42.625, + "learning_rate": 9.854161593023263e-06, + "loss": 16.6159, + "step": 51620 + }, + { + "epoch": 0.9335482019071469, + "grad_norm": 44.03125, + "learning_rate": 9.854133340716604e-06, + "loss": 16.8276, + "step": 51630 + }, + { + "epoch": 0.9337290169762748, + "grad_norm": 41.96875, + "learning_rate": 9.854105088409945e-06, + "loss": 16.4779, + "step": 51640 + }, + { + "epoch": 0.9339098320454027, + "grad_norm": 43.78125, + "learning_rate": 9.854076836103285e-06, + "loss": 16.5682, + "step": 51650 + }, + { + "epoch": 0.9340906471145305, + "grad_norm": 42.96875, + "learning_rate": 9.854048583796624e-06, + "loss": 16.5536, + "step": 51660 + }, + { + "epoch": 0.9342714621836584, + "grad_norm": 49.0625, + "learning_rate": 9.854020331489965e-06, + "loss": 16.8541, + "step": 51670 + }, + { + "epoch": 0.9344522772527862, + "grad_norm": 44.1875, + "learning_rate": 9.853992079183305e-06, + "loss": 17.2924, + "step": 51680 + }, + { + "epoch": 0.9346330923219142, + "grad_norm": 45.8125, + "learning_rate": 9.853963826876646e-06, + "loss": 16.4853, + "step": 51690 + }, + { + "epoch": 0.934813907391042, + "grad_norm": 43.71875, + "learning_rate": 9.853935574569987e-06, + "loss": 16.4445, + "step": 51700 + }, + { + "epoch": 0.9349947224601698, + "grad_norm": 46.5625, + "learning_rate": 9.853907322263327e-06, + "loss": 16.7954, + "step": 51710 + }, + { + "epoch": 0.9351755375292977, + "grad_norm": 45.21875, + "learning_rate": 9.853879069956668e-06, + "loss": 16.6579, + "step": 51720 + }, + { + "epoch": 0.9353563525984255, + "grad_norm": 46.5, + "learning_rate": 9.853850817650008e-06, + "loss": 17.1303, + "step": 51730 + }, + { + "epoch": 0.9355371676675535, + "grad_norm": 44.75, + "learning_rate": 9.853822565343349e-06, + "loss": 16.8715, + "step": 51740 + }, + { + "epoch": 0.9357179827366813, + "grad_norm": 43.75, + "learning_rate": 9.853794313036688e-06, + "loss": 16.7436, + "step": 51750 + }, + { + "epoch": 0.9358987978058091, + "grad_norm": 45.5, + "learning_rate": 9.853766060730029e-06, + "loss": 16.4995, + "step": 51760 + }, + { + "epoch": 0.936079612874937, + "grad_norm": 42.40625, + "learning_rate": 9.85373780842337e-06, + "loss": 16.6631, + "step": 51770 + }, + { + "epoch": 0.9362604279440648, + "grad_norm": 41.625, + "learning_rate": 9.85370955611671e-06, + "loss": 16.1694, + "step": 51780 + }, + { + "epoch": 0.9364412430131928, + "grad_norm": 43.40625, + "learning_rate": 9.85368130381005e-06, + "loss": 16.9699, + "step": 51790 + }, + { + "epoch": 0.9366220580823206, + "grad_norm": 44.78125, + "learning_rate": 9.853653051503391e-06, + "loss": 16.5178, + "step": 51800 + }, + { + "epoch": 0.9368028731514484, + "grad_norm": 42.875, + "learning_rate": 9.853624799196732e-06, + "loss": 16.3057, + "step": 51810 + }, + { + "epoch": 0.9369836882205763, + "grad_norm": 45.28125, + "learning_rate": 9.853596546890072e-06, + "loss": 16.699, + "step": 51820 + }, + { + "epoch": 0.9371645032897041, + "grad_norm": 41.8125, + "learning_rate": 9.853568294583413e-06, + "loss": 16.8024, + "step": 51830 + }, + { + "epoch": 0.9373453183588321, + "grad_norm": 41.59375, + "learning_rate": 9.853540042276752e-06, + "loss": 16.6078, + "step": 51840 + }, + { + "epoch": 0.9375261334279599, + "grad_norm": 47.21875, + "learning_rate": 9.853511789970093e-06, + "loss": 16.8264, + "step": 51850 + }, + { + "epoch": 0.9377069484970878, + "grad_norm": 44.9375, + "learning_rate": 9.853483537663433e-06, + "loss": 16.8107, + "step": 51860 + }, + { + "epoch": 0.9378877635662156, + "grad_norm": 43.34375, + "learning_rate": 9.853455285356774e-06, + "loss": 16.7995, + "step": 51870 + }, + { + "epoch": 0.9380685786353434, + "grad_norm": 44.875, + "learning_rate": 9.853427033050114e-06, + "loss": 16.6142, + "step": 51880 + }, + { + "epoch": 0.9382493937044714, + "grad_norm": 43.6875, + "learning_rate": 9.853398780743455e-06, + "loss": 16.7819, + "step": 51890 + }, + { + "epoch": 0.9384302087735992, + "grad_norm": 44.125, + "learning_rate": 9.853370528436796e-06, + "loss": 16.7242, + "step": 51900 + }, + { + "epoch": 0.9386110238427271, + "grad_norm": 43.84375, + "learning_rate": 9.853342276130136e-06, + "loss": 16.6088, + "step": 51910 + }, + { + "epoch": 0.9387918389118549, + "grad_norm": 47.1875, + "learning_rate": 9.853314023823475e-06, + "loss": 16.1789, + "step": 51920 + }, + { + "epoch": 0.9389726539809827, + "grad_norm": 43.9375, + "learning_rate": 9.853285771516816e-06, + "loss": 16.3855, + "step": 51930 + }, + { + "epoch": 0.9391534690501107, + "grad_norm": 45.8125, + "learning_rate": 9.853257519210156e-06, + "loss": 17.0041, + "step": 51940 + }, + { + "epoch": 0.9393342841192385, + "grad_norm": 41.3125, + "learning_rate": 9.853229266903497e-06, + "loss": 16.7801, + "step": 51950 + }, + { + "epoch": 0.9395150991883664, + "grad_norm": 42.84375, + "learning_rate": 9.853201014596838e-06, + "loss": 16.7844, + "step": 51960 + }, + { + "epoch": 0.9396959142574942, + "grad_norm": 42.125, + "learning_rate": 9.853172762290178e-06, + "loss": 16.6697, + "step": 51970 + }, + { + "epoch": 0.939876729326622, + "grad_norm": 42.625, + "learning_rate": 9.853144509983519e-06, + "loss": 16.4901, + "step": 51980 + }, + { + "epoch": 0.94005754439575, + "grad_norm": 44.375, + "learning_rate": 9.85311625767686e-06, + "loss": 16.4376, + "step": 51990 + }, + { + "epoch": 0.9402383594648778, + "grad_norm": 41.625, + "learning_rate": 9.8530880053702e-06, + "loss": 16.743, + "step": 52000 + }, + { + "epoch": 0.9404191745340057, + "grad_norm": 45.125, + "learning_rate": 9.853059753063539e-06, + "loss": 16.7142, + "step": 52010 + }, + { + "epoch": 0.9405999896031335, + "grad_norm": 47.90625, + "learning_rate": 9.85303150075688e-06, + "loss": 16.6547, + "step": 52020 + }, + { + "epoch": 0.9407808046722614, + "grad_norm": 44.53125, + "learning_rate": 9.85300324845022e-06, + "loss": 17.2, + "step": 52030 + }, + { + "epoch": 0.9409616197413893, + "grad_norm": 46.03125, + "learning_rate": 9.852974996143561e-06, + "loss": 16.5735, + "step": 52040 + }, + { + "epoch": 0.9411424348105171, + "grad_norm": 46.0625, + "learning_rate": 9.852946743836902e-06, + "loss": 16.9435, + "step": 52050 + }, + { + "epoch": 0.941323249879645, + "grad_norm": 45.84375, + "learning_rate": 9.852918491530242e-06, + "loss": 16.5768, + "step": 52060 + }, + { + "epoch": 0.9415040649487728, + "grad_norm": 43.09375, + "learning_rate": 9.852890239223583e-06, + "loss": 16.8057, + "step": 52070 + }, + { + "epoch": 0.9416848800179007, + "grad_norm": 44.96875, + "learning_rate": 9.852861986916923e-06, + "loss": 16.56, + "step": 52080 + }, + { + "epoch": 0.9418656950870286, + "grad_norm": 43.84375, + "learning_rate": 9.852833734610262e-06, + "loss": 17.1698, + "step": 52090 + }, + { + "epoch": 0.9420465101561564, + "grad_norm": 45.8125, + "learning_rate": 9.852805482303603e-06, + "loss": 16.6837, + "step": 52100 + }, + { + "epoch": 0.9422273252252843, + "grad_norm": 45.78125, + "learning_rate": 9.852777229996944e-06, + "loss": 16.8276, + "step": 52110 + }, + { + "epoch": 0.9424081402944121, + "grad_norm": 42.4375, + "learning_rate": 9.852748977690284e-06, + "loss": 16.3848, + "step": 52120 + }, + { + "epoch": 0.94258895536354, + "grad_norm": 43.28125, + "learning_rate": 9.852720725383625e-06, + "loss": 16.8509, + "step": 52130 + }, + { + "epoch": 0.9427697704326679, + "grad_norm": 44.1875, + "learning_rate": 9.852692473076965e-06, + "loss": 16.6207, + "step": 52140 + }, + { + "epoch": 0.9429505855017957, + "grad_norm": 44.53125, + "learning_rate": 9.852664220770306e-06, + "loss": 16.405, + "step": 52150 + }, + { + "epoch": 0.9431314005709236, + "grad_norm": 43.96875, + "learning_rate": 9.852635968463647e-06, + "loss": 16.8411, + "step": 52160 + }, + { + "epoch": 0.9433122156400514, + "grad_norm": 45.96875, + "learning_rate": 9.852607716156987e-06, + "loss": 16.647, + "step": 52170 + }, + { + "epoch": 0.9434930307091793, + "grad_norm": 47.15625, + "learning_rate": 9.852579463850326e-06, + "loss": 16.826, + "step": 52180 + }, + { + "epoch": 0.9436738457783072, + "grad_norm": 42.59375, + "learning_rate": 9.852551211543667e-06, + "loss": 16.9083, + "step": 52190 + }, + { + "epoch": 0.9438546608474351, + "grad_norm": 45.46875, + "learning_rate": 9.852522959237008e-06, + "loss": 17.141, + "step": 52200 + }, + { + "epoch": 0.9440354759165629, + "grad_norm": 42.875, + "learning_rate": 9.852494706930348e-06, + "loss": 16.7258, + "step": 52210 + }, + { + "epoch": 0.9442162909856907, + "grad_norm": 44.34375, + "learning_rate": 9.852466454623689e-06, + "loss": 16.6493, + "step": 52220 + }, + { + "epoch": 0.9443971060548186, + "grad_norm": 43.375, + "learning_rate": 9.85243820231703e-06, + "loss": 16.6304, + "step": 52230 + }, + { + "epoch": 0.9445779211239465, + "grad_norm": 45.25, + "learning_rate": 9.85240995001037e-06, + "loss": 16.5853, + "step": 52240 + }, + { + "epoch": 0.9447587361930744, + "grad_norm": 43.625, + "learning_rate": 9.85238169770371e-06, + "loss": 17.031, + "step": 52250 + }, + { + "epoch": 0.9449395512622022, + "grad_norm": 45.625, + "learning_rate": 9.852353445397051e-06, + "loss": 16.8603, + "step": 52260 + }, + { + "epoch": 0.94512036633133, + "grad_norm": 43.6875, + "learning_rate": 9.85232519309039e-06, + "loss": 16.9095, + "step": 52270 + }, + { + "epoch": 0.9453011814004579, + "grad_norm": 43.53125, + "learning_rate": 9.85229694078373e-06, + "loss": 16.8246, + "step": 52280 + }, + { + "epoch": 0.9454819964695858, + "grad_norm": 45.3125, + "learning_rate": 9.852268688477071e-06, + "loss": 16.8132, + "step": 52290 + }, + { + "epoch": 0.9456628115387137, + "grad_norm": 48.875, + "learning_rate": 9.852240436170412e-06, + "loss": 16.8849, + "step": 52300 + }, + { + "epoch": 0.9458436266078415, + "grad_norm": 44.75, + "learning_rate": 9.852212183863753e-06, + "loss": 16.3753, + "step": 52310 + }, + { + "epoch": 0.9460244416769693, + "grad_norm": 42.59375, + "learning_rate": 9.852183931557093e-06, + "loss": 16.4101, + "step": 52320 + }, + { + "epoch": 0.9462052567460972, + "grad_norm": 47.6875, + "learning_rate": 9.852155679250434e-06, + "loss": 16.9043, + "step": 52330 + }, + { + "epoch": 0.946386071815225, + "grad_norm": 45.78125, + "learning_rate": 9.852127426943775e-06, + "loss": 16.5976, + "step": 52340 + }, + { + "epoch": 0.946566886884353, + "grad_norm": 45.0625, + "learning_rate": 9.852099174637113e-06, + "loss": 16.648, + "step": 52350 + }, + { + "epoch": 0.9467477019534808, + "grad_norm": 45.46875, + "learning_rate": 9.852070922330454e-06, + "loss": 16.923, + "step": 52360 + }, + { + "epoch": 0.9469285170226087, + "grad_norm": 49.625, + "learning_rate": 9.852042670023795e-06, + "loss": 17.094, + "step": 52370 + }, + { + "epoch": 0.9471093320917365, + "grad_norm": 45.75, + "learning_rate": 9.852014417717135e-06, + "loss": 17.2729, + "step": 52380 + }, + { + "epoch": 0.9472901471608643, + "grad_norm": 44.53125, + "learning_rate": 9.851986165410476e-06, + "loss": 16.6978, + "step": 52390 + }, + { + "epoch": 0.9474709622299923, + "grad_norm": 44.34375, + "learning_rate": 9.851957913103815e-06, + "loss": 16.9399, + "step": 52400 + }, + { + "epoch": 0.9476517772991201, + "grad_norm": 44.8125, + "learning_rate": 9.851929660797157e-06, + "loss": 17.1842, + "step": 52410 + }, + { + "epoch": 0.947832592368248, + "grad_norm": 45.09375, + "learning_rate": 9.851901408490498e-06, + "loss": 17.0201, + "step": 52420 + }, + { + "epoch": 0.9480134074373758, + "grad_norm": 46.3125, + "learning_rate": 9.851873156183838e-06, + "loss": 16.8392, + "step": 52430 + }, + { + "epoch": 0.9481942225065036, + "grad_norm": 44.6875, + "learning_rate": 9.851844903877177e-06, + "loss": 16.222, + "step": 52440 + }, + { + "epoch": 0.9483750375756316, + "grad_norm": 46.53125, + "learning_rate": 9.851816651570518e-06, + "loss": 16.7396, + "step": 52450 + }, + { + "epoch": 0.9485558526447594, + "grad_norm": 44.375, + "learning_rate": 9.851788399263859e-06, + "loss": 16.944, + "step": 52460 + }, + { + "epoch": 0.9487366677138873, + "grad_norm": 43.8125, + "learning_rate": 9.8517601469572e-06, + "loss": 16.3753, + "step": 52470 + }, + { + "epoch": 0.9489174827830151, + "grad_norm": 43.625, + "learning_rate": 9.85173189465054e-06, + "loss": 16.7334, + "step": 52480 + }, + { + "epoch": 0.949098297852143, + "grad_norm": 43.4375, + "learning_rate": 9.851703642343879e-06, + "loss": 16.5398, + "step": 52490 + }, + { + "epoch": 0.9492791129212709, + "grad_norm": 45.28125, + "learning_rate": 9.851675390037221e-06, + "loss": 16.5976, + "step": 52500 + }, + { + "epoch": 0.9494599279903987, + "grad_norm": 42.5, + "learning_rate": 9.851647137730562e-06, + "loss": 16.487, + "step": 52510 + }, + { + "epoch": 0.9496407430595266, + "grad_norm": 43.96875, + "learning_rate": 9.8516188854239e-06, + "loss": 17.0914, + "step": 52520 + }, + { + "epoch": 0.9498215581286544, + "grad_norm": 43.5625, + "learning_rate": 9.851590633117241e-06, + "loss": 16.472, + "step": 52530 + }, + { + "epoch": 0.9500023731977824, + "grad_norm": 46.90625, + "learning_rate": 9.851562380810582e-06, + "loss": 16.7584, + "step": 52540 + }, + { + "epoch": 0.9501831882669102, + "grad_norm": 44.75, + "learning_rate": 9.851534128503923e-06, + "loss": 16.8644, + "step": 52550 + }, + { + "epoch": 0.950364003336038, + "grad_norm": 43.875, + "learning_rate": 9.851505876197263e-06, + "loss": 16.5356, + "step": 52560 + }, + { + "epoch": 0.9505448184051659, + "grad_norm": 43.875, + "learning_rate": 9.851477623890604e-06, + "loss": 16.2732, + "step": 52570 + }, + { + "epoch": 0.9507256334742937, + "grad_norm": 44.84375, + "learning_rate": 9.851449371583944e-06, + "loss": 16.608, + "step": 52580 + }, + { + "epoch": 0.9509064485434217, + "grad_norm": 42.40625, + "learning_rate": 9.851421119277285e-06, + "loss": 16.5287, + "step": 52590 + }, + { + "epoch": 0.9510872636125495, + "grad_norm": 45.71875, + "learning_rate": 9.851392866970626e-06, + "loss": 17.0554, + "step": 52600 + }, + { + "epoch": 0.9512680786816773, + "grad_norm": 43.8125, + "learning_rate": 9.851364614663965e-06, + "loss": 16.7669, + "step": 52610 + }, + { + "epoch": 0.9514488937508052, + "grad_norm": 43.65625, + "learning_rate": 9.851336362357305e-06, + "loss": 16.934, + "step": 52620 + }, + { + "epoch": 0.951629708819933, + "grad_norm": 44.78125, + "learning_rate": 9.851308110050646e-06, + "loss": 15.9467, + "step": 52630 + }, + { + "epoch": 0.951810523889061, + "grad_norm": 45.28125, + "learning_rate": 9.851279857743986e-06, + "loss": 16.2832, + "step": 52640 + }, + { + "epoch": 0.9519913389581888, + "grad_norm": 45.375, + "learning_rate": 9.851251605437327e-06, + "loss": 16.6556, + "step": 52650 + }, + { + "epoch": 0.9521721540273166, + "grad_norm": 44.65625, + "learning_rate": 9.851223353130666e-06, + "loss": 16.8088, + "step": 52660 + }, + { + "epoch": 0.9523529690964445, + "grad_norm": 46.03125, + "learning_rate": 9.851195100824008e-06, + "loss": 16.5832, + "step": 52670 + }, + { + "epoch": 0.9525337841655723, + "grad_norm": 45.15625, + "learning_rate": 9.851166848517349e-06, + "loss": 17.0086, + "step": 52680 + }, + { + "epoch": 0.9527145992347003, + "grad_norm": 45.71875, + "learning_rate": 9.851138596210688e-06, + "loss": 16.6411, + "step": 52690 + }, + { + "epoch": 0.9528954143038281, + "grad_norm": 46.34375, + "learning_rate": 9.851110343904028e-06, + "loss": 16.5188, + "step": 52700 + }, + { + "epoch": 0.953076229372956, + "grad_norm": 44.0625, + "learning_rate": 9.851082091597369e-06, + "loss": 16.6827, + "step": 52710 + }, + { + "epoch": 0.9532570444420838, + "grad_norm": 46.4375, + "learning_rate": 9.85105383929071e-06, + "loss": 16.5534, + "step": 52720 + }, + { + "epoch": 0.9534378595112116, + "grad_norm": 44.40625, + "learning_rate": 9.85102558698405e-06, + "loss": 16.7903, + "step": 52730 + }, + { + "epoch": 0.9536186745803396, + "grad_norm": 43.0625, + "learning_rate": 9.850997334677391e-06, + "loss": 16.7192, + "step": 52740 + }, + { + "epoch": 0.9537994896494674, + "grad_norm": 45.84375, + "learning_rate": 9.85096908237073e-06, + "loss": 16.6573, + "step": 52750 + }, + { + "epoch": 0.9539803047185953, + "grad_norm": 45.625, + "learning_rate": 9.850940830064072e-06, + "loss": 16.2963, + "step": 52760 + }, + { + "epoch": 0.9541611197877231, + "grad_norm": 45.09375, + "learning_rate": 9.850912577757413e-06, + "loss": 16.2455, + "step": 52770 + }, + { + "epoch": 0.9543419348568509, + "grad_norm": 48.15625, + "learning_rate": 9.850884325450752e-06, + "loss": 16.774, + "step": 52780 + }, + { + "epoch": 0.9545227499259789, + "grad_norm": 42.40625, + "learning_rate": 9.850856073144092e-06, + "loss": 16.7267, + "step": 52790 + }, + { + "epoch": 0.9547035649951067, + "grad_norm": 43.46875, + "learning_rate": 9.850827820837433e-06, + "loss": 16.56, + "step": 52800 + }, + { + "epoch": 0.9548843800642346, + "grad_norm": 41.96875, + "learning_rate": 9.850799568530774e-06, + "loss": 17.191, + "step": 52810 + }, + { + "epoch": 0.9550651951333624, + "grad_norm": 43.75, + "learning_rate": 9.850771316224114e-06, + "loss": 16.2504, + "step": 52820 + }, + { + "epoch": 0.9552460102024902, + "grad_norm": 44.59375, + "learning_rate": 9.850743063917453e-06, + "loss": 16.4462, + "step": 52830 + }, + { + "epoch": 0.9554268252716182, + "grad_norm": 44.25, + "learning_rate": 9.850714811610794e-06, + "loss": 16.653, + "step": 52840 + }, + { + "epoch": 0.955607640340746, + "grad_norm": 41.15625, + "learning_rate": 9.850686559304136e-06, + "loss": 16.4501, + "step": 52850 + }, + { + "epoch": 0.9557884554098739, + "grad_norm": 45.09375, + "learning_rate": 9.850658306997477e-06, + "loss": 16.4311, + "step": 52860 + }, + { + "epoch": 0.9559692704790017, + "grad_norm": 42.5, + "learning_rate": 9.850630054690816e-06, + "loss": 16.4035, + "step": 52870 + }, + { + "epoch": 0.9561500855481296, + "grad_norm": 43.46875, + "learning_rate": 9.850601802384156e-06, + "loss": 16.8654, + "step": 52880 + }, + { + "epoch": 0.9563309006172575, + "grad_norm": 43.125, + "learning_rate": 9.850573550077497e-06, + "loss": 16.8872, + "step": 52890 + }, + { + "epoch": 0.9565117156863853, + "grad_norm": 45.59375, + "learning_rate": 9.850545297770838e-06, + "loss": 17.3379, + "step": 52900 + }, + { + "epoch": 0.9566925307555132, + "grad_norm": 47.5, + "learning_rate": 9.850517045464178e-06, + "loss": 16.7555, + "step": 52910 + }, + { + "epoch": 0.956873345824641, + "grad_norm": 47.375, + "learning_rate": 9.850488793157517e-06, + "loss": 16.522, + "step": 52920 + }, + { + "epoch": 0.9570541608937689, + "grad_norm": 44.65625, + "learning_rate": 9.85046054085086e-06, + "loss": 16.7725, + "step": 52930 + }, + { + "epoch": 0.9572349759628968, + "grad_norm": 42.53125, + "learning_rate": 9.8504322885442e-06, + "loss": 16.493, + "step": 52940 + }, + { + "epoch": 0.9574157910320246, + "grad_norm": 43.34375, + "learning_rate": 9.850404036237539e-06, + "loss": 16.8442, + "step": 52950 + }, + { + "epoch": 0.9575966061011525, + "grad_norm": 47.34375, + "learning_rate": 9.85037578393088e-06, + "loss": 16.7463, + "step": 52960 + }, + { + "epoch": 0.9577774211702803, + "grad_norm": 44.40625, + "learning_rate": 9.85034753162422e-06, + "loss": 16.7918, + "step": 52970 + }, + { + "epoch": 0.9579582362394082, + "grad_norm": 43.46875, + "learning_rate": 9.85031927931756e-06, + "loss": 16.6675, + "step": 52980 + }, + { + "epoch": 0.958139051308536, + "grad_norm": 40.71875, + "learning_rate": 9.850291027010901e-06, + "loss": 16.662, + "step": 52990 + }, + { + "epoch": 0.9583198663776639, + "grad_norm": 42.09375, + "learning_rate": 9.85026277470424e-06, + "loss": 16.813, + "step": 53000 + }, + { + "epoch": 0.9585006814467918, + "grad_norm": 44.40625, + "learning_rate": 9.850234522397581e-06, + "loss": 16.8846, + "step": 53010 + }, + { + "epoch": 0.9586814965159196, + "grad_norm": 45.625, + "learning_rate": 9.850206270090923e-06, + "loss": 17.0815, + "step": 53020 + }, + { + "epoch": 0.9588623115850475, + "grad_norm": 47.40625, + "learning_rate": 9.850178017784264e-06, + "loss": 16.2415, + "step": 53030 + }, + { + "epoch": 0.9590431266541753, + "grad_norm": 43.3125, + "learning_rate": 9.850149765477603e-06, + "loss": 16.5826, + "step": 53040 + }, + { + "epoch": 0.9592239417233033, + "grad_norm": 45.4375, + "learning_rate": 9.850121513170943e-06, + "loss": 16.5556, + "step": 53050 + }, + { + "epoch": 0.9594047567924311, + "grad_norm": 44.0625, + "learning_rate": 9.850093260864284e-06, + "loss": 16.7386, + "step": 53060 + }, + { + "epoch": 0.9595855718615589, + "grad_norm": 41.21875, + "learning_rate": 9.850065008557625e-06, + "loss": 16.5449, + "step": 53070 + }, + { + "epoch": 0.9597663869306868, + "grad_norm": 51.65625, + "learning_rate": 9.850036756250965e-06, + "loss": 16.669, + "step": 53080 + }, + { + "epoch": 0.9599472019998146, + "grad_norm": 43.65625, + "learning_rate": 9.850008503944304e-06, + "loss": 16.748, + "step": 53090 + }, + { + "epoch": 0.9601280170689426, + "grad_norm": 45.40625, + "learning_rate": 9.849980251637645e-06, + "loss": 17.1109, + "step": 53100 + }, + { + "epoch": 0.9603088321380704, + "grad_norm": 40.03125, + "learning_rate": 9.849951999330987e-06, + "loss": 16.6088, + "step": 53110 + }, + { + "epoch": 0.9604896472071982, + "grad_norm": 45.21875, + "learning_rate": 9.849923747024326e-06, + "loss": 16.7283, + "step": 53120 + }, + { + "epoch": 0.9606704622763261, + "grad_norm": 46.03125, + "learning_rate": 9.849895494717667e-06, + "loss": 16.4558, + "step": 53130 + }, + { + "epoch": 0.960851277345454, + "grad_norm": 46.09375, + "learning_rate": 9.849867242411007e-06, + "loss": 16.6179, + "step": 53140 + }, + { + "epoch": 0.9610320924145819, + "grad_norm": 42.71875, + "learning_rate": 9.849838990104348e-06, + "loss": 16.864, + "step": 53150 + }, + { + "epoch": 0.9612129074837097, + "grad_norm": 43.71875, + "learning_rate": 9.849810737797689e-06, + "loss": 16.7878, + "step": 53160 + }, + { + "epoch": 0.9613937225528375, + "grad_norm": 44.59375, + "learning_rate": 9.84978248549103e-06, + "loss": 16.676, + "step": 53170 + }, + { + "epoch": 0.9615745376219654, + "grad_norm": 44.0625, + "learning_rate": 9.849754233184368e-06, + "loss": 16.5465, + "step": 53180 + }, + { + "epoch": 0.9617553526910932, + "grad_norm": 45.0, + "learning_rate": 9.849725980877709e-06, + "loss": 16.7175, + "step": 53190 + }, + { + "epoch": 0.9619361677602212, + "grad_norm": 44.1875, + "learning_rate": 9.849697728571051e-06, + "loss": 16.9195, + "step": 53200 + }, + { + "epoch": 0.962116982829349, + "grad_norm": 46.0625, + "learning_rate": 9.84966947626439e-06, + "loss": 17.0238, + "step": 53210 + }, + { + "epoch": 0.9622977978984769, + "grad_norm": 44.78125, + "learning_rate": 9.84964122395773e-06, + "loss": 16.6907, + "step": 53220 + }, + { + "epoch": 0.9624786129676047, + "grad_norm": 48.65625, + "learning_rate": 9.849612971651071e-06, + "loss": 16.7542, + "step": 53230 + }, + { + "epoch": 0.9626594280367325, + "grad_norm": 45.90625, + "learning_rate": 9.849584719344412e-06, + "loss": 16.4365, + "step": 53240 + }, + { + "epoch": 0.9628402431058605, + "grad_norm": 43.65625, + "learning_rate": 9.849556467037753e-06, + "loss": 16.5799, + "step": 53250 + }, + { + "epoch": 0.9630210581749883, + "grad_norm": 42.71875, + "learning_rate": 9.849528214731091e-06, + "loss": 17.1928, + "step": 53260 + }, + { + "epoch": 0.9632018732441162, + "grad_norm": 42.28125, + "learning_rate": 9.849499962424432e-06, + "loss": 16.5996, + "step": 53270 + }, + { + "epoch": 0.963382688313244, + "grad_norm": 44.875, + "learning_rate": 9.849471710117774e-06, + "loss": 16.595, + "step": 53280 + }, + { + "epoch": 0.9635635033823718, + "grad_norm": 42.9375, + "learning_rate": 9.849443457811115e-06, + "loss": 16.4589, + "step": 53290 + }, + { + "epoch": 0.9637443184514998, + "grad_norm": 45.0, + "learning_rate": 9.849415205504454e-06, + "loss": 16.8769, + "step": 53300 + }, + { + "epoch": 0.9639251335206276, + "grad_norm": 44.34375, + "learning_rate": 9.849386953197795e-06, + "loss": 16.4602, + "step": 53310 + }, + { + "epoch": 0.9641059485897555, + "grad_norm": 39.125, + "learning_rate": 9.849358700891135e-06, + "loss": 17.0599, + "step": 53320 + }, + { + "epoch": 0.9642867636588833, + "grad_norm": 46.09375, + "learning_rate": 9.849330448584476e-06, + "loss": 16.9556, + "step": 53330 + }, + { + "epoch": 0.9644675787280111, + "grad_norm": 45.875, + "learning_rate": 9.849302196277816e-06, + "loss": 16.5521, + "step": 53340 + }, + { + "epoch": 0.9646483937971391, + "grad_norm": 46.15625, + "learning_rate": 9.849273943971155e-06, + "loss": 16.6604, + "step": 53350 + }, + { + "epoch": 0.9648292088662669, + "grad_norm": 43.8125, + "learning_rate": 9.849245691664496e-06, + "loss": 16.7637, + "step": 53360 + }, + { + "epoch": 0.9650100239353948, + "grad_norm": 42.5, + "learning_rate": 9.849217439357838e-06, + "loss": 16.5801, + "step": 53370 + }, + { + "epoch": 0.9651908390045226, + "grad_norm": 45.625, + "learning_rate": 9.849189187051177e-06, + "loss": 16.5078, + "step": 53380 + }, + { + "epoch": 0.9653716540736504, + "grad_norm": 46.28125, + "learning_rate": 9.849160934744518e-06, + "loss": 16.5973, + "step": 53390 + }, + { + "epoch": 0.9655524691427784, + "grad_norm": 47.65625, + "learning_rate": 9.849132682437858e-06, + "loss": 17.1258, + "step": 53400 + }, + { + "epoch": 0.9657332842119062, + "grad_norm": 44.46875, + "learning_rate": 9.849104430131199e-06, + "loss": 16.6324, + "step": 53410 + }, + { + "epoch": 0.9659140992810341, + "grad_norm": 42.90625, + "learning_rate": 9.84907617782454e-06, + "loss": 16.9065, + "step": 53420 + }, + { + "epoch": 0.9660949143501619, + "grad_norm": 44.46875, + "learning_rate": 9.849047925517879e-06, + "loss": 16.5174, + "step": 53430 + }, + { + "epoch": 0.9662757294192899, + "grad_norm": 45.03125, + "learning_rate": 9.84901967321122e-06, + "loss": 16.9315, + "step": 53440 + }, + { + "epoch": 0.9664565444884177, + "grad_norm": 49.90625, + "learning_rate": 9.84899142090456e-06, + "loss": 16.6896, + "step": 53450 + }, + { + "epoch": 0.9666373595575455, + "grad_norm": 43.5625, + "learning_rate": 9.848963168597902e-06, + "loss": 16.5972, + "step": 53460 + }, + { + "epoch": 0.9668181746266734, + "grad_norm": 45.21875, + "learning_rate": 9.848934916291241e-06, + "loss": 16.7322, + "step": 53470 + }, + { + "epoch": 0.9669989896958012, + "grad_norm": 45.25, + "learning_rate": 9.848906663984582e-06, + "loss": 16.9706, + "step": 53480 + }, + { + "epoch": 0.9671798047649292, + "grad_norm": 46.5625, + "learning_rate": 9.848878411677922e-06, + "loss": 16.8125, + "step": 53490 + }, + { + "epoch": 0.967360619834057, + "grad_norm": 45.75, + "learning_rate": 9.848850159371263e-06, + "loss": 16.3223, + "step": 53500 + }, + { + "epoch": 0.9675414349031848, + "grad_norm": 40.0, + "learning_rate": 9.848821907064604e-06, + "loss": 16.235, + "step": 53510 + }, + { + "epoch": 0.9677222499723127, + "grad_norm": 43.34375, + "learning_rate": 9.848793654757943e-06, + "loss": 16.5035, + "step": 53520 + }, + { + "epoch": 0.9679030650414405, + "grad_norm": 46.15625, + "learning_rate": 9.848765402451283e-06, + "loss": 16.6695, + "step": 53530 + }, + { + "epoch": 0.9680838801105685, + "grad_norm": 45.46875, + "learning_rate": 9.848737150144624e-06, + "loss": 16.551, + "step": 53540 + }, + { + "epoch": 0.9682646951796963, + "grad_norm": 47.21875, + "learning_rate": 9.848708897837964e-06, + "loss": 16.9247, + "step": 53550 + }, + { + "epoch": 0.9684455102488241, + "grad_norm": 44.34375, + "learning_rate": 9.848680645531305e-06, + "loss": 16.7716, + "step": 53560 + }, + { + "epoch": 0.968626325317952, + "grad_norm": 42.125, + "learning_rate": 9.848652393224646e-06, + "loss": 17.0277, + "step": 53570 + }, + { + "epoch": 0.9688071403870798, + "grad_norm": 42.84375, + "learning_rate": 9.848624140917986e-06, + "loss": 16.7511, + "step": 53580 + }, + { + "epoch": 0.9689879554562078, + "grad_norm": 46.75, + "learning_rate": 9.848595888611327e-06, + "loss": 16.8811, + "step": 53590 + }, + { + "epoch": 0.9691687705253356, + "grad_norm": 42.5625, + "learning_rate": 9.848567636304668e-06, + "loss": 16.6817, + "step": 53600 + }, + { + "epoch": 0.9693495855944635, + "grad_norm": 46.46875, + "learning_rate": 9.848539383998006e-06, + "loss": 16.6221, + "step": 53610 + }, + { + "epoch": 0.9695304006635913, + "grad_norm": 43.4375, + "learning_rate": 9.848511131691347e-06, + "loss": 17.0512, + "step": 53620 + }, + { + "epoch": 0.9697112157327191, + "grad_norm": 40.8125, + "learning_rate": 9.84848287938469e-06, + "loss": 16.2973, + "step": 53630 + }, + { + "epoch": 0.969892030801847, + "grad_norm": 44.71875, + "learning_rate": 9.848454627078028e-06, + "loss": 16.8274, + "step": 53640 + }, + { + "epoch": 0.9700728458709749, + "grad_norm": 46.03125, + "learning_rate": 9.848426374771369e-06, + "loss": 16.3493, + "step": 53650 + }, + { + "epoch": 0.9702536609401028, + "grad_norm": 45.1875, + "learning_rate": 9.84839812246471e-06, + "loss": 16.2608, + "step": 53660 + }, + { + "epoch": 0.9704344760092306, + "grad_norm": 41.5625, + "learning_rate": 9.84836987015805e-06, + "loss": 16.3783, + "step": 53670 + }, + { + "epoch": 0.9706152910783584, + "grad_norm": 44.75, + "learning_rate": 9.84834161785139e-06, + "loss": 16.6116, + "step": 53680 + }, + { + "epoch": 0.9707961061474863, + "grad_norm": 43.5, + "learning_rate": 9.84831336554473e-06, + "loss": 16.6586, + "step": 53690 + }, + { + "epoch": 0.9709769212166142, + "grad_norm": 44.71875, + "learning_rate": 9.84828511323807e-06, + "loss": 16.7844, + "step": 53700 + }, + { + "epoch": 0.9711577362857421, + "grad_norm": 45.34375, + "learning_rate": 9.848256860931411e-06, + "loss": 16.8105, + "step": 53710 + }, + { + "epoch": 0.9713385513548699, + "grad_norm": 45.25, + "learning_rate": 9.848228608624753e-06, + "loss": 16.7695, + "step": 53720 + }, + { + "epoch": 0.9715193664239977, + "grad_norm": 44.59375, + "learning_rate": 9.848200356318092e-06, + "loss": 16.7423, + "step": 53730 + }, + { + "epoch": 0.9717001814931256, + "grad_norm": 48.625, + "learning_rate": 9.848172104011433e-06, + "loss": 16.6176, + "step": 53740 + }, + { + "epoch": 0.9718809965622535, + "grad_norm": 47.3125, + "learning_rate": 9.848143851704773e-06, + "loss": 16.8514, + "step": 53750 + }, + { + "epoch": 0.9720618116313814, + "grad_norm": 43.8125, + "learning_rate": 9.848115599398114e-06, + "loss": 16.7589, + "step": 53760 + }, + { + "epoch": 0.9722426267005092, + "grad_norm": 47.03125, + "learning_rate": 9.848087347091455e-06, + "loss": 16.797, + "step": 53770 + }, + { + "epoch": 0.9724234417696371, + "grad_norm": 45.34375, + "learning_rate": 9.848059094784794e-06, + "loss": 16.6639, + "step": 53780 + }, + { + "epoch": 0.972604256838765, + "grad_norm": 41.8125, + "learning_rate": 9.848030842478134e-06, + "loss": 16.494, + "step": 53790 + }, + { + "epoch": 0.9727850719078928, + "grad_norm": 44.75, + "learning_rate": 9.848002590171475e-06, + "loss": 16.8343, + "step": 53800 + }, + { + "epoch": 0.9729658869770207, + "grad_norm": 43.09375, + "learning_rate": 9.847974337864816e-06, + "loss": 16.5076, + "step": 53810 + }, + { + "epoch": 0.9731467020461485, + "grad_norm": 45.3125, + "learning_rate": 9.847946085558156e-06, + "loss": 16.5628, + "step": 53820 + }, + { + "epoch": 0.9733275171152764, + "grad_norm": 42.625, + "learning_rate": 9.847917833251497e-06, + "loss": 16.3621, + "step": 53830 + }, + { + "epoch": 0.9735083321844042, + "grad_norm": 44.09375, + "learning_rate": 9.847889580944837e-06, + "loss": 16.68, + "step": 53840 + }, + { + "epoch": 0.9736891472535321, + "grad_norm": 42.75, + "learning_rate": 9.847861328638178e-06, + "loss": 16.4268, + "step": 53850 + }, + { + "epoch": 0.97386996232266, + "grad_norm": 46.90625, + "learning_rate": 9.847833076331517e-06, + "loss": 16.3298, + "step": 53860 + }, + { + "epoch": 0.9740507773917878, + "grad_norm": 46.625, + "learning_rate": 9.847804824024858e-06, + "loss": 16.6696, + "step": 53870 + }, + { + "epoch": 0.9742315924609157, + "grad_norm": 46.3125, + "learning_rate": 9.847776571718198e-06, + "loss": 16.8559, + "step": 53880 + }, + { + "epoch": 0.9744124075300435, + "grad_norm": 40.90625, + "learning_rate": 9.847748319411539e-06, + "loss": 16.8465, + "step": 53890 + }, + { + "epoch": 0.9745932225991714, + "grad_norm": 45.03125, + "learning_rate": 9.84772006710488e-06, + "loss": 16.295, + "step": 53900 + }, + { + "epoch": 0.9747740376682993, + "grad_norm": 44.125, + "learning_rate": 9.84769181479822e-06, + "loss": 16.9551, + "step": 53910 + }, + { + "epoch": 0.9749548527374271, + "grad_norm": 44.4375, + "learning_rate": 9.84766356249156e-06, + "loss": 16.4401, + "step": 53920 + }, + { + "epoch": 0.975135667806555, + "grad_norm": 43.96875, + "learning_rate": 9.847635310184901e-06, + "loss": 16.5902, + "step": 53930 + }, + { + "epoch": 0.9753164828756828, + "grad_norm": 45.71875, + "learning_rate": 9.847607057878242e-06, + "loss": 16.6108, + "step": 53940 + }, + { + "epoch": 0.9754972979448108, + "grad_norm": 43.875, + "learning_rate": 9.84757880557158e-06, + "loss": 16.9275, + "step": 53950 + }, + { + "epoch": 0.9756781130139386, + "grad_norm": 45.0, + "learning_rate": 9.847550553264921e-06, + "loss": 16.6905, + "step": 53960 + }, + { + "epoch": 0.9758589280830664, + "grad_norm": 43.75, + "learning_rate": 9.847522300958262e-06, + "loss": 16.5468, + "step": 53970 + }, + { + "epoch": 0.9760397431521943, + "grad_norm": 44.0625, + "learning_rate": 9.847494048651603e-06, + "loss": 16.9278, + "step": 53980 + }, + { + "epoch": 0.9762205582213221, + "grad_norm": 43.8125, + "learning_rate": 9.847465796344943e-06, + "loss": 16.379, + "step": 53990 + }, + { + "epoch": 0.9764013732904501, + "grad_norm": 43.40625, + "learning_rate": 9.847437544038284e-06, + "loss": 16.4727, + "step": 54000 + }, + { + "epoch": 0.9765821883595779, + "grad_norm": 45.3125, + "learning_rate": 9.847409291731625e-06, + "loss": 16.854, + "step": 54010 + }, + { + "epoch": 0.9767630034287057, + "grad_norm": 43.46875, + "learning_rate": 9.847381039424965e-06, + "loss": 16.6393, + "step": 54020 + }, + { + "epoch": 0.9769438184978336, + "grad_norm": 45.65625, + "learning_rate": 9.847352787118306e-06, + "loss": 16.786, + "step": 54030 + }, + { + "epoch": 0.9771246335669614, + "grad_norm": 43.09375, + "learning_rate": 9.847324534811645e-06, + "loss": 16.5585, + "step": 54040 + }, + { + "epoch": 0.9773054486360894, + "grad_norm": 42.46875, + "learning_rate": 9.847296282504985e-06, + "loss": 16.6101, + "step": 54050 + }, + { + "epoch": 0.9774862637052172, + "grad_norm": 45.53125, + "learning_rate": 9.847268030198326e-06, + "loss": 16.7073, + "step": 54060 + }, + { + "epoch": 0.977667078774345, + "grad_norm": 44.78125, + "learning_rate": 9.847239777891667e-06, + "loss": 16.6391, + "step": 54070 + }, + { + "epoch": 0.9778478938434729, + "grad_norm": 42.09375, + "learning_rate": 9.847211525585007e-06, + "loss": 16.8252, + "step": 54080 + }, + { + "epoch": 0.9780287089126007, + "grad_norm": 44.375, + "learning_rate": 9.847183273278348e-06, + "loss": 16.2046, + "step": 54090 + }, + { + "epoch": 0.9782095239817287, + "grad_norm": 45.4375, + "learning_rate": 9.847155020971688e-06, + "loss": 16.8368, + "step": 54100 + }, + { + "epoch": 0.9783903390508565, + "grad_norm": 41.84375, + "learning_rate": 9.847126768665029e-06, + "loss": 16.9655, + "step": 54110 + }, + { + "epoch": 0.9785711541199844, + "grad_norm": 42.34375, + "learning_rate": 9.847098516358368e-06, + "loss": 16.7057, + "step": 54120 + }, + { + "epoch": 0.9787519691891122, + "grad_norm": 47.28125, + "learning_rate": 9.847070264051709e-06, + "loss": 16.7744, + "step": 54130 + }, + { + "epoch": 0.97893278425824, + "grad_norm": 46.1875, + "learning_rate": 9.84704201174505e-06, + "loss": 16.8817, + "step": 54140 + }, + { + "epoch": 0.979113599327368, + "grad_norm": 48.625, + "learning_rate": 9.84701375943839e-06, + "loss": 16.5609, + "step": 54150 + }, + { + "epoch": 0.9792944143964958, + "grad_norm": 45.09375, + "learning_rate": 9.84698550713173e-06, + "loss": 16.8131, + "step": 54160 + }, + { + "epoch": 0.9794752294656237, + "grad_norm": 42.09375, + "learning_rate": 9.846957254825071e-06, + "loss": 16.4902, + "step": 54170 + }, + { + "epoch": 0.9796560445347515, + "grad_norm": 43.0625, + "learning_rate": 9.846929002518412e-06, + "loss": 16.5643, + "step": 54180 + }, + { + "epoch": 0.9798368596038793, + "grad_norm": 46.84375, + "learning_rate": 9.846900750211752e-06, + "loss": 16.5155, + "step": 54190 + }, + { + "epoch": 0.9800176746730073, + "grad_norm": 45.34375, + "learning_rate": 9.846872497905093e-06, + "loss": 16.5785, + "step": 54200 + }, + { + "epoch": 0.9801984897421351, + "grad_norm": 46.4375, + "learning_rate": 9.846844245598432e-06, + "loss": 16.6293, + "step": 54210 + }, + { + "epoch": 0.980379304811263, + "grad_norm": 42.21875, + "learning_rate": 9.846815993291773e-06, + "loss": 16.4009, + "step": 54220 + }, + { + "epoch": 0.9805601198803908, + "grad_norm": 43.21875, + "learning_rate": 9.846787740985113e-06, + "loss": 16.6647, + "step": 54230 + }, + { + "epoch": 0.9807409349495186, + "grad_norm": 45.5, + "learning_rate": 9.846759488678454e-06, + "loss": 16.7975, + "step": 54240 + }, + { + "epoch": 0.9809217500186466, + "grad_norm": 43.875, + "learning_rate": 9.846731236371794e-06, + "loss": 16.7165, + "step": 54250 + }, + { + "epoch": 0.9811025650877744, + "grad_norm": 45.5625, + "learning_rate": 9.846702984065135e-06, + "loss": 16.5906, + "step": 54260 + }, + { + "epoch": 0.9812833801569023, + "grad_norm": 48.40625, + "learning_rate": 9.846674731758476e-06, + "loss": 16.5228, + "step": 54270 + }, + { + "epoch": 0.9814641952260301, + "grad_norm": 45.40625, + "learning_rate": 9.846646479451816e-06, + "loss": 16.3795, + "step": 54280 + }, + { + "epoch": 0.981645010295158, + "grad_norm": 46.78125, + "learning_rate": 9.846618227145155e-06, + "loss": 17.0372, + "step": 54290 + }, + { + "epoch": 0.9818258253642859, + "grad_norm": 48.375, + "learning_rate": 9.846589974838496e-06, + "loss": 16.7747, + "step": 54300 + }, + { + "epoch": 0.9820066404334137, + "grad_norm": 43.125, + "learning_rate": 9.846561722531836e-06, + "loss": 16.8236, + "step": 54310 + }, + { + "epoch": 0.9821874555025416, + "grad_norm": 46.25, + "learning_rate": 9.846533470225177e-06, + "loss": 16.6047, + "step": 54320 + }, + { + "epoch": 0.9823682705716694, + "grad_norm": 47.9375, + "learning_rate": 9.846505217918518e-06, + "loss": 16.911, + "step": 54330 + }, + { + "epoch": 0.9825490856407973, + "grad_norm": 45.9375, + "learning_rate": 9.846476965611858e-06, + "loss": 16.969, + "step": 54340 + }, + { + "epoch": 0.9827299007099252, + "grad_norm": 41.71875, + "learning_rate": 9.846448713305199e-06, + "loss": 16.8471, + "step": 54350 + }, + { + "epoch": 0.982910715779053, + "grad_norm": 45.5, + "learning_rate": 9.84642046099854e-06, + "loss": 16.3527, + "step": 54360 + }, + { + "epoch": 0.9830915308481809, + "grad_norm": 47.65625, + "learning_rate": 9.84639220869188e-06, + "loss": 16.3417, + "step": 54370 + }, + { + "epoch": 0.9832723459173087, + "grad_norm": 43.6875, + "learning_rate": 9.846363956385219e-06, + "loss": 16.4029, + "step": 54380 + }, + { + "epoch": 0.9834531609864366, + "grad_norm": 44.09375, + "learning_rate": 9.84633570407856e-06, + "loss": 16.9436, + "step": 54390 + }, + { + "epoch": 0.9836339760555645, + "grad_norm": 42.4375, + "learning_rate": 9.8463074517719e-06, + "loss": 16.6759, + "step": 54400 + }, + { + "epoch": 0.9838147911246923, + "grad_norm": 45.375, + "learning_rate": 9.846279199465241e-06, + "loss": 16.7103, + "step": 54410 + }, + { + "epoch": 0.9839956061938202, + "grad_norm": 42.5625, + "learning_rate": 9.846250947158582e-06, + "loss": 16.3442, + "step": 54420 + }, + { + "epoch": 0.984176421262948, + "grad_norm": 46.0625, + "learning_rate": 9.846222694851922e-06, + "loss": 16.9653, + "step": 54430 + }, + { + "epoch": 0.984357236332076, + "grad_norm": 47.03125, + "learning_rate": 9.846194442545263e-06, + "loss": 16.7699, + "step": 54440 + }, + { + "epoch": 0.9845380514012038, + "grad_norm": 46.34375, + "learning_rate": 9.846166190238603e-06, + "loss": 16.7369, + "step": 54450 + }, + { + "epoch": 0.9847188664703317, + "grad_norm": 45.9375, + "learning_rate": 9.846137937931944e-06, + "loss": 17.2226, + "step": 54460 + }, + { + "epoch": 0.9848996815394595, + "grad_norm": 45.03125, + "learning_rate": 9.846109685625283e-06, + "loss": 16.9318, + "step": 54470 + }, + { + "epoch": 0.9850804966085873, + "grad_norm": 45.15625, + "learning_rate": 9.846081433318624e-06, + "loss": 16.7337, + "step": 54480 + }, + { + "epoch": 0.9852613116777152, + "grad_norm": 45.6875, + "learning_rate": 9.846053181011964e-06, + "loss": 17.0956, + "step": 54490 + }, + { + "epoch": 0.9854421267468431, + "grad_norm": 42.84375, + "learning_rate": 9.846024928705305e-06, + "loss": 16.4707, + "step": 54500 + }, + { + "epoch": 0.985622941815971, + "grad_norm": 41.4375, + "learning_rate": 9.845996676398646e-06, + "loss": 16.2423, + "step": 54510 + }, + { + "epoch": 0.9858037568850988, + "grad_norm": 44.90625, + "learning_rate": 9.845968424091986e-06, + "loss": 16.9379, + "step": 54520 + }, + { + "epoch": 0.9859845719542266, + "grad_norm": 43.4375, + "learning_rate": 9.845940171785327e-06, + "loss": 16.7772, + "step": 54530 + }, + { + "epoch": 0.9861653870233545, + "grad_norm": 43.125, + "learning_rate": 9.845911919478667e-06, + "loss": 16.7428, + "step": 54540 + }, + { + "epoch": 0.9863462020924824, + "grad_norm": 45.59375, + "learning_rate": 9.845883667172006e-06, + "loss": 16.4671, + "step": 54550 + }, + { + "epoch": 0.9865270171616103, + "grad_norm": 45.28125, + "learning_rate": 9.845855414865347e-06, + "loss": 16.8481, + "step": 54560 + }, + { + "epoch": 0.9867078322307381, + "grad_norm": 40.65625, + "learning_rate": 9.845827162558688e-06, + "loss": 16.5104, + "step": 54570 + }, + { + "epoch": 0.9868886472998659, + "grad_norm": 44.21875, + "learning_rate": 9.845798910252028e-06, + "loss": 16.8001, + "step": 54580 + }, + { + "epoch": 0.9870694623689938, + "grad_norm": 43.78125, + "learning_rate": 9.845770657945369e-06, + "loss": 16.502, + "step": 54590 + }, + { + "epoch": 0.9872502774381217, + "grad_norm": 44.59375, + "learning_rate": 9.84574240563871e-06, + "loss": 16.652, + "step": 54600 + }, + { + "epoch": 0.9874310925072496, + "grad_norm": 45.875, + "learning_rate": 9.84571415333205e-06, + "loss": 16.5849, + "step": 54610 + }, + { + "epoch": 0.9876119075763774, + "grad_norm": 46.46875, + "learning_rate": 9.84568590102539e-06, + "loss": 16.6386, + "step": 54620 + }, + { + "epoch": 0.9877927226455053, + "grad_norm": 48.0, + "learning_rate": 9.845657648718731e-06, + "loss": 16.7037, + "step": 54630 + }, + { + "epoch": 0.9879735377146331, + "grad_norm": 46.625, + "learning_rate": 9.84562939641207e-06, + "loss": 16.9638, + "step": 54640 + }, + { + "epoch": 0.988154352783761, + "grad_norm": 43.34375, + "learning_rate": 9.845601144105411e-06, + "loss": 16.5736, + "step": 54650 + }, + { + "epoch": 0.9883351678528889, + "grad_norm": 45.28125, + "learning_rate": 9.845572891798751e-06, + "loss": 16.9063, + "step": 54660 + }, + { + "epoch": 0.9885159829220167, + "grad_norm": 42.28125, + "learning_rate": 9.845544639492092e-06, + "loss": 16.9911, + "step": 54670 + }, + { + "epoch": 0.9886967979911446, + "grad_norm": 44.78125, + "learning_rate": 9.845516387185433e-06, + "loss": 16.7827, + "step": 54680 + }, + { + "epoch": 0.9888776130602724, + "grad_norm": 46.53125, + "learning_rate": 9.845488134878773e-06, + "loss": 16.8007, + "step": 54690 + }, + { + "epoch": 0.9890584281294003, + "grad_norm": 43.3125, + "learning_rate": 9.845459882572114e-06, + "loss": 16.2794, + "step": 54700 + }, + { + "epoch": 0.9892392431985282, + "grad_norm": 45.59375, + "learning_rate": 9.845431630265455e-06, + "loss": 16.7441, + "step": 54710 + }, + { + "epoch": 0.989420058267656, + "grad_norm": 43.25, + "learning_rate": 9.845403377958794e-06, + "loss": 16.602, + "step": 54720 + }, + { + "epoch": 0.9896008733367839, + "grad_norm": 46.59375, + "learning_rate": 9.845375125652134e-06, + "loss": 16.721, + "step": 54730 + }, + { + "epoch": 0.9897816884059117, + "grad_norm": 46.40625, + "learning_rate": 9.845346873345475e-06, + "loss": 17.2184, + "step": 54740 + }, + { + "epoch": 0.9899625034750396, + "grad_norm": 41.8125, + "learning_rate": 9.845318621038815e-06, + "loss": 16.793, + "step": 54750 + }, + { + "epoch": 0.9901433185441675, + "grad_norm": 43.5625, + "learning_rate": 9.845290368732156e-06, + "loss": 16.7583, + "step": 54760 + }, + { + "epoch": 0.9903241336132953, + "grad_norm": 44.90625, + "learning_rate": 9.845262116425497e-06, + "loss": 16.8049, + "step": 54770 + }, + { + "epoch": 0.9905049486824232, + "grad_norm": 42.28125, + "learning_rate": 9.845233864118837e-06, + "loss": 16.8066, + "step": 54780 + }, + { + "epoch": 0.990685763751551, + "grad_norm": 42.3125, + "learning_rate": 9.845205611812178e-06, + "loss": 16.7789, + "step": 54790 + }, + { + "epoch": 0.990866578820679, + "grad_norm": 46.125, + "learning_rate": 9.845177359505518e-06, + "loss": 16.5531, + "step": 54800 + }, + { + "epoch": 0.9910473938898068, + "grad_norm": 43.96875, + "learning_rate": 9.845149107198857e-06, + "loss": 16.586, + "step": 54810 + }, + { + "epoch": 0.9912282089589346, + "grad_norm": 44.5625, + "learning_rate": 9.845120854892198e-06, + "loss": 16.7009, + "step": 54820 + }, + { + "epoch": 0.9914090240280625, + "grad_norm": 45.9375, + "learning_rate": 9.845092602585539e-06, + "loss": 16.6305, + "step": 54830 + }, + { + "epoch": 0.9915898390971903, + "grad_norm": 43.34375, + "learning_rate": 9.84506435027888e-06, + "loss": 16.8697, + "step": 54840 + }, + { + "epoch": 0.9917706541663183, + "grad_norm": 45.25, + "learning_rate": 9.84503609797222e-06, + "loss": 16.5857, + "step": 54850 + }, + { + "epoch": 0.9919514692354461, + "grad_norm": 45.9375, + "learning_rate": 9.84500784566556e-06, + "loss": 16.4074, + "step": 54860 + }, + { + "epoch": 0.9921322843045739, + "grad_norm": 48.34375, + "learning_rate": 9.844979593358901e-06, + "loss": 16.8042, + "step": 54870 + }, + { + "epoch": 0.9923130993737018, + "grad_norm": 43.4375, + "learning_rate": 9.844951341052242e-06, + "loss": 16.6853, + "step": 54880 + }, + { + "epoch": 0.9924939144428296, + "grad_norm": 46.5625, + "learning_rate": 9.844923088745582e-06, + "loss": 16.6639, + "step": 54890 + }, + { + "epoch": 0.9926747295119576, + "grad_norm": 47.875, + "learning_rate": 9.844894836438921e-06, + "loss": 16.7631, + "step": 54900 + }, + { + "epoch": 0.9928555445810854, + "grad_norm": 43.34375, + "learning_rate": 9.844866584132262e-06, + "loss": 17.0811, + "step": 54910 + }, + { + "epoch": 0.9930363596502132, + "grad_norm": 47.59375, + "learning_rate": 9.844838331825603e-06, + "loss": 17.37, + "step": 54920 + }, + { + "epoch": 0.9932171747193411, + "grad_norm": 46.125, + "learning_rate": 9.844810079518943e-06, + "loss": 16.7417, + "step": 54930 + }, + { + "epoch": 0.9933979897884689, + "grad_norm": 47.75, + "learning_rate": 9.844781827212284e-06, + "loss": 16.3878, + "step": 54940 + }, + { + "epoch": 0.9935788048575969, + "grad_norm": 42.65625, + "learning_rate": 9.844753574905624e-06, + "loss": 16.5961, + "step": 54950 + }, + { + "epoch": 0.9937596199267247, + "grad_norm": 45.4375, + "learning_rate": 9.844725322598965e-06, + "loss": 16.6482, + "step": 54960 + }, + { + "epoch": 0.9939404349958526, + "grad_norm": 45.8125, + "learning_rate": 9.844697070292306e-06, + "loss": 16.5548, + "step": 54970 + }, + { + "epoch": 0.9941212500649804, + "grad_norm": 45.6875, + "learning_rate": 9.844668817985645e-06, + "loss": 16.468, + "step": 54980 + }, + { + "epoch": 0.9943020651341082, + "grad_norm": 43.96875, + "learning_rate": 9.844640565678985e-06, + "loss": 16.5594, + "step": 54990 + }, + { + "epoch": 0.9944828802032362, + "grad_norm": 45.40625, + "learning_rate": 9.844612313372326e-06, + "loss": 16.8028, + "step": 55000 + }, + { + "epoch": 0.9944828802032362, + "eval_loss": 2.082907199859619, + "eval_runtime": 228.736, + "eval_samples_per_second": 3174.223, + "eval_steps_per_second": 49.599, + "step": 55000 + }, + { + "epoch": 0.994663695272364, + "grad_norm": 44.0, + "learning_rate": 9.844584061065666e-06, + "loss": 16.6952, + "step": 55010 + }, + { + "epoch": 0.9948445103414919, + "grad_norm": 44.90625, + "learning_rate": 9.844555808759007e-06, + "loss": 16.4976, + "step": 55020 + }, + { + "epoch": 0.9950253254106197, + "grad_norm": 45.875, + "learning_rate": 9.844527556452346e-06, + "loss": 16.4644, + "step": 55030 + }, + { + "epoch": 0.9952061404797475, + "grad_norm": 44.0, + "learning_rate": 9.844499304145688e-06, + "loss": 16.7778, + "step": 55040 + }, + { + "epoch": 0.9953869555488755, + "grad_norm": 44.3125, + "learning_rate": 9.844471051839029e-06, + "loss": 17.0716, + "step": 55050 + }, + { + "epoch": 0.9955677706180033, + "grad_norm": 53.21875, + "learning_rate": 9.84444279953237e-06, + "loss": 17.1855, + "step": 55060 + }, + { + "epoch": 0.9957485856871312, + "grad_norm": 45.09375, + "learning_rate": 9.844414547225709e-06, + "loss": 16.8956, + "step": 55070 + }, + { + "epoch": 0.995929400756259, + "grad_norm": 45.03125, + "learning_rate": 9.844386294919049e-06, + "loss": 16.755, + "step": 55080 + }, + { + "epoch": 0.9961102158253868, + "grad_norm": 46.8125, + "learning_rate": 9.84435804261239e-06, + "loss": 16.4342, + "step": 55090 + }, + { + "epoch": 0.9962910308945148, + "grad_norm": 43.59375, + "learning_rate": 9.84432979030573e-06, + "loss": 16.5041, + "step": 55100 + }, + { + "epoch": 0.9964718459636426, + "grad_norm": 44.65625, + "learning_rate": 9.844301537999071e-06, + "loss": 16.9493, + "step": 55110 + }, + { + "epoch": 0.9966526610327705, + "grad_norm": 41.65625, + "learning_rate": 9.844273285692412e-06, + "loss": 16.5818, + "step": 55120 + }, + { + "epoch": 0.9968334761018983, + "grad_norm": 44.625, + "learning_rate": 9.844245033385752e-06, + "loss": 16.5041, + "step": 55130 + }, + { + "epoch": 0.9970142911710262, + "grad_norm": 43.3125, + "learning_rate": 9.844216781079093e-06, + "loss": 16.6332, + "step": 55140 + }, + { + "epoch": 0.9971951062401541, + "grad_norm": 43.96875, + "learning_rate": 9.844188528772432e-06, + "loss": 16.9803, + "step": 55150 + }, + { + "epoch": 0.9973759213092819, + "grad_norm": 46.53125, + "learning_rate": 9.844160276465772e-06, + "loss": 16.5326, + "step": 55160 + }, + { + "epoch": 0.9975567363784098, + "grad_norm": 47.46875, + "learning_rate": 9.844132024159113e-06, + "loss": 16.9527, + "step": 55170 + }, + { + "epoch": 0.9977375514475376, + "grad_norm": 49.3125, + "learning_rate": 9.844103771852454e-06, + "loss": 16.533, + "step": 55180 + }, + { + "epoch": 0.9979183665166655, + "grad_norm": 46.78125, + "learning_rate": 9.844075519545794e-06, + "loss": 16.6507, + "step": 55190 + }, + { + "epoch": 0.9980991815857934, + "grad_norm": 45.40625, + "learning_rate": 9.844047267239133e-06, + "loss": 16.4906, + "step": 55200 + }, + { + "epoch": 0.9982799966549212, + "grad_norm": 44.3125, + "learning_rate": 9.844019014932476e-06, + "loss": 16.5156, + "step": 55210 + }, + { + "epoch": 0.9984608117240491, + "grad_norm": 47.5625, + "learning_rate": 9.843990762625816e-06, + "loss": 16.7121, + "step": 55220 + }, + { + "epoch": 0.9986416267931769, + "grad_norm": 44.28125, + "learning_rate": 9.843962510319157e-06, + "loss": 16.9591, + "step": 55230 + }, + { + "epoch": 0.9988224418623048, + "grad_norm": 44.78125, + "learning_rate": 9.843934258012496e-06, + "loss": 16.4461, + "step": 55240 + }, + { + "epoch": 0.9990032569314327, + "grad_norm": 44.40625, + "learning_rate": 9.843906005705836e-06, + "loss": 16.741, + "step": 55250 + }, + { + "epoch": 0.9991840720005605, + "grad_norm": 44.03125, + "learning_rate": 9.843877753399177e-06, + "loss": 16.965, + "step": 55260 + }, + { + "epoch": 0.9993648870696884, + "grad_norm": 41.5, + "learning_rate": 9.843849501092518e-06, + "loss": 16.5569, + "step": 55270 + }, + { + "epoch": 0.9995457021388162, + "grad_norm": 47.46875, + "learning_rate": 9.843821248785858e-06, + "loss": 16.7305, + "step": 55280 + }, + { + "epoch": 0.9997265172079441, + "grad_norm": 42.53125, + "learning_rate": 9.843792996479197e-06, + "loss": 16.5323, + "step": 55290 + }, + { + "epoch": 0.999907332277072, + "grad_norm": 43.65625, + "learning_rate": 9.84376474417254e-06, + "loss": 16.207, + "step": 55300 + } + ], + "logging_steps": 10, + "max_steps": 55305, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.653613618876383e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}