{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 20022, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004994506043352313, "grad_norm": 0.8967432379722595, "learning_rate": 9.999998461261478e-05, "loss": 1.5801, "step": 10 }, { "epoch": 0.0009989012086704626, "grad_norm": 2.9660449028015137, "learning_rate": 9.99999384504686e-05, "loss": 1.3974, "step": 20 }, { "epoch": 0.0014983518130056938, "grad_norm": 1.4422023296356201, "learning_rate": 9.999986151358985e-05, "loss": 1.3449, "step": 30 }, { "epoch": 0.001997802417340925, "grad_norm": 5.9646430015563965, "learning_rate": 9.999975380202592e-05, "loss": 1.264, "step": 40 }, { "epoch": 0.002497253021676156, "grad_norm": 5.202009677886963, "learning_rate": 9.999961531584305e-05, "loss": 1.2576, "step": 50 }, { "epoch": 0.0029967036260113876, "grad_norm": 2.8307554721832275, "learning_rate": 9.999944605512654e-05, "loss": 1.0792, "step": 60 }, { "epoch": 0.0034961542303466185, "grad_norm": 6.909800052642822, "learning_rate": 9.999924601998054e-05, "loss": 0.9607, "step": 70 }, { "epoch": 0.00399560483468185, "grad_norm": 4.00949239730835, "learning_rate": 9.999901521052817e-05, "loss": 1.1402, "step": 80 }, { "epoch": 0.004495055439017081, "grad_norm": 4.892696857452393, "learning_rate": 9.999875362691149e-05, "loss": 1.0316, "step": 90 }, { "epoch": 0.004994506043352312, "grad_norm": 2.0788443088531494, "learning_rate": 9.999846126929151e-05, "loss": 1.1106, "step": 100 }, { "epoch": 0.005493956647687544, "grad_norm": 4.872004508972168, "learning_rate": 9.999813813784817e-05, "loss": 0.8462, "step": 110 }, { "epoch": 0.005993407252022775, "grad_norm": 1.1907895803451538, "learning_rate": 9.999778423278036e-05, "loss": 0.841, "step": 120 }, { "epoch": 0.0064928578563580065, "grad_norm": 2.7548792362213135, "learning_rate": 9.99973995543059e-05, "loss": 1.2706, "step": 130 }, { "epoch": 0.006992308460693237, "grad_norm": 2.08931040763855, "learning_rate": 9.999698410266157e-05, "loss": 1.1731, "step": 140 }, { "epoch": 0.0074917590650284685, "grad_norm": 2.7863011360168457, "learning_rate": 9.999653787810307e-05, "loss": 1.1902, "step": 150 }, { "epoch": 0.0079912096693637, "grad_norm": 5.634822845458984, "learning_rate": 9.999606088090505e-05, "loss": 1.1306, "step": 160 }, { "epoch": 0.00849066027369893, "grad_norm": 7.692355155944824, "learning_rate": 9.999555311136111e-05, "loss": 0.8827, "step": 170 }, { "epoch": 0.008990110878034162, "grad_norm": 2.2336292266845703, "learning_rate": 9.999501456978375e-05, "loss": 1.0124, "step": 180 }, { "epoch": 0.009489561482369393, "grad_norm": 4.546370983123779, "learning_rate": 9.999444525650448e-05, "loss": 0.9965, "step": 190 }, { "epoch": 0.009989012086704625, "grad_norm": 1.5386520624160767, "learning_rate": 9.999384517187369e-05, "loss": 0.9698, "step": 200 }, { "epoch": 0.010488462691039856, "grad_norm": 7.2799391746521, "learning_rate": 9.999321431626072e-05, "loss": 1.0506, "step": 210 }, { "epoch": 0.010987913295375087, "grad_norm": 3.7693991661071777, "learning_rate": 9.999255269005387e-05, "loss": 0.814, "step": 220 }, { "epoch": 0.011487363899710319, "grad_norm": 4.580488204956055, "learning_rate": 9.999186029366036e-05, "loss": 0.7806, "step": 230 }, { "epoch": 0.01198681450404555, "grad_norm": 2.3693222999572754, "learning_rate": 9.999113712750637e-05, "loss": 1.0188, "step": 240 }, { "epoch": 0.012486265108380782, "grad_norm": 3.053426504135132, "learning_rate": 9.999038319203699e-05, "loss": 0.8191, "step": 250 }, { "epoch": 0.012985715712716013, "grad_norm": 3.2131946086883545, "learning_rate": 9.99895984877163e-05, "loss": 1.0856, "step": 260 }, { "epoch": 0.013485166317051244, "grad_norm": 5.172701835632324, "learning_rate": 9.998878301502723e-05, "loss": 0.854, "step": 270 }, { "epoch": 0.013984616921386474, "grad_norm": 4.785862922668457, "learning_rate": 9.998793677447172e-05, "loss": 0.9823, "step": 280 }, { "epoch": 0.014484067525721706, "grad_norm": 2.307417869567871, "learning_rate": 9.998705976657064e-05, "loss": 1.0952, "step": 290 }, { "epoch": 0.014983518130056937, "grad_norm": 3.640514373779297, "learning_rate": 9.998615199186378e-05, "loss": 0.9484, "step": 300 }, { "epoch": 0.015482968734392168, "grad_norm": 1.485445499420166, "learning_rate": 9.998521345090986e-05, "loss": 0.9246, "step": 310 }, { "epoch": 0.0159824193387274, "grad_norm": 6.912786960601807, "learning_rate": 9.998424414428656e-05, "loss": 0.9799, "step": 320 }, { "epoch": 0.01648186994306263, "grad_norm": 6.9490180015563965, "learning_rate": 9.998324407259048e-05, "loss": 0.7185, "step": 330 }, { "epoch": 0.01698132054739786, "grad_norm": 6.867273330688477, "learning_rate": 9.998221323643714e-05, "loss": 0.982, "step": 340 }, { "epoch": 0.017480771151733094, "grad_norm": 4.2101054191589355, "learning_rate": 9.998115163646104e-05, "loss": 1.0997, "step": 350 }, { "epoch": 0.017980221756068324, "grad_norm": 2.994784355163574, "learning_rate": 9.998005927331559e-05, "loss": 0.9564, "step": 360 }, { "epoch": 0.018479672360403557, "grad_norm": 2.8205251693725586, "learning_rate": 9.997893614767312e-05, "loss": 1.0382, "step": 370 }, { "epoch": 0.018979122964738786, "grad_norm": 4.516458511352539, "learning_rate": 9.997778226022492e-05, "loss": 1.1056, "step": 380 }, { "epoch": 0.01947857356907402, "grad_norm": 3.8063511848449707, "learning_rate": 9.997659761168119e-05, "loss": 0.8491, "step": 390 }, { "epoch": 0.01997802417340925, "grad_norm": 2.98667049407959, "learning_rate": 9.997538220277108e-05, "loss": 1.0848, "step": 400 }, { "epoch": 0.020477474777744482, "grad_norm": 4.909623622894287, "learning_rate": 9.997413603424268e-05, "loss": 0.8135, "step": 410 }, { "epoch": 0.020976925382079712, "grad_norm": 2.7305667400360107, "learning_rate": 9.9972859106863e-05, "loss": 1.0388, "step": 420 }, { "epoch": 0.021476375986414945, "grad_norm": 3.258394718170166, "learning_rate": 9.997155142141796e-05, "loss": 0.8844, "step": 430 }, { "epoch": 0.021975826590750175, "grad_norm": 5.074390888214111, "learning_rate": 9.997021297871247e-05, "loss": 1.2031, "step": 440 }, { "epoch": 0.022475277195085405, "grad_norm": 1.4644519090652466, "learning_rate": 9.996884377957029e-05, "loss": 1.0039, "step": 450 }, { "epoch": 0.022974727799420638, "grad_norm": 3.1942813396453857, "learning_rate": 9.996744382483421e-05, "loss": 1.0889, "step": 460 }, { "epoch": 0.023474178403755867, "grad_norm": 8.286642074584961, "learning_rate": 9.996601311536586e-05, "loss": 0.9957, "step": 470 }, { "epoch": 0.0239736290080911, "grad_norm": 1.7618873119354248, "learning_rate": 9.996455165204583e-05, "loss": 0.8698, "step": 480 }, { "epoch": 0.02447307961242633, "grad_norm": 1.5642335414886475, "learning_rate": 9.996305943577366e-05, "loss": 0.9486, "step": 490 }, { "epoch": 0.024972530216761563, "grad_norm": 13.810921669006348, "learning_rate": 9.996153646746781e-05, "loss": 0.9216, "step": 500 }, { "epoch": 0.025471980821096793, "grad_norm": 2.5668368339538574, "learning_rate": 9.995998274806563e-05, "loss": 1.0009, "step": 510 }, { "epoch": 0.025971431425432026, "grad_norm": 1.7821905612945557, "learning_rate": 9.995839827852346e-05, "loss": 0.9474, "step": 520 }, { "epoch": 0.026470882029767256, "grad_norm": 3.3876638412475586, "learning_rate": 9.995678305981652e-05, "loss": 0.9413, "step": 530 }, { "epoch": 0.02697033263410249, "grad_norm": 3.1569855213165283, "learning_rate": 9.995513709293897e-05, "loss": 0.9358, "step": 540 }, { "epoch": 0.02746978323843772, "grad_norm": 5.637537479400635, "learning_rate": 9.99534603789039e-05, "loss": 1.0568, "step": 550 }, { "epoch": 0.027969233842772948, "grad_norm": 2.1420366764068604, "learning_rate": 9.995175291874331e-05, "loss": 0.8176, "step": 560 }, { "epoch": 0.02846868444710818, "grad_norm": 2.7238314151763916, "learning_rate": 9.995001471350811e-05, "loss": 0.9457, "step": 570 }, { "epoch": 0.02896813505144341, "grad_norm": 8.453182220458984, "learning_rate": 9.994824576426822e-05, "loss": 0.9144, "step": 580 }, { "epoch": 0.029467585655778644, "grad_norm": 1.4573590755462646, "learning_rate": 9.994644607211236e-05, "loss": 0.8454, "step": 590 }, { "epoch": 0.029967036260113874, "grad_norm": 1.987531065940857, "learning_rate": 9.994461563814828e-05, "loss": 1.0507, "step": 600 }, { "epoch": 0.030466486864449107, "grad_norm": 2.4331166744232178, "learning_rate": 9.994275446350256e-05, "loss": 0.9642, "step": 610 }, { "epoch": 0.030965937468784337, "grad_norm": 1.337424397468567, "learning_rate": 9.994086254932078e-05, "loss": 1.0701, "step": 620 }, { "epoch": 0.03146538807311957, "grad_norm": 5.322749614715576, "learning_rate": 9.99389398967674e-05, "loss": 1.0151, "step": 630 }, { "epoch": 0.0319648386774548, "grad_norm": 1.6098060607910156, "learning_rate": 9.993698650702578e-05, "loss": 1.0929, "step": 640 }, { "epoch": 0.03246428928179003, "grad_norm": 2.8981151580810547, "learning_rate": 9.993500238129824e-05, "loss": 0.9528, "step": 650 }, { "epoch": 0.03296373988612526, "grad_norm": 2.6054182052612305, "learning_rate": 9.9932987520806e-05, "loss": 0.799, "step": 660 }, { "epoch": 0.033463190490460495, "grad_norm": 4.730319976806641, "learning_rate": 9.993094192678918e-05, "loss": 1.2382, "step": 670 }, { "epoch": 0.03396264109479572, "grad_norm": 2.3788576126098633, "learning_rate": 9.992886560050686e-05, "loss": 0.7042, "step": 680 }, { "epoch": 0.034462091699130955, "grad_norm": 9.173518180847168, "learning_rate": 9.9926758543237e-05, "loss": 0.9632, "step": 690 }, { "epoch": 0.03496154230346619, "grad_norm": 3.4788527488708496, "learning_rate": 9.992462075627646e-05, "loss": 0.7926, "step": 700 }, { "epoch": 0.03546099290780142, "grad_norm": 2.7496581077575684, "learning_rate": 9.992245224094109e-05, "loss": 1.0237, "step": 710 }, { "epoch": 0.03596044351213665, "grad_norm": 2.733050584793091, "learning_rate": 9.992025299856556e-05, "loss": 0.772, "step": 720 }, { "epoch": 0.03645989411647188, "grad_norm": 0.8263514041900635, "learning_rate": 9.99180230305035e-05, "loss": 0.9264, "step": 730 }, { "epoch": 0.036959344720807114, "grad_norm": 3.485135555267334, "learning_rate": 9.991576233812745e-05, "loss": 1.057, "step": 740 }, { "epoch": 0.03745879532514235, "grad_norm": 0.6603259444236755, "learning_rate": 9.991347092282885e-05, "loss": 0.6896, "step": 750 }, { "epoch": 0.03795824592947757, "grad_norm": 4.332391262054443, "learning_rate": 9.991114878601806e-05, "loss": 0.8355, "step": 760 }, { "epoch": 0.038457696533812806, "grad_norm": 4.493191242218018, "learning_rate": 9.990879592912436e-05, "loss": 0.8238, "step": 770 }, { "epoch": 0.03895714713814804, "grad_norm": 2.2180371284484863, "learning_rate": 9.99064123535959e-05, "loss": 0.7578, "step": 780 }, { "epoch": 0.039456597742483265, "grad_norm": 1.2803512811660767, "learning_rate": 9.990399806089976e-05, "loss": 0.7661, "step": 790 }, { "epoch": 0.0399560483468185, "grad_norm": 1.5827938318252563, "learning_rate": 9.990155305252194e-05, "loss": 0.8578, "step": 800 }, { "epoch": 0.04045549895115373, "grad_norm": 3.514148473739624, "learning_rate": 9.989907732996733e-05, "loss": 0.7647, "step": 810 }, { "epoch": 0.040954949555488965, "grad_norm": 4.280004024505615, "learning_rate": 9.989657089475972e-05, "loss": 0.9934, "step": 820 }, { "epoch": 0.04145440015982419, "grad_norm": 3.2037782669067383, "learning_rate": 9.98940337484418e-05, "loss": 0.9177, "step": 830 }, { "epoch": 0.041953850764159424, "grad_norm": 3.360799551010132, "learning_rate": 9.989146589257519e-05, "loss": 0.926, "step": 840 }, { "epoch": 0.04245330136849466, "grad_norm": 2.9952991008758545, "learning_rate": 9.98888673287404e-05, "loss": 0.9762, "step": 850 }, { "epoch": 0.04295275197282989, "grad_norm": 0.840907871723175, "learning_rate": 9.988623805853679e-05, "loss": 0.731, "step": 860 }, { "epoch": 0.04345220257716512, "grad_norm": 2.830888509750366, "learning_rate": 9.988357808358272e-05, "loss": 0.8859, "step": 870 }, { "epoch": 0.04395165318150035, "grad_norm": 2.584847927093506, "learning_rate": 9.988088740551535e-05, "loss": 0.9814, "step": 880 }, { "epoch": 0.04445110378583558, "grad_norm": 4.042998313903809, "learning_rate": 9.98781660259908e-05, "loss": 1.0667, "step": 890 }, { "epoch": 0.04495055439017081, "grad_norm": 3.0407557487487793, "learning_rate": 9.987541394668407e-05, "loss": 0.9815, "step": 900 }, { "epoch": 0.04545000499450604, "grad_norm": 4.644862651824951, "learning_rate": 9.987263116928903e-05, "loss": 1.0614, "step": 910 }, { "epoch": 0.045949455598841275, "grad_norm": 4.128206253051758, "learning_rate": 9.98698176955185e-05, "loss": 0.853, "step": 920 }, { "epoch": 0.04644890620317651, "grad_norm": 1.7882283926010132, "learning_rate": 9.986697352710413e-05, "loss": 0.7559, "step": 930 }, { "epoch": 0.046948356807511735, "grad_norm": 2.259190320968628, "learning_rate": 9.98640986657965e-05, "loss": 1.1111, "step": 940 }, { "epoch": 0.04744780741184697, "grad_norm": 2.4604718685150146, "learning_rate": 9.986119311336509e-05, "loss": 0.9216, "step": 950 }, { "epoch": 0.0479472580161822, "grad_norm": 1.7957441806793213, "learning_rate": 9.985825687159823e-05, "loss": 0.8028, "step": 960 }, { "epoch": 0.048446708620517434, "grad_norm": 3.120357036590576, "learning_rate": 9.985528994230318e-05, "loss": 0.8791, "step": 970 }, { "epoch": 0.04894615922485266, "grad_norm": 4.274290084838867, "learning_rate": 9.985229232730607e-05, "loss": 0.8734, "step": 980 }, { "epoch": 0.04944560982918789, "grad_norm": 1.9140256643295288, "learning_rate": 9.984926402845192e-05, "loss": 1.0805, "step": 990 }, { "epoch": 0.04994506043352313, "grad_norm": 2.572977066040039, "learning_rate": 9.984620504760462e-05, "loss": 1.1131, "step": 1000 }, { "epoch": 0.05044451103785835, "grad_norm": 5.084359645843506, "learning_rate": 9.984311538664697e-05, "loss": 0.8575, "step": 1010 }, { "epoch": 0.050943961642193586, "grad_norm": 11.383241653442383, "learning_rate": 9.983999504748065e-05, "loss": 1.1465, "step": 1020 }, { "epoch": 0.05144341224652882, "grad_norm": 1.993849277496338, "learning_rate": 9.98368440320262e-05, "loss": 0.9018, "step": 1030 }, { "epoch": 0.05194286285086405, "grad_norm": 5.786123275756836, "learning_rate": 9.983366234222305e-05, "loss": 1.0299, "step": 1040 }, { "epoch": 0.05244231345519928, "grad_norm": 2.2412679195404053, "learning_rate": 9.983044998002953e-05, "loss": 0.7733, "step": 1050 }, { "epoch": 0.05294176405953451, "grad_norm": 2.106492042541504, "learning_rate": 9.982720694742284e-05, "loss": 0.8868, "step": 1060 }, { "epoch": 0.053441214663869745, "grad_norm": 3.3351902961730957, "learning_rate": 9.982393324639902e-05, "loss": 0.9423, "step": 1070 }, { "epoch": 0.05394066526820498, "grad_norm": 4.159750938415527, "learning_rate": 9.982062887897307e-05, "loss": 0.7502, "step": 1080 }, { "epoch": 0.054440115872540204, "grad_norm": 4.879648685455322, "learning_rate": 9.981729384717876e-05, "loss": 0.9594, "step": 1090 }, { "epoch": 0.05493956647687544, "grad_norm": 8.314805030822754, "learning_rate": 9.981392815306882e-05, "loss": 1.1986, "step": 1100 }, { "epoch": 0.05543901708121067, "grad_norm": 1.7291924953460693, "learning_rate": 9.98105317987148e-05, "loss": 0.9061, "step": 1110 }, { "epoch": 0.055938467685545896, "grad_norm": 2.105881929397583, "learning_rate": 9.980710478620717e-05, "loss": 0.9843, "step": 1120 }, { "epoch": 0.05643791828988113, "grad_norm": 2.115079402923584, "learning_rate": 9.980364711765519e-05, "loss": 1.0894, "step": 1130 }, { "epoch": 0.05693736889421636, "grad_norm": 2.8204243183135986, "learning_rate": 9.980015879518707e-05, "loss": 0.9992, "step": 1140 }, { "epoch": 0.057436819498551596, "grad_norm": 7.11791467666626, "learning_rate": 9.979663982094987e-05, "loss": 0.8877, "step": 1150 }, { "epoch": 0.05793627010288682, "grad_norm": 1.6929785013198853, "learning_rate": 9.979309019710949e-05, "loss": 0.8585, "step": 1160 }, { "epoch": 0.058435720707222055, "grad_norm": 3.3160712718963623, "learning_rate": 9.978950992585069e-05, "loss": 0.8392, "step": 1170 }, { "epoch": 0.05893517131155729, "grad_norm": 3.6795084476470947, "learning_rate": 9.978589900937714e-05, "loss": 0.9824, "step": 1180 }, { "epoch": 0.05943462191589252, "grad_norm": 2.2378451824188232, "learning_rate": 9.978225744991133e-05, "loss": 0.8501, "step": 1190 }, { "epoch": 0.05993407252022775, "grad_norm": 1.5347133874893188, "learning_rate": 9.97785852496946e-05, "loss": 0.9016, "step": 1200 }, { "epoch": 0.06043352312456298, "grad_norm": 2.2190093994140625, "learning_rate": 9.97748824109872e-05, "loss": 0.8335, "step": 1210 }, { "epoch": 0.060932973728898214, "grad_norm": 2.3186557292938232, "learning_rate": 9.977114893606822e-05, "loss": 1.0573, "step": 1220 }, { "epoch": 0.06143242433323344, "grad_norm": 5.181076526641846, "learning_rate": 9.976738482723557e-05, "loss": 1.1189, "step": 1230 }, { "epoch": 0.06193187493756867, "grad_norm": 0.9674704074859619, "learning_rate": 9.976359008680605e-05, "loss": 0.6374, "step": 1240 }, { "epoch": 0.062431325541903906, "grad_norm": 4.01783561706543, "learning_rate": 9.97597647171153e-05, "loss": 0.9386, "step": 1250 }, { "epoch": 0.06293077614623914, "grad_norm": 4.014340400695801, "learning_rate": 9.975590872051783e-05, "loss": 0.8924, "step": 1260 }, { "epoch": 0.06343022675057437, "grad_norm": 2.0934877395629883, "learning_rate": 9.9752022099387e-05, "loss": 0.7906, "step": 1270 }, { "epoch": 0.0639296773549096, "grad_norm": 3.8254194259643555, "learning_rate": 9.974810485611497e-05, "loss": 1.0506, "step": 1280 }, { "epoch": 0.06442912795924483, "grad_norm": 2.7336432933807373, "learning_rate": 9.974415699311282e-05, "loss": 1.0317, "step": 1290 }, { "epoch": 0.06492857856358006, "grad_norm": 1.1078413724899292, "learning_rate": 9.974017851281041e-05, "loss": 1.0106, "step": 1300 }, { "epoch": 0.06542802916791529, "grad_norm": 6.520686626434326, "learning_rate": 9.97361694176565e-05, "loss": 1.0209, "step": 1310 }, { "epoch": 0.06592747977225052, "grad_norm": 4.254242897033691, "learning_rate": 9.973212971011868e-05, "loss": 0.9462, "step": 1320 }, { "epoch": 0.06642693037658576, "grad_norm": 1.754723310470581, "learning_rate": 9.972805939268332e-05, "loss": 0.7216, "step": 1330 }, { "epoch": 0.06692638098092099, "grad_norm": 1.4408202171325684, "learning_rate": 9.972395846785574e-05, "loss": 1.033, "step": 1340 }, { "epoch": 0.06742583158525622, "grad_norm": 1.896499514579773, "learning_rate": 9.971982693816001e-05, "loss": 0.8383, "step": 1350 }, { "epoch": 0.06792528218959144, "grad_norm": 2.5673983097076416, "learning_rate": 9.971566480613907e-05, "loss": 1.0443, "step": 1360 }, { "epoch": 0.06842473279392668, "grad_norm": 2.175894260406494, "learning_rate": 9.971147207435471e-05, "loss": 1.0507, "step": 1370 }, { "epoch": 0.06892418339826191, "grad_norm": 2.4703495502471924, "learning_rate": 9.970724874538753e-05, "loss": 0.8492, "step": 1380 }, { "epoch": 0.06942363400259714, "grad_norm": 3.719816207885742, "learning_rate": 9.970299482183694e-05, "loss": 1.024, "step": 1390 }, { "epoch": 0.06992308460693238, "grad_norm": 2.802920341491699, "learning_rate": 9.969871030632125e-05, "loss": 1.0782, "step": 1400 }, { "epoch": 0.07042253521126761, "grad_norm": 5.848260402679443, "learning_rate": 9.969439520147754e-05, "loss": 0.996, "step": 1410 }, { "epoch": 0.07092198581560284, "grad_norm": 3.2062735557556152, "learning_rate": 9.969004950996175e-05, "loss": 1.0682, "step": 1420 }, { "epoch": 0.07142143641993806, "grad_norm": 1.4582406282424927, "learning_rate": 9.968567323444862e-05, "loss": 0.6954, "step": 1430 }, { "epoch": 0.0719208870242733, "grad_norm": 1.4951964616775513, "learning_rate": 9.968126637763173e-05, "loss": 1.0056, "step": 1440 }, { "epoch": 0.07242033762860853, "grad_norm": 2.992192029953003, "learning_rate": 9.967682894222348e-05, "loss": 0.8429, "step": 1450 }, { "epoch": 0.07291978823294376, "grad_norm": 2.689845085144043, "learning_rate": 9.967236093095509e-05, "loss": 1.0161, "step": 1460 }, { "epoch": 0.073419238837279, "grad_norm": 1.2474361658096313, "learning_rate": 9.96678623465766e-05, "loss": 0.9082, "step": 1470 }, { "epoch": 0.07391868944161423, "grad_norm": 2.83001971244812, "learning_rate": 9.966333319185687e-05, "loss": 1.1928, "step": 1480 }, { "epoch": 0.07441814004594946, "grad_norm": 2.0565037727355957, "learning_rate": 9.96587734695836e-05, "loss": 0.8594, "step": 1490 }, { "epoch": 0.0749175906502847, "grad_norm": 4.0485148429870605, "learning_rate": 9.965418318256323e-05, "loss": 0.8476, "step": 1500 }, { "epoch": 0.07541704125461991, "grad_norm": 1.5409923791885376, "learning_rate": 9.964956233362111e-05, "loss": 0.9508, "step": 1510 }, { "epoch": 0.07591649185895515, "grad_norm": 3.438107490539551, "learning_rate": 9.96449109256013e-05, "loss": 0.8173, "step": 1520 }, { "epoch": 0.07641594246329038, "grad_norm": 1.3228156566619873, "learning_rate": 9.964022896136675e-05, "loss": 0.6898, "step": 1530 }, { "epoch": 0.07691539306762561, "grad_norm": 1.1662561893463135, "learning_rate": 9.96355164437992e-05, "loss": 0.7646, "step": 1540 }, { "epoch": 0.07741484367196085, "grad_norm": 3.2023818492889404, "learning_rate": 9.963077337579916e-05, "loss": 0.923, "step": 1550 }, { "epoch": 0.07791429427629608, "grad_norm": 2.6391987800598145, "learning_rate": 9.962599976028596e-05, "loss": 1.0986, "step": 1560 }, { "epoch": 0.07841374488063131, "grad_norm": 3.005326747894287, "learning_rate": 9.962119560019776e-05, "loss": 0.7723, "step": 1570 }, { "epoch": 0.07891319548496653, "grad_norm": 3.3779022693634033, "learning_rate": 9.961636089849149e-05, "loss": 0.8936, "step": 1580 }, { "epoch": 0.07941264608930176, "grad_norm": 2.644306182861328, "learning_rate": 9.96114956581429e-05, "loss": 0.7907, "step": 1590 }, { "epoch": 0.079912096693637, "grad_norm": 3.233680009841919, "learning_rate": 9.960659988214649e-05, "loss": 1.0045, "step": 1600 }, { "epoch": 0.08041154729797223, "grad_norm": 1.8452726602554321, "learning_rate": 9.96016735735156e-05, "loss": 0.8637, "step": 1610 }, { "epoch": 0.08091099790230746, "grad_norm": 3.7355189323425293, "learning_rate": 9.959671673528238e-05, "loss": 1.1491, "step": 1620 }, { "epoch": 0.0814104485066427, "grad_norm": 3.3520915508270264, "learning_rate": 9.95917293704977e-05, "loss": 0.8111, "step": 1630 }, { "epoch": 0.08190989911097793, "grad_norm": 1.7798441648483276, "learning_rate": 9.95867114822313e-05, "loss": 0.8114, "step": 1640 }, { "epoch": 0.08240934971531315, "grad_norm": 3.374290704727173, "learning_rate": 9.958166307357163e-05, "loss": 0.7545, "step": 1650 }, { "epoch": 0.08290880031964838, "grad_norm": 3.4847121238708496, "learning_rate": 9.957658414762598e-05, "loss": 0.9837, "step": 1660 }, { "epoch": 0.08340825092398362, "grad_norm": 2.798218011856079, "learning_rate": 9.957147470752042e-05, "loss": 0.676, "step": 1670 }, { "epoch": 0.08390770152831885, "grad_norm": 1.433990716934204, "learning_rate": 9.956633475639975e-05, "loss": 1.0011, "step": 1680 }, { "epoch": 0.08440715213265408, "grad_norm": 3.560073137283325, "learning_rate": 9.956116429742761e-05, "loss": 0.9904, "step": 1690 }, { "epoch": 0.08490660273698931, "grad_norm": 2.9942626953125, "learning_rate": 9.95559633337864e-05, "loss": 0.9149, "step": 1700 }, { "epoch": 0.08540605334132455, "grad_norm": 5.4364495277404785, "learning_rate": 9.955073186867728e-05, "loss": 0.893, "step": 1710 }, { "epoch": 0.08590550394565978, "grad_norm": 1.7060109376907349, "learning_rate": 9.954546990532019e-05, "loss": 0.8281, "step": 1720 }, { "epoch": 0.086404954549995, "grad_norm": 3.2188754081726074, "learning_rate": 9.954017744695386e-05, "loss": 0.9606, "step": 1730 }, { "epoch": 0.08690440515433023, "grad_norm": 2.1602442264556885, "learning_rate": 9.953485449683576e-05, "loss": 0.8535, "step": 1740 }, { "epoch": 0.08740385575866547, "grad_norm": 2.050398349761963, "learning_rate": 9.952950105824211e-05, "loss": 0.8366, "step": 1750 }, { "epoch": 0.0879033063630007, "grad_norm": 1.8876922130584717, "learning_rate": 9.952411713446798e-05, "loss": 0.7703, "step": 1760 }, { "epoch": 0.08840275696733593, "grad_norm": 1.2664496898651123, "learning_rate": 9.951870272882713e-05, "loss": 0.7808, "step": 1770 }, { "epoch": 0.08890220757167117, "grad_norm": 1.7820614576339722, "learning_rate": 9.95132578446521e-05, "loss": 1.0168, "step": 1780 }, { "epoch": 0.0894016581760064, "grad_norm": 3.8653557300567627, "learning_rate": 9.95077824852942e-05, "loss": 1.0786, "step": 1790 }, { "epoch": 0.08990110878034162, "grad_norm": 2.9980475902557373, "learning_rate": 9.950227665412349e-05, "loss": 0.8898, "step": 1800 }, { "epoch": 0.09040055938467685, "grad_norm": 1.1599256992340088, "learning_rate": 9.949674035452877e-05, "loss": 0.7905, "step": 1810 }, { "epoch": 0.09090000998901208, "grad_norm": 6.718392848968506, "learning_rate": 9.94911735899176e-05, "loss": 0.8412, "step": 1820 }, { "epoch": 0.09139946059334732, "grad_norm": 2.9782862663269043, "learning_rate": 9.94855763637163e-05, "loss": 0.7786, "step": 1830 }, { "epoch": 0.09189891119768255, "grad_norm": 5.974144458770752, "learning_rate": 9.947994867936997e-05, "loss": 0.7711, "step": 1840 }, { "epoch": 0.09239836180201778, "grad_norm": 5.793637275695801, "learning_rate": 9.947429054034238e-05, "loss": 0.9684, "step": 1850 }, { "epoch": 0.09289781240635302, "grad_norm": 3.8431828022003174, "learning_rate": 9.946860195011614e-05, "loss": 0.9766, "step": 1860 }, { "epoch": 0.09339726301068824, "grad_norm": 3.748358964920044, "learning_rate": 9.94628829121925e-05, "loss": 1.1646, "step": 1870 }, { "epoch": 0.09389671361502347, "grad_norm": 1.4407458305358887, "learning_rate": 9.945713343009153e-05, "loss": 0.8303, "step": 1880 }, { "epoch": 0.0943961642193587, "grad_norm": 1.9862149953842163, "learning_rate": 9.945135350735199e-05, "loss": 0.8215, "step": 1890 }, { "epoch": 0.09489561482369394, "grad_norm": 1.7664053440093994, "learning_rate": 9.944554314753143e-05, "loss": 0.9924, "step": 1900 }, { "epoch": 0.09539506542802917, "grad_norm": 2.36393404006958, "learning_rate": 9.943970235420605e-05, "loss": 0.7561, "step": 1910 }, { "epoch": 0.0958945160323644, "grad_norm": 3.7872016429901123, "learning_rate": 9.943383113097089e-05, "loss": 0.9625, "step": 1920 }, { "epoch": 0.09639396663669964, "grad_norm": 3.44275164604187, "learning_rate": 9.94279294814396e-05, "loss": 0.8123, "step": 1930 }, { "epoch": 0.09689341724103487, "grad_norm": 2.9458820819854736, "learning_rate": 9.942199740924467e-05, "loss": 0.8189, "step": 1940 }, { "epoch": 0.09739286784537009, "grad_norm": 3.4364569187164307, "learning_rate": 9.941603491803724e-05, "loss": 0.9232, "step": 1950 }, { "epoch": 0.09789231844970532, "grad_norm": 2.805171251296997, "learning_rate": 9.94100420114872e-05, "loss": 1.0483, "step": 1960 }, { "epoch": 0.09839176905404055, "grad_norm": 0.2529987394809723, "learning_rate": 9.940401869328314e-05, "loss": 0.7037, "step": 1970 }, { "epoch": 0.09889121965837579, "grad_norm": 3.3802037239074707, "learning_rate": 9.93979649671324e-05, "loss": 0.796, "step": 1980 }, { "epoch": 0.09939067026271102, "grad_norm": 1.818002700805664, "learning_rate": 9.939188083676103e-05, "loss": 0.6942, "step": 1990 }, { "epoch": 0.09989012086704625, "grad_norm": 1.597109317779541, "learning_rate": 9.938576630591377e-05, "loss": 1.0238, "step": 2000 }, { "epoch": 0.10038957147138149, "grad_norm": 1.5362275838851929, "learning_rate": 9.93796213783541e-05, "loss": 0.7831, "step": 2010 }, { "epoch": 0.1008890220757167, "grad_norm": 1.4740239381790161, "learning_rate": 9.937344605786416e-05, "loss": 0.6369, "step": 2020 }, { "epoch": 0.10138847268005194, "grad_norm": 5.3471150398254395, "learning_rate": 9.936724034824487e-05, "loss": 0.9997, "step": 2030 }, { "epoch": 0.10188792328438717, "grad_norm": 3.965418577194214, "learning_rate": 9.93610042533158e-05, "loss": 0.7309, "step": 2040 }, { "epoch": 0.1023873738887224, "grad_norm": 1.7020810842514038, "learning_rate": 9.935473777691526e-05, "loss": 1.1524, "step": 2050 }, { "epoch": 0.10288682449305764, "grad_norm": 1.3779456615447998, "learning_rate": 9.934844092290019e-05, "loss": 0.7777, "step": 2060 }, { "epoch": 0.10338627509739287, "grad_norm": 7.693228244781494, "learning_rate": 9.93421136951463e-05, "loss": 1.2078, "step": 2070 }, { "epoch": 0.1038857257017281, "grad_norm": 2.664365768432617, "learning_rate": 9.9335756097548e-05, "loss": 0.8353, "step": 2080 }, { "epoch": 0.10438517630606332, "grad_norm": 3.8594837188720703, "learning_rate": 9.93293681340183e-05, "loss": 1.0259, "step": 2090 }, { "epoch": 0.10488462691039856, "grad_norm": 1.5903313159942627, "learning_rate": 9.9322949808489e-05, "loss": 0.8916, "step": 2100 }, { "epoch": 0.10538407751473379, "grad_norm": 1.939350962638855, "learning_rate": 9.931650112491057e-05, "loss": 1.0823, "step": 2110 }, { "epoch": 0.10588352811906902, "grad_norm": 2.0447142124176025, "learning_rate": 9.93100220872521e-05, "loss": 1.0213, "step": 2120 }, { "epoch": 0.10638297872340426, "grad_norm": 2.2896556854248047, "learning_rate": 9.930351269950143e-05, "loss": 0.8663, "step": 2130 }, { "epoch": 0.10688242932773949, "grad_norm": 1.772435188293457, "learning_rate": 9.929697296566507e-05, "loss": 0.8649, "step": 2140 }, { "epoch": 0.10738187993207472, "grad_norm": 2.264937162399292, "learning_rate": 9.929040288976816e-05, "loss": 0.948, "step": 2150 }, { "epoch": 0.10788133053640996, "grad_norm": 2.3183889389038086, "learning_rate": 9.92838024758546e-05, "loss": 0.8205, "step": 2160 }, { "epoch": 0.10838078114074517, "grad_norm": 9.26515007019043, "learning_rate": 9.927717172798687e-05, "loss": 0.8452, "step": 2170 }, { "epoch": 0.10888023174508041, "grad_norm": 2.1538867950439453, "learning_rate": 9.92705106502462e-05, "loss": 0.8233, "step": 2180 }, { "epoch": 0.10937968234941564, "grad_norm": 2.3352103233337402, "learning_rate": 9.926381924673241e-05, "loss": 1.1426, "step": 2190 }, { "epoch": 0.10987913295375087, "grad_norm": 0.5940951108932495, "learning_rate": 9.925709752156407e-05, "loss": 0.7086, "step": 2200 }, { "epoch": 0.11037858355808611, "grad_norm": 2.1471216678619385, "learning_rate": 9.925034547887837e-05, "loss": 1.1785, "step": 2210 }, { "epoch": 0.11087803416242134, "grad_norm": 1.97718346118927, "learning_rate": 9.924356312283113e-05, "loss": 0.8249, "step": 2220 }, { "epoch": 0.11137748476675657, "grad_norm": 2.9583914279937744, "learning_rate": 9.923675045759689e-05, "loss": 0.7702, "step": 2230 }, { "epoch": 0.11187693537109179, "grad_norm": 0.724244236946106, "learning_rate": 9.922990748736877e-05, "loss": 0.8585, "step": 2240 }, { "epoch": 0.11237638597542703, "grad_norm": 1.6883105039596558, "learning_rate": 9.922303421635864e-05, "loss": 0.7324, "step": 2250 }, { "epoch": 0.11287583657976226, "grad_norm": 1.215675711631775, "learning_rate": 9.921613064879695e-05, "loss": 0.9073, "step": 2260 }, { "epoch": 0.11337528718409749, "grad_norm": 4.0062031745910645, "learning_rate": 9.920919678893278e-05, "loss": 0.9143, "step": 2270 }, { "epoch": 0.11387473778843273, "grad_norm": 1.8300977945327759, "learning_rate": 9.920223264103395e-05, "loss": 0.9411, "step": 2280 }, { "epoch": 0.11437418839276796, "grad_norm": 1.9074162244796753, "learning_rate": 9.919523820938681e-05, "loss": 0.8758, "step": 2290 }, { "epoch": 0.11487363899710319, "grad_norm": 3.277357339859009, "learning_rate": 9.918821349829641e-05, "loss": 1.0738, "step": 2300 }, { "epoch": 0.11537308960143841, "grad_norm": 1.000112533569336, "learning_rate": 9.918115851208644e-05, "loss": 0.9844, "step": 2310 }, { "epoch": 0.11587254020577364, "grad_norm": 2.460766315460205, "learning_rate": 9.917407325509922e-05, "loss": 0.8265, "step": 2320 }, { "epoch": 0.11637199081010888, "grad_norm": 3.4657835960388184, "learning_rate": 9.916695773169569e-05, "loss": 1.0271, "step": 2330 }, { "epoch": 0.11687144141444411, "grad_norm": 6.398989677429199, "learning_rate": 9.915981194625539e-05, "loss": 1.0857, "step": 2340 }, { "epoch": 0.11737089201877934, "grad_norm": 3.102544069290161, "learning_rate": 9.915263590317654e-05, "loss": 0.9166, "step": 2350 }, { "epoch": 0.11787034262311458, "grad_norm": 1.6086996793746948, "learning_rate": 9.914542960687597e-05, "loss": 0.7819, "step": 2360 }, { "epoch": 0.11836979322744981, "grad_norm": 3.1169967651367188, "learning_rate": 9.913819306178912e-05, "loss": 0.8216, "step": 2370 }, { "epoch": 0.11886924383178504, "grad_norm": 3.2373907566070557, "learning_rate": 9.913092627237004e-05, "loss": 0.9111, "step": 2380 }, { "epoch": 0.11936869443612026, "grad_norm": 1.271636962890625, "learning_rate": 9.91236292430914e-05, "loss": 0.7123, "step": 2390 }, { "epoch": 0.1198681450404555, "grad_norm": 1.2871203422546387, "learning_rate": 9.91163019784445e-05, "loss": 0.9538, "step": 2400 }, { "epoch": 0.12036759564479073, "grad_norm": 3.983081579208374, "learning_rate": 9.910894448293926e-05, "loss": 1.1022, "step": 2410 }, { "epoch": 0.12086704624912596, "grad_norm": 1.4097563028335571, "learning_rate": 9.910155676110412e-05, "loss": 0.9296, "step": 2420 }, { "epoch": 0.1213664968534612, "grad_norm": 2.7831010818481445, "learning_rate": 9.909413881748627e-05, "loss": 0.7646, "step": 2430 }, { "epoch": 0.12186594745779643, "grad_norm": 2.1280717849731445, "learning_rate": 9.908669065665137e-05, "loss": 0.8786, "step": 2440 }, { "epoch": 0.12236539806213166, "grad_norm": 2.318333148956299, "learning_rate": 9.907921228318373e-05, "loss": 0.8555, "step": 2450 }, { "epoch": 0.12286484866646688, "grad_norm": 4.173869609832764, "learning_rate": 9.907170370168626e-05, "loss": 0.9869, "step": 2460 }, { "epoch": 0.12336429927080211, "grad_norm": 2.4868521690368652, "learning_rate": 9.906416491678047e-05, "loss": 0.7005, "step": 2470 }, { "epoch": 0.12386374987513735, "grad_norm": 4.937372207641602, "learning_rate": 9.905659593310643e-05, "loss": 0.999, "step": 2480 }, { "epoch": 0.12436320047947258, "grad_norm": 2.488497018814087, "learning_rate": 9.904899675532282e-05, "loss": 0.9049, "step": 2490 }, { "epoch": 0.12486265108380781, "grad_norm": 1.3979971408843994, "learning_rate": 9.904136738810692e-05, "loss": 0.7388, "step": 2500 }, { "epoch": 0.12536210168814305, "grad_norm": 0.7958292961120605, "learning_rate": 9.903370783615453e-05, "loss": 0.7922, "step": 2510 }, { "epoch": 0.12586155229247828, "grad_norm": 1.8274697065353394, "learning_rate": 9.902601810418011e-05, "loss": 0.8086, "step": 2520 }, { "epoch": 0.1263610028968135, "grad_norm": 3.842331886291504, "learning_rate": 9.901829819691662e-05, "loss": 0.9567, "step": 2530 }, { "epoch": 0.12686045350114875, "grad_norm": 3.233276128768921, "learning_rate": 9.901054811911565e-05, "loss": 0.9538, "step": 2540 }, { "epoch": 0.12735990410548398, "grad_norm": 2.3809590339660645, "learning_rate": 9.900276787554734e-05, "loss": 0.9824, "step": 2550 }, { "epoch": 0.1278593547098192, "grad_norm": 12.891764640808105, "learning_rate": 9.899495747100037e-05, "loss": 0.9333, "step": 2560 }, { "epoch": 0.12835880531415442, "grad_norm": 1.6126281023025513, "learning_rate": 9.898711691028204e-05, "loss": 0.8228, "step": 2570 }, { "epoch": 0.12885825591848965, "grad_norm": 1.4332327842712402, "learning_rate": 9.897924619821815e-05, "loss": 1.1386, "step": 2580 }, { "epoch": 0.12935770652282488, "grad_norm": 1.1457003355026245, "learning_rate": 9.89713453396531e-05, "loss": 1.0789, "step": 2590 }, { "epoch": 0.12985715712716012, "grad_norm": 3.1065938472747803, "learning_rate": 9.896341433944983e-05, "loss": 0.8222, "step": 2600 }, { "epoch": 0.13035660773149535, "grad_norm": 2.5499441623687744, "learning_rate": 9.895545320248985e-05, "loss": 0.8682, "step": 2610 }, { "epoch": 0.13085605833583058, "grad_norm": 2.2303380966186523, "learning_rate": 9.894746193367317e-05, "loss": 0.8319, "step": 2620 }, { "epoch": 0.13135550894016582, "grad_norm": 1.9166508913040161, "learning_rate": 9.893944053791841e-05, "loss": 1.1738, "step": 2630 }, { "epoch": 0.13185495954450105, "grad_norm": 2.200120210647583, "learning_rate": 9.893138902016267e-05, "loss": 0.7885, "step": 2640 }, { "epoch": 0.13235441014883628, "grad_norm": 4.75616455078125, "learning_rate": 9.892330738536167e-05, "loss": 1.014, "step": 2650 }, { "epoch": 0.13285386075317152, "grad_norm": 2.533257484436035, "learning_rate": 9.891519563848959e-05, "loss": 0.9097, "step": 2660 }, { "epoch": 0.13335331135750675, "grad_norm": 1.3194255828857422, "learning_rate": 9.890705378453917e-05, "loss": 1.2093, "step": 2670 }, { "epoch": 0.13385276196184198, "grad_norm": 2.478529453277588, "learning_rate": 9.889888182852169e-05, "loss": 0.9337, "step": 2680 }, { "epoch": 0.13435221256617721, "grad_norm": 1.3562705516815186, "learning_rate": 9.889067977546694e-05, "loss": 0.7337, "step": 2690 }, { "epoch": 0.13485166317051245, "grad_norm": 1.4239592552185059, "learning_rate": 9.888244763042327e-05, "loss": 0.9373, "step": 2700 }, { "epoch": 0.13535111377484768, "grad_norm": 2.2055132389068604, "learning_rate": 9.88741853984575e-05, "loss": 0.8111, "step": 2710 }, { "epoch": 0.1358505643791829, "grad_norm": 1.5610190629959106, "learning_rate": 9.886589308465501e-05, "loss": 1.0326, "step": 2720 }, { "epoch": 0.13635001498351812, "grad_norm": 1.9877070188522339, "learning_rate": 9.88575706941197e-05, "loss": 0.8532, "step": 2730 }, { "epoch": 0.13684946558785335, "grad_norm": 1.881638526916504, "learning_rate": 9.884921823197392e-05, "loss": 0.7869, "step": 2740 }, { "epoch": 0.13734891619218859, "grad_norm": 1.3000985383987427, "learning_rate": 9.88408357033586e-05, "loss": 1.1506, "step": 2750 }, { "epoch": 0.13784836679652382, "grad_norm": 3.4877495765686035, "learning_rate": 9.883242311343314e-05, "loss": 1.005, "step": 2760 }, { "epoch": 0.13834781740085905, "grad_norm": 2.5101332664489746, "learning_rate": 9.882398046737547e-05, "loss": 0.8723, "step": 2770 }, { "epoch": 0.13884726800519429, "grad_norm": 1.4676083326339722, "learning_rate": 9.881550777038197e-05, "loss": 0.9566, "step": 2780 }, { "epoch": 0.13934671860952952, "grad_norm": 4.2586283683776855, "learning_rate": 9.880700502766758e-05, "loss": 0.9785, "step": 2790 }, { "epoch": 0.13984616921386475, "grad_norm": 6.4517717361450195, "learning_rate": 9.879847224446566e-05, "loss": 1.0445, "step": 2800 }, { "epoch": 0.14034561981819998, "grad_norm": 2.9773175716400146, "learning_rate": 9.878990942602813e-05, "loss": 0.816, "step": 2810 }, { "epoch": 0.14084507042253522, "grad_norm": 2.0519254207611084, "learning_rate": 9.878131657762535e-05, "loss": 0.8386, "step": 2820 }, { "epoch": 0.14134452102687045, "grad_norm": 1.1298130750656128, "learning_rate": 9.877269370454618e-05, "loss": 0.659, "step": 2830 }, { "epoch": 0.14184397163120568, "grad_norm": 1.5811046361923218, "learning_rate": 9.876404081209796e-05, "loss": 0.7454, "step": 2840 }, { "epoch": 0.14234342223554092, "grad_norm": 1.5561637878417969, "learning_rate": 9.87553579056065e-05, "loss": 0.733, "step": 2850 }, { "epoch": 0.14284287283987612, "grad_norm": 3.4421191215515137, "learning_rate": 9.87466449904161e-05, "loss": 0.7871, "step": 2860 }, { "epoch": 0.14334232344421136, "grad_norm": 1.2579591274261475, "learning_rate": 9.873790207188952e-05, "loss": 0.8923, "step": 2870 }, { "epoch": 0.1438417740485466, "grad_norm": 0.7126845717430115, "learning_rate": 9.872912915540799e-05, "loss": 0.9842, "step": 2880 }, { "epoch": 0.14434122465288182, "grad_norm": 1.9931137561798096, "learning_rate": 9.872032624637118e-05, "loss": 0.7486, "step": 2890 }, { "epoch": 0.14484067525721706, "grad_norm": 2.537330389022827, "learning_rate": 9.871149335019725e-05, "loss": 0.9579, "step": 2900 }, { "epoch": 0.1453401258615523, "grad_norm": 2.626526355743408, "learning_rate": 9.87026304723228e-05, "loss": 1.0062, "step": 2910 }, { "epoch": 0.14583957646588752, "grad_norm": 1.5687665939331055, "learning_rate": 9.869373761820291e-05, "loss": 0.9602, "step": 2920 }, { "epoch": 0.14633902707022275, "grad_norm": 1.7398674488067627, "learning_rate": 9.868481479331107e-05, "loss": 0.7881, "step": 2930 }, { "epoch": 0.146838477674558, "grad_norm": 1.6017673015594482, "learning_rate": 9.867586200313926e-05, "loss": 0.8862, "step": 2940 }, { "epoch": 0.14733792827889322, "grad_norm": 5.415498733520508, "learning_rate": 9.866687925319786e-05, "loss": 0.8873, "step": 2950 }, { "epoch": 0.14783737888322845, "grad_norm": 1.596312165260315, "learning_rate": 9.865786654901573e-05, "loss": 0.8904, "step": 2960 }, { "epoch": 0.1483368294875637, "grad_norm": 4.682255744934082, "learning_rate": 9.864882389614014e-05, "loss": 0.842, "step": 2970 }, { "epoch": 0.14883628009189892, "grad_norm": 5.508861064910889, "learning_rate": 9.863975130013678e-05, "loss": 1.0358, "step": 2980 }, { "epoch": 0.14933573069623415, "grad_norm": 2.0030109882354736, "learning_rate": 9.863064876658983e-05, "loss": 0.9057, "step": 2990 }, { "epoch": 0.1498351813005694, "grad_norm": 4.921465873718262, "learning_rate": 9.862151630110182e-05, "loss": 1.0228, "step": 3000 }, { "epoch": 0.1503346319049046, "grad_norm": 5.827107906341553, "learning_rate": 9.861235390929378e-05, "loss": 0.8681, "step": 3010 }, { "epoch": 0.15083408250923983, "grad_norm": 1.1233668327331543, "learning_rate": 9.860316159680507e-05, "loss": 0.8535, "step": 3020 }, { "epoch": 0.15133353311357506, "grad_norm": 3.167651414871216, "learning_rate": 9.859393936929357e-05, "loss": 0.8039, "step": 3030 }, { "epoch": 0.1518329837179103, "grad_norm": 1.0371288061141968, "learning_rate": 9.858468723243549e-05, "loss": 0.9443, "step": 3040 }, { "epoch": 0.15233243432224552, "grad_norm": 0.5512734055519104, "learning_rate": 9.857540519192547e-05, "loss": 0.7964, "step": 3050 }, { "epoch": 0.15283188492658076, "grad_norm": 1.2850098609924316, "learning_rate": 9.856609325347659e-05, "loss": 0.8136, "step": 3060 }, { "epoch": 0.153331335530916, "grad_norm": 2.381633996963501, "learning_rate": 9.855675142282028e-05, "loss": 0.8199, "step": 3070 }, { "epoch": 0.15383078613525122, "grad_norm": 2.244203805923462, "learning_rate": 9.85473797057064e-05, "loss": 0.8278, "step": 3080 }, { "epoch": 0.15433023673958646, "grad_norm": 4.833874225616455, "learning_rate": 9.853797810790322e-05, "loss": 1.0593, "step": 3090 }, { "epoch": 0.1548296873439217, "grad_norm": 1.0583378076553345, "learning_rate": 9.852854663519736e-05, "loss": 0.7239, "step": 3100 }, { "epoch": 0.15532913794825692, "grad_norm": 1.1276373863220215, "learning_rate": 9.851908529339383e-05, "loss": 0.9269, "step": 3110 }, { "epoch": 0.15582858855259216, "grad_norm": 1.5166157484054565, "learning_rate": 9.850959408831609e-05, "loss": 1.0135, "step": 3120 }, { "epoch": 0.1563280391569274, "grad_norm": 0.892557680606842, "learning_rate": 9.850007302580588e-05, "loss": 0.9219, "step": 3130 }, { "epoch": 0.15682748976126262, "grad_norm": 2.4699206352233887, "learning_rate": 9.84905221117234e-05, "loss": 0.9638, "step": 3140 }, { "epoch": 0.15732694036559786, "grad_norm": 3.945955753326416, "learning_rate": 9.84809413519472e-05, "loss": 1.0434, "step": 3150 }, { "epoch": 0.15782639096993306, "grad_norm": 3.5126733779907227, "learning_rate": 9.84713307523742e-05, "loss": 0.9455, "step": 3160 }, { "epoch": 0.1583258415742683, "grad_norm": 2.5539886951446533, "learning_rate": 9.846169031891965e-05, "loss": 0.8115, "step": 3170 }, { "epoch": 0.15882529217860353, "grad_norm": 6.17609167098999, "learning_rate": 9.845202005751721e-05, "loss": 1.0244, "step": 3180 }, { "epoch": 0.15932474278293876, "grad_norm": 1.2907859086990356, "learning_rate": 9.844231997411887e-05, "loss": 0.7892, "step": 3190 }, { "epoch": 0.159824193387274, "grad_norm": 2.824439764022827, "learning_rate": 9.843259007469501e-05, "loss": 0.7057, "step": 3200 }, { "epoch": 0.16032364399160923, "grad_norm": 2.161973476409912, "learning_rate": 9.842283036523431e-05, "loss": 0.9797, "step": 3210 }, { "epoch": 0.16082309459594446, "grad_norm": 1.7337701320648193, "learning_rate": 9.841304085174386e-05, "loss": 0.8497, "step": 3220 }, { "epoch": 0.1613225452002797, "grad_norm": 2.365299701690674, "learning_rate": 9.840322154024901e-05, "loss": 1.0819, "step": 3230 }, { "epoch": 0.16182199580461493, "grad_norm": 4.2217888832092285, "learning_rate": 9.839337243679355e-05, "loss": 0.9062, "step": 3240 }, { "epoch": 0.16232144640895016, "grad_norm": 3.642833709716797, "learning_rate": 9.838349354743954e-05, "loss": 0.7498, "step": 3250 }, { "epoch": 0.1628208970132854, "grad_norm": 1.1478465795516968, "learning_rate": 9.837358487826737e-05, "loss": 1.0177, "step": 3260 }, { "epoch": 0.16332034761762063, "grad_norm": 1.1263699531555176, "learning_rate": 9.836364643537583e-05, "loss": 0.848, "step": 3270 }, { "epoch": 0.16381979822195586, "grad_norm": 3.684113025665283, "learning_rate": 9.835367822488195e-05, "loss": 1.0706, "step": 3280 }, { "epoch": 0.1643192488262911, "grad_norm": 1.0138434171676636, "learning_rate": 9.834368025292112e-05, "loss": 0.8569, "step": 3290 }, { "epoch": 0.1648186994306263, "grad_norm": 3.8836417198181152, "learning_rate": 9.833365252564706e-05, "loss": 0.6981, "step": 3300 }, { "epoch": 0.16531815003496153, "grad_norm": 4.29389762878418, "learning_rate": 9.832359504923176e-05, "loss": 0.7828, "step": 3310 }, { "epoch": 0.16581760063929676, "grad_norm": 1.0574113130569458, "learning_rate": 9.83135078298656e-05, "loss": 0.8543, "step": 3320 }, { "epoch": 0.166317051243632, "grad_norm": 3.4949302673339844, "learning_rate": 9.830339087375717e-05, "loss": 0.8726, "step": 3330 }, { "epoch": 0.16681650184796723, "grad_norm": 1.8446637392044067, "learning_rate": 9.829324418713342e-05, "loss": 0.9586, "step": 3340 }, { "epoch": 0.16731595245230246, "grad_norm": 1.1169347763061523, "learning_rate": 9.828306777623961e-05, "loss": 0.9523, "step": 3350 }, { "epoch": 0.1678154030566377, "grad_norm": 1.7979636192321777, "learning_rate": 9.827286164733926e-05, "loss": 0.9379, "step": 3360 }, { "epoch": 0.16831485366097293, "grad_norm": 1.7395380735397339, "learning_rate": 9.826262580671422e-05, "loss": 0.7038, "step": 3370 }, { "epoch": 0.16881430426530816, "grad_norm": 1.3596173524856567, "learning_rate": 9.825236026066456e-05, "loss": 0.7086, "step": 3380 }, { "epoch": 0.1693137548696434, "grad_norm": 2.8585293292999268, "learning_rate": 9.824206501550868e-05, "loss": 0.8265, "step": 3390 }, { "epoch": 0.16981320547397863, "grad_norm": 1.5410815477371216, "learning_rate": 9.823174007758328e-05, "loss": 0.905, "step": 3400 }, { "epoch": 0.17031265607831386, "grad_norm": 3.265131950378418, "learning_rate": 9.822138545324333e-05, "loss": 0.7908, "step": 3410 }, { "epoch": 0.1708121066826491, "grad_norm": 1.7951984405517578, "learning_rate": 9.821100114886201e-05, "loss": 0.7412, "step": 3420 }, { "epoch": 0.17131155728698433, "grad_norm": 5.0211029052734375, "learning_rate": 9.820058717083083e-05, "loss": 0.9029, "step": 3430 }, { "epoch": 0.17181100789131956, "grad_norm": 1.9129225015640259, "learning_rate": 9.819014352555955e-05, "loss": 0.8848, "step": 3440 }, { "epoch": 0.17231045849565477, "grad_norm": 0.9868128895759583, "learning_rate": 9.817967021947619e-05, "loss": 0.7679, "step": 3450 }, { "epoch": 0.17280990909999, "grad_norm": 1.28288996219635, "learning_rate": 9.816916725902698e-05, "loss": 0.9419, "step": 3460 }, { "epoch": 0.17330935970432523, "grad_norm": 2.4345173835754395, "learning_rate": 9.815863465067651e-05, "loss": 1.1373, "step": 3470 }, { "epoch": 0.17380881030866047, "grad_norm": 5.2141523361206055, "learning_rate": 9.81480724009075e-05, "loss": 0.9027, "step": 3480 }, { "epoch": 0.1743082609129957, "grad_norm": 0.884800136089325, "learning_rate": 9.813748051622101e-05, "loss": 0.8381, "step": 3490 }, { "epoch": 0.17480771151733093, "grad_norm": 3.8795621395111084, "learning_rate": 9.812685900313626e-05, "loss": 0.9876, "step": 3500 }, { "epoch": 0.17530716212166617, "grad_norm": 0.9977666735649109, "learning_rate": 9.811620786819074e-05, "loss": 1.1316, "step": 3510 }, { "epoch": 0.1758066127260014, "grad_norm": 1.1967836618423462, "learning_rate": 9.810552711794021e-05, "loss": 0.9226, "step": 3520 }, { "epoch": 0.17630606333033663, "grad_norm": 0.5718588829040527, "learning_rate": 9.80948167589586e-05, "loss": 0.7573, "step": 3530 }, { "epoch": 0.17680551393467187, "grad_norm": 5.537643909454346, "learning_rate": 9.808407679783806e-05, "loss": 0.8398, "step": 3540 }, { "epoch": 0.1773049645390071, "grad_norm": 1.492255687713623, "learning_rate": 9.807330724118905e-05, "loss": 0.8927, "step": 3550 }, { "epoch": 0.17780441514334233, "grad_norm": 0.9181856513023376, "learning_rate": 9.806250809564014e-05, "loss": 1.1437, "step": 3560 }, { "epoch": 0.17830386574767756, "grad_norm": 1.5734564065933228, "learning_rate": 9.805167936783815e-05, "loss": 0.8037, "step": 3570 }, { "epoch": 0.1788033163520128, "grad_norm": 3.1623282432556152, "learning_rate": 9.804082106444814e-05, "loss": 0.9561, "step": 3580 }, { "epoch": 0.17930276695634803, "grad_norm": 2.334489345550537, "learning_rate": 9.802993319215332e-05, "loss": 1.0817, "step": 3590 }, { "epoch": 0.17980221756068324, "grad_norm": 1.4881401062011719, "learning_rate": 9.801901575765515e-05, "loss": 0.8115, "step": 3600 }, { "epoch": 0.18030166816501847, "grad_norm": 1.3333991765975952, "learning_rate": 9.800806876767324e-05, "loss": 0.8874, "step": 3610 }, { "epoch": 0.1808011187693537, "grad_norm": 3.5096232891082764, "learning_rate": 9.799709222894539e-05, "loss": 0.8099, "step": 3620 }, { "epoch": 0.18130056937368894, "grad_norm": 3.1804404258728027, "learning_rate": 9.798608614822769e-05, "loss": 0.8315, "step": 3630 }, { "epoch": 0.18180001997802417, "grad_norm": 1.045871376991272, "learning_rate": 9.797505053229425e-05, "loss": 0.6821, "step": 3640 }, { "epoch": 0.1822994705823594, "grad_norm": 2.789235830307007, "learning_rate": 9.796398538793748e-05, "loss": 0.7989, "step": 3650 }, { "epoch": 0.18279892118669464, "grad_norm": 1.0591028928756714, "learning_rate": 9.795289072196789e-05, "loss": 0.9044, "step": 3660 }, { "epoch": 0.18329837179102987, "grad_norm": 1.6923969984054565, "learning_rate": 9.794176654121425e-05, "loss": 0.8402, "step": 3670 }, { "epoch": 0.1837978223953651, "grad_norm": 1.9938535690307617, "learning_rate": 9.793061285252341e-05, "loss": 0.974, "step": 3680 }, { "epoch": 0.18429727299970033, "grad_norm": 4.054079532623291, "learning_rate": 9.791942966276043e-05, "loss": 0.954, "step": 3690 }, { "epoch": 0.18479672360403557, "grad_norm": 1.561954140663147, "learning_rate": 9.79082169788085e-05, "loss": 0.8529, "step": 3700 }, { "epoch": 0.1852961742083708, "grad_norm": 6.34870719909668, "learning_rate": 9.789697480756896e-05, "loss": 1.0314, "step": 3710 }, { "epoch": 0.18579562481270603, "grad_norm": 2.829148530960083, "learning_rate": 9.788570315596134e-05, "loss": 0.9551, "step": 3720 }, { "epoch": 0.18629507541704127, "grad_norm": 1.198886752128601, "learning_rate": 9.787440203092329e-05, "loss": 0.8587, "step": 3730 }, { "epoch": 0.18679452602137647, "grad_norm": 2.539764642715454, "learning_rate": 9.786307143941057e-05, "loss": 0.8987, "step": 3740 }, { "epoch": 0.1872939766257117, "grad_norm": 1.3188793659210205, "learning_rate": 9.785171138839715e-05, "loss": 1.1287, "step": 3750 }, { "epoch": 0.18779342723004694, "grad_norm": 4.570198059082031, "learning_rate": 9.784032188487506e-05, "loss": 0.8166, "step": 3760 }, { "epoch": 0.18829287783438217, "grad_norm": 1.0271146297454834, "learning_rate": 9.782890293585449e-05, "loss": 0.7467, "step": 3770 }, { "epoch": 0.1887923284387174, "grad_norm": 2.158318519592285, "learning_rate": 9.781745454836377e-05, "loss": 1.0177, "step": 3780 }, { "epoch": 0.18929177904305264, "grad_norm": 3.2736594676971436, "learning_rate": 9.78059767294493e-05, "loss": 0.8979, "step": 3790 }, { "epoch": 0.18979122964738787, "grad_norm": 1.217395305633545, "learning_rate": 9.779446948617565e-05, "loss": 0.8489, "step": 3800 }, { "epoch": 0.1902906802517231, "grad_norm": 2.430159568786621, "learning_rate": 9.778293282562547e-05, "loss": 0.6917, "step": 3810 }, { "epoch": 0.19079013085605834, "grad_norm": 2.7768425941467285, "learning_rate": 9.77713667548995e-05, "loss": 0.8224, "step": 3820 }, { "epoch": 0.19128958146039357, "grad_norm": 3.406033515930176, "learning_rate": 9.775977128111663e-05, "loss": 0.9499, "step": 3830 }, { "epoch": 0.1917890320647288, "grad_norm": 1.3434473276138306, "learning_rate": 9.774814641141382e-05, "loss": 1.2465, "step": 3840 }, { "epoch": 0.19228848266906404, "grad_norm": 3.096501350402832, "learning_rate": 9.773649215294611e-05, "loss": 0.7828, "step": 3850 }, { "epoch": 0.19278793327339927, "grad_norm": 2.608057737350464, "learning_rate": 9.772480851288666e-05, "loss": 0.9272, "step": 3860 }, { "epoch": 0.1932873838777345, "grad_norm": 1.0312490463256836, "learning_rate": 9.771309549842666e-05, "loss": 1.1293, "step": 3870 }, { "epoch": 0.19378683448206974, "grad_norm": 1.8020999431610107, "learning_rate": 9.770135311677546e-05, "loss": 0.8136, "step": 3880 }, { "epoch": 0.19428628508640494, "grad_norm": 1.438332200050354, "learning_rate": 9.768958137516042e-05, "loss": 0.7858, "step": 3890 }, { "epoch": 0.19478573569074017, "grad_norm": 2.0370874404907227, "learning_rate": 9.7677780280827e-05, "loss": 1.1137, "step": 3900 }, { "epoch": 0.1952851862950754, "grad_norm": 1.5007089376449585, "learning_rate": 9.766594984103872e-05, "loss": 0.6553, "step": 3910 }, { "epoch": 0.19578463689941064, "grad_norm": 4.325165271759033, "learning_rate": 9.765409006307715e-05, "loss": 1.0529, "step": 3920 }, { "epoch": 0.19628408750374587, "grad_norm": 2.121492385864258, "learning_rate": 9.764220095424195e-05, "loss": 1.0299, "step": 3930 }, { "epoch": 0.1967835381080811, "grad_norm": 3.8324851989746094, "learning_rate": 9.763028252185077e-05, "loss": 0.9219, "step": 3940 }, { "epoch": 0.19728298871241634, "grad_norm": 0.20872262120246887, "learning_rate": 9.76183347732394e-05, "loss": 0.7862, "step": 3950 }, { "epoch": 0.19778243931675157, "grad_norm": 1.5114883184432983, "learning_rate": 9.760635771576161e-05, "loss": 0.9307, "step": 3960 }, { "epoch": 0.1982818899210868, "grad_norm": 1.2050377130508423, "learning_rate": 9.75943513567892e-05, "loss": 0.9463, "step": 3970 }, { "epoch": 0.19878134052542204, "grad_norm": 2.041001558303833, "learning_rate": 9.758231570371206e-05, "loss": 0.7967, "step": 3980 }, { "epoch": 0.19928079112975727, "grad_norm": 2.6333858966827393, "learning_rate": 9.757025076393805e-05, "loss": 0.838, "step": 3990 }, { "epoch": 0.1997802417340925, "grad_norm": 1.027332067489624, "learning_rate": 9.755815654489311e-05, "loss": 0.7515, "step": 4000 }, { "epoch": 0.20027969233842774, "grad_norm": 1.96125066280365, "learning_rate": 9.754603305402117e-05, "loss": 0.872, "step": 4010 }, { "epoch": 0.20077914294276297, "grad_norm": 1.2602308988571167, "learning_rate": 9.753388029878416e-05, "loss": 0.6845, "step": 4020 }, { "epoch": 0.2012785935470982, "grad_norm": 3.9577910900115967, "learning_rate": 9.752169828666208e-05, "loss": 0.8699, "step": 4030 }, { "epoch": 0.2017780441514334, "grad_norm": 1.2592867612838745, "learning_rate": 9.750948702515289e-05, "loss": 0.7011, "step": 4040 }, { "epoch": 0.20227749475576864, "grad_norm": 3.0217552185058594, "learning_rate": 9.749724652177256e-05, "loss": 0.7576, "step": 4050 }, { "epoch": 0.20277694536010388, "grad_norm": 2.191291093826294, "learning_rate": 9.748497678405507e-05, "loss": 0.8379, "step": 4060 }, { "epoch": 0.2032763959644391, "grad_norm": 3.587172508239746, "learning_rate": 9.747267781955238e-05, "loss": 0.7689, "step": 4070 }, { "epoch": 0.20377584656877434, "grad_norm": 2.85992693901062, "learning_rate": 9.746034963583444e-05, "loss": 0.8742, "step": 4080 }, { "epoch": 0.20427529717310958, "grad_norm": 1.0260587930679321, "learning_rate": 9.74479922404892e-05, "loss": 0.9285, "step": 4090 }, { "epoch": 0.2047747477774448, "grad_norm": 1.6661021709442139, "learning_rate": 9.74356056411226e-05, "loss": 0.7533, "step": 4100 }, { "epoch": 0.20527419838178004, "grad_norm": 2.2743191719055176, "learning_rate": 9.74231898453585e-05, "loss": 0.8414, "step": 4110 }, { "epoch": 0.20577364898611528, "grad_norm": 3.976389169692993, "learning_rate": 9.741074486083878e-05, "loss": 0.8494, "step": 4120 }, { "epoch": 0.2062730995904505, "grad_norm": 2.743157386779785, "learning_rate": 9.739827069522327e-05, "loss": 1.1373, "step": 4130 }, { "epoch": 0.20677255019478574, "grad_norm": 1.540973424911499, "learning_rate": 9.738576735618977e-05, "loss": 0.8152, "step": 4140 }, { "epoch": 0.20727200079912098, "grad_norm": 2.5153651237487793, "learning_rate": 9.737323485143401e-05, "loss": 0.9452, "step": 4150 }, { "epoch": 0.2077714514034562, "grad_norm": 3.126713991165161, "learning_rate": 9.736067318866973e-05, "loss": 1.1158, "step": 4160 }, { "epoch": 0.20827090200779144, "grad_norm": 2.0404253005981445, "learning_rate": 9.734808237562851e-05, "loss": 0.9586, "step": 4170 }, { "epoch": 0.20877035261212665, "grad_norm": 2.13669490814209, "learning_rate": 9.733546242006e-05, "loss": 0.8836, "step": 4180 }, { "epoch": 0.20926980321646188, "grad_norm": 1.5237314701080322, "learning_rate": 9.732281332973168e-05, "loss": 0.8961, "step": 4190 }, { "epoch": 0.2097692538207971, "grad_norm": 1.4856115579605103, "learning_rate": 9.731013511242902e-05, "loss": 0.7561, "step": 4200 }, { "epoch": 0.21026870442513235, "grad_norm": 3.4389278888702393, "learning_rate": 9.729742777595543e-05, "loss": 0.982, "step": 4210 }, { "epoch": 0.21076815502946758, "grad_norm": 1.0415877103805542, "learning_rate": 9.728469132813218e-05, "loss": 0.6615, "step": 4220 }, { "epoch": 0.2112676056338028, "grad_norm": 0.9365307688713074, "learning_rate": 9.727192577679851e-05, "loss": 0.8517, "step": 4230 }, { "epoch": 0.21176705623813805, "grad_norm": 1.5347106456756592, "learning_rate": 9.725913112981157e-05, "loss": 0.908, "step": 4240 }, { "epoch": 0.21226650684247328, "grad_norm": 1.4954674243927002, "learning_rate": 9.724630739504641e-05, "loss": 1.0653, "step": 4250 }, { "epoch": 0.2127659574468085, "grad_norm": 4.099848747253418, "learning_rate": 9.723345458039594e-05, "loss": 0.9833, "step": 4260 }, { "epoch": 0.21326540805114375, "grad_norm": 3.3358452320098877, "learning_rate": 9.722057269377105e-05, "loss": 0.8166, "step": 4270 }, { "epoch": 0.21376485865547898, "grad_norm": 1.451033592224121, "learning_rate": 9.720766174310047e-05, "loss": 0.858, "step": 4280 }, { "epoch": 0.2142643092598142, "grad_norm": 1.987222671508789, "learning_rate": 9.719472173633082e-05, "loss": 0.6486, "step": 4290 }, { "epoch": 0.21476375986414944, "grad_norm": 1.662618637084961, "learning_rate": 9.718175268142662e-05, "loss": 0.7854, "step": 4300 }, { "epoch": 0.21526321046848468, "grad_norm": 2.374152183532715, "learning_rate": 9.716875458637027e-05, "loss": 0.9253, "step": 4310 }, { "epoch": 0.2157626610728199, "grad_norm": 1.624871015548706, "learning_rate": 9.715572745916204e-05, "loss": 0.7942, "step": 4320 }, { "epoch": 0.21626211167715512, "grad_norm": 2.8344717025756836, "learning_rate": 9.714267130782006e-05, "loss": 0.9736, "step": 4330 }, { "epoch": 0.21676156228149035, "grad_norm": 1.3681503534317017, "learning_rate": 9.712958614038033e-05, "loss": 0.7467, "step": 4340 }, { "epoch": 0.21726101288582558, "grad_norm": 1.5472357273101807, "learning_rate": 9.71164719648967e-05, "loss": 0.752, "step": 4350 }, { "epoch": 0.21776046349016082, "grad_norm": 8.472622871398926, "learning_rate": 9.71033287894409e-05, "loss": 1.1423, "step": 4360 }, { "epoch": 0.21825991409449605, "grad_norm": 1.368548035621643, "learning_rate": 9.709015662210252e-05, "loss": 0.7389, "step": 4370 }, { "epoch": 0.21875936469883128, "grad_norm": 1.1928058862686157, "learning_rate": 9.707695547098891e-05, "loss": 0.7544, "step": 4380 }, { "epoch": 0.21925881530316652, "grad_norm": 9.385339736938477, "learning_rate": 9.706372534422536e-05, "loss": 0.9805, "step": 4390 }, { "epoch": 0.21975826590750175, "grad_norm": 2.0432844161987305, "learning_rate": 9.705046624995495e-05, "loss": 0.8043, "step": 4400 }, { "epoch": 0.22025771651183698, "grad_norm": 1.434600591659546, "learning_rate": 9.703717819633856e-05, "loss": 0.9335, "step": 4410 }, { "epoch": 0.22075716711617221, "grad_norm": 1.6714608669281006, "learning_rate": 9.702386119155495e-05, "loss": 1.1171, "step": 4420 }, { "epoch": 0.22125661772050745, "grad_norm": 3.2519516944885254, "learning_rate": 9.701051524380069e-05, "loss": 0.852, "step": 4430 }, { "epoch": 0.22175606832484268, "grad_norm": 4.377466201782227, "learning_rate": 9.699714036129013e-05, "loss": 1.0314, "step": 4440 }, { "epoch": 0.22225551892917791, "grad_norm": 4.908904075622559, "learning_rate": 9.698373655225546e-05, "loss": 0.8541, "step": 4450 }, { "epoch": 0.22275496953351315, "grad_norm": 1.3215618133544922, "learning_rate": 9.697030382494663e-05, "loss": 0.7287, "step": 4460 }, { "epoch": 0.22325442013784838, "grad_norm": 5.293785095214844, "learning_rate": 9.695684218763145e-05, "loss": 0.875, "step": 4470 }, { "epoch": 0.22375387074218359, "grad_norm": 3.4198853969573975, "learning_rate": 9.694335164859552e-05, "loss": 1.0289, "step": 4480 }, { "epoch": 0.22425332134651882, "grad_norm": 1.569650411605835, "learning_rate": 9.692983221614216e-05, "loss": 0.8788, "step": 4490 }, { "epoch": 0.22475277195085405, "grad_norm": 1.5537540912628174, "learning_rate": 9.691628389859253e-05, "loss": 0.5818, "step": 4500 }, { "epoch": 0.22525222255518929, "grad_norm": 1.9405503273010254, "learning_rate": 9.690270670428557e-05, "loss": 0.7723, "step": 4510 }, { "epoch": 0.22575167315952452, "grad_norm": 1.9559584856033325, "learning_rate": 9.688910064157798e-05, "loss": 0.8411, "step": 4520 }, { "epoch": 0.22625112376385975, "grad_norm": 3.755566120147705, "learning_rate": 9.687546571884421e-05, "loss": 0.7732, "step": 4530 }, { "epoch": 0.22675057436819498, "grad_norm": 2.3459646701812744, "learning_rate": 9.686180194447652e-05, "loss": 0.9359, "step": 4540 }, { "epoch": 0.22725002497253022, "grad_norm": 3.293691635131836, "learning_rate": 9.684810932688488e-05, "loss": 0.782, "step": 4550 }, { "epoch": 0.22774947557686545, "grad_norm": 5.3083109855651855, "learning_rate": 9.683438787449704e-05, "loss": 1.019, "step": 4560 }, { "epoch": 0.22824892618120068, "grad_norm": 1.8565597534179688, "learning_rate": 9.682063759575848e-05, "loss": 0.955, "step": 4570 }, { "epoch": 0.22874837678553592, "grad_norm": 1.9487963914871216, "learning_rate": 9.680685849913244e-05, "loss": 0.8568, "step": 4580 }, { "epoch": 0.22924782738987115, "grad_norm": 0.8379784226417542, "learning_rate": 9.679305059309992e-05, "loss": 0.7309, "step": 4590 }, { "epoch": 0.22974727799420638, "grad_norm": 3.0211181640625, "learning_rate": 9.677921388615959e-05, "loss": 0.9924, "step": 4600 }, { "epoch": 0.23024672859854162, "grad_norm": 1.363000750541687, "learning_rate": 9.676534838682788e-05, "loss": 0.791, "step": 4610 }, { "epoch": 0.23074617920287682, "grad_norm": 2.081880569458008, "learning_rate": 9.675145410363894e-05, "loss": 1.0073, "step": 4620 }, { "epoch": 0.23124562980721206, "grad_norm": 1.5669691562652588, "learning_rate": 9.673753104514465e-05, "loss": 0.9327, "step": 4630 }, { "epoch": 0.2317450804115473, "grad_norm": 1.333825945854187, "learning_rate": 9.67235792199146e-05, "loss": 1.0045, "step": 4640 }, { "epoch": 0.23224453101588252, "grad_norm": 3.206181049346924, "learning_rate": 9.670959863653604e-05, "loss": 0.813, "step": 4650 }, { "epoch": 0.23274398162021775, "grad_norm": 3.0151419639587402, "learning_rate": 9.669558930361397e-05, "loss": 1.0477, "step": 4660 }, { "epoch": 0.233243432224553, "grad_norm": 2.978698968887329, "learning_rate": 9.668155122977109e-05, "loss": 0.7547, "step": 4670 }, { "epoch": 0.23374288282888822, "grad_norm": 2.5535624027252197, "learning_rate": 9.666748442364775e-05, "loss": 0.8055, "step": 4680 }, { "epoch": 0.23424233343322345, "grad_norm": 1.425445556640625, "learning_rate": 9.6653388893902e-05, "loss": 0.9289, "step": 4690 }, { "epoch": 0.2347417840375587, "grad_norm": 3.3846025466918945, "learning_rate": 9.663926464920958e-05, "loss": 0.9202, "step": 4700 }, { "epoch": 0.23524123464189392, "grad_norm": 1.5291244983673096, "learning_rate": 9.662511169826391e-05, "loss": 0.9463, "step": 4710 }, { "epoch": 0.23574068524622915, "grad_norm": 0.5786092877388, "learning_rate": 9.661093004977606e-05, "loss": 0.6806, "step": 4720 }, { "epoch": 0.2362401358505644, "grad_norm": 1.5869218111038208, "learning_rate": 9.659671971247475e-05, "loss": 0.8826, "step": 4730 }, { "epoch": 0.23673958645489962, "grad_norm": 1.6809816360473633, "learning_rate": 9.658248069510639e-05, "loss": 0.8307, "step": 4740 }, { "epoch": 0.23723903705923485, "grad_norm": 1.3082853555679321, "learning_rate": 9.656821300643504e-05, "loss": 0.945, "step": 4750 }, { "epoch": 0.23773848766357009, "grad_norm": 3.3663363456726074, "learning_rate": 9.655391665524239e-05, "loss": 0.9417, "step": 4760 }, { "epoch": 0.2382379382679053, "grad_norm": 2.5983574390411377, "learning_rate": 9.653959165032779e-05, "loss": 0.8467, "step": 4770 }, { "epoch": 0.23873738887224052, "grad_norm": 1.2755992412567139, "learning_rate": 9.652523800050819e-05, "loss": 0.8064, "step": 4780 }, { "epoch": 0.23923683947657576, "grad_norm": 2.786166191101074, "learning_rate": 9.65108557146182e-05, "loss": 0.939, "step": 4790 }, { "epoch": 0.239736290080911, "grad_norm": 3.326118230819702, "learning_rate": 9.649644480151008e-05, "loss": 0.9213, "step": 4800 }, { "epoch": 0.24023574068524622, "grad_norm": 0.9329853057861328, "learning_rate": 9.648200527005364e-05, "loss": 0.8436, "step": 4810 }, { "epoch": 0.24073519128958146, "grad_norm": 1.561836838722229, "learning_rate": 9.646753712913637e-05, "loss": 0.9049, "step": 4820 }, { "epoch": 0.2412346418939167, "grad_norm": 2.1252520084381104, "learning_rate": 9.645304038766335e-05, "loss": 1.193, "step": 4830 }, { "epoch": 0.24173409249825192, "grad_norm": 2.028834819793701, "learning_rate": 9.643851505455725e-05, "loss": 0.7113, "step": 4840 }, { "epoch": 0.24223354310258716, "grad_norm": 4.80098295211792, "learning_rate": 9.642396113875834e-05, "loss": 0.8507, "step": 4850 }, { "epoch": 0.2427329937069224, "grad_norm": 2.0205676555633545, "learning_rate": 9.640937864922447e-05, "loss": 1.0054, "step": 4860 }, { "epoch": 0.24323244431125762, "grad_norm": 2.245183229446411, "learning_rate": 9.639476759493114e-05, "loss": 0.9972, "step": 4870 }, { "epoch": 0.24373189491559286, "grad_norm": 1.1496859788894653, "learning_rate": 9.638012798487135e-05, "loss": 0.9177, "step": 4880 }, { "epoch": 0.2442313455199281, "grad_norm": 4.3665032386779785, "learning_rate": 9.636545982805574e-05, "loss": 0.7268, "step": 4890 }, { "epoch": 0.24473079612426332, "grad_norm": 1.243686318397522, "learning_rate": 9.635076313351248e-05, "loss": 0.8992, "step": 4900 }, { "epoch": 0.24523024672859856, "grad_norm": 1.5397096872329712, "learning_rate": 9.633603791028732e-05, "loss": 0.8472, "step": 4910 }, { "epoch": 0.24572969733293376, "grad_norm": 2.435830593109131, "learning_rate": 9.632128416744355e-05, "loss": 1.0926, "step": 4920 }, { "epoch": 0.246229147937269, "grad_norm": 1.8933160305023193, "learning_rate": 9.630650191406205e-05, "loss": 0.7815, "step": 4930 }, { "epoch": 0.24672859854160423, "grad_norm": 3.721090078353882, "learning_rate": 9.629169115924123e-05, "loss": 0.979, "step": 4940 }, { "epoch": 0.24722804914593946, "grad_norm": 1.8302251100540161, "learning_rate": 9.627685191209703e-05, "loss": 0.7747, "step": 4950 }, { "epoch": 0.2477274997502747, "grad_norm": 1.1439883708953857, "learning_rate": 9.626198418176296e-05, "loss": 0.9603, "step": 4960 }, { "epoch": 0.24822695035460993, "grad_norm": 1.6686792373657227, "learning_rate": 9.624708797739001e-05, "loss": 0.7666, "step": 4970 }, { "epoch": 0.24872640095894516, "grad_norm": 1.5663448572158813, "learning_rate": 9.623216330814675e-05, "loss": 0.7684, "step": 4980 }, { "epoch": 0.2492258515632804, "grad_norm": 1.3111817836761475, "learning_rate": 9.621721018321924e-05, "loss": 0.774, "step": 4990 }, { "epoch": 0.24972530216761563, "grad_norm": 1.4687976837158203, "learning_rate": 9.620222861181103e-05, "loss": 0.784, "step": 5000 }, { "epoch": 0.25022475277195083, "grad_norm": 2.047543525695801, "learning_rate": 9.618721860314326e-05, "loss": 0.9805, "step": 5010 }, { "epoch": 0.2507242033762861, "grad_norm": 1.2842210531234741, "learning_rate": 9.617218016645448e-05, "loss": 1.0804, "step": 5020 }, { "epoch": 0.2512236539806213, "grad_norm": 1.8332990407943726, "learning_rate": 9.615711331100081e-05, "loss": 0.8339, "step": 5030 }, { "epoch": 0.25172310458495656, "grad_norm": 1.4070039987564087, "learning_rate": 9.614201804605581e-05, "loss": 0.8671, "step": 5040 }, { "epoch": 0.25222255518929176, "grad_norm": 2.0862865447998047, "learning_rate": 9.612689438091054e-05, "loss": 1.0481, "step": 5050 }, { "epoch": 0.252722005793627, "grad_norm": 3.3928749561309814, "learning_rate": 9.611174232487357e-05, "loss": 0.7934, "step": 5060 }, { "epoch": 0.25322145639796223, "grad_norm": 1.5095566511154175, "learning_rate": 9.609656188727089e-05, "loss": 0.7343, "step": 5070 }, { "epoch": 0.2537209070022975, "grad_norm": 1.5008156299591064, "learning_rate": 9.608135307744601e-05, "loss": 0.9165, "step": 5080 }, { "epoch": 0.2542203576066327, "grad_norm": 1.3544814586639404, "learning_rate": 9.60661159047599e-05, "loss": 0.7254, "step": 5090 }, { "epoch": 0.25471980821096796, "grad_norm": 3.1354422569274902, "learning_rate": 9.605085037859094e-05, "loss": 0.7744, "step": 5100 }, { "epoch": 0.25521925881530316, "grad_norm": 0.9450280666351318, "learning_rate": 9.6035556508335e-05, "loss": 0.9265, "step": 5110 }, { "epoch": 0.2557187094196384, "grad_norm": 1.6109542846679688, "learning_rate": 9.602023430340539e-05, "loss": 0.8604, "step": 5120 }, { "epoch": 0.25621816002397363, "grad_norm": 2.064497470855713, "learning_rate": 9.600488377323287e-05, "loss": 0.8158, "step": 5130 }, { "epoch": 0.25671761062830883, "grad_norm": 1.5562397241592407, "learning_rate": 9.598950492726559e-05, "loss": 0.8947, "step": 5140 }, { "epoch": 0.2572170612326441, "grad_norm": 1.3519991636276245, "learning_rate": 9.597409777496918e-05, "loss": 0.8841, "step": 5150 }, { "epoch": 0.2577165118369793, "grad_norm": 2.1325738430023193, "learning_rate": 9.595866232582665e-05, "loss": 0.7257, "step": 5160 }, { "epoch": 0.25821596244131456, "grad_norm": 2.0866081714630127, "learning_rate": 9.594319858933848e-05, "loss": 0.891, "step": 5170 }, { "epoch": 0.25871541304564977, "grad_norm": 1.882603645324707, "learning_rate": 9.592770657502251e-05, "loss": 1.0195, "step": 5180 }, { "epoch": 0.25921486364998503, "grad_norm": 1.3703722953796387, "learning_rate": 9.5912186292414e-05, "loss": 0.9031, "step": 5190 }, { "epoch": 0.25971431425432023, "grad_norm": 1.3103618621826172, "learning_rate": 9.589663775106564e-05, "loss": 0.8455, "step": 5200 }, { "epoch": 0.2602137648586555, "grad_norm": 2.314448118209839, "learning_rate": 9.588106096054745e-05, "loss": 1.0818, "step": 5210 }, { "epoch": 0.2607132154629907, "grad_norm": 1.4067221879959106, "learning_rate": 9.58654559304469e-05, "loss": 0.8523, "step": 5220 }, { "epoch": 0.26121266606732596, "grad_norm": 1.8233261108398438, "learning_rate": 9.58498226703688e-05, "loss": 0.8185, "step": 5230 }, { "epoch": 0.26171211667166117, "grad_norm": 4.319331645965576, "learning_rate": 9.583416118993536e-05, "loss": 1.2356, "step": 5240 }, { "epoch": 0.2622115672759964, "grad_norm": 4.6194024085998535, "learning_rate": 9.581847149878612e-05, "loss": 0.9103, "step": 5250 }, { "epoch": 0.26271101788033163, "grad_norm": 2.9623756408691406, "learning_rate": 9.580275360657805e-05, "loss": 0.9043, "step": 5260 }, { "epoch": 0.2632104684846669, "grad_norm": 1.8878183364868164, "learning_rate": 9.578700752298542e-05, "loss": 0.8641, "step": 5270 }, { "epoch": 0.2637099190890021, "grad_norm": 1.2333494424819946, "learning_rate": 9.577123325769987e-05, "loss": 0.7547, "step": 5280 }, { "epoch": 0.2642093696933373, "grad_norm": 3.1482670307159424, "learning_rate": 9.575543082043042e-05, "loss": 1.0384, "step": 5290 }, { "epoch": 0.26470882029767256, "grad_norm": 1.1335426568984985, "learning_rate": 9.573960022090336e-05, "loss": 0.9774, "step": 5300 }, { "epoch": 0.26520827090200777, "grad_norm": 0.824373185634613, "learning_rate": 9.572374146886236e-05, "loss": 0.9198, "step": 5310 }, { "epoch": 0.26570772150634303, "grad_norm": 5.559612274169922, "learning_rate": 9.570785457406842e-05, "loss": 0.7867, "step": 5320 }, { "epoch": 0.26620717211067824, "grad_norm": 1.273941159248352, "learning_rate": 9.569193954629985e-05, "loss": 0.7399, "step": 5330 }, { "epoch": 0.2667066227150135, "grad_norm": 3.5407681465148926, "learning_rate": 9.567599639535225e-05, "loss": 0.8633, "step": 5340 }, { "epoch": 0.2672060733193487, "grad_norm": 1.2934330701828003, "learning_rate": 9.56600251310386e-05, "loss": 0.8116, "step": 5350 }, { "epoch": 0.26770552392368396, "grad_norm": 1.032084584236145, "learning_rate": 9.564402576318912e-05, "loss": 0.8625, "step": 5360 }, { "epoch": 0.26820497452801917, "grad_norm": 3.6709439754486084, "learning_rate": 9.562799830165132e-05, "loss": 0.7579, "step": 5370 }, { "epoch": 0.26870442513235443, "grad_norm": 3.813777208328247, "learning_rate": 9.561194275629006e-05, "loss": 0.8647, "step": 5380 }, { "epoch": 0.26920387573668964, "grad_norm": 2.112259864807129, "learning_rate": 9.559585913698746e-05, "loss": 0.7539, "step": 5390 }, { "epoch": 0.2697033263410249, "grad_norm": 2.7863011360168457, "learning_rate": 9.557974745364289e-05, "loss": 0.8385, "step": 5400 }, { "epoch": 0.2702027769453601, "grad_norm": 3.3667349815368652, "learning_rate": 9.556360771617305e-05, "loss": 0.8315, "step": 5410 }, { "epoch": 0.27070222754969536, "grad_norm": 3.227236032485962, "learning_rate": 9.554743993451183e-05, "loss": 0.8337, "step": 5420 }, { "epoch": 0.27120167815403057, "grad_norm": 1.1658493280410767, "learning_rate": 9.553124411861045e-05, "loss": 0.7649, "step": 5430 }, { "epoch": 0.2717011287583658, "grad_norm": 2.001400947570801, "learning_rate": 9.551502027843737e-05, "loss": 0.7902, "step": 5440 }, { "epoch": 0.27220057936270103, "grad_norm": 1.665940284729004, "learning_rate": 9.549876842397827e-05, "loss": 0.7532, "step": 5450 }, { "epoch": 0.27270002996703624, "grad_norm": 3.2563281059265137, "learning_rate": 9.548248856523611e-05, "loss": 0.9413, "step": 5460 }, { "epoch": 0.2731994805713715, "grad_norm": 1.9077765941619873, "learning_rate": 9.546618071223105e-05, "loss": 0.8394, "step": 5470 }, { "epoch": 0.2736989311757067, "grad_norm": 3.954552412033081, "learning_rate": 9.54498448750005e-05, "loss": 1.0901, "step": 5480 }, { "epoch": 0.27419838178004197, "grad_norm": 2.2440567016601562, "learning_rate": 9.543348106359912e-05, "loss": 0.7122, "step": 5490 }, { "epoch": 0.27469783238437717, "grad_norm": 1.053545355796814, "learning_rate": 9.541708928809872e-05, "loss": 0.974, "step": 5500 }, { "epoch": 0.27519728298871243, "grad_norm": 1.2328383922576904, "learning_rate": 9.54006695585884e-05, "loss": 0.6709, "step": 5510 }, { "epoch": 0.27569673359304764, "grad_norm": 1.5231152772903442, "learning_rate": 9.53842218851744e-05, "loss": 0.7991, "step": 5520 }, { "epoch": 0.2761961841973829, "grad_norm": 1.3110510110855103, "learning_rate": 9.536774627798019e-05, "loss": 0.7467, "step": 5530 }, { "epoch": 0.2766956348017181, "grad_norm": 2.076733350753784, "learning_rate": 9.535124274714647e-05, "loss": 0.8851, "step": 5540 }, { "epoch": 0.27719508540605337, "grad_norm": 1.84781014919281, "learning_rate": 9.533471130283103e-05, "loss": 0.7877, "step": 5550 }, { "epoch": 0.27769453601038857, "grad_norm": 3.3306989669799805, "learning_rate": 9.531815195520893e-05, "loss": 0.7778, "step": 5560 }, { "epoch": 0.2781939866147238, "grad_norm": 2.332953691482544, "learning_rate": 9.530156471447237e-05, "loss": 0.7651, "step": 5570 }, { "epoch": 0.27869343721905904, "grad_norm": 1.7331910133361816, "learning_rate": 9.528494959083072e-05, "loss": 0.7889, "step": 5580 }, { "epoch": 0.27919288782339424, "grad_norm": 3.0806610584259033, "learning_rate": 9.52683065945105e-05, "loss": 1.1269, "step": 5590 }, { "epoch": 0.2796923384277295, "grad_norm": 3.531687021255493, "learning_rate": 9.525163573575542e-05, "loss": 0.9093, "step": 5600 }, { "epoch": 0.2801917890320647, "grad_norm": 0.8916457295417786, "learning_rate": 9.52349370248263e-05, "loss": 0.6907, "step": 5610 }, { "epoch": 0.28069123963639997, "grad_norm": 1.2871992588043213, "learning_rate": 9.521821047200112e-05, "loss": 0.7586, "step": 5620 }, { "epoch": 0.2811906902407352, "grad_norm": 2.2138314247131348, "learning_rate": 9.5201456087575e-05, "loss": 0.7528, "step": 5630 }, { "epoch": 0.28169014084507044, "grad_norm": 1.9595929384231567, "learning_rate": 9.51846738818602e-05, "loss": 1.024, "step": 5640 }, { "epoch": 0.28218959144940564, "grad_norm": 1.5594611167907715, "learning_rate": 9.516786386518607e-05, "loss": 0.7457, "step": 5650 }, { "epoch": 0.2826890420537409, "grad_norm": 3.690540075302124, "learning_rate": 9.51510260478991e-05, "loss": 0.7899, "step": 5660 }, { "epoch": 0.2831884926580761, "grad_norm": 0.46564897894859314, "learning_rate": 9.513416044036291e-05, "loss": 0.7996, "step": 5670 }, { "epoch": 0.28368794326241137, "grad_norm": 1.169894814491272, "learning_rate": 9.511726705295817e-05, "loss": 0.6761, "step": 5680 }, { "epoch": 0.2841873938667466, "grad_norm": 3.206878900527954, "learning_rate": 9.510034589608273e-05, "loss": 0.8909, "step": 5690 }, { "epoch": 0.28468684447108183, "grad_norm": 2.0504379272460938, "learning_rate": 9.508339698015145e-05, "loss": 0.9729, "step": 5700 }, { "epoch": 0.28518629507541704, "grad_norm": 3.438324213027954, "learning_rate": 9.506642031559631e-05, "loss": 0.968, "step": 5710 }, { "epoch": 0.28568574567975225, "grad_norm": 1.320440649986267, "learning_rate": 9.504941591286637e-05, "loss": 1.0769, "step": 5720 }, { "epoch": 0.2861851962840875, "grad_norm": 3.160295009613037, "learning_rate": 9.503238378242777e-05, "loss": 0.9258, "step": 5730 }, { "epoch": 0.2866846468884227, "grad_norm": 5.033565521240234, "learning_rate": 9.501532393476371e-05, "loss": 1.1267, "step": 5740 }, { "epoch": 0.287184097492758, "grad_norm": 5.532677173614502, "learning_rate": 9.499823638037444e-05, "loss": 1.1826, "step": 5750 }, { "epoch": 0.2876835480970932, "grad_norm": 0.9898492693901062, "learning_rate": 9.498112112977729e-05, "loss": 0.9613, "step": 5760 }, { "epoch": 0.28818299870142844, "grad_norm": 0.7135995626449585, "learning_rate": 9.496397819350657e-05, "loss": 0.8187, "step": 5770 }, { "epoch": 0.28868244930576364, "grad_norm": 2.109062433242798, "learning_rate": 9.494680758211374e-05, "loss": 0.9668, "step": 5780 }, { "epoch": 0.2891818999100989, "grad_norm": 1.283250093460083, "learning_rate": 9.492960930616719e-05, "loss": 0.9275, "step": 5790 }, { "epoch": 0.2896813505144341, "grad_norm": 2.174206495285034, "learning_rate": 9.491238337625239e-05, "loss": 0.5863, "step": 5800 }, { "epoch": 0.29018080111876937, "grad_norm": 2.015956163406372, "learning_rate": 9.489512980297184e-05, "loss": 0.7635, "step": 5810 }, { "epoch": 0.2906802517231046, "grad_norm": 2.241713523864746, "learning_rate": 9.487784859694501e-05, "loss": 0.6692, "step": 5820 }, { "epoch": 0.29117970232743984, "grad_norm": 3.3468496799468994, "learning_rate": 9.486053976880842e-05, "loss": 0.9513, "step": 5830 }, { "epoch": 0.29167915293177504, "grad_norm": 2.5689618587493896, "learning_rate": 9.484320332921555e-05, "loss": 0.8257, "step": 5840 }, { "epoch": 0.2921786035361103, "grad_norm": 1.690453290939331, "learning_rate": 9.482583928883693e-05, "loss": 0.9261, "step": 5850 }, { "epoch": 0.2926780541404455, "grad_norm": 1.3991612195968628, "learning_rate": 9.480844765836004e-05, "loss": 0.8518, "step": 5860 }, { "epoch": 0.2931775047447807, "grad_norm": 2.159390687942505, "learning_rate": 9.479102844848933e-05, "loss": 0.6723, "step": 5870 }, { "epoch": 0.293676955349116, "grad_norm": 1.380337119102478, "learning_rate": 9.477358166994625e-05, "loss": 0.7546, "step": 5880 }, { "epoch": 0.2941764059534512, "grad_norm": 4.3527045249938965, "learning_rate": 9.475610733346922e-05, "loss": 1.0041, "step": 5890 }, { "epoch": 0.29467585655778644, "grad_norm": 2.296097993850708, "learning_rate": 9.473860544981362e-05, "loss": 0.8609, "step": 5900 }, { "epoch": 0.29517530716212165, "grad_norm": 2.5303385257720947, "learning_rate": 9.472107602975178e-05, "loss": 0.5821, "step": 5910 }, { "epoch": 0.2956747577664569, "grad_norm": 2.853238821029663, "learning_rate": 9.470351908407294e-05, "loss": 0.9672, "step": 5920 }, { "epoch": 0.2961742083707921, "grad_norm": 4.257169246673584, "learning_rate": 9.468593462358337e-05, "loss": 1.0844, "step": 5930 }, { "epoch": 0.2966736589751274, "grad_norm": 1.730938196182251, "learning_rate": 9.466832265910619e-05, "loss": 0.9442, "step": 5940 }, { "epoch": 0.2971731095794626, "grad_norm": 1.6568882465362549, "learning_rate": 9.46506832014815e-05, "loss": 0.8206, "step": 5950 }, { "epoch": 0.29767256018379784, "grad_norm": 2.4923627376556396, "learning_rate": 9.46330162615663e-05, "loss": 0.9823, "step": 5960 }, { "epoch": 0.29817201078813305, "grad_norm": 3.7002739906311035, "learning_rate": 9.461532185023452e-05, "loss": 0.9996, "step": 5970 }, { "epoch": 0.2986714613924683, "grad_norm": 1.3538328409194946, "learning_rate": 9.459759997837696e-05, "loss": 0.7523, "step": 5980 }, { "epoch": 0.2991709119968035, "grad_norm": 2.311701774597168, "learning_rate": 9.457985065690138e-05, "loss": 0.5584, "step": 5990 }, { "epoch": 0.2996703626011388, "grad_norm": 3.0756266117095947, "learning_rate": 9.45620738967324e-05, "loss": 0.8745, "step": 6000 }, { "epoch": 0.300169813205474, "grad_norm": 0.9751172065734863, "learning_rate": 9.454426970881153e-05, "loss": 0.9713, "step": 6010 }, { "epoch": 0.3006692638098092, "grad_norm": 1.8199493885040283, "learning_rate": 9.452643810409716e-05, "loss": 1.0827, "step": 6020 }, { "epoch": 0.30116871441414444, "grad_norm": 2.796093225479126, "learning_rate": 9.450857909356459e-05, "loss": 0.9819, "step": 6030 }, { "epoch": 0.30166816501847965, "grad_norm": 2.0498862266540527, "learning_rate": 9.449069268820592e-05, "loss": 0.7893, "step": 6040 }, { "epoch": 0.3021676156228149, "grad_norm": 3.2884140014648438, "learning_rate": 9.447277889903015e-05, "loss": 0.865, "step": 6050 }, { "epoch": 0.3026670662271501, "grad_norm": 1.4740967750549316, "learning_rate": 9.445483773706318e-05, "loss": 0.8088, "step": 6060 }, { "epoch": 0.3031665168314854, "grad_norm": 2.078514337539673, "learning_rate": 9.443686921334766e-05, "loss": 0.9547, "step": 6070 }, { "epoch": 0.3036659674358206, "grad_norm": 3.334926128387451, "learning_rate": 9.441887333894319e-05, "loss": 0.9822, "step": 6080 }, { "epoch": 0.30416541804015584, "grad_norm": 4.29825496673584, "learning_rate": 9.44008501249261e-05, "loss": 0.8485, "step": 6090 }, { "epoch": 0.30466486864449105, "grad_norm": 3.656981945037842, "learning_rate": 9.43827995823896e-05, "loss": 0.8, "step": 6100 }, { "epoch": 0.3051643192488263, "grad_norm": 2.2716410160064697, "learning_rate": 9.436472172244374e-05, "loss": 1.0473, "step": 6110 }, { "epoch": 0.3056637698531615, "grad_norm": 1.8566354513168335, "learning_rate": 9.434661655621535e-05, "loss": 0.8222, "step": 6120 }, { "epoch": 0.3061632204574968, "grad_norm": 2.2977349758148193, "learning_rate": 9.432848409484807e-05, "loss": 0.7384, "step": 6130 }, { "epoch": 0.306662671061832, "grad_norm": 0.9810746312141418, "learning_rate": 9.431032434950236e-05, "loss": 0.73, "step": 6140 }, { "epoch": 0.30716212166616724, "grad_norm": 1.3762357234954834, "learning_rate": 9.429213733135543e-05, "loss": 0.8214, "step": 6150 }, { "epoch": 0.30766157227050245, "grad_norm": 1.197029709815979, "learning_rate": 9.427392305160134e-05, "loss": 0.9194, "step": 6160 }, { "epoch": 0.30816102287483765, "grad_norm": 1.17782461643219, "learning_rate": 9.42556815214509e-05, "loss": 0.8519, "step": 6170 }, { "epoch": 0.3086604734791729, "grad_norm": 1.609448790550232, "learning_rate": 9.423741275213165e-05, "loss": 0.8852, "step": 6180 }, { "epoch": 0.3091599240835081, "grad_norm": 4.505314826965332, "learning_rate": 9.421911675488797e-05, "loss": 1.0346, "step": 6190 }, { "epoch": 0.3096593746878434, "grad_norm": 2.860553026199341, "learning_rate": 9.420079354098093e-05, "loss": 0.8123, "step": 6200 }, { "epoch": 0.3101588252921786, "grad_norm": 3.7026782035827637, "learning_rate": 9.418244312168842e-05, "loss": 0.7699, "step": 6210 }, { "epoch": 0.31065827589651385, "grad_norm": 1.043831467628479, "learning_rate": 9.4164065508305e-05, "loss": 0.8402, "step": 6220 }, { "epoch": 0.31115772650084905, "grad_norm": 2.187206506729126, "learning_rate": 9.414566071214204e-05, "loss": 0.862, "step": 6230 }, { "epoch": 0.3116571771051843, "grad_norm": 3.2246241569519043, "learning_rate": 9.412722874452758e-05, "loss": 0.9774, "step": 6240 }, { "epoch": 0.3121566277095195, "grad_norm": 1.6527377367019653, "learning_rate": 9.410876961680644e-05, "loss": 0.783, "step": 6250 }, { "epoch": 0.3126560783138548, "grad_norm": 4.022501468658447, "learning_rate": 9.409028334034011e-05, "loss": 0.8063, "step": 6260 }, { "epoch": 0.31315552891819, "grad_norm": 1.2540746927261353, "learning_rate": 9.407176992650681e-05, "loss": 0.8424, "step": 6270 }, { "epoch": 0.31365497952252525, "grad_norm": 3.3473026752471924, "learning_rate": 9.405322938670146e-05, "loss": 0.8454, "step": 6280 }, { "epoch": 0.31415443012686045, "grad_norm": 4.546389579772949, "learning_rate": 9.403466173233569e-05, "loss": 1.1217, "step": 6290 }, { "epoch": 0.3146538807311957, "grad_norm": 1.1712909936904907, "learning_rate": 9.40160669748378e-05, "loss": 0.9331, "step": 6300 }, { "epoch": 0.3151533313355309, "grad_norm": 2.0095112323760986, "learning_rate": 9.399744512565276e-05, "loss": 0.6213, "step": 6310 }, { "epoch": 0.3156527819398661, "grad_norm": 2.3807756900787354, "learning_rate": 9.397879619624226e-05, "loss": 0.8187, "step": 6320 }, { "epoch": 0.3161522325442014, "grad_norm": 10.111104965209961, "learning_rate": 9.39601201980846e-05, "loss": 1.0629, "step": 6330 }, { "epoch": 0.3166516831485366, "grad_norm": 1.3615238666534424, "learning_rate": 9.394141714267481e-05, "loss": 0.7424, "step": 6340 }, { "epoch": 0.31715113375287185, "grad_norm": 2.2437679767608643, "learning_rate": 9.392268704152449e-05, "loss": 1.0506, "step": 6350 }, { "epoch": 0.31765058435720706, "grad_norm": 1.7457895278930664, "learning_rate": 9.390392990616199e-05, "loss": 0.9029, "step": 6360 }, { "epoch": 0.3181500349615423, "grad_norm": 2.4126617908477783, "learning_rate": 9.388514574813216e-05, "loss": 1.2185, "step": 6370 }, { "epoch": 0.3186494855658775, "grad_norm": 2.142484188079834, "learning_rate": 9.386633457899665e-05, "loss": 1.0851, "step": 6380 }, { "epoch": 0.3191489361702128, "grad_norm": 6.224302768707275, "learning_rate": 9.384749641033359e-05, "loss": 0.8494, "step": 6390 }, { "epoch": 0.319648386774548, "grad_norm": 1.4591060876846313, "learning_rate": 9.38286312537378e-05, "loss": 0.9518, "step": 6400 }, { "epoch": 0.32014783737888325, "grad_norm": 4.708948135375977, "learning_rate": 9.38097391208207e-05, "loss": 0.9182, "step": 6410 }, { "epoch": 0.32064728798321845, "grad_norm": 1.6708866357803345, "learning_rate": 9.379082002321031e-05, "loss": 0.7359, "step": 6420 }, { "epoch": 0.3211467385875537, "grad_norm": 2.336205005645752, "learning_rate": 9.377187397255125e-05, "loss": 0.833, "step": 6430 }, { "epoch": 0.3216461891918889, "grad_norm": 1.5494914054870605, "learning_rate": 9.375290098050473e-05, "loss": 1.0873, "step": 6440 }, { "epoch": 0.3221456397962241, "grad_norm": 1.990413784980774, "learning_rate": 9.373390105874853e-05, "loss": 0.7674, "step": 6450 }, { "epoch": 0.3226450904005594, "grad_norm": 2.463172197341919, "learning_rate": 9.371487421897703e-05, "loss": 1.0482, "step": 6460 }, { "epoch": 0.3231445410048946, "grad_norm": 2.06982159614563, "learning_rate": 9.369582047290114e-05, "loss": 0.7685, "step": 6470 }, { "epoch": 0.32364399160922985, "grad_norm": 3.1632564067840576, "learning_rate": 9.367673983224838e-05, "loss": 0.9676, "step": 6480 }, { "epoch": 0.32414344221356506, "grad_norm": 1.9890689849853516, "learning_rate": 9.365763230876276e-05, "loss": 0.8591, "step": 6490 }, { "epoch": 0.3246428928179003, "grad_norm": 3.06638240814209, "learning_rate": 9.363849791420492e-05, "loss": 1.0175, "step": 6500 }, { "epoch": 0.3251423434222355, "grad_norm": 4.139556407928467, "learning_rate": 9.361933666035197e-05, "loss": 0.9419, "step": 6510 }, { "epoch": 0.3256417940265708, "grad_norm": 2.3454649448394775, "learning_rate": 9.360014855899755e-05, "loss": 0.8663, "step": 6520 }, { "epoch": 0.326141244630906, "grad_norm": 2.2644970417022705, "learning_rate": 9.358093362195188e-05, "loss": 0.7967, "step": 6530 }, { "epoch": 0.32664069523524125, "grad_norm": 1.9934505224227905, "learning_rate": 9.356169186104165e-05, "loss": 0.8378, "step": 6540 }, { "epoch": 0.32714014583957646, "grad_norm": 1.7212903499603271, "learning_rate": 9.35424232881101e-05, "loss": 0.7944, "step": 6550 }, { "epoch": 0.3276395964439117, "grad_norm": 1.9699835777282715, "learning_rate": 9.35231279150169e-05, "loss": 0.9057, "step": 6560 }, { "epoch": 0.3281390470482469, "grad_norm": 1.6256062984466553, "learning_rate": 9.35038057536383e-05, "loss": 0.8083, "step": 6570 }, { "epoch": 0.3286384976525822, "grad_norm": 1.4498624801635742, "learning_rate": 9.348445681586702e-05, "loss": 0.8005, "step": 6580 }, { "epoch": 0.3291379482569174, "grad_norm": 2.1569433212280273, "learning_rate": 9.346508111361218e-05, "loss": 0.8273, "step": 6590 }, { "epoch": 0.3296373988612526, "grad_norm": 4.2927703857421875, "learning_rate": 9.34456786587995e-05, "loss": 0.9587, "step": 6600 }, { "epoch": 0.33013684946558786, "grad_norm": 2.9086852073669434, "learning_rate": 9.342624946337106e-05, "loss": 1.002, "step": 6610 }, { "epoch": 0.33063630006992306, "grad_norm": 3.242313861846924, "learning_rate": 9.340679353928548e-05, "loss": 0.8861, "step": 6620 }, { "epoch": 0.3311357506742583, "grad_norm": 1.050492525100708, "learning_rate": 9.338731089851774e-05, "loss": 0.6672, "step": 6630 }, { "epoch": 0.3316352012785935, "grad_norm": 2.7671895027160645, "learning_rate": 9.336780155305935e-05, "loss": 0.8712, "step": 6640 }, { "epoch": 0.3321346518829288, "grad_norm": 3.723975896835327, "learning_rate": 9.334826551491821e-05, "loss": 0.8757, "step": 6650 }, { "epoch": 0.332634102487264, "grad_norm": 1.2424172163009644, "learning_rate": 9.332870279611868e-05, "loss": 1.0399, "step": 6660 }, { "epoch": 0.33313355309159925, "grad_norm": 1.625571370124817, "learning_rate": 9.33091134087015e-05, "loss": 0.9807, "step": 6670 }, { "epoch": 0.33363300369593446, "grad_norm": 1.5173087120056152, "learning_rate": 9.328949736472385e-05, "loss": 0.9511, "step": 6680 }, { "epoch": 0.3341324543002697, "grad_norm": 1.8095574378967285, "learning_rate": 9.326985467625932e-05, "loss": 0.8397, "step": 6690 }, { "epoch": 0.3346319049046049, "grad_norm": 0.9123914241790771, "learning_rate": 9.325018535539793e-05, "loss": 0.9547, "step": 6700 }, { "epoch": 0.3351313555089402, "grad_norm": 2.3337626457214355, "learning_rate": 9.3230489414246e-05, "loss": 0.7205, "step": 6710 }, { "epoch": 0.3356308061132754, "grad_norm": 1.0187079906463623, "learning_rate": 9.321076686492633e-05, "loss": 0.6824, "step": 6720 }, { "epoch": 0.33613025671761065, "grad_norm": 2.9060966968536377, "learning_rate": 9.319101771957804e-05, "loss": 0.9898, "step": 6730 }, { "epoch": 0.33662970732194586, "grad_norm": 1.6975250244140625, "learning_rate": 9.317124199035663e-05, "loss": 0.7268, "step": 6740 }, { "epoch": 0.33712915792628106, "grad_norm": 0.9867602586746216, "learning_rate": 9.315143968943401e-05, "loss": 0.6433, "step": 6750 }, { "epoch": 0.3376286085306163, "grad_norm": 4.174263954162598, "learning_rate": 9.313161082899834e-05, "loss": 0.7511, "step": 6760 }, { "epoch": 0.33812805913495153, "grad_norm": 3.0919437408447266, "learning_rate": 9.311175542125427e-05, "loss": 0.8776, "step": 6770 }, { "epoch": 0.3386275097392868, "grad_norm": 1.691468596458435, "learning_rate": 9.309187347842266e-05, "loss": 0.7749, "step": 6780 }, { "epoch": 0.339126960343622, "grad_norm": 1.6738675832748413, "learning_rate": 9.307196501274077e-05, "loss": 0.9477, "step": 6790 }, { "epoch": 0.33962641094795726, "grad_norm": 1.3428436517715454, "learning_rate": 9.305203003646217e-05, "loss": 0.9914, "step": 6800 }, { "epoch": 0.34012586155229246, "grad_norm": 2.856218099594116, "learning_rate": 9.303206856185674e-05, "loss": 0.7905, "step": 6810 }, { "epoch": 0.3406253121566277, "grad_norm": 5.577542304992676, "learning_rate": 9.301208060121067e-05, "loss": 0.8945, "step": 6820 }, { "epoch": 0.34112476276096293, "grad_norm": 2.0562527179718018, "learning_rate": 9.299206616682647e-05, "loss": 1.0614, "step": 6830 }, { "epoch": 0.3416242133652982, "grad_norm": 1.6865025758743286, "learning_rate": 9.297202527102294e-05, "loss": 0.8211, "step": 6840 }, { "epoch": 0.3421236639696334, "grad_norm": 1.926328420639038, "learning_rate": 9.295195792613514e-05, "loss": 0.8534, "step": 6850 }, { "epoch": 0.34262311457396866, "grad_norm": 1.5476715564727783, "learning_rate": 9.293186414451444e-05, "loss": 0.774, "step": 6860 }, { "epoch": 0.34312256517830386, "grad_norm": 1.5437580347061157, "learning_rate": 9.291174393852849e-05, "loss": 0.8928, "step": 6870 }, { "epoch": 0.3436220157826391, "grad_norm": 1.0636396408081055, "learning_rate": 9.289159732056114e-05, "loss": 0.8541, "step": 6880 }, { "epoch": 0.34412146638697433, "grad_norm": 1.2523255348205566, "learning_rate": 9.287142430301256e-05, "loss": 1.0726, "step": 6890 }, { "epoch": 0.34462091699130953, "grad_norm": 2.373462677001953, "learning_rate": 9.285122489829918e-05, "loss": 0.6725, "step": 6900 }, { "epoch": 0.3451203675956448, "grad_norm": 3.2414791584014893, "learning_rate": 9.28309991188536e-05, "loss": 0.8107, "step": 6910 }, { "epoch": 0.34561981819998, "grad_norm": 0.4396897554397583, "learning_rate": 9.281074697712471e-05, "loss": 0.7285, "step": 6920 }, { "epoch": 0.34611926880431526, "grad_norm": 1.6284270286560059, "learning_rate": 9.27904684855776e-05, "loss": 0.8076, "step": 6930 }, { "epoch": 0.34661871940865047, "grad_norm": 1.275205373764038, "learning_rate": 9.277016365669359e-05, "loss": 0.7222, "step": 6940 }, { "epoch": 0.3471181700129857, "grad_norm": 2.0777640342712402, "learning_rate": 9.274983250297025e-05, "loss": 1.0457, "step": 6950 }, { "epoch": 0.34761762061732093, "grad_norm": 2.362515926361084, "learning_rate": 9.272947503692126e-05, "loss": 1.0001, "step": 6960 }, { "epoch": 0.3481170712216562, "grad_norm": 2.075861930847168, "learning_rate": 9.270909127107655e-05, "loss": 0.8281, "step": 6970 }, { "epoch": 0.3486165218259914, "grad_norm": 7.117288112640381, "learning_rate": 9.268868121798227e-05, "loss": 1.1095, "step": 6980 }, { "epoch": 0.34911597243032666, "grad_norm": 2.2000198364257812, "learning_rate": 9.266824489020069e-05, "loss": 0.7435, "step": 6990 }, { "epoch": 0.34961542303466187, "grad_norm": 4.886950969696045, "learning_rate": 9.264778230031027e-05, "loss": 0.783, "step": 7000 }, { "epoch": 0.3501148736389971, "grad_norm": 2.541064977645874, "learning_rate": 9.262729346090566e-05, "loss": 0.8305, "step": 7010 }, { "epoch": 0.35061432424333233, "grad_norm": 2.1147301197052, "learning_rate": 9.260677838459764e-05, "loss": 1.0566, "step": 7020 }, { "epoch": 0.3511137748476676, "grad_norm": 6.015203952789307, "learning_rate": 9.258623708401313e-05, "loss": 1.0223, "step": 7030 }, { "epoch": 0.3516132254520028, "grad_norm": 3.577023506164551, "learning_rate": 9.25656695717952e-05, "loss": 0.8236, "step": 7040 }, { "epoch": 0.352112676056338, "grad_norm": 1.7369171380996704, "learning_rate": 9.254507586060311e-05, "loss": 0.8042, "step": 7050 }, { "epoch": 0.35261212666067326, "grad_norm": 2.276552677154541, "learning_rate": 9.252445596311214e-05, "loss": 0.8826, "step": 7060 }, { "epoch": 0.35311157726500847, "grad_norm": 3.7419540882110596, "learning_rate": 9.250380989201378e-05, "loss": 0.7798, "step": 7070 }, { "epoch": 0.35361102786934373, "grad_norm": 1.865341067314148, "learning_rate": 9.248313766001558e-05, "loss": 0.8, "step": 7080 }, { "epoch": 0.35411047847367894, "grad_norm": 2.2110178470611572, "learning_rate": 9.246243927984118e-05, "loss": 0.7609, "step": 7090 }, { "epoch": 0.3546099290780142, "grad_norm": 1.0193909406661987, "learning_rate": 9.244171476423037e-05, "loss": 1.0596, "step": 7100 }, { "epoch": 0.3551093796823494, "grad_norm": 1.9720994234085083, "learning_rate": 9.242096412593897e-05, "loss": 0.936, "step": 7110 }, { "epoch": 0.35560883028668466, "grad_norm": 3.3525550365448, "learning_rate": 9.240018737773892e-05, "loss": 1.0104, "step": 7120 }, { "epoch": 0.35610828089101987, "grad_norm": 2.81687331199646, "learning_rate": 9.237938453241821e-05, "loss": 1.1244, "step": 7130 }, { "epoch": 0.35660773149535513, "grad_norm": 2.2435097694396973, "learning_rate": 9.23585556027809e-05, "loss": 0.7526, "step": 7140 }, { "epoch": 0.35710718209969033, "grad_norm": 2.639529228210449, "learning_rate": 9.233770060164708e-05, "loss": 0.9304, "step": 7150 }, { "epoch": 0.3576066327040256, "grad_norm": 2.164951801300049, "learning_rate": 9.231681954185293e-05, "loss": 0.8313, "step": 7160 }, { "epoch": 0.3581060833083608, "grad_norm": 2.807096481323242, "learning_rate": 9.229591243625064e-05, "loss": 1.0993, "step": 7170 }, { "epoch": 0.35860553391269606, "grad_norm": 1.1400171518325806, "learning_rate": 9.227497929770843e-05, "loss": 0.8521, "step": 7180 }, { "epoch": 0.35910498451703127, "grad_norm": 3.1278467178344727, "learning_rate": 9.225402013911057e-05, "loss": 0.9987, "step": 7190 }, { "epoch": 0.3596044351213665, "grad_norm": 1.8795716762542725, "learning_rate": 9.22330349733573e-05, "loss": 0.9404, "step": 7200 }, { "epoch": 0.36010388572570173, "grad_norm": 2.5907552242279053, "learning_rate": 9.221202381336489e-05, "loss": 0.8723, "step": 7210 }, { "epoch": 0.36060333633003694, "grad_norm": 2.619285821914673, "learning_rate": 9.219098667206565e-05, "loss": 0.8166, "step": 7220 }, { "epoch": 0.3611027869343722, "grad_norm": 1.0197978019714355, "learning_rate": 9.216992356240782e-05, "loss": 0.8487, "step": 7230 }, { "epoch": 0.3616022375387074, "grad_norm": 1.6571626663208008, "learning_rate": 9.214883449735563e-05, "loss": 1.0165, "step": 7240 }, { "epoch": 0.36210168814304267, "grad_norm": 3.8328757286071777, "learning_rate": 9.212771948988935e-05, "loss": 1.1171, "step": 7250 }, { "epoch": 0.36260113874737787, "grad_norm": 2.775155782699585, "learning_rate": 9.210657855300511e-05, "loss": 0.8796, "step": 7260 }, { "epoch": 0.36310058935171313, "grad_norm": 1.5948903560638428, "learning_rate": 9.208541169971511e-05, "loss": 0.9731, "step": 7270 }, { "epoch": 0.36360003995604834, "grad_norm": 2.231907606124878, "learning_rate": 9.206421894304743e-05, "loss": 0.9123, "step": 7280 }, { "epoch": 0.3640994905603836, "grad_norm": 1.6792292594909668, "learning_rate": 9.20430002960461e-05, "loss": 0.8166, "step": 7290 }, { "epoch": 0.3645989411647188, "grad_norm": 2.6319034099578857, "learning_rate": 9.202175577177114e-05, "loss": 1.0514, "step": 7300 }, { "epoch": 0.36509839176905406, "grad_norm": 1.0980263948440552, "learning_rate": 9.200048538329844e-05, "loss": 0.8034, "step": 7310 }, { "epoch": 0.36559784237338927, "grad_norm": 3.211888313293457, "learning_rate": 9.197918914371979e-05, "loss": 1.0184, "step": 7320 }, { "epoch": 0.3660972929777245, "grad_norm": 2.2598297595977783, "learning_rate": 9.195786706614298e-05, "loss": 0.9803, "step": 7330 }, { "epoch": 0.36659674358205974, "grad_norm": 1.6521825790405273, "learning_rate": 9.193651916369162e-05, "loss": 0.8432, "step": 7340 }, { "epoch": 0.36709619418639494, "grad_norm": 2.371178388595581, "learning_rate": 9.191514544950525e-05, "loss": 0.8751, "step": 7350 }, { "epoch": 0.3675956447907302, "grad_norm": 2.6362617015838623, "learning_rate": 9.189374593673932e-05, "loss": 0.8984, "step": 7360 }, { "epoch": 0.3680950953950654, "grad_norm": 1.8687587976455688, "learning_rate": 9.187232063856509e-05, "loss": 0.8932, "step": 7370 }, { "epoch": 0.36859454599940067, "grad_norm": 2.2871909141540527, "learning_rate": 9.185086956816975e-05, "loss": 0.7942, "step": 7380 }, { "epoch": 0.3690939966037359, "grad_norm": 2.3869709968566895, "learning_rate": 9.182939273875634e-05, "loss": 0.8035, "step": 7390 }, { "epoch": 0.36959344720807114, "grad_norm": 1.6239734888076782, "learning_rate": 9.180789016354376e-05, "loss": 0.7439, "step": 7400 }, { "epoch": 0.37009289781240634, "grad_norm": 1.3308115005493164, "learning_rate": 9.178636185576672e-05, "loss": 0.7992, "step": 7410 }, { "epoch": 0.3705923484167416, "grad_norm": 0.9633936882019043, "learning_rate": 9.17648078286758e-05, "loss": 1.0191, "step": 7420 }, { "epoch": 0.3710917990210768, "grad_norm": 0.912649929523468, "learning_rate": 9.174322809553743e-05, "loss": 0.8333, "step": 7430 }, { "epoch": 0.37159124962541207, "grad_norm": 4.272607803344727, "learning_rate": 9.172162266963382e-05, "loss": 0.8305, "step": 7440 }, { "epoch": 0.3720907002297473, "grad_norm": 1.9677715301513672, "learning_rate": 9.169999156426301e-05, "loss": 0.9996, "step": 7450 }, { "epoch": 0.37259015083408253, "grad_norm": 1.1595687866210938, "learning_rate": 9.167833479273883e-05, "loss": 0.7525, "step": 7460 }, { "epoch": 0.37308960143841774, "grad_norm": 1.137204885482788, "learning_rate": 9.165665236839095e-05, "loss": 0.7123, "step": 7470 }, { "epoch": 0.37358905204275294, "grad_norm": 2.023561716079712, "learning_rate": 9.16349443045648e-05, "loss": 0.7421, "step": 7480 }, { "epoch": 0.3740885026470882, "grad_norm": 0.4606032371520996, "learning_rate": 9.161321061462157e-05, "loss": 0.7728, "step": 7490 }, { "epoch": 0.3745879532514234, "grad_norm": 8.039925575256348, "learning_rate": 9.159145131193827e-05, "loss": 1.0022, "step": 7500 }, { "epoch": 0.37508740385575867, "grad_norm": 2.0657260417938232, "learning_rate": 9.156966640990763e-05, "loss": 0.9218, "step": 7510 }, { "epoch": 0.3755868544600939, "grad_norm": 1.5123809576034546, "learning_rate": 9.15478559219382e-05, "loss": 0.6706, "step": 7520 }, { "epoch": 0.37608630506442914, "grad_norm": 1.5668985843658447, "learning_rate": 9.152601986145417e-05, "loss": 0.8428, "step": 7530 }, { "epoch": 0.37658575566876434, "grad_norm": 4.2369513511657715, "learning_rate": 9.15041582418956e-05, "loss": 0.8685, "step": 7540 }, { "epoch": 0.3770852062730996, "grad_norm": 1.7388386726379395, "learning_rate": 9.148227107671818e-05, "loss": 0.8348, "step": 7550 }, { "epoch": 0.3775846568774348, "grad_norm": 1.765231728553772, "learning_rate": 9.146035837939335e-05, "loss": 0.8973, "step": 7560 }, { "epoch": 0.37808410748177007, "grad_norm": 1.4724568128585815, "learning_rate": 9.143842016340831e-05, "loss": 0.9471, "step": 7570 }, { "epoch": 0.3785835580861053, "grad_norm": 1.9186527729034424, "learning_rate": 9.141645644226591e-05, "loss": 0.8671, "step": 7580 }, { "epoch": 0.37908300869044054, "grad_norm": 1.309566855430603, "learning_rate": 9.13944672294847e-05, "loss": 0.8727, "step": 7590 }, { "epoch": 0.37958245929477574, "grad_norm": 1.2128655910491943, "learning_rate": 9.137245253859898e-05, "loss": 0.7453, "step": 7600 }, { "epoch": 0.380081909899111, "grad_norm": 8.178140640258789, "learning_rate": 9.135041238315868e-05, "loss": 0.9213, "step": 7610 }, { "epoch": 0.3805813605034462, "grad_norm": 2.761096477508545, "learning_rate": 9.132834677672939e-05, "loss": 1.1112, "step": 7620 }, { "epoch": 0.3810808111077814, "grad_norm": 4.105166912078857, "learning_rate": 9.130625573289242e-05, "loss": 0.9173, "step": 7630 }, { "epoch": 0.3815802617121167, "grad_norm": 1.7994587421417236, "learning_rate": 9.128413926524468e-05, "loss": 0.9755, "step": 7640 }, { "epoch": 0.3820797123164519, "grad_norm": 1.1037505865097046, "learning_rate": 9.126199738739878e-05, "loss": 0.8283, "step": 7650 }, { "epoch": 0.38257916292078714, "grad_norm": 6.051690101623535, "learning_rate": 9.123983011298292e-05, "loss": 0.8041, "step": 7660 }, { "epoch": 0.38307861352512235, "grad_norm": 3.525859832763672, "learning_rate": 9.121763745564098e-05, "loss": 1.0139, "step": 7670 }, { "epoch": 0.3835780641294576, "grad_norm": 3.6487855911254883, "learning_rate": 9.119541942903241e-05, "loss": 0.9372, "step": 7680 }, { "epoch": 0.3840775147337928, "grad_norm": 1.3445885181427002, "learning_rate": 9.117317604683233e-05, "loss": 0.7081, "step": 7690 }, { "epoch": 0.3845769653381281, "grad_norm": 1.1839733123779297, "learning_rate": 9.115090732273141e-05, "loss": 0.737, "step": 7700 }, { "epoch": 0.3850764159424633, "grad_norm": 2.7112619876861572, "learning_rate": 9.112861327043598e-05, "loss": 0.9045, "step": 7710 }, { "epoch": 0.38557586654679854, "grad_norm": 1.1596410274505615, "learning_rate": 9.110629390366791e-05, "loss": 0.8373, "step": 7720 }, { "epoch": 0.38607531715113375, "grad_norm": 1.6945937871932983, "learning_rate": 9.108394923616468e-05, "loss": 0.9547, "step": 7730 }, { "epoch": 0.386574767755469, "grad_norm": 2.133749008178711, "learning_rate": 9.106157928167931e-05, "loss": 0.9767, "step": 7740 }, { "epoch": 0.3870742183598042, "grad_norm": 1.468874216079712, "learning_rate": 9.103918405398042e-05, "loss": 0.7833, "step": 7750 }, { "epoch": 0.3875736689641395, "grad_norm": 3.5914108753204346, "learning_rate": 9.101676356685216e-05, "loss": 0.8531, "step": 7760 }, { "epoch": 0.3880731195684747, "grad_norm": 3.0970280170440674, "learning_rate": 9.099431783409424e-05, "loss": 0.65, "step": 7770 }, { "epoch": 0.3885725701728099, "grad_norm": 2.831838846206665, "learning_rate": 9.097184686952192e-05, "loss": 0.7634, "step": 7780 }, { "epoch": 0.38907202077714514, "grad_norm": 1.6186414957046509, "learning_rate": 9.094935068696596e-05, "loss": 0.8975, "step": 7790 }, { "epoch": 0.38957147138148035, "grad_norm": 1.0518146753311157, "learning_rate": 9.092682930027264e-05, "loss": 1.136, "step": 7800 }, { "epoch": 0.3900709219858156, "grad_norm": 3.9278268814086914, "learning_rate": 9.09042827233038e-05, "loss": 0.9904, "step": 7810 }, { "epoch": 0.3905703725901508, "grad_norm": 5.630587577819824, "learning_rate": 9.088171096993675e-05, "loss": 1.1147, "step": 7820 }, { "epoch": 0.3910698231944861, "grad_norm": 4.258095741271973, "learning_rate": 9.085911405406428e-05, "loss": 1.0105, "step": 7830 }, { "epoch": 0.3915692737988213, "grad_norm": 3.0702295303344727, "learning_rate": 9.083649198959472e-05, "loss": 0.9473, "step": 7840 }, { "epoch": 0.39206872440315654, "grad_norm": 1.486130714416504, "learning_rate": 9.081384479045183e-05, "loss": 0.8446, "step": 7850 }, { "epoch": 0.39256817500749175, "grad_norm": 6.066441059112549, "learning_rate": 9.079117247057484e-05, "loss": 0.9462, "step": 7860 }, { "epoch": 0.393067625611827, "grad_norm": 4.194586753845215, "learning_rate": 9.07684750439185e-05, "loss": 0.9815, "step": 7870 }, { "epoch": 0.3935670762161622, "grad_norm": 3.13558292388916, "learning_rate": 9.074575252445293e-05, "loss": 0.9681, "step": 7880 }, { "epoch": 0.3940665268204975, "grad_norm": 1.231960415840149, "learning_rate": 9.072300492616376e-05, "loss": 1.0147, "step": 7890 }, { "epoch": 0.3945659774248327, "grad_norm": 0.8152504563331604, "learning_rate": 9.070023226305202e-05, "loss": 0.6973, "step": 7900 }, { "epoch": 0.39506542802916794, "grad_norm": 2.331622362136841, "learning_rate": 9.067743454913418e-05, "loss": 0.8363, "step": 7910 }, { "epoch": 0.39556487863350315, "grad_norm": 1.8101062774658203, "learning_rate": 9.065461179844215e-05, "loss": 0.744, "step": 7920 }, { "epoch": 0.39606432923783835, "grad_norm": 5.449390411376953, "learning_rate": 9.063176402502321e-05, "loss": 0.9046, "step": 7930 }, { "epoch": 0.3965637798421736, "grad_norm": 1.1542668342590332, "learning_rate": 9.060889124294006e-05, "loss": 1.0153, "step": 7940 }, { "epoch": 0.3970632304465088, "grad_norm": 1.930710792541504, "learning_rate": 9.05859934662708e-05, "loss": 0.774, "step": 7950 }, { "epoch": 0.3975626810508441, "grad_norm": 2.1788411140441895, "learning_rate": 9.056307070910888e-05, "loss": 0.9153, "step": 7960 }, { "epoch": 0.3980621316551793, "grad_norm": 0.6524488925933838, "learning_rate": 9.054012298556319e-05, "loss": 0.7557, "step": 7970 }, { "epoch": 0.39856158225951455, "grad_norm": 2.8747739791870117, "learning_rate": 9.051715030975793e-05, "loss": 0.7849, "step": 7980 }, { "epoch": 0.39906103286384975, "grad_norm": 2.320890426635742, "learning_rate": 9.049415269583268e-05, "loss": 0.9834, "step": 7990 }, { "epoch": 0.399560483468185, "grad_norm": 5.673957824707031, "learning_rate": 9.047113015794235e-05, "loss": 1.02, "step": 8000 }, { "epoch": 0.4000599340725202, "grad_norm": 2.38036847114563, "learning_rate": 9.044808271025722e-05, "loss": 0.9644, "step": 8010 }, { "epoch": 0.4005593846768555, "grad_norm": 2.981459617614746, "learning_rate": 9.042501036696289e-05, "loss": 1.054, "step": 8020 }, { "epoch": 0.4010588352811907, "grad_norm": 2.8263211250305176, "learning_rate": 9.04019131422603e-05, "loss": 0.8065, "step": 8030 }, { "epoch": 0.40155828588552595, "grad_norm": 2.295149326324463, "learning_rate": 9.037879105036564e-05, "loss": 0.8118, "step": 8040 }, { "epoch": 0.40205773648986115, "grad_norm": 2.1756789684295654, "learning_rate": 9.035564410551049e-05, "loss": 0.8478, "step": 8050 }, { "epoch": 0.4025571870941964, "grad_norm": 2.2337467670440674, "learning_rate": 9.033247232194166e-05, "loss": 0.8868, "step": 8060 }, { "epoch": 0.4030566376985316, "grad_norm": 1.9026663303375244, "learning_rate": 9.030927571392132e-05, "loss": 1.1555, "step": 8070 }, { "epoch": 0.4035560883028668, "grad_norm": 2.4279093742370605, "learning_rate": 9.028605429572683e-05, "loss": 0.7624, "step": 8080 }, { "epoch": 0.4040555389072021, "grad_norm": 1.2404896020889282, "learning_rate": 9.026280808165087e-05, "loss": 0.8664, "step": 8090 }, { "epoch": 0.4045549895115373, "grad_norm": 1.4976485967636108, "learning_rate": 9.02395370860014e-05, "loss": 0.8968, "step": 8100 }, { "epoch": 0.40505444011587255, "grad_norm": 1.6639426946640015, "learning_rate": 9.02162413231016e-05, "loss": 0.9324, "step": 8110 }, { "epoch": 0.40555389072020775, "grad_norm": 1.4774401187896729, "learning_rate": 9.019292080728992e-05, "loss": 0.9493, "step": 8120 }, { "epoch": 0.406053341324543, "grad_norm": 0.4843757748603821, "learning_rate": 9.016957555292e-05, "loss": 0.7782, "step": 8130 }, { "epoch": 0.4065527919288782, "grad_norm": 2.6284985542297363, "learning_rate": 9.014620557436077e-05, "loss": 0.835, "step": 8140 }, { "epoch": 0.4070522425332135, "grad_norm": 1.5704331398010254, "learning_rate": 9.012281088599632e-05, "loss": 0.9565, "step": 8150 }, { "epoch": 0.4075516931375487, "grad_norm": 5.700881004333496, "learning_rate": 9.009939150222599e-05, "loss": 1.0529, "step": 8160 }, { "epoch": 0.40805114374188395, "grad_norm": 2.501964807510376, "learning_rate": 9.007594743746429e-05, "loss": 0.8192, "step": 8170 }, { "epoch": 0.40855059434621915, "grad_norm": 2.092198371887207, "learning_rate": 9.005247870614095e-05, "loss": 0.7568, "step": 8180 }, { "epoch": 0.4090500449505544, "grad_norm": 1.0870630741119385, "learning_rate": 9.002898532270084e-05, "loss": 0.6473, "step": 8190 }, { "epoch": 0.4095494955548896, "grad_norm": 1.612242579460144, "learning_rate": 9.000546730160406e-05, "loss": 0.7603, "step": 8200 }, { "epoch": 0.4100489461592248, "grad_norm": 1.722089171409607, "learning_rate": 8.998192465732582e-05, "loss": 0.7233, "step": 8210 }, { "epoch": 0.4105483967635601, "grad_norm": 2.79156494140625, "learning_rate": 8.995835740435653e-05, "loss": 0.6629, "step": 8220 }, { "epoch": 0.4110478473678953, "grad_norm": 1.4629532098770142, "learning_rate": 8.99347655572017e-05, "loss": 0.9104, "step": 8230 }, { "epoch": 0.41154729797223055, "grad_norm": 2.188950538635254, "learning_rate": 8.991114913038202e-05, "loss": 0.9731, "step": 8240 }, { "epoch": 0.41204674857656576, "grad_norm": 2.220014810562134, "learning_rate": 8.98875081384333e-05, "loss": 0.8968, "step": 8250 }, { "epoch": 0.412546199180901, "grad_norm": 0.8119724988937378, "learning_rate": 8.986384259590645e-05, "loss": 0.7018, "step": 8260 }, { "epoch": 0.4130456497852362, "grad_norm": 7.188334941864014, "learning_rate": 8.98401525173675e-05, "loss": 0.9704, "step": 8270 }, { "epoch": 0.4135451003895715, "grad_norm": 0.9612888693809509, "learning_rate": 8.98164379173976e-05, "loss": 0.6979, "step": 8280 }, { "epoch": 0.4140445509939067, "grad_norm": 3.1759042739868164, "learning_rate": 8.979269881059295e-05, "loss": 0.8401, "step": 8290 }, { "epoch": 0.41454400159824195, "grad_norm": 1.566257119178772, "learning_rate": 8.976893521156491e-05, "loss": 0.7969, "step": 8300 }, { "epoch": 0.41504345220257716, "grad_norm": 1.340470552444458, "learning_rate": 8.974514713493983e-05, "loss": 0.7772, "step": 8310 }, { "epoch": 0.4155429028069124, "grad_norm": 2.2976415157318115, "learning_rate": 8.972133459535914e-05, "loss": 0.8865, "step": 8320 }, { "epoch": 0.4160423534112476, "grad_norm": 2.3305583000183105, "learning_rate": 8.969749760747938e-05, "loss": 0.8805, "step": 8330 }, { "epoch": 0.4165418040155829, "grad_norm": 1.950758457183838, "learning_rate": 8.967363618597213e-05, "loss": 0.7312, "step": 8340 }, { "epoch": 0.4170412546199181, "grad_norm": 1.5078762769699097, "learning_rate": 8.964975034552394e-05, "loss": 0.8563, "step": 8350 }, { "epoch": 0.4175407052242533, "grad_norm": 1.504718542098999, "learning_rate": 8.962584010083645e-05, "loss": 0.8045, "step": 8360 }, { "epoch": 0.41804015582858856, "grad_norm": 1.307501196861267, "learning_rate": 8.960190546662632e-05, "loss": 0.6778, "step": 8370 }, { "epoch": 0.41853960643292376, "grad_norm": 4.109325408935547, "learning_rate": 8.957794645762519e-05, "loss": 1.1622, "step": 8380 }, { "epoch": 0.419039057037259, "grad_norm": 4.46181583404541, "learning_rate": 8.955396308857973e-05, "loss": 0.8266, "step": 8390 }, { "epoch": 0.4195385076415942, "grad_norm": 2.433459520339966, "learning_rate": 8.952995537425157e-05, "loss": 0.8096, "step": 8400 }, { "epoch": 0.4200379582459295, "grad_norm": 1.5981595516204834, "learning_rate": 8.950592332941739e-05, "loss": 0.8317, "step": 8410 }, { "epoch": 0.4205374088502647, "grad_norm": 1.0722473859786987, "learning_rate": 8.948186696886877e-05, "loss": 0.7952, "step": 8420 }, { "epoch": 0.42103685945459995, "grad_norm": 1.6660276651382446, "learning_rate": 8.945778630741231e-05, "loss": 0.8521, "step": 8430 }, { "epoch": 0.42153631005893516, "grad_norm": 3.410531759262085, "learning_rate": 8.943368135986954e-05, "loss": 0.8354, "step": 8440 }, { "epoch": 0.4220357606632704, "grad_norm": 2.2601499557495117, "learning_rate": 8.940955214107693e-05, "loss": 0.962, "step": 8450 }, { "epoch": 0.4225352112676056, "grad_norm": 1.2968422174453735, "learning_rate": 8.938539866588592e-05, "loss": 0.7417, "step": 8460 }, { "epoch": 0.4230346618719409, "grad_norm": 1.7342209815979004, "learning_rate": 8.936122094916286e-05, "loss": 0.6829, "step": 8470 }, { "epoch": 0.4235341124762761, "grad_norm": 1.3093066215515137, "learning_rate": 8.933701900578901e-05, "loss": 0.7837, "step": 8480 }, { "epoch": 0.42403356308061135, "grad_norm": 2.27400279045105, "learning_rate": 8.931279285066059e-05, "loss": 1.1384, "step": 8490 }, { "epoch": 0.42453301368494656, "grad_norm": 1.4279674291610718, "learning_rate": 8.928854249868865e-05, "loss": 1.0138, "step": 8500 }, { "epoch": 0.42503246428928176, "grad_norm": 1.471238374710083, "learning_rate": 8.926426796479919e-05, "loss": 1.0406, "step": 8510 }, { "epoch": 0.425531914893617, "grad_norm": 1.3381599187850952, "learning_rate": 8.923996926393305e-05, "loss": 0.9937, "step": 8520 }, { "epoch": 0.42603136549795223, "grad_norm": 1.3054723739624023, "learning_rate": 8.9215646411046e-05, "loss": 0.8807, "step": 8530 }, { "epoch": 0.4265308161022875, "grad_norm": 1.0258605480194092, "learning_rate": 8.919129942110864e-05, "loss": 1.1096, "step": 8540 }, { "epoch": 0.4270302667066227, "grad_norm": 2.3745763301849365, "learning_rate": 8.916692830910642e-05, "loss": 0.8066, "step": 8550 }, { "epoch": 0.42752971731095796, "grad_norm": 0.9149400591850281, "learning_rate": 8.914253309003964e-05, "loss": 0.7646, "step": 8560 }, { "epoch": 0.42802916791529316, "grad_norm": 0.8165902495384216, "learning_rate": 8.911811377892345e-05, "loss": 0.8465, "step": 8570 }, { "epoch": 0.4285286185196284, "grad_norm": 3.450925827026367, "learning_rate": 8.909367039078784e-05, "loss": 0.9557, "step": 8580 }, { "epoch": 0.42902806912396363, "grad_norm": 1.085161805152893, "learning_rate": 8.906920294067759e-05, "loss": 0.89, "step": 8590 }, { "epoch": 0.4295275197282989, "grad_norm": 1.9233146905899048, "learning_rate": 8.904471144365232e-05, "loss": 0.7721, "step": 8600 }, { "epoch": 0.4300269703326341, "grad_norm": 0.30743876099586487, "learning_rate": 8.90201959147864e-05, "loss": 0.9018, "step": 8610 }, { "epoch": 0.43052642093696936, "grad_norm": 2.8039252758026123, "learning_rate": 8.899565636916904e-05, "loss": 0.9276, "step": 8620 }, { "epoch": 0.43102587154130456, "grad_norm": 1.278592824935913, "learning_rate": 8.897109282190423e-05, "loss": 0.846, "step": 8630 }, { "epoch": 0.4315253221456398, "grad_norm": 1.5045511722564697, "learning_rate": 8.894650528811072e-05, "loss": 0.8324, "step": 8640 }, { "epoch": 0.43202477274997503, "grad_norm": 1.3858511447906494, "learning_rate": 8.892189378292202e-05, "loss": 0.7414, "step": 8650 }, { "epoch": 0.43252422335431023, "grad_norm": 4.206646919250488, "learning_rate": 8.88972583214864e-05, "loss": 0.766, "step": 8660 }, { "epoch": 0.4330236739586455, "grad_norm": 1.2903480529785156, "learning_rate": 8.887259891896684e-05, "loss": 0.8342, "step": 8670 }, { "epoch": 0.4335231245629807, "grad_norm": 1.558398962020874, "learning_rate": 8.884791559054113e-05, "loss": 0.7591, "step": 8680 }, { "epoch": 0.43402257516731596, "grad_norm": 0.9594738483428955, "learning_rate": 8.882320835140174e-05, "loss": 0.8384, "step": 8690 }, { "epoch": 0.43452202577165117, "grad_norm": 2.354264259338379, "learning_rate": 8.879847721675586e-05, "loss": 1.1023, "step": 8700 }, { "epoch": 0.4350214763759864, "grad_norm": 1.0832626819610596, "learning_rate": 8.877372220182539e-05, "loss": 0.9754, "step": 8710 }, { "epoch": 0.43552092698032163, "grad_norm": 1.4442058801651, "learning_rate": 8.87489433218469e-05, "loss": 0.6892, "step": 8720 }, { "epoch": 0.4360203775846569, "grad_norm": 2.1258246898651123, "learning_rate": 8.872414059207172e-05, "loss": 0.932, "step": 8730 }, { "epoch": 0.4365198281889921, "grad_norm": 1.835067868232727, "learning_rate": 8.869931402776579e-05, "loss": 1.2159, "step": 8740 }, { "epoch": 0.43701927879332736, "grad_norm": 3.0866270065307617, "learning_rate": 8.867446364420975e-05, "loss": 0.7718, "step": 8750 }, { "epoch": 0.43751872939766256, "grad_norm": 3.470754623413086, "learning_rate": 8.86495894566989e-05, "loss": 0.8308, "step": 8760 }, { "epoch": 0.4380181800019978, "grad_norm": 2.440117835998535, "learning_rate": 8.862469148054319e-05, "loss": 0.9607, "step": 8770 }, { "epoch": 0.43851763060633303, "grad_norm": 1.012770175933838, "learning_rate": 8.859976973106721e-05, "loss": 0.6873, "step": 8780 }, { "epoch": 0.4390170812106683, "grad_norm": 1.4482653141021729, "learning_rate": 8.857482422361018e-05, "loss": 0.7227, "step": 8790 }, { "epoch": 0.4395165318150035, "grad_norm": 3.04582142829895, "learning_rate": 8.854985497352595e-05, "loss": 0.8951, "step": 8800 }, { "epoch": 0.4400159824193387, "grad_norm": 1.4655203819274902, "learning_rate": 8.852486199618298e-05, "loss": 1.1724, "step": 8810 }, { "epoch": 0.44051543302367396, "grad_norm": 3.0861003398895264, "learning_rate": 8.84998453069643e-05, "loss": 0.8812, "step": 8820 }, { "epoch": 0.44101488362800917, "grad_norm": 1.3584084510803223, "learning_rate": 8.847480492126761e-05, "loss": 0.678, "step": 8830 }, { "epoch": 0.44151433423234443, "grad_norm": 3.5568907260894775, "learning_rate": 8.844974085450515e-05, "loss": 0.9797, "step": 8840 }, { "epoch": 0.44201378483667964, "grad_norm": 4.693130016326904, "learning_rate": 8.84246531221037e-05, "loss": 0.9045, "step": 8850 }, { "epoch": 0.4425132354410149, "grad_norm": 2.3070759773254395, "learning_rate": 8.839954173950469e-05, "loss": 0.714, "step": 8860 }, { "epoch": 0.4430126860453501, "grad_norm": 1.7823224067687988, "learning_rate": 8.837440672216404e-05, "loss": 0.6414, "step": 8870 }, { "epoch": 0.44351213664968536, "grad_norm": 3.037031888961792, "learning_rate": 8.834924808555223e-05, "loss": 0.8324, "step": 8880 }, { "epoch": 0.44401158725402057, "grad_norm": 3.7925896644592285, "learning_rate": 8.83240658451543e-05, "loss": 0.7691, "step": 8890 }, { "epoch": 0.44451103785835583, "grad_norm": 1.6429016590118408, "learning_rate": 8.829886001646978e-05, "loss": 0.9756, "step": 8900 }, { "epoch": 0.44501048846269103, "grad_norm": 1.7131584882736206, "learning_rate": 8.827363061501276e-05, "loss": 0.9618, "step": 8910 }, { "epoch": 0.4455099390670263, "grad_norm": 2.0057897567749023, "learning_rate": 8.824837765631185e-05, "loss": 0.8267, "step": 8920 }, { "epoch": 0.4460093896713615, "grad_norm": 2.802243947982788, "learning_rate": 8.822310115591008e-05, "loss": 0.9813, "step": 8930 }, { "epoch": 0.44650884027569676, "grad_norm": 3.030748128890991, "learning_rate": 8.819780112936502e-05, "loss": 0.9917, "step": 8940 }, { "epoch": 0.44700829088003197, "grad_norm": 1.8534924983978271, "learning_rate": 8.817247759224876e-05, "loss": 0.7916, "step": 8950 }, { "epoch": 0.44750774148436717, "grad_norm": 2.390037775039673, "learning_rate": 8.814713056014781e-05, "loss": 0.8005, "step": 8960 }, { "epoch": 0.44800719208870243, "grad_norm": 2.032355308532715, "learning_rate": 8.812176004866313e-05, "loss": 1.0164, "step": 8970 }, { "epoch": 0.44850664269303764, "grad_norm": 2.673767566680908, "learning_rate": 8.809636607341017e-05, "loss": 0.9618, "step": 8980 }, { "epoch": 0.4490060932973729, "grad_norm": 1.2164344787597656, "learning_rate": 8.807094865001878e-05, "loss": 0.9268, "step": 8990 }, { "epoch": 0.4495055439017081, "grad_norm": 1.4314156770706177, "learning_rate": 8.804550779413331e-05, "loss": 0.9236, "step": 9000 }, { "epoch": 0.45000499450604337, "grad_norm": 1.4462699890136719, "learning_rate": 8.802004352141247e-05, "loss": 0.812, "step": 9010 }, { "epoch": 0.45050444511037857, "grad_norm": 3.182401657104492, "learning_rate": 8.799455584752938e-05, "loss": 0.9307, "step": 9020 }, { "epoch": 0.45100389571471383, "grad_norm": 4.526309013366699, "learning_rate": 8.796904478817162e-05, "loss": 0.9278, "step": 9030 }, { "epoch": 0.45150334631904904, "grad_norm": 1.9855213165283203, "learning_rate": 8.794351035904112e-05, "loss": 0.6975, "step": 9040 }, { "epoch": 0.4520027969233843, "grad_norm": 2.0959839820861816, "learning_rate": 8.791795257585421e-05, "loss": 0.8845, "step": 9050 }, { "epoch": 0.4525022475277195, "grad_norm": 1.1127110719680786, "learning_rate": 8.789237145434155e-05, "loss": 0.8488, "step": 9060 }, { "epoch": 0.45300169813205476, "grad_norm": 1.0070494413375854, "learning_rate": 8.786676701024826e-05, "loss": 0.7984, "step": 9070 }, { "epoch": 0.45350114873638997, "grad_norm": 2.1222105026245117, "learning_rate": 8.784113925933371e-05, "loss": 0.9088, "step": 9080 }, { "epoch": 0.4540005993407252, "grad_norm": 1.2153772115707397, "learning_rate": 8.78154882173717e-05, "loss": 0.7524, "step": 9090 }, { "epoch": 0.45450004994506044, "grad_norm": 2.5599777698516846, "learning_rate": 8.77898139001503e-05, "loss": 0.8623, "step": 9100 }, { "epoch": 0.45499950054939564, "grad_norm": 2.3141989707946777, "learning_rate": 8.776411632347194e-05, "loss": 0.8572, "step": 9110 }, { "epoch": 0.4554989511537309, "grad_norm": 4.364091873168945, "learning_rate": 8.773839550315337e-05, "loss": 0.9145, "step": 9120 }, { "epoch": 0.4559984017580661, "grad_norm": 1.888785719871521, "learning_rate": 8.771265145502566e-05, "loss": 0.7613, "step": 9130 }, { "epoch": 0.45649785236240137, "grad_norm": 1.7906880378723145, "learning_rate": 8.76868841949341e-05, "loss": 0.6708, "step": 9140 }, { "epoch": 0.4569973029667366, "grad_norm": 1.3806205987930298, "learning_rate": 8.766109373873834e-05, "loss": 0.6981, "step": 9150 }, { "epoch": 0.45749675357107183, "grad_norm": 1.7840837240219116, "learning_rate": 8.76352801023123e-05, "loss": 1.0249, "step": 9160 }, { "epoch": 0.45799620417540704, "grad_norm": 1.4487371444702148, "learning_rate": 8.760944330154418e-05, "loss": 0.8626, "step": 9170 }, { "epoch": 0.4584956547797423, "grad_norm": 3.1842362880706787, "learning_rate": 8.758358335233634e-05, "loss": 0.8693, "step": 9180 }, { "epoch": 0.4589951053840775, "grad_norm": 1.9841814041137695, "learning_rate": 8.755770027060552e-05, "loss": 0.7999, "step": 9190 }, { "epoch": 0.45949455598841277, "grad_norm": 1.462900161743164, "learning_rate": 8.75317940722826e-05, "loss": 0.8821, "step": 9200 }, { "epoch": 0.459994006592748, "grad_norm": 1.1118979454040527, "learning_rate": 8.750586477331277e-05, "loss": 0.9123, "step": 9210 }, { "epoch": 0.46049345719708323, "grad_norm": 3.976433753967285, "learning_rate": 8.747991238965536e-05, "loss": 0.807, "step": 9220 }, { "epoch": 0.46099290780141844, "grad_norm": 3.9534356594085693, "learning_rate": 8.745393693728395e-05, "loss": 1.037, "step": 9230 }, { "epoch": 0.46149235840575364, "grad_norm": 2.3069164752960205, "learning_rate": 8.742793843218633e-05, "loss": 0.8247, "step": 9240 }, { "epoch": 0.4619918090100889, "grad_norm": 2.868238925933838, "learning_rate": 8.740191689036443e-05, "loss": 0.6486, "step": 9250 }, { "epoch": 0.4624912596144241, "grad_norm": 2.9097187519073486, "learning_rate": 8.73758723278344e-05, "loss": 0.9045, "step": 9260 }, { "epoch": 0.46299071021875937, "grad_norm": 1.270633578300476, "learning_rate": 8.734980476062657e-05, "loss": 0.6936, "step": 9270 }, { "epoch": 0.4634901608230946, "grad_norm": 1.2080847024917603, "learning_rate": 8.732371420478538e-05, "loss": 0.8571, "step": 9280 }, { "epoch": 0.46398961142742984, "grad_norm": 0.9066054224967957, "learning_rate": 8.729760067636944e-05, "loss": 0.7925, "step": 9290 }, { "epoch": 0.46448906203176504, "grad_norm": 2.1034419536590576, "learning_rate": 8.727146419145155e-05, "loss": 0.6313, "step": 9300 }, { "epoch": 0.4649885126361003, "grad_norm": 1.5577855110168457, "learning_rate": 8.724530476611857e-05, "loss": 0.9527, "step": 9310 }, { "epoch": 0.4654879632404355, "grad_norm": 3.563767910003662, "learning_rate": 8.721912241647151e-05, "loss": 1.0483, "step": 9320 }, { "epoch": 0.46598741384477077, "grad_norm": 1.8607417345046997, "learning_rate": 8.719291715862547e-05, "loss": 0.8991, "step": 9330 }, { "epoch": 0.466486864449106, "grad_norm": 1.8814846277236938, "learning_rate": 8.71666890087097e-05, "loss": 0.8022, "step": 9340 }, { "epoch": 0.46698631505344124, "grad_norm": 4.706637382507324, "learning_rate": 8.714043798286746e-05, "loss": 0.8227, "step": 9350 }, { "epoch": 0.46748576565777644, "grad_norm": 1.5149883031845093, "learning_rate": 8.711416409725619e-05, "loss": 0.7861, "step": 9360 }, { "epoch": 0.4679852162621117, "grad_norm": 2.6364152431488037, "learning_rate": 8.708786736804729e-05, "loss": 0.8212, "step": 9370 }, { "epoch": 0.4684846668664469, "grad_norm": 3.7528250217437744, "learning_rate": 8.706154781142632e-05, "loss": 0.8007, "step": 9380 }, { "epoch": 0.4689841174707821, "grad_norm": 2.3562207221984863, "learning_rate": 8.703520544359282e-05, "loss": 0.9271, "step": 9390 }, { "epoch": 0.4694835680751174, "grad_norm": 2.0192863941192627, "learning_rate": 8.700884028076042e-05, "loss": 0.8238, "step": 9400 }, { "epoch": 0.4699830186794526, "grad_norm": 1.6539078950881958, "learning_rate": 8.698245233915673e-05, "loss": 0.7086, "step": 9410 }, { "epoch": 0.47048246928378784, "grad_norm": 3.2260684967041016, "learning_rate": 8.695604163502342e-05, "loss": 0.7594, "step": 9420 }, { "epoch": 0.47098191988812305, "grad_norm": 4.1658854484558105, "learning_rate": 8.692960818461617e-05, "loss": 0.9258, "step": 9430 }, { "epoch": 0.4714813704924583, "grad_norm": 1.852576732635498, "learning_rate": 8.690315200420462e-05, "loss": 0.6446, "step": 9440 }, { "epoch": 0.4719808210967935, "grad_norm": 0.9907698035240173, "learning_rate": 8.687667311007245e-05, "loss": 1.2197, "step": 9450 }, { "epoch": 0.4724802717011288, "grad_norm": 1.4926533699035645, "learning_rate": 8.685017151851728e-05, "loss": 0.8126, "step": 9460 }, { "epoch": 0.472979722305464, "grad_norm": 3.9340710639953613, "learning_rate": 8.682364724585075e-05, "loss": 0.8304, "step": 9470 }, { "epoch": 0.47347917290979924, "grad_norm": 2.3523919582366943, "learning_rate": 8.679710030839838e-05, "loss": 0.8956, "step": 9480 }, { "epoch": 0.47397862351413445, "grad_norm": 1.8453624248504639, "learning_rate": 8.677053072249972e-05, "loss": 0.7989, "step": 9490 }, { "epoch": 0.4744780741184697, "grad_norm": 0.7890266180038452, "learning_rate": 8.674393850450823e-05, "loss": 0.7103, "step": 9500 }, { "epoch": 0.4749775247228049, "grad_norm": 2.487265110015869, "learning_rate": 8.671732367079129e-05, "loss": 1.0322, "step": 9510 }, { "epoch": 0.47547697532714017, "grad_norm": 1.0967894792556763, "learning_rate": 8.66906862377302e-05, "loss": 0.8271, "step": 9520 }, { "epoch": 0.4759764259314754, "grad_norm": 5.3334059715271, "learning_rate": 8.666402622172018e-05, "loss": 1.2287, "step": 9530 }, { "epoch": 0.4764758765358106, "grad_norm": 0.7524279952049255, "learning_rate": 8.663734363917037e-05, "loss": 0.9327, "step": 9540 }, { "epoch": 0.47697532714014584, "grad_norm": 5.157166004180908, "learning_rate": 8.661063850650375e-05, "loss": 0.8208, "step": 9550 }, { "epoch": 0.47747477774448105, "grad_norm": 1.666438341140747, "learning_rate": 8.658391084015723e-05, "loss": 1.0964, "step": 9560 }, { "epoch": 0.4779742283488163, "grad_norm": 2.5982296466827393, "learning_rate": 8.655716065658154e-05, "loss": 0.8009, "step": 9570 }, { "epoch": 0.4784736789531515, "grad_norm": 1.3922905921936035, "learning_rate": 8.653038797224132e-05, "loss": 0.864, "step": 9580 }, { "epoch": 0.4789731295574868, "grad_norm": 6.944724082946777, "learning_rate": 8.6503592803615e-05, "loss": 0.8825, "step": 9590 }, { "epoch": 0.479472580161822, "grad_norm": 1.7859829664230347, "learning_rate": 8.647677516719492e-05, "loss": 1.001, "step": 9600 }, { "epoch": 0.47997203076615724, "grad_norm": 1.1808656454086304, "learning_rate": 8.64499350794872e-05, "loss": 0.8243, "step": 9610 }, { "epoch": 0.48047148137049245, "grad_norm": 1.345732569694519, "learning_rate": 8.642307255701177e-05, "loss": 0.8518, "step": 9620 }, { "epoch": 0.4809709319748277, "grad_norm": 2.1847989559173584, "learning_rate": 8.639618761630242e-05, "loss": 0.9443, "step": 9630 }, { "epoch": 0.4814703825791629, "grad_norm": 3.106778621673584, "learning_rate": 8.63692802739067e-05, "loss": 0.9343, "step": 9640 }, { "epoch": 0.4819698331834982, "grad_norm": 1.8735466003417969, "learning_rate": 8.634235054638595e-05, "loss": 0.6618, "step": 9650 }, { "epoch": 0.4824692837878334, "grad_norm": 2.5529987812042236, "learning_rate": 8.631539845031529e-05, "loss": 0.834, "step": 9660 }, { "epoch": 0.48296873439216864, "grad_norm": 1.7117773294448853, "learning_rate": 8.628842400228361e-05, "loss": 0.7188, "step": 9670 }, { "epoch": 0.48346818499650385, "grad_norm": 5.574675559997559, "learning_rate": 8.626142721889358e-05, "loss": 0.9675, "step": 9680 }, { "epoch": 0.48396763560083905, "grad_norm": 1.724246621131897, "learning_rate": 8.623440811676158e-05, "loss": 0.7304, "step": 9690 }, { "epoch": 0.4844670862051743, "grad_norm": 2.3189258575439453, "learning_rate": 8.620736671251773e-05, "loss": 0.6906, "step": 9700 }, { "epoch": 0.4849665368095095, "grad_norm": 4.428511619567871, "learning_rate": 8.618030302280591e-05, "loss": 0.8414, "step": 9710 }, { "epoch": 0.4854659874138448, "grad_norm": 0.6134352684020996, "learning_rate": 8.61532170642837e-05, "loss": 0.8284, "step": 9720 }, { "epoch": 0.48596543801818, "grad_norm": 1.8473355770111084, "learning_rate": 8.612610885362236e-05, "loss": 0.8872, "step": 9730 }, { "epoch": 0.48646488862251525, "grad_norm": 1.4100711345672607, "learning_rate": 8.60989784075069e-05, "loss": 0.8218, "step": 9740 }, { "epoch": 0.48696433922685045, "grad_norm": 2.5205984115600586, "learning_rate": 8.607182574263595e-05, "loss": 0.8991, "step": 9750 }, { "epoch": 0.4874637898311857, "grad_norm": 2.3048384189605713, "learning_rate": 8.604465087572188e-05, "loss": 0.834, "step": 9760 }, { "epoch": 0.4879632404355209, "grad_norm": 2.423758029937744, "learning_rate": 8.601745382349068e-05, "loss": 0.8198, "step": 9770 }, { "epoch": 0.4884626910398562, "grad_norm": 4.580270767211914, "learning_rate": 8.599023460268202e-05, "loss": 1.1273, "step": 9780 }, { "epoch": 0.4889621416441914, "grad_norm": 1.962369441986084, "learning_rate": 8.596299323004919e-05, "loss": 0.8828, "step": 9790 }, { "epoch": 0.48946159224852664, "grad_norm": 3.3463711738586426, "learning_rate": 8.593572972235915e-05, "loss": 0.9256, "step": 9800 }, { "epoch": 0.48996104285286185, "grad_norm": 2.2721240520477295, "learning_rate": 8.590844409639248e-05, "loss": 0.911, "step": 9810 }, { "epoch": 0.4904604934571971, "grad_norm": 2.064649820327759, "learning_rate": 8.588113636894328e-05, "loss": 0.8313, "step": 9820 }, { "epoch": 0.4909599440615323, "grad_norm": 4.452096462249756, "learning_rate": 8.585380655681943e-05, "loss": 0.8163, "step": 9830 }, { "epoch": 0.4914593946658675, "grad_norm": 1.811700701713562, "learning_rate": 8.582645467684223e-05, "loss": 0.9164, "step": 9840 }, { "epoch": 0.4919588452702028, "grad_norm": 1.8374183177947998, "learning_rate": 8.579908074584666e-05, "loss": 0.7401, "step": 9850 }, { "epoch": 0.492458295874538, "grad_norm": 1.4280540943145752, "learning_rate": 8.577168478068127e-05, "loss": 0.901, "step": 9860 }, { "epoch": 0.49295774647887325, "grad_norm": 2.486199140548706, "learning_rate": 8.574426679820813e-05, "loss": 0.5956, "step": 9870 }, { "epoch": 0.49345719708320845, "grad_norm": 2.0157670974731445, "learning_rate": 8.571682681530289e-05, "loss": 0.9413, "step": 9880 }, { "epoch": 0.4939566476875437, "grad_norm": 1.218309998512268, "learning_rate": 8.56893648488547e-05, "loss": 1.0669, "step": 9890 }, { "epoch": 0.4944560982918789, "grad_norm": 1.4599615335464478, "learning_rate": 8.566188091576634e-05, "loss": 0.6705, "step": 9900 }, { "epoch": 0.4949555488962142, "grad_norm": 4.729304790496826, "learning_rate": 8.563437503295398e-05, "loss": 0.8737, "step": 9910 }, { "epoch": 0.4954549995005494, "grad_norm": 1.7913062572479248, "learning_rate": 8.560684721734742e-05, "loss": 0.9062, "step": 9920 }, { "epoch": 0.49595445010488465, "grad_norm": 2.2768373489379883, "learning_rate": 8.557929748588986e-05, "loss": 0.8404, "step": 9930 }, { "epoch": 0.49645390070921985, "grad_norm": 6.3551788330078125, "learning_rate": 8.555172585553805e-05, "loss": 0.7617, "step": 9940 }, { "epoch": 0.4969533513135551, "grad_norm": 2.6699812412261963, "learning_rate": 8.552413234326219e-05, "loss": 0.992, "step": 9950 }, { "epoch": 0.4974528019178903, "grad_norm": 6.235543727874756, "learning_rate": 8.549651696604599e-05, "loss": 1.0864, "step": 9960 }, { "epoch": 0.4979522525222255, "grad_norm": 2.092358350753784, "learning_rate": 8.546887974088656e-05, "loss": 0.9257, "step": 9970 }, { "epoch": 0.4984517031265608, "grad_norm": 0.8900352120399475, "learning_rate": 8.544122068479449e-05, "loss": 0.7231, "step": 9980 }, { "epoch": 0.498951153730896, "grad_norm": 2.0917277336120605, "learning_rate": 8.541353981479383e-05, "loss": 0.9558, "step": 9990 }, { "epoch": 0.49945060433523125, "grad_norm": 0.6328327059745789, "learning_rate": 8.538583714792198e-05, "loss": 0.8234, "step": 10000 }, { "epoch": 0.49995005493956646, "grad_norm": 1.4690412282943726, "learning_rate": 8.535811270122986e-05, "loss": 0.907, "step": 10010 }, { "epoch": 0.5004495055439017, "grad_norm": 1.581143856048584, "learning_rate": 8.533036649178169e-05, "loss": 0.8558, "step": 10020 }, { "epoch": 0.500948956148237, "grad_norm": 2.121934652328491, "learning_rate": 8.530259853665514e-05, "loss": 0.9822, "step": 10030 }, { "epoch": 0.5014484067525722, "grad_norm": 2.247655153274536, "learning_rate": 8.52748088529413e-05, "loss": 0.8619, "step": 10040 }, { "epoch": 0.5019478573569074, "grad_norm": 2.4512712955474854, "learning_rate": 8.524699745774455e-05, "loss": 1.1144, "step": 10050 }, { "epoch": 0.5024473079612426, "grad_norm": 1.4861620664596558, "learning_rate": 8.521916436818269e-05, "loss": 0.8321, "step": 10060 }, { "epoch": 0.5029467585655779, "grad_norm": 2.7260966300964355, "learning_rate": 8.519130960138686e-05, "loss": 0.9533, "step": 10070 }, { "epoch": 0.5034462091699131, "grad_norm": 1.08968985080719, "learning_rate": 8.516343317450156e-05, "loss": 0.5477, "step": 10080 }, { "epoch": 0.5039456597742483, "grad_norm": 1.4115921258926392, "learning_rate": 8.513553510468457e-05, "loss": 0.8258, "step": 10090 }, { "epoch": 0.5044451103785835, "grad_norm": 3.039212942123413, "learning_rate": 8.510761540910704e-05, "loss": 0.9886, "step": 10100 }, { "epoch": 0.5049445609829188, "grad_norm": 3.746342897415161, "learning_rate": 8.507967410495339e-05, "loss": 0.9196, "step": 10110 }, { "epoch": 0.505444011587254, "grad_norm": 1.0602586269378662, "learning_rate": 8.505171120942142e-05, "loss": 0.7648, "step": 10120 }, { "epoch": 0.5059434621915893, "grad_norm": 1.7583624124526978, "learning_rate": 8.502372673972211e-05, "loss": 0.9866, "step": 10130 }, { "epoch": 0.5064429127959245, "grad_norm": 3.013103723526001, "learning_rate": 8.49957207130798e-05, "loss": 0.683, "step": 10140 }, { "epoch": 0.5069423634002597, "grad_norm": 1.2070417404174805, "learning_rate": 8.496769314673207e-05, "loss": 0.8469, "step": 10150 }, { "epoch": 0.507441814004595, "grad_norm": 2.0063936710357666, "learning_rate": 8.493964405792973e-05, "loss": 0.8121, "step": 10160 }, { "epoch": 0.5079412646089302, "grad_norm": 3.4089725017547607, "learning_rate": 8.491157346393693e-05, "loss": 0.9276, "step": 10170 }, { "epoch": 0.5084407152132654, "grad_norm": 2.365583658218384, "learning_rate": 8.488348138203091e-05, "loss": 1.1725, "step": 10180 }, { "epoch": 0.5089401658176006, "grad_norm": 2.678743600845337, "learning_rate": 8.485536782950228e-05, "loss": 0.9273, "step": 10190 }, { "epoch": 0.5094396164219359, "grad_norm": 1.7188700437545776, "learning_rate": 8.482723282365477e-05, "loss": 0.7503, "step": 10200 }, { "epoch": 0.5099390670262711, "grad_norm": 2.3885445594787598, "learning_rate": 8.479907638180535e-05, "loss": 0.8827, "step": 10210 }, { "epoch": 0.5104385176306063, "grad_norm": 2.0695323944091797, "learning_rate": 8.477089852128421e-05, "loss": 0.9493, "step": 10220 }, { "epoch": 0.5109379682349415, "grad_norm": 3.528078317642212, "learning_rate": 8.474269925943465e-05, "loss": 0.8015, "step": 10230 }, { "epoch": 0.5114374188392768, "grad_norm": 2.784247636795044, "learning_rate": 8.471447861361321e-05, "loss": 0.8912, "step": 10240 }, { "epoch": 0.511936869443612, "grad_norm": 1.4341551065444946, "learning_rate": 8.468623660118958e-05, "loss": 0.843, "step": 10250 }, { "epoch": 0.5124363200479473, "grad_norm": 1.393707275390625, "learning_rate": 8.465797323954656e-05, "loss": 1.0695, "step": 10260 }, { "epoch": 0.5129357706522825, "grad_norm": 2.879314661026001, "learning_rate": 8.462968854608013e-05, "loss": 0.8203, "step": 10270 }, { "epoch": 0.5134352212566177, "grad_norm": 2.527120351791382, "learning_rate": 8.46013825381994e-05, "loss": 0.7242, "step": 10280 }, { "epoch": 0.513934671860953, "grad_norm": 2.0064504146575928, "learning_rate": 8.457305523332657e-05, "loss": 0.7982, "step": 10290 }, { "epoch": 0.5144341224652882, "grad_norm": 1.5294859409332275, "learning_rate": 8.4544706648897e-05, "loss": 0.7853, "step": 10300 }, { "epoch": 0.5149335730696234, "grad_norm": 2.493802309036255, "learning_rate": 8.451633680235906e-05, "loss": 0.9062, "step": 10310 }, { "epoch": 0.5154330236739586, "grad_norm": 1.1306747198104858, "learning_rate": 8.448794571117431e-05, "loss": 0.7402, "step": 10320 }, { "epoch": 0.5159324742782939, "grad_norm": 0.39416784048080444, "learning_rate": 8.445953339281731e-05, "loss": 0.7237, "step": 10330 }, { "epoch": 0.5164319248826291, "grad_norm": 2.656313896179199, "learning_rate": 8.443109986477573e-05, "loss": 0.8015, "step": 10340 }, { "epoch": 0.5169313754869643, "grad_norm": 0.6447727680206299, "learning_rate": 8.440264514455025e-05, "loss": 0.765, "step": 10350 }, { "epoch": 0.5174308260912995, "grad_norm": 1.8436886072158813, "learning_rate": 8.437416924965464e-05, "loss": 0.9542, "step": 10360 }, { "epoch": 0.5179302766956349, "grad_norm": 2.326608657836914, "learning_rate": 8.434567219761566e-05, "loss": 1.0256, "step": 10370 }, { "epoch": 0.5184297272999701, "grad_norm": 1.378662109375, "learning_rate": 8.431715400597315e-05, "loss": 0.7725, "step": 10380 }, { "epoch": 0.5189291779043053, "grad_norm": 3.0724847316741943, "learning_rate": 8.428861469227991e-05, "loss": 0.9924, "step": 10390 }, { "epoch": 0.5194286285086405, "grad_norm": 2.2953217029571533, "learning_rate": 8.426005427410176e-05, "loss": 0.905, "step": 10400 }, { "epoch": 0.5199280791129757, "grad_norm": 1.05632483959198, "learning_rate": 8.423147276901747e-05, "loss": 0.6062, "step": 10410 }, { "epoch": 0.520427529717311, "grad_norm": 1.9191445112228394, "learning_rate": 8.420287019461887e-05, "loss": 0.7846, "step": 10420 }, { "epoch": 0.5209269803216462, "grad_norm": 1.5224536657333374, "learning_rate": 8.41742465685107e-05, "loss": 0.9027, "step": 10430 }, { "epoch": 0.5214264309259814, "grad_norm": 1.7912894487380981, "learning_rate": 8.414560190831067e-05, "loss": 1.014, "step": 10440 }, { "epoch": 0.5219258815303166, "grad_norm": 3.137920379638672, "learning_rate": 8.411693623164942e-05, "loss": 0.9506, "step": 10450 }, { "epoch": 0.5224253321346519, "grad_norm": 2.5194101333618164, "learning_rate": 8.408824955617057e-05, "loss": 1.0063, "step": 10460 }, { "epoch": 0.5229247827389871, "grad_norm": 2.2313342094421387, "learning_rate": 8.405954189953062e-05, "loss": 0.7812, "step": 10470 }, { "epoch": 0.5234242333433223, "grad_norm": 2.128974199295044, "learning_rate": 8.403081327939902e-05, "loss": 0.863, "step": 10480 }, { "epoch": 0.5239236839476575, "grad_norm": 0.898611307144165, "learning_rate": 8.400206371345809e-05, "loss": 0.6259, "step": 10490 }, { "epoch": 0.5244231345519929, "grad_norm": 1.9370143413543701, "learning_rate": 8.397329321940304e-05, "loss": 1.0129, "step": 10500 }, { "epoch": 0.5249225851563281, "grad_norm": 0.7454211115837097, "learning_rate": 8.394450181494198e-05, "loss": 0.7934, "step": 10510 }, { "epoch": 0.5254220357606633, "grad_norm": 0.5250659584999084, "learning_rate": 8.391568951779593e-05, "loss": 0.8833, "step": 10520 }, { "epoch": 0.5259214863649985, "grad_norm": 4.715530872344971, "learning_rate": 8.388685634569869e-05, "loss": 0.8296, "step": 10530 }, { "epoch": 0.5264209369693338, "grad_norm": 1.491549015045166, "learning_rate": 8.385800231639693e-05, "loss": 0.7825, "step": 10540 }, { "epoch": 0.526920387573669, "grad_norm": 3.0522241592407227, "learning_rate": 8.382912744765021e-05, "loss": 0.8571, "step": 10550 }, { "epoch": 0.5274198381780042, "grad_norm": 1.1122174263000488, "learning_rate": 8.380023175723087e-05, "loss": 0.7613, "step": 10560 }, { "epoch": 0.5279192887823394, "grad_norm": 3.839743137359619, "learning_rate": 8.377131526292405e-05, "loss": 0.8311, "step": 10570 }, { "epoch": 0.5284187393866746, "grad_norm": 2.86128306388855, "learning_rate": 8.374237798252775e-05, "loss": 1.0072, "step": 10580 }, { "epoch": 0.5289181899910099, "grad_norm": 1.6249940395355225, "learning_rate": 8.371341993385271e-05, "loss": 0.9004, "step": 10590 }, { "epoch": 0.5294176405953451, "grad_norm": 2.512314796447754, "learning_rate": 8.36844411347225e-05, "loss": 0.8968, "step": 10600 }, { "epoch": 0.5299170911996803, "grad_norm": 1.5922598838806152, "learning_rate": 8.365544160297341e-05, "loss": 0.8379, "step": 10610 }, { "epoch": 0.5304165418040155, "grad_norm": 1.7710751295089722, "learning_rate": 8.362642135645454e-05, "loss": 0.9609, "step": 10620 }, { "epoch": 0.5309159924083509, "grad_norm": 2.3227810859680176, "learning_rate": 8.359738041302772e-05, "loss": 0.7204, "step": 10630 }, { "epoch": 0.5314154430126861, "grad_norm": 2.00604248046875, "learning_rate": 8.35683187905675e-05, "loss": 0.95, "step": 10640 }, { "epoch": 0.5319148936170213, "grad_norm": 1.0721955299377441, "learning_rate": 8.353923650696118e-05, "loss": 0.8605, "step": 10650 }, { "epoch": 0.5324143442213565, "grad_norm": 1.99587881565094, "learning_rate": 8.351013358010877e-05, "loss": 0.8029, "step": 10660 }, { "epoch": 0.5329137948256918, "grad_norm": 2.7038462162017822, "learning_rate": 8.348101002792301e-05, "loss": 1.0095, "step": 10670 }, { "epoch": 0.533413245430027, "grad_norm": 2.7722578048706055, "learning_rate": 8.345186586832929e-05, "loss": 1.1092, "step": 10680 }, { "epoch": 0.5339126960343622, "grad_norm": 0.7673277854919434, "learning_rate": 8.342270111926571e-05, "loss": 0.9073, "step": 10690 }, { "epoch": 0.5344121466386974, "grad_norm": 1.2908411026000977, "learning_rate": 8.339351579868304e-05, "loss": 0.6403, "step": 10700 }, { "epoch": 0.5349115972430326, "grad_norm": 1.8550677299499512, "learning_rate": 8.336430992454474e-05, "loss": 0.9918, "step": 10710 }, { "epoch": 0.5354110478473679, "grad_norm": 0.9618448615074158, "learning_rate": 8.333508351482682e-05, "loss": 0.7949, "step": 10720 }, { "epoch": 0.5359104984517031, "grad_norm": 1.6754440069198608, "learning_rate": 8.330583658751807e-05, "loss": 0.7728, "step": 10730 }, { "epoch": 0.5364099490560383, "grad_norm": 1.1474179029464722, "learning_rate": 8.327656916061982e-05, "loss": 0.9935, "step": 10740 }, { "epoch": 0.5369093996603735, "grad_norm": 1.2575993537902832, "learning_rate": 8.324728125214603e-05, "loss": 0.7431, "step": 10750 }, { "epoch": 0.5374088502647089, "grad_norm": 2.4379231929779053, "learning_rate": 8.321797288012326e-05, "loss": 0.8684, "step": 10760 }, { "epoch": 0.5379083008690441, "grad_norm": 1.0595794916152954, "learning_rate": 8.31886440625907e-05, "loss": 0.9839, "step": 10770 }, { "epoch": 0.5384077514733793, "grad_norm": 2.0171382427215576, "learning_rate": 8.31592948176001e-05, "loss": 0.9853, "step": 10780 }, { "epoch": 0.5389072020777145, "grad_norm": 1.2597289085388184, "learning_rate": 8.312992516321578e-05, "loss": 0.8726, "step": 10790 }, { "epoch": 0.5394066526820498, "grad_norm": 4.327160835266113, "learning_rate": 8.310053511751463e-05, "loss": 0.7747, "step": 10800 }, { "epoch": 0.539906103286385, "grad_norm": 4.0043230056762695, "learning_rate": 8.307112469858608e-05, "loss": 0.8551, "step": 10810 }, { "epoch": 0.5404055538907202, "grad_norm": 2.365731954574585, "learning_rate": 8.304169392453213e-05, "loss": 0.8783, "step": 10820 }, { "epoch": 0.5409050044950554, "grad_norm": 3.5071523189544678, "learning_rate": 8.301224281346726e-05, "loss": 0.8082, "step": 10830 }, { "epoch": 0.5414044550993907, "grad_norm": 3.745530128479004, "learning_rate": 8.29827713835185e-05, "loss": 0.7679, "step": 10840 }, { "epoch": 0.5419039057037259, "grad_norm": 2.147007942199707, "learning_rate": 8.29532796528254e-05, "loss": 1.0009, "step": 10850 }, { "epoch": 0.5424033563080611, "grad_norm": 2.499675750732422, "learning_rate": 8.292376763953995e-05, "loss": 0.9068, "step": 10860 }, { "epoch": 0.5429028069123963, "grad_norm": 3.647723913192749, "learning_rate": 8.28942353618267e-05, "loss": 1.0705, "step": 10870 }, { "epoch": 0.5434022575167315, "grad_norm": 3.1378934383392334, "learning_rate": 8.28646828378626e-05, "loss": 0.7857, "step": 10880 }, { "epoch": 0.5439017081210669, "grad_norm": 2.2153217792510986, "learning_rate": 8.283511008583708e-05, "loss": 0.8, "step": 10890 }, { "epoch": 0.5444011587254021, "grad_norm": 1.5370389223098755, "learning_rate": 8.280551712395208e-05, "loss": 0.9032, "step": 10900 }, { "epoch": 0.5449006093297373, "grad_norm": 2.0717661380767822, "learning_rate": 8.27759039704219e-05, "loss": 0.7332, "step": 10910 }, { "epoch": 0.5454000599340725, "grad_norm": 1.4211114645004272, "learning_rate": 8.274627064347331e-05, "loss": 0.8698, "step": 10920 }, { "epoch": 0.5458995105384078, "grad_norm": 2.067880153656006, "learning_rate": 8.271661716134549e-05, "loss": 0.9097, "step": 10930 }, { "epoch": 0.546398961142743, "grad_norm": 4.041163921356201, "learning_rate": 8.268694354229001e-05, "loss": 0.8441, "step": 10940 }, { "epoch": 0.5468984117470782, "grad_norm": 1.7370940446853638, "learning_rate": 8.265724980457086e-05, "loss": 0.7791, "step": 10950 }, { "epoch": 0.5473978623514134, "grad_norm": 0.741644561290741, "learning_rate": 8.262753596646439e-05, "loss": 1.0678, "step": 10960 }, { "epoch": 0.5478973129557487, "grad_norm": 6.259626388549805, "learning_rate": 8.259780204625932e-05, "loss": 0.7785, "step": 10970 }, { "epoch": 0.5483967635600839, "grad_norm": 1.2122634649276733, "learning_rate": 8.256804806225677e-05, "loss": 0.9801, "step": 10980 }, { "epoch": 0.5488962141644191, "grad_norm": 3.5091569423675537, "learning_rate": 8.253827403277015e-05, "loss": 1.0206, "step": 10990 }, { "epoch": 0.5493956647687543, "grad_norm": 5.1107563972473145, "learning_rate": 8.250847997612527e-05, "loss": 0.9987, "step": 11000 }, { "epoch": 0.5498951153730895, "grad_norm": 6.479084491729736, "learning_rate": 8.24786659106602e-05, "loss": 1.11, "step": 11010 }, { "epoch": 0.5503945659774249, "grad_norm": 6.204999923706055, "learning_rate": 8.244883185472538e-05, "loss": 1.1227, "step": 11020 }, { "epoch": 0.5508940165817601, "grad_norm": 1.817765474319458, "learning_rate": 8.241897782668355e-05, "loss": 0.7799, "step": 11030 }, { "epoch": 0.5513934671860953, "grad_norm": 1.324508786201477, "learning_rate": 8.23891038449097e-05, "loss": 0.8929, "step": 11040 }, { "epoch": 0.5518929177904305, "grad_norm": 5.498021125793457, "learning_rate": 8.235920992779114e-05, "loss": 0.7613, "step": 11050 }, { "epoch": 0.5523923683947658, "grad_norm": 2.9075751304626465, "learning_rate": 8.232929609372744e-05, "loss": 0.745, "step": 11060 }, { "epoch": 0.552891818999101, "grad_norm": 3.374605894088745, "learning_rate": 8.229936236113042e-05, "loss": 0.9081, "step": 11070 }, { "epoch": 0.5533912696034362, "grad_norm": 1.7104989290237427, "learning_rate": 8.226940874842417e-05, "loss": 0.6835, "step": 11080 }, { "epoch": 0.5538907202077714, "grad_norm": 0.7547670602798462, "learning_rate": 8.223943527404498e-05, "loss": 0.6781, "step": 11090 }, { "epoch": 0.5543901708121067, "grad_norm": 2.296278476715088, "learning_rate": 8.22094419564414e-05, "loss": 1.1, "step": 11100 }, { "epoch": 0.5548896214164419, "grad_norm": 4.66751766204834, "learning_rate": 8.217942881407416e-05, "loss": 0.9201, "step": 11110 }, { "epoch": 0.5553890720207771, "grad_norm": 1.8728293180465698, "learning_rate": 8.214939586541626e-05, "loss": 0.8055, "step": 11120 }, { "epoch": 0.5558885226251123, "grad_norm": 2.5496201515197754, "learning_rate": 8.21193431289528e-05, "loss": 0.8833, "step": 11130 }, { "epoch": 0.5563879732294476, "grad_norm": 2.07619309425354, "learning_rate": 8.20892706231811e-05, "loss": 0.8857, "step": 11140 }, { "epoch": 0.5568874238337829, "grad_norm": 0.936546266078949, "learning_rate": 8.205917836661067e-05, "loss": 1.0265, "step": 11150 }, { "epoch": 0.5573868744381181, "grad_norm": 1.0726035833358765, "learning_rate": 8.202906637776316e-05, "loss": 0.9022, "step": 11160 }, { "epoch": 0.5578863250424533, "grad_norm": 2.3275067806243896, "learning_rate": 8.199893467517231e-05, "loss": 0.7763, "step": 11170 }, { "epoch": 0.5583857756467885, "grad_norm": 2.8741378784179688, "learning_rate": 8.196878327738411e-05, "loss": 0.8514, "step": 11180 }, { "epoch": 0.5588852262511238, "grad_norm": 0.6805217266082764, "learning_rate": 8.193861220295657e-05, "loss": 0.7612, "step": 11190 }, { "epoch": 0.559384676855459, "grad_norm": 1.7229316234588623, "learning_rate": 8.190842147045985e-05, "loss": 0.8793, "step": 11200 }, { "epoch": 0.5598841274597942, "grad_norm": 1.988580584526062, "learning_rate": 8.187821109847621e-05, "loss": 0.944, "step": 11210 }, { "epoch": 0.5603835780641294, "grad_norm": 3.3690719604492188, "learning_rate": 8.184798110560002e-05, "loss": 0.9576, "step": 11220 }, { "epoch": 0.5608830286684647, "grad_norm": 2.5268752574920654, "learning_rate": 8.181773151043767e-05, "loss": 0.7382, "step": 11230 }, { "epoch": 0.5613824792727999, "grad_norm": 1.3180385828018188, "learning_rate": 8.178746233160766e-05, "loss": 0.8898, "step": 11240 }, { "epoch": 0.5618819298771351, "grad_norm": 1.845851182937622, "learning_rate": 8.175717358774052e-05, "loss": 0.8032, "step": 11250 }, { "epoch": 0.5623813804814703, "grad_norm": 2.3231937885284424, "learning_rate": 8.172686529747885e-05, "loss": 0.7337, "step": 11260 }, { "epoch": 0.5628808310858057, "grad_norm": 2.671926975250244, "learning_rate": 8.169653747947724e-05, "loss": 0.8322, "step": 11270 }, { "epoch": 0.5633802816901409, "grad_norm": 0.9263166785240173, "learning_rate": 8.166619015240236e-05, "loss": 1.0253, "step": 11280 }, { "epoch": 0.5638797322944761, "grad_norm": 3.5001063346862793, "learning_rate": 8.16358233349328e-05, "loss": 0.807, "step": 11290 }, { "epoch": 0.5643791828988113, "grad_norm": 3.631415843963623, "learning_rate": 8.160543704575924e-05, "loss": 1.1009, "step": 11300 }, { "epoch": 0.5648786335031465, "grad_norm": 0.8493178486824036, "learning_rate": 8.157503130358431e-05, "loss": 0.8958, "step": 11310 }, { "epoch": 0.5653780841074818, "grad_norm": 2.1403348445892334, "learning_rate": 8.154460612712254e-05, "loss": 0.7372, "step": 11320 }, { "epoch": 0.565877534711817, "grad_norm": 2.315157413482666, "learning_rate": 8.151416153510054e-05, "loss": 0.7946, "step": 11330 }, { "epoch": 0.5663769853161522, "grad_norm": 0.8276131749153137, "learning_rate": 8.14836975462568e-05, "loss": 0.7355, "step": 11340 }, { "epoch": 0.5668764359204874, "grad_norm": 4.0848822593688965, "learning_rate": 8.145321417934179e-05, "loss": 1.068, "step": 11350 }, { "epoch": 0.5673758865248227, "grad_norm": 2.2448577880859375, "learning_rate": 8.142271145311783e-05, "loss": 1.1738, "step": 11360 }, { "epoch": 0.5678753371291579, "grad_norm": 1.619431734085083, "learning_rate": 8.139218938635927e-05, "loss": 0.9228, "step": 11370 }, { "epoch": 0.5683747877334931, "grad_norm": 3.5642194747924805, "learning_rate": 8.136164799785224e-05, "loss": 0.7772, "step": 11380 }, { "epoch": 0.5688742383378284, "grad_norm": 3.4192471504211426, "learning_rate": 8.133108730639489e-05, "loss": 1.1743, "step": 11390 }, { "epoch": 0.5693736889421637, "grad_norm": 1.5282948017120361, "learning_rate": 8.130050733079712e-05, "loss": 0.8939, "step": 11400 }, { "epoch": 0.5698731395464989, "grad_norm": 1.6148427724838257, "learning_rate": 8.126990808988082e-05, "loss": 0.954, "step": 11410 }, { "epoch": 0.5703725901508341, "grad_norm": 1.722934603691101, "learning_rate": 8.123928960247964e-05, "loss": 0.7706, "step": 11420 }, { "epoch": 0.5708720407551693, "grad_norm": 2.6629602909088135, "learning_rate": 8.120865188743914e-05, "loss": 0.9743, "step": 11430 }, { "epoch": 0.5713714913595045, "grad_norm": 4.457296848297119, "learning_rate": 8.117799496361669e-05, "loss": 0.9488, "step": 11440 }, { "epoch": 0.5718709419638398, "grad_norm": 1.44455087184906, "learning_rate": 8.114731884988149e-05, "loss": 1.03, "step": 11450 }, { "epoch": 0.572370392568175, "grad_norm": 1.7806223630905151, "learning_rate": 8.111662356511453e-05, "loss": 0.722, "step": 11460 }, { "epoch": 0.5728698431725102, "grad_norm": 1.8148882389068604, "learning_rate": 8.108590912820864e-05, "loss": 0.9295, "step": 11470 }, { "epoch": 0.5733692937768454, "grad_norm": 1.7864558696746826, "learning_rate": 8.105517555806841e-05, "loss": 0.8256, "step": 11480 }, { "epoch": 0.5738687443811807, "grad_norm": 1.2219898700714111, "learning_rate": 8.102442287361018e-05, "loss": 1.0176, "step": 11490 }, { "epoch": 0.574368194985516, "grad_norm": 3.0943713188171387, "learning_rate": 8.099365109376213e-05, "loss": 0.9669, "step": 11500 }, { "epoch": 0.5748676455898512, "grad_norm": 1.2991565465927124, "learning_rate": 8.096286023746414e-05, "loss": 0.9712, "step": 11510 }, { "epoch": 0.5753670961941864, "grad_norm": 1.5561408996582031, "learning_rate": 8.093205032366782e-05, "loss": 0.7694, "step": 11520 }, { "epoch": 0.5758665467985217, "grad_norm": 1.454827070236206, "learning_rate": 8.090122137133653e-05, "loss": 0.7044, "step": 11530 }, { "epoch": 0.5763659974028569, "grad_norm": 1.141781210899353, "learning_rate": 8.087037339944536e-05, "loss": 0.8696, "step": 11540 }, { "epoch": 0.5768654480071921, "grad_norm": 1.3285212516784668, "learning_rate": 8.083950642698112e-05, "loss": 0.9028, "step": 11550 }, { "epoch": 0.5773648986115273, "grad_norm": 1.0808900594711304, "learning_rate": 8.080862047294225e-05, "loss": 0.7895, "step": 11560 }, { "epoch": 0.5778643492158626, "grad_norm": 2.268902540206909, "learning_rate": 8.077771555633893e-05, "loss": 0.8546, "step": 11570 }, { "epoch": 0.5783637998201978, "grad_norm": 2.3639559745788574, "learning_rate": 8.0746791696193e-05, "loss": 0.8286, "step": 11580 }, { "epoch": 0.578863250424533, "grad_norm": 2.1565093994140625, "learning_rate": 8.071584891153792e-05, "loss": 0.8108, "step": 11590 }, { "epoch": 0.5793627010288682, "grad_norm": 0.9906855225563049, "learning_rate": 8.06848872214189e-05, "loss": 0.6949, "step": 11600 }, { "epoch": 0.5798621516332034, "grad_norm": 3.8159871101379395, "learning_rate": 8.065390664489264e-05, "loss": 0.8888, "step": 11610 }, { "epoch": 0.5803616022375387, "grad_norm": 3.860344648361206, "learning_rate": 8.062290720102759e-05, "loss": 0.9017, "step": 11620 }, { "epoch": 0.580861052841874, "grad_norm": 2.809664249420166, "learning_rate": 8.059188890890375e-05, "loss": 0.9088, "step": 11630 }, { "epoch": 0.5813605034462092, "grad_norm": 3.262469530105591, "learning_rate": 8.056085178761275e-05, "loss": 0.9483, "step": 11640 }, { "epoch": 0.5818599540505444, "grad_norm": 4.122176647186279, "learning_rate": 8.052979585625778e-05, "loss": 1.1103, "step": 11650 }, { "epoch": 0.5823594046548797, "grad_norm": 1.2592228651046753, "learning_rate": 8.049872113395363e-05, "loss": 0.8896, "step": 11660 }, { "epoch": 0.5828588552592149, "grad_norm": 2.1690526008605957, "learning_rate": 8.046762763982665e-05, "loss": 0.9666, "step": 11670 }, { "epoch": 0.5833583058635501, "grad_norm": 1.719984531402588, "learning_rate": 8.043651539301475e-05, "loss": 0.8685, "step": 11680 }, { "epoch": 0.5838577564678853, "grad_norm": 3.1530261039733887, "learning_rate": 8.040538441266736e-05, "loss": 0.7708, "step": 11690 }, { "epoch": 0.5843572070722206, "grad_norm": 6.055736541748047, "learning_rate": 8.037423471794545e-05, "loss": 0.8857, "step": 11700 }, { "epoch": 0.5848566576765558, "grad_norm": 2.080730676651001, "learning_rate": 8.034306632802154e-05, "loss": 1.0014, "step": 11710 }, { "epoch": 0.585356108280891, "grad_norm": 2.089596748352051, "learning_rate": 8.031187926207962e-05, "loss": 0.7845, "step": 11720 }, { "epoch": 0.5858555588852262, "grad_norm": 1.791109561920166, "learning_rate": 8.028067353931517e-05, "loss": 0.8536, "step": 11730 }, { "epoch": 0.5863550094895614, "grad_norm": 4.204870700836182, "learning_rate": 8.024944917893519e-05, "loss": 0.8977, "step": 11740 }, { "epoch": 0.5868544600938967, "grad_norm": 0.7246403098106384, "learning_rate": 8.021820620015811e-05, "loss": 0.8527, "step": 11750 }, { "epoch": 0.587353910698232, "grad_norm": 0.8796305060386658, "learning_rate": 8.018694462221387e-05, "loss": 0.8079, "step": 11760 }, { "epoch": 0.5878533613025672, "grad_norm": 1.0883392095565796, "learning_rate": 8.01556644643438e-05, "loss": 0.5785, "step": 11770 }, { "epoch": 0.5883528119069024, "grad_norm": 2.5127203464508057, "learning_rate": 8.01243657458007e-05, "loss": 0.6381, "step": 11780 }, { "epoch": 0.5888522625112377, "grad_norm": 1.3080767393112183, "learning_rate": 8.009304848584879e-05, "loss": 0.7554, "step": 11790 }, { "epoch": 0.5893517131155729, "grad_norm": 3.24627947807312, "learning_rate": 8.00617127037637e-05, "loss": 0.9587, "step": 11800 }, { "epoch": 0.5898511637199081, "grad_norm": 1.6273128986358643, "learning_rate": 8.003035841883249e-05, "loss": 0.7672, "step": 11810 }, { "epoch": 0.5903506143242433, "grad_norm": 2.0696349143981934, "learning_rate": 7.999898565035352e-05, "loss": 0.7795, "step": 11820 }, { "epoch": 0.5908500649285786, "grad_norm": 2.585515260696411, "learning_rate": 7.996759441763661e-05, "loss": 0.7312, "step": 11830 }, { "epoch": 0.5913495155329138, "grad_norm": 5.360396862030029, "learning_rate": 7.993618474000293e-05, "loss": 0.8446, "step": 11840 }, { "epoch": 0.591848966137249, "grad_norm": 1.9149426221847534, "learning_rate": 7.9904756636785e-05, "loss": 0.8087, "step": 11850 }, { "epoch": 0.5923484167415842, "grad_norm": 1.9312024116516113, "learning_rate": 7.987331012732665e-05, "loss": 0.93, "step": 11860 }, { "epoch": 0.5928478673459195, "grad_norm": 1.9590035676956177, "learning_rate": 7.984184523098307e-05, "loss": 0.8477, "step": 11870 }, { "epoch": 0.5933473179502547, "grad_norm": 2.831815481185913, "learning_rate": 7.981036196712077e-05, "loss": 0.7381, "step": 11880 }, { "epoch": 0.59384676855459, "grad_norm": 1.3266551494598389, "learning_rate": 7.977886035511753e-05, "loss": 1.0491, "step": 11890 }, { "epoch": 0.5943462191589252, "grad_norm": 1.720620036125183, "learning_rate": 7.974734041436246e-05, "loss": 1.0184, "step": 11900 }, { "epoch": 0.5948456697632604, "grad_norm": 1.073946237564087, "learning_rate": 7.971580216425596e-05, "loss": 0.6153, "step": 11910 }, { "epoch": 0.5953451203675957, "grad_norm": 1.2611136436462402, "learning_rate": 7.968424562420966e-05, "loss": 0.8856, "step": 11920 }, { "epoch": 0.5958445709719309, "grad_norm": 1.2337510585784912, "learning_rate": 7.965267081364644e-05, "loss": 1.0286, "step": 11930 }, { "epoch": 0.5963440215762661, "grad_norm": 1.8494173288345337, "learning_rate": 7.96210777520005e-05, "loss": 0.8438, "step": 11940 }, { "epoch": 0.5968434721806013, "grad_norm": 1.8579139709472656, "learning_rate": 7.958946645871719e-05, "loss": 0.7892, "step": 11950 }, { "epoch": 0.5973429227849366, "grad_norm": 1.6124542951583862, "learning_rate": 7.955783695325315e-05, "loss": 1.0447, "step": 11960 }, { "epoch": 0.5978423733892718, "grad_norm": 1.4833028316497803, "learning_rate": 7.952618925507614e-05, "loss": 0.9831, "step": 11970 }, { "epoch": 0.598341823993607, "grad_norm": 1.6952725648880005, "learning_rate": 7.94945233836652e-05, "loss": 0.891, "step": 11980 }, { "epoch": 0.5988412745979422, "grad_norm": 2.0533688068389893, "learning_rate": 7.946283935851057e-05, "loss": 0.9172, "step": 11990 }, { "epoch": 0.5993407252022775, "grad_norm": 1.5406326055526733, "learning_rate": 7.943113719911355e-05, "loss": 0.9592, "step": 12000 }, { "epoch": 0.5998401758066128, "grad_norm": 1.6479825973510742, "learning_rate": 7.939941692498674e-05, "loss": 0.8748, "step": 12010 }, { "epoch": 0.600339626410948, "grad_norm": 1.3644030094146729, "learning_rate": 7.936767855565376e-05, "loss": 0.7153, "step": 12020 }, { "epoch": 0.6008390770152832, "grad_norm": 2.766441822052002, "learning_rate": 7.933592211064949e-05, "loss": 0.9166, "step": 12030 }, { "epoch": 0.6013385276196184, "grad_norm": 3.7731025218963623, "learning_rate": 7.930414760951983e-05, "loss": 0.8657, "step": 12040 }, { "epoch": 0.6018379782239537, "grad_norm": 2.5364325046539307, "learning_rate": 7.927235507182186e-05, "loss": 0.8775, "step": 12050 }, { "epoch": 0.6023374288282889, "grad_norm": 2.266455888748169, "learning_rate": 7.924054451712375e-05, "loss": 1.0283, "step": 12060 }, { "epoch": 0.6028368794326241, "grad_norm": 1.0789244174957275, "learning_rate": 7.920871596500472e-05, "loss": 0.7952, "step": 12070 }, { "epoch": 0.6033363300369593, "grad_norm": 3.885934352874756, "learning_rate": 7.917686943505514e-05, "loss": 0.9007, "step": 12080 }, { "epoch": 0.6038357806412946, "grad_norm": 1.2866015434265137, "learning_rate": 7.914500494687637e-05, "loss": 0.6869, "step": 12090 }, { "epoch": 0.6043352312456298, "grad_norm": 3.5734331607818604, "learning_rate": 7.911312252008086e-05, "loss": 1.001, "step": 12100 }, { "epoch": 0.604834681849965, "grad_norm": 2.1679513454437256, "learning_rate": 7.908122217429212e-05, "loss": 0.897, "step": 12110 }, { "epoch": 0.6053341324543002, "grad_norm": 2.468299627304077, "learning_rate": 7.904930392914464e-05, "loss": 0.7623, "step": 12120 }, { "epoch": 0.6058335830586355, "grad_norm": 2.6202375888824463, "learning_rate": 7.901736780428394e-05, "loss": 0.8243, "step": 12130 }, { "epoch": 0.6063330336629708, "grad_norm": 1.9101169109344482, "learning_rate": 7.898541381936662e-05, "loss": 0.7416, "step": 12140 }, { "epoch": 0.606832484267306, "grad_norm": 0.6252166628837585, "learning_rate": 7.895344199406017e-05, "loss": 0.8222, "step": 12150 }, { "epoch": 0.6073319348716412, "grad_norm": 1.0127849578857422, "learning_rate": 7.89214523480431e-05, "loss": 0.7764, "step": 12160 }, { "epoch": 0.6078313854759764, "grad_norm": 0.734090268611908, "learning_rate": 7.888944490100487e-05, "loss": 1.0371, "step": 12170 }, { "epoch": 0.6083308360803117, "grad_norm": 1.8829647302627563, "learning_rate": 7.885741967264595e-05, "loss": 0.9722, "step": 12180 }, { "epoch": 0.6088302866846469, "grad_norm": 2.159318685531616, "learning_rate": 7.882537668267773e-05, "loss": 0.6813, "step": 12190 }, { "epoch": 0.6093297372889821, "grad_norm": 2.9517064094543457, "learning_rate": 7.879331595082249e-05, "loss": 0.9704, "step": 12200 }, { "epoch": 0.6098291878933173, "grad_norm": 1.1465985774993896, "learning_rate": 7.876123749681349e-05, "loss": 0.9773, "step": 12210 }, { "epoch": 0.6103286384976526, "grad_norm": 0.9804210066795349, "learning_rate": 7.872914134039484e-05, "loss": 0.7583, "step": 12220 }, { "epoch": 0.6108280891019878, "grad_norm": 3.645322561264038, "learning_rate": 7.869702750132162e-05, "loss": 0.851, "step": 12230 }, { "epoch": 0.611327539706323, "grad_norm": 1.5942487716674805, "learning_rate": 7.866489599935971e-05, "loss": 0.794, "step": 12240 }, { "epoch": 0.6118269903106582, "grad_norm": 2.031911849975586, "learning_rate": 7.863274685428594e-05, "loss": 1.1151, "step": 12250 }, { "epoch": 0.6123264409149936, "grad_norm": 1.527058482170105, "learning_rate": 7.860058008588791e-05, "loss": 0.8269, "step": 12260 }, { "epoch": 0.6128258915193288, "grad_norm": 2.355426549911499, "learning_rate": 7.856839571396417e-05, "loss": 0.7066, "step": 12270 }, { "epoch": 0.613325342123664, "grad_norm": 3.1753835678100586, "learning_rate": 7.853619375832404e-05, "loss": 0.6978, "step": 12280 }, { "epoch": 0.6138247927279992, "grad_norm": 3.317664623260498, "learning_rate": 7.850397423878766e-05, "loss": 0.7919, "step": 12290 }, { "epoch": 0.6143242433323345, "grad_norm": 2.4618120193481445, "learning_rate": 7.8471737175186e-05, "loss": 0.9619, "step": 12300 }, { "epoch": 0.6148236939366697, "grad_norm": 1.3759201765060425, "learning_rate": 7.843948258736082e-05, "loss": 0.8436, "step": 12310 }, { "epoch": 0.6153231445410049, "grad_norm": 3.534787178039551, "learning_rate": 7.840721049516468e-05, "loss": 0.845, "step": 12320 }, { "epoch": 0.6158225951453401, "grad_norm": 1.6224181652069092, "learning_rate": 7.837492091846092e-05, "loss": 0.7444, "step": 12330 }, { "epoch": 0.6163220457496753, "grad_norm": 1.3301310539245605, "learning_rate": 7.83426138771236e-05, "loss": 0.8436, "step": 12340 }, { "epoch": 0.6168214963540106, "grad_norm": 1.7994141578674316, "learning_rate": 7.831028939103757e-05, "loss": 0.8018, "step": 12350 }, { "epoch": 0.6173209469583458, "grad_norm": 1.82992684841156, "learning_rate": 7.82779474800984e-05, "loss": 1.1498, "step": 12360 }, { "epoch": 0.617820397562681, "grad_norm": 0.945503294467926, "learning_rate": 7.824558816421237e-05, "loss": 0.8305, "step": 12370 }, { "epoch": 0.6183198481670162, "grad_norm": 0.9997421503067017, "learning_rate": 7.821321146329652e-05, "loss": 1.3226, "step": 12380 }, { "epoch": 0.6188192987713516, "grad_norm": 2.531658411026001, "learning_rate": 7.818081739727855e-05, "loss": 0.7832, "step": 12390 }, { "epoch": 0.6193187493756868, "grad_norm": 2.9111216068267822, "learning_rate": 7.814840598609686e-05, "loss": 0.8982, "step": 12400 }, { "epoch": 0.619818199980022, "grad_norm": 2.1175637245178223, "learning_rate": 7.811597724970051e-05, "loss": 0.9697, "step": 12410 }, { "epoch": 0.6203176505843572, "grad_norm": 1.2090173959732056, "learning_rate": 7.808353120804926e-05, "loss": 0.6774, "step": 12420 }, { "epoch": 0.6208171011886925, "grad_norm": 2.112946033477783, "learning_rate": 7.805106788111347e-05, "loss": 0.9978, "step": 12430 }, { "epoch": 0.6213165517930277, "grad_norm": 2.040161609649658, "learning_rate": 7.801858728887421e-05, "loss": 0.8364, "step": 12440 }, { "epoch": 0.6218160023973629, "grad_norm": 2.338141679763794, "learning_rate": 7.798608945132311e-05, "loss": 0.8842, "step": 12450 }, { "epoch": 0.6223154530016981, "grad_norm": 3.354522705078125, "learning_rate": 7.795357438846243e-05, "loss": 0.9676, "step": 12460 }, { "epoch": 0.6228149036060333, "grad_norm": 1.1040174961090088, "learning_rate": 7.792104212030506e-05, "loss": 0.7489, "step": 12470 }, { "epoch": 0.6233143542103686, "grad_norm": 1.954264760017395, "learning_rate": 7.788849266687446e-05, "loss": 1.0124, "step": 12480 }, { "epoch": 0.6238138048147038, "grad_norm": 2.1573355197906494, "learning_rate": 7.785592604820466e-05, "loss": 0.7793, "step": 12490 }, { "epoch": 0.624313255419039, "grad_norm": 2.3539342880249023, "learning_rate": 7.782334228434028e-05, "loss": 1.0291, "step": 12500 }, { "epoch": 0.6248127060233742, "grad_norm": 1.8481172323226929, "learning_rate": 7.779074139533647e-05, "loss": 1.0117, "step": 12510 }, { "epoch": 0.6253121566277096, "grad_norm": 2.7877354621887207, "learning_rate": 7.77581234012589e-05, "loss": 0.8698, "step": 12520 }, { "epoch": 0.6258116072320448, "grad_norm": 4.730374336242676, "learning_rate": 7.772548832218383e-05, "loss": 1.0457, "step": 12530 }, { "epoch": 0.62631105783638, "grad_norm": 1.8074944019317627, "learning_rate": 7.7692836178198e-05, "loss": 0.9615, "step": 12540 }, { "epoch": 0.6268105084407152, "grad_norm": 0.8964559435844421, "learning_rate": 7.766016698939864e-05, "loss": 0.6061, "step": 12550 }, { "epoch": 0.6273099590450505, "grad_norm": 5.012725830078125, "learning_rate": 7.76274807758935e-05, "loss": 0.9625, "step": 12560 }, { "epoch": 0.6278094096493857, "grad_norm": 1.6120495796203613, "learning_rate": 7.759477755780078e-05, "loss": 0.8735, "step": 12570 }, { "epoch": 0.6283088602537209, "grad_norm": 2.0828094482421875, "learning_rate": 7.756205735524917e-05, "loss": 0.6527, "step": 12580 }, { "epoch": 0.6288083108580561, "grad_norm": 1.591010332107544, "learning_rate": 7.75293201883778e-05, "loss": 0.8747, "step": 12590 }, { "epoch": 0.6293077614623914, "grad_norm": 2.5814242362976074, "learning_rate": 7.749656607733624e-05, "loss": 0.9139, "step": 12600 }, { "epoch": 0.6298072120667266, "grad_norm": 1.6371376514434814, "learning_rate": 7.746379504228452e-05, "loss": 0.8774, "step": 12610 }, { "epoch": 0.6303066626710618, "grad_norm": 1.6633555889129639, "learning_rate": 7.743100710339304e-05, "loss": 0.8397, "step": 12620 }, { "epoch": 0.630806113275397, "grad_norm": 4.060708999633789, "learning_rate": 7.739820228084261e-05, "loss": 0.8574, "step": 12630 }, { "epoch": 0.6313055638797322, "grad_norm": 2.887648820877075, "learning_rate": 7.736538059482447e-05, "loss": 0.9689, "step": 12640 }, { "epoch": 0.6318050144840676, "grad_norm": 0.8675547242164612, "learning_rate": 7.733254206554024e-05, "loss": 0.7439, "step": 12650 }, { "epoch": 0.6323044650884028, "grad_norm": 1.7451297044754028, "learning_rate": 7.729968671320185e-05, "loss": 0.7079, "step": 12660 }, { "epoch": 0.632803915692738, "grad_norm": 2.2067673206329346, "learning_rate": 7.726681455803161e-05, "loss": 0.7516, "step": 12670 }, { "epoch": 0.6333033662970732, "grad_norm": 2.3583004474639893, "learning_rate": 7.723392562026221e-05, "loss": 0.8686, "step": 12680 }, { "epoch": 0.6338028169014085, "grad_norm": 3.745316505432129, "learning_rate": 7.720101992013662e-05, "loss": 0.843, "step": 12690 }, { "epoch": 0.6343022675057437, "grad_norm": 0.6729642152786255, "learning_rate": 7.716809747790817e-05, "loss": 0.8411, "step": 12700 }, { "epoch": 0.6348017181100789, "grad_norm": 1.1551730632781982, "learning_rate": 7.713515831384044e-05, "loss": 0.7959, "step": 12710 }, { "epoch": 0.6353011687144141, "grad_norm": 1.2299566268920898, "learning_rate": 7.710220244820736e-05, "loss": 0.9145, "step": 12720 }, { "epoch": 0.6358006193187494, "grad_norm": 1.5636696815490723, "learning_rate": 7.706922990129309e-05, "loss": 0.9513, "step": 12730 }, { "epoch": 0.6363000699230846, "grad_norm": 3.4296226501464844, "learning_rate": 7.703624069339211e-05, "loss": 0.9121, "step": 12740 }, { "epoch": 0.6367995205274198, "grad_norm": 3.456554889678955, "learning_rate": 7.700323484480911e-05, "loss": 0.858, "step": 12750 }, { "epoch": 0.637298971131755, "grad_norm": 2.2380423545837402, "learning_rate": 7.697021237585906e-05, "loss": 0.8616, "step": 12760 }, { "epoch": 0.6377984217360902, "grad_norm": 2.1804730892181396, "learning_rate": 7.693717330686709e-05, "loss": 0.9048, "step": 12770 }, { "epoch": 0.6382978723404256, "grad_norm": 2.1640286445617676, "learning_rate": 7.690411765816864e-05, "loss": 0.9936, "step": 12780 }, { "epoch": 0.6387973229447608, "grad_norm": 1.3663392066955566, "learning_rate": 7.687104545010928e-05, "loss": 0.9104, "step": 12790 }, { "epoch": 0.639296773549096, "grad_norm": 1.6175613403320312, "learning_rate": 7.683795670304484e-05, "loss": 0.7661, "step": 12800 }, { "epoch": 0.6397962241534312, "grad_norm": 2.310173273086548, "learning_rate": 7.680485143734125e-05, "loss": 0.9265, "step": 12810 }, { "epoch": 0.6402956747577665, "grad_norm": 1.0012131929397583, "learning_rate": 7.677172967337467e-05, "loss": 0.8629, "step": 12820 }, { "epoch": 0.6407951253621017, "grad_norm": 1.1713634729385376, "learning_rate": 7.673859143153139e-05, "loss": 0.7713, "step": 12830 }, { "epoch": 0.6412945759664369, "grad_norm": 2.488898277282715, "learning_rate": 7.670543673220786e-05, "loss": 0.825, "step": 12840 }, { "epoch": 0.6417940265707721, "grad_norm": 1.26347017288208, "learning_rate": 7.667226559581062e-05, "loss": 0.7914, "step": 12850 }, { "epoch": 0.6422934771751074, "grad_norm": 2.276587724685669, "learning_rate": 7.663907804275636e-05, "loss": 1.0954, "step": 12860 }, { "epoch": 0.6427929277794426, "grad_norm": 0.8161036968231201, "learning_rate": 7.660587409347187e-05, "loss": 0.7575, "step": 12870 }, { "epoch": 0.6432923783837778, "grad_norm": 0.5742631554603577, "learning_rate": 7.657265376839405e-05, "loss": 0.6672, "step": 12880 }, { "epoch": 0.643791828988113, "grad_norm": 2.646259307861328, "learning_rate": 7.653941708796981e-05, "loss": 0.9497, "step": 12890 }, { "epoch": 0.6442912795924483, "grad_norm": 0.9056533575057983, "learning_rate": 7.650616407265623e-05, "loss": 0.8258, "step": 12900 }, { "epoch": 0.6447907301967836, "grad_norm": 3.490792751312256, "learning_rate": 7.647289474292032e-05, "loss": 0.7076, "step": 12910 }, { "epoch": 0.6452901808011188, "grad_norm": 1.7089931964874268, "learning_rate": 7.643960911923926e-05, "loss": 0.7547, "step": 12920 }, { "epoch": 0.645789631405454, "grad_norm": 1.3501700162887573, "learning_rate": 7.640630722210018e-05, "loss": 0.8654, "step": 12930 }, { "epoch": 0.6462890820097892, "grad_norm": 1.3468042612075806, "learning_rate": 7.637298907200024e-05, "loss": 0.7137, "step": 12940 }, { "epoch": 0.6467885326141245, "grad_norm": 1.3366823196411133, "learning_rate": 7.633965468944662e-05, "loss": 0.7872, "step": 12950 }, { "epoch": 0.6472879832184597, "grad_norm": 2.4136240482330322, "learning_rate": 7.630630409495645e-05, "loss": 0.8831, "step": 12960 }, { "epoch": 0.6477874338227949, "grad_norm": 3.5967211723327637, "learning_rate": 7.627293730905689e-05, "loss": 0.8968, "step": 12970 }, { "epoch": 0.6482868844271301, "grad_norm": 4.377511501312256, "learning_rate": 7.623955435228505e-05, "loss": 0.8787, "step": 12980 }, { "epoch": 0.6487863350314654, "grad_norm": 1.7462594509124756, "learning_rate": 7.620615524518797e-05, "loss": 0.9401, "step": 12990 }, { "epoch": 0.6492857856358006, "grad_norm": 1.7474712133407593, "learning_rate": 7.617274000832266e-05, "loss": 0.9544, "step": 13000 }, { "epoch": 0.6497852362401358, "grad_norm": 0.8920682668685913, "learning_rate": 7.613930866225604e-05, "loss": 0.898, "step": 13010 }, { "epoch": 0.650284686844471, "grad_norm": 1.4357318878173828, "learning_rate": 7.610586122756496e-05, "loss": 0.6582, "step": 13020 }, { "epoch": 0.6507841374488064, "grad_norm": 2.6412606239318848, "learning_rate": 7.607239772483614e-05, "loss": 0.909, "step": 13030 }, { "epoch": 0.6512835880531416, "grad_norm": 0.8142654895782471, "learning_rate": 7.603891817466621e-05, "loss": 0.8265, "step": 13040 }, { "epoch": 0.6517830386574768, "grad_norm": 2.4540538787841797, "learning_rate": 7.600542259766173e-05, "loss": 0.6881, "step": 13050 }, { "epoch": 0.652282489261812, "grad_norm": 1.987032175064087, "learning_rate": 7.597191101443901e-05, "loss": 0.7851, "step": 13060 }, { "epoch": 0.6527819398661472, "grad_norm": 2.8409721851348877, "learning_rate": 7.593838344562432e-05, "loss": 0.626, "step": 13070 }, { "epoch": 0.6532813904704825, "grad_norm": 4.194586753845215, "learning_rate": 7.590483991185369e-05, "loss": 0.7684, "step": 13080 }, { "epoch": 0.6537808410748177, "grad_norm": 2.416386127471924, "learning_rate": 7.587128043377304e-05, "loss": 0.9145, "step": 13090 }, { "epoch": 0.6542802916791529, "grad_norm": 4.792058944702148, "learning_rate": 7.583770503203807e-05, "loss": 1.0233, "step": 13100 }, { "epoch": 0.6547797422834881, "grad_norm": 1.2306267023086548, "learning_rate": 7.580411372731426e-05, "loss": 0.7519, "step": 13110 }, { "epoch": 0.6552791928878234, "grad_norm": 1.764892816543579, "learning_rate": 7.577050654027693e-05, "loss": 0.9306, "step": 13120 }, { "epoch": 0.6557786434921586, "grad_norm": 1.615273356437683, "learning_rate": 7.573688349161115e-05, "loss": 0.8097, "step": 13130 }, { "epoch": 0.6562780940964938, "grad_norm": 1.7956855297088623, "learning_rate": 7.570324460201174e-05, "loss": 0.8041, "step": 13140 }, { "epoch": 0.656777544700829, "grad_norm": 2.333482503890991, "learning_rate": 7.566958989218329e-05, "loss": 1.0211, "step": 13150 }, { "epoch": 0.6572769953051644, "grad_norm": 1.5992728471755981, "learning_rate": 7.563591938284011e-05, "loss": 0.8495, "step": 13160 }, { "epoch": 0.6577764459094996, "grad_norm": 2.574862480163574, "learning_rate": 7.560223309470627e-05, "loss": 0.773, "step": 13170 }, { "epoch": 0.6582758965138348, "grad_norm": 2.4205756187438965, "learning_rate": 7.556853104851548e-05, "loss": 0.945, "step": 13180 }, { "epoch": 0.65877534711817, "grad_norm": 5.572146892547607, "learning_rate": 7.553481326501124e-05, "loss": 1.1562, "step": 13190 }, { "epoch": 0.6592747977225052, "grad_norm": 1.2964991331100464, "learning_rate": 7.550107976494665e-05, "loss": 0.7917, "step": 13200 }, { "epoch": 0.6597742483268405, "grad_norm": 1.5800459384918213, "learning_rate": 7.546733056908456e-05, "loss": 0.8779, "step": 13210 }, { "epoch": 0.6602736989311757, "grad_norm": 2.146944999694824, "learning_rate": 7.543356569819744e-05, "loss": 0.9196, "step": 13220 }, { "epoch": 0.6607731495355109, "grad_norm": 2.837876081466675, "learning_rate": 7.539978517306738e-05, "loss": 0.7187, "step": 13230 }, { "epoch": 0.6612726001398461, "grad_norm": 3.4195945262908936, "learning_rate": 7.536598901448617e-05, "loss": 0.7647, "step": 13240 }, { "epoch": 0.6617720507441814, "grad_norm": 2.067004680633545, "learning_rate": 7.533217724325519e-05, "loss": 1.1108, "step": 13250 }, { "epoch": 0.6622715013485166, "grad_norm": 1.316638469696045, "learning_rate": 7.529834988018542e-05, "loss": 0.7777, "step": 13260 }, { "epoch": 0.6627709519528519, "grad_norm": 2.552154064178467, "learning_rate": 7.526450694609745e-05, "loss": 1.0747, "step": 13270 }, { "epoch": 0.663270402557187, "grad_norm": 2.6598401069641113, "learning_rate": 7.523064846182145e-05, "loss": 0.7556, "step": 13280 }, { "epoch": 0.6637698531615224, "grad_norm": 0.7628982663154602, "learning_rate": 7.519677444819714e-05, "loss": 0.8351, "step": 13290 }, { "epoch": 0.6642693037658576, "grad_norm": 1.2733036279678345, "learning_rate": 7.516288492607388e-05, "loss": 0.9932, "step": 13300 }, { "epoch": 0.6647687543701928, "grad_norm": 1.3308387994766235, "learning_rate": 7.512897991631045e-05, "loss": 0.7084, "step": 13310 }, { "epoch": 0.665268204974528, "grad_norm": 2.271099328994751, "learning_rate": 7.509505943977526e-05, "loss": 0.9023, "step": 13320 }, { "epoch": 0.6657676555788633, "grad_norm": 3.208682060241699, "learning_rate": 7.50611235173462e-05, "loss": 0.8111, "step": 13330 }, { "epoch": 0.6662671061831985, "grad_norm": 3.4175901412963867, "learning_rate": 7.502717216991069e-05, "loss": 0.9072, "step": 13340 }, { "epoch": 0.6667665567875337, "grad_norm": 2.390958786010742, "learning_rate": 7.499320541836559e-05, "loss": 0.7867, "step": 13350 }, { "epoch": 0.6672660073918689, "grad_norm": 6.080948352813721, "learning_rate": 7.495922328361733e-05, "loss": 1.1479, "step": 13360 }, { "epoch": 0.6677654579962041, "grad_norm": 1.4552528858184814, "learning_rate": 7.492522578658171e-05, "loss": 0.9252, "step": 13370 }, { "epoch": 0.6682649086005394, "grad_norm": 1.0324900150299072, "learning_rate": 7.489121294818407e-05, "loss": 0.6551, "step": 13380 }, { "epoch": 0.6687643592048746, "grad_norm": 2.4894304275512695, "learning_rate": 7.485718478935913e-05, "loss": 1.0623, "step": 13390 }, { "epoch": 0.6692638098092099, "grad_norm": 2.6332945823669434, "learning_rate": 7.482314133105108e-05, "loss": 0.8745, "step": 13400 }, { "epoch": 0.6697632604135451, "grad_norm": 3.0975561141967773, "learning_rate": 7.478908259421351e-05, "loss": 1.0553, "step": 13410 }, { "epoch": 0.6702627110178804, "grad_norm": 1.7106579542160034, "learning_rate": 7.475500859980942e-05, "loss": 1.0071, "step": 13420 }, { "epoch": 0.6707621616222156, "grad_norm": 1.1007097959518433, "learning_rate": 7.47209193688112e-05, "loss": 0.9686, "step": 13430 }, { "epoch": 0.6712616122265508, "grad_norm": 4.405452728271484, "learning_rate": 7.46868149222006e-05, "loss": 0.8554, "step": 13440 }, { "epoch": 0.671761062830886, "grad_norm": 1.4012975692749023, "learning_rate": 7.465269528096875e-05, "loss": 0.9445, "step": 13450 }, { "epoch": 0.6722605134352213, "grad_norm": 1.4058866500854492, "learning_rate": 7.461856046611614e-05, "loss": 0.7213, "step": 13460 }, { "epoch": 0.6727599640395565, "grad_norm": 2.2778289318084717, "learning_rate": 7.45844104986526e-05, "loss": 0.8677, "step": 13470 }, { "epoch": 0.6732594146438917, "grad_norm": 1.989430546760559, "learning_rate": 7.455024539959727e-05, "loss": 0.8598, "step": 13480 }, { "epoch": 0.6737588652482269, "grad_norm": 0.7515740394592285, "learning_rate": 7.451606518997862e-05, "loss": 0.8142, "step": 13490 }, { "epoch": 0.6742583158525621, "grad_norm": 1.0352113246917725, "learning_rate": 7.44818698908344e-05, "loss": 0.8191, "step": 13500 }, { "epoch": 0.6747577664568974, "grad_norm": 1.4111157655715942, "learning_rate": 7.444765952321164e-05, "loss": 0.8052, "step": 13510 }, { "epoch": 0.6752572170612327, "grad_norm": 0.8759949803352356, "learning_rate": 7.441343410816671e-05, "loss": 0.7079, "step": 13520 }, { "epoch": 0.6757566676655679, "grad_norm": 1.5545209646224976, "learning_rate": 7.437919366676517e-05, "loss": 0.8178, "step": 13530 }, { "epoch": 0.6762561182699031, "grad_norm": 6.742189884185791, "learning_rate": 7.434493822008187e-05, "loss": 0.9947, "step": 13540 }, { "epoch": 0.6767555688742384, "grad_norm": 3.9772582054138184, "learning_rate": 7.431066778920086e-05, "loss": 1.0145, "step": 13550 }, { "epoch": 0.6772550194785736, "grad_norm": 3.103926420211792, "learning_rate": 7.427638239521543e-05, "loss": 1.2054, "step": 13560 }, { "epoch": 0.6777544700829088, "grad_norm": 1.726408839225769, "learning_rate": 7.424208205922812e-05, "loss": 0.7034, "step": 13570 }, { "epoch": 0.678253920687244, "grad_norm": 2.6054139137268066, "learning_rate": 7.420776680235058e-05, "loss": 0.8105, "step": 13580 }, { "epoch": 0.6787533712915793, "grad_norm": 4.8610920906066895, "learning_rate": 7.417343664570372e-05, "loss": 0.7722, "step": 13590 }, { "epoch": 0.6792528218959145, "grad_norm": 2.0720362663269043, "learning_rate": 7.413909161041759e-05, "loss": 0.9086, "step": 13600 }, { "epoch": 0.6797522725002497, "grad_norm": 1.0414842367172241, "learning_rate": 7.410473171763141e-05, "loss": 0.6482, "step": 13610 }, { "epoch": 0.6802517231045849, "grad_norm": 1.2426433563232422, "learning_rate": 7.407035698849352e-05, "loss": 0.7256, "step": 13620 }, { "epoch": 0.6807511737089202, "grad_norm": 3.9983971118927, "learning_rate": 7.403596744416141e-05, "loss": 1.1534, "step": 13630 }, { "epoch": 0.6812506243132554, "grad_norm": 2.105839490890503, "learning_rate": 7.40015631058017e-05, "loss": 0.9809, "step": 13640 }, { "epoch": 0.6817500749175907, "grad_norm": 1.7513221502304077, "learning_rate": 7.39671439945901e-05, "loss": 0.8554, "step": 13650 }, { "epoch": 0.6822495255219259, "grad_norm": 4.3615522384643555, "learning_rate": 7.393271013171142e-05, "loss": 0.976, "step": 13660 }, { "epoch": 0.6827489761262611, "grad_norm": 2.0290863513946533, "learning_rate": 7.389826153835951e-05, "loss": 0.8127, "step": 13670 }, { "epoch": 0.6832484267305964, "grad_norm": 0.8345702886581421, "learning_rate": 7.386379823573736e-05, "loss": 1.0663, "step": 13680 }, { "epoch": 0.6837478773349316, "grad_norm": 1.057742953300476, "learning_rate": 7.382932024505695e-05, "loss": 0.7797, "step": 13690 }, { "epoch": 0.6842473279392668, "grad_norm": 3.281606674194336, "learning_rate": 7.379482758753936e-05, "loss": 0.7851, "step": 13700 }, { "epoch": 0.684746778543602, "grad_norm": 5.573545932769775, "learning_rate": 7.37603202844146e-05, "loss": 0.6852, "step": 13710 }, { "epoch": 0.6852462291479373, "grad_norm": 2.6311960220336914, "learning_rate": 7.372579835692182e-05, "loss": 0.8513, "step": 13720 }, { "epoch": 0.6857456797522725, "grad_norm": 1.099637508392334, "learning_rate": 7.369126182630907e-05, "loss": 0.7877, "step": 13730 }, { "epoch": 0.6862451303566077, "grad_norm": 4.3573503494262695, "learning_rate": 7.365671071383345e-05, "loss": 0.8938, "step": 13740 }, { "epoch": 0.6867445809609429, "grad_norm": 2.802907705307007, "learning_rate": 7.362214504076097e-05, "loss": 1.0123, "step": 13750 }, { "epoch": 0.6872440315652782, "grad_norm": 2.6232540607452393, "learning_rate": 7.35875648283667e-05, "loss": 0.7469, "step": 13760 }, { "epoch": 0.6877434821696135, "grad_norm": 2.878917932510376, "learning_rate": 7.355297009793456e-05, "loss": 0.7885, "step": 13770 }, { "epoch": 0.6882429327739487, "grad_norm": 2.5070621967315674, "learning_rate": 7.351836087075748e-05, "loss": 0.9247, "step": 13780 }, { "epoch": 0.6887423833782839, "grad_norm": 5.181169033050537, "learning_rate": 7.348373716813723e-05, "loss": 0.8374, "step": 13790 }, { "epoch": 0.6892418339826191, "grad_norm": 1.5600733757019043, "learning_rate": 7.34490990113846e-05, "loss": 0.7839, "step": 13800 }, { "epoch": 0.6897412845869544, "grad_norm": 1.6517608165740967, "learning_rate": 7.341444642181917e-05, "loss": 0.7345, "step": 13810 }, { "epoch": 0.6902407351912896, "grad_norm": 1.5919768810272217, "learning_rate": 7.337977942076948e-05, "loss": 0.6954, "step": 13820 }, { "epoch": 0.6907401857956248, "grad_norm": 1.4466248750686646, "learning_rate": 7.33450980295729e-05, "loss": 0.9392, "step": 13830 }, { "epoch": 0.69123963639996, "grad_norm": 0.9786671996116638, "learning_rate": 7.331040226957566e-05, "loss": 0.6875, "step": 13840 }, { "epoch": 0.6917390870042953, "grad_norm": 4.586532115936279, "learning_rate": 7.327569216213283e-05, "loss": 1.2082, "step": 13850 }, { "epoch": 0.6922385376086305, "grad_norm": 2.9234001636505127, "learning_rate": 7.324096772860837e-05, "loss": 0.9455, "step": 13860 }, { "epoch": 0.6927379882129657, "grad_norm": 5.324400424957275, "learning_rate": 7.320622899037496e-05, "loss": 0.8514, "step": 13870 }, { "epoch": 0.6932374388173009, "grad_norm": 1.910937786102295, "learning_rate": 7.317147596881416e-05, "loss": 1.0465, "step": 13880 }, { "epoch": 0.6937368894216362, "grad_norm": 1.2394311428070068, "learning_rate": 7.313670868531628e-05, "loss": 0.8079, "step": 13890 }, { "epoch": 0.6942363400259715, "grad_norm": 1.649618148803711, "learning_rate": 7.310192716128043e-05, "loss": 1.0114, "step": 13900 }, { "epoch": 0.6947357906303067, "grad_norm": 0.8796854019165039, "learning_rate": 7.306713141811448e-05, "loss": 0.7572, "step": 13910 }, { "epoch": 0.6952352412346419, "grad_norm": 1.5423787832260132, "learning_rate": 7.303232147723504e-05, "loss": 0.8204, "step": 13920 }, { "epoch": 0.6957346918389771, "grad_norm": 3.8138532638549805, "learning_rate": 7.299749736006748e-05, "loss": 1.0489, "step": 13930 }, { "epoch": 0.6962341424433124, "grad_norm": 1.8880746364593506, "learning_rate": 7.29626590880459e-05, "loss": 0.8099, "step": 13940 }, { "epoch": 0.6967335930476476, "grad_norm": 1.2062681913375854, "learning_rate": 7.292780668261306e-05, "loss": 0.8952, "step": 13950 }, { "epoch": 0.6972330436519828, "grad_norm": 1.6811144351959229, "learning_rate": 7.289294016522048e-05, "loss": 0.9711, "step": 13960 }, { "epoch": 0.697732494256318, "grad_norm": 3.8066329956054688, "learning_rate": 7.285805955732833e-05, "loss": 0.8488, "step": 13970 }, { "epoch": 0.6982319448606533, "grad_norm": 3.49137544631958, "learning_rate": 7.282316488040546e-05, "loss": 0.7784, "step": 13980 }, { "epoch": 0.6987313954649885, "grad_norm": 2.736736536026001, "learning_rate": 7.278825615592942e-05, "loss": 0.8092, "step": 13990 }, { "epoch": 0.6992308460693237, "grad_norm": 1.3519524335861206, "learning_rate": 7.27533334053863e-05, "loss": 0.6568, "step": 14000 }, { "epoch": 0.6997302966736589, "grad_norm": 2.622758388519287, "learning_rate": 7.271839665027098e-05, "loss": 1.0338, "step": 14010 }, { "epoch": 0.7002297472779943, "grad_norm": 1.6801555156707764, "learning_rate": 7.268344591208679e-05, "loss": 0.7619, "step": 14020 }, { "epoch": 0.7007291978823295, "grad_norm": 1.671966552734375, "learning_rate": 7.264848121234581e-05, "loss": 0.9252, "step": 14030 }, { "epoch": 0.7012286484866647, "grad_norm": 3.0895462036132812, "learning_rate": 7.261350257256861e-05, "loss": 0.9392, "step": 14040 }, { "epoch": 0.7017280990909999, "grad_norm": 1.9470264911651611, "learning_rate": 7.257851001428442e-05, "loss": 0.863, "step": 14050 }, { "epoch": 0.7022275496953352, "grad_norm": 2.663722276687622, "learning_rate": 7.254350355903095e-05, "loss": 0.7673, "step": 14060 }, { "epoch": 0.7027270002996704, "grad_norm": 3.543322801589966, "learning_rate": 7.250848322835458e-05, "loss": 0.8099, "step": 14070 }, { "epoch": 0.7032264509040056, "grad_norm": 5.451095104217529, "learning_rate": 7.24734490438101e-05, "loss": 0.8569, "step": 14080 }, { "epoch": 0.7037259015083408, "grad_norm": 1.6805773973464966, "learning_rate": 7.243840102696092e-05, "loss": 0.8407, "step": 14090 }, { "epoch": 0.704225352112676, "grad_norm": 1.407873511314392, "learning_rate": 7.240333919937893e-05, "loss": 1.1547, "step": 14100 }, { "epoch": 0.7047248027170113, "grad_norm": 4.562381267547607, "learning_rate": 7.236826358264452e-05, "loss": 1.0369, "step": 14110 }, { "epoch": 0.7052242533213465, "grad_norm": 2.872616767883301, "learning_rate": 7.233317419834657e-05, "loss": 1.013, "step": 14120 }, { "epoch": 0.7057237039256817, "grad_norm": 1.679598331451416, "learning_rate": 7.229807106808244e-05, "loss": 0.8949, "step": 14130 }, { "epoch": 0.7062231545300169, "grad_norm": 2.2586183547973633, "learning_rate": 7.226295421345793e-05, "loss": 0.8691, "step": 14140 }, { "epoch": 0.7067226051343523, "grad_norm": 3.6093177795410156, "learning_rate": 7.222782365608733e-05, "loss": 1.0768, "step": 14150 }, { "epoch": 0.7072220557386875, "grad_norm": 0.9443897008895874, "learning_rate": 7.219267941759333e-05, "loss": 0.7503, "step": 14160 }, { "epoch": 0.7077215063430227, "grad_norm": 1.0663303136825562, "learning_rate": 7.215752151960702e-05, "loss": 0.7296, "step": 14170 }, { "epoch": 0.7082209569473579, "grad_norm": 3.196075677871704, "learning_rate": 7.212234998376796e-05, "loss": 0.8507, "step": 14180 }, { "epoch": 0.7087204075516932, "grad_norm": 1.3957006931304932, "learning_rate": 7.208716483172404e-05, "loss": 0.9945, "step": 14190 }, { "epoch": 0.7092198581560284, "grad_norm": 2.797950029373169, "learning_rate": 7.205196608513159e-05, "loss": 0.8203, "step": 14200 }, { "epoch": 0.7097193087603636, "grad_norm": 2.4570584297180176, "learning_rate": 7.201675376565525e-05, "loss": 0.9767, "step": 14210 }, { "epoch": 0.7102187593646988, "grad_norm": 3.108165979385376, "learning_rate": 7.198152789496804e-05, "loss": 0.8216, "step": 14220 }, { "epoch": 0.710718209969034, "grad_norm": 0.8162358999252319, "learning_rate": 7.194628849475135e-05, "loss": 0.8584, "step": 14230 }, { "epoch": 0.7112176605733693, "grad_norm": 1.895009994506836, "learning_rate": 7.191103558669486e-05, "loss": 0.9255, "step": 14240 }, { "epoch": 0.7117171111777045, "grad_norm": 3.324361562728882, "learning_rate": 7.187576919249653e-05, "loss": 0.7713, "step": 14250 }, { "epoch": 0.7122165617820397, "grad_norm": 2.6848435401916504, "learning_rate": 7.184048933386274e-05, "loss": 0.8191, "step": 14260 }, { "epoch": 0.7127160123863749, "grad_norm": 1.1187618970870972, "learning_rate": 7.180519603250801e-05, "loss": 0.9038, "step": 14270 }, { "epoch": 0.7132154629907103, "grad_norm": 2.602867364883423, "learning_rate": 7.176988931015523e-05, "loss": 0.876, "step": 14280 }, { "epoch": 0.7137149135950455, "grad_norm": 2.6675899028778076, "learning_rate": 7.173456918853555e-05, "loss": 0.8261, "step": 14290 }, { "epoch": 0.7142143641993807, "grad_norm": 1.920407772064209, "learning_rate": 7.169923568938833e-05, "loss": 0.8312, "step": 14300 }, { "epoch": 0.7147138148037159, "grad_norm": 1.513654112815857, "learning_rate": 7.166388883446113e-05, "loss": 0.6906, "step": 14310 }, { "epoch": 0.7152132654080512, "grad_norm": 2.566261053085327, "learning_rate": 7.162852864550985e-05, "loss": 0.661, "step": 14320 }, { "epoch": 0.7157127160123864, "grad_norm": 2.6750776767730713, "learning_rate": 7.159315514429847e-05, "loss": 0.879, "step": 14330 }, { "epoch": 0.7162121666167216, "grad_norm": 1.8424862623214722, "learning_rate": 7.155776835259926e-05, "loss": 1.0846, "step": 14340 }, { "epoch": 0.7167116172210568, "grad_norm": 1.061425805091858, "learning_rate": 7.15223682921926e-05, "loss": 0.7079, "step": 14350 }, { "epoch": 0.7172110678253921, "grad_norm": 4.195835590362549, "learning_rate": 7.148695498486706e-05, "loss": 1.1204, "step": 14360 }, { "epoch": 0.7177105184297273, "grad_norm": 0.6323617100715637, "learning_rate": 7.145152845241937e-05, "loss": 0.8321, "step": 14370 }, { "epoch": 0.7182099690340625, "grad_norm": 4.118073463439941, "learning_rate": 7.141608871665443e-05, "loss": 0.833, "step": 14380 }, { "epoch": 0.7187094196383977, "grad_norm": 5.030306339263916, "learning_rate": 7.13806357993852e-05, "loss": 0.8106, "step": 14390 }, { "epoch": 0.719208870242733, "grad_norm": 2.2950358390808105, "learning_rate": 7.13451697224328e-05, "loss": 0.7925, "step": 14400 }, { "epoch": 0.7197083208470683, "grad_norm": 2.321904182434082, "learning_rate": 7.130969050762644e-05, "loss": 0.8813, "step": 14410 }, { "epoch": 0.7202077714514035, "grad_norm": 3.0077672004699707, "learning_rate": 7.12741981768034e-05, "loss": 0.7522, "step": 14420 }, { "epoch": 0.7207072220557387, "grad_norm": 2.579281806945801, "learning_rate": 7.123869275180907e-05, "loss": 0.9041, "step": 14430 }, { "epoch": 0.7212066726600739, "grad_norm": 0.8570026159286499, "learning_rate": 7.120317425449683e-05, "loss": 0.822, "step": 14440 }, { "epoch": 0.7217061232644092, "grad_norm": 0.9074950218200684, "learning_rate": 7.116764270672822e-05, "loss": 0.9113, "step": 14450 }, { "epoch": 0.7222055738687444, "grad_norm": 2.054647207260132, "learning_rate": 7.113209813037269e-05, "loss": 0.9178, "step": 14460 }, { "epoch": 0.7227050244730796, "grad_norm": 1.069751262664795, "learning_rate": 7.10965405473078e-05, "loss": 0.824, "step": 14470 }, { "epoch": 0.7232044750774148, "grad_norm": 1.2552647590637207, "learning_rate": 7.106096997941905e-05, "loss": 0.7028, "step": 14480 }, { "epoch": 0.7237039256817501, "grad_norm": 2.4348955154418945, "learning_rate": 7.102538644859997e-05, "loss": 0.8449, "step": 14490 }, { "epoch": 0.7242033762860853, "grad_norm": 2.4546279907226562, "learning_rate": 7.098978997675207e-05, "loss": 0.9386, "step": 14500 }, { "epoch": 0.7247028268904205, "grad_norm": 2.0445778369903564, "learning_rate": 7.095418058578481e-05, "loss": 0.9844, "step": 14510 }, { "epoch": 0.7252022774947557, "grad_norm": 0.9729900360107422, "learning_rate": 7.09185582976156e-05, "loss": 0.8822, "step": 14520 }, { "epoch": 0.725701728099091, "grad_norm": 1.3782198429107666, "learning_rate": 7.08829231341698e-05, "loss": 0.8396, "step": 14530 }, { "epoch": 0.7262011787034263, "grad_norm": 1.925838828086853, "learning_rate": 7.084727511738068e-05, "loss": 0.6616, "step": 14540 }, { "epoch": 0.7267006293077615, "grad_norm": 0.7856277823448181, "learning_rate": 7.081161426918947e-05, "loss": 0.7673, "step": 14550 }, { "epoch": 0.7272000799120967, "grad_norm": 2.251613140106201, "learning_rate": 7.07759406115452e-05, "loss": 0.8624, "step": 14560 }, { "epoch": 0.7276995305164319, "grad_norm": 1.078507900238037, "learning_rate": 7.074025416640489e-05, "loss": 0.7539, "step": 14570 }, { "epoch": 0.7281989811207672, "grad_norm": 5.626710891723633, "learning_rate": 7.070455495573334e-05, "loss": 0.9461, "step": 14580 }, { "epoch": 0.7286984317251024, "grad_norm": 1.1566303968429565, "learning_rate": 7.066884300150329e-05, "loss": 0.9538, "step": 14590 }, { "epoch": 0.7291978823294376, "grad_norm": 2.0159220695495605, "learning_rate": 7.063311832569526e-05, "loss": 0.6276, "step": 14600 }, { "epoch": 0.7296973329337728, "grad_norm": 1.379220962524414, "learning_rate": 7.059738095029765e-05, "loss": 0.8525, "step": 14610 }, { "epoch": 0.7301967835381081, "grad_norm": 1.6599482297897339, "learning_rate": 7.056163089730661e-05, "loss": 0.8347, "step": 14620 }, { "epoch": 0.7306962341424433, "grad_norm": 1.2135945558547974, "learning_rate": 7.052586818872616e-05, "loss": 0.7454, "step": 14630 }, { "epoch": 0.7311956847467785, "grad_norm": 1.0972834825515747, "learning_rate": 7.049009284656808e-05, "loss": 0.7955, "step": 14640 }, { "epoch": 0.7316951353511137, "grad_norm": 0.9523053765296936, "learning_rate": 7.045430489285193e-05, "loss": 0.9727, "step": 14650 }, { "epoch": 0.732194585955449, "grad_norm": 1.0745872259140015, "learning_rate": 7.041850434960502e-05, "loss": 0.8024, "step": 14660 }, { "epoch": 0.7326940365597843, "grad_norm": 1.3045142889022827, "learning_rate": 7.038269123886245e-05, "loss": 0.7976, "step": 14670 }, { "epoch": 0.7331934871641195, "grad_norm": 1.8772603273391724, "learning_rate": 7.034686558266699e-05, "loss": 1.0487, "step": 14680 }, { "epoch": 0.7336929377684547, "grad_norm": 1.138713002204895, "learning_rate": 7.031102740306918e-05, "loss": 0.7276, "step": 14690 }, { "epoch": 0.7341923883727899, "grad_norm": 0.8011229038238525, "learning_rate": 7.027517672212726e-05, "loss": 0.8344, "step": 14700 }, { "epoch": 0.7346918389771252, "grad_norm": 1.3305041790008545, "learning_rate": 7.023931356190714e-05, "loss": 0.8671, "step": 14710 }, { "epoch": 0.7351912895814604, "grad_norm": 4.798947334289551, "learning_rate": 7.020343794448247e-05, "loss": 0.7968, "step": 14720 }, { "epoch": 0.7356907401857956, "grad_norm": 2.0859580039978027, "learning_rate": 7.016754989193448e-05, "loss": 0.8018, "step": 14730 }, { "epoch": 0.7361901907901308, "grad_norm": 1.570669412612915, "learning_rate": 7.013164942635216e-05, "loss": 0.6164, "step": 14740 }, { "epoch": 0.7366896413944661, "grad_norm": 2.7400262355804443, "learning_rate": 7.0095736569832e-05, "loss": 0.9568, "step": 14750 }, { "epoch": 0.7371890919988013, "grad_norm": 1.430620789527893, "learning_rate": 7.005981134447827e-05, "loss": 0.8949, "step": 14760 }, { "epoch": 0.7376885426031365, "grad_norm": 0.8303244113922119, "learning_rate": 7.002387377240276e-05, "loss": 0.7792, "step": 14770 }, { "epoch": 0.7381879932074717, "grad_norm": 1.8937126398086548, "learning_rate": 6.998792387572488e-05, "loss": 0.7584, "step": 14780 }, { "epoch": 0.7386874438118071, "grad_norm": 4.442506790161133, "learning_rate": 6.99519616765716e-05, "loss": 0.8228, "step": 14790 }, { "epoch": 0.7391868944161423, "grad_norm": 2.819027900695801, "learning_rate": 6.991598719707754e-05, "loss": 0.9048, "step": 14800 }, { "epoch": 0.7396863450204775, "grad_norm": 1.1911256313323975, "learning_rate": 6.988000045938477e-05, "loss": 0.8307, "step": 14810 }, { "epoch": 0.7401857956248127, "grad_norm": 4.393520355224609, "learning_rate": 6.984400148564303e-05, "loss": 0.957, "step": 14820 }, { "epoch": 0.7406852462291479, "grad_norm": 2.6378350257873535, "learning_rate": 6.980799029800945e-05, "loss": 0.8641, "step": 14830 }, { "epoch": 0.7411846968334832, "grad_norm": 2.7240123748779297, "learning_rate": 6.97719669186488e-05, "loss": 0.8224, "step": 14840 }, { "epoch": 0.7416841474378184, "grad_norm": 1.3037054538726807, "learning_rate": 6.973593136973327e-05, "loss": 0.8244, "step": 14850 }, { "epoch": 0.7421835980421536, "grad_norm": 1.5047438144683838, "learning_rate": 6.969988367344262e-05, "loss": 0.6301, "step": 14860 }, { "epoch": 0.7426830486464888, "grad_norm": 5.839412689208984, "learning_rate": 6.9663823851964e-05, "loss": 0.841, "step": 14870 }, { "epoch": 0.7431824992508241, "grad_norm": 2.1603047847747803, "learning_rate": 6.962775192749209e-05, "loss": 0.9352, "step": 14880 }, { "epoch": 0.7436819498551593, "grad_norm": 3.4553701877593994, "learning_rate": 6.959166792222898e-05, "loss": 0.8384, "step": 14890 }, { "epoch": 0.7441814004594945, "grad_norm": 1.3646953105926514, "learning_rate": 6.955557185838422e-05, "loss": 0.7684, "step": 14900 }, { "epoch": 0.7446808510638298, "grad_norm": 4.13494873046875, "learning_rate": 6.951946375817474e-05, "loss": 0.8509, "step": 14910 }, { "epoch": 0.7451803016681651, "grad_norm": 1.2285308837890625, "learning_rate": 6.948334364382496e-05, "loss": 0.8276, "step": 14920 }, { "epoch": 0.7456797522725003, "grad_norm": 2.744372606277466, "learning_rate": 6.944721153756661e-05, "loss": 0.772, "step": 14930 }, { "epoch": 0.7461792028768355, "grad_norm": 2.268390417098999, "learning_rate": 6.941106746163884e-05, "loss": 1.0615, "step": 14940 }, { "epoch": 0.7466786534811707, "grad_norm": 1.9456779956817627, "learning_rate": 6.937491143828818e-05, "loss": 0.7258, "step": 14950 }, { "epoch": 0.7471781040855059, "grad_norm": 2.2939226627349854, "learning_rate": 6.933874348976848e-05, "loss": 1.0961, "step": 14960 }, { "epoch": 0.7476775546898412, "grad_norm": 1.3495417833328247, "learning_rate": 6.930256363834095e-05, "loss": 0.8552, "step": 14970 }, { "epoch": 0.7481770052941764, "grad_norm": 2.128901481628418, "learning_rate": 6.926637190627413e-05, "loss": 0.6513, "step": 14980 }, { "epoch": 0.7486764558985116, "grad_norm": 3.783480405807495, "learning_rate": 6.923016831584385e-05, "loss": 1.0062, "step": 14990 }, { "epoch": 0.7491759065028468, "grad_norm": 0.9812283515930176, "learning_rate": 6.919395288933326e-05, "loss": 0.7931, "step": 15000 }, { "epoch": 0.7496753571071821, "grad_norm": 1.0028212070465088, "learning_rate": 6.915772564903278e-05, "loss": 0.8394, "step": 15010 }, { "epoch": 0.7501748077115173, "grad_norm": 2.930734157562256, "learning_rate": 6.912148661724013e-05, "loss": 0.8039, "step": 15020 }, { "epoch": 0.7506742583158525, "grad_norm": 1.2220996618270874, "learning_rate": 6.908523581626026e-05, "loss": 0.801, "step": 15030 }, { "epoch": 0.7511737089201878, "grad_norm": 4.743584156036377, "learning_rate": 6.904897326840537e-05, "loss": 0.8226, "step": 15040 }, { "epoch": 0.7516731595245231, "grad_norm": 1.7444546222686768, "learning_rate": 6.90126989959949e-05, "loss": 0.893, "step": 15050 }, { "epoch": 0.7521726101288583, "grad_norm": 0.928571879863739, "learning_rate": 6.897641302135546e-05, "loss": 0.891, "step": 15060 }, { "epoch": 0.7526720607331935, "grad_norm": 1.5573511123657227, "learning_rate": 6.894011536682097e-05, "loss": 0.721, "step": 15070 }, { "epoch": 0.7531715113375287, "grad_norm": 1.379233717918396, "learning_rate": 6.89038060547324e-05, "loss": 0.8222, "step": 15080 }, { "epoch": 0.753670961941864, "grad_norm": 1.5439879894256592, "learning_rate": 6.8867485107438e-05, "loss": 0.9543, "step": 15090 }, { "epoch": 0.7541704125461992, "grad_norm": 4.321331977844238, "learning_rate": 6.883115254729315e-05, "loss": 0.9038, "step": 15100 }, { "epoch": 0.7546698631505344, "grad_norm": 1.3366317749023438, "learning_rate": 6.879480839666037e-05, "loss": 0.7858, "step": 15110 }, { "epoch": 0.7551693137548696, "grad_norm": 1.95951247215271, "learning_rate": 6.87584526779093e-05, "loss": 0.7887, "step": 15120 }, { "epoch": 0.7556687643592048, "grad_norm": 2.0874879360198975, "learning_rate": 6.872208541341673e-05, "loss": 1.0866, "step": 15130 }, { "epoch": 0.7561682149635401, "grad_norm": 4.349565505981445, "learning_rate": 6.868570662556656e-05, "loss": 0.9345, "step": 15140 }, { "epoch": 0.7566676655678753, "grad_norm": 0.8630907535552979, "learning_rate": 6.864931633674974e-05, "loss": 0.9329, "step": 15150 }, { "epoch": 0.7571671161722106, "grad_norm": 2.172128677368164, "learning_rate": 6.861291456936435e-05, "loss": 0.6593, "step": 15160 }, { "epoch": 0.7576665667765458, "grad_norm": 2.1994476318359375, "learning_rate": 6.857650134581548e-05, "loss": 0.8166, "step": 15170 }, { "epoch": 0.7581660173808811, "grad_norm": 2.3780038356781006, "learning_rate": 6.854007668851532e-05, "loss": 0.8221, "step": 15180 }, { "epoch": 0.7586654679852163, "grad_norm": 1.0928269624710083, "learning_rate": 6.850364061988309e-05, "loss": 0.6895, "step": 15190 }, { "epoch": 0.7591649185895515, "grad_norm": 2.102372407913208, "learning_rate": 6.846719316234503e-05, "loss": 0.6285, "step": 15200 }, { "epoch": 0.7596643691938867, "grad_norm": 1.18507719039917, "learning_rate": 6.843073433833433e-05, "loss": 0.7219, "step": 15210 }, { "epoch": 0.760163819798222, "grad_norm": 1.166977882385254, "learning_rate": 6.839426417029128e-05, "loss": 0.6944, "step": 15220 }, { "epoch": 0.7606632704025572, "grad_norm": 1.865195631980896, "learning_rate": 6.835778268066309e-05, "loss": 0.756, "step": 15230 }, { "epoch": 0.7611627210068924, "grad_norm": 2.251166820526123, "learning_rate": 6.832128989190395e-05, "loss": 0.8524, "step": 15240 }, { "epoch": 0.7616621716112276, "grad_norm": 1.7802624702453613, "learning_rate": 6.828478582647499e-05, "loss": 0.8078, "step": 15250 }, { "epoch": 0.7621616222155628, "grad_norm": 1.9270282983779907, "learning_rate": 6.82482705068443e-05, "loss": 0.83, "step": 15260 }, { "epoch": 0.7626610728198981, "grad_norm": 2.9224936962127686, "learning_rate": 6.82117439554869e-05, "loss": 0.7695, "step": 15270 }, { "epoch": 0.7631605234242334, "grad_norm": 3.9404971599578857, "learning_rate": 6.817520619488471e-05, "loss": 0.7579, "step": 15280 }, { "epoch": 0.7636599740285686, "grad_norm": 3.2672882080078125, "learning_rate": 6.813865724752655e-05, "loss": 0.827, "step": 15290 }, { "epoch": 0.7641594246329038, "grad_norm": 1.6898232698440552, "learning_rate": 6.810209713590814e-05, "loss": 0.8288, "step": 15300 }, { "epoch": 0.7646588752372391, "grad_norm": 2.8118972778320312, "learning_rate": 6.806552588253204e-05, "loss": 0.9278, "step": 15310 }, { "epoch": 0.7651583258415743, "grad_norm": 0.7698076963424683, "learning_rate": 6.802894350990771e-05, "loss": 0.8713, "step": 15320 }, { "epoch": 0.7656577764459095, "grad_norm": 1.5094542503356934, "learning_rate": 6.799235004055142e-05, "loss": 0.9178, "step": 15330 }, { "epoch": 0.7661572270502447, "grad_norm": 1.088309407234192, "learning_rate": 6.79557454969863e-05, "loss": 1.0671, "step": 15340 }, { "epoch": 0.76665667765458, "grad_norm": 1.8290929794311523, "learning_rate": 6.791912990174225e-05, "loss": 0.92, "step": 15350 }, { "epoch": 0.7671561282589152, "grad_norm": 2.360210657119751, "learning_rate": 6.788250327735603e-05, "loss": 0.7379, "step": 15360 }, { "epoch": 0.7676555788632504, "grad_norm": 1.6595243215560913, "learning_rate": 6.784586564637114e-05, "loss": 0.6747, "step": 15370 }, { "epoch": 0.7681550294675856, "grad_norm": 8.949684143066406, "learning_rate": 6.780921703133786e-05, "loss": 0.8656, "step": 15380 }, { "epoch": 0.7686544800719209, "grad_norm": 3.4417693614959717, "learning_rate": 6.77725574548133e-05, "loss": 0.9244, "step": 15390 }, { "epoch": 0.7691539306762561, "grad_norm": 3.909416437149048, "learning_rate": 6.773588693936119e-05, "loss": 0.873, "step": 15400 }, { "epoch": 0.7696533812805914, "grad_norm": 1.2584865093231201, "learning_rate": 6.769920550755213e-05, "loss": 0.9571, "step": 15410 }, { "epoch": 0.7701528318849266, "grad_norm": 1.364255666732788, "learning_rate": 6.766251318196332e-05, "loss": 0.7366, "step": 15420 }, { "epoch": 0.7706522824892618, "grad_norm": 1.2045292854309082, "learning_rate": 6.762580998517875e-05, "loss": 0.7667, "step": 15430 }, { "epoch": 0.7711517330935971, "grad_norm": 0.5805503129959106, "learning_rate": 6.758909593978904e-05, "loss": 0.9362, "step": 15440 }, { "epoch": 0.7716511836979323, "grad_norm": 2.512843370437622, "learning_rate": 6.755237106839154e-05, "loss": 0.7338, "step": 15450 }, { "epoch": 0.7721506343022675, "grad_norm": 4.625179767608643, "learning_rate": 6.751563539359023e-05, "loss": 0.9157, "step": 15460 }, { "epoch": 0.7726500849066027, "grad_norm": 4.126613140106201, "learning_rate": 6.747888893799577e-05, "loss": 0.9855, "step": 15470 }, { "epoch": 0.773149535510938, "grad_norm": 2.273141622543335, "learning_rate": 6.744213172422541e-05, "loss": 0.9608, "step": 15480 }, { "epoch": 0.7736489861152732, "grad_norm": 2.4304282665252686, "learning_rate": 6.740536377490306e-05, "loss": 0.7095, "step": 15490 }, { "epoch": 0.7741484367196084, "grad_norm": 2.9688961505889893, "learning_rate": 6.736858511265921e-05, "loss": 0.8406, "step": 15500 }, { "epoch": 0.7746478873239436, "grad_norm": 2.441664934158325, "learning_rate": 6.733179576013098e-05, "loss": 1.0852, "step": 15510 }, { "epoch": 0.775147337928279, "grad_norm": 3.616377830505371, "learning_rate": 6.729499573996201e-05, "loss": 0.795, "step": 15520 }, { "epoch": 0.7756467885326142, "grad_norm": 1.4523836374282837, "learning_rate": 6.72581850748026e-05, "loss": 0.8452, "step": 15530 }, { "epoch": 0.7761462391369494, "grad_norm": 1.5587873458862305, "learning_rate": 6.722136378730948e-05, "loss": 0.9114, "step": 15540 }, { "epoch": 0.7766456897412846, "grad_norm": 0.8045804500579834, "learning_rate": 6.718453190014602e-05, "loss": 0.6468, "step": 15550 }, { "epoch": 0.7771451403456198, "grad_norm": 0.9503934979438782, "learning_rate": 6.71476894359821e-05, "loss": 0.6298, "step": 15560 }, { "epoch": 0.7776445909499551, "grad_norm": 1.3000394105911255, "learning_rate": 6.711083641749404e-05, "loss": 0.8494, "step": 15570 }, { "epoch": 0.7781440415542903, "grad_norm": 1.8700768947601318, "learning_rate": 6.707397286736472e-05, "loss": 0.9172, "step": 15580 }, { "epoch": 0.7786434921586255, "grad_norm": 2.115405559539795, "learning_rate": 6.703709880828351e-05, "loss": 0.7838, "step": 15590 }, { "epoch": 0.7791429427629607, "grad_norm": 2.222421407699585, "learning_rate": 6.700021426294619e-05, "loss": 0.9487, "step": 15600 }, { "epoch": 0.779642393367296, "grad_norm": 1.515528917312622, "learning_rate": 6.696331925405504e-05, "loss": 0.8356, "step": 15610 }, { "epoch": 0.7801418439716312, "grad_norm": 1.561881422996521, "learning_rate": 6.692641380431879e-05, "loss": 0.8453, "step": 15620 }, { "epoch": 0.7806412945759664, "grad_norm": 1.2895303964614868, "learning_rate": 6.688949793645254e-05, "loss": 0.8658, "step": 15630 }, { "epoch": 0.7811407451803016, "grad_norm": 1.4048974514007568, "learning_rate": 6.685257167317786e-05, "loss": 0.8668, "step": 15640 }, { "epoch": 0.781640195784637, "grad_norm": 1.6670610904693604, "learning_rate": 6.681563503722268e-05, "loss": 0.9999, "step": 15650 }, { "epoch": 0.7821396463889722, "grad_norm": 2.2475922107696533, "learning_rate": 6.677868805132135e-05, "loss": 0.6803, "step": 15660 }, { "epoch": 0.7826390969933074, "grad_norm": 1.3656936883926392, "learning_rate": 6.674173073821454e-05, "loss": 0.8731, "step": 15670 }, { "epoch": 0.7831385475976426, "grad_norm": 1.6786401271820068, "learning_rate": 6.670476312064934e-05, "loss": 0.8405, "step": 15680 }, { "epoch": 0.7836379982019778, "grad_norm": 2.163822650909424, "learning_rate": 6.666778522137915e-05, "loss": 0.7527, "step": 15690 }, { "epoch": 0.7841374488063131, "grad_norm": 3.461778163909912, "learning_rate": 6.663079706316366e-05, "loss": 0.9058, "step": 15700 }, { "epoch": 0.7846368994106483, "grad_norm": 0.9004759192466736, "learning_rate": 6.659379866876894e-05, "loss": 0.8703, "step": 15710 }, { "epoch": 0.7851363500149835, "grad_norm": 1.10280179977417, "learning_rate": 6.655679006096734e-05, "loss": 0.7312, "step": 15720 }, { "epoch": 0.7856358006193187, "grad_norm": 2.1050362586975098, "learning_rate": 6.651977126253744e-05, "loss": 0.8669, "step": 15730 }, { "epoch": 0.786135251223654, "grad_norm": 2.1199238300323486, "learning_rate": 6.648274229626419e-05, "loss": 0.9007, "step": 15740 }, { "epoch": 0.7866347018279892, "grad_norm": 1.3192224502563477, "learning_rate": 6.644570318493874e-05, "loss": 0.7224, "step": 15750 }, { "epoch": 0.7871341524323244, "grad_norm": 3.3043160438537598, "learning_rate": 6.640865395135848e-05, "loss": 0.8629, "step": 15760 }, { "epoch": 0.7876336030366596, "grad_norm": 2.9390816688537598, "learning_rate": 6.637159461832705e-05, "loss": 0.8931, "step": 15770 }, { "epoch": 0.788133053640995, "grad_norm": 3.64808988571167, "learning_rate": 6.633452520865428e-05, "loss": 1.0136, "step": 15780 }, { "epoch": 0.7886325042453302, "grad_norm": 1.8212376832962036, "learning_rate": 6.629744574515626e-05, "loss": 0.9769, "step": 15790 }, { "epoch": 0.7891319548496654, "grad_norm": 1.8592854738235474, "learning_rate": 6.626035625065522e-05, "loss": 0.8619, "step": 15800 }, { "epoch": 0.7896314054540006, "grad_norm": 2.5067596435546875, "learning_rate": 6.622325674797955e-05, "loss": 0.9626, "step": 15810 }, { "epoch": 0.7901308560583359, "grad_norm": 0.9920625686645508, "learning_rate": 6.618614725996382e-05, "loss": 0.9642, "step": 15820 }, { "epoch": 0.7906303066626711, "grad_norm": 4.127655982971191, "learning_rate": 6.614902780944879e-05, "loss": 0.9797, "step": 15830 }, { "epoch": 0.7911297572670063, "grad_norm": 2.1669821739196777, "learning_rate": 6.611189841928128e-05, "loss": 0.6941, "step": 15840 }, { "epoch": 0.7916292078713415, "grad_norm": 1.6327829360961914, "learning_rate": 6.607475911231426e-05, "loss": 0.9778, "step": 15850 }, { "epoch": 0.7921286584756767, "grad_norm": 1.0588231086730957, "learning_rate": 6.603760991140681e-05, "loss": 0.9366, "step": 15860 }, { "epoch": 0.792628109080012, "grad_norm": 2.468372344970703, "learning_rate": 6.60004508394241e-05, "loss": 0.8592, "step": 15870 }, { "epoch": 0.7931275596843472, "grad_norm": 1.4136161804199219, "learning_rate": 6.596328191923734e-05, "loss": 0.839, "step": 15880 }, { "epoch": 0.7936270102886824, "grad_norm": 2.3732004165649414, "learning_rate": 6.592610317372387e-05, "loss": 0.7439, "step": 15890 }, { "epoch": 0.7941264608930176, "grad_norm": 3.4499545097351074, "learning_rate": 6.588891462576701e-05, "loss": 0.7574, "step": 15900 }, { "epoch": 0.794625911497353, "grad_norm": 2.6960248947143555, "learning_rate": 6.585171629825615e-05, "loss": 0.9557, "step": 15910 }, { "epoch": 0.7951253621016882, "grad_norm": 1.224998950958252, "learning_rate": 6.581450821408668e-05, "loss": 0.9733, "step": 15920 }, { "epoch": 0.7956248127060234, "grad_norm": 1.9505045413970947, "learning_rate": 6.577729039616002e-05, "loss": 0.6941, "step": 15930 }, { "epoch": 0.7961242633103586, "grad_norm": 1.4508978128433228, "learning_rate": 6.574006286738354e-05, "loss": 0.752, "step": 15940 }, { "epoch": 0.7966237139146939, "grad_norm": 2.164996862411499, "learning_rate": 6.570282565067065e-05, "loss": 1.0168, "step": 15950 }, { "epoch": 0.7971231645190291, "grad_norm": 1.3738571405410767, "learning_rate": 6.566557876894067e-05, "loss": 1.0198, "step": 15960 }, { "epoch": 0.7976226151233643, "grad_norm": 1.2134963274002075, "learning_rate": 6.562832224511888e-05, "loss": 0.9293, "step": 15970 }, { "epoch": 0.7981220657276995, "grad_norm": 1.7455850839614868, "learning_rate": 6.559105610213649e-05, "loss": 0.9195, "step": 15980 }, { "epoch": 0.7986215163320347, "grad_norm": 2.994417190551758, "learning_rate": 6.555378036293066e-05, "loss": 0.8009, "step": 15990 }, { "epoch": 0.79912096693637, "grad_norm": 3.2063357830047607, "learning_rate": 6.551649505044442e-05, "loss": 0.9724, "step": 16000 }, { "epoch": 0.7996204175407052, "grad_norm": 1.8232203722000122, "learning_rate": 6.547920018762672e-05, "loss": 0.7755, "step": 16010 }, { "epoch": 0.8001198681450404, "grad_norm": 1.8015304803848267, "learning_rate": 6.544189579743238e-05, "loss": 0.8729, "step": 16020 }, { "epoch": 0.8006193187493756, "grad_norm": 2.915858745574951, "learning_rate": 6.540458190282207e-05, "loss": 0.7648, "step": 16030 }, { "epoch": 0.801118769353711, "grad_norm": 2.4863932132720947, "learning_rate": 6.536725852676231e-05, "loss": 0.741, "step": 16040 }, { "epoch": 0.8016182199580462, "grad_norm": 2.194597005844116, "learning_rate": 6.53299256922255e-05, "loss": 0.9772, "step": 16050 }, { "epoch": 0.8021176705623814, "grad_norm": 1.7982498407363892, "learning_rate": 6.52925834221898e-05, "loss": 0.691, "step": 16060 }, { "epoch": 0.8026171211667166, "grad_norm": 1.1537199020385742, "learning_rate": 6.525523173963922e-05, "loss": 0.8748, "step": 16070 }, { "epoch": 0.8031165717710519, "grad_norm": 2.4173102378845215, "learning_rate": 6.521787066756354e-05, "loss": 0.8947, "step": 16080 }, { "epoch": 0.8036160223753871, "grad_norm": 1.8931891918182373, "learning_rate": 6.518050022895834e-05, "loss": 0.9027, "step": 16090 }, { "epoch": 0.8041154729797223, "grad_norm": 5.7853546142578125, "learning_rate": 6.514312044682494e-05, "loss": 0.8129, "step": 16100 }, { "epoch": 0.8046149235840575, "grad_norm": 1.2767221927642822, "learning_rate": 6.510573134417043e-05, "loss": 0.7389, "step": 16110 }, { "epoch": 0.8051143741883928, "grad_norm": 1.1879867315292358, "learning_rate": 6.506833294400763e-05, "loss": 0.8777, "step": 16120 }, { "epoch": 0.805613824792728, "grad_norm": 1.1377538442611694, "learning_rate": 6.503092526935508e-05, "loss": 1.0342, "step": 16130 }, { "epoch": 0.8061132753970632, "grad_norm": 2.8258886337280273, "learning_rate": 6.499350834323706e-05, "loss": 0.9868, "step": 16140 }, { "epoch": 0.8066127260013984, "grad_norm": 2.124138832092285, "learning_rate": 6.495608218868348e-05, "loss": 0.8422, "step": 16150 }, { "epoch": 0.8071121766057336, "grad_norm": 0.7884851098060608, "learning_rate": 6.491864682872997e-05, "loss": 0.8204, "step": 16160 }, { "epoch": 0.807611627210069, "grad_norm": 1.948546051979065, "learning_rate": 6.488120228641783e-05, "loss": 0.955, "step": 16170 }, { "epoch": 0.8081110778144042, "grad_norm": 2.1687068939208984, "learning_rate": 6.484374858479402e-05, "loss": 0.9238, "step": 16180 }, { "epoch": 0.8086105284187394, "grad_norm": 0.8520563840866089, "learning_rate": 6.48062857469111e-05, "loss": 0.9406, "step": 16190 }, { "epoch": 0.8091099790230746, "grad_norm": 4.233059883117676, "learning_rate": 6.47688137958273e-05, "loss": 1.0369, "step": 16200 }, { "epoch": 0.8096094296274099, "grad_norm": 2.6756277084350586, "learning_rate": 6.473133275460638e-05, "loss": 0.8692, "step": 16210 }, { "epoch": 0.8101088802317451, "grad_norm": 5.516173362731934, "learning_rate": 6.469384264631782e-05, "loss": 0.9871, "step": 16220 }, { "epoch": 0.8106083308360803, "grad_norm": 1.0051600933074951, "learning_rate": 6.465634349403656e-05, "loss": 0.7404, "step": 16230 }, { "epoch": 0.8111077814404155, "grad_norm": 2.0114965438842773, "learning_rate": 6.461883532084316e-05, "loss": 0.7105, "step": 16240 }, { "epoch": 0.8116072320447508, "grad_norm": 1.9885752201080322, "learning_rate": 6.458131814982373e-05, "loss": 0.8684, "step": 16250 }, { "epoch": 0.812106682649086, "grad_norm": 1.7377644777297974, "learning_rate": 6.454379200406995e-05, "loss": 0.923, "step": 16260 }, { "epoch": 0.8126061332534212, "grad_norm": 1.5406426191329956, "learning_rate": 6.450625690667895e-05, "loss": 0.723, "step": 16270 }, { "epoch": 0.8131055838577564, "grad_norm": 3.6169092655181885, "learning_rate": 6.446871288075345e-05, "loss": 0.9572, "step": 16280 }, { "epoch": 0.8136050344620916, "grad_norm": 1.060236930847168, "learning_rate": 6.443115994940156e-05, "loss": 0.7335, "step": 16290 }, { "epoch": 0.814104485066427, "grad_norm": 9.283597946166992, "learning_rate": 6.4393598135737e-05, "loss": 1.1551, "step": 16300 }, { "epoch": 0.8146039356707622, "grad_norm": 2.340845823287964, "learning_rate": 6.435602746287887e-05, "loss": 0.9098, "step": 16310 }, { "epoch": 0.8151033862750974, "grad_norm": 1.9237539768218994, "learning_rate": 6.431844795395177e-05, "loss": 0.9227, "step": 16320 }, { "epoch": 0.8156028368794326, "grad_norm": 1.4562195539474487, "learning_rate": 6.428085963208566e-05, "loss": 0.7405, "step": 16330 }, { "epoch": 0.8161022874837679, "grad_norm": 0.7459195852279663, "learning_rate": 6.424326252041602e-05, "loss": 1.0744, "step": 16340 }, { "epoch": 0.8166017380881031, "grad_norm": 1.7994718551635742, "learning_rate": 6.420565664208371e-05, "loss": 0.5823, "step": 16350 }, { "epoch": 0.8171011886924383, "grad_norm": 2.3014066219329834, "learning_rate": 6.416804202023495e-05, "loss": 0.9816, "step": 16360 }, { "epoch": 0.8176006392967735, "grad_norm": 2.7471611499786377, "learning_rate": 6.413041867802138e-05, "loss": 0.8872, "step": 16370 }, { "epoch": 0.8181000899011088, "grad_norm": 4.223907947540283, "learning_rate": 6.409278663859997e-05, "loss": 0.8984, "step": 16380 }, { "epoch": 0.818599540505444, "grad_norm": 1.6525057554244995, "learning_rate": 6.40551459251331e-05, "loss": 0.9931, "step": 16390 }, { "epoch": 0.8190989911097792, "grad_norm": 0.9545865058898926, "learning_rate": 6.401749656078844e-05, "loss": 0.845, "step": 16400 }, { "epoch": 0.8195984417141144, "grad_norm": 3.1227684020996094, "learning_rate": 6.397983856873902e-05, "loss": 0.7437, "step": 16410 }, { "epoch": 0.8200978923184497, "grad_norm": 1.1917845010757446, "learning_rate": 6.394217197216312e-05, "loss": 0.8466, "step": 16420 }, { "epoch": 0.820597342922785, "grad_norm": 6.300127983093262, "learning_rate": 6.390449679424439e-05, "loss": 0.9267, "step": 16430 }, { "epoch": 0.8210967935271202, "grad_norm": 1.3488715887069702, "learning_rate": 6.386681305817172e-05, "loss": 0.6616, "step": 16440 }, { "epoch": 0.8215962441314554, "grad_norm": 1.7635983228683472, "learning_rate": 6.38291207871393e-05, "loss": 1.0489, "step": 16450 }, { "epoch": 0.8220956947357906, "grad_norm": 3.4676079750061035, "learning_rate": 6.37914200043465e-05, "loss": 0.745, "step": 16460 }, { "epoch": 0.8225951453401259, "grad_norm": 2.3036060333251953, "learning_rate": 6.375371073299802e-05, "loss": 0.8561, "step": 16470 }, { "epoch": 0.8230945959444611, "grad_norm": 0.7986567616462708, "learning_rate": 6.371599299630374e-05, "loss": 0.8123, "step": 16480 }, { "epoch": 0.8235940465487963, "grad_norm": 1.3983267545700073, "learning_rate": 6.367826681747872e-05, "loss": 1.0328, "step": 16490 }, { "epoch": 0.8240934971531315, "grad_norm": 1.514461874961853, "learning_rate": 6.364053221974329e-05, "loss": 1.1373, "step": 16500 }, { "epoch": 0.8245929477574668, "grad_norm": 0.82142573595047, "learning_rate": 6.36027892263229e-05, "loss": 0.6504, "step": 16510 }, { "epoch": 0.825092398361802, "grad_norm": 3.0528926849365234, "learning_rate": 6.35650378604482e-05, "loss": 0.8885, "step": 16520 }, { "epoch": 0.8255918489661372, "grad_norm": 1.6076291799545288, "learning_rate": 6.352727814535498e-05, "loss": 0.9077, "step": 16530 }, { "epoch": 0.8260912995704724, "grad_norm": 2.3793752193450928, "learning_rate": 6.348951010428416e-05, "loss": 0.8903, "step": 16540 }, { "epoch": 0.8265907501748078, "grad_norm": 1.5829384326934814, "learning_rate": 6.345173376048179e-05, "loss": 0.9315, "step": 16550 }, { "epoch": 0.827090200779143, "grad_norm": 1.056965708732605, "learning_rate": 6.341394913719908e-05, "loss": 0.6395, "step": 16560 }, { "epoch": 0.8275896513834782, "grad_norm": 4.013810157775879, "learning_rate": 6.337615625769225e-05, "loss": 1.025, "step": 16570 }, { "epoch": 0.8280891019878134, "grad_norm": 2.1943602561950684, "learning_rate": 6.333835514522265e-05, "loss": 0.7397, "step": 16580 }, { "epoch": 0.8285885525921486, "grad_norm": 0.603503942489624, "learning_rate": 6.330054582305671e-05, "loss": 0.644, "step": 16590 }, { "epoch": 0.8290880031964839, "grad_norm": 1.5025651454925537, "learning_rate": 6.326272831446587e-05, "loss": 0.8164, "step": 16600 }, { "epoch": 0.8295874538008191, "grad_norm": 1.9878567457199097, "learning_rate": 6.322490264272665e-05, "loss": 0.7026, "step": 16610 }, { "epoch": 0.8300869044051543, "grad_norm": 1.1617220640182495, "learning_rate": 6.318706883112058e-05, "loss": 0.8819, "step": 16620 }, { "epoch": 0.8305863550094895, "grad_norm": 2.0502772331237793, "learning_rate": 6.314922690293416e-05, "loss": 0.9579, "step": 16630 }, { "epoch": 0.8310858056138248, "grad_norm": 3.7792224884033203, "learning_rate": 6.311137688145898e-05, "loss": 0.91, "step": 16640 }, { "epoch": 0.83158525621816, "grad_norm": 0.6884984374046326, "learning_rate": 6.307351878999151e-05, "loss": 0.739, "step": 16650 }, { "epoch": 0.8320847068224952, "grad_norm": 1.9633368253707886, "learning_rate": 6.303565265183326e-05, "loss": 0.748, "step": 16660 }, { "epoch": 0.8325841574268305, "grad_norm": 1.5782347917556763, "learning_rate": 6.299777849029066e-05, "loss": 0.8776, "step": 16670 }, { "epoch": 0.8330836080311658, "grad_norm": 1.628798246383667, "learning_rate": 6.295989632867506e-05, "loss": 0.6668, "step": 16680 }, { "epoch": 0.833583058635501, "grad_norm": 2.231889009475708, "learning_rate": 6.292200619030278e-05, "loss": 0.7465, "step": 16690 }, { "epoch": 0.8340825092398362, "grad_norm": 2.304318904876709, "learning_rate": 6.288410809849502e-05, "loss": 0.8359, "step": 16700 }, { "epoch": 0.8345819598441714, "grad_norm": 3.2880563735961914, "learning_rate": 6.284620207657787e-05, "loss": 0.9138, "step": 16710 }, { "epoch": 0.8350814104485066, "grad_norm": 1.3525863885879517, "learning_rate": 6.280828814788232e-05, "loss": 1.0842, "step": 16720 }, { "epoch": 0.8355808610528419, "grad_norm": 1.97789466381073, "learning_rate": 6.277036633574421e-05, "loss": 0.8623, "step": 16730 }, { "epoch": 0.8360803116571771, "grad_norm": 1.0432552099227905, "learning_rate": 6.273243666350427e-05, "loss": 0.7067, "step": 16740 }, { "epoch": 0.8365797622615123, "grad_norm": 2.6929054260253906, "learning_rate": 6.269449915450803e-05, "loss": 0.9287, "step": 16750 }, { "epoch": 0.8370792128658475, "grad_norm": 1.5089311599731445, "learning_rate": 6.265655383210582e-05, "loss": 0.7812, "step": 16760 }, { "epoch": 0.8375786634701828, "grad_norm": 2.037862777709961, "learning_rate": 6.261860071965286e-05, "loss": 0.7476, "step": 16770 }, { "epoch": 0.838078114074518, "grad_norm": 1.1124966144561768, "learning_rate": 6.258063984050907e-05, "loss": 0.8935, "step": 16780 }, { "epoch": 0.8385775646788532, "grad_norm": 2.5978944301605225, "learning_rate": 6.254267121803922e-05, "loss": 0.9156, "step": 16790 }, { "epoch": 0.8390770152831885, "grad_norm": 1.905880331993103, "learning_rate": 6.250469487561283e-05, "loss": 0.9308, "step": 16800 }, { "epoch": 0.8395764658875238, "grad_norm": 0.8824878931045532, "learning_rate": 6.246671083660416e-05, "loss": 0.7035, "step": 16810 }, { "epoch": 0.840075916491859, "grad_norm": 1.4680544137954712, "learning_rate": 6.24287191243922e-05, "loss": 0.8742, "step": 16820 }, { "epoch": 0.8405753670961942, "grad_norm": 1.9640511274337769, "learning_rate": 6.239071976236069e-05, "loss": 0.6797, "step": 16830 }, { "epoch": 0.8410748177005294, "grad_norm": 2.1258668899536133, "learning_rate": 6.235271277389805e-05, "loss": 1.1352, "step": 16840 }, { "epoch": 0.8415742683048647, "grad_norm": 1.9400843381881714, "learning_rate": 6.231469818239741e-05, "loss": 0.7029, "step": 16850 }, { "epoch": 0.8420737189091999, "grad_norm": 0.7889087200164795, "learning_rate": 6.227667601125657e-05, "loss": 0.7216, "step": 16860 }, { "epoch": 0.8425731695135351, "grad_norm": 0.7702601552009583, "learning_rate": 6.223864628387803e-05, "loss": 0.7218, "step": 16870 }, { "epoch": 0.8430726201178703, "grad_norm": 1.353019118309021, "learning_rate": 6.220060902366888e-05, "loss": 0.732, "step": 16880 }, { "epoch": 0.8435720707222055, "grad_norm": 0.7442846298217773, "learning_rate": 6.216256425404089e-05, "loss": 0.8635, "step": 16890 }, { "epoch": 0.8440715213265408, "grad_norm": 4.365800857543945, "learning_rate": 6.212451199841043e-05, "loss": 0.8221, "step": 16900 }, { "epoch": 0.844570971930876, "grad_norm": 3.0228402614593506, "learning_rate": 6.208645228019852e-05, "loss": 0.64, "step": 16910 }, { "epoch": 0.8450704225352113, "grad_norm": 1.9168883562088013, "learning_rate": 6.204838512283072e-05, "loss": 0.8189, "step": 16920 }, { "epoch": 0.8455698731395465, "grad_norm": 2.254575729370117, "learning_rate": 6.201031054973721e-05, "loss": 0.7512, "step": 16930 }, { "epoch": 0.8460693237438818, "grad_norm": 2.7147164344787598, "learning_rate": 6.197222858435267e-05, "loss": 0.8503, "step": 16940 }, { "epoch": 0.846568774348217, "grad_norm": 2.5285685062408447, "learning_rate": 6.193413925011642e-05, "loss": 0.8169, "step": 16950 }, { "epoch": 0.8470682249525522, "grad_norm": 1.0177611112594604, "learning_rate": 6.189604257047224e-05, "loss": 0.7792, "step": 16960 }, { "epoch": 0.8475676755568874, "grad_norm": 1.2296720743179321, "learning_rate": 6.185793856886849e-05, "loss": 0.6905, "step": 16970 }, { "epoch": 0.8480671261612227, "grad_norm": 4.773038864135742, "learning_rate": 6.181982726875799e-05, "loss": 1.1014, "step": 16980 }, { "epoch": 0.8485665767655579, "grad_norm": 3.047926425933838, "learning_rate": 6.178170869359808e-05, "loss": 0.823, "step": 16990 }, { "epoch": 0.8490660273698931, "grad_norm": 1.9710932970046997, "learning_rate": 6.174358286685054e-05, "loss": 0.8389, "step": 17000 }, { "epoch": 0.8495654779742283, "grad_norm": 1.147913932800293, "learning_rate": 6.170544981198168e-05, "loss": 0.97, "step": 17010 }, { "epoch": 0.8500649285785635, "grad_norm": 1.007738709449768, "learning_rate": 6.16673095524622e-05, "loss": 0.9433, "step": 17020 }, { "epoch": 0.8505643791828988, "grad_norm": 1.1430408954620361, "learning_rate": 6.162916211176725e-05, "loss": 0.9482, "step": 17030 }, { "epoch": 0.851063829787234, "grad_norm": 1.4783233404159546, "learning_rate": 6.159100751337642e-05, "loss": 0.8203, "step": 17040 }, { "epoch": 0.8515632803915693, "grad_norm": 1.9705936908721924, "learning_rate": 6.155284578077369e-05, "loss": 0.8806, "step": 17050 }, { "epoch": 0.8520627309959045, "grad_norm": 2.27795672416687, "learning_rate": 6.15146769374474e-05, "loss": 0.9348, "step": 17060 }, { "epoch": 0.8525621816002398, "grad_norm": 3.018404960632324, "learning_rate": 6.147650100689033e-05, "loss": 1.0186, "step": 17070 }, { "epoch": 0.853061632204575, "grad_norm": 1.2547297477722168, "learning_rate": 6.14383180125996e-05, "loss": 0.9815, "step": 17080 }, { "epoch": 0.8535610828089102, "grad_norm": 1.3103141784667969, "learning_rate": 6.140012797807664e-05, "loss": 0.6705, "step": 17090 }, { "epoch": 0.8540605334132454, "grad_norm": 1.1061055660247803, "learning_rate": 6.136193092682725e-05, "loss": 0.7973, "step": 17100 }, { "epoch": 0.8545599840175807, "grad_norm": 1.5585850477218628, "learning_rate": 6.132372688236152e-05, "loss": 0.7361, "step": 17110 }, { "epoch": 0.8550594346219159, "grad_norm": 2.628854274749756, "learning_rate": 6.128551586819391e-05, "loss": 1.0098, "step": 17120 }, { "epoch": 0.8555588852262511, "grad_norm": 1.6417533159255981, "learning_rate": 6.124729790784308e-05, "loss": 0.938, "step": 17130 }, { "epoch": 0.8560583358305863, "grad_norm": 1.5524022579193115, "learning_rate": 6.120907302483205e-05, "loss": 0.914, "step": 17140 }, { "epoch": 0.8565577864349215, "grad_norm": 0.955947756767273, "learning_rate": 6.1170841242688e-05, "loss": 0.6852, "step": 17150 }, { "epoch": 0.8570572370392568, "grad_norm": 1.8045988082885742, "learning_rate": 6.113260258494247e-05, "loss": 0.8964, "step": 17160 }, { "epoch": 0.857556687643592, "grad_norm": 0.7617775201797485, "learning_rate": 6.109435707513118e-05, "loss": 0.8379, "step": 17170 }, { "epoch": 0.8580561382479273, "grad_norm": 2.472621202468872, "learning_rate": 6.105610473679403e-05, "loss": 0.8932, "step": 17180 }, { "epoch": 0.8585555888522625, "grad_norm": 1.8227863311767578, "learning_rate": 6.1017845593475166e-05, "loss": 0.8147, "step": 17190 }, { "epoch": 0.8590550394565978, "grad_norm": 0.4599326550960541, "learning_rate": 6.097957966872294e-05, "loss": 0.8396, "step": 17200 }, { "epoch": 0.859554490060933, "grad_norm": 2.4516680240631104, "learning_rate": 6.094130698608983e-05, "loss": 1.1839, "step": 17210 }, { "epoch": 0.8600539406652682, "grad_norm": 1.868933081626892, "learning_rate": 6.0903027569132506e-05, "loss": 1.1272, "step": 17220 }, { "epoch": 0.8605533912696034, "grad_norm": 2.447511911392212, "learning_rate": 6.086474144141178e-05, "loss": 0.9087, "step": 17230 }, { "epoch": 0.8610528418739387, "grad_norm": 2.8627376556396484, "learning_rate": 6.082644862649256e-05, "loss": 0.7799, "step": 17240 }, { "epoch": 0.8615522924782739, "grad_norm": 3.317176342010498, "learning_rate": 6.078814914794393e-05, "loss": 1.0523, "step": 17250 }, { "epoch": 0.8620517430826091, "grad_norm": 0.8495808243751526, "learning_rate": 6.0749843029339036e-05, "loss": 0.8708, "step": 17260 }, { "epoch": 0.8625511936869443, "grad_norm": 1.3395750522613525, "learning_rate": 6.071153029425509e-05, "loss": 1.0773, "step": 17270 }, { "epoch": 0.8630506442912796, "grad_norm": 1.3448898792266846, "learning_rate": 6.0673210966273456e-05, "loss": 0.985, "step": 17280 }, { "epoch": 0.8635500948956149, "grad_norm": 3.114107847213745, "learning_rate": 6.0634885068979466e-05, "loss": 0.8944, "step": 17290 }, { "epoch": 0.8640495454999501, "grad_norm": 2.589629888534546, "learning_rate": 6.0596552625962536e-05, "loss": 0.8187, "step": 17300 }, { "epoch": 0.8645489961042853, "grad_norm": 1.3278167247772217, "learning_rate": 6.055821366081612e-05, "loss": 0.9411, "step": 17310 }, { "epoch": 0.8650484467086205, "grad_norm": 2.274196147918701, "learning_rate": 6.0519868197137664e-05, "loss": 0.9286, "step": 17320 }, { "epoch": 0.8655478973129558, "grad_norm": 2.7580153942108154, "learning_rate": 6.0481516258528635e-05, "loss": 0.7447, "step": 17330 }, { "epoch": 0.866047347917291, "grad_norm": 2.3561079502105713, "learning_rate": 6.044315786859447e-05, "loss": 0.8562, "step": 17340 }, { "epoch": 0.8665467985216262, "grad_norm": 1.28364896774292, "learning_rate": 6.0404793050944596e-05, "loss": 0.691, "step": 17350 }, { "epoch": 0.8670462491259614, "grad_norm": 1.2503184080123901, "learning_rate": 6.0366421829192356e-05, "loss": 0.8561, "step": 17360 }, { "epoch": 0.8675456997302967, "grad_norm": 3.0285422801971436, "learning_rate": 6.0328044226955075e-05, "loss": 1.1469, "step": 17370 }, { "epoch": 0.8680451503346319, "grad_norm": 1.0694063901901245, "learning_rate": 6.028966026785399e-05, "loss": 0.6241, "step": 17380 }, { "epoch": 0.8685446009389671, "grad_norm": 1.8019158840179443, "learning_rate": 6.025126997551427e-05, "loss": 0.8304, "step": 17390 }, { "epoch": 0.8690440515433023, "grad_norm": 3.0490691661834717, "learning_rate": 6.021287337356493e-05, "loss": 0.9058, "step": 17400 }, { "epoch": 0.8695435021476376, "grad_norm": 1.581011414527893, "learning_rate": 6.017447048563891e-05, "loss": 0.7703, "step": 17410 }, { "epoch": 0.8700429527519729, "grad_norm": 1.7052842378616333, "learning_rate": 6.013606133537303e-05, "loss": 0.8926, "step": 17420 }, { "epoch": 0.8705424033563081, "grad_norm": 5.302414417266846, "learning_rate": 6.0097645946407945e-05, "loss": 0.9729, "step": 17430 }, { "epoch": 0.8710418539606433, "grad_norm": 3.3168435096740723, "learning_rate": 6.0059224342388134e-05, "loss": 0.8982, "step": 17440 }, { "epoch": 0.8715413045649785, "grad_norm": 1.1036734580993652, "learning_rate": 6.002079654696192e-05, "loss": 0.8249, "step": 17450 }, { "epoch": 0.8720407551693138, "grad_norm": 2.3743739128112793, "learning_rate": 5.998236258378143e-05, "loss": 0.7628, "step": 17460 }, { "epoch": 0.872540205773649, "grad_norm": 1.0588688850402832, "learning_rate": 5.994392247650262e-05, "loss": 0.6464, "step": 17470 }, { "epoch": 0.8730396563779842, "grad_norm": 1.3939181566238403, "learning_rate": 5.990547624878516e-05, "loss": 0.7323, "step": 17480 }, { "epoch": 0.8735391069823194, "grad_norm": 3.428685188293457, "learning_rate": 5.9867023924292575e-05, "loss": 0.8373, "step": 17490 }, { "epoch": 0.8740385575866547, "grad_norm": 2.5662968158721924, "learning_rate": 5.982856552669205e-05, "loss": 0.8612, "step": 17500 }, { "epoch": 0.8745380081909899, "grad_norm": 3.1228392124176025, "learning_rate": 5.9790101079654556e-05, "loss": 0.7528, "step": 17510 }, { "epoch": 0.8750374587953251, "grad_norm": 6.499607563018799, "learning_rate": 5.975163060685481e-05, "loss": 1.174, "step": 17520 }, { "epoch": 0.8755369093996603, "grad_norm": 1.335733413696289, "learning_rate": 5.9713154131971206e-05, "loss": 0.9548, "step": 17530 }, { "epoch": 0.8760363600039957, "grad_norm": 2.2760865688323975, "learning_rate": 5.9674671678685813e-05, "loss": 0.9427, "step": 17540 }, { "epoch": 0.8765358106083309, "grad_norm": 3.9111785888671875, "learning_rate": 5.963618327068443e-05, "loss": 0.9205, "step": 17550 }, { "epoch": 0.8770352612126661, "grad_norm": 2.4227874279022217, "learning_rate": 5.959768893165648e-05, "loss": 0.7424, "step": 17560 }, { "epoch": 0.8775347118170013, "grad_norm": 1.6442713737487793, "learning_rate": 5.9559188685295067e-05, "loss": 0.9272, "step": 17570 }, { "epoch": 0.8780341624213366, "grad_norm": 1.4858150482177734, "learning_rate": 5.952068255529689e-05, "loss": 0.9458, "step": 17580 }, { "epoch": 0.8785336130256718, "grad_norm": 2.355178117752075, "learning_rate": 5.9482170565362314e-05, "loss": 0.8913, "step": 17590 }, { "epoch": 0.879033063630007, "grad_norm": 0.8928540945053101, "learning_rate": 5.944365273919531e-05, "loss": 0.8276, "step": 17600 }, { "epoch": 0.8795325142343422, "grad_norm": 1.362815022468567, "learning_rate": 5.9405129100503384e-05, "loss": 1.0135, "step": 17610 }, { "epoch": 0.8800319648386774, "grad_norm": 1.983511209487915, "learning_rate": 5.9366599672997694e-05, "loss": 0.9972, "step": 17620 }, { "epoch": 0.8805314154430127, "grad_norm": 0.8186060786247253, "learning_rate": 5.9328064480392886e-05, "loss": 0.7884, "step": 17630 }, { "epoch": 0.8810308660473479, "grad_norm": 3.547245740890503, "learning_rate": 5.928952354640723e-05, "loss": 0.7936, "step": 17640 }, { "epoch": 0.8815303166516831, "grad_norm": 1.1291515827178955, "learning_rate": 5.925097689476249e-05, "loss": 0.7797, "step": 17650 }, { "epoch": 0.8820297672560183, "grad_norm": 0.8624637126922607, "learning_rate": 5.921242454918394e-05, "loss": 0.8495, "step": 17660 }, { "epoch": 0.8825292178603537, "grad_norm": 3.757883071899414, "learning_rate": 5.917386653340036e-05, "loss": 0.7233, "step": 17670 }, { "epoch": 0.8830286684646889, "grad_norm": 1.6425225734710693, "learning_rate": 5.913530287114406e-05, "loss": 0.8921, "step": 17680 }, { "epoch": 0.8835281190690241, "grad_norm": 1.29030442237854, "learning_rate": 5.909673358615079e-05, "loss": 0.7937, "step": 17690 }, { "epoch": 0.8840275696733593, "grad_norm": 1.672472357749939, "learning_rate": 5.905815870215976e-05, "loss": 0.7725, "step": 17700 }, { "epoch": 0.8845270202776946, "grad_norm": 1.3420329093933105, "learning_rate": 5.901957824291362e-05, "loss": 0.879, "step": 17710 }, { "epoch": 0.8850264708820298, "grad_norm": 1.5928447246551514, "learning_rate": 5.8980992232158505e-05, "loss": 0.733, "step": 17720 }, { "epoch": 0.885525921486365, "grad_norm": 2.805663585662842, "learning_rate": 5.894240069364389e-05, "loss": 0.9222, "step": 17730 }, { "epoch": 0.8860253720907002, "grad_norm": 2.8470332622528076, "learning_rate": 5.890380365112272e-05, "loss": 0.7656, "step": 17740 }, { "epoch": 0.8865248226950354, "grad_norm": 1.383072853088379, "learning_rate": 5.886520112835128e-05, "loss": 0.7539, "step": 17750 }, { "epoch": 0.8870242732993707, "grad_norm": 1.161964774131775, "learning_rate": 5.8826593149089235e-05, "loss": 0.8152, "step": 17760 }, { "epoch": 0.8875237239037059, "grad_norm": 1.071505069732666, "learning_rate": 5.878797973709964e-05, "loss": 0.7832, "step": 17770 }, { "epoch": 0.8880231745080411, "grad_norm": 0.9550650119781494, "learning_rate": 5.8749360916148865e-05, "loss": 0.7887, "step": 17780 }, { "epoch": 0.8885226251123763, "grad_norm": 2.1567580699920654, "learning_rate": 5.871073671000661e-05, "loss": 0.9789, "step": 17790 }, { "epoch": 0.8890220757167117, "grad_norm": 3.1140804290771484, "learning_rate": 5.8672107142445906e-05, "loss": 1.1522, "step": 17800 }, { "epoch": 0.8895215263210469, "grad_norm": 3.8770267963409424, "learning_rate": 5.8633472237243083e-05, "loss": 0.9059, "step": 17810 }, { "epoch": 0.8900209769253821, "grad_norm": 1.022334098815918, "learning_rate": 5.859483201817772e-05, "loss": 0.9102, "step": 17820 }, { "epoch": 0.8905204275297173, "grad_norm": 2.7335667610168457, "learning_rate": 5.855618650903272e-05, "loss": 0.9173, "step": 17830 }, { "epoch": 0.8910198781340526, "grad_norm": 1.1789907217025757, "learning_rate": 5.851753573359419e-05, "loss": 0.8724, "step": 17840 }, { "epoch": 0.8915193287383878, "grad_norm": 1.1129345893859863, "learning_rate": 5.8478879715651547e-05, "loss": 0.6434, "step": 17850 }, { "epoch": 0.892018779342723, "grad_norm": 2.9063918590545654, "learning_rate": 5.844021847899734e-05, "loss": 1.0347, "step": 17860 }, { "epoch": 0.8925182299470582, "grad_norm": 1.131264328956604, "learning_rate": 5.8401552047427444e-05, "loss": 0.9652, "step": 17870 }, { "epoch": 0.8930176805513935, "grad_norm": 0.964443564414978, "learning_rate": 5.83628804447408e-05, "loss": 0.9808, "step": 17880 }, { "epoch": 0.8935171311557287, "grad_norm": 2.7607600688934326, "learning_rate": 5.8324203694739664e-05, "loss": 0.9103, "step": 17890 }, { "epoch": 0.8940165817600639, "grad_norm": 1.1801255941390991, "learning_rate": 5.828552182122936e-05, "loss": 0.7595, "step": 17900 }, { "epoch": 0.8945160323643991, "grad_norm": 1.7192250490188599, "learning_rate": 5.8246834848018425e-05, "loss": 0.7443, "step": 17910 }, { "epoch": 0.8950154829687343, "grad_norm": 1.1626238822937012, "learning_rate": 5.82081427989185e-05, "loss": 0.8838, "step": 17920 }, { "epoch": 0.8955149335730697, "grad_norm": 1.4231460094451904, "learning_rate": 5.816944569774436e-05, "loss": 0.795, "step": 17930 }, { "epoch": 0.8960143841774049, "grad_norm": 1.108681082725525, "learning_rate": 5.81307435683139e-05, "loss": 0.9652, "step": 17940 }, { "epoch": 0.8965138347817401, "grad_norm": 1.17128324508667, "learning_rate": 5.8092036434448115e-05, "loss": 0.6767, "step": 17950 }, { "epoch": 0.8970132853860753, "grad_norm": 4.2591986656188965, "learning_rate": 5.805332431997106e-05, "loss": 1.0136, "step": 17960 }, { "epoch": 0.8975127359904106, "grad_norm": 2.6580145359039307, "learning_rate": 5.8014607248709843e-05, "loss": 1.0335, "step": 17970 }, { "epoch": 0.8980121865947458, "grad_norm": 3.448183059692383, "learning_rate": 5.7975885244494665e-05, "loss": 0.9011, "step": 17980 }, { "epoch": 0.898511637199081, "grad_norm": 3.073150634765625, "learning_rate": 5.7937158331158756e-05, "loss": 0.9683, "step": 17990 }, { "epoch": 0.8990110878034162, "grad_norm": 1.6009252071380615, "learning_rate": 5.789842653253832e-05, "loss": 0.8209, "step": 18000 }, { "epoch": 0.8995105384077515, "grad_norm": 2.7628567218780518, "learning_rate": 5.7859689872472614e-05, "loss": 0.6138, "step": 18010 }, { "epoch": 0.9000099890120867, "grad_norm": 1.1852062940597534, "learning_rate": 5.78209483748039e-05, "loss": 0.9531, "step": 18020 }, { "epoch": 0.9005094396164219, "grad_norm": 2.6659107208251953, "learning_rate": 5.7782202063377346e-05, "loss": 0.7996, "step": 18030 }, { "epoch": 0.9010088902207571, "grad_norm": 3.0761845111846924, "learning_rate": 5.774345096204117e-05, "loss": 0.9946, "step": 18040 }, { "epoch": 0.9015083408250923, "grad_norm": 2.088696241378784, "learning_rate": 5.7704695094646455e-05, "loss": 0.8477, "step": 18050 }, { "epoch": 0.9020077914294277, "grad_norm": 1.6839580535888672, "learning_rate": 5.766593448504729e-05, "loss": 0.7837, "step": 18060 }, { "epoch": 0.9025072420337629, "grad_norm": 2.36403751373291, "learning_rate": 5.762716915710065e-05, "loss": 0.739, "step": 18070 }, { "epoch": 0.9030066926380981, "grad_norm": 0.5672938823699951, "learning_rate": 5.758839913466642e-05, "loss": 0.7367, "step": 18080 }, { "epoch": 0.9035061432424333, "grad_norm": 2.3402769565582275, "learning_rate": 5.754962444160735e-05, "loss": 0.7342, "step": 18090 }, { "epoch": 0.9040055938467686, "grad_norm": 3.5708863735198975, "learning_rate": 5.75108451017891e-05, "loss": 0.8345, "step": 18100 }, { "epoch": 0.9045050444511038, "grad_norm": 3.8043088912963867, "learning_rate": 5.747206113908017e-05, "loss": 0.8163, "step": 18110 }, { "epoch": 0.905004495055439, "grad_norm": 0.3083461821079254, "learning_rate": 5.7433272577351936e-05, "loss": 0.8427, "step": 18120 }, { "epoch": 0.9055039456597742, "grad_norm": 1.7447819709777832, "learning_rate": 5.739447944047854e-05, "loss": 0.7814, "step": 18130 }, { "epoch": 0.9060033962641095, "grad_norm": 2.5935428142547607, "learning_rate": 5.7355681752337e-05, "loss": 0.7454, "step": 18140 }, { "epoch": 0.9065028468684447, "grad_norm": 1.5312762260437012, "learning_rate": 5.731687953680711e-05, "loss": 0.7288, "step": 18150 }, { "epoch": 0.9070022974727799, "grad_norm": 0.5429087281227112, "learning_rate": 5.727807281777147e-05, "loss": 0.7177, "step": 18160 }, { "epoch": 0.9075017480771151, "grad_norm": 3.9769444465637207, "learning_rate": 5.7239261619115415e-05, "loss": 0.8287, "step": 18170 }, { "epoch": 0.9080011986814503, "grad_norm": 1.3674291372299194, "learning_rate": 5.720044596472709e-05, "loss": 0.6996, "step": 18180 }, { "epoch": 0.9085006492857857, "grad_norm": 5.381434440612793, "learning_rate": 5.7161625878497294e-05, "loss": 0.8507, "step": 18190 }, { "epoch": 0.9090000998901209, "grad_norm": 4.628538131713867, "learning_rate": 5.712280138431969e-05, "loss": 0.7252, "step": 18200 }, { "epoch": 0.9094995504944561, "grad_norm": 2.7768666744232178, "learning_rate": 5.708397250609051e-05, "loss": 0.77, "step": 18210 }, { "epoch": 0.9099990010987913, "grad_norm": 2.563023090362549, "learning_rate": 5.70451392677088e-05, "loss": 0.6975, "step": 18220 }, { "epoch": 0.9104984517031266, "grad_norm": 1.5395959615707397, "learning_rate": 5.7006301693076204e-05, "loss": 0.7916, "step": 18230 }, { "epoch": 0.9109979023074618, "grad_norm": 2.5181050300598145, "learning_rate": 5.696745980609708e-05, "loss": 0.9363, "step": 18240 }, { "epoch": 0.911497352911797, "grad_norm": 1.535658597946167, "learning_rate": 5.692861363067843e-05, "loss": 0.7894, "step": 18250 }, { "epoch": 0.9119968035161322, "grad_norm": 1.0996131896972656, "learning_rate": 5.6889763190729916e-05, "loss": 0.8189, "step": 18260 }, { "epoch": 0.9124962541204675, "grad_norm": 1.8074034452438354, "learning_rate": 5.685090851016378e-05, "loss": 0.8101, "step": 18270 }, { "epoch": 0.9129957047248027, "grad_norm": 3.0158116817474365, "learning_rate": 5.68120496128949e-05, "loss": 0.7849, "step": 18280 }, { "epoch": 0.9134951553291379, "grad_norm": 3.0711352825164795, "learning_rate": 5.677318652284078e-05, "loss": 1.0275, "step": 18290 }, { "epoch": 0.9139946059334731, "grad_norm": 5.088053226470947, "learning_rate": 5.673431926392144e-05, "loss": 0.8505, "step": 18300 }, { "epoch": 0.9144940565378085, "grad_norm": 1.0952471494674683, "learning_rate": 5.6695447860059516e-05, "loss": 0.6712, "step": 18310 }, { "epoch": 0.9149935071421437, "grad_norm": 3.6260998249053955, "learning_rate": 5.665657233518018e-05, "loss": 0.7956, "step": 18320 }, { "epoch": 0.9154929577464789, "grad_norm": 0.613185465335846, "learning_rate": 5.661769271321114e-05, "loss": 0.8378, "step": 18330 }, { "epoch": 0.9159924083508141, "grad_norm": 0.9299529790878296, "learning_rate": 5.657880901808259e-05, "loss": 0.8693, "step": 18340 }, { "epoch": 0.9164918589551493, "grad_norm": 1.4949249029159546, "learning_rate": 5.6539921273727324e-05, "loss": 0.9464, "step": 18350 }, { "epoch": 0.9169913095594846, "grad_norm": 2.3860533237457275, "learning_rate": 5.650102950408051e-05, "loss": 0.7277, "step": 18360 }, { "epoch": 0.9174907601638198, "grad_norm": 2.837008237838745, "learning_rate": 5.646213373307992e-05, "loss": 0.8322, "step": 18370 }, { "epoch": 0.917990210768155, "grad_norm": 0.9468108415603638, "learning_rate": 5.6423233984665646e-05, "loss": 0.869, "step": 18380 }, { "epoch": 0.9184896613724902, "grad_norm": 0.9317816495895386, "learning_rate": 5.638433028278038e-05, "loss": 0.8345, "step": 18390 }, { "epoch": 0.9189891119768255, "grad_norm": 2.894892692565918, "learning_rate": 5.634542265136911e-05, "loss": 0.9068, "step": 18400 }, { "epoch": 0.9194885625811607, "grad_norm": 2.379563570022583, "learning_rate": 5.630651111437935e-05, "loss": 0.8521, "step": 18410 }, { "epoch": 0.919988013185496, "grad_norm": 1.6990435123443604, "learning_rate": 5.626759569576094e-05, "loss": 0.9663, "step": 18420 }, { "epoch": 0.9204874637898312, "grad_norm": 0.7252857685089111, "learning_rate": 5.622867641946617e-05, "loss": 0.8153, "step": 18430 }, { "epoch": 0.9209869143941665, "grad_norm": 1.4249210357666016, "learning_rate": 5.618975330944966e-05, "loss": 0.9578, "step": 18440 }, { "epoch": 0.9214863649985017, "grad_norm": 1.74830162525177, "learning_rate": 5.615082638966839e-05, "loss": 0.6519, "step": 18450 }, { "epoch": 0.9219858156028369, "grad_norm": 1.5790170431137085, "learning_rate": 5.6111895684081725e-05, "loss": 0.9353, "step": 18460 }, { "epoch": 0.9224852662071721, "grad_norm": 0.9895839691162109, "learning_rate": 5.607296121665133e-05, "loss": 0.7957, "step": 18470 }, { "epoch": 0.9229847168115073, "grad_norm": 1.759394645690918, "learning_rate": 5.60340230113412e-05, "loss": 0.884, "step": 18480 }, { "epoch": 0.9234841674158426, "grad_norm": 2.133049488067627, "learning_rate": 5.599508109211759e-05, "loss": 0.8668, "step": 18490 }, { "epoch": 0.9239836180201778, "grad_norm": 2.912149429321289, "learning_rate": 5.59561354829491e-05, "loss": 0.7475, "step": 18500 }, { "epoch": 0.924483068624513, "grad_norm": 2.350933313369751, "learning_rate": 5.591718620780657e-05, "loss": 0.9002, "step": 18510 }, { "epoch": 0.9249825192288482, "grad_norm": 4.567841529846191, "learning_rate": 5.587823329066308e-05, "loss": 0.7602, "step": 18520 }, { "epoch": 0.9254819698331835, "grad_norm": 5.905770301818848, "learning_rate": 5.583927675549401e-05, "loss": 0.8341, "step": 18530 }, { "epoch": 0.9259814204375187, "grad_norm": 1.3211944103240967, "learning_rate": 5.58003166262769e-05, "loss": 0.9097, "step": 18540 }, { "epoch": 0.926480871041854, "grad_norm": 1.3427737951278687, "learning_rate": 5.576135292699153e-05, "loss": 0.7227, "step": 18550 }, { "epoch": 0.9269803216461892, "grad_norm": 0.635680079460144, "learning_rate": 5.5722385681619894e-05, "loss": 0.7066, "step": 18560 }, { "epoch": 0.9274797722505245, "grad_norm": 1.1708614826202393, "learning_rate": 5.568341491414613e-05, "loss": 0.7015, "step": 18570 }, { "epoch": 0.9279792228548597, "grad_norm": 2.472395896911621, "learning_rate": 5.5644440648556595e-05, "loss": 0.8477, "step": 18580 }, { "epoch": 0.9284786734591949, "grad_norm": 2.3495097160339355, "learning_rate": 5.5605462908839746e-05, "loss": 0.7455, "step": 18590 }, { "epoch": 0.9289781240635301, "grad_norm": 1.7436636686325073, "learning_rate": 5.556648171898623e-05, "loss": 0.9545, "step": 18600 }, { "epoch": 0.9294775746678654, "grad_norm": 3.8654696941375732, "learning_rate": 5.552749710298875e-05, "loss": 0.8446, "step": 18610 }, { "epoch": 0.9299770252722006, "grad_norm": 2.026738166809082, "learning_rate": 5.548850908484219e-05, "loss": 0.8007, "step": 18620 }, { "epoch": 0.9304764758765358, "grad_norm": 1.5802265405654907, "learning_rate": 5.5449517688543486e-05, "loss": 0.7624, "step": 18630 }, { "epoch": 0.930975926480871, "grad_norm": 2.023195743560791, "learning_rate": 5.541052293809167e-05, "loss": 0.7756, "step": 18640 }, { "epoch": 0.9314753770852062, "grad_norm": 1.725743055343628, "learning_rate": 5.537152485748782e-05, "loss": 0.9886, "step": 18650 }, { "epoch": 0.9319748276895415, "grad_norm": 2.1248693466186523, "learning_rate": 5.5332523470735086e-05, "loss": 0.8428, "step": 18660 }, { "epoch": 0.9324742782938767, "grad_norm": 1.0478453636169434, "learning_rate": 5.529351880183863e-05, "loss": 0.6858, "step": 18670 }, { "epoch": 0.932973728898212, "grad_norm": 2.333965539932251, "learning_rate": 5.525451087480568e-05, "loss": 0.9799, "step": 18680 }, { "epoch": 0.9334731795025472, "grad_norm": 3.503141164779663, "learning_rate": 5.521549971364538e-05, "loss": 0.918, "step": 18690 }, { "epoch": 0.9339726301068825, "grad_norm": 1.8752412796020508, "learning_rate": 5.5176485342368964e-05, "loss": 0.7339, "step": 18700 }, { "epoch": 0.9344720807112177, "grad_norm": 2.4250271320343018, "learning_rate": 5.5137467784989595e-05, "loss": 0.8915, "step": 18710 }, { "epoch": 0.9349715313155529, "grad_norm": 1.8618446588516235, "learning_rate": 5.5098447065522386e-05, "loss": 0.8496, "step": 18720 }, { "epoch": 0.9354709819198881, "grad_norm": 1.5213110446929932, "learning_rate": 5.5059423207984407e-05, "loss": 0.8608, "step": 18730 }, { "epoch": 0.9359704325242234, "grad_norm": 2.1037065982818604, "learning_rate": 5.502039623639468e-05, "loss": 0.9171, "step": 18740 }, { "epoch": 0.9364698831285586, "grad_norm": 2.4831433296203613, "learning_rate": 5.498136617477413e-05, "loss": 0.821, "step": 18750 }, { "epoch": 0.9369693337328938, "grad_norm": 2.180809497833252, "learning_rate": 5.494233304714556e-05, "loss": 0.8884, "step": 18760 }, { "epoch": 0.937468784337229, "grad_norm": 3.107668399810791, "learning_rate": 5.4903296877533685e-05, "loss": 0.8365, "step": 18770 }, { "epoch": 0.9379682349415642, "grad_norm": 0.9787671566009521, "learning_rate": 5.4864257689965105e-05, "loss": 0.8015, "step": 18780 }, { "epoch": 0.9384676855458995, "grad_norm": 1.7403486967086792, "learning_rate": 5.482521550846823e-05, "loss": 0.9814, "step": 18790 }, { "epoch": 0.9389671361502347, "grad_norm": 1.2914844751358032, "learning_rate": 5.478617035707337e-05, "loss": 0.6709, "step": 18800 }, { "epoch": 0.93946658675457, "grad_norm": 1.861228346824646, "learning_rate": 5.4747122259812655e-05, "loss": 0.6572, "step": 18810 }, { "epoch": 0.9399660373589052, "grad_norm": 2.417015552520752, "learning_rate": 5.470807124071996e-05, "loss": 0.7708, "step": 18820 }, { "epoch": 0.9404654879632405, "grad_norm": 3.1048529148101807, "learning_rate": 5.466901732383103e-05, "loss": 0.8148, "step": 18830 }, { "epoch": 0.9409649385675757, "grad_norm": 1.4176899194717407, "learning_rate": 5.4629960533183375e-05, "loss": 0.9662, "step": 18840 }, { "epoch": 0.9414643891719109, "grad_norm": 1.4135115146636963, "learning_rate": 5.459090089281629e-05, "loss": 1.0568, "step": 18850 }, { "epoch": 0.9419638397762461, "grad_norm": 1.8020182847976685, "learning_rate": 5.455183842677076e-05, "loss": 0.8499, "step": 18860 }, { "epoch": 0.9424632903805814, "grad_norm": 1.875177025794983, "learning_rate": 5.451277315908959e-05, "loss": 0.6807, "step": 18870 }, { "epoch": 0.9429627409849166, "grad_norm": 1.8055870532989502, "learning_rate": 5.4473705113817254e-05, "loss": 1.0195, "step": 18880 }, { "epoch": 0.9434621915892518, "grad_norm": 3.1722288131713867, "learning_rate": 5.443463431499997e-05, "loss": 0.9666, "step": 18890 }, { "epoch": 0.943961642193587, "grad_norm": 2.5591423511505127, "learning_rate": 5.4395560786685606e-05, "loss": 0.7662, "step": 18900 }, { "epoch": 0.9444610927979222, "grad_norm": 2.3718013763427734, "learning_rate": 5.435648455292378e-05, "loss": 0.784, "step": 18910 }, { "epoch": 0.9449605434022575, "grad_norm": 2.845883846282959, "learning_rate": 5.431740563776568e-05, "loss": 0.9012, "step": 18920 }, { "epoch": 0.9454599940065928, "grad_norm": 0.9969069957733154, "learning_rate": 5.427832406526427e-05, "loss": 0.7442, "step": 18930 }, { "epoch": 0.945959444610928, "grad_norm": 2.8231468200683594, "learning_rate": 5.4239239859474026e-05, "loss": 0.9244, "step": 18940 }, { "epoch": 0.9464588952152632, "grad_norm": 2.8638861179351807, "learning_rate": 5.420015304445112e-05, "loss": 0.7635, "step": 18950 }, { "epoch": 0.9469583458195985, "grad_norm": 1.2569690942764282, "learning_rate": 5.4161063644253284e-05, "loss": 1.0894, "step": 18960 }, { "epoch": 0.9474577964239337, "grad_norm": 1.3123199939727783, "learning_rate": 5.4121971682939885e-05, "loss": 0.8007, "step": 18970 }, { "epoch": 0.9479572470282689, "grad_norm": 1.932054042816162, "learning_rate": 5.408287718457185e-05, "loss": 0.7395, "step": 18980 }, { "epoch": 0.9484566976326041, "grad_norm": 1.4530482292175293, "learning_rate": 5.4043780173211635e-05, "loss": 0.8638, "step": 18990 }, { "epoch": 0.9489561482369394, "grad_norm": 2.344294786453247, "learning_rate": 5.400468067292329e-05, "loss": 0.9084, "step": 19000 }, { "epoch": 0.9494555988412746, "grad_norm": 2.032836437225342, "learning_rate": 5.396557870777239e-05, "loss": 1.0318, "step": 19010 }, { "epoch": 0.9499550494456098, "grad_norm": 6.245739936828613, "learning_rate": 5.3926474301826e-05, "loss": 0.7927, "step": 19020 }, { "epoch": 0.950454500049945, "grad_norm": 1.9433112144470215, "learning_rate": 5.3887367479152706e-05, "loss": 0.6354, "step": 19030 }, { "epoch": 0.9509539506542803, "grad_norm": 1.9389113187789917, "learning_rate": 5.384825826382257e-05, "loss": 0.8145, "step": 19040 }, { "epoch": 0.9514534012586155, "grad_norm": 1.4428133964538574, "learning_rate": 5.380914667990714e-05, "loss": 0.7581, "step": 19050 }, { "epoch": 0.9519528518629508, "grad_norm": 2.562809944152832, "learning_rate": 5.377003275147943e-05, "loss": 0.7327, "step": 19060 }, { "epoch": 0.952452302467286, "grad_norm": 1.8769131898880005, "learning_rate": 5.373091650261385e-05, "loss": 0.7855, "step": 19070 }, { "epoch": 0.9529517530716212, "grad_norm": 1.8019176721572876, "learning_rate": 5.3691797957386316e-05, "loss": 0.8249, "step": 19080 }, { "epoch": 0.9534512036759565, "grad_norm": 2.966636896133423, "learning_rate": 5.365267713987407e-05, "loss": 0.8536, "step": 19090 }, { "epoch": 0.9539506542802917, "grad_norm": 5.881937503814697, "learning_rate": 5.3613554074155815e-05, "loss": 0.9598, "step": 19100 }, { "epoch": 0.9544501048846269, "grad_norm": 1.0772712230682373, "learning_rate": 5.3574428784311624e-05, "loss": 0.7772, "step": 19110 }, { "epoch": 0.9549495554889621, "grad_norm": 4.194241523742676, "learning_rate": 5.353530129442293e-05, "loss": 0.7927, "step": 19120 }, { "epoch": 0.9554490060932974, "grad_norm": 1.3591468334197998, "learning_rate": 5.349617162857251e-05, "loss": 0.6133, "step": 19130 }, { "epoch": 0.9559484566976326, "grad_norm": 2.7203056812286377, "learning_rate": 5.345703981084451e-05, "loss": 0.8675, "step": 19140 }, { "epoch": 0.9564479073019678, "grad_norm": 3.1273722648620605, "learning_rate": 5.341790586532438e-05, "loss": 0.7179, "step": 19150 }, { "epoch": 0.956947357906303, "grad_norm": 1.601722240447998, "learning_rate": 5.33787698160989e-05, "loss": 1.0063, "step": 19160 }, { "epoch": 0.9574468085106383, "grad_norm": 1.9342644214630127, "learning_rate": 5.3339631687256084e-05, "loss": 0.816, "step": 19170 }, { "epoch": 0.9579462591149736, "grad_norm": 2.065150737762451, "learning_rate": 5.330049150288531e-05, "loss": 0.7717, "step": 19180 }, { "epoch": 0.9584457097193088, "grad_norm": 1.3680709600448608, "learning_rate": 5.326134928707716e-05, "loss": 0.7374, "step": 19190 }, { "epoch": 0.958945160323644, "grad_norm": 3.846485137939453, "learning_rate": 5.322220506392352e-05, "loss": 0.8245, "step": 19200 }, { "epoch": 0.9594446109279792, "grad_norm": 2.7010490894317627, "learning_rate": 5.318305885751742e-05, "loss": 0.8996, "step": 19210 }, { "epoch": 0.9599440615323145, "grad_norm": 1.8771228790283203, "learning_rate": 5.3143910691953234e-05, "loss": 0.6841, "step": 19220 }, { "epoch": 0.9604435121366497, "grad_norm": 1.0770602226257324, "learning_rate": 5.310476059132645e-05, "loss": 0.8859, "step": 19230 }, { "epoch": 0.9609429627409849, "grad_norm": 1.264762282371521, "learning_rate": 5.3065608579733775e-05, "loss": 0.78, "step": 19240 }, { "epoch": 0.9614424133453201, "grad_norm": 0.8207452893257141, "learning_rate": 5.30264546812731e-05, "loss": 0.8977, "step": 19250 }, { "epoch": 0.9619418639496554, "grad_norm": 1.057423710823059, "learning_rate": 5.2987298920043435e-05, "loss": 0.7647, "step": 19260 }, { "epoch": 0.9624413145539906, "grad_norm": 2.5369441509246826, "learning_rate": 5.294814132014503e-05, "loss": 0.7623, "step": 19270 }, { "epoch": 0.9629407651583258, "grad_norm": 1.0441887378692627, "learning_rate": 5.290898190567917e-05, "loss": 0.9578, "step": 19280 }, { "epoch": 0.963440215762661, "grad_norm": 3.11104679107666, "learning_rate": 5.28698207007483e-05, "loss": 1.0426, "step": 19290 }, { "epoch": 0.9639396663669964, "grad_norm": 2.6482746601104736, "learning_rate": 5.283065772945594e-05, "loss": 0.8509, "step": 19300 }, { "epoch": 0.9644391169713316, "grad_norm": 1.1781560182571411, "learning_rate": 5.279149301590679e-05, "loss": 0.792, "step": 19310 }, { "epoch": 0.9649385675756668, "grad_norm": 1.125461220741272, "learning_rate": 5.275232658420648e-05, "loss": 0.6797, "step": 19320 }, { "epoch": 0.965438018180002, "grad_norm": 0.8164302110671997, "learning_rate": 5.271315845846181e-05, "loss": 0.8455, "step": 19330 }, { "epoch": 0.9659374687843373, "grad_norm": 4.994171142578125, "learning_rate": 5.267398866278054e-05, "loss": 0.9375, "step": 19340 }, { "epoch": 0.9664369193886725, "grad_norm": 2.1752095222473145, "learning_rate": 5.2634817221271534e-05, "loss": 0.898, "step": 19350 }, { "epoch": 0.9669363699930077, "grad_norm": 1.8746306896209717, "learning_rate": 5.2595644158044634e-05, "loss": 0.8818, "step": 19360 }, { "epoch": 0.9674358205973429, "grad_norm": 1.6423358917236328, "learning_rate": 5.2556469497210684e-05, "loss": 0.983, "step": 19370 }, { "epoch": 0.9679352712016781, "grad_norm": 2.424043655395508, "learning_rate": 5.251729326288147e-05, "loss": 1.0658, "step": 19380 }, { "epoch": 0.9684347218060134, "grad_norm": 1.116323709487915, "learning_rate": 5.247811547916982e-05, "loss": 0.7563, "step": 19390 }, { "epoch": 0.9689341724103486, "grad_norm": 2.169189214706421, "learning_rate": 5.243893617018945e-05, "loss": 0.7063, "step": 19400 }, { "epoch": 0.9694336230146838, "grad_norm": 1.8824118375778198, "learning_rate": 5.23997553600551e-05, "loss": 0.9368, "step": 19410 }, { "epoch": 0.969933073619019, "grad_norm": 2.2264509201049805, "learning_rate": 5.2360573072882334e-05, "loss": 0.7755, "step": 19420 }, { "epoch": 0.9704325242233544, "grad_norm": 6.045483112335205, "learning_rate": 5.2321389332787664e-05, "loss": 1.132, "step": 19430 }, { "epoch": 0.9709319748276896, "grad_norm": 1.84587824344635, "learning_rate": 5.228220416388854e-05, "loss": 0.8128, "step": 19440 }, { "epoch": 0.9714314254320248, "grad_norm": 1.2925611734390259, "learning_rate": 5.224301759030321e-05, "loss": 0.8222, "step": 19450 }, { "epoch": 0.97193087603636, "grad_norm": 3.026948928833008, "learning_rate": 5.220382963615086e-05, "loss": 1.1625, "step": 19460 }, { "epoch": 0.9724303266406953, "grad_norm": 4.627734661102295, "learning_rate": 5.2164640325551484e-05, "loss": 0.771, "step": 19470 }, { "epoch": 0.9729297772450305, "grad_norm": 2.621882200241089, "learning_rate": 5.212544968262594e-05, "loss": 0.8294, "step": 19480 }, { "epoch": 0.9734292278493657, "grad_norm": 3.361600875854492, "learning_rate": 5.2086257731495856e-05, "loss": 0.9133, "step": 19490 }, { "epoch": 0.9739286784537009, "grad_norm": 2.0608766078948975, "learning_rate": 5.204706449628374e-05, "loss": 0.8347, "step": 19500 }, { "epoch": 0.9744281290580361, "grad_norm": 1.9795416593551636, "learning_rate": 5.20078700011128e-05, "loss": 0.7975, "step": 19510 }, { "epoch": 0.9749275796623714, "grad_norm": 2.003333806991577, "learning_rate": 5.196867427010711e-05, "loss": 0.8248, "step": 19520 }, { "epoch": 0.9754270302667066, "grad_norm": 4.734315872192383, "learning_rate": 5.192947732739143e-05, "loss": 0.8322, "step": 19530 }, { "epoch": 0.9759264808710418, "grad_norm": 2.263054609298706, "learning_rate": 5.189027919709133e-05, "loss": 0.9079, "step": 19540 }, { "epoch": 0.976425931475377, "grad_norm": 1.3464040756225586, "learning_rate": 5.185107990333306e-05, "loss": 0.7659, "step": 19550 }, { "epoch": 0.9769253820797124, "grad_norm": 1.762438416481018, "learning_rate": 5.1811879470243595e-05, "loss": 0.8269, "step": 19560 }, { "epoch": 0.9774248326840476, "grad_norm": 1.8302335739135742, "learning_rate": 5.1772677921950643e-05, "loss": 0.8949, "step": 19570 }, { "epoch": 0.9779242832883828, "grad_norm": 2.3929293155670166, "learning_rate": 5.1733475282582565e-05, "loss": 0.8596, "step": 19580 }, { "epoch": 0.978423733892718, "grad_norm": 2.744567632675171, "learning_rate": 5.1694271576268415e-05, "loss": 1.0234, "step": 19590 }, { "epoch": 0.9789231844970533, "grad_norm": 1.7753416299819946, "learning_rate": 5.165506682713788e-05, "loss": 0.9801, "step": 19600 }, { "epoch": 0.9794226351013885, "grad_norm": 1.1553436517715454, "learning_rate": 5.161586105932131e-05, "loss": 0.9044, "step": 19610 }, { "epoch": 0.9799220857057237, "grad_norm": 4.240505695343018, "learning_rate": 5.15766542969497e-05, "loss": 1.082, "step": 19620 }, { "epoch": 0.9804215363100589, "grad_norm": 1.8582956790924072, "learning_rate": 5.153744656415459e-05, "loss": 0.9097, "step": 19630 }, { "epoch": 0.9809209869143942, "grad_norm": 1.5120314359664917, "learning_rate": 5.149823788506818e-05, "loss": 0.7637, "step": 19640 }, { "epoch": 0.9814204375187294, "grad_norm": 1.2795964479446411, "learning_rate": 5.145902828382323e-05, "loss": 0.7, "step": 19650 }, { "epoch": 0.9819198881230646, "grad_norm": 1.9886530637741089, "learning_rate": 5.141981778455308e-05, "loss": 0.8157, "step": 19660 }, { "epoch": 0.9824193387273998, "grad_norm": 1.139605164527893, "learning_rate": 5.1380606411391594e-05, "loss": 0.6963, "step": 19670 }, { "epoch": 0.982918789331735, "grad_norm": 3.01100754737854, "learning_rate": 5.134139418847321e-05, "loss": 0.6669, "step": 19680 }, { "epoch": 0.9834182399360704, "grad_norm": 0.967864990234375, "learning_rate": 5.130218113993285e-05, "loss": 0.8265, "step": 19690 }, { "epoch": 0.9839176905404056, "grad_norm": 1.3162167072296143, "learning_rate": 5.1262967289905974e-05, "loss": 0.8367, "step": 19700 }, { "epoch": 0.9844171411447408, "grad_norm": 0.8989589810371399, "learning_rate": 5.122375266252855e-05, "loss": 0.7617, "step": 19710 }, { "epoch": 0.984916591749076, "grad_norm": 2.836339235305786, "learning_rate": 5.118453728193696e-05, "loss": 0.8839, "step": 19720 }, { "epoch": 0.9854160423534113, "grad_norm": 2.4945757389068604, "learning_rate": 5.1145321172268115e-05, "loss": 0.8062, "step": 19730 }, { "epoch": 0.9859154929577465, "grad_norm": 3.778860330581665, "learning_rate": 5.110610435765934e-05, "loss": 0.9809, "step": 19740 }, { "epoch": 0.9864149435620817, "grad_norm": 2.4653830528259277, "learning_rate": 5.106688686224843e-05, "loss": 0.8808, "step": 19750 }, { "epoch": 0.9869143941664169, "grad_norm": 2.0902090072631836, "learning_rate": 5.102766871017355e-05, "loss": 0.8346, "step": 19760 }, { "epoch": 0.9874138447707522, "grad_norm": 3.120569944381714, "learning_rate": 5.0988449925573286e-05, "loss": 0.9277, "step": 19770 }, { "epoch": 0.9879132953750874, "grad_norm": 2.109555244445801, "learning_rate": 5.0949230532586635e-05, "loss": 0.7138, "step": 19780 }, { "epoch": 0.9884127459794226, "grad_norm": 1.553554892539978, "learning_rate": 5.0910010555352964e-05, "loss": 0.6749, "step": 19790 }, { "epoch": 0.9889121965837578, "grad_norm": 2.5567986965179443, "learning_rate": 5.087079001801196e-05, "loss": 0.7742, "step": 19800 }, { "epoch": 0.989411647188093, "grad_norm": 2.878876209259033, "learning_rate": 5.083156894470371e-05, "loss": 0.7715, "step": 19810 }, { "epoch": 0.9899110977924284, "grad_norm": 1.859236717224121, "learning_rate": 5.079234735956857e-05, "loss": 0.8226, "step": 19820 }, { "epoch": 0.9904105483967636, "grad_norm": 4.09383487701416, "learning_rate": 5.0753125286747285e-05, "loss": 0.8875, "step": 19830 }, { "epoch": 0.9909099990010988, "grad_norm": 1.2874923944473267, "learning_rate": 5.071390275038084e-05, "loss": 0.794, "step": 19840 }, { "epoch": 0.991409449605434, "grad_norm": 1.0898391008377075, "learning_rate": 5.067467977461053e-05, "loss": 0.8219, "step": 19850 }, { "epoch": 0.9919089002097693, "grad_norm": 1.6944223642349243, "learning_rate": 5.063545638357791e-05, "loss": 0.8407, "step": 19860 }, { "epoch": 0.9924083508141045, "grad_norm": 1.0680359601974487, "learning_rate": 5.059623260142481e-05, "loss": 0.8435, "step": 19870 }, { "epoch": 0.9929078014184397, "grad_norm": 2.1276001930236816, "learning_rate": 5.055700845229327e-05, "loss": 0.7866, "step": 19880 }, { "epoch": 0.9934072520227749, "grad_norm": 1.2395598888397217, "learning_rate": 5.0517783960325616e-05, "loss": 1.0635, "step": 19890 }, { "epoch": 0.9939067026271102, "grad_norm": 3.308525323867798, "learning_rate": 5.047855914966429e-05, "loss": 1.0118, "step": 19900 }, { "epoch": 0.9944061532314454, "grad_norm": 1.449928641319275, "learning_rate": 5.0439334044452e-05, "loss": 0.7627, "step": 19910 }, { "epoch": 0.9949056038357806, "grad_norm": 2.200239419937134, "learning_rate": 5.040010866883162e-05, "loss": 0.6629, "step": 19920 }, { "epoch": 0.9954050544401158, "grad_norm": 2.7821123600006104, "learning_rate": 5.036088304694622e-05, "loss": 0.7161, "step": 19930 }, { "epoch": 0.995904505044451, "grad_norm": 2.221611499786377, "learning_rate": 5.0321657202938935e-05, "loss": 0.8149, "step": 19940 }, { "epoch": 0.9964039556487864, "grad_norm": 0.8881192207336426, "learning_rate": 5.0282431160953116e-05, "loss": 0.8979, "step": 19950 }, { "epoch": 0.9969034062531216, "grad_norm": 1.5408036708831787, "learning_rate": 5.024320494513223e-05, "loss": 0.8557, "step": 19960 }, { "epoch": 0.9974028568574568, "grad_norm": 2.813300132751465, "learning_rate": 5.02039785796198e-05, "loss": 0.8963, "step": 19970 }, { "epoch": 0.997902307461792, "grad_norm": 1.3438774347305298, "learning_rate": 5.016475208855952e-05, "loss": 0.9493, "step": 19980 }, { "epoch": 0.9984017580661273, "grad_norm": 1.9065624475479126, "learning_rate": 5.012552549609505e-05, "loss": 0.8626, "step": 19990 }, { "epoch": 0.9989012086704625, "grad_norm": 2.3506124019622803, "learning_rate": 5.008629882637024e-05, "loss": 0.7836, "step": 20000 }, { "epoch": 0.9994006592747977, "grad_norm": 1.8984686136245728, "learning_rate": 5.004707210352888e-05, "loss": 1.0569, "step": 20010 }, { "epoch": 0.9999001098791329, "grad_norm": 1.8179553747177124, "learning_rate": 5.0007845351714875e-05, "loss": 0.9464, "step": 20020 } ], "logging_steps": 10, "max_steps": 40044, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2755084680839168e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }