{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 17646, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2496.8073030248033, "learning_rate": 5.6657223796033996e-09, "loss": 12.3922, "step": 1 }, { "epoch": 0.0, "grad_norm": 2798.9639739819045, "learning_rate": 2.8328611898017002e-08, "loss": 12.556, "step": 5 }, { "epoch": 0.0, "grad_norm": 3186.5811854816743, "learning_rate": 5.6657223796034004e-08, "loss": 12.6663, "step": 10 }, { "epoch": 0.0, "grad_norm": 2433.131292284797, "learning_rate": 8.4985835694051e-08, "loss": 11.8166, "step": 15 }, { "epoch": 0.0, "grad_norm": 927.7244167671865, "learning_rate": 1.1331444759206801e-07, "loss": 9.872, "step": 20 }, { "epoch": 0.0, "grad_norm": 598.7163941919265, "learning_rate": 1.41643059490085e-07, "loss": 8.1112, "step": 25 }, { "epoch": 0.01, "grad_norm": 593.7817469340148, "learning_rate": 1.69971671388102e-07, "loss": 7.6257, "step": 30 }, { "epoch": 0.01, "grad_norm": 361.1842229161203, "learning_rate": 1.9830028328611898e-07, "loss": 6.8663, "step": 35 }, { "epoch": 0.01, "grad_norm": 261.41360349547375, "learning_rate": 2.2662889518413602e-07, "loss": 6.297, "step": 40 }, { "epoch": 0.01, "grad_norm": 294.80563694492105, "learning_rate": 2.54957507082153e-07, "loss": 5.8564, "step": 45 }, { "epoch": 0.01, "grad_norm": 223.89911973808375, "learning_rate": 2.8328611898017e-07, "loss": 5.5386, "step": 50 }, { "epoch": 0.01, "grad_norm": 205.52507903905445, "learning_rate": 3.1161473087818695e-07, "loss": 5.232, "step": 55 }, { "epoch": 0.01, "grad_norm": 115.75404026789134, "learning_rate": 3.39943342776204e-07, "loss": 5.065, "step": 60 }, { "epoch": 0.01, "grad_norm": 76.96969560974317, "learning_rate": 3.6827195467422096e-07, "loss": 4.862, "step": 65 }, { "epoch": 0.01, "grad_norm": 80.34469970951176, "learning_rate": 3.9660056657223797e-07, "loss": 4.7516, "step": 70 }, { "epoch": 0.01, "grad_norm": 81.7595945293725, "learning_rate": 4.24929178470255e-07, "loss": 4.5732, "step": 75 }, { "epoch": 0.01, "grad_norm": 63.15931120806828, "learning_rate": 4.5325779036827203e-07, "loss": 4.4904, "step": 80 }, { "epoch": 0.01, "grad_norm": 56.380024867999694, "learning_rate": 4.815864022662889e-07, "loss": 4.3397, "step": 85 }, { "epoch": 0.02, "grad_norm": 60.056632903837375, "learning_rate": 5.09915014164306e-07, "loss": 4.2419, "step": 90 }, { "epoch": 0.02, "grad_norm": 57.72643638091866, "learning_rate": 5.382436260623229e-07, "loss": 4.2126, "step": 95 }, { "epoch": 0.02, "grad_norm": 44.8303385559054, "learning_rate": 5.6657223796034e-07, "loss": 4.083, "step": 100 }, { "epoch": 0.02, "grad_norm": 45.71148017336654, "learning_rate": 5.949008498583571e-07, "loss": 3.9693, "step": 105 }, { "epoch": 0.02, "grad_norm": 46.76823280991884, "learning_rate": 6.232294617563739e-07, "loss": 3.8478, "step": 110 }, { "epoch": 0.02, "grad_norm": 42.27034196603908, "learning_rate": 6.51558073654391e-07, "loss": 3.8355, "step": 115 }, { "epoch": 0.02, "grad_norm": 43.786683236286535, "learning_rate": 6.79886685552408e-07, "loss": 3.7417, "step": 120 }, { "epoch": 0.02, "grad_norm": 33.38556504010887, "learning_rate": 7.08215297450425e-07, "loss": 3.619, "step": 125 }, { "epoch": 0.02, "grad_norm": 38.77121164219573, "learning_rate": 7.365439093484419e-07, "loss": 3.6023, "step": 130 }, { "epoch": 0.02, "grad_norm": 35.85722948165567, "learning_rate": 7.64872521246459e-07, "loss": 3.5028, "step": 135 }, { "epoch": 0.02, "grad_norm": 34.87382466876409, "learning_rate": 7.932011331444759e-07, "loss": 3.3987, "step": 140 }, { "epoch": 0.02, "grad_norm": 28.499096674335632, "learning_rate": 8.215297450424931e-07, "loss": 3.3644, "step": 145 }, { "epoch": 0.03, "grad_norm": 26.379175262470692, "learning_rate": 8.4985835694051e-07, "loss": 3.2703, "step": 150 }, { "epoch": 0.03, "grad_norm": 29.865960399124674, "learning_rate": 8.781869688385269e-07, "loss": 3.2143, "step": 155 }, { "epoch": 0.03, "grad_norm": 25.246604944858476, "learning_rate": 9.065155807365441e-07, "loss": 3.1724, "step": 160 }, { "epoch": 0.03, "grad_norm": 26.803614997802278, "learning_rate": 9.34844192634561e-07, "loss": 3.1165, "step": 165 }, { "epoch": 0.03, "grad_norm": 24.868744499172813, "learning_rate": 9.631728045325779e-07, "loss": 3.0226, "step": 170 }, { "epoch": 0.03, "grad_norm": 28.072404376457335, "learning_rate": 9.91501416430595e-07, "loss": 2.974, "step": 175 }, { "epoch": 0.03, "grad_norm": 24.13981569942378, "learning_rate": 1.019830028328612e-06, "loss": 2.8986, "step": 180 }, { "epoch": 0.03, "grad_norm": 24.682144066119207, "learning_rate": 1.048158640226629e-06, "loss": 2.8929, "step": 185 }, { "epoch": 0.03, "grad_norm": 26.393633161733234, "learning_rate": 1.0764872521246459e-06, "loss": 2.8201, "step": 190 }, { "epoch": 0.03, "grad_norm": 25.165888181101067, "learning_rate": 1.104815864022663e-06, "loss": 2.7715, "step": 195 }, { "epoch": 0.03, "grad_norm": 24.497827155588414, "learning_rate": 1.13314447592068e-06, "loss": 2.7423, "step": 200 }, { "epoch": 0.03, "grad_norm": 22.577170865053922, "learning_rate": 1.1614730878186968e-06, "loss": 2.6814, "step": 205 }, { "epoch": 0.04, "grad_norm": 26.811068284154047, "learning_rate": 1.1898016997167141e-06, "loss": 2.681, "step": 210 }, { "epoch": 0.04, "grad_norm": 23.625896687896763, "learning_rate": 1.218130311614731e-06, "loss": 2.6641, "step": 215 }, { "epoch": 0.04, "grad_norm": 25.558263239239487, "learning_rate": 1.2464589235127478e-06, "loss": 2.6209, "step": 220 }, { "epoch": 0.04, "grad_norm": 28.59558876783872, "learning_rate": 1.274787535410765e-06, "loss": 2.6035, "step": 225 }, { "epoch": 0.04, "grad_norm": 22.99206869102384, "learning_rate": 1.303116147308782e-06, "loss": 2.5784, "step": 230 }, { "epoch": 0.04, "grad_norm": 24.770207084904367, "learning_rate": 1.331444759206799e-06, "loss": 2.5403, "step": 235 }, { "epoch": 0.04, "grad_norm": 26.943253914438525, "learning_rate": 1.359773371104816e-06, "loss": 2.4988, "step": 240 }, { "epoch": 0.04, "grad_norm": 28.669675628003457, "learning_rate": 1.388101983002833e-06, "loss": 2.4725, "step": 245 }, { "epoch": 0.04, "grad_norm": 32.51240263679756, "learning_rate": 1.41643059490085e-06, "loss": 2.4197, "step": 250 }, { "epoch": 0.04, "grad_norm": 24.03710460536725, "learning_rate": 1.444759206798867e-06, "loss": 2.4541, "step": 255 }, { "epoch": 0.04, "grad_norm": 30.538221432223352, "learning_rate": 1.4730878186968839e-06, "loss": 2.375, "step": 260 }, { "epoch": 0.05, "grad_norm": 27.4383780573818, "learning_rate": 1.5014164305949011e-06, "loss": 2.4054, "step": 265 }, { "epoch": 0.05, "grad_norm": 38.83226342367871, "learning_rate": 1.529745042492918e-06, "loss": 2.4019, "step": 270 }, { "epoch": 0.05, "grad_norm": 22.04189249067676, "learning_rate": 1.558073654390935e-06, "loss": 2.3203, "step": 275 }, { "epoch": 0.05, "grad_norm": 26.17958111536244, "learning_rate": 1.5864022662889519e-06, "loss": 2.3432, "step": 280 }, { "epoch": 0.05, "grad_norm": 22.95410575681228, "learning_rate": 1.614730878186969e-06, "loss": 2.3145, "step": 285 }, { "epoch": 0.05, "grad_norm": 23.93998688099864, "learning_rate": 1.6430594900849862e-06, "loss": 2.26, "step": 290 }, { "epoch": 0.05, "grad_norm": 21.32107695775869, "learning_rate": 1.671388101983003e-06, "loss": 2.2498, "step": 295 }, { "epoch": 0.05, "grad_norm": 24.70050367023582, "learning_rate": 1.69971671388102e-06, "loss": 2.2334, "step": 300 }, { "epoch": 0.05, "grad_norm": 48.55117360385103, "learning_rate": 1.728045325779037e-06, "loss": 2.2476, "step": 305 }, { "epoch": 0.05, "grad_norm": 27.055278892184113, "learning_rate": 1.7563739376770538e-06, "loss": 2.2039, "step": 310 }, { "epoch": 0.05, "grad_norm": 19.84999881904028, "learning_rate": 1.7847025495750709e-06, "loss": 2.1813, "step": 315 }, { "epoch": 0.05, "grad_norm": 40.40413661716398, "learning_rate": 1.8130311614730881e-06, "loss": 2.2304, "step": 320 }, { "epoch": 0.06, "grad_norm": 32.46690888295986, "learning_rate": 1.841359773371105e-06, "loss": 2.2263, "step": 325 }, { "epoch": 0.06, "grad_norm": 20.51738634970082, "learning_rate": 1.869688385269122e-06, "loss": 2.1234, "step": 330 }, { "epoch": 0.06, "grad_norm": 35.222336018483524, "learning_rate": 1.8980169971671389e-06, "loss": 2.1573, "step": 335 }, { "epoch": 0.06, "grad_norm": 37.7368422922636, "learning_rate": 1.9263456090651557e-06, "loss": 2.1452, "step": 340 }, { "epoch": 0.06, "grad_norm": 21.56463831517406, "learning_rate": 1.954674220963173e-06, "loss": 2.128, "step": 345 }, { "epoch": 0.06, "grad_norm": 23.972624335060562, "learning_rate": 1.98300283286119e-06, "loss": 2.1398, "step": 350 }, { "epoch": 0.06, "grad_norm": 20.213325978350014, "learning_rate": 2.011331444759207e-06, "loss": 2.0917, "step": 355 }, { "epoch": 0.06, "grad_norm": 32.978422655923445, "learning_rate": 2.039660056657224e-06, "loss": 2.0906, "step": 360 }, { "epoch": 0.06, "grad_norm": 24.072172345830047, "learning_rate": 2.067988668555241e-06, "loss": 2.073, "step": 365 }, { "epoch": 0.06, "grad_norm": 18.862216422542378, "learning_rate": 2.096317280453258e-06, "loss": 2.0942, "step": 370 }, { "epoch": 0.06, "grad_norm": 22.782579148305224, "learning_rate": 2.124645892351275e-06, "loss": 2.0978, "step": 375 }, { "epoch": 0.06, "grad_norm": 27.64608563553747, "learning_rate": 2.1529745042492918e-06, "loss": 2.0611, "step": 380 }, { "epoch": 0.07, "grad_norm": 25.729919004188048, "learning_rate": 2.181303116147309e-06, "loss": 2.047, "step": 385 }, { "epoch": 0.07, "grad_norm": 19.56297139440245, "learning_rate": 2.209631728045326e-06, "loss": 2.0339, "step": 390 }, { "epoch": 0.07, "grad_norm": 19.52987241347125, "learning_rate": 2.237960339943343e-06, "loss": 2.0449, "step": 395 }, { "epoch": 0.07, "grad_norm": 21.353166316715352, "learning_rate": 2.26628895184136e-06, "loss": 2.0223, "step": 400 }, { "epoch": 0.07, "grad_norm": 18.933589191705828, "learning_rate": 2.294617563739377e-06, "loss": 2.0439, "step": 405 }, { "epoch": 0.07, "grad_norm": 21.840396501445202, "learning_rate": 2.3229461756373937e-06, "loss": 2.0128, "step": 410 }, { "epoch": 0.07, "grad_norm": 38.57601116485757, "learning_rate": 2.3512747875354108e-06, "loss": 2.0437, "step": 415 }, { "epoch": 0.07, "grad_norm": 41.85780844791552, "learning_rate": 2.3796033994334282e-06, "loss": 1.9994, "step": 420 }, { "epoch": 0.07, "grad_norm": 35.48875979085677, "learning_rate": 2.407932011331445e-06, "loss": 2.0033, "step": 425 }, { "epoch": 0.07, "grad_norm": 43.38637323829772, "learning_rate": 2.436260623229462e-06, "loss": 2.005, "step": 430 }, { "epoch": 0.07, "grad_norm": 19.900163750049977, "learning_rate": 2.464589235127479e-06, "loss": 1.9753, "step": 435 }, { "epoch": 0.07, "grad_norm": 22.204455228895014, "learning_rate": 2.4929178470254956e-06, "loss": 1.9557, "step": 440 }, { "epoch": 0.08, "grad_norm": 17.74787616134596, "learning_rate": 2.521246458923513e-06, "loss": 1.9367, "step": 445 }, { "epoch": 0.08, "grad_norm": 21.03296681245724, "learning_rate": 2.54957507082153e-06, "loss": 1.9577, "step": 450 }, { "epoch": 0.08, "grad_norm": 23.070555798078754, "learning_rate": 2.577903682719547e-06, "loss": 1.9425, "step": 455 }, { "epoch": 0.08, "grad_norm": 18.095413622860136, "learning_rate": 2.606232294617564e-06, "loss": 1.9424, "step": 460 }, { "epoch": 0.08, "grad_norm": 37.1892575296801, "learning_rate": 2.634560906515581e-06, "loss": 1.941, "step": 465 }, { "epoch": 0.08, "grad_norm": 30.687534445988742, "learning_rate": 2.662889518413598e-06, "loss": 1.9047, "step": 470 }, { "epoch": 0.08, "grad_norm": 22.424833044102115, "learning_rate": 2.6912181303116146e-06, "loss": 1.9292, "step": 475 }, { "epoch": 0.08, "grad_norm": 27.360616215950174, "learning_rate": 2.719546742209632e-06, "loss": 1.9009, "step": 480 }, { "epoch": 0.08, "grad_norm": 40.15039463581183, "learning_rate": 2.747875354107649e-06, "loss": 1.9329, "step": 485 }, { "epoch": 0.08, "grad_norm": 22.5561162952693, "learning_rate": 2.776203966005666e-06, "loss": 1.8924, "step": 490 }, { "epoch": 0.08, "grad_norm": 20.21398000813242, "learning_rate": 2.804532577903683e-06, "loss": 1.8809, "step": 495 }, { "epoch": 0.09, "grad_norm": 32.098925635303246, "learning_rate": 2.8328611898017e-06, "loss": 1.9138, "step": 500 }, { "epoch": 0.09, "grad_norm": 42.05244685784008, "learning_rate": 2.861189801699717e-06, "loss": 1.8856, "step": 505 }, { "epoch": 0.09, "grad_norm": 25.005208485231954, "learning_rate": 2.889518413597734e-06, "loss": 1.841, "step": 510 }, { "epoch": 0.09, "grad_norm": 28.615268279304342, "learning_rate": 2.9178470254957506e-06, "loss": 1.8342, "step": 515 }, { "epoch": 0.09, "grad_norm": 36.647396681534886, "learning_rate": 2.9461756373937677e-06, "loss": 1.8696, "step": 520 }, { "epoch": 0.09, "grad_norm": 52.74806377562607, "learning_rate": 2.9745042492917848e-06, "loss": 1.8655, "step": 525 }, { "epoch": 0.09, "grad_norm": 35.82795457958816, "learning_rate": 3.0028328611898022e-06, "loss": 1.8683, "step": 530 }, { "epoch": 0.09, "grad_norm": 29.29875595295583, "learning_rate": 3.0311614730878193e-06, "loss": 1.8225, "step": 535 }, { "epoch": 0.09, "grad_norm": 31.058050277581376, "learning_rate": 3.059490084985836e-06, "loss": 1.8488, "step": 540 }, { "epoch": 0.09, "grad_norm": 23.73823300434112, "learning_rate": 3.087818696883853e-06, "loss": 1.8479, "step": 545 }, { "epoch": 0.09, "grad_norm": 19.57654252274149, "learning_rate": 3.11614730878187e-06, "loss": 1.8151, "step": 550 }, { "epoch": 0.09, "grad_norm": 17.0532264932674, "learning_rate": 3.144475920679887e-06, "loss": 1.8559, "step": 555 }, { "epoch": 0.1, "grad_norm": 17.48136081087222, "learning_rate": 3.1728045325779038e-06, "loss": 1.7867, "step": 560 }, { "epoch": 0.1, "grad_norm": 20.273223063451194, "learning_rate": 3.201133144475921e-06, "loss": 1.8184, "step": 565 }, { "epoch": 0.1, "grad_norm": 22.305551298403266, "learning_rate": 3.229461756373938e-06, "loss": 1.792, "step": 570 }, { "epoch": 0.1, "grad_norm": 51.38398971351818, "learning_rate": 3.257790368271955e-06, "loss": 1.8154, "step": 575 }, { "epoch": 0.1, "grad_norm": 49.786223800122116, "learning_rate": 3.2861189801699724e-06, "loss": 1.8044, "step": 580 }, { "epoch": 0.1, "grad_norm": 47.56380663604067, "learning_rate": 3.314447592067989e-06, "loss": 1.8327, "step": 585 }, { "epoch": 0.1, "grad_norm": 44.47524479659074, "learning_rate": 3.342776203966006e-06, "loss": 1.8105, "step": 590 }, { "epoch": 0.1, "grad_norm": 33.604777311500975, "learning_rate": 3.371104815864023e-06, "loss": 1.7709, "step": 595 }, { "epoch": 0.1, "grad_norm": 28.320348049863327, "learning_rate": 3.39943342776204e-06, "loss": 1.7832, "step": 600 }, { "epoch": 0.1, "grad_norm": 27.15094992711375, "learning_rate": 3.427762039660057e-06, "loss": 1.7769, "step": 605 }, { "epoch": 0.1, "grad_norm": 18.163403461295847, "learning_rate": 3.456090651558074e-06, "loss": 1.7995, "step": 610 }, { "epoch": 0.1, "grad_norm": 27.076996397342533, "learning_rate": 3.484419263456091e-06, "loss": 1.7721, "step": 615 }, { "epoch": 0.11, "grad_norm": 20.08583043568208, "learning_rate": 3.5127478753541076e-06, "loss": 1.7494, "step": 620 }, { "epoch": 0.11, "grad_norm": 34.60488151573862, "learning_rate": 3.5410764872521247e-06, "loss": 1.7875, "step": 625 }, { "epoch": 0.11, "grad_norm": 19.583304295440154, "learning_rate": 3.5694050991501417e-06, "loss": 1.7533, "step": 630 }, { "epoch": 0.11, "grad_norm": 42.29524810809757, "learning_rate": 3.597733711048159e-06, "loss": 1.7644, "step": 635 }, { "epoch": 0.11, "grad_norm": 66.58393026110944, "learning_rate": 3.6260623229461763e-06, "loss": 1.7385, "step": 640 }, { "epoch": 0.11, "grad_norm": 31.076394544761435, "learning_rate": 3.654390934844193e-06, "loss": 1.742, "step": 645 }, { "epoch": 0.11, "grad_norm": 18.91941602854681, "learning_rate": 3.68271954674221e-06, "loss": 1.7907, "step": 650 }, { "epoch": 0.11, "grad_norm": 20.48663060720521, "learning_rate": 3.711048158640227e-06, "loss": 1.772, "step": 655 }, { "epoch": 0.11, "grad_norm": 21.169491370728753, "learning_rate": 3.739376770538244e-06, "loss": 1.7613, "step": 660 }, { "epoch": 0.11, "grad_norm": 19.394119690737643, "learning_rate": 3.7677053824362607e-06, "loss": 1.7357, "step": 665 }, { "epoch": 0.11, "grad_norm": 36.97000360839211, "learning_rate": 3.7960339943342778e-06, "loss": 1.7568, "step": 670 }, { "epoch": 0.11, "grad_norm": 41.170536011729254, "learning_rate": 3.824362606232295e-06, "loss": 1.7878, "step": 675 }, { "epoch": 0.12, "grad_norm": 18.91271833985899, "learning_rate": 3.8526912181303115e-06, "loss": 1.735, "step": 680 }, { "epoch": 0.12, "grad_norm": 15.177589104249861, "learning_rate": 3.881019830028329e-06, "loss": 1.7479, "step": 685 }, { "epoch": 0.12, "grad_norm": 53.90923998912517, "learning_rate": 3.909348441926346e-06, "loss": 1.7264, "step": 690 }, { "epoch": 0.12, "grad_norm": 79.43302487475911, "learning_rate": 3.937677053824363e-06, "loss": 1.6986, "step": 695 }, { "epoch": 0.12, "grad_norm": 20.1848106409289, "learning_rate": 3.96600566572238e-06, "loss": 1.7403, "step": 700 }, { "epoch": 0.12, "grad_norm": 18.977411772592603, "learning_rate": 3.994334277620397e-06, "loss": 1.729, "step": 705 }, { "epoch": 0.12, "grad_norm": 19.60858292804487, "learning_rate": 4.022662889518414e-06, "loss": 1.7391, "step": 710 }, { "epoch": 0.12, "grad_norm": 25.667932494840965, "learning_rate": 4.0509915014164304e-06, "loss": 1.7108, "step": 715 }, { "epoch": 0.12, "grad_norm": 20.78540678599752, "learning_rate": 4.079320113314448e-06, "loss": 1.7221, "step": 720 }, { "epoch": 0.12, "grad_norm": 76.42698172331296, "learning_rate": 4.1076487252124646e-06, "loss": 1.7105, "step": 725 }, { "epoch": 0.12, "grad_norm": 71.16243308160482, "learning_rate": 4.135977337110482e-06, "loss": 1.727, "step": 730 }, { "epoch": 0.12, "grad_norm": 17.40654783870013, "learning_rate": 4.1643059490084995e-06, "loss": 1.6936, "step": 735 }, { "epoch": 0.13, "grad_norm": 34.693252022614544, "learning_rate": 4.192634560906516e-06, "loss": 1.6621, "step": 740 }, { "epoch": 0.13, "grad_norm": 38.53652676282218, "learning_rate": 4.220963172804533e-06, "loss": 1.6904, "step": 745 }, { "epoch": 0.13, "grad_norm": 26.83929663571352, "learning_rate": 4.24929178470255e-06, "loss": 1.6837, "step": 750 }, { "epoch": 0.13, "grad_norm": 45.81507221553908, "learning_rate": 4.277620396600567e-06, "loss": 1.6989, "step": 755 }, { "epoch": 0.13, "grad_norm": 33.692133005547205, "learning_rate": 4.3059490084985835e-06, "loss": 1.6386, "step": 760 }, { "epoch": 0.13, "grad_norm": 19.38330427723925, "learning_rate": 4.334277620396601e-06, "loss": 1.6653, "step": 765 }, { "epoch": 0.13, "grad_norm": 30.678965153137693, "learning_rate": 4.362606232294618e-06, "loss": 1.6807, "step": 770 }, { "epoch": 0.13, "grad_norm": 24.165456765295332, "learning_rate": 4.390934844192635e-06, "loss": 1.6987, "step": 775 }, { "epoch": 0.13, "grad_norm": 32.29327910090702, "learning_rate": 4.419263456090652e-06, "loss": 1.6802, "step": 780 }, { "epoch": 0.13, "grad_norm": 21.16971480892796, "learning_rate": 4.447592067988669e-06, "loss": 1.6497, "step": 785 }, { "epoch": 0.13, "grad_norm": 26.707784735238036, "learning_rate": 4.475920679886686e-06, "loss": 1.6604, "step": 790 }, { "epoch": 0.14, "grad_norm": 50.87756098034557, "learning_rate": 4.504249291784703e-06, "loss": 1.6326, "step": 795 }, { "epoch": 0.14, "grad_norm": 52.23774283052041, "learning_rate": 4.53257790368272e-06, "loss": 1.6485, "step": 800 }, { "epoch": 0.14, "grad_norm": 63.05813173390926, "learning_rate": 4.560906515580737e-06, "loss": 1.6834, "step": 805 }, { "epoch": 0.14, "grad_norm": 22.075712655510806, "learning_rate": 4.589235127478754e-06, "loss": 1.6382, "step": 810 }, { "epoch": 0.14, "grad_norm": 29.09471283050641, "learning_rate": 4.617563739376771e-06, "loss": 1.6572, "step": 815 }, { "epoch": 0.14, "grad_norm": 43.606834380034115, "learning_rate": 4.645892351274787e-06, "loss": 1.6566, "step": 820 }, { "epoch": 0.14, "grad_norm": 49.418261757308244, "learning_rate": 4.674220963172805e-06, "loss": 1.6501, "step": 825 }, { "epoch": 0.14, "grad_norm": 60.58000269353636, "learning_rate": 4.7025495750708215e-06, "loss": 1.6284, "step": 830 }, { "epoch": 0.14, "grad_norm": 19.768535580269724, "learning_rate": 4.730878186968839e-06, "loss": 1.6342, "step": 835 }, { "epoch": 0.14, "grad_norm": 25.27132812623081, "learning_rate": 4.7592067988668565e-06, "loss": 1.6236, "step": 840 }, { "epoch": 0.14, "grad_norm": 17.37282219238344, "learning_rate": 4.787535410764873e-06, "loss": 1.6277, "step": 845 }, { "epoch": 0.14, "grad_norm": 46.068808485877284, "learning_rate": 4.81586402266289e-06, "loss": 1.6378, "step": 850 }, { "epoch": 0.15, "grad_norm": 64.72578782324423, "learning_rate": 4.844192634560907e-06, "loss": 1.6242, "step": 855 }, { "epoch": 0.15, "grad_norm": 36.406125553208405, "learning_rate": 4.872521246458924e-06, "loss": 1.625, "step": 860 }, { "epoch": 0.15, "grad_norm": 19.157180999051647, "learning_rate": 4.9008498583569405e-06, "loss": 1.6419, "step": 865 }, { "epoch": 0.15, "grad_norm": 31.267783331002605, "learning_rate": 4.929178470254958e-06, "loss": 1.6156, "step": 870 }, { "epoch": 0.15, "grad_norm": 30.51950308301667, "learning_rate": 4.957507082152975e-06, "loss": 1.6127, "step": 875 }, { "epoch": 0.15, "grad_norm": 59.65679660528968, "learning_rate": 4.985835694050991e-06, "loss": 1.5823, "step": 880 }, { "epoch": 0.15, "grad_norm": 33.09588742022726, "learning_rate": 5.014164305949009e-06, "loss": 1.6119, "step": 885 }, { "epoch": 0.15, "grad_norm": 17.46276943511441, "learning_rate": 5.042492917847026e-06, "loss": 1.5669, "step": 890 }, { "epoch": 0.15, "grad_norm": 27.95986512759189, "learning_rate": 5.070821529745043e-06, "loss": 1.6021, "step": 895 }, { "epoch": 0.15, "grad_norm": 40.53798724973572, "learning_rate": 5.09915014164306e-06, "loss": 1.5952, "step": 900 }, { "epoch": 0.15, "grad_norm": 98.13247181294633, "learning_rate": 5.127478753541076e-06, "loss": 1.6185, "step": 905 }, { "epoch": 0.15, "grad_norm": 52.03314426839591, "learning_rate": 5.155807365439094e-06, "loss": 1.6101, "step": 910 }, { "epoch": 0.16, "grad_norm": 64.46486065152293, "learning_rate": 5.184135977337111e-06, "loss": 1.5932, "step": 915 }, { "epoch": 0.16, "grad_norm": 55.85131329674641, "learning_rate": 5.212464589235128e-06, "loss": 1.5597, "step": 920 }, { "epoch": 0.16, "grad_norm": 104.94386787582185, "learning_rate": 5.240793201133145e-06, "loss": 1.6062, "step": 925 }, { "epoch": 0.16, "grad_norm": 55.90312620104168, "learning_rate": 5.269121813031162e-06, "loss": 1.6001, "step": 930 }, { "epoch": 0.16, "grad_norm": 36.37767655193305, "learning_rate": 5.297450424929179e-06, "loss": 1.5993, "step": 935 }, { "epoch": 0.16, "grad_norm": 39.26128766918641, "learning_rate": 5.325779036827196e-06, "loss": 1.5651, "step": 940 }, { "epoch": 0.16, "grad_norm": 43.57675317461804, "learning_rate": 5.3541076487252134e-06, "loss": 1.5913, "step": 945 }, { "epoch": 0.16, "grad_norm": 17.26426628907133, "learning_rate": 5.382436260623229e-06, "loss": 1.594, "step": 950 }, { "epoch": 0.16, "grad_norm": 18.037423782357767, "learning_rate": 5.410764872521247e-06, "loss": 1.5744, "step": 955 }, { "epoch": 0.16, "grad_norm": 130.6445045404753, "learning_rate": 5.439093484419264e-06, "loss": 1.6486, "step": 960 }, { "epoch": 0.16, "grad_norm": 54.45429682389578, "learning_rate": 5.467422096317281e-06, "loss": 1.5734, "step": 965 }, { "epoch": 0.16, "grad_norm": 42.69622349959172, "learning_rate": 5.495750708215298e-06, "loss": 1.6064, "step": 970 }, { "epoch": 0.17, "grad_norm": 59.71378250423668, "learning_rate": 5.524079320113315e-06, "loss": 1.6193, "step": 975 }, { "epoch": 0.17, "grad_norm": 62.55182445293084, "learning_rate": 5.552407932011332e-06, "loss": 1.5776, "step": 980 }, { "epoch": 0.17, "grad_norm": 59.05878711283353, "learning_rate": 5.580736543909348e-06, "loss": 1.5794, "step": 985 }, { "epoch": 0.17, "grad_norm": 28.560889477217298, "learning_rate": 5.609065155807366e-06, "loss": 1.5593, "step": 990 }, { "epoch": 0.17, "grad_norm": 27.490073642677785, "learning_rate": 5.637393767705382e-06, "loss": 1.5749, "step": 995 }, { "epoch": 0.17, "grad_norm": 15.950065631003085, "learning_rate": 5.6657223796034e-06, "loss": 1.5537, "step": 1000 }, { "epoch": 0.17, "grad_norm": 43.35553144079011, "learning_rate": 5.6940509915014164e-06, "loss": 1.5762, "step": 1005 }, { "epoch": 0.17, "grad_norm": 28.678343487108393, "learning_rate": 5.722379603399434e-06, "loss": 1.5324, "step": 1010 }, { "epoch": 0.17, "grad_norm": 98.86163137432244, "learning_rate": 5.750708215297451e-06, "loss": 1.5333, "step": 1015 }, { "epoch": 0.17, "grad_norm": 63.299358135948566, "learning_rate": 5.779036827195468e-06, "loss": 1.542, "step": 1020 }, { "epoch": 0.17, "grad_norm": 19.56384405996424, "learning_rate": 5.8073654390934855e-06, "loss": 1.5661, "step": 1025 }, { "epoch": 0.18, "grad_norm": 27.414836270647065, "learning_rate": 5.835694050991501e-06, "loss": 1.564, "step": 1030 }, { "epoch": 0.18, "grad_norm": 20.974654931798312, "learning_rate": 5.864022662889519e-06, "loss": 1.5267, "step": 1035 }, { "epoch": 0.18, "grad_norm": 15.14250216808397, "learning_rate": 5.892351274787535e-06, "loss": 1.5485, "step": 1040 }, { "epoch": 0.18, "grad_norm": 19.483781002585005, "learning_rate": 5.920679886685553e-06, "loss": 1.5312, "step": 1045 }, { "epoch": 0.18, "grad_norm": 19.977930403123093, "learning_rate": 5.9490084985835695e-06, "loss": 1.5574, "step": 1050 }, { "epoch": 0.18, "grad_norm": 35.868563902903155, "learning_rate": 5.977337110481587e-06, "loss": 1.5089, "step": 1055 }, { "epoch": 0.18, "grad_norm": 35.70653189378658, "learning_rate": 6.0056657223796045e-06, "loss": 1.4986, "step": 1060 }, { "epoch": 0.18, "grad_norm": 83.85460986190164, "learning_rate": 6.033994334277621e-06, "loss": 1.5424, "step": 1065 }, { "epoch": 0.18, "grad_norm": 52.01364066249039, "learning_rate": 6.062322946175639e-06, "loss": 1.5451, "step": 1070 }, { "epoch": 0.18, "grad_norm": 41.4992437255658, "learning_rate": 6.090651558073654e-06, "loss": 1.5275, "step": 1075 }, { "epoch": 0.18, "grad_norm": 30.925485991933222, "learning_rate": 6.118980169971672e-06, "loss": 1.5082, "step": 1080 }, { "epoch": 0.18, "grad_norm": 44.720242480528114, "learning_rate": 6.1473087818696885e-06, "loss": 1.5481, "step": 1085 }, { "epoch": 0.19, "grad_norm": 53.91170639021118, "learning_rate": 6.175637393767706e-06, "loss": 1.5137, "step": 1090 }, { "epoch": 0.19, "grad_norm": 78.19398697511646, "learning_rate": 6.203966005665723e-06, "loss": 1.5195, "step": 1095 }, { "epoch": 0.19, "grad_norm": 39.94674555010526, "learning_rate": 6.23229461756374e-06, "loss": 1.5273, "step": 1100 }, { "epoch": 0.19, "grad_norm": 17.855101814171903, "learning_rate": 6.260623229461757e-06, "loss": 1.5309, "step": 1105 }, { "epoch": 0.19, "grad_norm": 44.867726205072024, "learning_rate": 6.288951841359774e-06, "loss": 1.5242, "step": 1110 }, { "epoch": 0.19, "grad_norm": 29.131859743656516, "learning_rate": 6.317280453257792e-06, "loss": 1.5273, "step": 1115 }, { "epoch": 0.19, "grad_norm": 17.611183595956476, "learning_rate": 6.3456090651558075e-06, "loss": 1.5011, "step": 1120 }, { "epoch": 0.19, "grad_norm": 53.35713779003586, "learning_rate": 6.373937677053825e-06, "loss": 1.4962, "step": 1125 }, { "epoch": 0.19, "grad_norm": 44.931444706391076, "learning_rate": 6.402266288951842e-06, "loss": 1.511, "step": 1130 }, { "epoch": 0.19, "grad_norm": 22.44919456560898, "learning_rate": 6.430594900849859e-06, "loss": 1.5138, "step": 1135 }, { "epoch": 0.19, "grad_norm": 45.08279033177284, "learning_rate": 6.458923512747876e-06, "loss": 1.4963, "step": 1140 }, { "epoch": 0.19, "grad_norm": 17.52813913674402, "learning_rate": 6.487252124645893e-06, "loss": 1.5131, "step": 1145 }, { "epoch": 0.2, "grad_norm": 26.26804383614154, "learning_rate": 6.51558073654391e-06, "loss": 1.5118, "step": 1150 }, { "epoch": 0.2, "grad_norm": 38.38984295804615, "learning_rate": 6.543909348441927e-06, "loss": 1.5052, "step": 1155 }, { "epoch": 0.2, "grad_norm": 62.89163803243198, "learning_rate": 6.572237960339945e-06, "loss": 1.4908, "step": 1160 }, { "epoch": 0.2, "grad_norm": 16.43464761001089, "learning_rate": 6.600566572237961e-06, "loss": 1.4863, "step": 1165 }, { "epoch": 0.2, "grad_norm": 27.171308819606647, "learning_rate": 6.628895184135978e-06, "loss": 1.4866, "step": 1170 }, { "epoch": 0.2, "grad_norm": 12.858618384000161, "learning_rate": 6.657223796033995e-06, "loss": 1.4997, "step": 1175 }, { "epoch": 0.2, "grad_norm": 26.73384894098579, "learning_rate": 6.685552407932012e-06, "loss": 1.4724, "step": 1180 }, { "epoch": 0.2, "grad_norm": 21.318889353798717, "learning_rate": 6.713881019830029e-06, "loss": 1.4655, "step": 1185 }, { "epoch": 0.2, "grad_norm": 47.36154668479138, "learning_rate": 6.742209631728046e-06, "loss": 1.4677, "step": 1190 }, { "epoch": 0.2, "grad_norm": 13.312162357247926, "learning_rate": 6.770538243626062e-06, "loss": 1.487, "step": 1195 }, { "epoch": 0.2, "grad_norm": 28.90405260718444, "learning_rate": 6.79886685552408e-06, "loss": 1.4774, "step": 1200 }, { "epoch": 0.2, "grad_norm": 43.20968543413434, "learning_rate": 6.827195467422096e-06, "loss": 1.4592, "step": 1205 }, { "epoch": 0.21, "grad_norm": 15.41534301679069, "learning_rate": 6.855524079320114e-06, "loss": 1.4741, "step": 1210 }, { "epoch": 0.21, "grad_norm": 39.00940865689802, "learning_rate": 6.883852691218131e-06, "loss": 1.476, "step": 1215 }, { "epoch": 0.21, "grad_norm": 15.463235319018573, "learning_rate": 6.912181303116148e-06, "loss": 1.4989, "step": 1220 }, { "epoch": 0.21, "grad_norm": 16.14813609317964, "learning_rate": 6.940509915014165e-06, "loss": 1.4772, "step": 1225 }, { "epoch": 0.21, "grad_norm": 45.64032082296392, "learning_rate": 6.968838526912182e-06, "loss": 1.4859, "step": 1230 }, { "epoch": 0.21, "grad_norm": 14.426783265938841, "learning_rate": 6.997167138810199e-06, "loss": 1.4732, "step": 1235 }, { "epoch": 0.21, "grad_norm": 56.98259656288749, "learning_rate": 7.025495750708215e-06, "loss": 1.4717, "step": 1240 }, { "epoch": 0.21, "grad_norm": 73.22843429393423, "learning_rate": 7.053824362606233e-06, "loss": 1.4471, "step": 1245 }, { "epoch": 0.21, "grad_norm": 25.573775068067942, "learning_rate": 7.082152974504249e-06, "loss": 1.4372, "step": 1250 }, { "epoch": 0.21, "grad_norm": 12.443033292868687, "learning_rate": 7.110481586402267e-06, "loss": 1.4631, "step": 1255 }, { "epoch": 0.21, "grad_norm": 14.668450041884629, "learning_rate": 7.1388101983002834e-06, "loss": 1.4368, "step": 1260 }, { "epoch": 0.22, "grad_norm": 115.98207102359974, "learning_rate": 7.167138810198301e-06, "loss": 1.468, "step": 1265 }, { "epoch": 0.22, "grad_norm": 128.82110344511952, "learning_rate": 7.195467422096318e-06, "loss": 1.4917, "step": 1270 }, { "epoch": 0.22, "grad_norm": 31.41157580165412, "learning_rate": 7.223796033994335e-06, "loss": 1.4766, "step": 1275 }, { "epoch": 0.22, "grad_norm": 33.117081868197246, "learning_rate": 7.2521246458923525e-06, "loss": 1.4506, "step": 1280 }, { "epoch": 0.22, "grad_norm": 33.6257926561511, "learning_rate": 7.280453257790368e-06, "loss": 1.4871, "step": 1285 }, { "epoch": 0.22, "grad_norm": 16.68056867252948, "learning_rate": 7.308781869688386e-06, "loss": 1.4495, "step": 1290 }, { "epoch": 0.22, "grad_norm": 76.32432258998954, "learning_rate": 7.337110481586402e-06, "loss": 1.4476, "step": 1295 }, { "epoch": 0.22, "grad_norm": 73.51943278047416, "learning_rate": 7.36543909348442e-06, "loss": 1.4601, "step": 1300 }, { "epoch": 0.22, "grad_norm": 35.92091908157174, "learning_rate": 7.3937677053824365e-06, "loss": 1.4612, "step": 1305 }, { "epoch": 0.22, "grad_norm": 37.15133392906697, "learning_rate": 7.422096317280454e-06, "loss": 1.4608, "step": 1310 }, { "epoch": 0.22, "grad_norm": 54.24887555884184, "learning_rate": 7.4504249291784715e-06, "loss": 1.4471, "step": 1315 }, { "epoch": 0.22, "grad_norm": 66.61085194888463, "learning_rate": 7.478753541076488e-06, "loss": 1.4306, "step": 1320 }, { "epoch": 0.23, "grad_norm": 67.55198055290484, "learning_rate": 7.507082152974506e-06, "loss": 1.4294, "step": 1325 }, { "epoch": 0.23, "grad_norm": 60.83568936526317, "learning_rate": 7.535410764872521e-06, "loss": 1.4426, "step": 1330 }, { "epoch": 0.23, "grad_norm": 64.35303855984785, "learning_rate": 7.563739376770539e-06, "loss": 1.4382, "step": 1335 }, { "epoch": 0.23, "grad_norm": 56.55071526619856, "learning_rate": 7.5920679886685555e-06, "loss": 1.4224, "step": 1340 }, { "epoch": 0.23, "grad_norm": 67.12284685088699, "learning_rate": 7.620396600566573e-06, "loss": 1.4321, "step": 1345 }, { "epoch": 0.23, "grad_norm": 48.395017715798886, "learning_rate": 7.64872521246459e-06, "loss": 1.4346, "step": 1350 }, { "epoch": 0.23, "grad_norm": 38.088435110493364, "learning_rate": 7.677053824362606e-06, "loss": 1.434, "step": 1355 }, { "epoch": 0.23, "grad_norm": 15.108791280938446, "learning_rate": 7.705382436260623e-06, "loss": 1.4467, "step": 1360 }, { "epoch": 0.23, "grad_norm": 46.213258435941164, "learning_rate": 7.733711048158641e-06, "loss": 1.4726, "step": 1365 }, { "epoch": 0.23, "grad_norm": 31.36163401132065, "learning_rate": 7.762039660056658e-06, "loss": 1.417, "step": 1370 }, { "epoch": 0.23, "grad_norm": 21.57521345294901, "learning_rate": 7.790368271954675e-06, "loss": 1.4167, "step": 1375 }, { "epoch": 0.23, "grad_norm": 25.684711958358132, "learning_rate": 7.818696883852693e-06, "loss": 1.4279, "step": 1380 }, { "epoch": 0.24, "grad_norm": 46.14025445942214, "learning_rate": 7.847025495750708e-06, "loss": 1.4367, "step": 1385 }, { "epoch": 0.24, "grad_norm": 16.837822707093785, "learning_rate": 7.875354107648726e-06, "loss": 1.4414, "step": 1390 }, { "epoch": 0.24, "grad_norm": 25.32494554542278, "learning_rate": 7.903682719546743e-06, "loss": 1.4225, "step": 1395 }, { "epoch": 0.24, "grad_norm": 22.096692157328395, "learning_rate": 7.93201133144476e-06, "loss": 1.4244, "step": 1400 }, { "epoch": 0.24, "grad_norm": 23.085791295961783, "learning_rate": 7.960339943342776e-06, "loss": 1.4269, "step": 1405 }, { "epoch": 0.24, "grad_norm": 42.63437473811217, "learning_rate": 7.988668555240794e-06, "loss": 1.4216, "step": 1410 }, { "epoch": 0.24, "grad_norm": 14.581831841044835, "learning_rate": 8.016997167138811e-06, "loss": 1.4241, "step": 1415 }, { "epoch": 0.24, "grad_norm": 107.73424906023473, "learning_rate": 8.045325779036828e-06, "loss": 1.4305, "step": 1420 }, { "epoch": 0.24, "grad_norm": 102.80669799826326, "learning_rate": 8.073654390934846e-06, "loss": 1.4035, "step": 1425 }, { "epoch": 0.24, "grad_norm": 54.25200717068274, "learning_rate": 8.101983002832861e-06, "loss": 1.4146, "step": 1430 }, { "epoch": 0.24, "grad_norm": 70.28692545400818, "learning_rate": 8.13031161473088e-06, "loss": 1.4386, "step": 1435 }, { "epoch": 0.24, "grad_norm": 41.22226556175442, "learning_rate": 8.158640226628896e-06, "loss": 1.4142, "step": 1440 }, { "epoch": 0.25, "grad_norm": 33.320500906922966, "learning_rate": 8.186968838526912e-06, "loss": 1.4063, "step": 1445 }, { "epoch": 0.25, "grad_norm": 20.96946984665567, "learning_rate": 8.215297450424929e-06, "loss": 1.3873, "step": 1450 }, { "epoch": 0.25, "grad_norm": 36.4133779105684, "learning_rate": 8.243626062322947e-06, "loss": 1.3998, "step": 1455 }, { "epoch": 0.25, "grad_norm": 19.19446044964224, "learning_rate": 8.271954674220964e-06, "loss": 1.4123, "step": 1460 }, { "epoch": 0.25, "grad_norm": 21.008839218394122, "learning_rate": 8.30028328611898e-06, "loss": 1.3916, "step": 1465 }, { "epoch": 0.25, "grad_norm": 20.51211979323123, "learning_rate": 8.328611898016999e-06, "loss": 1.4045, "step": 1470 }, { "epoch": 0.25, "grad_norm": 28.355806563412795, "learning_rate": 8.356940509915014e-06, "loss": 1.3989, "step": 1475 }, { "epoch": 0.25, "grad_norm": 56.645997417423445, "learning_rate": 8.385269121813032e-06, "loss": 1.3892, "step": 1480 }, { "epoch": 0.25, "grad_norm": 37.58733117483026, "learning_rate": 8.413597733711049e-06, "loss": 1.3983, "step": 1485 }, { "epoch": 0.25, "grad_norm": 23.45675645372948, "learning_rate": 8.441926345609066e-06, "loss": 1.4338, "step": 1490 }, { "epoch": 0.25, "grad_norm": 43.27659750878071, "learning_rate": 8.470254957507082e-06, "loss": 1.4018, "step": 1495 }, { "epoch": 0.26, "grad_norm": 26.338070551642947, "learning_rate": 8.4985835694051e-06, "loss": 1.3832, "step": 1500 }, { "epoch": 0.26, "grad_norm": 35.56121674532834, "learning_rate": 8.526912181303117e-06, "loss": 1.3637, "step": 1505 }, { "epoch": 0.26, "grad_norm": 13.858156584818255, "learning_rate": 8.555240793201134e-06, "loss": 1.3929, "step": 1510 }, { "epoch": 0.26, "grad_norm": 14.860684083485195, "learning_rate": 8.583569405099152e-06, "loss": 1.3933, "step": 1515 }, { "epoch": 0.26, "grad_norm": 16.67764945655357, "learning_rate": 8.611898016997167e-06, "loss": 1.3912, "step": 1520 }, { "epoch": 0.26, "grad_norm": 14.069808432652248, "learning_rate": 8.640226628895185e-06, "loss": 1.3796, "step": 1525 }, { "epoch": 0.26, "grad_norm": 65.37764259474062, "learning_rate": 8.668555240793202e-06, "loss": 1.3854, "step": 1530 }, { "epoch": 0.26, "grad_norm": 67.37196707064975, "learning_rate": 8.696883852691219e-06, "loss": 1.3932, "step": 1535 }, { "epoch": 0.26, "grad_norm": 26.53654854865448, "learning_rate": 8.725212464589235e-06, "loss": 1.3971, "step": 1540 }, { "epoch": 0.26, "grad_norm": 120.33111697552754, "learning_rate": 8.753541076487254e-06, "loss": 1.3925, "step": 1545 }, { "epoch": 0.26, "grad_norm": 26.033106458020956, "learning_rate": 8.78186968838527e-06, "loss": 1.3924, "step": 1550 }, { "epoch": 0.26, "grad_norm": 127.2750601403478, "learning_rate": 8.810198300283287e-06, "loss": 1.4086, "step": 1555 }, { "epoch": 0.27, "grad_norm": 97.35619667630274, "learning_rate": 8.838526912181304e-06, "loss": 1.4252, "step": 1560 }, { "epoch": 0.27, "grad_norm": 41.5132287656772, "learning_rate": 8.86685552407932e-06, "loss": 1.4048, "step": 1565 }, { "epoch": 0.27, "grad_norm": 20.19314986884624, "learning_rate": 8.895184135977339e-06, "loss": 1.4018, "step": 1570 }, { "epoch": 0.27, "grad_norm": 20.207399962119673, "learning_rate": 8.923512747875355e-06, "loss": 1.3928, "step": 1575 }, { "epoch": 0.27, "grad_norm": 18.706236409485633, "learning_rate": 8.951841359773372e-06, "loss": 1.402, "step": 1580 }, { "epoch": 0.27, "grad_norm": 18.15951416173616, "learning_rate": 8.980169971671388e-06, "loss": 1.3589, "step": 1585 }, { "epoch": 0.27, "grad_norm": 19.959848444063436, "learning_rate": 9.008498583569407e-06, "loss": 1.3918, "step": 1590 }, { "epoch": 0.27, "grad_norm": 18.03318135371038, "learning_rate": 9.036827195467422e-06, "loss": 1.3908, "step": 1595 }, { "epoch": 0.27, "grad_norm": 59.051519885278694, "learning_rate": 9.06515580736544e-06, "loss": 1.3731, "step": 1600 }, { "epoch": 0.27, "grad_norm": 52.467108769254416, "learning_rate": 9.093484419263457e-06, "loss": 1.3487, "step": 1605 }, { "epoch": 0.27, "grad_norm": 59.018311552807326, "learning_rate": 9.121813031161473e-06, "loss": 1.377, "step": 1610 }, { "epoch": 0.27, "grad_norm": 14.959473760297636, "learning_rate": 9.150141643059492e-06, "loss": 1.4018, "step": 1615 }, { "epoch": 0.28, "grad_norm": 35.688486380413146, "learning_rate": 9.178470254957508e-06, "loss": 1.3702, "step": 1620 }, { "epoch": 0.28, "grad_norm": 17.702780585065685, "learning_rate": 9.206798866855525e-06, "loss": 1.3887, "step": 1625 }, { "epoch": 0.28, "grad_norm": 44.95672001007261, "learning_rate": 9.235127478753542e-06, "loss": 1.3852, "step": 1630 }, { "epoch": 0.28, "grad_norm": 23.805509750982903, "learning_rate": 9.26345609065156e-06, "loss": 1.3646, "step": 1635 }, { "epoch": 0.28, "grad_norm": 39.33476110738001, "learning_rate": 9.291784702549575e-06, "loss": 1.3764, "step": 1640 }, { "epoch": 0.28, "grad_norm": 12.540226055124654, "learning_rate": 9.320113314447593e-06, "loss": 1.3633, "step": 1645 }, { "epoch": 0.28, "grad_norm": 60.937887345190425, "learning_rate": 9.34844192634561e-06, "loss": 1.3941, "step": 1650 }, { "epoch": 0.28, "grad_norm": 15.183842224404986, "learning_rate": 9.376770538243626e-06, "loss": 1.3632, "step": 1655 }, { "epoch": 0.28, "grad_norm": 13.268626818286448, "learning_rate": 9.405099150141643e-06, "loss": 1.3632, "step": 1660 }, { "epoch": 0.28, "grad_norm": 14.977488338411256, "learning_rate": 9.433427762039661e-06, "loss": 1.3437, "step": 1665 }, { "epoch": 0.28, "grad_norm": 17.69425703983086, "learning_rate": 9.461756373937678e-06, "loss": 1.3407, "step": 1670 }, { "epoch": 0.28, "grad_norm": 50.81269776992468, "learning_rate": 9.490084985835695e-06, "loss": 1.3649, "step": 1675 }, { "epoch": 0.29, "grad_norm": 52.987834036106406, "learning_rate": 9.518413597733713e-06, "loss": 1.368, "step": 1680 }, { "epoch": 0.29, "grad_norm": 53.09975091644179, "learning_rate": 9.546742209631728e-06, "loss": 1.3753, "step": 1685 }, { "epoch": 0.29, "grad_norm": 65.42753190343696, "learning_rate": 9.575070821529746e-06, "loss": 1.3373, "step": 1690 }, { "epoch": 0.29, "grad_norm": 12.50878001715189, "learning_rate": 9.603399433427763e-06, "loss": 1.3204, "step": 1695 }, { "epoch": 0.29, "grad_norm": 51.46300322685196, "learning_rate": 9.63172804532578e-06, "loss": 1.3517, "step": 1700 }, { "epoch": 0.29, "grad_norm": 50.08389878971014, "learning_rate": 9.660056657223796e-06, "loss": 1.3377, "step": 1705 }, { "epoch": 0.29, "grad_norm": 98.7501586904867, "learning_rate": 9.688385269121814e-06, "loss": 1.358, "step": 1710 }, { "epoch": 0.29, "grad_norm": 25.256204978294605, "learning_rate": 9.716713881019831e-06, "loss": 1.3391, "step": 1715 }, { "epoch": 0.29, "grad_norm": 96.05571532903846, "learning_rate": 9.745042492917848e-06, "loss": 1.3615, "step": 1720 }, { "epoch": 0.29, "grad_norm": 26.755333256803034, "learning_rate": 9.773371104815866e-06, "loss": 1.3662, "step": 1725 }, { "epoch": 0.29, "grad_norm": 25.837535923418997, "learning_rate": 9.801699716713881e-06, "loss": 1.38, "step": 1730 }, { "epoch": 0.29, "grad_norm": 24.738456192454084, "learning_rate": 9.8300283286119e-06, "loss": 1.333, "step": 1735 }, { "epoch": 0.3, "grad_norm": 41.26197817069162, "learning_rate": 9.858356940509916e-06, "loss": 1.3491, "step": 1740 }, { "epoch": 0.3, "grad_norm": 51.99630683152431, "learning_rate": 9.886685552407933e-06, "loss": 1.3429, "step": 1745 }, { "epoch": 0.3, "grad_norm": 46.34281673405669, "learning_rate": 9.91501416430595e-06, "loss": 1.3646, "step": 1750 }, { "epoch": 0.3, "grad_norm": 22.857745511222603, "learning_rate": 9.943342776203968e-06, "loss": 1.3297, "step": 1755 }, { "epoch": 0.3, "grad_norm": 11.712863491665592, "learning_rate": 9.971671388101982e-06, "loss": 1.321, "step": 1760 }, { "epoch": 0.3, "grad_norm": 11.7261894236906, "learning_rate": 1e-05, "loss": 1.3444, "step": 1765 }, { "epoch": 0.3, "grad_norm": 32.60283769493224, "learning_rate": 9.99999755418257e-06, "loss": 1.3408, "step": 1770 }, { "epoch": 0.3, "grad_norm": 17.203512310970503, "learning_rate": 9.999990216732668e-06, "loss": 1.3446, "step": 1775 }, { "epoch": 0.3, "grad_norm": 17.883185421539476, "learning_rate": 9.999977987657479e-06, "loss": 1.3313, "step": 1780 }, { "epoch": 0.3, "grad_norm": 19.323697826239748, "learning_rate": 9.99996086696896e-06, "loss": 1.3503, "step": 1785 }, { "epoch": 0.3, "grad_norm": 27.72449057867512, "learning_rate": 9.999938854683867e-06, "loss": 1.3201, "step": 1790 }, { "epoch": 0.31, "grad_norm": 35.22723944266936, "learning_rate": 9.99991195082373e-06, "loss": 1.3395, "step": 1795 }, { "epoch": 0.31, "grad_norm": 36.374853638958115, "learning_rate": 9.999880155414872e-06, "loss": 1.34, "step": 1800 }, { "epoch": 0.31, "grad_norm": 29.11298844229375, "learning_rate": 9.9998434684884e-06, "loss": 1.3075, "step": 1805 }, { "epoch": 0.31, "grad_norm": 18.978651122935727, "learning_rate": 9.999801890080203e-06, "loss": 1.3248, "step": 1810 }, { "epoch": 0.31, "grad_norm": 62.277032520069206, "learning_rate": 9.999755420230964e-06, "loss": 1.3174, "step": 1815 }, { "epoch": 0.31, "grad_norm": 42.25047939113886, "learning_rate": 9.999704058986139e-06, "loss": 1.3196, "step": 1820 }, { "epoch": 0.31, "grad_norm": 121.34773754544872, "learning_rate": 9.99964780639598e-06, "loss": 1.3449, "step": 1825 }, { "epoch": 0.31, "grad_norm": 157.875932629027, "learning_rate": 9.999586662515519e-06, "loss": 1.3766, "step": 1830 }, { "epoch": 0.31, "grad_norm": 67.21735658366185, "learning_rate": 9.999520627404576e-06, "loss": 1.3336, "step": 1835 }, { "epoch": 0.31, "grad_norm": 40.32573656282714, "learning_rate": 9.999449701127753e-06, "loss": 1.3358, "step": 1840 }, { "epoch": 0.31, "grad_norm": 16.578779780453953, "learning_rate": 9.999373883754442e-06, "loss": 1.3263, "step": 1845 }, { "epoch": 0.31, "grad_norm": 45.13578422606087, "learning_rate": 9.999293175358814e-06, "loss": 1.3156, "step": 1850 }, { "epoch": 0.32, "grad_norm": 25.271194035753734, "learning_rate": 9.99920757601983e-06, "loss": 1.3158, "step": 1855 }, { "epoch": 0.32, "grad_norm": 39.94426166106633, "learning_rate": 9.999117085821233e-06, "loss": 1.323, "step": 1860 }, { "epoch": 0.32, "grad_norm": 30.457883465258536, "learning_rate": 9.999021704851555e-06, "loss": 1.3516, "step": 1865 }, { "epoch": 0.32, "grad_norm": 36.9726912711344, "learning_rate": 9.998921433204106e-06, "loss": 1.333, "step": 1870 }, { "epoch": 0.32, "grad_norm": 23.030928072688916, "learning_rate": 9.998816270976985e-06, "loss": 1.2991, "step": 1875 }, { "epoch": 0.32, "grad_norm": 47.126156393517775, "learning_rate": 9.998706218273078e-06, "loss": 1.2744, "step": 1880 }, { "epoch": 0.32, "grad_norm": 32.59637028245757, "learning_rate": 9.998591275200049e-06, "loss": 1.3096, "step": 1885 }, { "epoch": 0.32, "grad_norm": 23.94926147678152, "learning_rate": 9.998471441870353e-06, "loss": 1.2996, "step": 1890 }, { "epoch": 0.32, "grad_norm": 21.030622870893925, "learning_rate": 9.998346718401222e-06, "loss": 1.3209, "step": 1895 }, { "epoch": 0.32, "grad_norm": 17.715711453727344, "learning_rate": 9.998217104914683e-06, "loss": 1.3006, "step": 1900 }, { "epoch": 0.32, "grad_norm": 13.08367409477122, "learning_rate": 9.998082601537533e-06, "loss": 1.3012, "step": 1905 }, { "epoch": 0.32, "grad_norm": 13.183410345778563, "learning_rate": 9.997943208401365e-06, "loss": 1.3134, "step": 1910 }, { "epoch": 0.33, "grad_norm": 13.086410225689482, "learning_rate": 9.99779892564255e-06, "loss": 1.2904, "step": 1915 }, { "epoch": 0.33, "grad_norm": 37.16067338584074, "learning_rate": 9.997649753402243e-06, "loss": 1.3047, "step": 1920 }, { "epoch": 0.33, "grad_norm": 40.474695223414116, "learning_rate": 9.997495691826382e-06, "loss": 1.2886, "step": 1925 }, { "epoch": 0.33, "grad_norm": 53.203394356059604, "learning_rate": 9.997336741065694e-06, "loss": 1.2939, "step": 1930 }, { "epoch": 0.33, "grad_norm": 43.19777784215109, "learning_rate": 9.997172901275682e-06, "loss": 1.2875, "step": 1935 }, { "epoch": 0.33, "grad_norm": 65.5795185045484, "learning_rate": 9.997004172616633e-06, "loss": 1.3288, "step": 1940 }, { "epoch": 0.33, "grad_norm": 50.496217939819, "learning_rate": 9.996830555253622e-06, "loss": 1.3044, "step": 1945 }, { "epoch": 0.33, "grad_norm": 16.159352252412273, "learning_rate": 9.9966520493565e-06, "loss": 1.3244, "step": 1950 }, { "epoch": 0.33, "grad_norm": 14.449411392080844, "learning_rate": 9.99646865509991e-06, "loss": 1.3067, "step": 1955 }, { "epoch": 0.33, "grad_norm": 30.85859582159372, "learning_rate": 9.996280372663266e-06, "loss": 1.2854, "step": 1960 }, { "epoch": 0.33, "grad_norm": 13.954831400149821, "learning_rate": 9.996087202230773e-06, "loss": 1.2711, "step": 1965 }, { "epoch": 0.33, "grad_norm": 13.535102116165461, "learning_rate": 9.995889143991412e-06, "loss": 1.2758, "step": 1970 }, { "epoch": 0.34, "grad_norm": 14.05835385978514, "learning_rate": 9.995686198138951e-06, "loss": 1.2838, "step": 1975 }, { "epoch": 0.34, "grad_norm": 27.158362157021386, "learning_rate": 9.995478364871937e-06, "loss": 1.2848, "step": 1980 }, { "epoch": 0.34, "grad_norm": 17.607677347887147, "learning_rate": 9.995265644393698e-06, "loss": 1.3077, "step": 1985 }, { "epoch": 0.34, "grad_norm": 12.019777361051082, "learning_rate": 9.995048036912345e-06, "loss": 1.2911, "step": 1990 }, { "epoch": 0.34, "grad_norm": 36.439429993916605, "learning_rate": 9.99482554264077e-06, "loss": 1.2909, "step": 1995 }, { "epoch": 0.34, "grad_norm": 19.316454115512972, "learning_rate": 9.994598161796643e-06, "loss": 1.28, "step": 2000 }, { "epoch": 0.34, "grad_norm": 14.451990819041521, "learning_rate": 9.994365894602419e-06, "loss": 1.2796, "step": 2005 }, { "epoch": 0.34, "grad_norm": 17.039745793623432, "learning_rate": 9.994128741285329e-06, "loss": 1.2776, "step": 2010 }, { "epoch": 0.34, "grad_norm": 34.91032967270379, "learning_rate": 9.99388670207739e-06, "loss": 1.2953, "step": 2015 }, { "epoch": 0.34, "grad_norm": 30.505391467678226, "learning_rate": 9.99363977721539e-06, "loss": 1.2912, "step": 2020 }, { "epoch": 0.34, "grad_norm": 70.60877015993087, "learning_rate": 9.993387966940908e-06, "loss": 1.3002, "step": 2025 }, { "epoch": 0.35, "grad_norm": 42.076694141065936, "learning_rate": 9.993131271500293e-06, "loss": 1.2742, "step": 2030 }, { "epoch": 0.35, "grad_norm": 40.78685017799231, "learning_rate": 9.992869691144679e-06, "loss": 1.2764, "step": 2035 }, { "epoch": 0.35, "grad_norm": 33.41521506875986, "learning_rate": 9.992603226129978e-06, "loss": 1.2883, "step": 2040 }, { "epoch": 0.35, "grad_norm": 14.665923005413482, "learning_rate": 9.992331876716877e-06, "loss": 1.269, "step": 2045 }, { "epoch": 0.35, "grad_norm": 35.13745352099027, "learning_rate": 9.992055643170844e-06, "loss": 1.2919, "step": 2050 }, { "epoch": 0.35, "grad_norm": 93.57018198555726, "learning_rate": 9.99177452576213e-06, "loss": 1.2477, "step": 2055 }, { "epoch": 0.35, "grad_norm": 73.14188594561246, "learning_rate": 9.991488524765755e-06, "loss": 1.2727, "step": 2060 }, { "epoch": 0.35, "grad_norm": 60.954485489252725, "learning_rate": 9.991197640461527e-06, "loss": 1.2804, "step": 2065 }, { "epoch": 0.35, "grad_norm": 49.16627669625653, "learning_rate": 9.99090187313402e-06, "loss": 1.2764, "step": 2070 }, { "epoch": 0.35, "grad_norm": 71.11546717627556, "learning_rate": 9.990601223072596e-06, "loss": 1.2999, "step": 2075 }, { "epoch": 0.35, "grad_norm": 20.860674350454783, "learning_rate": 9.990295690571386e-06, "loss": 1.2745, "step": 2080 }, { "epoch": 0.35, "grad_norm": 30.0270032458091, "learning_rate": 9.989985275929302e-06, "loss": 1.2672, "step": 2085 }, { "epoch": 0.36, "grad_norm": 48.4403765685246, "learning_rate": 9.98966997945003e-06, "loss": 1.2713, "step": 2090 }, { "epoch": 0.36, "grad_norm": 16.184895143802937, "learning_rate": 9.989349801442034e-06, "loss": 1.2447, "step": 2095 }, { "epoch": 0.36, "grad_norm": 17.326803432440844, "learning_rate": 9.989024742218554e-06, "loss": 1.2718, "step": 2100 }, { "epoch": 0.36, "grad_norm": 22.212522451101893, "learning_rate": 9.9886948020976e-06, "loss": 1.2692, "step": 2105 }, { "epoch": 0.36, "grad_norm": 11.955909550882618, "learning_rate": 9.988359981401967e-06, "loss": 1.2632, "step": 2110 }, { "epoch": 0.36, "grad_norm": 54.16029859047456, "learning_rate": 9.988020280459214e-06, "loss": 1.2474, "step": 2115 }, { "epoch": 0.36, "grad_norm": 18.99762681137852, "learning_rate": 9.987675699601682e-06, "loss": 1.2878, "step": 2120 }, { "epoch": 0.36, "grad_norm": 10.986794193093745, "learning_rate": 9.987326239166484e-06, "loss": 1.2581, "step": 2125 }, { "epoch": 0.36, "grad_norm": 41.18160853788803, "learning_rate": 9.986971899495505e-06, "loss": 1.2505, "step": 2130 }, { "epoch": 0.36, "grad_norm": 22.73713386704341, "learning_rate": 9.986612680935409e-06, "loss": 1.2369, "step": 2135 }, { "epoch": 0.36, "grad_norm": 39.94586605054518, "learning_rate": 9.986248583837624e-06, "loss": 1.2506, "step": 2140 }, { "epoch": 0.36, "grad_norm": 68.00261940817953, "learning_rate": 9.985879608558359e-06, "loss": 1.2386, "step": 2145 }, { "epoch": 0.37, "grad_norm": 25.278427620899553, "learning_rate": 9.98550575545859e-06, "loss": 1.2315, "step": 2150 }, { "epoch": 0.37, "grad_norm": 58.48992267362782, "learning_rate": 9.985127024904072e-06, "loss": 1.2684, "step": 2155 }, { "epoch": 0.37, "grad_norm": 107.22278634371908, "learning_rate": 9.984743417265323e-06, "loss": 1.261, "step": 2160 }, { "epoch": 0.37, "grad_norm": 56.43898177359786, "learning_rate": 9.984354932917639e-06, "loss": 1.2578, "step": 2165 }, { "epoch": 0.37, "grad_norm": 30.02405317886174, "learning_rate": 9.983961572241085e-06, "loss": 1.221, "step": 2170 }, { "epoch": 0.37, "grad_norm": 21.331779746602443, "learning_rate": 9.983563335620494e-06, "loss": 1.2405, "step": 2175 }, { "epoch": 0.37, "grad_norm": 43.31147042485862, "learning_rate": 9.983160223445476e-06, "loss": 1.2366, "step": 2180 }, { "epoch": 0.37, "grad_norm": 10.029017686861732, "learning_rate": 9.982752236110401e-06, "loss": 1.2432, "step": 2185 }, { "epoch": 0.37, "grad_norm": 14.894536079070498, "learning_rate": 9.982339374014416e-06, "loss": 1.2213, "step": 2190 }, { "epoch": 0.37, "grad_norm": 52.58513573855897, "learning_rate": 9.981921637561438e-06, "loss": 1.2446, "step": 2195 }, { "epoch": 0.37, "grad_norm": 148.5603971500644, "learning_rate": 9.981499027160147e-06, "loss": 1.2624, "step": 2200 }, { "epoch": 0.37, "grad_norm": 67.61072411459752, "learning_rate": 9.981071543223992e-06, "loss": 1.2677, "step": 2205 }, { "epoch": 0.38, "grad_norm": 54.65073271042123, "learning_rate": 9.980639186171198e-06, "loss": 1.2462, "step": 2210 }, { "epoch": 0.38, "grad_norm": 38.85335665696665, "learning_rate": 9.980201956424748e-06, "loss": 1.2708, "step": 2215 }, { "epoch": 0.38, "grad_norm": 13.33642147794773, "learning_rate": 9.979759854412395e-06, "loss": 1.2643, "step": 2220 }, { "epoch": 0.38, "grad_norm": 11.980103706134027, "learning_rate": 9.97931288056666e-06, "loss": 1.2406, "step": 2225 }, { "epoch": 0.38, "grad_norm": 9.425628724046566, "learning_rate": 9.978861035324832e-06, "loss": 1.217, "step": 2230 }, { "epoch": 0.38, "grad_norm": 46.170020521427176, "learning_rate": 9.97840431912896e-06, "loss": 1.24, "step": 2235 }, { "epoch": 0.38, "grad_norm": 20.23431894994783, "learning_rate": 9.977942732425862e-06, "loss": 1.2447, "step": 2240 }, { "epoch": 0.38, "grad_norm": 29.45427587924372, "learning_rate": 9.977476275667123e-06, "loss": 1.2285, "step": 2245 }, { "epoch": 0.38, "grad_norm": 21.476899810487403, "learning_rate": 9.97700494930909e-06, "loss": 1.1955, "step": 2250 }, { "epoch": 0.38, "grad_norm": 18.56756736605288, "learning_rate": 9.976528753812874e-06, "loss": 1.2215, "step": 2255 }, { "epoch": 0.38, "grad_norm": 14.963469861084638, "learning_rate": 9.976047689644345e-06, "loss": 1.2168, "step": 2260 }, { "epoch": 0.39, "grad_norm": 37.7498655059393, "learning_rate": 9.97556175727415e-06, "loss": 1.233, "step": 2265 }, { "epoch": 0.39, "grad_norm": 11.34730041668875, "learning_rate": 9.975070957177681e-06, "loss": 1.2258, "step": 2270 }, { "epoch": 0.39, "grad_norm": 29.797887323807636, "learning_rate": 9.974575289835107e-06, "loss": 1.2487, "step": 2275 }, { "epoch": 0.39, "grad_norm": 10.355285431034126, "learning_rate": 9.974074755731351e-06, "loss": 1.2322, "step": 2280 }, { "epoch": 0.39, "grad_norm": 11.54182670812362, "learning_rate": 9.973569355356099e-06, "loss": 1.2535, "step": 2285 }, { "epoch": 0.39, "grad_norm": 21.753113355014868, "learning_rate": 9.973059089203797e-06, "loss": 1.2257, "step": 2290 }, { "epoch": 0.39, "grad_norm": 13.018327238321696, "learning_rate": 9.972543957773653e-06, "loss": 1.2411, "step": 2295 }, { "epoch": 0.39, "grad_norm": 11.70691445466828, "learning_rate": 9.972023961569632e-06, "loss": 1.226, "step": 2300 }, { "epoch": 0.39, "grad_norm": 40.42682718423413, "learning_rate": 9.971499101100463e-06, "loss": 1.2181, "step": 2305 }, { "epoch": 0.39, "grad_norm": 51.091493759604035, "learning_rate": 9.97096937687963e-06, "loss": 1.2247, "step": 2310 }, { "epoch": 0.39, "grad_norm": 48.195350237791374, "learning_rate": 9.970434789425378e-06, "loss": 1.2125, "step": 2315 }, { "epoch": 0.39, "grad_norm": 17.38647273683228, "learning_rate": 9.969895339260706e-06, "loss": 1.2069, "step": 2320 }, { "epoch": 0.4, "grad_norm": 30.49139360025075, "learning_rate": 9.969351026913375e-06, "loss": 1.2194, "step": 2325 }, { "epoch": 0.4, "grad_norm": 15.385511153165469, "learning_rate": 9.968801852915897e-06, "loss": 1.2209, "step": 2330 }, { "epoch": 0.4, "grad_norm": 22.600235098601413, "learning_rate": 9.968247817805548e-06, "loss": 1.233, "step": 2335 }, { "epoch": 0.4, "grad_norm": 25.51632451097952, "learning_rate": 9.967688922124351e-06, "loss": 1.2234, "step": 2340 }, { "epoch": 0.4, "grad_norm": 12.458533234384973, "learning_rate": 9.967125166419092e-06, "loss": 1.2337, "step": 2345 }, { "epoch": 0.4, "grad_norm": 15.137901496985272, "learning_rate": 9.966556551241307e-06, "loss": 1.2252, "step": 2350 }, { "epoch": 0.4, "grad_norm": 13.554021202381733, "learning_rate": 9.965983077147287e-06, "loss": 1.232, "step": 2355 }, { "epoch": 0.4, "grad_norm": 29.255477281363408, "learning_rate": 9.965404744698081e-06, "loss": 1.223, "step": 2360 }, { "epoch": 0.4, "grad_norm": 55.71066517356006, "learning_rate": 9.964821554459482e-06, "loss": 1.211, "step": 2365 }, { "epoch": 0.4, "grad_norm": 46.97293908156358, "learning_rate": 9.964233507002044e-06, "loss": 1.2306, "step": 2370 }, { "epoch": 0.4, "grad_norm": 29.82807675197631, "learning_rate": 9.963640602901069e-06, "loss": 1.1983, "step": 2375 }, { "epoch": 0.4, "grad_norm": 40.8106865600396, "learning_rate": 9.96304284273661e-06, "loss": 1.2362, "step": 2380 }, { "epoch": 0.41, "grad_norm": 37.035382402104936, "learning_rate": 9.962440227093474e-06, "loss": 1.2233, "step": 2385 }, { "epoch": 0.41, "grad_norm": 47.48410746333308, "learning_rate": 9.961832756561213e-06, "loss": 1.2354, "step": 2390 }, { "epoch": 0.41, "grad_norm": 20.34844898900087, "learning_rate": 9.961220431734137e-06, "loss": 1.2215, "step": 2395 }, { "epoch": 0.41, "grad_norm": 108.59661182177989, "learning_rate": 9.960603253211295e-06, "loss": 1.2327, "step": 2400 }, { "epoch": 0.41, "grad_norm": 76.44086594519507, "learning_rate": 9.95998122159649e-06, "loss": 1.2229, "step": 2405 }, { "epoch": 0.41, "grad_norm": 38.53725294367486, "learning_rate": 9.959354337498274e-06, "loss": 1.2166, "step": 2410 }, { "epoch": 0.41, "grad_norm": 23.492069683488847, "learning_rate": 9.958722601529945e-06, "loss": 1.2349, "step": 2415 }, { "epoch": 0.41, "grad_norm": 32.42759975420263, "learning_rate": 9.958086014309545e-06, "loss": 1.212, "step": 2420 }, { "epoch": 0.41, "grad_norm": 19.57195181578406, "learning_rate": 9.957444576459864e-06, "loss": 1.2193, "step": 2425 }, { "epoch": 0.41, "grad_norm": 26.68571032098248, "learning_rate": 9.956798288608442e-06, "loss": 1.2197, "step": 2430 }, { "epoch": 0.41, "grad_norm": 23.360805598503173, "learning_rate": 9.956147151387559e-06, "loss": 1.233, "step": 2435 }, { "epoch": 0.41, "grad_norm": 22.128646502976743, "learning_rate": 9.955491165434236e-06, "loss": 1.2141, "step": 2440 }, { "epoch": 0.42, "grad_norm": 26.658087544079972, "learning_rate": 9.954830331390245e-06, "loss": 1.1987, "step": 2445 }, { "epoch": 0.42, "grad_norm": 12.065272609028908, "learning_rate": 9.954164649902097e-06, "loss": 1.192, "step": 2450 }, { "epoch": 0.42, "grad_norm": 17.279093623666416, "learning_rate": 9.953494121621047e-06, "loss": 1.2053, "step": 2455 }, { "epoch": 0.42, "grad_norm": 24.282084124178198, "learning_rate": 9.95281874720309e-06, "loss": 1.1988, "step": 2460 }, { "epoch": 0.42, "grad_norm": 44.50488285966649, "learning_rate": 9.952138527308963e-06, "loss": 1.208, "step": 2465 }, { "epoch": 0.42, "grad_norm": 33.79862535916042, "learning_rate": 9.951453462604143e-06, "loss": 1.1916, "step": 2470 }, { "epoch": 0.42, "grad_norm": 27.13073772350017, "learning_rate": 9.950763553758848e-06, "loss": 1.21, "step": 2475 }, { "epoch": 0.42, "grad_norm": 49.80064601585301, "learning_rate": 9.950068801448037e-06, "loss": 1.2042, "step": 2480 }, { "epoch": 0.42, "grad_norm": 66.21016098075907, "learning_rate": 9.9493692063514e-06, "loss": 1.2083, "step": 2485 }, { "epoch": 0.42, "grad_norm": 26.587068635738756, "learning_rate": 9.948664769153372e-06, "loss": 1.2169, "step": 2490 }, { "epoch": 0.42, "grad_norm": 10.940254770635514, "learning_rate": 9.947955490543125e-06, "loss": 1.197, "step": 2495 }, { "epoch": 0.43, "grad_norm": 22.998827359689898, "learning_rate": 9.94724137121456e-06, "loss": 1.1862, "step": 2500 }, { "epoch": 0.43, "grad_norm": 41.25760329284341, "learning_rate": 9.946522411866325e-06, "loss": 1.1847, "step": 2505 }, { "epoch": 0.43, "grad_norm": 133.01481338781392, "learning_rate": 9.945798613201794e-06, "loss": 1.2163, "step": 2510 }, { "epoch": 0.43, "grad_norm": 89.31490352272507, "learning_rate": 9.94506997592908e-06, "loss": 1.177, "step": 2515 }, { "epoch": 0.43, "grad_norm": 30.656395841990992, "learning_rate": 9.944336500761029e-06, "loss": 1.1902, "step": 2520 }, { "epoch": 0.43, "grad_norm": 40.71404633836761, "learning_rate": 9.943598188415217e-06, "loss": 1.1812, "step": 2525 }, { "epoch": 0.43, "grad_norm": 44.483429975247155, "learning_rate": 9.942855039613958e-06, "loss": 1.2245, "step": 2530 }, { "epoch": 0.43, "grad_norm": 59.00601624439954, "learning_rate": 9.942107055084292e-06, "loss": 1.1852, "step": 2535 }, { "epoch": 0.43, "grad_norm": 48.899022645872606, "learning_rate": 9.941354235557994e-06, "loss": 1.1838, "step": 2540 }, { "epoch": 0.43, "grad_norm": 17.207646488335833, "learning_rate": 9.940596581771568e-06, "loss": 1.2067, "step": 2545 }, { "epoch": 0.43, "grad_norm": 13.145055187749728, "learning_rate": 9.939834094466245e-06, "loss": 1.2037, "step": 2550 }, { "epoch": 0.43, "grad_norm": 20.3637903577066, "learning_rate": 9.939066774387988e-06, "loss": 1.2061, "step": 2555 }, { "epoch": 0.44, "grad_norm": 23.27972184509164, "learning_rate": 9.93829462228749e-06, "loss": 1.1862, "step": 2560 }, { "epoch": 0.44, "grad_norm": 15.504767354168685, "learning_rate": 9.937517638920162e-06, "loss": 1.1787, "step": 2565 }, { "epoch": 0.44, "grad_norm": 18.882534891848458, "learning_rate": 9.936735825046154e-06, "loss": 1.1721, "step": 2570 }, { "epoch": 0.44, "grad_norm": 47.85662609246532, "learning_rate": 9.93594918143033e-06, "loss": 1.1926, "step": 2575 }, { "epoch": 0.44, "grad_norm": 80.08678346889124, "learning_rate": 9.935157708842288e-06, "loss": 1.1823, "step": 2580 }, { "epoch": 0.44, "grad_norm": 28.614889713910266, "learning_rate": 9.934361408056348e-06, "loss": 1.1815, "step": 2585 }, { "epoch": 0.44, "grad_norm": 53.20030002856704, "learning_rate": 9.933560279851549e-06, "loss": 1.2112, "step": 2590 }, { "epoch": 0.44, "grad_norm": 56.59970823556055, "learning_rate": 9.93275432501166e-06, "loss": 1.1882, "step": 2595 }, { "epoch": 0.44, "grad_norm": 43.98644204083481, "learning_rate": 9.931943544325166e-06, "loss": 1.16, "step": 2600 }, { "epoch": 0.44, "grad_norm": 15.92441460995244, "learning_rate": 9.931127938585275e-06, "loss": 1.2017, "step": 2605 }, { "epoch": 0.44, "grad_norm": 73.0494618190435, "learning_rate": 9.930307508589918e-06, "loss": 1.1923, "step": 2610 }, { "epoch": 0.44, "grad_norm": 66.41961786413435, "learning_rate": 9.929482255141744e-06, "loss": 1.1775, "step": 2615 }, { "epoch": 0.45, "grad_norm": 17.458880822215225, "learning_rate": 9.928652179048119e-06, "loss": 1.1606, "step": 2620 }, { "epoch": 0.45, "grad_norm": 21.88590996455315, "learning_rate": 9.927817281121131e-06, "loss": 1.1666, "step": 2625 }, { "epoch": 0.45, "grad_norm": 29.09234292628808, "learning_rate": 9.926977562177582e-06, "loss": 1.173, "step": 2630 }, { "epoch": 0.45, "grad_norm": 34.55403219807346, "learning_rate": 9.926133023038988e-06, "loss": 1.184, "step": 2635 }, { "epoch": 0.45, "grad_norm": 10.720705771918542, "learning_rate": 9.92528366453159e-06, "loss": 1.1818, "step": 2640 }, { "epoch": 0.45, "grad_norm": 15.388664176736826, "learning_rate": 9.924429487486339e-06, "loss": 1.1724, "step": 2645 }, { "epoch": 0.45, "grad_norm": 18.130272229112684, "learning_rate": 9.923570492738895e-06, "loss": 1.1772, "step": 2650 }, { "epoch": 0.45, "grad_norm": 9.637369376811128, "learning_rate": 9.922706681129634e-06, "loss": 1.1873, "step": 2655 }, { "epoch": 0.45, "grad_norm": 22.481508332457953, "learning_rate": 9.921838053503654e-06, "loss": 1.1841, "step": 2660 }, { "epoch": 0.45, "grad_norm": 36.96789134182856, "learning_rate": 9.92096461071075e-06, "loss": 1.1646, "step": 2665 }, { "epoch": 0.45, "grad_norm": 11.20424827349328, "learning_rate": 9.920086353605439e-06, "loss": 1.1683, "step": 2670 }, { "epoch": 0.45, "grad_norm": 22.984932609600577, "learning_rate": 9.919203283046942e-06, "loss": 1.1832, "step": 2675 }, { "epoch": 0.46, "grad_norm": 17.123529916442433, "learning_rate": 9.918315399899188e-06, "loss": 1.1692, "step": 2680 }, { "epoch": 0.46, "grad_norm": 11.409240507891948, "learning_rate": 9.91742270503082e-06, "loss": 1.1767, "step": 2685 }, { "epoch": 0.46, "grad_norm": 12.592745764863913, "learning_rate": 9.916525199315186e-06, "loss": 1.1824, "step": 2690 }, { "epoch": 0.46, "grad_norm": 35.317900020424446, "learning_rate": 9.91562288363034e-06, "loss": 1.1766, "step": 2695 }, { "epoch": 0.46, "grad_norm": 11.418561626974661, "learning_rate": 9.91471575885904e-06, "loss": 1.1722, "step": 2700 }, { "epoch": 0.46, "grad_norm": 16.186458826087673, "learning_rate": 9.91380382588875e-06, "loss": 1.1511, "step": 2705 }, { "epoch": 0.46, "grad_norm": 13.392876468515981, "learning_rate": 9.912887085611642e-06, "loss": 1.1582, "step": 2710 }, { "epoch": 0.46, "grad_norm": 16.027146893897786, "learning_rate": 9.911965538924584e-06, "loss": 1.2027, "step": 2715 }, { "epoch": 0.46, "grad_norm": 23.74651860266453, "learning_rate": 9.911039186729154e-06, "loss": 1.1886, "step": 2720 }, { "epoch": 0.46, "grad_norm": 15.629032974183286, "learning_rate": 9.910108029931622e-06, "loss": 1.1849, "step": 2725 }, { "epoch": 0.46, "grad_norm": 12.341384937928606, "learning_rate": 9.90917206944297e-06, "loss": 1.1755, "step": 2730 }, { "epoch": 0.46, "grad_norm": 35.88736400267985, "learning_rate": 9.908231306178869e-06, "loss": 1.1932, "step": 2735 }, { "epoch": 0.47, "grad_norm": 13.536749553802284, "learning_rate": 9.907285741059694e-06, "loss": 1.1806, "step": 2740 }, { "epoch": 0.47, "grad_norm": 20.12273250818116, "learning_rate": 9.906335375010518e-06, "loss": 1.178, "step": 2745 }, { "epoch": 0.47, "grad_norm": 28.476959265452525, "learning_rate": 9.905380208961109e-06, "loss": 1.1809, "step": 2750 }, { "epoch": 0.47, "grad_norm": 22.670220119269143, "learning_rate": 9.904420243845931e-06, "loss": 1.1638, "step": 2755 }, { "epoch": 0.47, "grad_norm": 66.52647296720023, "learning_rate": 9.903455480604144e-06, "loss": 1.1785, "step": 2760 }, { "epoch": 0.47, "grad_norm": 65.14747317513056, "learning_rate": 9.902485920179605e-06, "loss": 1.1931, "step": 2765 }, { "epoch": 0.47, "grad_norm": 35.62003805602506, "learning_rate": 9.901511563520855e-06, "loss": 1.1614, "step": 2770 }, { "epoch": 0.47, "grad_norm": 39.14685246448072, "learning_rate": 9.900532411581141e-06, "loss": 1.1755, "step": 2775 }, { "epoch": 0.47, "grad_norm": 89.34481105669401, "learning_rate": 9.899548465318387e-06, "loss": 1.1767, "step": 2780 }, { "epoch": 0.47, "grad_norm": 52.59479557413615, "learning_rate": 9.898559725695217e-06, "loss": 1.1697, "step": 2785 }, { "epoch": 0.47, "grad_norm": 35.276525624359245, "learning_rate": 9.89756619367894e-06, "loss": 1.1632, "step": 2790 }, { "epoch": 0.48, "grad_norm": 11.966535745499472, "learning_rate": 9.89656787024156e-06, "loss": 1.16, "step": 2795 }, { "epoch": 0.48, "grad_norm": 13.569210509013814, "learning_rate": 9.895564756359758e-06, "loss": 1.1749, "step": 2800 }, { "epoch": 0.48, "grad_norm": 19.739276355409753, "learning_rate": 9.89455685301491e-06, "loss": 1.1714, "step": 2805 }, { "epoch": 0.48, "grad_norm": 59.13841059108984, "learning_rate": 9.893544161193077e-06, "loss": 1.1816, "step": 2810 }, { "epoch": 0.48, "grad_norm": 24.5159909442423, "learning_rate": 9.892526681884997e-06, "loss": 1.1617, "step": 2815 }, { "epoch": 0.48, "grad_norm": 53.43412207505383, "learning_rate": 9.891504416086102e-06, "loss": 1.1698, "step": 2820 }, { "epoch": 0.48, "grad_norm": 12.028092300763062, "learning_rate": 9.890477364796502e-06, "loss": 1.1462, "step": 2825 }, { "epoch": 0.48, "grad_norm": 10.266849451796633, "learning_rate": 9.889445529020989e-06, "loss": 1.1739, "step": 2830 }, { "epoch": 0.48, "grad_norm": 33.256972593357, "learning_rate": 9.888408909769032e-06, "loss": 1.1638, "step": 2835 }, { "epoch": 0.48, "grad_norm": 13.240389968680208, "learning_rate": 9.887367508054788e-06, "loss": 1.1539, "step": 2840 }, { "epoch": 0.48, "grad_norm": 38.32149327567404, "learning_rate": 9.886321324897088e-06, "loss": 1.1453, "step": 2845 }, { "epoch": 0.48, "grad_norm": 17.92159888176316, "learning_rate": 9.885270361319439e-06, "loss": 1.129, "step": 2850 }, { "epoch": 0.49, "grad_norm": 51.068604121950585, "learning_rate": 9.884214618350028e-06, "loss": 1.1633, "step": 2855 }, { "epoch": 0.49, "grad_norm": 74.99666351301994, "learning_rate": 9.883154097021717e-06, "loss": 1.158, "step": 2860 }, { "epoch": 0.49, "grad_norm": 30.31376407102048, "learning_rate": 9.882088798372043e-06, "loss": 1.1438, "step": 2865 }, { "epoch": 0.49, "grad_norm": 27.670642676177405, "learning_rate": 9.881018723443214e-06, "loss": 1.1429, "step": 2870 }, { "epoch": 0.49, "grad_norm": 23.64857996752343, "learning_rate": 9.879943873282116e-06, "loss": 1.1514, "step": 2875 }, { "epoch": 0.49, "grad_norm": 13.005128955967104, "learning_rate": 9.878864248940304e-06, "loss": 1.1595, "step": 2880 }, { "epoch": 0.49, "grad_norm": 8.966016821913678, "learning_rate": 9.877779851474003e-06, "loss": 1.1417, "step": 2885 }, { "epoch": 0.49, "grad_norm": 12.446956221399299, "learning_rate": 9.876690681944107e-06, "loss": 1.1525, "step": 2890 }, { "epoch": 0.49, "grad_norm": 9.693284053412812, "learning_rate": 9.87559674141618e-06, "loss": 1.1077, "step": 2895 }, { "epoch": 0.49, "grad_norm": 17.179007907357477, "learning_rate": 9.874498030960455e-06, "loss": 1.1408, "step": 2900 }, { "epoch": 0.49, "grad_norm": 33.083858737896215, "learning_rate": 9.87339455165183e-06, "loss": 1.1454, "step": 2905 }, { "epoch": 0.49, "grad_norm": 79.29658401277891, "learning_rate": 9.872286304569867e-06, "loss": 1.1322, "step": 2910 }, { "epoch": 0.5, "grad_norm": 20.385160626134113, "learning_rate": 9.871173290798795e-06, "loss": 1.1611, "step": 2915 }, { "epoch": 0.5, "grad_norm": 10.773144522385966, "learning_rate": 9.870055511427507e-06, "loss": 1.1643, "step": 2920 }, { "epoch": 0.5, "grad_norm": 16.312322239246836, "learning_rate": 9.868932967549554e-06, "loss": 1.1377, "step": 2925 }, { "epoch": 0.5, "grad_norm": 23.14049593920567, "learning_rate": 9.867805660263152e-06, "loss": 1.1327, "step": 2930 }, { "epoch": 0.5, "grad_norm": 19.77386440226736, "learning_rate": 9.866673590671176e-06, "loss": 1.1358, "step": 2935 }, { "epoch": 0.5, "grad_norm": 30.710807933209413, "learning_rate": 9.86553675988116e-06, "loss": 1.1293, "step": 2940 }, { "epoch": 0.5, "grad_norm": 9.894878262098294, "learning_rate": 9.864395169005297e-06, "loss": 1.1428, "step": 2945 }, { "epoch": 0.5, "grad_norm": 60.79458334408716, "learning_rate": 9.863248819160436e-06, "loss": 1.1302, "step": 2950 }, { "epoch": 0.5, "grad_norm": 123.22513760731647, "learning_rate": 9.862097711468082e-06, "loss": 1.1408, "step": 2955 }, { "epoch": 0.5, "grad_norm": 12.980769182017495, "learning_rate": 9.860941847054394e-06, "loss": 1.176, "step": 2960 }, { "epoch": 0.5, "grad_norm": 77.79656950182576, "learning_rate": 9.859781227050186e-06, "loss": 1.1614, "step": 2965 }, { "epoch": 0.5, "grad_norm": 45.821558359257104, "learning_rate": 9.858615852590923e-06, "loss": 1.1402, "step": 2970 }, { "epoch": 0.51, "grad_norm": 15.431782798306614, "learning_rate": 9.857445724816723e-06, "loss": 1.1479, "step": 2975 }, { "epoch": 0.51, "grad_norm": 18.816255936865193, "learning_rate": 9.856270844872355e-06, "loss": 1.1414, "step": 2980 }, { "epoch": 0.51, "grad_norm": 10.78799968669382, "learning_rate": 9.855091213907233e-06, "loss": 1.169, "step": 2985 }, { "epoch": 0.51, "grad_norm": 20.618619597391554, "learning_rate": 9.853906833075424e-06, "loss": 1.1364, "step": 2990 }, { "epoch": 0.51, "grad_norm": 29.602152847998962, "learning_rate": 9.852717703535637e-06, "loss": 1.1313, "step": 2995 }, { "epoch": 0.51, "grad_norm": 20.41523192024887, "learning_rate": 9.85152382645123e-06, "loss": 1.13, "step": 3000 }, { "epoch": 0.51, "grad_norm": 37.823239974778716, "learning_rate": 9.85032520299021e-06, "loss": 1.1508, "step": 3005 }, { "epoch": 0.51, "grad_norm": 43.2473952972184, "learning_rate": 9.849121834325216e-06, "loss": 1.1316, "step": 3010 }, { "epoch": 0.51, "grad_norm": 52.74014485034811, "learning_rate": 9.847913721633541e-06, "loss": 1.127, "step": 3015 }, { "epoch": 0.51, "grad_norm": 11.225887741481749, "learning_rate": 9.846700866097111e-06, "loss": 1.1283, "step": 3020 }, { "epoch": 0.51, "grad_norm": 22.830528962299184, "learning_rate": 9.845483268902497e-06, "loss": 1.1332, "step": 3025 }, { "epoch": 0.52, "grad_norm": 9.962513124404307, "learning_rate": 9.844260931240908e-06, "loss": 1.1192, "step": 3030 }, { "epoch": 0.52, "grad_norm": 10.075114486387971, "learning_rate": 9.843033854308189e-06, "loss": 1.1234, "step": 3035 }, { "epoch": 0.52, "grad_norm": 43.92726259157927, "learning_rate": 9.841802039304819e-06, "loss": 1.1325, "step": 3040 }, { "epoch": 0.52, "grad_norm": 10.602847875408965, "learning_rate": 9.84056548743592e-06, "loss": 1.0945, "step": 3045 }, { "epoch": 0.52, "grad_norm": 72.84853493289299, "learning_rate": 9.839324199911244e-06, "loss": 1.1146, "step": 3050 }, { "epoch": 0.52, "grad_norm": 64.59634241144452, "learning_rate": 9.838078177945174e-06, "loss": 1.1265, "step": 3055 }, { "epoch": 0.52, "grad_norm": 13.606427097030371, "learning_rate": 9.83682742275673e-06, "loss": 1.1458, "step": 3060 }, { "epoch": 0.52, "grad_norm": 22.74960141201855, "learning_rate": 9.835571935569554e-06, "loss": 1.1276, "step": 3065 }, { "epoch": 0.52, "grad_norm": 14.872212344467762, "learning_rate": 9.834311717611929e-06, "loss": 1.1275, "step": 3070 }, { "epoch": 0.52, "grad_norm": 26.848450408931033, "learning_rate": 9.833046770116758e-06, "loss": 1.1178, "step": 3075 }, { "epoch": 0.52, "grad_norm": 39.22359304839263, "learning_rate": 9.831777094321572e-06, "loss": 1.1238, "step": 3080 }, { "epoch": 0.52, "grad_norm": 38.21441517690827, "learning_rate": 9.83050269146853e-06, "loss": 1.13, "step": 3085 }, { "epoch": 0.53, "grad_norm": 71.39821885265845, "learning_rate": 9.829223562804415e-06, "loss": 1.1276, "step": 3090 }, { "epoch": 0.53, "grad_norm": 69.47267188756749, "learning_rate": 9.827939709580631e-06, "loss": 1.1404, "step": 3095 }, { "epoch": 0.53, "grad_norm": 27.2120383573781, "learning_rate": 9.82665113305321e-06, "loss": 1.1328, "step": 3100 }, { "epoch": 0.53, "grad_norm": 17.3549164827145, "learning_rate": 9.8253578344828e-06, "loss": 1.0994, "step": 3105 }, { "epoch": 0.53, "grad_norm": 26.975780544067167, "learning_rate": 9.824059815134668e-06, "loss": 1.114, "step": 3110 }, { "epoch": 0.53, "grad_norm": 10.89034671963636, "learning_rate": 9.822757076278701e-06, "loss": 1.127, "step": 3115 }, { "epoch": 0.53, "grad_norm": 18.784217590750867, "learning_rate": 9.821449619189407e-06, "loss": 1.1127, "step": 3120 }, { "epoch": 0.53, "grad_norm": 16.672619503726025, "learning_rate": 9.820137445145903e-06, "loss": 1.1257, "step": 3125 }, { "epoch": 0.53, "grad_norm": 33.283960901757666, "learning_rate": 9.818820555431928e-06, "loss": 1.1119, "step": 3130 }, { "epoch": 0.53, "grad_norm": 20.205326596044547, "learning_rate": 9.817498951335827e-06, "loss": 1.1172, "step": 3135 }, { "epoch": 0.53, "grad_norm": 38.647807030435, "learning_rate": 9.81617263415056e-06, "loss": 1.1332, "step": 3140 }, { "epoch": 0.53, "grad_norm": 14.620577977257334, "learning_rate": 9.814841605173704e-06, "loss": 1.0901, "step": 3145 }, { "epoch": 0.54, "grad_norm": 30.05892374289853, "learning_rate": 9.813505865707437e-06, "loss": 1.1164, "step": 3150 }, { "epoch": 0.54, "grad_norm": 22.96909488857937, "learning_rate": 9.81216541705855e-06, "loss": 1.1051, "step": 3155 }, { "epoch": 0.54, "grad_norm": 18.269660698493023, "learning_rate": 9.810820260538441e-06, "loss": 1.1375, "step": 3160 }, { "epoch": 0.54, "grad_norm": 13.772284348648533, "learning_rate": 9.809470397463112e-06, "loss": 1.116, "step": 3165 }, { "epoch": 0.54, "grad_norm": 10.855531811848072, "learning_rate": 9.808115829153169e-06, "loss": 1.1404, "step": 3170 }, { "epoch": 0.54, "grad_norm": 15.71560575354875, "learning_rate": 9.806756556933823e-06, "loss": 1.1401, "step": 3175 }, { "epoch": 0.54, "grad_norm": 10.457646473635371, "learning_rate": 9.805392582134888e-06, "loss": 1.0976, "step": 3180 }, { "epoch": 0.54, "grad_norm": 12.021716252996788, "learning_rate": 9.804023906090779e-06, "loss": 1.1171, "step": 3185 }, { "epoch": 0.54, "grad_norm": 36.17543609821767, "learning_rate": 9.802650530140503e-06, "loss": 1.1359, "step": 3190 }, { "epoch": 0.54, "grad_norm": 14.174434300819575, "learning_rate": 9.801272455627678e-06, "loss": 1.1354, "step": 3195 }, { "epoch": 0.54, "grad_norm": 17.968977941832232, "learning_rate": 9.799889683900506e-06, "loss": 1.082, "step": 3200 }, { "epoch": 0.54, "grad_norm": 12.179685514337823, "learning_rate": 9.798502216311791e-06, "loss": 1.1101, "step": 3205 }, { "epoch": 0.55, "grad_norm": 28.261361854490232, "learning_rate": 9.797110054218932e-06, "loss": 1.1267, "step": 3210 }, { "epoch": 0.55, "grad_norm": 24.91609567279778, "learning_rate": 9.795713198983918e-06, "loss": 1.1207, "step": 3215 }, { "epoch": 0.55, "grad_norm": 19.450449096827676, "learning_rate": 9.794311651973329e-06, "loss": 1.1189, "step": 3220 }, { "epoch": 0.55, "grad_norm": 34.838540645917654, "learning_rate": 9.792905414558336e-06, "loss": 1.1238, "step": 3225 }, { "epoch": 0.55, "grad_norm": 49.79019197252163, "learning_rate": 9.7914944881147e-06, "loss": 1.1031, "step": 3230 }, { "epoch": 0.55, "grad_norm": 22.569223081112888, "learning_rate": 9.790078874022767e-06, "loss": 1.103, "step": 3235 }, { "epoch": 0.55, "grad_norm": 34.088080046583904, "learning_rate": 9.788658573667475e-06, "loss": 1.1132, "step": 3240 }, { "epoch": 0.55, "grad_norm": 110.16064646913127, "learning_rate": 9.787233588438336e-06, "loss": 1.1, "step": 3245 }, { "epoch": 0.55, "grad_norm": 40.23449989378385, "learning_rate": 9.785803919729455e-06, "loss": 1.1135, "step": 3250 }, { "epoch": 0.55, "grad_norm": 44.46605455243022, "learning_rate": 9.784369568939516e-06, "loss": 1.1247, "step": 3255 }, { "epoch": 0.55, "grad_norm": 21.131004713406863, "learning_rate": 9.78293053747178e-06, "loss": 1.1123, "step": 3260 }, { "epoch": 0.56, "grad_norm": 21.71929947964882, "learning_rate": 9.781486826734092e-06, "loss": 1.0865, "step": 3265 }, { "epoch": 0.56, "grad_norm": 42.99587715186835, "learning_rate": 9.780038438138875e-06, "loss": 1.1011, "step": 3270 }, { "epoch": 0.56, "grad_norm": 39.95133149205219, "learning_rate": 9.778585373103123e-06, "loss": 1.0921, "step": 3275 }, { "epoch": 0.56, "grad_norm": 26.10809015145351, "learning_rate": 9.777127633048412e-06, "loss": 1.1019, "step": 3280 }, { "epoch": 0.56, "grad_norm": 34.02220191762274, "learning_rate": 9.775665219400884e-06, "loss": 1.1112, "step": 3285 }, { "epoch": 0.56, "grad_norm": 12.365701088432422, "learning_rate": 9.774198133591263e-06, "loss": 1.0958, "step": 3290 }, { "epoch": 0.56, "grad_norm": 10.289864979863733, "learning_rate": 9.772726377054838e-06, "loss": 1.0848, "step": 3295 }, { "epoch": 0.56, "grad_norm": 14.09533795089284, "learning_rate": 9.771249951231465e-06, "loss": 1.1023, "step": 3300 }, { "epoch": 0.56, "grad_norm": 33.23702455667827, "learning_rate": 9.769768857565573e-06, "loss": 1.1026, "step": 3305 }, { "epoch": 0.56, "grad_norm": 10.308081889604273, "learning_rate": 9.768283097506155e-06, "loss": 1.079, "step": 3310 }, { "epoch": 0.56, "grad_norm": 24.694123563015506, "learning_rate": 9.766792672506771e-06, "loss": 1.0742, "step": 3315 }, { "epoch": 0.56, "grad_norm": 20.34539350546079, "learning_rate": 9.765297584025542e-06, "loss": 1.0914, "step": 3320 }, { "epoch": 0.57, "grad_norm": 46.916385220387916, "learning_rate": 9.763797833525157e-06, "loss": 1.097, "step": 3325 }, { "epoch": 0.57, "grad_norm": 10.347280588345283, "learning_rate": 9.76229342247286e-06, "loss": 1.0702, "step": 3330 }, { "epoch": 0.57, "grad_norm": 8.490168781307537, "learning_rate": 9.760784352340457e-06, "loss": 1.1083, "step": 3335 }, { "epoch": 0.57, "grad_norm": 35.84052955467092, "learning_rate": 9.759270624604314e-06, "loss": 1.0903, "step": 3340 }, { "epoch": 0.57, "grad_norm": 30.14735937800149, "learning_rate": 9.757752240745348e-06, "loss": 1.0864, "step": 3345 }, { "epoch": 0.57, "grad_norm": 14.482082933392752, "learning_rate": 9.756229202249038e-06, "loss": 1.1044, "step": 3350 }, { "epoch": 0.57, "grad_norm": 24.69217569957398, "learning_rate": 9.75470151060541e-06, "loss": 1.0724, "step": 3355 }, { "epoch": 0.57, "grad_norm": 11.182521065589473, "learning_rate": 9.753169167309052e-06, "loss": 1.1146, "step": 3360 }, { "epoch": 0.57, "grad_norm": 11.069497891124406, "learning_rate": 9.751632173859092e-06, "loss": 1.0826, "step": 3365 }, { "epoch": 0.57, "grad_norm": 10.510795081029487, "learning_rate": 9.750090531759211e-06, "loss": 1.0923, "step": 3370 }, { "epoch": 0.57, "grad_norm": 25.081856809348373, "learning_rate": 9.748544242517644e-06, "loss": 1.0852, "step": 3375 }, { "epoch": 0.57, "grad_norm": 16.06757949104813, "learning_rate": 9.746993307647165e-06, "loss": 1.1093, "step": 3380 }, { "epoch": 0.58, "grad_norm": 35.25108551159975, "learning_rate": 9.745437728665095e-06, "loss": 1.0882, "step": 3385 }, { "epoch": 0.58, "grad_norm": 10.884674188000995, "learning_rate": 9.743877507093298e-06, "loss": 1.0845, "step": 3390 }, { "epoch": 0.58, "grad_norm": 10.939698638231848, "learning_rate": 9.742312644458183e-06, "loss": 1.1061, "step": 3395 }, { "epoch": 0.58, "grad_norm": 19.83963911448851, "learning_rate": 9.740743142290696e-06, "loss": 1.0948, "step": 3400 }, { "epoch": 0.58, "grad_norm": 16.542057789428704, "learning_rate": 9.739169002126326e-06, "loss": 1.0681, "step": 3405 }, { "epoch": 0.58, "grad_norm": 12.85859530160301, "learning_rate": 9.737590225505091e-06, "loss": 1.0714, "step": 3410 }, { "epoch": 0.58, "grad_norm": 10.529021625117974, "learning_rate": 9.736006813971557e-06, "loss": 1.083, "step": 3415 }, { "epoch": 0.58, "grad_norm": 32.642327759447724, "learning_rate": 9.734418769074813e-06, "loss": 1.0949, "step": 3420 }, { "epoch": 0.58, "grad_norm": 18.722861850410514, "learning_rate": 9.732826092368491e-06, "loss": 1.0958, "step": 3425 }, { "epoch": 0.58, "grad_norm": 39.00659387481143, "learning_rate": 9.731228785410746e-06, "loss": 1.0861, "step": 3430 }, { "epoch": 0.58, "grad_norm": 10.711037906612924, "learning_rate": 9.729626849764266e-06, "loss": 1.0772, "step": 3435 }, { "epoch": 0.58, "grad_norm": 9.42121870550603, "learning_rate": 9.728020286996273e-06, "loss": 1.0815, "step": 3440 }, { "epoch": 0.59, "grad_norm": 30.639900438665272, "learning_rate": 9.726409098678505e-06, "loss": 1.0801, "step": 3445 }, { "epoch": 0.59, "grad_norm": 11.123471421452953, "learning_rate": 9.724793286387233e-06, "loss": 1.0938, "step": 3450 }, { "epoch": 0.59, "grad_norm": 19.93670960977343, "learning_rate": 9.72317285170325e-06, "loss": 1.0521, "step": 3455 }, { "epoch": 0.59, "grad_norm": 28.837291173282402, "learning_rate": 9.721547796211872e-06, "loss": 1.0937, "step": 3460 }, { "epoch": 0.59, "grad_norm": 99.22043991359959, "learning_rate": 9.719918121502933e-06, "loss": 1.1099, "step": 3465 }, { "epoch": 0.59, "grad_norm": 118.91259663380502, "learning_rate": 9.718283829170788e-06, "loss": 1.0882, "step": 3470 }, { "epoch": 0.59, "grad_norm": 94.83197424147019, "learning_rate": 9.71664492081431e-06, "loss": 1.1115, "step": 3475 }, { "epoch": 0.59, "grad_norm": 23.176334095455275, "learning_rate": 9.715001398036884e-06, "loss": 1.1077, "step": 3480 }, { "epoch": 0.59, "grad_norm": 23.25526333684265, "learning_rate": 9.713353262446419e-06, "loss": 1.0704, "step": 3485 }, { "epoch": 0.59, "grad_norm": 62.50190406613821, "learning_rate": 9.711700515655327e-06, "loss": 1.0833, "step": 3490 }, { "epoch": 0.59, "grad_norm": 46.93771092847943, "learning_rate": 9.710043159280532e-06, "loss": 1.0825, "step": 3495 }, { "epoch": 0.6, "grad_norm": 29.238170589766323, "learning_rate": 9.708381194943476e-06, "loss": 1.0879, "step": 3500 }, { "epoch": 0.6, "grad_norm": 13.685350295706728, "learning_rate": 9.706714624270097e-06, "loss": 1.0783, "step": 3505 }, { "epoch": 0.6, "grad_norm": 13.303197061444635, "learning_rate": 9.705043448890852e-06, "loss": 1.0803, "step": 3510 }, { "epoch": 0.6, "grad_norm": 23.053480521822973, "learning_rate": 9.703367670440695e-06, "loss": 1.0666, "step": 3515 }, { "epoch": 0.6, "grad_norm": 16.52258620177513, "learning_rate": 9.701687290559084e-06, "loss": 1.082, "step": 3520 }, { "epoch": 0.6, "grad_norm": 22.692161284567145, "learning_rate": 9.700002310889981e-06, "loss": 1.0668, "step": 3525 }, { "epoch": 0.6, "grad_norm": 20.73627310180072, "learning_rate": 9.698312733081847e-06, "loss": 1.0806, "step": 3530 }, { "epoch": 0.6, "grad_norm": 31.383021093811415, "learning_rate": 9.696618558787643e-06, "loss": 1.0799, "step": 3535 }, { "epoch": 0.6, "grad_norm": 35.7883766937957, "learning_rate": 9.69491978966482e-06, "loss": 1.0579, "step": 3540 }, { "epoch": 0.6, "grad_norm": 22.439749721861627, "learning_rate": 9.693216427375338e-06, "loss": 1.055, "step": 3545 }, { "epoch": 0.6, "grad_norm": 10.528624040661086, "learning_rate": 9.691508473585635e-06, "loss": 1.0866, "step": 3550 }, { "epoch": 0.6, "grad_norm": 14.191808567072622, "learning_rate": 9.689795929966653e-06, "loss": 1.0625, "step": 3555 }, { "epoch": 0.61, "grad_norm": 49.35630024440896, "learning_rate": 9.688078798193816e-06, "loss": 1.0858, "step": 3560 }, { "epoch": 0.61, "grad_norm": 141.8741033629112, "learning_rate": 9.686357079947042e-06, "loss": 1.0819, "step": 3565 }, { "epoch": 0.61, "grad_norm": 101.01456442603427, "learning_rate": 9.684630776910734e-06, "loss": 1.0799, "step": 3570 }, { "epoch": 0.61, "grad_norm": 48.44038091165258, "learning_rate": 9.682899890773782e-06, "loss": 1.0784, "step": 3575 }, { "epoch": 0.61, "grad_norm": 39.09727281088586, "learning_rate": 9.68116442322956e-06, "loss": 1.094, "step": 3580 }, { "epoch": 0.61, "grad_norm": 58.65427281326046, "learning_rate": 9.679424375975916e-06, "loss": 1.0738, "step": 3585 }, { "epoch": 0.61, "grad_norm": 14.717354874808702, "learning_rate": 9.677679750715194e-06, "loss": 1.0706, "step": 3590 }, { "epoch": 0.61, "grad_norm": 12.32608732711157, "learning_rate": 9.675930549154201e-06, "loss": 1.0567, "step": 3595 }, { "epoch": 0.61, "grad_norm": 20.979587808667443, "learning_rate": 9.674176773004232e-06, "loss": 1.0738, "step": 3600 }, { "epoch": 0.61, "grad_norm": 19.984425443283612, "learning_rate": 9.672418423981051e-06, "loss": 1.0816, "step": 3605 }, { "epoch": 0.61, "grad_norm": 15.322899089745414, "learning_rate": 9.6706555038049e-06, "loss": 1.0641, "step": 3610 }, { "epoch": 0.61, "grad_norm": 14.27669372683442, "learning_rate": 9.66888801420049e-06, "loss": 1.0529, "step": 3615 }, { "epoch": 0.62, "grad_norm": 26.73787927731739, "learning_rate": 9.667115956897007e-06, "loss": 1.0565, "step": 3620 }, { "epoch": 0.62, "grad_norm": 14.332940164863567, "learning_rate": 9.6653393336281e-06, "loss": 1.0608, "step": 3625 }, { "epoch": 0.62, "grad_norm": 17.957432148595675, "learning_rate": 9.663558146131886e-06, "loss": 1.062, "step": 3630 }, { "epoch": 0.62, "grad_norm": 17.524600244030466, "learning_rate": 9.66177239615095e-06, "loss": 1.0416, "step": 3635 }, { "epoch": 0.62, "grad_norm": 33.43797866232677, "learning_rate": 9.65998208543234e-06, "loss": 1.0665, "step": 3640 }, { "epoch": 0.62, "grad_norm": 28.187859604767837, "learning_rate": 9.658187215727567e-06, "loss": 1.0512, "step": 3645 }, { "epoch": 0.62, "grad_norm": 43.17834437252287, "learning_rate": 9.656387788792594e-06, "loss": 1.0667, "step": 3650 }, { "epoch": 0.62, "grad_norm": 16.833570997925662, "learning_rate": 9.654583806387855e-06, "loss": 1.0474, "step": 3655 }, { "epoch": 0.62, "grad_norm": 19.4936094387291, "learning_rate": 9.652775270278236e-06, "loss": 1.0629, "step": 3660 }, { "epoch": 0.62, "grad_norm": 12.406721518276477, "learning_rate": 9.65096218223307e-06, "loss": 1.0415, "step": 3665 }, { "epoch": 0.62, "grad_norm": 14.348020212680531, "learning_rate": 9.649144544026154e-06, "loss": 1.0609, "step": 3670 }, { "epoch": 0.62, "grad_norm": 25.662165096305973, "learning_rate": 9.647322357435734e-06, "loss": 1.0547, "step": 3675 }, { "epoch": 0.63, "grad_norm": 62.94870309796483, "learning_rate": 9.6454956242445e-06, "loss": 1.0514, "step": 3680 }, { "epoch": 0.63, "grad_norm": 33.18978079391046, "learning_rate": 9.643664346239598e-06, "loss": 1.0716, "step": 3685 }, { "epoch": 0.63, "grad_norm": 34.21728007762453, "learning_rate": 9.641828525212616e-06, "loss": 1.0459, "step": 3690 }, { "epoch": 0.63, "grad_norm": 19.398489033262475, "learning_rate": 9.639988162959586e-06, "loss": 1.0614, "step": 3695 }, { "epoch": 0.63, "grad_norm": 35.381394002160434, "learning_rate": 9.638143261280983e-06, "loss": 1.0515, "step": 3700 }, { "epoch": 0.63, "grad_norm": 52.88442016004509, "learning_rate": 9.636293821981728e-06, "loss": 1.0715, "step": 3705 }, { "epoch": 0.63, "grad_norm": 60.17940969814508, "learning_rate": 9.634439846871173e-06, "loss": 1.0605, "step": 3710 }, { "epoch": 0.63, "grad_norm": 42.01453778390265, "learning_rate": 9.632581337763115e-06, "loss": 1.0632, "step": 3715 }, { "epoch": 0.63, "grad_norm": 16.91412969693914, "learning_rate": 9.630718296475782e-06, "loss": 1.0546, "step": 3720 }, { "epoch": 0.63, "grad_norm": 9.122096671164469, "learning_rate": 9.628850724831838e-06, "loss": 1.0404, "step": 3725 }, { "epoch": 0.63, "grad_norm": 10.360961065245824, "learning_rate": 9.626978624658378e-06, "loss": 1.0499, "step": 3730 }, { "epoch": 0.63, "grad_norm": 20.681549559872614, "learning_rate": 9.625101997786929e-06, "loss": 1.0515, "step": 3735 }, { "epoch": 0.64, "grad_norm": 34.311939706830195, "learning_rate": 9.623220846053448e-06, "loss": 1.0422, "step": 3740 }, { "epoch": 0.64, "grad_norm": 9.754605131977751, "learning_rate": 9.621335171298312e-06, "loss": 1.0469, "step": 3745 }, { "epoch": 0.64, "grad_norm": 17.552801959273697, "learning_rate": 9.619444975366328e-06, "loss": 1.0359, "step": 3750 }, { "epoch": 0.64, "grad_norm": 16.2524305965504, "learning_rate": 9.617550260106729e-06, "loss": 1.0403, "step": 3755 }, { "epoch": 0.64, "grad_norm": 11.093338843745048, "learning_rate": 9.615651027373163e-06, "loss": 1.0454, "step": 3760 }, { "epoch": 0.64, "grad_norm": 10.439354245341704, "learning_rate": 9.613747279023704e-06, "loss": 1.0597, "step": 3765 }, { "epoch": 0.64, "grad_norm": 34.420764529314724, "learning_rate": 9.611839016920837e-06, "loss": 1.034, "step": 3770 }, { "epoch": 0.64, "grad_norm": 9.15542917484291, "learning_rate": 9.609926242931467e-06, "loss": 1.0317, "step": 3775 }, { "epoch": 0.64, "grad_norm": 20.92806883061577, "learning_rate": 9.608008958926911e-06, "loss": 1.0545, "step": 3780 }, { "epoch": 0.64, "grad_norm": 20.937194797150127, "learning_rate": 9.606087166782904e-06, "loss": 1.0473, "step": 3785 }, { "epoch": 0.64, "grad_norm": 9.847890544358966, "learning_rate": 9.604160868379584e-06, "loss": 1.0362, "step": 3790 }, { "epoch": 0.65, "grad_norm": 61.39464963918651, "learning_rate": 9.6022300656015e-06, "loss": 1.0238, "step": 3795 }, { "epoch": 0.65, "grad_norm": 101.58609678822324, "learning_rate": 9.600294760337611e-06, "loss": 1.0628, "step": 3800 }, { "epoch": 0.65, "grad_norm": 48.990707631127584, "learning_rate": 9.598354954481276e-06, "loss": 1.0538, "step": 3805 }, { "epoch": 0.65, "grad_norm": 17.648537571948292, "learning_rate": 9.596410649930262e-06, "loss": 1.046, "step": 3810 }, { "epoch": 0.65, "grad_norm": 21.385685025543232, "learning_rate": 9.59446184858673e-06, "loss": 1.0317, "step": 3815 }, { "epoch": 0.65, "grad_norm": 27.822628216288326, "learning_rate": 9.592508552357251e-06, "loss": 1.0599, "step": 3820 }, { "epoch": 0.65, "grad_norm": 43.65608973671009, "learning_rate": 9.590550763152781e-06, "loss": 1.0216, "step": 3825 }, { "epoch": 0.65, "grad_norm": 68.66082796487945, "learning_rate": 9.588588482888684e-06, "loss": 1.0369, "step": 3830 }, { "epoch": 0.65, "grad_norm": 44.87563845739073, "learning_rate": 9.586621713484708e-06, "loss": 1.0447, "step": 3835 }, { "epoch": 0.65, "grad_norm": 85.29718189990733, "learning_rate": 9.584650456864997e-06, "loss": 1.0345, "step": 3840 }, { "epoch": 0.65, "grad_norm": 41.63515269827128, "learning_rate": 9.582674714958088e-06, "loss": 1.0539, "step": 3845 }, { "epoch": 0.65, "grad_norm": 44.2242489976644, "learning_rate": 9.580694489696896e-06, "loss": 1.0205, "step": 3850 }, { "epoch": 0.66, "grad_norm": 36.041665972339715, "learning_rate": 9.578709783018734e-06, "loss": 1.0488, "step": 3855 }, { "epoch": 0.66, "grad_norm": 67.57874599444452, "learning_rate": 9.576720596865292e-06, "loss": 1.0363, "step": 3860 }, { "epoch": 0.66, "grad_norm": 38.21695327694072, "learning_rate": 9.574726933182645e-06, "loss": 1.0391, "step": 3865 }, { "epoch": 0.66, "grad_norm": 13.661349716058288, "learning_rate": 9.572728793921248e-06, "loss": 1.0213, "step": 3870 }, { "epoch": 0.66, "grad_norm": 12.13353362747518, "learning_rate": 9.570726181035934e-06, "loss": 1.0435, "step": 3875 }, { "epoch": 0.66, "grad_norm": 11.334770151580553, "learning_rate": 9.568719096485915e-06, "loss": 1.0535, "step": 3880 }, { "epoch": 0.66, "grad_norm": 13.117262631880102, "learning_rate": 9.566707542234774e-06, "loss": 1.0373, "step": 3885 }, { "epoch": 0.66, "grad_norm": 10.325426340360542, "learning_rate": 9.56469152025047e-06, "loss": 1.02, "step": 3890 }, { "epoch": 0.66, "grad_norm": 27.760103500384453, "learning_rate": 9.562671032505328e-06, "loss": 1.0613, "step": 3895 }, { "epoch": 0.66, "grad_norm": 19.677606688797173, "learning_rate": 9.560646080976052e-06, "loss": 1.0374, "step": 3900 }, { "epoch": 0.66, "grad_norm": 12.221222305008846, "learning_rate": 9.558616667643703e-06, "loss": 1.0383, "step": 3905 }, { "epoch": 0.66, "grad_norm": 11.05047992856828, "learning_rate": 9.55658279449371e-06, "loss": 1.0332, "step": 3910 }, { "epoch": 0.67, "grad_norm": 8.783299101241285, "learning_rate": 9.554544463515867e-06, "loss": 1.0334, "step": 3915 }, { "epoch": 0.67, "grad_norm": 14.469272814234113, "learning_rate": 9.552501676704328e-06, "loss": 1.0243, "step": 3920 }, { "epoch": 0.67, "grad_norm": 11.437389694030383, "learning_rate": 9.55045443605761e-06, "loss": 1.042, "step": 3925 }, { "epoch": 0.67, "grad_norm": 15.818804679626847, "learning_rate": 9.548402743578578e-06, "loss": 1.0106, "step": 3930 }, { "epoch": 0.67, "grad_norm": 72.12551959106173, "learning_rate": 9.54634660127446e-06, "loss": 1.0335, "step": 3935 }, { "epoch": 0.67, "grad_norm": 60.35525493588452, "learning_rate": 9.544286011156838e-06, "loss": 1.0206, "step": 3940 }, { "epoch": 0.67, "grad_norm": 28.517071830491236, "learning_rate": 9.542220975241641e-06, "loss": 1.027, "step": 3945 }, { "epoch": 0.67, "grad_norm": 14.41421474517318, "learning_rate": 9.540151495549148e-06, "loss": 1.0294, "step": 3950 }, { "epoch": 0.67, "grad_norm": 10.95708153303163, "learning_rate": 9.538077574103988e-06, "loss": 1.0358, "step": 3955 }, { "epoch": 0.67, "grad_norm": 21.632102740651966, "learning_rate": 9.535999212935135e-06, "loss": 1.0211, "step": 3960 }, { "epoch": 0.67, "grad_norm": 13.86693762831278, "learning_rate": 9.533916414075906e-06, "loss": 1.0252, "step": 3965 }, { "epoch": 0.67, "grad_norm": 21.262197447013794, "learning_rate": 9.531829179563958e-06, "loss": 1.039, "step": 3970 }, { "epoch": 0.68, "grad_norm": 34.54442719901615, "learning_rate": 9.529737511441288e-06, "loss": 1.0335, "step": 3975 }, { "epoch": 0.68, "grad_norm": 63.2216943225613, "learning_rate": 9.527641411754234e-06, "loss": 1.0274, "step": 3980 }, { "epoch": 0.68, "grad_norm": 48.38593781451802, "learning_rate": 9.525540882553465e-06, "loss": 1.0166, "step": 3985 }, { "epoch": 0.68, "grad_norm": 45.643379369645665, "learning_rate": 9.523435925893986e-06, "loss": 1.0045, "step": 3990 }, { "epoch": 0.68, "grad_norm": 24.825075531481474, "learning_rate": 9.521326543835135e-06, "loss": 1.0216, "step": 3995 }, { "epoch": 0.68, "grad_norm": 32.30923946943804, "learning_rate": 9.519212738440572e-06, "loss": 1.0252, "step": 4000 }, { "epoch": 0.68, "grad_norm": 31.268891186427773, "learning_rate": 9.517094511778294e-06, "loss": 0.9941, "step": 4005 }, { "epoch": 0.68, "grad_norm": 19.506914693015624, "learning_rate": 9.514971865920618e-06, "loss": 1.0087, "step": 4010 }, { "epoch": 0.68, "grad_norm": 16.515776436928725, "learning_rate": 9.512844802944186e-06, "loss": 1.0195, "step": 4015 }, { "epoch": 0.68, "grad_norm": 15.148277998784094, "learning_rate": 9.510713324929962e-06, "loss": 1.0374, "step": 4020 }, { "epoch": 0.68, "grad_norm": 12.235328265558776, "learning_rate": 9.508577433963227e-06, "loss": 1.0438, "step": 4025 }, { "epoch": 0.69, "grad_norm": 17.09979546945935, "learning_rate": 9.506437132133581e-06, "loss": 1.0227, "step": 4030 }, { "epoch": 0.69, "grad_norm": 8.238589281663184, "learning_rate": 9.50429242153494e-06, "loss": 1.0176, "step": 4035 }, { "epoch": 0.69, "grad_norm": 10.311313851556408, "learning_rate": 9.50214330426553e-06, "loss": 1.0284, "step": 4040 }, { "epoch": 0.69, "grad_norm": 12.345687173293602, "learning_rate": 9.499989782427893e-06, "loss": 1.0142, "step": 4045 }, { "epoch": 0.69, "grad_norm": 10.004499727601269, "learning_rate": 9.497831858128876e-06, "loss": 1.0229, "step": 4050 }, { "epoch": 0.69, "grad_norm": 14.725097828165937, "learning_rate": 9.495669533479634e-06, "loss": 1.0155, "step": 4055 }, { "epoch": 0.69, "grad_norm": 14.228255051625794, "learning_rate": 9.49350281059563e-06, "loss": 1.0027, "step": 4060 }, { "epoch": 0.69, "grad_norm": 15.82098769307729, "learning_rate": 9.491331691596625e-06, "loss": 1.0003, "step": 4065 }, { "epoch": 0.69, "grad_norm": 20.029534817911642, "learning_rate": 9.489156178606684e-06, "loss": 1.0191, "step": 4070 }, { "epoch": 0.69, "grad_norm": 14.096382014592697, "learning_rate": 9.48697627375417e-06, "loss": 1.0358, "step": 4075 }, { "epoch": 0.69, "grad_norm": 11.164168759887309, "learning_rate": 9.484791979171744e-06, "loss": 1.0141, "step": 4080 }, { "epoch": 0.69, "grad_norm": 13.03131118489538, "learning_rate": 9.482603296996358e-06, "loss": 1.0161, "step": 4085 }, { "epoch": 0.7, "grad_norm": 20.75518249609745, "learning_rate": 9.48041022936926e-06, "loss": 1.0082, "step": 4090 }, { "epoch": 0.7, "grad_norm": 18.349151443252808, "learning_rate": 9.478212778435987e-06, "loss": 1.0038, "step": 4095 }, { "epoch": 0.7, "grad_norm": 11.74009872478776, "learning_rate": 9.476010946346365e-06, "loss": 1.0176, "step": 4100 }, { "epoch": 0.7, "grad_norm": 13.412538009143013, "learning_rate": 9.473804735254507e-06, "loss": 1.0209, "step": 4105 }, { "epoch": 0.7, "grad_norm": 55.590697499654354, "learning_rate": 9.471594147318806e-06, "loss": 1.0235, "step": 4110 }, { "epoch": 0.7, "grad_norm": 20.395513398092206, "learning_rate": 9.469379184701942e-06, "loss": 1.0252, "step": 4115 }, { "epoch": 0.7, "grad_norm": 82.57676781630296, "learning_rate": 9.46715984957087e-06, "loss": 1.0101, "step": 4120 }, { "epoch": 0.7, "grad_norm": 109.30761353748431, "learning_rate": 9.464936144096828e-06, "loss": 1.0, "step": 4125 }, { "epoch": 0.7, "grad_norm": 53.283011866055276, "learning_rate": 9.462708070455327e-06, "loss": 1.0277, "step": 4130 }, { "epoch": 0.7, "grad_norm": 53.341823780234854, "learning_rate": 9.46047563082615e-06, "loss": 1.0237, "step": 4135 }, { "epoch": 0.7, "grad_norm": 18.15169787672353, "learning_rate": 9.458238827393353e-06, "loss": 1.0182, "step": 4140 }, { "epoch": 0.7, "grad_norm": 9.281494842923637, "learning_rate": 9.455997662345262e-06, "loss": 0.9904, "step": 4145 }, { "epoch": 0.71, "grad_norm": 12.863337348739847, "learning_rate": 9.45375213787447e-06, "loss": 1.0252, "step": 4150 }, { "epoch": 0.71, "grad_norm": 19.464852452342406, "learning_rate": 9.451502256177832e-06, "loss": 1.008, "step": 4155 }, { "epoch": 0.71, "grad_norm": 23.836143947796305, "learning_rate": 9.44924801945647e-06, "loss": 1.0041, "step": 4160 }, { "epoch": 0.71, "grad_norm": 21.083807605337846, "learning_rate": 9.446989429915763e-06, "loss": 1.0062, "step": 4165 }, { "epoch": 0.71, "grad_norm": 20.37749053282835, "learning_rate": 9.44472648976535e-06, "loss": 1.0121, "step": 4170 }, { "epoch": 0.71, "grad_norm": 15.873447859539434, "learning_rate": 9.442459201219127e-06, "loss": 1.0051, "step": 4175 }, { "epoch": 0.71, "grad_norm": 18.959460649769564, "learning_rate": 9.440187566495246e-06, "loss": 1.004, "step": 4180 }, { "epoch": 0.71, "grad_norm": 27.467828324088764, "learning_rate": 9.437911587816105e-06, "loss": 0.9968, "step": 4185 }, { "epoch": 0.71, "grad_norm": 43.20261939532219, "learning_rate": 9.435631267408355e-06, "loss": 1.0115, "step": 4190 }, { "epoch": 0.71, "grad_norm": 26.800565504607206, "learning_rate": 9.4333466075029e-06, "loss": 1.0122, "step": 4195 }, { "epoch": 0.71, "grad_norm": 9.94300587635886, "learning_rate": 9.431057610334878e-06, "loss": 1.0043, "step": 4200 }, { "epoch": 0.71, "grad_norm": 10.614855349141937, "learning_rate": 9.42876427814368e-06, "loss": 1.0002, "step": 4205 }, { "epoch": 0.72, "grad_norm": 11.144998257948764, "learning_rate": 9.426466613172935e-06, "loss": 0.9941, "step": 4210 }, { "epoch": 0.72, "grad_norm": 9.103838790198429, "learning_rate": 9.42416461767051e-06, "loss": 0.9967, "step": 4215 }, { "epoch": 0.72, "grad_norm": 11.103071437327314, "learning_rate": 9.421858293888509e-06, "loss": 0.9872, "step": 4220 }, { "epoch": 0.72, "grad_norm": 17.4543054300558, "learning_rate": 9.41954764408327e-06, "loss": 1.0034, "step": 4225 }, { "epoch": 0.72, "grad_norm": 29.419165904681627, "learning_rate": 9.417232670515367e-06, "loss": 0.9876, "step": 4230 }, { "epoch": 0.72, "grad_norm": 24.500606114143153, "learning_rate": 9.414913375449598e-06, "loss": 1.004, "step": 4235 }, { "epoch": 0.72, "grad_norm": 20.066193807337648, "learning_rate": 9.41258976115499e-06, "loss": 1.0127, "step": 4240 }, { "epoch": 0.72, "grad_norm": 52.031867154118224, "learning_rate": 9.410261829904805e-06, "loss": 1.0046, "step": 4245 }, { "epoch": 0.72, "grad_norm": 21.157296869356358, "learning_rate": 9.407929583976514e-06, "loss": 1.0206, "step": 4250 }, { "epoch": 0.72, "grad_norm": 7.959852530800298, "learning_rate": 9.40559302565182e-06, "loss": 0.9852, "step": 4255 }, { "epoch": 0.72, "grad_norm": 18.55092856854972, "learning_rate": 9.40325215721664e-06, "loss": 1.0152, "step": 4260 }, { "epoch": 0.73, "grad_norm": 10.525060393654432, "learning_rate": 9.400906980961106e-06, "loss": 1.004, "step": 4265 }, { "epoch": 0.73, "grad_norm": 9.212100273291716, "learning_rate": 9.398557499179573e-06, "loss": 0.9973, "step": 4270 }, { "epoch": 0.73, "grad_norm": 22.460965399961488, "learning_rate": 9.396203714170595e-06, "loss": 0.996, "step": 4275 }, { "epoch": 0.73, "grad_norm": 14.39405900296308, "learning_rate": 9.393845628236949e-06, "loss": 1.007, "step": 4280 }, { "epoch": 0.73, "grad_norm": 51.01601071441724, "learning_rate": 9.391483243685612e-06, "loss": 0.9842, "step": 4285 }, { "epoch": 0.73, "grad_norm": 56.88915495340164, "learning_rate": 9.38911656282777e-06, "loss": 1.0078, "step": 4290 }, { "epoch": 0.73, "grad_norm": 65.33444023898811, "learning_rate": 9.386745587978809e-06, "loss": 0.9993, "step": 4295 }, { "epoch": 0.73, "grad_norm": 35.49556367595794, "learning_rate": 9.384370321458318e-06, "loss": 0.986, "step": 4300 }, { "epoch": 0.73, "grad_norm": 33.949914291057055, "learning_rate": 9.381990765590086e-06, "loss": 0.9923, "step": 4305 }, { "epoch": 0.73, "grad_norm": 29.173102258533216, "learning_rate": 9.379606922702092e-06, "loss": 0.9875, "step": 4310 }, { "epoch": 0.73, "grad_norm": 8.288434941882375, "learning_rate": 9.377218795126519e-06, "loss": 1.0079, "step": 4315 }, { "epoch": 0.73, "grad_norm": 19.693191672912995, "learning_rate": 9.374826385199735e-06, "loss": 0.9909, "step": 4320 }, { "epoch": 0.74, "grad_norm": 15.090041449361541, "learning_rate": 9.372429695262297e-06, "loss": 0.9855, "step": 4325 }, { "epoch": 0.74, "grad_norm": 10.317039626693504, "learning_rate": 9.370028727658956e-06, "loss": 0.9726, "step": 4330 }, { "epoch": 0.74, "grad_norm": 8.737797784798412, "learning_rate": 9.367623484738639e-06, "loss": 0.9939, "step": 4335 }, { "epoch": 0.74, "grad_norm": 22.78204210712204, "learning_rate": 9.365213968854463e-06, "loss": 0.9837, "step": 4340 }, { "epoch": 0.74, "grad_norm": 72.390320020571, "learning_rate": 9.362800182363718e-06, "loss": 0.9886, "step": 4345 }, { "epoch": 0.74, "grad_norm": 57.5047526181269, "learning_rate": 9.36038212762788e-06, "loss": 1.0029, "step": 4350 }, { "epoch": 0.74, "grad_norm": 101.10985440634894, "learning_rate": 9.3579598070126e-06, "loss": 0.9989, "step": 4355 }, { "epoch": 0.74, "grad_norm": 72.40360809441891, "learning_rate": 9.355533222887693e-06, "loss": 1.0273, "step": 4360 }, { "epoch": 0.74, "grad_norm": 23.804178434830444, "learning_rate": 9.353102377627155e-06, "loss": 0.9941, "step": 4365 }, { "epoch": 0.74, "grad_norm": 96.09502523304418, "learning_rate": 9.350667273609148e-06, "loss": 1.0098, "step": 4370 }, { "epoch": 0.74, "grad_norm": 75.77778989570355, "learning_rate": 9.348227913216e-06, "loss": 0.9983, "step": 4375 }, { "epoch": 0.74, "grad_norm": 18.44568445884761, "learning_rate": 9.345784298834202e-06, "loss": 0.9828, "step": 4380 }, { "epoch": 0.75, "grad_norm": 41.25631175653349, "learning_rate": 9.343336432854408e-06, "loss": 0.9831, "step": 4385 }, { "epoch": 0.75, "grad_norm": 35.698092682203594, "learning_rate": 9.340884317671432e-06, "loss": 0.998, "step": 4390 }, { "epoch": 0.75, "grad_norm": 32.49391162293078, "learning_rate": 9.338427955684243e-06, "loss": 0.9806, "step": 4395 }, { "epoch": 0.75, "grad_norm": 10.241743309089527, "learning_rate": 9.335967349295967e-06, "loss": 0.9885, "step": 4400 }, { "epoch": 0.75, "grad_norm": 32.37182387434173, "learning_rate": 9.333502500913882e-06, "loss": 0.9873, "step": 4405 }, { "epoch": 0.75, "grad_norm": 30.142157717062506, "learning_rate": 9.331033412949417e-06, "loss": 1.0035, "step": 4410 }, { "epoch": 0.75, "grad_norm": 12.9808930058223, "learning_rate": 9.328560087818143e-06, "loss": 0.9826, "step": 4415 }, { "epoch": 0.75, "grad_norm": 10.087635510049255, "learning_rate": 9.326082527939786e-06, "loss": 0.9674, "step": 4420 }, { "epoch": 0.75, "grad_norm": 13.901907099337802, "learning_rate": 9.323600735738207e-06, "loss": 0.9783, "step": 4425 }, { "epoch": 0.75, "grad_norm": 22.503501555175596, "learning_rate": 9.321114713641409e-06, "loss": 0.9824, "step": 4430 }, { "epoch": 0.75, "grad_norm": 15.802700822262079, "learning_rate": 9.318624464081535e-06, "loss": 0.9759, "step": 4435 }, { "epoch": 0.75, "grad_norm": 12.704909570321815, "learning_rate": 9.316129989494866e-06, "loss": 0.9643, "step": 4440 }, { "epoch": 0.76, "grad_norm": 9.51245114231154, "learning_rate": 9.313631292321812e-06, "loss": 0.9793, "step": 4445 }, { "epoch": 0.76, "grad_norm": 9.31093610700748, "learning_rate": 9.311128375006915e-06, "loss": 1.0008, "step": 4450 }, { "epoch": 0.76, "grad_norm": 13.48618925404952, "learning_rate": 9.308621239998847e-06, "loss": 0.9894, "step": 4455 }, { "epoch": 0.76, "grad_norm": 10.277491750024144, "learning_rate": 9.306109889750405e-06, "loss": 0.987, "step": 4460 }, { "epoch": 0.76, "grad_norm": 13.651178877768626, "learning_rate": 9.303594326718514e-06, "loss": 0.9691, "step": 4465 }, { "epoch": 0.76, "grad_norm": 22.96171498854302, "learning_rate": 9.301074553364214e-06, "loss": 0.9568, "step": 4470 }, { "epoch": 0.76, "grad_norm": 21.281745464081407, "learning_rate": 9.29855057215267e-06, "loss": 0.9785, "step": 4475 }, { "epoch": 0.76, "grad_norm": 45.786548050160974, "learning_rate": 9.296022385553156e-06, "loss": 0.9658, "step": 4480 }, { "epoch": 0.76, "grad_norm": 28.449037927202944, "learning_rate": 9.293489996039068e-06, "loss": 0.9571, "step": 4485 }, { "epoch": 0.76, "grad_norm": 25.5063776566906, "learning_rate": 9.290953406087913e-06, "loss": 0.9924, "step": 4490 }, { "epoch": 0.76, "grad_norm": 23.902215031162577, "learning_rate": 9.288412618181305e-06, "loss": 0.9644, "step": 4495 }, { "epoch": 0.77, "grad_norm": 28.321553047312477, "learning_rate": 9.285867634804961e-06, "loss": 0.9682, "step": 4500 }, { "epoch": 0.77, "grad_norm": 19.414695666211387, "learning_rate": 9.283318458448711e-06, "loss": 0.9748, "step": 4505 }, { "epoch": 0.77, "grad_norm": 8.265335693282779, "learning_rate": 9.280765091606481e-06, "loss": 0.9667, "step": 4510 }, { "epoch": 0.77, "grad_norm": 10.589118747517364, "learning_rate": 9.2782075367763e-06, "loss": 0.9871, "step": 4515 }, { "epoch": 0.77, "grad_norm": 17.766284702178446, "learning_rate": 9.275645796460292e-06, "loss": 0.9696, "step": 4520 }, { "epoch": 0.77, "grad_norm": 22.62383414229558, "learning_rate": 9.273079873164676e-06, "loss": 0.972, "step": 4525 }, { "epoch": 0.77, "grad_norm": 22.746944385026122, "learning_rate": 9.270509769399767e-06, "loss": 0.9561, "step": 4530 }, { "epoch": 0.77, "grad_norm": 13.635254266812355, "learning_rate": 9.267935487679962e-06, "loss": 0.9686, "step": 4535 }, { "epoch": 0.77, "grad_norm": 26.008001828316356, "learning_rate": 9.265357030523756e-06, "loss": 0.9966, "step": 4540 }, { "epoch": 0.77, "grad_norm": 19.690837446359836, "learning_rate": 9.262774400453717e-06, "loss": 0.9762, "step": 4545 }, { "epoch": 0.77, "grad_norm": 9.165326461540976, "learning_rate": 9.260187599996507e-06, "loss": 0.9907, "step": 4550 }, { "epoch": 0.77, "grad_norm": 16.63560083766881, "learning_rate": 9.25759663168286e-06, "loss": 0.9769, "step": 4555 }, { "epoch": 0.78, "grad_norm": 29.926072811215636, "learning_rate": 9.255001498047592e-06, "loss": 0.9694, "step": 4560 }, { "epoch": 0.78, "grad_norm": 43.916438852666026, "learning_rate": 9.252402201629588e-06, "loss": 0.9616, "step": 4565 }, { "epoch": 0.78, "grad_norm": 36.62552865460111, "learning_rate": 9.249798744971815e-06, "loss": 0.9674, "step": 4570 }, { "epoch": 0.78, "grad_norm": 44.70586590779445, "learning_rate": 9.2471911306213e-06, "loss": 0.9651, "step": 4575 }, { "epoch": 0.78, "grad_norm": 24.188160765283822, "learning_rate": 9.244579361129147e-06, "loss": 0.9441, "step": 4580 }, { "epoch": 0.78, "grad_norm": 16.006816269337822, "learning_rate": 9.241963439050519e-06, "loss": 0.9683, "step": 4585 }, { "epoch": 0.78, "grad_norm": 10.293519001103515, "learning_rate": 9.239343366944641e-06, "loss": 0.967, "step": 4590 }, { "epoch": 0.78, "grad_norm": 11.409536051527816, "learning_rate": 9.236719147374801e-06, "loss": 0.9567, "step": 4595 }, { "epoch": 0.78, "grad_norm": 11.825285059132497, "learning_rate": 9.234090782908346e-06, "loss": 0.9572, "step": 4600 }, { "epoch": 0.78, "grad_norm": 13.057182699834947, "learning_rate": 9.231458276116676e-06, "loss": 0.9747, "step": 4605 }, { "epoch": 0.78, "grad_norm": 16.464766579821713, "learning_rate": 9.22882162957524e-06, "loss": 0.949, "step": 4610 }, { "epoch": 0.78, "grad_norm": 21.69207643557753, "learning_rate": 9.226180845863544e-06, "loss": 0.9741, "step": 4615 }, { "epoch": 0.79, "grad_norm": 38.55352202928315, "learning_rate": 9.223535927565135e-06, "loss": 0.9529, "step": 4620 }, { "epoch": 0.79, "grad_norm": 34.96091841468431, "learning_rate": 9.220886877267609e-06, "loss": 0.9746, "step": 4625 }, { "epoch": 0.79, "grad_norm": 20.01930871822319, "learning_rate": 9.218233697562604e-06, "loss": 0.9626, "step": 4630 }, { "epoch": 0.79, "grad_norm": 11.324063907979856, "learning_rate": 9.215576391045797e-06, "loss": 0.9602, "step": 4635 }, { "epoch": 0.79, "grad_norm": 9.150997568199399, "learning_rate": 9.212914960316902e-06, "loss": 0.9481, "step": 4640 }, { "epoch": 0.79, "grad_norm": 41.68771819761749, "learning_rate": 9.21024940797967e-06, "loss": 0.9771, "step": 4645 }, { "epoch": 0.79, "grad_norm": 73.61863057444658, "learning_rate": 9.207579736641881e-06, "loss": 0.9795, "step": 4650 }, { "epoch": 0.79, "grad_norm": 38.792496873295825, "learning_rate": 9.204905948915345e-06, "loss": 0.9535, "step": 4655 }, { "epoch": 0.79, "grad_norm": 51.91198314996973, "learning_rate": 9.202228047415905e-06, "loss": 0.9454, "step": 4660 }, { "epoch": 0.79, "grad_norm": 66.68820192554747, "learning_rate": 9.199546034763423e-06, "loss": 0.9643, "step": 4665 }, { "epoch": 0.79, "grad_norm": 115.50748156350188, "learning_rate": 9.196859913581781e-06, "loss": 0.978, "step": 4670 }, { "epoch": 0.79, "grad_norm": 50.953571534266246, "learning_rate": 9.194169686498887e-06, "loss": 0.9425, "step": 4675 }, { "epoch": 0.8, "grad_norm": 49.81249494752531, "learning_rate": 9.191475356146661e-06, "loss": 0.9647, "step": 4680 }, { "epoch": 0.8, "grad_norm": 9.125148068708931, "learning_rate": 9.188776925161042e-06, "loss": 0.9427, "step": 4685 }, { "epoch": 0.8, "grad_norm": 35.472294700071664, "learning_rate": 9.186074396181974e-06, "loss": 0.9474, "step": 4690 }, { "epoch": 0.8, "grad_norm": 31.34096469566438, "learning_rate": 9.183367771853417e-06, "loss": 0.9613, "step": 4695 }, { "epoch": 0.8, "grad_norm": 29.51902471391319, "learning_rate": 9.180657054823334e-06, "loss": 0.9462, "step": 4700 }, { "epoch": 0.8, "grad_norm": 15.096319871398451, "learning_rate": 9.17794224774369e-06, "loss": 0.956, "step": 4705 }, { "epoch": 0.8, "grad_norm": 9.97502452642927, "learning_rate": 9.175223353270457e-06, "loss": 0.9323, "step": 4710 }, { "epoch": 0.8, "grad_norm": 11.373732134631773, "learning_rate": 9.172500374063603e-06, "loss": 0.9411, "step": 4715 }, { "epoch": 0.8, "grad_norm": 8.572979968003773, "learning_rate": 9.169773312787086e-06, "loss": 0.9431, "step": 4720 }, { "epoch": 0.8, "grad_norm": 17.165158198545925, "learning_rate": 9.167042172108874e-06, "loss": 0.9598, "step": 4725 }, { "epoch": 0.8, "grad_norm": 9.683924164964983, "learning_rate": 9.164306954700905e-06, "loss": 0.9486, "step": 4730 }, { "epoch": 0.8, "grad_norm": 15.224964756040686, "learning_rate": 9.161567663239126e-06, "loss": 0.9391, "step": 4735 }, { "epoch": 0.81, "grad_norm": 10.376098770544099, "learning_rate": 9.15882430040345e-06, "loss": 0.963, "step": 4740 }, { "epoch": 0.81, "grad_norm": 10.908609034274736, "learning_rate": 9.15607686887779e-06, "loss": 0.9311, "step": 4745 }, { "epoch": 0.81, "grad_norm": 12.63840384715431, "learning_rate": 9.153325371350028e-06, "loss": 0.9697, "step": 4750 }, { "epoch": 0.81, "grad_norm": 22.946614579574426, "learning_rate": 9.150569810512033e-06, "loss": 0.96, "step": 4755 }, { "epoch": 0.81, "grad_norm": 13.167063738591903, "learning_rate": 9.147810189059639e-06, "loss": 0.9479, "step": 4760 }, { "epoch": 0.81, "grad_norm": 37.92914695340067, "learning_rate": 9.145046509692661e-06, "loss": 0.9423, "step": 4765 }, { "epoch": 0.81, "grad_norm": 24.365889248555195, "learning_rate": 9.142278775114882e-06, "loss": 0.9418, "step": 4770 }, { "epoch": 0.81, "grad_norm": 67.68436185583097, "learning_rate": 9.139506988034049e-06, "loss": 0.9537, "step": 4775 }, { "epoch": 0.81, "grad_norm": 39.491890864272584, "learning_rate": 9.136731151161877e-06, "loss": 0.9591, "step": 4780 }, { "epoch": 0.81, "grad_norm": 45.38530370615395, "learning_rate": 9.133951267214043e-06, "loss": 0.9355, "step": 4785 }, { "epoch": 0.81, "grad_norm": 14.378791634160802, "learning_rate": 9.13116733891018e-06, "loss": 0.9579, "step": 4790 }, { "epoch": 0.82, "grad_norm": 45.370670351833056, "learning_rate": 9.128379368973884e-06, "loss": 0.9408, "step": 4795 }, { "epoch": 0.82, "grad_norm": 22.49270893151392, "learning_rate": 9.125587360132697e-06, "loss": 0.9281, "step": 4800 }, { "epoch": 0.82, "grad_norm": 43.64429040063124, "learning_rate": 9.12279131511812e-06, "loss": 0.9346, "step": 4805 }, { "epoch": 0.82, "grad_norm": 9.215164955234982, "learning_rate": 9.1199912366656e-06, "loss": 0.9424, "step": 4810 }, { "epoch": 0.82, "grad_norm": 11.742762494067678, "learning_rate": 9.117187127514524e-06, "loss": 0.9456, "step": 4815 }, { "epoch": 0.82, "grad_norm": 7.727143118510911, "learning_rate": 9.11437899040823e-06, "loss": 0.9465, "step": 4820 }, { "epoch": 0.82, "grad_norm": 10.832153090627127, "learning_rate": 9.111566828093998e-06, "loss": 0.9178, "step": 4825 }, { "epoch": 0.82, "grad_norm": 9.630265380456894, "learning_rate": 9.108750643323036e-06, "loss": 0.9504, "step": 4830 }, { "epoch": 0.82, "grad_norm": 19.444717713770846, "learning_rate": 9.1059304388505e-06, "loss": 0.9245, "step": 4835 }, { "epoch": 0.82, "grad_norm": 24.329705280184676, "learning_rate": 9.103106217435467e-06, "loss": 0.9341, "step": 4840 }, { "epoch": 0.82, "grad_norm": 27.83847152420756, "learning_rate": 9.100277981840953e-06, "loss": 0.9589, "step": 4845 }, { "epoch": 0.82, "grad_norm": 11.50842107404206, "learning_rate": 9.097445734833893e-06, "loss": 0.9566, "step": 4850 }, { "epoch": 0.83, "grad_norm": 43.06023501265569, "learning_rate": 9.094609479185153e-06, "loss": 0.9532, "step": 4855 }, { "epoch": 0.83, "grad_norm": 50.720088892736904, "learning_rate": 9.091769217669517e-06, "loss": 0.9569, "step": 4860 }, { "epoch": 0.83, "grad_norm": 62.24897567865879, "learning_rate": 9.088924953065691e-06, "loss": 0.9266, "step": 4865 }, { "epoch": 0.83, "grad_norm": 57.80399144018989, "learning_rate": 9.086076688156297e-06, "loss": 0.9427, "step": 4870 }, { "epoch": 0.83, "grad_norm": 8.46459487344967, "learning_rate": 9.083224425727867e-06, "loss": 0.9351, "step": 4875 }, { "epoch": 0.83, "grad_norm": 16.698678860667545, "learning_rate": 9.080368168570845e-06, "loss": 0.9469, "step": 4880 }, { "epoch": 0.83, "grad_norm": 9.687820627493341, "learning_rate": 9.077507919479589e-06, "loss": 0.9212, "step": 4885 }, { "epoch": 0.83, "grad_norm": 45.913078252317284, "learning_rate": 9.074643681252356e-06, "loss": 0.9336, "step": 4890 }, { "epoch": 0.83, "grad_norm": 32.80796660229274, "learning_rate": 9.071775456691303e-06, "loss": 0.9298, "step": 4895 }, { "epoch": 0.83, "grad_norm": 10.983476920809734, "learning_rate": 9.068903248602497e-06, "loss": 0.9457, "step": 4900 }, { "epoch": 0.83, "grad_norm": 10.211639785267453, "learning_rate": 9.066027059795896e-06, "loss": 0.9482, "step": 4905 }, { "epoch": 0.83, "grad_norm": 10.51243616241861, "learning_rate": 9.06314689308535e-06, "loss": 0.934, "step": 4910 }, { "epoch": 0.84, "grad_norm": 18.320268531763826, "learning_rate": 9.060262751288607e-06, "loss": 0.9434, "step": 4915 }, { "epoch": 0.84, "grad_norm": 11.275224107377596, "learning_rate": 9.057374637227299e-06, "loss": 0.9162, "step": 4920 }, { "epoch": 0.84, "grad_norm": 9.384577769247377, "learning_rate": 9.054482553726946e-06, "loss": 0.9366, "step": 4925 }, { "epoch": 0.84, "grad_norm": 7.895425873893831, "learning_rate": 9.051586503616952e-06, "loss": 0.9241, "step": 4930 }, { "epoch": 0.84, "grad_norm": 14.604982979399926, "learning_rate": 9.0486864897306e-06, "loss": 0.9256, "step": 4935 }, { "epoch": 0.84, "grad_norm": 22.10558216735927, "learning_rate": 9.045782514905052e-06, "loss": 0.9175, "step": 4940 }, { "epoch": 0.84, "grad_norm": 21.66537436134228, "learning_rate": 9.042874581981347e-06, "loss": 0.9241, "step": 4945 }, { "epoch": 0.84, "grad_norm": 28.46117267926381, "learning_rate": 9.03996269380439e-06, "loss": 0.9401, "step": 4950 }, { "epoch": 0.84, "grad_norm": 42.00707684144836, "learning_rate": 9.037046853222963e-06, "loss": 0.932, "step": 4955 }, { "epoch": 0.84, "grad_norm": 64.63923023317382, "learning_rate": 9.034127063089712e-06, "loss": 0.9438, "step": 4960 }, { "epoch": 0.84, "grad_norm": 78.80763586939075, "learning_rate": 9.031203326261144e-06, "loss": 0.9452, "step": 4965 }, { "epoch": 0.84, "grad_norm": 75.57620927164031, "learning_rate": 9.028275645597631e-06, "loss": 0.9298, "step": 4970 }, { "epoch": 0.85, "grad_norm": 8.658113859599196, "learning_rate": 9.0253440239634e-06, "loss": 0.923, "step": 4975 }, { "epoch": 0.85, "grad_norm": 48.495850283320955, "learning_rate": 9.022408464226541e-06, "loss": 0.9365, "step": 4980 }, { "epoch": 0.85, "grad_norm": 37.806366636999435, "learning_rate": 9.019468969258985e-06, "loss": 0.9413, "step": 4985 }, { "epoch": 0.85, "grad_norm": 57.21603624921526, "learning_rate": 9.01652554193652e-06, "loss": 0.9333, "step": 4990 }, { "epoch": 0.85, "grad_norm": 40.6670860223311, "learning_rate": 9.013578185138784e-06, "loss": 0.9312, "step": 4995 }, { "epoch": 0.85, "grad_norm": 14.668175890753217, "learning_rate": 9.010626901749254e-06, "loss": 0.9295, "step": 5000 }, { "epoch": 0.85, "grad_norm": 40.73927855960309, "learning_rate": 9.00767169465525e-06, "loss": 0.9169, "step": 5005 }, { "epoch": 0.85, "grad_norm": 12.109205560365105, "learning_rate": 9.004712566747929e-06, "loss": 0.916, "step": 5010 }, { "epoch": 0.85, "grad_norm": 7.781500255230441, "learning_rate": 9.001749520922289e-06, "loss": 0.9198, "step": 5015 }, { "epoch": 0.85, "grad_norm": 25.48428047305956, "learning_rate": 8.998782560077155e-06, "loss": 0.9068, "step": 5020 }, { "epoch": 0.85, "grad_norm": 32.98125695456632, "learning_rate": 8.995811687115186e-06, "loss": 0.932, "step": 5025 }, { "epoch": 0.86, "grad_norm": 14.97283215980044, "learning_rate": 8.992836904942865e-06, "loss": 0.9379, "step": 5030 }, { "epoch": 0.86, "grad_norm": 9.253049260277942, "learning_rate": 8.989858216470507e-06, "loss": 0.918, "step": 5035 }, { "epoch": 0.86, "grad_norm": 12.544214252920154, "learning_rate": 8.986875624612236e-06, "loss": 0.913, "step": 5040 }, { "epoch": 0.86, "grad_norm": 22.04564956071582, "learning_rate": 8.98388913228601e-06, "loss": 0.9178, "step": 5045 }, { "epoch": 0.86, "grad_norm": 17.683536399577275, "learning_rate": 8.980898742413587e-06, "loss": 0.9304, "step": 5050 }, { "epoch": 0.86, "grad_norm": 14.161467334972684, "learning_rate": 8.977904457920552e-06, "loss": 0.9145, "step": 5055 }, { "epoch": 0.86, "grad_norm": 8.577177096548642, "learning_rate": 8.974906281736291e-06, "loss": 0.914, "step": 5060 }, { "epoch": 0.86, "grad_norm": 13.327368511504124, "learning_rate": 8.971904216794002e-06, "loss": 0.9043, "step": 5065 }, { "epoch": 0.86, "grad_norm": 21.406488269474284, "learning_rate": 8.968898266030688e-06, "loss": 0.9202, "step": 5070 }, { "epoch": 0.86, "grad_norm": 31.269621520553603, "learning_rate": 8.965888432387147e-06, "loss": 0.9151, "step": 5075 }, { "epoch": 0.86, "grad_norm": 18.802503941578262, "learning_rate": 8.962874718807984e-06, "loss": 0.9097, "step": 5080 }, { "epoch": 0.86, "grad_norm": 18.848981198337093, "learning_rate": 8.959857128241596e-06, "loss": 0.9025, "step": 5085 }, { "epoch": 0.87, "grad_norm": 12.072241314901154, "learning_rate": 8.956835663640173e-06, "loss": 0.9138, "step": 5090 }, { "epoch": 0.87, "grad_norm": 9.332658464526737, "learning_rate": 8.953810327959693e-06, "loss": 0.9048, "step": 5095 }, { "epoch": 0.87, "grad_norm": 21.132008030484357, "learning_rate": 8.950781124159926e-06, "loss": 0.9178, "step": 5100 }, { "epoch": 0.87, "grad_norm": 39.50889372852151, "learning_rate": 8.947748055204424e-06, "loss": 0.9219, "step": 5105 }, { "epoch": 0.87, "grad_norm": 21.346258805968034, "learning_rate": 8.944711124060519e-06, "loss": 0.9192, "step": 5110 }, { "epoch": 0.87, "grad_norm": 8.112949405602714, "learning_rate": 8.941670333699323e-06, "loss": 0.9018, "step": 5115 }, { "epoch": 0.87, "grad_norm": 35.026352865269004, "learning_rate": 8.938625687095723e-06, "loss": 0.9051, "step": 5120 }, { "epoch": 0.87, "grad_norm": 21.948641624705495, "learning_rate": 8.93557718722838e-06, "loss": 0.9302, "step": 5125 }, { "epoch": 0.87, "grad_norm": 13.460413331641375, "learning_rate": 8.932524837079721e-06, "loss": 0.9094, "step": 5130 }, { "epoch": 0.87, "grad_norm": 33.16554501484199, "learning_rate": 8.929468639635946e-06, "loss": 0.9112, "step": 5135 }, { "epoch": 0.87, "grad_norm": 55.507150253651496, "learning_rate": 8.926408597887013e-06, "loss": 0.9076, "step": 5140 }, { "epoch": 0.87, "grad_norm": 25.672990324546266, "learning_rate": 8.923344714826646e-06, "loss": 0.9157, "step": 5145 }, { "epoch": 0.88, "grad_norm": 12.314062818429154, "learning_rate": 8.920276993452319e-06, "loss": 0.9138, "step": 5150 }, { "epoch": 0.88, "grad_norm": 9.4944502059902, "learning_rate": 8.917205436765272e-06, "loss": 0.9034, "step": 5155 }, { "epoch": 0.88, "grad_norm": 10.607776879666043, "learning_rate": 8.914130047770488e-06, "loss": 0.9163, "step": 5160 }, { "epoch": 0.88, "grad_norm": 15.664686781195138, "learning_rate": 8.911050829476707e-06, "loss": 0.9142, "step": 5165 }, { "epoch": 0.88, "grad_norm": 13.415556266796443, "learning_rate": 8.90796778489641e-06, "loss": 0.9195, "step": 5170 }, { "epoch": 0.88, "grad_norm": 20.298844004102687, "learning_rate": 8.90488091704582e-06, "loss": 0.9105, "step": 5175 }, { "epoch": 0.88, "grad_norm": 9.737019741824055, "learning_rate": 8.901790228944904e-06, "loss": 0.9048, "step": 5180 }, { "epoch": 0.88, "grad_norm": 13.18933269117288, "learning_rate": 8.898695723617368e-06, "loss": 0.9123, "step": 5185 }, { "epoch": 0.88, "grad_norm": 27.076492095174448, "learning_rate": 8.895597404090647e-06, "loss": 0.9173, "step": 5190 }, { "epoch": 0.88, "grad_norm": 17.017497657176467, "learning_rate": 8.892495273395913e-06, "loss": 0.9235, "step": 5195 }, { "epoch": 0.88, "grad_norm": 13.241184424854241, "learning_rate": 8.889389334568061e-06, "loss": 0.8984, "step": 5200 }, { "epoch": 0.88, "grad_norm": 14.896688029270276, "learning_rate": 8.88627959064572e-06, "loss": 0.8982, "step": 5205 }, { "epoch": 0.89, "grad_norm": 10.933863082457536, "learning_rate": 8.883166044671232e-06, "loss": 0.9027, "step": 5210 }, { "epoch": 0.89, "grad_norm": 21.301815661889073, "learning_rate": 8.880048699690664e-06, "loss": 0.8975, "step": 5215 }, { "epoch": 0.89, "grad_norm": 17.79520585776361, "learning_rate": 8.876927558753798e-06, "loss": 0.9131, "step": 5220 }, { "epoch": 0.89, "grad_norm": 69.54083736297005, "learning_rate": 8.873802624914132e-06, "loss": 0.8882, "step": 5225 }, { "epoch": 0.89, "grad_norm": 29.51640291474208, "learning_rate": 8.870673901228874e-06, "loss": 0.8986, "step": 5230 }, { "epoch": 0.89, "grad_norm": 48.5416443853786, "learning_rate": 8.867541390758935e-06, "loss": 0.8993, "step": 5235 }, { "epoch": 0.89, "grad_norm": 45.16022542591326, "learning_rate": 8.864405096568937e-06, "loss": 0.8991, "step": 5240 }, { "epoch": 0.89, "grad_norm": 48.08298242632971, "learning_rate": 8.861265021727202e-06, "loss": 0.8939, "step": 5245 }, { "epoch": 0.89, "grad_norm": 14.156779794459752, "learning_rate": 8.858121169305747e-06, "loss": 0.9005, "step": 5250 }, { "epoch": 0.89, "grad_norm": 16.63891538193591, "learning_rate": 8.854973542380289e-06, "loss": 0.907, "step": 5255 }, { "epoch": 0.89, "grad_norm": 48.7776713914096, "learning_rate": 8.851822144030237e-06, "loss": 0.9039, "step": 5260 }, { "epoch": 0.9, "grad_norm": 55.13232465033205, "learning_rate": 8.848666977338689e-06, "loss": 0.8962, "step": 5265 }, { "epoch": 0.9, "grad_norm": 29.5918085171386, "learning_rate": 8.84550804539243e-06, "loss": 0.9048, "step": 5270 }, { "epoch": 0.9, "grad_norm": 25.783491763675134, "learning_rate": 8.842345351281927e-06, "loss": 0.9006, "step": 5275 }, { "epoch": 0.9, "grad_norm": 38.42385751738853, "learning_rate": 8.839178898101327e-06, "loss": 0.9112, "step": 5280 }, { "epoch": 0.9, "grad_norm": 17.363290467768564, "learning_rate": 8.836008688948463e-06, "loss": 0.9023, "step": 5285 }, { "epoch": 0.9, "grad_norm": 11.318139570581586, "learning_rate": 8.832834726924832e-06, "loss": 0.9123, "step": 5290 }, { "epoch": 0.9, "grad_norm": 10.274091473033106, "learning_rate": 8.829657015135605e-06, "loss": 0.8881, "step": 5295 }, { "epoch": 0.9, "grad_norm": 9.354741961314343, "learning_rate": 8.826475556689627e-06, "loss": 0.8992, "step": 5300 }, { "epoch": 0.9, "grad_norm": 22.730814539741452, "learning_rate": 8.8232903546994e-06, "loss": 0.8945, "step": 5305 }, { "epoch": 0.9, "grad_norm": 11.52278714976617, "learning_rate": 8.820101412281098e-06, "loss": 0.9051, "step": 5310 }, { "epoch": 0.9, "grad_norm": 21.82268336044992, "learning_rate": 8.816908732554546e-06, "loss": 0.8859, "step": 5315 }, { "epoch": 0.9, "grad_norm": 10.685162774578716, "learning_rate": 8.81371231864323e-06, "loss": 0.896, "step": 5320 }, { "epoch": 0.91, "grad_norm": 18.962361513069325, "learning_rate": 8.810512173674288e-06, "loss": 0.8869, "step": 5325 }, { "epoch": 0.91, "grad_norm": 17.735749882533618, "learning_rate": 8.807308300778508e-06, "loss": 0.9119, "step": 5330 }, { "epoch": 0.91, "grad_norm": 32.718411258492466, "learning_rate": 8.804100703090324e-06, "loss": 0.8902, "step": 5335 }, { "epoch": 0.91, "grad_norm": 17.748223147493437, "learning_rate": 8.800889383747817e-06, "loss": 0.8814, "step": 5340 }, { "epoch": 0.91, "grad_norm": 7.8215112357413545, "learning_rate": 8.797674345892707e-06, "loss": 0.8917, "step": 5345 }, { "epoch": 0.91, "grad_norm": 9.491577453092495, "learning_rate": 8.794455592670353e-06, "loss": 0.8965, "step": 5350 }, { "epoch": 0.91, "grad_norm": 30.099277973463792, "learning_rate": 8.791233127229746e-06, "loss": 0.8815, "step": 5355 }, { "epoch": 0.91, "grad_norm": 21.471034023934507, "learning_rate": 8.788006952723512e-06, "loss": 0.9014, "step": 5360 }, { "epoch": 0.91, "grad_norm": 21.232711801308177, "learning_rate": 8.784777072307904e-06, "loss": 0.892, "step": 5365 }, { "epoch": 0.91, "grad_norm": 7.428510179914167, "learning_rate": 8.781543489142802e-06, "loss": 0.872, "step": 5370 }, { "epoch": 0.91, "grad_norm": 11.45373486754463, "learning_rate": 8.778306206391707e-06, "loss": 0.8938, "step": 5375 }, { "epoch": 0.91, "grad_norm": 13.15825587665837, "learning_rate": 8.775065227221742e-06, "loss": 0.8897, "step": 5380 }, { "epoch": 0.92, "grad_norm": 20.21159615999797, "learning_rate": 8.77182055480364e-06, "loss": 0.8693, "step": 5385 }, { "epoch": 0.92, "grad_norm": 20.704802237923644, "learning_rate": 8.768572192311757e-06, "loss": 0.9013, "step": 5390 }, { "epoch": 0.92, "grad_norm": 19.60393581725405, "learning_rate": 8.76532014292405e-06, "loss": 0.8802, "step": 5395 }, { "epoch": 0.92, "grad_norm": 32.697742283265754, "learning_rate": 8.762064409822086e-06, "loss": 0.9065, "step": 5400 }, { "epoch": 0.92, "grad_norm": 34.84197654856875, "learning_rate": 8.758804996191039e-06, "loss": 0.8728, "step": 5405 }, { "epoch": 0.92, "grad_norm": 14.215924467751915, "learning_rate": 8.75554190521968e-06, "loss": 0.873, "step": 5410 }, { "epoch": 0.92, "grad_norm": 12.416431470943925, "learning_rate": 8.752275140100379e-06, "loss": 0.8839, "step": 5415 }, { "epoch": 0.92, "grad_norm": 11.803589736885627, "learning_rate": 8.749004704029101e-06, "loss": 0.8829, "step": 5420 }, { "epoch": 0.92, "grad_norm": 10.893782852593526, "learning_rate": 8.745730600205402e-06, "loss": 0.8834, "step": 5425 }, { "epoch": 0.92, "grad_norm": 13.70609303531889, "learning_rate": 8.742452831832424e-06, "loss": 0.8986, "step": 5430 }, { "epoch": 0.92, "grad_norm": 23.16719204178583, "learning_rate": 8.7391714021169e-06, "loss": 0.9119, "step": 5435 }, { "epoch": 0.92, "grad_norm": 8.350841301575068, "learning_rate": 8.735886314269136e-06, "loss": 0.8937, "step": 5440 }, { "epoch": 0.93, "grad_norm": 10.644162093171502, "learning_rate": 8.732597571503028e-06, "loss": 0.8847, "step": 5445 }, { "epoch": 0.93, "grad_norm": 26.775136944455053, "learning_rate": 8.729305177036035e-06, "loss": 0.8868, "step": 5450 }, { "epoch": 0.93, "grad_norm": 16.916084579118998, "learning_rate": 8.726009134089202e-06, "loss": 0.8925, "step": 5455 }, { "epoch": 0.93, "grad_norm": 25.97610450325379, "learning_rate": 8.722709445887132e-06, "loss": 0.888, "step": 5460 }, { "epoch": 0.93, "grad_norm": 16.674169978898313, "learning_rate": 8.719406115658002e-06, "loss": 0.8882, "step": 5465 }, { "epoch": 0.93, "grad_norm": 9.1421190490301, "learning_rate": 8.716099146633548e-06, "loss": 0.8817, "step": 5470 }, { "epoch": 0.93, "grad_norm": 12.709501995828935, "learning_rate": 8.712788542049066e-06, "loss": 0.8745, "step": 5475 }, { "epoch": 0.93, "grad_norm": 11.302338450780343, "learning_rate": 8.70947430514341e-06, "loss": 0.8794, "step": 5480 }, { "epoch": 0.93, "grad_norm": 25.359202453596357, "learning_rate": 8.706156439158988e-06, "loss": 0.8961, "step": 5485 }, { "epoch": 0.93, "grad_norm": 14.57404538528837, "learning_rate": 8.702834947341759e-06, "loss": 0.8548, "step": 5490 }, { "epoch": 0.93, "grad_norm": 8.621561636096867, "learning_rate": 8.699509832941224e-06, "loss": 0.8827, "step": 5495 }, { "epoch": 0.94, "grad_norm": 10.556414130907989, "learning_rate": 8.696181099210436e-06, "loss": 0.899, "step": 5500 }, { "epoch": 0.94, "grad_norm": 20.965519473228266, "learning_rate": 8.692848749405985e-06, "loss": 0.8852, "step": 5505 }, { "epoch": 0.94, "grad_norm": 7.702078440520662, "learning_rate": 8.689512786787996e-06, "loss": 0.8554, "step": 5510 }, { "epoch": 0.94, "grad_norm": 15.307984622928407, "learning_rate": 8.686173214620134e-06, "loss": 0.8872, "step": 5515 }, { "epoch": 0.94, "grad_norm": 36.1312012987165, "learning_rate": 8.68283003616959e-06, "loss": 0.8548, "step": 5520 }, { "epoch": 0.94, "grad_norm": 13.263820091037312, "learning_rate": 8.679483254707089e-06, "loss": 0.8847, "step": 5525 }, { "epoch": 0.94, "grad_norm": 13.888523618921328, "learning_rate": 8.676132873506873e-06, "loss": 0.8723, "step": 5530 }, { "epoch": 0.94, "grad_norm": 7.882036703727629, "learning_rate": 8.672778895846715e-06, "loss": 0.8834, "step": 5535 }, { "epoch": 0.94, "grad_norm": 20.865318506881103, "learning_rate": 8.669421325007897e-06, "loss": 0.8802, "step": 5540 }, { "epoch": 0.94, "grad_norm": 11.716621596417292, "learning_rate": 8.666060164275224e-06, "loss": 0.8843, "step": 5545 }, { "epoch": 0.94, "grad_norm": 9.636304054762679, "learning_rate": 8.66269541693701e-06, "loss": 0.8835, "step": 5550 }, { "epoch": 0.94, "grad_norm": 8.36733991467472, "learning_rate": 8.659327086285079e-06, "loss": 0.8919, "step": 5555 }, { "epoch": 0.95, "grad_norm": 33.84809764114348, "learning_rate": 8.655955175614758e-06, "loss": 0.8774, "step": 5560 }, { "epoch": 0.95, "grad_norm": 39.82957262336742, "learning_rate": 8.65257968822488e-06, "loss": 0.8783, "step": 5565 }, { "epoch": 0.95, "grad_norm": 67.01059353943249, "learning_rate": 8.649200627417774e-06, "loss": 0.8784, "step": 5570 }, { "epoch": 0.95, "grad_norm": 45.26710543417326, "learning_rate": 8.645817996499264e-06, "loss": 0.866, "step": 5575 }, { "epoch": 0.95, "grad_norm": 74.55636509273494, "learning_rate": 8.642431798778675e-06, "loss": 0.8906, "step": 5580 }, { "epoch": 0.95, "grad_norm": 18.565055424673996, "learning_rate": 8.63904203756881e-06, "loss": 0.8829, "step": 5585 }, { "epoch": 0.95, "grad_norm": 23.61630072608257, "learning_rate": 8.635648716185964e-06, "loss": 0.8635, "step": 5590 }, { "epoch": 0.95, "grad_norm": 17.35054245441266, "learning_rate": 8.632251837949921e-06, "loss": 0.8593, "step": 5595 }, { "epoch": 0.95, "grad_norm": 8.106255161947752, "learning_rate": 8.628851406183932e-06, "loss": 0.884, "step": 5600 }, { "epoch": 0.95, "grad_norm": 13.889113128468237, "learning_rate": 8.625447424214734e-06, "loss": 0.8813, "step": 5605 }, { "epoch": 0.95, "grad_norm": 12.865051244498067, "learning_rate": 8.622039895372533e-06, "loss": 0.8618, "step": 5610 }, { "epoch": 0.95, "grad_norm": 9.349655012279877, "learning_rate": 8.618628822991009e-06, "loss": 0.8655, "step": 5615 }, { "epoch": 0.96, "grad_norm": 23.20553059319047, "learning_rate": 8.615214210407304e-06, "loss": 0.8658, "step": 5620 }, { "epoch": 0.96, "grad_norm": 30.110412035643414, "learning_rate": 8.611796060962025e-06, "loss": 0.8619, "step": 5625 }, { "epoch": 0.96, "grad_norm": 8.021610994380586, "learning_rate": 8.608374377999242e-06, "loss": 0.8635, "step": 5630 }, { "epoch": 0.96, "grad_norm": 7.71017469605099, "learning_rate": 8.604949164866478e-06, "loss": 0.8625, "step": 5635 }, { "epoch": 0.96, "grad_norm": 16.572210414926698, "learning_rate": 8.601520424914712e-06, "loss": 0.8858, "step": 5640 }, { "epoch": 0.96, "grad_norm": 22.351789187509702, "learning_rate": 8.598088161498372e-06, "loss": 0.8571, "step": 5645 }, { "epoch": 0.96, "grad_norm": 36.86554438746097, "learning_rate": 8.594652377975335e-06, "loss": 0.8658, "step": 5650 }, { "epoch": 0.96, "grad_norm": 20.15626490218535, "learning_rate": 8.591213077706918e-06, "loss": 0.88, "step": 5655 }, { "epoch": 0.96, "grad_norm": 17.06297849796565, "learning_rate": 8.587770264057887e-06, "loss": 0.8498, "step": 5660 }, { "epoch": 0.96, "grad_norm": 10.00576879340255, "learning_rate": 8.584323940396435e-06, "loss": 0.8561, "step": 5665 }, { "epoch": 0.96, "grad_norm": 39.968686341276495, "learning_rate": 8.580874110094193e-06, "loss": 0.8756, "step": 5670 }, { "epoch": 0.96, "grad_norm": 26.27040895403256, "learning_rate": 8.577420776526225e-06, "loss": 0.8655, "step": 5675 }, { "epoch": 0.97, "grad_norm": 11.695621589411635, "learning_rate": 8.57396394307102e-06, "loss": 0.8633, "step": 5680 }, { "epoch": 0.97, "grad_norm": 15.951644924859142, "learning_rate": 8.57050361311049e-06, "loss": 0.8641, "step": 5685 }, { "epoch": 0.97, "grad_norm": 14.871755754368417, "learning_rate": 8.567039790029972e-06, "loss": 0.8441, "step": 5690 }, { "epoch": 0.97, "grad_norm": 18.86324287898087, "learning_rate": 8.563572477218216e-06, "loss": 0.8558, "step": 5695 }, { "epoch": 0.97, "grad_norm": 12.9497961055212, "learning_rate": 8.560101678067385e-06, "loss": 0.8626, "step": 5700 }, { "epoch": 0.97, "grad_norm": 12.64541028079422, "learning_rate": 8.55662739597306e-06, "loss": 0.8723, "step": 5705 }, { "epoch": 0.97, "grad_norm": 8.077404639704291, "learning_rate": 8.553149634334221e-06, "loss": 0.8498, "step": 5710 }, { "epoch": 0.97, "grad_norm": 14.095385130597634, "learning_rate": 8.54966839655326e-06, "loss": 0.8676, "step": 5715 }, { "epoch": 0.97, "grad_norm": 16.816198204667515, "learning_rate": 8.546183686035963e-06, "loss": 0.86, "step": 5720 }, { "epoch": 0.97, "grad_norm": 10.324141871325514, "learning_rate": 8.542695506191516e-06, "loss": 0.8453, "step": 5725 }, { "epoch": 0.97, "grad_norm": 22.931657761275982, "learning_rate": 8.5392038604325e-06, "loss": 0.8624, "step": 5730 }, { "epoch": 0.98, "grad_norm": 22.340937938883243, "learning_rate": 8.535708752174887e-06, "loss": 0.8584, "step": 5735 }, { "epoch": 0.98, "grad_norm": 13.47137594925963, "learning_rate": 8.532210184838034e-06, "loss": 0.8562, "step": 5740 }, { "epoch": 0.98, "grad_norm": 7.022989456393278, "learning_rate": 8.528708161844688e-06, "loss": 0.8517, "step": 5745 }, { "epoch": 0.98, "grad_norm": 14.389404070039904, "learning_rate": 8.525202686620968e-06, "loss": 0.8591, "step": 5750 }, { "epoch": 0.98, "grad_norm": 11.069368671975361, "learning_rate": 8.521693762596376e-06, "loss": 0.8572, "step": 5755 }, { "epoch": 0.98, "grad_norm": 31.320280806061785, "learning_rate": 8.518181393203787e-06, "loss": 0.868, "step": 5760 }, { "epoch": 0.98, "grad_norm": 8.119093866352447, "learning_rate": 8.514665581879448e-06, "loss": 0.8533, "step": 5765 }, { "epoch": 0.98, "grad_norm": 20.745899092682233, "learning_rate": 8.511146332062971e-06, "loss": 0.8492, "step": 5770 }, { "epoch": 0.98, "grad_norm": 9.102720102074791, "learning_rate": 8.507623647197334e-06, "loss": 0.8752, "step": 5775 }, { "epoch": 0.98, "grad_norm": 13.17053377877784, "learning_rate": 8.504097530728875e-06, "loss": 0.8411, "step": 5780 }, { "epoch": 0.98, "grad_norm": 28.606022516403232, "learning_rate": 8.500567986107286e-06, "loss": 0.8534, "step": 5785 }, { "epoch": 0.98, "grad_norm": 27.615463405818634, "learning_rate": 8.497035016785617e-06, "loss": 0.8369, "step": 5790 }, { "epoch": 0.99, "grad_norm": 36.01795392268902, "learning_rate": 8.49349862622027e-06, "loss": 0.8488, "step": 5795 }, { "epoch": 0.99, "grad_norm": 13.213418700089807, "learning_rate": 8.489958817870987e-06, "loss": 0.8535, "step": 5800 }, { "epoch": 0.99, "grad_norm": 15.444031330332194, "learning_rate": 8.486415595200862e-06, "loss": 0.8453, "step": 5805 }, { "epoch": 0.99, "grad_norm": 10.335189050348509, "learning_rate": 8.482868961676321e-06, "loss": 0.8573, "step": 5810 }, { "epoch": 0.99, "grad_norm": 15.00363654906774, "learning_rate": 8.479318920767133e-06, "loss": 0.8515, "step": 5815 }, { "epoch": 0.99, "grad_norm": 13.073163283981378, "learning_rate": 8.4757654759464e-06, "loss": 0.8542, "step": 5820 }, { "epoch": 0.99, "grad_norm": 29.39872449053244, "learning_rate": 8.472208630690553e-06, "loss": 0.8415, "step": 5825 }, { "epoch": 0.99, "grad_norm": 12.515141299600042, "learning_rate": 8.468648388479347e-06, "loss": 0.8381, "step": 5830 }, { "epoch": 0.99, "grad_norm": 31.37491484904034, "learning_rate": 8.465084752795867e-06, "loss": 0.8603, "step": 5835 }, { "epoch": 0.99, "grad_norm": 58.3350303623845, "learning_rate": 8.461517727126511e-06, "loss": 0.836, "step": 5840 }, { "epoch": 0.99, "grad_norm": 29.165439866135134, "learning_rate": 8.457947314960996e-06, "loss": 0.858, "step": 5845 }, { "epoch": 0.99, "grad_norm": 11.285011618464788, "learning_rate": 8.454373519792355e-06, "loss": 0.8491, "step": 5850 }, { "epoch": 1.0, "grad_norm": 11.832032196171967, "learning_rate": 8.450796345116926e-06, "loss": 0.8352, "step": 5855 }, { "epoch": 1.0, "grad_norm": 10.440740651835677, "learning_rate": 8.447215794434356e-06, "loss": 0.8607, "step": 5860 }, { "epoch": 1.0, "grad_norm": 53.62948227734646, "learning_rate": 8.443631871247598e-06, "loss": 0.8758, "step": 5865 }, { "epoch": 1.0, "grad_norm": 40.84158978189534, "learning_rate": 8.440044579062894e-06, "loss": 0.8418, "step": 5870 }, { "epoch": 1.0, "grad_norm": 8.478218209150542, "learning_rate": 8.436453921389791e-06, "loss": 0.8503, "step": 5875 }, { "epoch": 1.0, "grad_norm": 14.378497072180087, "learning_rate": 8.43285990174113e-06, "loss": 0.8671, "step": 5880 }, { "epoch": 1.0, "eval_loss": 0.7445269227027893, "eval_runtime": 75.2523, "eval_samples_per_second": 4.81, "eval_steps_per_second": 0.611, "step": 5882 }, { "epoch": 1.0, "grad_norm": 16.074131915930433, "learning_rate": 8.429262523633034e-06, "loss": 0.7902, "step": 5885 }, { "epoch": 1.0, "grad_norm": 7.338805256843147, "learning_rate": 8.425661790584916e-06, "loss": 0.7448, "step": 5890 }, { "epoch": 1.0, "grad_norm": 21.507136158145123, "learning_rate": 8.422057706119468e-06, "loss": 0.7342, "step": 5895 }, { "epoch": 1.0, "grad_norm": 43.03840171835638, "learning_rate": 8.418450273762665e-06, "loss": 0.7323, "step": 5900 }, { "epoch": 1.0, "grad_norm": 21.45456264756192, "learning_rate": 8.41483949704376e-06, "loss": 0.7333, "step": 5905 }, { "epoch": 1.0, "grad_norm": 17.793455098771023, "learning_rate": 8.411225379495265e-06, "loss": 0.7466, "step": 5910 }, { "epoch": 1.01, "grad_norm": 9.254994967182895, "learning_rate": 8.407607924652971e-06, "loss": 0.7464, "step": 5915 }, { "epoch": 1.01, "grad_norm": 18.7584036810001, "learning_rate": 8.403987136055935e-06, "loss": 0.7299, "step": 5920 }, { "epoch": 1.01, "grad_norm": 13.026506169069334, "learning_rate": 8.40036301724647e-06, "loss": 0.7196, "step": 5925 }, { "epoch": 1.01, "grad_norm": 9.346466183636792, "learning_rate": 8.39673557177015e-06, "loss": 0.7511, "step": 5930 }, { "epoch": 1.01, "grad_norm": 18.495927112645926, "learning_rate": 8.3931048031758e-06, "loss": 0.7254, "step": 5935 }, { "epoch": 1.01, "grad_norm": 17.709617231851123, "learning_rate": 8.389470715015501e-06, "loss": 0.7286, "step": 5940 }, { "epoch": 1.01, "grad_norm": 6.780660397689895, "learning_rate": 8.385833310844582e-06, "loss": 0.7139, "step": 5945 }, { "epoch": 1.01, "grad_norm": 11.032615580465922, "learning_rate": 8.382192594221608e-06, "loss": 0.749, "step": 5950 }, { "epoch": 1.01, "grad_norm": 25.25163294724016, "learning_rate": 8.378548568708396e-06, "loss": 0.7395, "step": 5955 }, { "epoch": 1.01, "grad_norm": 16.169665624561432, "learning_rate": 8.374901237869989e-06, "loss": 0.7303, "step": 5960 }, { "epoch": 1.01, "grad_norm": 26.34099151157238, "learning_rate": 8.371250605274673e-06, "loss": 0.7056, "step": 5965 }, { "epoch": 1.01, "grad_norm": 32.873244414003075, "learning_rate": 8.367596674493959e-06, "loss": 0.7414, "step": 5970 }, { "epoch": 1.02, "grad_norm": 30.87560803986548, "learning_rate": 8.363939449102586e-06, "loss": 0.7282, "step": 5975 }, { "epoch": 1.02, "grad_norm": 19.528015029331623, "learning_rate": 8.360278932678515e-06, "loss": 0.7305, "step": 5980 }, { "epoch": 1.02, "grad_norm": 9.445981084181716, "learning_rate": 8.356615128802933e-06, "loss": 0.7212, "step": 5985 }, { "epoch": 1.02, "grad_norm": 23.770775974779845, "learning_rate": 8.352948041060234e-06, "loss": 0.7378, "step": 5990 }, { "epoch": 1.02, "grad_norm": 19.14504601941375, "learning_rate": 8.349277673038026e-06, "loss": 0.7465, "step": 5995 }, { "epoch": 1.02, "grad_norm": 20.290936891320126, "learning_rate": 8.345604028327134e-06, "loss": 0.7314, "step": 6000 }, { "epoch": 1.02, "grad_norm": 7.7028244206432515, "learning_rate": 8.341927110521583e-06, "loss": 0.7185, "step": 6005 }, { "epoch": 1.02, "grad_norm": 10.604020058712843, "learning_rate": 8.3382469232186e-06, "loss": 0.7467, "step": 6010 }, { "epoch": 1.02, "grad_norm": 56.19937286099307, "learning_rate": 8.334563470018611e-06, "loss": 0.7333, "step": 6015 }, { "epoch": 1.02, "grad_norm": 26.771294283391278, "learning_rate": 8.33087675452524e-06, "loss": 0.7225, "step": 6020 }, { "epoch": 1.02, "grad_norm": 11.734466397931051, "learning_rate": 8.327186780345295e-06, "loss": 0.7247, "step": 6025 }, { "epoch": 1.03, "grad_norm": 33.72500753847307, "learning_rate": 8.323493551088782e-06, "loss": 0.7224, "step": 6030 }, { "epoch": 1.03, "grad_norm": 16.90816584525726, "learning_rate": 8.319797070368885e-06, "loss": 0.7334, "step": 6035 }, { "epoch": 1.03, "grad_norm": 11.719459783117856, "learning_rate": 8.316097341801972e-06, "loss": 0.732, "step": 6040 }, { "epoch": 1.03, "grad_norm": 11.500384938602528, "learning_rate": 8.312394369007586e-06, "loss": 0.7176, "step": 6045 }, { "epoch": 1.03, "grad_norm": 13.378871880562029, "learning_rate": 8.308688155608446e-06, "loss": 0.7264, "step": 6050 }, { "epoch": 1.03, "grad_norm": 11.840820667726737, "learning_rate": 8.30497870523044e-06, "loss": 0.7267, "step": 6055 }, { "epoch": 1.03, "grad_norm": 22.140046240955762, "learning_rate": 8.301266021502622e-06, "loss": 0.7344, "step": 6060 }, { "epoch": 1.03, "grad_norm": 31.539045412586958, "learning_rate": 8.297550108057213e-06, "loss": 0.7417, "step": 6065 }, { "epoch": 1.03, "grad_norm": 7.69385281998031, "learning_rate": 8.293830968529592e-06, "loss": 0.7461, "step": 6070 }, { "epoch": 1.03, "grad_norm": 10.582764103392428, "learning_rate": 8.29010860655829e-06, "loss": 0.7292, "step": 6075 }, { "epoch": 1.03, "grad_norm": 22.418600869543923, "learning_rate": 8.286383025784997e-06, "loss": 0.7311, "step": 6080 }, { "epoch": 1.03, "grad_norm": 15.838293990902837, "learning_rate": 8.282654229854547e-06, "loss": 0.7406, "step": 6085 }, { "epoch": 1.04, "grad_norm": 9.173142797379702, "learning_rate": 8.278922222414924e-06, "loss": 0.7331, "step": 6090 }, { "epoch": 1.04, "grad_norm": 9.858068006200584, "learning_rate": 8.275187007117251e-06, "loss": 0.6926, "step": 6095 }, { "epoch": 1.04, "grad_norm": 18.922773613667474, "learning_rate": 8.27144858761579e-06, "loss": 0.7242, "step": 6100 }, { "epoch": 1.04, "grad_norm": 12.763980438898749, "learning_rate": 8.267706967567935e-06, "loss": 0.7223, "step": 6105 }, { "epoch": 1.04, "grad_norm": 13.61708664822042, "learning_rate": 8.263962150634215e-06, "loss": 0.7126, "step": 6110 }, { "epoch": 1.04, "grad_norm": 14.596927035396877, "learning_rate": 8.26021414047829e-06, "loss": 0.7192, "step": 6115 }, { "epoch": 1.04, "grad_norm": 8.02870706457101, "learning_rate": 8.256462940766932e-06, "loss": 0.7286, "step": 6120 }, { "epoch": 1.04, "grad_norm": 36.59195200291681, "learning_rate": 8.252708555170044e-06, "loss": 0.7096, "step": 6125 }, { "epoch": 1.04, "grad_norm": 20.281469550974503, "learning_rate": 8.248950987360645e-06, "loss": 0.7167, "step": 6130 }, { "epoch": 1.04, "grad_norm": 9.97594999145645, "learning_rate": 8.245190241014863e-06, "loss": 0.7021, "step": 6135 }, { "epoch": 1.04, "grad_norm": 6.795171570462541, "learning_rate": 8.241426319811938e-06, "loss": 0.749, "step": 6140 }, { "epoch": 1.04, "grad_norm": 13.792290293045824, "learning_rate": 8.237659227434213e-06, "loss": 0.7278, "step": 6145 }, { "epoch": 1.05, "grad_norm": 7.983265764805912, "learning_rate": 8.233888967567141e-06, "loss": 0.7367, "step": 6150 }, { "epoch": 1.05, "grad_norm": 18.662478937697124, "learning_rate": 8.230115543899265e-06, "loss": 0.713, "step": 6155 }, { "epoch": 1.05, "grad_norm": 46.90862290358351, "learning_rate": 8.226338960122229e-06, "loss": 0.7281, "step": 6160 }, { "epoch": 1.05, "grad_norm": 34.89234533567352, "learning_rate": 8.222559219930766e-06, "loss": 0.7333, "step": 6165 }, { "epoch": 1.05, "grad_norm": 14.697726604622881, "learning_rate": 8.218776327022696e-06, "loss": 0.7464, "step": 6170 }, { "epoch": 1.05, "grad_norm": 10.747423802614167, "learning_rate": 8.214990285098931e-06, "loss": 0.7376, "step": 6175 }, { "epoch": 1.05, "grad_norm": 10.96902864680576, "learning_rate": 8.211201097863452e-06, "loss": 0.7287, "step": 6180 }, { "epoch": 1.05, "grad_norm": 13.636950347264827, "learning_rate": 8.207408769023324e-06, "loss": 0.7094, "step": 6185 }, { "epoch": 1.05, "grad_norm": 9.742565886062385, "learning_rate": 8.203613302288689e-06, "loss": 0.6964, "step": 6190 }, { "epoch": 1.05, "grad_norm": 24.16036680358423, "learning_rate": 8.199814701372748e-06, "loss": 0.7124, "step": 6195 }, { "epoch": 1.05, "grad_norm": 14.32299199634298, "learning_rate": 8.19601296999178e-06, "loss": 0.7255, "step": 6200 }, { "epoch": 1.05, "grad_norm": 7.085040599511274, "learning_rate": 8.192208111865118e-06, "loss": 0.7077, "step": 6205 }, { "epoch": 1.06, "grad_norm": 8.03400567894429, "learning_rate": 8.188400130715159e-06, "loss": 0.7118, "step": 6210 }, { "epoch": 1.06, "grad_norm": 6.854524776829947, "learning_rate": 8.184589030267353e-06, "loss": 0.7329, "step": 6215 }, { "epoch": 1.06, "grad_norm": 14.850945170965838, "learning_rate": 8.180774814250204e-06, "loss": 0.7138, "step": 6220 }, { "epoch": 1.06, "grad_norm": 24.114471404232695, "learning_rate": 8.17695748639526e-06, "loss": 0.7213, "step": 6225 }, { "epoch": 1.06, "grad_norm": 29.795134364882006, "learning_rate": 8.173137050437118e-06, "loss": 0.7127, "step": 6230 }, { "epoch": 1.06, "grad_norm": 12.154362218077843, "learning_rate": 8.169313510113413e-06, "loss": 0.7324, "step": 6235 }, { "epoch": 1.06, "grad_norm": 25.20196920703092, "learning_rate": 8.165486869164814e-06, "loss": 0.7124, "step": 6240 }, { "epoch": 1.06, "grad_norm": 13.811823800608863, "learning_rate": 8.161657131335033e-06, "loss": 0.7104, "step": 6245 }, { "epoch": 1.06, "grad_norm": 17.93546314393749, "learning_rate": 8.157824300370801e-06, "loss": 0.6941, "step": 6250 }, { "epoch": 1.06, "grad_norm": 33.16849346210743, "learning_rate": 8.153988380021881e-06, "loss": 0.7347, "step": 6255 }, { "epoch": 1.06, "grad_norm": 24.269651699560313, "learning_rate": 8.150149374041061e-06, "loss": 0.7379, "step": 6260 }, { "epoch": 1.07, "grad_norm": 11.381379260049865, "learning_rate": 8.146307286184141e-06, "loss": 0.726, "step": 6265 }, { "epoch": 1.07, "grad_norm": 12.343196756170697, "learning_rate": 8.14246212020994e-06, "loss": 0.7178, "step": 6270 }, { "epoch": 1.07, "grad_norm": 7.578717720187098, "learning_rate": 8.138613879880284e-06, "loss": 0.7137, "step": 6275 }, { "epoch": 1.07, "grad_norm": 31.370528526498276, "learning_rate": 8.134762568960015e-06, "loss": 0.7106, "step": 6280 }, { "epoch": 1.07, "grad_norm": 13.249042174913429, "learning_rate": 8.130908191216974e-06, "loss": 0.7089, "step": 6285 }, { "epoch": 1.07, "grad_norm": 23.809066454954635, "learning_rate": 8.127050750422e-06, "loss": 0.7045, "step": 6290 }, { "epoch": 1.07, "grad_norm": 7.854982995546571, "learning_rate": 8.123190250348932e-06, "loss": 0.7164, "step": 6295 }, { "epoch": 1.07, "grad_norm": 17.7265251355763, "learning_rate": 8.119326694774602e-06, "loss": 0.7012, "step": 6300 }, { "epoch": 1.07, "grad_norm": 17.579992645590757, "learning_rate": 8.115460087478833e-06, "loss": 0.7404, "step": 6305 }, { "epoch": 1.07, "grad_norm": 25.041819648349822, "learning_rate": 8.11159043224443e-06, "loss": 0.7015, "step": 6310 }, { "epoch": 1.07, "grad_norm": 24.949195747447597, "learning_rate": 8.107717732857177e-06, "loss": 0.6872, "step": 6315 }, { "epoch": 1.07, "grad_norm": 19.679411460396835, "learning_rate": 8.103841993105843e-06, "loss": 0.7072, "step": 6320 }, { "epoch": 1.08, "grad_norm": 20.012872260255527, "learning_rate": 8.099963216782171e-06, "loss": 0.7031, "step": 6325 }, { "epoch": 1.08, "grad_norm": 28.27758443192042, "learning_rate": 8.09608140768087e-06, "loss": 0.7259, "step": 6330 }, { "epoch": 1.08, "grad_norm": 24.772908515131697, "learning_rate": 8.092196569599619e-06, "loss": 0.7219, "step": 6335 }, { "epoch": 1.08, "grad_norm": 18.0503438246646, "learning_rate": 8.08830870633906e-06, "loss": 0.7279, "step": 6340 }, { "epoch": 1.08, "grad_norm": 14.318031610520302, "learning_rate": 8.084417821702796e-06, "loss": 0.7082, "step": 6345 }, { "epoch": 1.08, "grad_norm": 35.01375125895254, "learning_rate": 8.080523919497381e-06, "loss": 0.7282, "step": 6350 }, { "epoch": 1.08, "grad_norm": 20.21136986925559, "learning_rate": 8.076627003532328e-06, "loss": 0.7057, "step": 6355 }, { "epoch": 1.08, "grad_norm": 11.017121017069089, "learning_rate": 8.072727077620092e-06, "loss": 0.7136, "step": 6360 }, { "epoch": 1.08, "grad_norm": 27.32813819885369, "learning_rate": 8.068824145576077e-06, "loss": 0.7499, "step": 6365 }, { "epoch": 1.08, "grad_norm": 101.56111015961008, "learning_rate": 8.064918211218628e-06, "loss": 0.7211, "step": 6370 }, { "epoch": 1.08, "grad_norm": 15.904740460926877, "learning_rate": 8.061009278369026e-06, "loss": 0.7587, "step": 6375 }, { "epoch": 1.08, "grad_norm": 13.262944951104998, "learning_rate": 8.05709735085148e-06, "loss": 0.76, "step": 6380 }, { "epoch": 1.09, "grad_norm": 19.900012124876444, "learning_rate": 8.053182432493141e-06, "loss": 0.7362, "step": 6385 }, { "epoch": 1.09, "grad_norm": 10.601924165265274, "learning_rate": 8.049264527124076e-06, "loss": 0.7501, "step": 6390 }, { "epoch": 1.09, "grad_norm": 21.08025285119702, "learning_rate": 8.045343638577278e-06, "loss": 0.7563, "step": 6395 }, { "epoch": 1.09, "grad_norm": 20.646288499921937, "learning_rate": 8.04141977068866e-06, "loss": 0.7316, "step": 6400 }, { "epoch": 1.09, "grad_norm": 9.096270740741641, "learning_rate": 8.037492927297044e-06, "loss": 0.7381, "step": 6405 }, { "epoch": 1.09, "grad_norm": 8.580229233187346, "learning_rate": 8.033563112244172e-06, "loss": 0.7287, "step": 6410 }, { "epoch": 1.09, "grad_norm": 18.28636073508511, "learning_rate": 8.02963032937468e-06, "loss": 0.7352, "step": 6415 }, { "epoch": 1.09, "grad_norm": 8.346063474119852, "learning_rate": 8.025694582536124e-06, "loss": 0.7332, "step": 6420 }, { "epoch": 1.09, "grad_norm": 10.246364151412761, "learning_rate": 8.021755875578945e-06, "loss": 0.71, "step": 6425 }, { "epoch": 1.09, "grad_norm": 8.367025054003026, "learning_rate": 8.017814212356492e-06, "loss": 0.725, "step": 6430 }, { "epoch": 1.09, "grad_norm": 8.714057040582556, "learning_rate": 8.013869596724994e-06, "loss": 0.7379, "step": 6435 }, { "epoch": 1.09, "grad_norm": 8.593995045394287, "learning_rate": 8.009922032543581e-06, "loss": 0.7415, "step": 6440 }, { "epoch": 1.1, "grad_norm": 26.587621305434546, "learning_rate": 8.005971523674257e-06, "loss": 0.7186, "step": 6445 }, { "epoch": 1.1, "grad_norm": 27.706587902467838, "learning_rate": 8.002018073981914e-06, "loss": 0.7022, "step": 6450 }, { "epoch": 1.1, "grad_norm": 21.224412769515773, "learning_rate": 7.998061687334318e-06, "loss": 0.7056, "step": 6455 }, { "epoch": 1.1, "grad_norm": 14.643126663204766, "learning_rate": 7.994102367602107e-06, "loss": 0.7126, "step": 6460 }, { "epoch": 1.1, "grad_norm": 8.286326181980531, "learning_rate": 7.990140118658792e-06, "loss": 0.7317, "step": 6465 }, { "epoch": 1.1, "grad_norm": 26.733025687439078, "learning_rate": 7.986174944380749e-06, "loss": 0.7321, "step": 6470 }, { "epoch": 1.1, "grad_norm": 19.564773455771654, "learning_rate": 7.982206848647212e-06, "loss": 0.7268, "step": 6475 }, { "epoch": 1.1, "grad_norm": 17.448541273732022, "learning_rate": 7.978235835340277e-06, "loss": 0.744, "step": 6480 }, { "epoch": 1.1, "grad_norm": 36.37395815875439, "learning_rate": 7.974261908344896e-06, "loss": 0.6985, "step": 6485 }, { "epoch": 1.1, "grad_norm": 29.85056971539694, "learning_rate": 7.970285071548868e-06, "loss": 0.7214, "step": 6490 }, { "epoch": 1.1, "grad_norm": 22.471371403927865, "learning_rate": 7.966305328842838e-06, "loss": 0.6981, "step": 6495 }, { "epoch": 1.11, "grad_norm": 29.88202765503994, "learning_rate": 7.962322684120295e-06, "loss": 0.6977, "step": 6500 }, { "epoch": 1.11, "grad_norm": 17.33714463691604, "learning_rate": 7.95833714127757e-06, "loss": 0.7451, "step": 6505 }, { "epoch": 1.11, "grad_norm": 12.99959408887192, "learning_rate": 7.954348704213825e-06, "loss": 0.7204, "step": 6510 }, { "epoch": 1.11, "grad_norm": 10.049862478420733, "learning_rate": 7.95035737683106e-06, "loss": 0.7168, "step": 6515 }, { "epoch": 1.11, "grad_norm": 14.518802904427606, "learning_rate": 7.946363163034092e-06, "loss": 0.7031, "step": 6520 }, { "epoch": 1.11, "grad_norm": 8.273603677482322, "learning_rate": 7.942366066730571e-06, "loss": 0.7324, "step": 6525 }, { "epoch": 1.11, "grad_norm": 22.046757543422785, "learning_rate": 7.938366091830967e-06, "loss": 0.7265, "step": 6530 }, { "epoch": 1.11, "grad_norm": 33.12928987146557, "learning_rate": 7.93436324224856e-06, "loss": 0.7145, "step": 6535 }, { "epoch": 1.11, "grad_norm": 12.56350041819999, "learning_rate": 7.930357521899444e-06, "loss": 0.7149, "step": 6540 }, { "epoch": 1.11, "grad_norm": 12.501911722744474, "learning_rate": 7.926348934702526e-06, "loss": 0.6998, "step": 6545 }, { "epoch": 1.11, "grad_norm": 13.688733562201573, "learning_rate": 7.922337484579516e-06, "loss": 0.7107, "step": 6550 }, { "epoch": 1.11, "grad_norm": 21.058453052713165, "learning_rate": 7.918323175454923e-06, "loss": 0.7332, "step": 6555 }, { "epoch": 1.12, "grad_norm": 38.17079743809706, "learning_rate": 7.914306011256051e-06, "loss": 0.6928, "step": 6560 }, { "epoch": 1.12, "grad_norm": 67.58525110577213, "learning_rate": 7.910285995913003e-06, "loss": 0.7347, "step": 6565 }, { "epoch": 1.12, "grad_norm": 32.44302274542706, "learning_rate": 7.90626313335867e-06, "loss": 0.7164, "step": 6570 }, { "epoch": 1.12, "grad_norm": 20.402541349672042, "learning_rate": 7.902237427528721e-06, "loss": 0.7061, "step": 6575 }, { "epoch": 1.12, "grad_norm": 11.055494830630643, "learning_rate": 7.89820888236162e-06, "loss": 0.7074, "step": 6580 }, { "epoch": 1.12, "grad_norm": 12.66556604728358, "learning_rate": 7.894177501798595e-06, "loss": 0.7163, "step": 6585 }, { "epoch": 1.12, "grad_norm": 9.07830415040536, "learning_rate": 7.890143289783658e-06, "loss": 0.7098, "step": 6590 }, { "epoch": 1.12, "grad_norm": 12.736714344164882, "learning_rate": 7.886106250263588e-06, "loss": 0.7215, "step": 6595 }, { "epoch": 1.12, "grad_norm": 16.6673812587513, "learning_rate": 7.882066387187926e-06, "loss": 0.7087, "step": 6600 }, { "epoch": 1.12, "grad_norm": 27.162715320340062, "learning_rate": 7.87802370450898e-06, "loss": 0.703, "step": 6605 }, { "epoch": 1.12, "grad_norm": 17.595778017849216, "learning_rate": 7.87397820618182e-06, "loss": 0.732, "step": 6610 }, { "epoch": 1.12, "grad_norm": 7.497247310820503, "learning_rate": 7.869929896164262e-06, "loss": 0.6938, "step": 6615 }, { "epoch": 1.13, "grad_norm": 24.174550408631685, "learning_rate": 7.865878778416879e-06, "loss": 0.7326, "step": 6620 }, { "epoch": 1.13, "grad_norm": 9.682620191492965, "learning_rate": 7.861824856902984e-06, "loss": 0.681, "step": 6625 }, { "epoch": 1.13, "grad_norm": 7.042295074217735, "learning_rate": 7.857768135588642e-06, "loss": 0.7225, "step": 6630 }, { "epoch": 1.13, "grad_norm": 23.16669446157789, "learning_rate": 7.853708618442654e-06, "loss": 0.7164, "step": 6635 }, { "epoch": 1.13, "grad_norm": 7.921927166023332, "learning_rate": 7.849646309436551e-06, "loss": 0.6961, "step": 6640 }, { "epoch": 1.13, "grad_norm": 11.13787412696501, "learning_rate": 7.845581212544605e-06, "loss": 0.7325, "step": 6645 }, { "epoch": 1.13, "grad_norm": 11.317276381618884, "learning_rate": 7.841513331743803e-06, "loss": 0.7118, "step": 6650 }, { "epoch": 1.13, "grad_norm": 13.678166525281911, "learning_rate": 7.837442671013868e-06, "loss": 0.7186, "step": 6655 }, { "epoch": 1.13, "grad_norm": 10.039215360240258, "learning_rate": 7.833369234337235e-06, "loss": 0.7159, "step": 6660 }, { "epoch": 1.13, "grad_norm": 12.15585567306351, "learning_rate": 7.829293025699056e-06, "loss": 0.72, "step": 6665 }, { "epoch": 1.13, "grad_norm": 7.964575038033085, "learning_rate": 7.8252140490872e-06, "loss": 0.6964, "step": 6670 }, { "epoch": 1.13, "grad_norm": 31.392270596686796, "learning_rate": 7.821132308492235e-06, "loss": 0.725, "step": 6675 }, { "epoch": 1.14, "grad_norm": 22.243486431851114, "learning_rate": 7.81704780790744e-06, "loss": 0.7163, "step": 6680 }, { "epoch": 1.14, "grad_norm": 32.554006910354886, "learning_rate": 7.812960551328792e-06, "loss": 0.7215, "step": 6685 }, { "epoch": 1.14, "grad_norm": 11.860686728562028, "learning_rate": 7.808870542754964e-06, "loss": 0.7126, "step": 6690 }, { "epoch": 1.14, "grad_norm": 14.717346410772016, "learning_rate": 7.804777786187324e-06, "loss": 0.7107, "step": 6695 }, { "epoch": 1.14, "grad_norm": 10.488764788651848, "learning_rate": 7.800682285629922e-06, "loss": 0.7108, "step": 6700 }, { "epoch": 1.14, "grad_norm": 20.458748569916793, "learning_rate": 7.796584045089499e-06, "loss": 0.7195, "step": 6705 }, { "epoch": 1.14, "grad_norm": 15.795348738415434, "learning_rate": 7.792483068575475e-06, "loss": 0.7119, "step": 6710 }, { "epoch": 1.14, "grad_norm": 24.613194371981375, "learning_rate": 7.788379360099944e-06, "loss": 0.7097, "step": 6715 }, { "epoch": 1.14, "grad_norm": 16.381433477062384, "learning_rate": 7.784272923677678e-06, "loss": 0.6991, "step": 6720 }, { "epoch": 1.14, "grad_norm": 12.110548394689014, "learning_rate": 7.78016376332611e-06, "loss": 0.7046, "step": 6725 }, { "epoch": 1.14, "grad_norm": 15.048488403438107, "learning_rate": 7.776051883065345e-06, "loss": 0.7019, "step": 6730 }, { "epoch": 1.15, "grad_norm": 29.011311668426053, "learning_rate": 7.771937286918147e-06, "loss": 0.7244, "step": 6735 }, { "epoch": 1.15, "grad_norm": 28.93610413762296, "learning_rate": 7.767819978909933e-06, "loss": 0.6931, "step": 6740 }, { "epoch": 1.15, "grad_norm": 8.355017448756705, "learning_rate": 7.763699963068782e-06, "loss": 0.7062, "step": 6745 }, { "epoch": 1.15, "grad_norm": 7.110861457861793, "learning_rate": 7.759577243425412e-06, "loss": 0.6952, "step": 6750 }, { "epoch": 1.15, "grad_norm": 12.727622562247888, "learning_rate": 7.755451824013194e-06, "loss": 0.7046, "step": 6755 }, { "epoch": 1.15, "grad_norm": 25.70418981169325, "learning_rate": 7.751323708868134e-06, "loss": 0.683, "step": 6760 }, { "epoch": 1.15, "grad_norm": 25.78166214567401, "learning_rate": 7.74719290202888e-06, "loss": 0.6869, "step": 6765 }, { "epoch": 1.15, "grad_norm": 9.622497117699039, "learning_rate": 7.74305940753671e-06, "loss": 0.6902, "step": 6770 }, { "epoch": 1.15, "grad_norm": 39.78003913153285, "learning_rate": 7.738923229435538e-06, "loss": 0.7013, "step": 6775 }, { "epoch": 1.15, "grad_norm": 45.810278472194724, "learning_rate": 7.734784371771894e-06, "loss": 0.7171, "step": 6780 }, { "epoch": 1.15, "grad_norm": 23.054568253500427, "learning_rate": 7.730642838594932e-06, "loss": 0.692, "step": 6785 }, { "epoch": 1.15, "grad_norm": 10.059784782634388, "learning_rate": 7.726498633956433e-06, "loss": 0.6867, "step": 6790 }, { "epoch": 1.16, "grad_norm": 9.177875706336556, "learning_rate": 7.72235176191078e-06, "loss": 0.7012, "step": 6795 }, { "epoch": 1.16, "grad_norm": 7.4495938243906625, "learning_rate": 7.718202226514968e-06, "loss": 0.7038, "step": 6800 }, { "epoch": 1.16, "grad_norm": 10.44562019083214, "learning_rate": 7.714050031828602e-06, "loss": 0.6935, "step": 6805 }, { "epoch": 1.16, "grad_norm": 7.812788788197329, "learning_rate": 7.709895181913887e-06, "loss": 0.725, "step": 6810 }, { "epoch": 1.16, "grad_norm": 9.336833846004621, "learning_rate": 7.705737680835623e-06, "loss": 0.6925, "step": 6815 }, { "epoch": 1.16, "grad_norm": 8.987925326739377, "learning_rate": 7.701577532661204e-06, "loss": 0.6947, "step": 6820 }, { "epoch": 1.16, "grad_norm": 12.87671922318729, "learning_rate": 7.697414741460616e-06, "loss": 0.687, "step": 6825 }, { "epoch": 1.16, "grad_norm": 7.712200059941566, "learning_rate": 7.693249311306433e-06, "loss": 0.6904, "step": 6830 }, { "epoch": 1.16, "grad_norm": 7.774310159518635, "learning_rate": 7.689081246273805e-06, "loss": 0.6892, "step": 6835 }, { "epoch": 1.16, "grad_norm": 18.661289555094946, "learning_rate": 7.684910550440462e-06, "loss": 0.6999, "step": 6840 }, { "epoch": 1.16, "grad_norm": 33.664597069197804, "learning_rate": 7.680737227886708e-06, "loss": 0.7047, "step": 6845 }, { "epoch": 1.16, "grad_norm": 20.181955771404265, "learning_rate": 7.67656128269542e-06, "loss": 0.7261, "step": 6850 }, { "epoch": 1.17, "grad_norm": 30.26147343843953, "learning_rate": 7.672382718952037e-06, "loss": 0.7052, "step": 6855 }, { "epoch": 1.17, "grad_norm": 20.667253974389084, "learning_rate": 7.668201540744556e-06, "loss": 0.711, "step": 6860 }, { "epoch": 1.17, "grad_norm": 42.630822600609775, "learning_rate": 7.664017752163542e-06, "loss": 0.7036, "step": 6865 }, { "epoch": 1.17, "grad_norm": 16.345967377044545, "learning_rate": 7.659831357302105e-06, "loss": 0.6998, "step": 6870 }, { "epoch": 1.17, "grad_norm": 7.85298791802671, "learning_rate": 7.655642360255911e-06, "loss": 0.6963, "step": 6875 }, { "epoch": 1.17, "grad_norm": 15.02513471044103, "learning_rate": 7.651450765123165e-06, "loss": 0.6913, "step": 6880 }, { "epoch": 1.17, "grad_norm": 21.157869645277824, "learning_rate": 7.647256576004619e-06, "loss": 0.6693, "step": 6885 }, { "epoch": 1.17, "grad_norm": 22.99490929473168, "learning_rate": 7.643059797003564e-06, "loss": 0.6849, "step": 6890 }, { "epoch": 1.17, "grad_norm": 27.020744662631394, "learning_rate": 7.638860432225818e-06, "loss": 0.723, "step": 6895 }, { "epoch": 1.17, "grad_norm": 7.624030473466098, "learning_rate": 7.634658485779736e-06, "loss": 0.6946, "step": 6900 }, { "epoch": 1.17, "grad_norm": 9.65357246747531, "learning_rate": 7.630453961776195e-06, "loss": 0.6892, "step": 6905 }, { "epoch": 1.17, "grad_norm": 12.093852140081328, "learning_rate": 7.6262468643285926e-06, "loss": 0.6927, "step": 6910 }, { "epoch": 1.18, "grad_norm": 13.094869879189778, "learning_rate": 7.622037197552846e-06, "loss": 0.6913, "step": 6915 }, { "epoch": 1.18, "grad_norm": 11.165975514550444, "learning_rate": 7.617824965567387e-06, "loss": 0.7019, "step": 6920 }, { "epoch": 1.18, "grad_norm": 19.50880196493642, "learning_rate": 7.613610172493156e-06, "loss": 0.7077, "step": 6925 }, { "epoch": 1.18, "grad_norm": 11.672651849911297, "learning_rate": 7.609392822453596e-06, "loss": 0.7017, "step": 6930 }, { "epoch": 1.18, "grad_norm": 10.164537307950836, "learning_rate": 7.605172919574657e-06, "loss": 0.7015, "step": 6935 }, { "epoch": 1.18, "grad_norm": 7.451099827206772, "learning_rate": 7.600950467984783e-06, "loss": 0.6752, "step": 6940 }, { "epoch": 1.18, "grad_norm": 16.870539162624986, "learning_rate": 7.5967254718149145e-06, "loss": 0.6853, "step": 6945 }, { "epoch": 1.18, "grad_norm": 15.135503893996802, "learning_rate": 7.592497935198474e-06, "loss": 0.6946, "step": 6950 }, { "epoch": 1.18, "grad_norm": 9.045389341400304, "learning_rate": 7.588267862271379e-06, "loss": 0.7116, "step": 6955 }, { "epoch": 1.18, "grad_norm": 7.973021024474602, "learning_rate": 7.584035257172022e-06, "loss": 0.7072, "step": 6960 }, { "epoch": 1.18, "grad_norm": 17.220087665017225, "learning_rate": 7.579800124041276e-06, "loss": 0.6944, "step": 6965 }, { "epoch": 1.18, "grad_norm": 39.31130380483513, "learning_rate": 7.575562467022484e-06, "loss": 0.701, "step": 6970 }, { "epoch": 1.19, "grad_norm": 33.3590421783122, "learning_rate": 7.571322290261462e-06, "loss": 0.7133, "step": 6975 }, { "epoch": 1.19, "grad_norm": 43.2227104390999, "learning_rate": 7.567079597906491e-06, "loss": 0.7223, "step": 6980 }, { "epoch": 1.19, "grad_norm": 52.20311146641715, "learning_rate": 7.5628343941083074e-06, "loss": 0.7041, "step": 6985 }, { "epoch": 1.19, "grad_norm": 69.35024918310937, "learning_rate": 7.5585866830201086e-06, "loss": 0.6854, "step": 6990 }, { "epoch": 1.19, "grad_norm": 26.03096173960646, "learning_rate": 7.554336468797549e-06, "loss": 0.6967, "step": 6995 }, { "epoch": 1.19, "grad_norm": 18.747268228543884, "learning_rate": 7.550083755598723e-06, "loss": 0.6989, "step": 7000 }, { "epoch": 1.19, "grad_norm": 33.98837608030512, "learning_rate": 7.5458285475841784e-06, "loss": 0.6988, "step": 7005 }, { "epoch": 1.19, "grad_norm": 30.343042257903118, "learning_rate": 7.541570848916898e-06, "loss": 0.6924, "step": 7010 }, { "epoch": 1.19, "grad_norm": 20.01218820770515, "learning_rate": 7.537310663762305e-06, "loss": 0.6964, "step": 7015 }, { "epoch": 1.19, "grad_norm": 7.42358496227341, "learning_rate": 7.533047996288252e-06, "loss": 0.6913, "step": 7020 }, { "epoch": 1.19, "grad_norm": 12.964543921746909, "learning_rate": 7.528782850665021e-06, "loss": 0.7022, "step": 7025 }, { "epoch": 1.2, "grad_norm": 25.8037225474705, "learning_rate": 7.524515231065321e-06, "loss": 0.6785, "step": 7030 }, { "epoch": 1.2, "grad_norm": 9.350765998841295, "learning_rate": 7.520245141664278e-06, "loss": 0.6869, "step": 7035 }, { "epoch": 1.2, "grad_norm": 8.861735010461002, "learning_rate": 7.515972586639435e-06, "loss": 0.6799, "step": 7040 }, { "epoch": 1.2, "grad_norm": 18.877217521620906, "learning_rate": 7.511697570170748e-06, "loss": 0.7014, "step": 7045 }, { "epoch": 1.2, "grad_norm": 7.505481993998394, "learning_rate": 7.507420096440583e-06, "loss": 0.6984, "step": 7050 }, { "epoch": 1.2, "grad_norm": 6.778863037826613, "learning_rate": 7.5031401696337066e-06, "loss": 0.6979, "step": 7055 }, { "epoch": 1.2, "grad_norm": 11.471077209038233, "learning_rate": 7.498857793937286e-06, "loss": 0.6974, "step": 7060 }, { "epoch": 1.2, "grad_norm": 36.567832461479874, "learning_rate": 7.494572973540886e-06, "loss": 0.6899, "step": 7065 }, { "epoch": 1.2, "grad_norm": 18.43230031875812, "learning_rate": 7.4902857126364605e-06, "loss": 0.6821, "step": 7070 }, { "epoch": 1.2, "grad_norm": 24.623961517100078, "learning_rate": 7.485996015418354e-06, "loss": 0.6922, "step": 7075 }, { "epoch": 1.2, "grad_norm": 8.646515546307073, "learning_rate": 7.481703886083291e-06, "loss": 0.6745, "step": 7080 }, { "epoch": 1.2, "grad_norm": 31.396337867305935, "learning_rate": 7.477409328830381e-06, "loss": 0.6928, "step": 7085 }, { "epoch": 1.21, "grad_norm": 20.424188079541075, "learning_rate": 7.473112347861103e-06, "loss": 0.6722, "step": 7090 }, { "epoch": 1.21, "grad_norm": 26.00232330630306, "learning_rate": 7.468812947379307e-06, "loss": 0.6998, "step": 7095 }, { "epoch": 1.21, "grad_norm": 27.46957886840992, "learning_rate": 7.464511131591216e-06, "loss": 0.6846, "step": 7100 }, { "epoch": 1.21, "grad_norm": 9.132272963335847, "learning_rate": 7.4602069047054105e-06, "loss": 0.6982, "step": 7105 }, { "epoch": 1.21, "grad_norm": 22.348386685560573, "learning_rate": 7.4559002709328335e-06, "loss": 0.6851, "step": 7110 }, { "epoch": 1.21, "grad_norm": 17.814202412904162, "learning_rate": 7.451591234486779e-06, "loss": 0.7055, "step": 7115 }, { "epoch": 1.21, "grad_norm": 12.859238788081717, "learning_rate": 7.447279799582895e-06, "loss": 0.6921, "step": 7120 }, { "epoch": 1.21, "grad_norm": 14.588006740632137, "learning_rate": 7.442965970439175e-06, "loss": 0.6588, "step": 7125 }, { "epoch": 1.21, "grad_norm": 16.47004331535925, "learning_rate": 7.438649751275952e-06, "loss": 0.6859, "step": 7130 }, { "epoch": 1.21, "grad_norm": 17.497100321839717, "learning_rate": 7.434331146315903e-06, "loss": 0.7088, "step": 7135 }, { "epoch": 1.21, "grad_norm": 16.389468678247884, "learning_rate": 7.430010159784032e-06, "loss": 0.6818, "step": 7140 }, { "epoch": 1.21, "grad_norm": 35.137652159527434, "learning_rate": 7.425686795907678e-06, "loss": 0.6927, "step": 7145 }, { "epoch": 1.22, "grad_norm": 24.705252151791072, "learning_rate": 7.4213610589165055e-06, "loss": 0.7036, "step": 7150 }, { "epoch": 1.22, "grad_norm": 10.207253216482892, "learning_rate": 7.4170329530425e-06, "loss": 0.6846, "step": 7155 }, { "epoch": 1.22, "grad_norm": 6.846196135712745, "learning_rate": 7.412702482519962e-06, "loss": 0.6822, "step": 7160 }, { "epoch": 1.22, "grad_norm": 17.97292030080821, "learning_rate": 7.40836965158551e-06, "loss": 0.6978, "step": 7165 }, { "epoch": 1.22, "grad_norm": 21.994990939537736, "learning_rate": 7.404034464478069e-06, "loss": 0.7095, "step": 7170 }, { "epoch": 1.22, "grad_norm": 12.622262894881901, "learning_rate": 7.399696925438868e-06, "loss": 0.6591, "step": 7175 }, { "epoch": 1.22, "grad_norm": 17.138206921967175, "learning_rate": 7.39535703871144e-06, "loss": 0.6758, "step": 7180 }, { "epoch": 1.22, "grad_norm": 7.35115037571887, "learning_rate": 7.391014808541611e-06, "loss": 0.6926, "step": 7185 }, { "epoch": 1.22, "grad_norm": 11.995736156812953, "learning_rate": 7.386670239177504e-06, "loss": 0.6809, "step": 7190 }, { "epoch": 1.22, "grad_norm": 17.45647494417984, "learning_rate": 7.382323334869529e-06, "loss": 0.6822, "step": 7195 }, { "epoch": 1.22, "grad_norm": 24.7016547776482, "learning_rate": 7.377974099870378e-06, "loss": 0.6785, "step": 7200 }, { "epoch": 1.22, "grad_norm": 23.36867960192761, "learning_rate": 7.373622538435024e-06, "loss": 0.6808, "step": 7205 }, { "epoch": 1.23, "grad_norm": 32.65941997390678, "learning_rate": 7.369268654820718e-06, "loss": 0.6907, "step": 7210 }, { "epoch": 1.23, "grad_norm": 19.77258580914987, "learning_rate": 7.3649124532869855e-06, "loss": 0.702, "step": 7215 }, { "epoch": 1.23, "grad_norm": 13.899504710148166, "learning_rate": 7.36055393809561e-06, "loss": 0.684, "step": 7220 }, { "epoch": 1.23, "grad_norm": 11.847634728164552, "learning_rate": 7.356193113510648e-06, "loss": 0.687, "step": 7225 }, { "epoch": 1.23, "grad_norm": 20.93469198208261, "learning_rate": 7.3518299837984095e-06, "loss": 0.6607, "step": 7230 }, { "epoch": 1.23, "grad_norm": 44.311453507879676, "learning_rate": 7.347464553227466e-06, "loss": 0.6924, "step": 7235 }, { "epoch": 1.23, "grad_norm": 16.396465443143537, "learning_rate": 7.343096826068631e-06, "loss": 0.7023, "step": 7240 }, { "epoch": 1.23, "grad_norm": 17.15469275451865, "learning_rate": 7.338726806594973e-06, "loss": 0.684, "step": 7245 }, { "epoch": 1.23, "grad_norm": 17.59304430753221, "learning_rate": 7.334354499081797e-06, "loss": 0.7031, "step": 7250 }, { "epoch": 1.23, "grad_norm": 15.2512260575335, "learning_rate": 7.329979907806655e-06, "loss": 0.6959, "step": 7255 }, { "epoch": 1.23, "grad_norm": 8.326978416349075, "learning_rate": 7.3256030370493216e-06, "loss": 0.6902, "step": 7260 }, { "epoch": 1.24, "grad_norm": 12.110584046270967, "learning_rate": 7.321223891091811e-06, "loss": 0.6801, "step": 7265 }, { "epoch": 1.24, "grad_norm": 7.953504528002581, "learning_rate": 7.316842474218357e-06, "loss": 0.6851, "step": 7270 }, { "epoch": 1.24, "grad_norm": 16.575325837437997, "learning_rate": 7.312458790715423e-06, "loss": 0.6897, "step": 7275 }, { "epoch": 1.24, "grad_norm": 21.78543946475332, "learning_rate": 7.308072844871679e-06, "loss": 0.6691, "step": 7280 }, { "epoch": 1.24, "grad_norm": 14.868152052821847, "learning_rate": 7.3036846409780175e-06, "loss": 0.6942, "step": 7285 }, { "epoch": 1.24, "grad_norm": 13.721946329731745, "learning_rate": 7.299294183327534e-06, "loss": 0.6692, "step": 7290 }, { "epoch": 1.24, "grad_norm": 7.006162810748249, "learning_rate": 7.294901476215537e-06, "loss": 0.6773, "step": 7295 }, { "epoch": 1.24, "grad_norm": 8.387648611571954, "learning_rate": 7.290506523939524e-06, "loss": 0.6833, "step": 7300 }, { "epoch": 1.24, "grad_norm": 18.110771012553165, "learning_rate": 7.286109330799198e-06, "loss": 0.6709, "step": 7305 }, { "epoch": 1.24, "grad_norm": 8.516727078372885, "learning_rate": 7.2817099010964545e-06, "loss": 0.6775, "step": 7310 }, { "epoch": 1.24, "grad_norm": 38.802526080079346, "learning_rate": 7.277308239135369e-06, "loss": 0.7053, "step": 7315 }, { "epoch": 1.24, "grad_norm": 32.465664109302125, "learning_rate": 7.27290434922221e-06, "loss": 0.6796, "step": 7320 }, { "epoch": 1.25, "grad_norm": 14.845671379668499, "learning_rate": 7.26849823566542e-06, "loss": 0.7167, "step": 7325 }, { "epoch": 1.25, "grad_norm": 17.518712361079984, "learning_rate": 7.264089902775618e-06, "loss": 0.7029, "step": 7330 }, { "epoch": 1.25, "grad_norm": 11.250471903999252, "learning_rate": 7.259679354865599e-06, "loss": 0.6757, "step": 7335 }, { "epoch": 1.25, "grad_norm": 13.190619690981686, "learning_rate": 7.255266596250316e-06, "loss": 0.6919, "step": 7340 }, { "epoch": 1.25, "grad_norm": 25.288326771663677, "learning_rate": 7.250851631246892e-06, "loss": 0.6594, "step": 7345 }, { "epoch": 1.25, "grad_norm": 6.247667804652537, "learning_rate": 7.246434464174607e-06, "loss": 0.6532, "step": 7350 }, { "epoch": 1.25, "grad_norm": 14.682710149971358, "learning_rate": 7.242015099354894e-06, "loss": 0.6585, "step": 7355 }, { "epoch": 1.25, "grad_norm": 27.239771872923306, "learning_rate": 7.2375935411113375e-06, "loss": 0.6923, "step": 7360 }, { "epoch": 1.25, "grad_norm": 12.142724413466496, "learning_rate": 7.2331697937696654e-06, "loss": 0.6701, "step": 7365 }, { "epoch": 1.25, "grad_norm": 19.053316941159274, "learning_rate": 7.2287438616577496e-06, "loss": 0.6655, "step": 7370 }, { "epoch": 1.25, "grad_norm": 13.044116655990347, "learning_rate": 7.2243157491056e-06, "loss": 0.6751, "step": 7375 }, { "epoch": 1.25, "grad_norm": 23.211289117451855, "learning_rate": 7.219885460445358e-06, "loss": 0.6909, "step": 7380 }, { "epoch": 1.26, "grad_norm": 19.716287423668646, "learning_rate": 7.215453000011295e-06, "loss": 0.6851, "step": 7385 }, { "epoch": 1.26, "grad_norm": 7.994891603259087, "learning_rate": 7.211018372139804e-06, "loss": 0.6716, "step": 7390 }, { "epoch": 1.26, "grad_norm": 7.834328037352239, "learning_rate": 7.2065815811694055e-06, "loss": 0.6806, "step": 7395 }, { "epoch": 1.26, "grad_norm": 6.668936421997721, "learning_rate": 7.202142631440728e-06, "loss": 0.6895, "step": 7400 }, { "epoch": 1.26, "grad_norm": 13.712019148611033, "learning_rate": 7.197701527296518e-06, "loss": 0.6713, "step": 7405 }, { "epoch": 1.26, "grad_norm": 7.575949154287882, "learning_rate": 7.193258273081626e-06, "loss": 0.6635, "step": 7410 }, { "epoch": 1.26, "grad_norm": 6.801693793087312, "learning_rate": 7.188812873143007e-06, "loss": 0.6769, "step": 7415 }, { "epoch": 1.26, "grad_norm": 12.726495724148158, "learning_rate": 7.184365331829719e-06, "loss": 0.6822, "step": 7420 }, { "epoch": 1.26, "grad_norm": 11.831922759330421, "learning_rate": 7.179915653492907e-06, "loss": 0.6427, "step": 7425 }, { "epoch": 1.26, "grad_norm": 19.151607114346472, "learning_rate": 7.175463842485815e-06, "loss": 0.6901, "step": 7430 }, { "epoch": 1.26, "grad_norm": 34.95697762852432, "learning_rate": 7.171009903163767e-06, "loss": 0.6907, "step": 7435 }, { "epoch": 1.26, "grad_norm": 20.584668811325557, "learning_rate": 7.1665538398841746e-06, "loss": 0.6821, "step": 7440 }, { "epoch": 1.27, "grad_norm": 16.792833479274165, "learning_rate": 7.162095657006523e-06, "loss": 0.6822, "step": 7445 }, { "epoch": 1.27, "grad_norm": 32.94215443286715, "learning_rate": 7.157635358892374e-06, "loss": 0.6733, "step": 7450 }, { "epoch": 1.27, "grad_norm": 46.902507165291105, "learning_rate": 7.153172949905357e-06, "loss": 0.685, "step": 7455 }, { "epoch": 1.27, "grad_norm": 37.05914931544486, "learning_rate": 7.1487084344111665e-06, "loss": 0.7108, "step": 7460 }, { "epoch": 1.27, "grad_norm": 7.901869763596967, "learning_rate": 7.144241816777559e-06, "loss": 0.678, "step": 7465 }, { "epoch": 1.27, "grad_norm": 31.0258209958262, "learning_rate": 7.139773101374346e-06, "loss": 0.6655, "step": 7470 }, { "epoch": 1.27, "grad_norm": 13.571638831792662, "learning_rate": 7.135302292573392e-06, "loss": 0.6815, "step": 7475 }, { "epoch": 1.27, "grad_norm": 14.721170462197327, "learning_rate": 7.130829394748613e-06, "loss": 0.6796, "step": 7480 }, { "epoch": 1.27, "grad_norm": 28.939363425798188, "learning_rate": 7.126354412275963e-06, "loss": 0.6667, "step": 7485 }, { "epoch": 1.27, "grad_norm": 24.334434659753086, "learning_rate": 7.121877349533438e-06, "loss": 0.6793, "step": 7490 }, { "epoch": 1.27, "grad_norm": 17.334053700215673, "learning_rate": 7.117398210901071e-06, "loss": 0.6796, "step": 7495 }, { "epoch": 1.28, "grad_norm": 13.658342801586011, "learning_rate": 7.112917000760923e-06, "loss": 0.668, "step": 7500 }, { "epoch": 1.28, "grad_norm": 7.31412533720228, "learning_rate": 7.108433723497081e-06, "loss": 0.6665, "step": 7505 }, { "epoch": 1.28, "grad_norm": 13.405988971024291, "learning_rate": 7.103948383495659e-06, "loss": 0.6748, "step": 7510 }, { "epoch": 1.28, "grad_norm": 18.334245035577876, "learning_rate": 7.099460985144784e-06, "loss": 0.682, "step": 7515 }, { "epoch": 1.28, "grad_norm": 35.91501841354662, "learning_rate": 7.094971532834601e-06, "loss": 0.6485, "step": 7520 }, { "epoch": 1.28, "grad_norm": 14.892892396192009, "learning_rate": 7.090480030957261e-06, "loss": 0.6903, "step": 7525 }, { "epoch": 1.28, "grad_norm": 17.90987688639934, "learning_rate": 7.0859864839069205e-06, "loss": 0.6636, "step": 7530 }, { "epoch": 1.28, "grad_norm": 21.575729746016865, "learning_rate": 7.081490896079738e-06, "loss": 0.6779, "step": 7535 }, { "epoch": 1.28, "grad_norm": 8.673927795129549, "learning_rate": 7.076993271873871e-06, "loss": 0.6817, "step": 7540 }, { "epoch": 1.28, "grad_norm": 11.597632317299354, "learning_rate": 7.072493615689464e-06, "loss": 0.6743, "step": 7545 }, { "epoch": 1.28, "grad_norm": 20.80844236847429, "learning_rate": 7.067991931928653e-06, "loss": 0.6773, "step": 7550 }, { "epoch": 1.28, "grad_norm": 11.088359347091162, "learning_rate": 7.063488224995555e-06, "loss": 0.6706, "step": 7555 }, { "epoch": 1.29, "grad_norm": 14.031454429113667, "learning_rate": 7.058982499296271e-06, "loss": 0.6812, "step": 7560 }, { "epoch": 1.29, "grad_norm": 16.189777793794025, "learning_rate": 7.0544747592388705e-06, "loss": 0.677, "step": 7565 }, { "epoch": 1.29, "grad_norm": 14.163766472519798, "learning_rate": 7.049965009233399e-06, "loss": 0.6629, "step": 7570 }, { "epoch": 1.29, "grad_norm": 6.698400255214532, "learning_rate": 7.045453253691868e-06, "loss": 0.6697, "step": 7575 }, { "epoch": 1.29, "grad_norm": 6.874532923503898, "learning_rate": 7.040939497028247e-06, "loss": 0.6755, "step": 7580 }, { "epoch": 1.29, "grad_norm": 16.421821705632198, "learning_rate": 7.0364237436584685e-06, "loss": 0.6592, "step": 7585 }, { "epoch": 1.29, "grad_norm": 9.231467938299327, "learning_rate": 7.031905998000414e-06, "loss": 0.662, "step": 7590 }, { "epoch": 1.29, "grad_norm": 36.750873525169936, "learning_rate": 7.027386264473914e-06, "loss": 0.6562, "step": 7595 }, { "epoch": 1.29, "grad_norm": 16.722367709743043, "learning_rate": 7.022864547500751e-06, "loss": 0.6704, "step": 7600 }, { "epoch": 1.29, "grad_norm": 7.028304612662503, "learning_rate": 7.018340851504637e-06, "loss": 0.6637, "step": 7605 }, { "epoch": 1.29, "grad_norm": 10.942067762843156, "learning_rate": 7.01381518091123e-06, "loss": 0.6681, "step": 7610 }, { "epoch": 1.29, "grad_norm": 13.57676824119045, "learning_rate": 7.009287540148113e-06, "loss": 0.6918, "step": 7615 }, { "epoch": 1.3, "grad_norm": 11.920375170456783, "learning_rate": 7.004757933644801e-06, "loss": 0.6575, "step": 7620 }, { "epoch": 1.3, "grad_norm": 14.322859449563097, "learning_rate": 7.000226365832729e-06, "loss": 0.682, "step": 7625 }, { "epoch": 1.3, "grad_norm": 9.312229271567947, "learning_rate": 6.995692841145253e-06, "loss": 0.6701, "step": 7630 }, { "epoch": 1.3, "grad_norm": 7.707392654136837, "learning_rate": 6.991157364017642e-06, "loss": 0.6654, "step": 7635 }, { "epoch": 1.3, "grad_norm": 6.796894963851315, "learning_rate": 6.986619938887076e-06, "loss": 0.6578, "step": 7640 }, { "epoch": 1.3, "grad_norm": 11.853471127640145, "learning_rate": 6.982080570192638e-06, "loss": 0.6677, "step": 7645 }, { "epoch": 1.3, "grad_norm": 12.12977247638146, "learning_rate": 6.977539262375318e-06, "loss": 0.658, "step": 7650 }, { "epoch": 1.3, "grad_norm": 12.699221620073573, "learning_rate": 6.972996019877998e-06, "loss": 0.659, "step": 7655 }, { "epoch": 1.3, "grad_norm": 7.424426213591365, "learning_rate": 6.968450847145456e-06, "loss": 0.6668, "step": 7660 }, { "epoch": 1.3, "grad_norm": 18.343320233271434, "learning_rate": 6.963903748624356e-06, "loss": 0.6363, "step": 7665 }, { "epoch": 1.3, "grad_norm": 27.69523332992964, "learning_rate": 6.959354728763247e-06, "loss": 0.6543, "step": 7670 }, { "epoch": 1.3, "grad_norm": 7.276146037274146, "learning_rate": 6.954803792012559e-06, "loss": 0.6855, "step": 7675 }, { "epoch": 1.31, "grad_norm": 5.700634905867604, "learning_rate": 6.950250942824595e-06, "loss": 0.6564, "step": 7680 }, { "epoch": 1.31, "grad_norm": 17.506750873615168, "learning_rate": 6.945696185653532e-06, "loss": 0.6768, "step": 7685 }, { "epoch": 1.31, "grad_norm": 16.791087185631472, "learning_rate": 6.941139524955409e-06, "loss": 0.6658, "step": 7690 }, { "epoch": 1.31, "grad_norm": 8.266362817079182, "learning_rate": 6.936580965188133e-06, "loss": 0.6703, "step": 7695 }, { "epoch": 1.31, "grad_norm": 16.05943328414273, "learning_rate": 6.9320205108114634e-06, "loss": 0.634, "step": 7700 }, { "epoch": 1.31, "grad_norm": 34.555429787519394, "learning_rate": 6.927458166287017e-06, "loss": 0.6676, "step": 7705 }, { "epoch": 1.31, "grad_norm": 7.106677063243689, "learning_rate": 6.9228939360782585e-06, "loss": 0.6558, "step": 7710 }, { "epoch": 1.31, "grad_norm": 15.555116707313184, "learning_rate": 6.918327824650497e-06, "loss": 0.6678, "step": 7715 }, { "epoch": 1.31, "grad_norm": 34.94775840826066, "learning_rate": 6.913759836470884e-06, "loss": 0.6676, "step": 7720 }, { "epoch": 1.31, "grad_norm": 23.503473494897815, "learning_rate": 6.9091899760084025e-06, "loss": 0.6712, "step": 7725 }, { "epoch": 1.31, "grad_norm": 7.198449503546897, "learning_rate": 6.904618247733874e-06, "loss": 0.6821, "step": 7730 }, { "epoch": 1.32, "grad_norm": 6.423391165147742, "learning_rate": 6.90004465611994e-06, "loss": 0.665, "step": 7735 }, { "epoch": 1.32, "grad_norm": 27.492408282325805, "learning_rate": 6.895469205641071e-06, "loss": 0.6656, "step": 7740 }, { "epoch": 1.32, "grad_norm": 27.131001091554225, "learning_rate": 6.890891900773552e-06, "loss": 0.6466, "step": 7745 }, { "epoch": 1.32, "grad_norm": 12.343118919497925, "learning_rate": 6.886312745995485e-06, "loss": 0.6613, "step": 7750 }, { "epoch": 1.32, "grad_norm": 26.29602942837788, "learning_rate": 6.881731745786779e-06, "loss": 0.6744, "step": 7755 }, { "epoch": 1.32, "grad_norm": 34.93533020196708, "learning_rate": 6.877148904629154e-06, "loss": 0.6661, "step": 7760 }, { "epoch": 1.32, "grad_norm": 20.015281493550773, "learning_rate": 6.872564227006122e-06, "loss": 0.6672, "step": 7765 }, { "epoch": 1.32, "grad_norm": 8.05475189958386, "learning_rate": 6.867977717403e-06, "loss": 0.6693, "step": 7770 }, { "epoch": 1.32, "grad_norm": 19.722271714945407, "learning_rate": 6.863389380306894e-06, "loss": 0.6785, "step": 7775 }, { "epoch": 1.32, "grad_norm": 13.594100948343948, "learning_rate": 6.858799220206698e-06, "loss": 0.6808, "step": 7780 }, { "epoch": 1.32, "grad_norm": 11.256070128775296, "learning_rate": 6.854207241593086e-06, "loss": 0.6684, "step": 7785 }, { "epoch": 1.32, "grad_norm": 7.284928024061565, "learning_rate": 6.849613448958518e-06, "loss": 0.6584, "step": 7790 }, { "epoch": 1.33, "grad_norm": 10.343801575420452, "learning_rate": 6.845017846797224e-06, "loss": 0.6453, "step": 7795 }, { "epoch": 1.33, "grad_norm": 5.988882531705171, "learning_rate": 6.840420439605207e-06, "loss": 0.6343, "step": 7800 }, { "epoch": 1.33, "grad_norm": 20.704821166327548, "learning_rate": 6.835821231880233e-06, "loss": 0.6438, "step": 7805 }, { "epoch": 1.33, "grad_norm": 23.222455538039853, "learning_rate": 6.831220228121831e-06, "loss": 0.6546, "step": 7810 }, { "epoch": 1.33, "grad_norm": 21.479424891039326, "learning_rate": 6.826617432831286e-06, "loss": 0.6563, "step": 7815 }, { "epoch": 1.33, "grad_norm": 42.80175925549784, "learning_rate": 6.822012850511641e-06, "loss": 0.6724, "step": 7820 }, { "epoch": 1.33, "grad_norm": 27.776123859447747, "learning_rate": 6.81740648566768e-06, "loss": 0.6648, "step": 7825 }, { "epoch": 1.33, "grad_norm": 9.208497254204323, "learning_rate": 6.812798342805933e-06, "loss": 0.6484, "step": 7830 }, { "epoch": 1.33, "grad_norm": 31.762321890202262, "learning_rate": 6.808188426434672e-06, "loss": 0.6459, "step": 7835 }, { "epoch": 1.33, "grad_norm": 8.06389345005782, "learning_rate": 6.803576741063903e-06, "loss": 0.6704, "step": 7840 }, { "epoch": 1.33, "grad_norm": 8.02164928639619, "learning_rate": 6.79896329120536e-06, "loss": 0.6617, "step": 7845 }, { "epoch": 1.33, "grad_norm": 16.709489451722483, "learning_rate": 6.794348081372507e-06, "loss": 0.6528, "step": 7850 }, { "epoch": 1.34, "grad_norm": 15.238437551559613, "learning_rate": 6.789731116080529e-06, "loss": 0.6407, "step": 7855 }, { "epoch": 1.34, "grad_norm": 11.206729482619727, "learning_rate": 6.785112399846328e-06, "loss": 0.6625, "step": 7860 }, { "epoch": 1.34, "grad_norm": 7.776138225675623, "learning_rate": 6.780491937188514e-06, "loss": 0.6543, "step": 7865 }, { "epoch": 1.34, "grad_norm": 11.706557883721104, "learning_rate": 6.775869732627417e-06, "loss": 0.6576, "step": 7870 }, { "epoch": 1.34, "grad_norm": 10.764273172448627, "learning_rate": 6.771245790685059e-06, "loss": 0.6767, "step": 7875 }, { "epoch": 1.34, "grad_norm": 52.38525602824595, "learning_rate": 6.766620115885172e-06, "loss": 0.6719, "step": 7880 }, { "epoch": 1.34, "grad_norm": 15.77477399033817, "learning_rate": 6.761992712753173e-06, "loss": 0.6544, "step": 7885 }, { "epoch": 1.34, "grad_norm": 12.49369698024647, "learning_rate": 6.757363585816178e-06, "loss": 0.6428, "step": 7890 }, { "epoch": 1.34, "grad_norm": 7.283967365331387, "learning_rate": 6.7527327396029875e-06, "loss": 0.6457, "step": 7895 }, { "epoch": 1.34, "grad_norm": 6.407807058273839, "learning_rate": 6.748100178644082e-06, "loss": 0.6507, "step": 7900 }, { "epoch": 1.34, "grad_norm": 15.881801966972798, "learning_rate": 6.743465907471623e-06, "loss": 0.663, "step": 7905 }, { "epoch": 1.34, "grad_norm": 18.894217087735242, "learning_rate": 6.738829930619438e-06, "loss": 0.6609, "step": 7910 }, { "epoch": 1.35, "grad_norm": 15.343860422804275, "learning_rate": 6.734192252623034e-06, "loss": 0.6484, "step": 7915 }, { "epoch": 1.35, "grad_norm": 12.817117023965405, "learning_rate": 6.729552878019574e-06, "loss": 0.6381, "step": 7920 }, { "epoch": 1.35, "grad_norm": 17.166428445424156, "learning_rate": 6.724911811347883e-06, "loss": 0.6493, "step": 7925 }, { "epoch": 1.35, "grad_norm": 10.180628433066271, "learning_rate": 6.7202690571484406e-06, "loss": 0.6647, "step": 7930 }, { "epoch": 1.35, "grad_norm": 14.866832953011489, "learning_rate": 6.71562461996338e-06, "loss": 0.6332, "step": 7935 }, { "epoch": 1.35, "grad_norm": 21.691113499249862, "learning_rate": 6.71097850433648e-06, "loss": 0.6726, "step": 7940 }, { "epoch": 1.35, "grad_norm": 10.55997210610206, "learning_rate": 6.706330714813161e-06, "loss": 0.6354, "step": 7945 }, { "epoch": 1.35, "grad_norm": 19.705598228965442, "learning_rate": 6.701681255940478e-06, "loss": 0.6406, "step": 7950 }, { "epoch": 1.35, "grad_norm": 7.295922874790215, "learning_rate": 6.697030132267124e-06, "loss": 0.6578, "step": 7955 }, { "epoch": 1.35, "grad_norm": 16.841094205265872, "learning_rate": 6.692377348343419e-06, "loss": 0.6552, "step": 7960 }, { "epoch": 1.35, "grad_norm": 22.95613214066316, "learning_rate": 6.687722908721308e-06, "loss": 0.6734, "step": 7965 }, { "epoch": 1.35, "grad_norm": 27.35634967120633, "learning_rate": 6.683066817954353e-06, "loss": 0.6497, "step": 7970 }, { "epoch": 1.36, "grad_norm": 14.417662535768118, "learning_rate": 6.678409080597732e-06, "loss": 0.6628, "step": 7975 }, { "epoch": 1.36, "grad_norm": 10.305778063010779, "learning_rate": 6.673749701208239e-06, "loss": 0.6278, "step": 7980 }, { "epoch": 1.36, "grad_norm": 12.067243154378666, "learning_rate": 6.669088684344266e-06, "loss": 0.6304, "step": 7985 }, { "epoch": 1.36, "grad_norm": 7.237713121087854, "learning_rate": 6.664426034565814e-06, "loss": 0.6533, "step": 7990 }, { "epoch": 1.36, "grad_norm": 6.6883233127354975, "learning_rate": 6.6597617564344796e-06, "loss": 0.6452, "step": 7995 }, { "epoch": 1.36, "grad_norm": 6.400448497043079, "learning_rate": 6.6550958545134515e-06, "loss": 0.6489, "step": 8000 }, { "epoch": 1.36, "grad_norm": 7.767147219100383, "learning_rate": 6.6504283333675065e-06, "loss": 0.6449, "step": 8005 }, { "epoch": 1.36, "grad_norm": 10.370384698947925, "learning_rate": 6.645759197563008e-06, "loss": 0.6429, "step": 8010 }, { "epoch": 1.36, "grad_norm": 7.061534266870285, "learning_rate": 6.641088451667894e-06, "loss": 0.662, "step": 8015 }, { "epoch": 1.36, "grad_norm": 21.18127236595691, "learning_rate": 6.636416100251687e-06, "loss": 0.6504, "step": 8020 }, { "epoch": 1.36, "grad_norm": 18.535465862975343, "learning_rate": 6.631742147885468e-06, "loss": 0.6772, "step": 8025 }, { "epoch": 1.37, "grad_norm": 6.964640726111356, "learning_rate": 6.627066599141895e-06, "loss": 0.6597, "step": 8030 }, { "epoch": 1.37, "grad_norm": 11.845507943939163, "learning_rate": 6.622389458595182e-06, "loss": 0.6469, "step": 8035 }, { "epoch": 1.37, "grad_norm": 16.080552123187886, "learning_rate": 6.617710730821103e-06, "loss": 0.6429, "step": 8040 }, { "epoch": 1.37, "grad_norm": 21.511449344925648, "learning_rate": 6.6130304203969805e-06, "loss": 0.6482, "step": 8045 }, { "epoch": 1.37, "grad_norm": 10.772854516081404, "learning_rate": 6.608348531901692e-06, "loss": 0.6619, "step": 8050 }, { "epoch": 1.37, "grad_norm": 12.625015490224003, "learning_rate": 6.603665069915654e-06, "loss": 0.6568, "step": 8055 }, { "epoch": 1.37, "grad_norm": 25.861303879556942, "learning_rate": 6.5989800390208226e-06, "loss": 0.6386, "step": 8060 }, { "epoch": 1.37, "grad_norm": 17.4888690477709, "learning_rate": 6.59429344380069e-06, "loss": 0.6436, "step": 8065 }, { "epoch": 1.37, "grad_norm": 12.131438242239497, "learning_rate": 6.5896052888402805e-06, "loss": 0.6476, "step": 8070 }, { "epoch": 1.37, "grad_norm": 12.257155345532185, "learning_rate": 6.584915578726141e-06, "loss": 0.6393, "step": 8075 }, { "epoch": 1.37, "grad_norm": 17.259442077739987, "learning_rate": 6.5802243180463425e-06, "loss": 0.6219, "step": 8080 }, { "epoch": 1.37, "grad_norm": 6.655363886815646, "learning_rate": 6.575531511390469e-06, "loss": 0.6438, "step": 8085 }, { "epoch": 1.38, "grad_norm": 15.89276103817776, "learning_rate": 6.570837163349624e-06, "loss": 0.6257, "step": 8090 }, { "epoch": 1.38, "grad_norm": 11.058137605602374, "learning_rate": 6.566141278516413e-06, "loss": 0.6516, "step": 8095 }, { "epoch": 1.38, "grad_norm": 48.71355207579282, "learning_rate": 6.561443861484946e-06, "loss": 0.6402, "step": 8100 }, { "epoch": 1.38, "grad_norm": 28.380628719561553, "learning_rate": 6.5567449168508346e-06, "loss": 0.654, "step": 8105 }, { "epoch": 1.38, "grad_norm": 6.411789938460549, "learning_rate": 6.552044449211181e-06, "loss": 0.6552, "step": 8110 }, { "epoch": 1.38, "grad_norm": 22.60346491316076, "learning_rate": 6.54734246316458e-06, "loss": 0.6531, "step": 8115 }, { "epoch": 1.38, "grad_norm": 11.151539620095777, "learning_rate": 6.542638963311112e-06, "loss": 0.649, "step": 8120 }, { "epoch": 1.38, "grad_norm": 29.874279752302918, "learning_rate": 6.537933954252338e-06, "loss": 0.6307, "step": 8125 }, { "epoch": 1.38, "grad_norm": 55.371100766921124, "learning_rate": 6.533227440591294e-06, "loss": 0.6719, "step": 8130 }, { "epoch": 1.38, "grad_norm": 13.23924904787888, "learning_rate": 6.5285194269324895e-06, "loss": 0.6554, "step": 8135 }, { "epoch": 1.38, "grad_norm": 6.542786323664106, "learning_rate": 6.523809917881902e-06, "loss": 0.6363, "step": 8140 }, { "epoch": 1.38, "grad_norm": 11.138369074140726, "learning_rate": 6.5190989180469736e-06, "loss": 0.6489, "step": 8145 }, { "epoch": 1.39, "grad_norm": 20.644221672684967, "learning_rate": 6.514386432036598e-06, "loss": 0.6514, "step": 8150 }, { "epoch": 1.39, "grad_norm": 7.53784502326818, "learning_rate": 6.5096724644611296e-06, "loss": 0.6505, "step": 8155 }, { "epoch": 1.39, "grad_norm": 13.832374181994506, "learning_rate": 6.5049570199323686e-06, "loss": 0.6431, "step": 8160 }, { "epoch": 1.39, "grad_norm": 9.328834025436, "learning_rate": 6.500240103063564e-06, "loss": 0.6243, "step": 8165 }, { "epoch": 1.39, "grad_norm": 15.916018663546673, "learning_rate": 6.495521718469402e-06, "loss": 0.6457, "step": 8170 }, { "epoch": 1.39, "grad_norm": 23.457435094958125, "learning_rate": 6.490801870766004e-06, "loss": 0.6568, "step": 8175 }, { "epoch": 1.39, "grad_norm": 35.150856775834455, "learning_rate": 6.486080564570925e-06, "loss": 0.6513, "step": 8180 }, { "epoch": 1.39, "grad_norm": 7.170893686186264, "learning_rate": 6.481357804503147e-06, "loss": 0.6362, "step": 8185 }, { "epoch": 1.39, "grad_norm": 15.889101730266102, "learning_rate": 6.476633595183073e-06, "loss": 0.6487, "step": 8190 }, { "epoch": 1.39, "grad_norm": 9.13019074213934, "learning_rate": 6.471907941232525e-06, "loss": 0.6449, "step": 8195 }, { "epoch": 1.39, "grad_norm": 16.939034783675478, "learning_rate": 6.467180847274737e-06, "loss": 0.6366, "step": 8200 }, { "epoch": 1.39, "grad_norm": 8.291379823253529, "learning_rate": 6.462452317934352e-06, "loss": 0.6348, "step": 8205 }, { "epoch": 1.4, "grad_norm": 8.535236612949914, "learning_rate": 6.45772235783742e-06, "loss": 0.6284, "step": 8210 }, { "epoch": 1.4, "grad_norm": 7.745474723331273, "learning_rate": 6.452990971611384e-06, "loss": 0.6245, "step": 8215 }, { "epoch": 1.4, "grad_norm": 29.761555067812456, "learning_rate": 6.448258163885092e-06, "loss": 0.6626, "step": 8220 }, { "epoch": 1.4, "grad_norm": 7.001809069262323, "learning_rate": 6.443523939288776e-06, "loss": 0.6446, "step": 8225 }, { "epoch": 1.4, "grad_norm": 14.06191759505353, "learning_rate": 6.438788302454053e-06, "loss": 0.6637, "step": 8230 }, { "epoch": 1.4, "grad_norm": 10.716149188351201, "learning_rate": 6.434051258013928e-06, "loss": 0.6606, "step": 8235 }, { "epoch": 1.4, "grad_norm": 5.812890077780472, "learning_rate": 6.429312810602777e-06, "loss": 0.6329, "step": 8240 }, { "epoch": 1.4, "grad_norm": 8.844538478736041, "learning_rate": 6.424572964856351e-06, "loss": 0.6463, "step": 8245 }, { "epoch": 1.4, "grad_norm": 15.81354924758096, "learning_rate": 6.4198317254117695e-06, "loss": 0.6431, "step": 8250 }, { "epoch": 1.4, "grad_norm": 15.492158449379378, "learning_rate": 6.4150890969075145e-06, "loss": 0.6432, "step": 8255 }, { "epoch": 1.4, "grad_norm": 6.759512578968948, "learning_rate": 6.410345083983427e-06, "loss": 0.6303, "step": 8260 }, { "epoch": 1.41, "grad_norm": 13.119349300664416, "learning_rate": 6.405599691280706e-06, "loss": 0.6543, "step": 8265 }, { "epoch": 1.41, "grad_norm": 33.850079790425276, "learning_rate": 6.400852923441892e-06, "loss": 0.6455, "step": 8270 }, { "epoch": 1.41, "grad_norm": 27.973834250364234, "learning_rate": 6.3961047851108795e-06, "loss": 0.6439, "step": 8275 }, { "epoch": 1.41, "grad_norm": 18.791402441709256, "learning_rate": 6.391355280932898e-06, "loss": 0.6526, "step": 8280 }, { "epoch": 1.41, "grad_norm": 6.8898792407939835, "learning_rate": 6.386604415554521e-06, "loss": 0.6257, "step": 8285 }, { "epoch": 1.41, "grad_norm": 6.302907884289335, "learning_rate": 6.381852193623641e-06, "loss": 0.6526, "step": 8290 }, { "epoch": 1.41, "grad_norm": 24.989620308397974, "learning_rate": 6.377098619789489e-06, "loss": 0.6402, "step": 8295 }, { "epoch": 1.41, "grad_norm": 27.03639392548826, "learning_rate": 6.372343698702612e-06, "loss": 0.6241, "step": 8300 }, { "epoch": 1.41, "grad_norm": 26.418474973434172, "learning_rate": 6.367587435014882e-06, "loss": 0.6298, "step": 8305 }, { "epoch": 1.41, "grad_norm": 15.467358507902656, "learning_rate": 6.362829833379475e-06, "loss": 0.6539, "step": 8310 }, { "epoch": 1.41, "grad_norm": 16.390598419534992, "learning_rate": 6.358070898450884e-06, "loss": 0.6575, "step": 8315 }, { "epoch": 1.41, "grad_norm": 22.18745112063696, "learning_rate": 6.353310634884901e-06, "loss": 0.6289, "step": 8320 }, { "epoch": 1.42, "grad_norm": 18.267245620225978, "learning_rate": 6.348549047338626e-06, "loss": 0.637, "step": 8325 }, { "epoch": 1.42, "grad_norm": 13.441981411328962, "learning_rate": 6.343786140470441e-06, "loss": 0.6309, "step": 8330 }, { "epoch": 1.42, "grad_norm": 14.388295631417312, "learning_rate": 6.339021918940031e-06, "loss": 0.6482, "step": 8335 }, { "epoch": 1.42, "grad_norm": 20.34027340053568, "learning_rate": 6.33425638740836e-06, "loss": 0.6316, "step": 8340 }, { "epoch": 1.42, "grad_norm": 18.72616964408831, "learning_rate": 6.3294895505376784e-06, "loss": 0.6445, "step": 8345 }, { "epoch": 1.42, "grad_norm": 16.599848289761585, "learning_rate": 6.324721412991509e-06, "loss": 0.6373, "step": 8350 }, { "epoch": 1.42, "grad_norm": 10.225968332536441, "learning_rate": 6.31995197943465e-06, "loss": 0.6426, "step": 8355 }, { "epoch": 1.42, "grad_norm": 19.216949694680935, "learning_rate": 6.315181254533167e-06, "loss": 0.631, "step": 8360 }, { "epoch": 1.42, "grad_norm": 8.470168620240177, "learning_rate": 6.310409242954392e-06, "loss": 0.6312, "step": 8365 }, { "epoch": 1.42, "grad_norm": 15.414773113372265, "learning_rate": 6.305635949366906e-06, "loss": 0.6385, "step": 8370 }, { "epoch": 1.42, "grad_norm": 34.07839646444887, "learning_rate": 6.300861378440557e-06, "loss": 0.6402, "step": 8375 }, { "epoch": 1.42, "grad_norm": 16.71560255107337, "learning_rate": 6.296085534846433e-06, "loss": 0.6279, "step": 8380 }, { "epoch": 1.43, "grad_norm": 37.75055046604799, "learning_rate": 6.291308423256873e-06, "loss": 0.622, "step": 8385 }, { "epoch": 1.43, "grad_norm": 31.051698238500787, "learning_rate": 6.286530048345452e-06, "loss": 0.6399, "step": 8390 }, { "epoch": 1.43, "grad_norm": 15.138592809970431, "learning_rate": 6.281750414786983e-06, "loss": 0.6423, "step": 8395 }, { "epoch": 1.43, "grad_norm": 29.79002494254909, "learning_rate": 6.276969527257513e-06, "loss": 0.6361, "step": 8400 }, { "epoch": 1.43, "grad_norm": 28.102533538762536, "learning_rate": 6.272187390434311e-06, "loss": 0.626, "step": 8405 }, { "epoch": 1.43, "grad_norm": 31.97698553168282, "learning_rate": 6.26740400899587e-06, "loss": 0.6422, "step": 8410 }, { "epoch": 1.43, "grad_norm": 9.465448441293175, "learning_rate": 6.262619387621902e-06, "loss": 0.6367, "step": 8415 }, { "epoch": 1.43, "grad_norm": 7.061458649998584, "learning_rate": 6.257833530993332e-06, "loss": 0.6335, "step": 8420 }, { "epoch": 1.43, "grad_norm": 15.88280726412742, "learning_rate": 6.2530464437922936e-06, "loss": 0.6312, "step": 8425 }, { "epoch": 1.43, "grad_norm": 18.12066800542379, "learning_rate": 6.2482581307021195e-06, "loss": 0.6124, "step": 8430 }, { "epoch": 1.43, "grad_norm": 16.94764296608338, "learning_rate": 6.243468596407348e-06, "loss": 0.615, "step": 8435 }, { "epoch": 1.43, "grad_norm": 7.307650693950313, "learning_rate": 6.238677845593709e-06, "loss": 0.629, "step": 8440 }, { "epoch": 1.44, "grad_norm": 8.164608367344734, "learning_rate": 6.233885882948124e-06, "loss": 0.6238, "step": 8445 }, { "epoch": 1.44, "grad_norm": 10.618845576266027, "learning_rate": 6.229092713158699e-06, "loss": 0.6307, "step": 8450 }, { "epoch": 1.44, "grad_norm": 5.999939335710288, "learning_rate": 6.22429834091472e-06, "loss": 0.6252, "step": 8455 }, { "epoch": 1.44, "grad_norm": 27.277384481971897, "learning_rate": 6.219502770906652e-06, "loss": 0.6255, "step": 8460 }, { "epoch": 1.44, "grad_norm": 26.658657226008486, "learning_rate": 6.214706007826133e-06, "loss": 0.6336, "step": 8465 }, { "epoch": 1.44, "grad_norm": 9.790938205285673, "learning_rate": 6.20990805636596e-06, "loss": 0.6216, "step": 8470 }, { "epoch": 1.44, "grad_norm": 10.181913156116149, "learning_rate": 6.205108921220102e-06, "loss": 0.6092, "step": 8475 }, { "epoch": 1.44, "grad_norm": 6.651943934098993, "learning_rate": 6.200308607083683e-06, "loss": 0.6349, "step": 8480 }, { "epoch": 1.44, "grad_norm": 10.036649904845357, "learning_rate": 6.195507118652977e-06, "loss": 0.6251, "step": 8485 }, { "epoch": 1.44, "grad_norm": 14.146870640781684, "learning_rate": 6.190704460625412e-06, "loss": 0.6372, "step": 8490 }, { "epoch": 1.44, "grad_norm": 11.004606776153171, "learning_rate": 6.185900637699555e-06, "loss": 0.6379, "step": 8495 }, { "epoch": 1.45, "grad_norm": 12.261848092144794, "learning_rate": 6.18109565457512e-06, "loss": 0.6273, "step": 8500 }, { "epoch": 1.45, "grad_norm": 8.517734637753321, "learning_rate": 6.176289515952949e-06, "loss": 0.6235, "step": 8505 }, { "epoch": 1.45, "grad_norm": 16.182064084964512, "learning_rate": 6.171482226535016e-06, "loss": 0.6338, "step": 8510 }, { "epoch": 1.45, "grad_norm": 27.011998601299425, "learning_rate": 6.1666737910244234e-06, "loss": 0.6279, "step": 8515 }, { "epoch": 1.45, "grad_norm": 7.461115547233499, "learning_rate": 6.161864214125393e-06, "loss": 0.6187, "step": 8520 }, { "epoch": 1.45, "grad_norm": 26.957756432468457, "learning_rate": 6.157053500543265e-06, "loss": 0.6401, "step": 8525 }, { "epoch": 1.45, "grad_norm": 14.81850691520542, "learning_rate": 6.152241654984488e-06, "loss": 0.6462, "step": 8530 }, { "epoch": 1.45, "grad_norm": 19.24439589408677, "learning_rate": 6.147428682156621e-06, "loss": 0.6354, "step": 8535 }, { "epoch": 1.45, "grad_norm": 36.13098754949811, "learning_rate": 6.142614586768325e-06, "loss": 0.6291, "step": 8540 }, { "epoch": 1.45, "grad_norm": 11.549996730002588, "learning_rate": 6.137799373529361e-06, "loss": 0.6183, "step": 8545 }, { "epoch": 1.45, "grad_norm": 19.749594900839334, "learning_rate": 6.132983047150579e-06, "loss": 0.6197, "step": 8550 }, { "epoch": 1.45, "grad_norm": 17.905561914987118, "learning_rate": 6.128165612343923e-06, "loss": 0.6175, "step": 8555 }, { "epoch": 1.46, "grad_norm": 15.52868932543812, "learning_rate": 6.1233470738224185e-06, "loss": 0.6218, "step": 8560 }, { "epoch": 1.46, "grad_norm": 14.267449464182201, "learning_rate": 6.118527436300175e-06, "loss": 0.6316, "step": 8565 }, { "epoch": 1.46, "grad_norm": 17.904992821265832, "learning_rate": 6.11370670449237e-06, "loss": 0.639, "step": 8570 }, { "epoch": 1.46, "grad_norm": 10.478604606870359, "learning_rate": 6.108884883115256e-06, "loss": 0.6413, "step": 8575 }, { "epoch": 1.46, "grad_norm": 19.41975081931992, "learning_rate": 6.1040619768861505e-06, "loss": 0.6376, "step": 8580 }, { "epoch": 1.46, "grad_norm": 28.473262465248553, "learning_rate": 6.099237990523437e-06, "loss": 0.619, "step": 8585 }, { "epoch": 1.46, "grad_norm": 6.790384150641972, "learning_rate": 6.094412928746546e-06, "loss": 0.6234, "step": 8590 }, { "epoch": 1.46, "grad_norm": 19.06048377191451, "learning_rate": 6.089586796275968e-06, "loss": 0.6226, "step": 8595 }, { "epoch": 1.46, "grad_norm": 23.277298880330747, "learning_rate": 6.084759597833239e-06, "loss": 0.6264, "step": 8600 }, { "epoch": 1.46, "grad_norm": 8.554662803269148, "learning_rate": 6.079931338140936e-06, "loss": 0.6305, "step": 8605 }, { "epoch": 1.46, "grad_norm": 19.270349394789058, "learning_rate": 6.075102021922677e-06, "loss": 0.6439, "step": 8610 }, { "epoch": 1.46, "grad_norm": 67.3469154291316, "learning_rate": 6.070271653903112e-06, "loss": 0.6062, "step": 8615 }, { "epoch": 1.47, "grad_norm": 27.770308384712457, "learning_rate": 6.06544023880792e-06, "loss": 0.6329, "step": 8620 }, { "epoch": 1.47, "grad_norm": 25.407479290475464, "learning_rate": 6.060607781363807e-06, "loss": 0.6315, "step": 8625 }, { "epoch": 1.47, "grad_norm": 15.509769672420044, "learning_rate": 6.055774286298492e-06, "loss": 0.6365, "step": 8630 }, { "epoch": 1.47, "grad_norm": 27.6938281196245, "learning_rate": 6.050939758340716e-06, "loss": 0.6473, "step": 8635 }, { "epoch": 1.47, "grad_norm": 6.082490505267428, "learning_rate": 6.046104202220228e-06, "loss": 0.614, "step": 8640 }, { "epoch": 1.47, "grad_norm": 10.77340992162146, "learning_rate": 6.041267622667784e-06, "loss": 0.6326, "step": 8645 }, { "epoch": 1.47, "grad_norm": 10.152095095597254, "learning_rate": 6.0364300244151385e-06, "loss": 0.6233, "step": 8650 }, { "epoch": 1.47, "grad_norm": 23.517070545128238, "learning_rate": 6.031591412195046e-06, "loss": 0.621, "step": 8655 }, { "epoch": 1.47, "grad_norm": 10.288564339786074, "learning_rate": 6.02675179074125e-06, "loss": 0.6504, "step": 8660 }, { "epoch": 1.47, "grad_norm": 6.512244767455876, "learning_rate": 6.021911164788483e-06, "loss": 0.6419, "step": 8665 }, { "epoch": 1.47, "grad_norm": 11.139035823232772, "learning_rate": 6.0170695390724595e-06, "loss": 0.6154, "step": 8670 }, { "epoch": 1.47, "grad_norm": 30.611473228247544, "learning_rate": 6.012226918329874e-06, "loss": 0.6163, "step": 8675 }, { "epoch": 1.48, "grad_norm": 23.228208297353323, "learning_rate": 6.007383307298391e-06, "loss": 0.6235, "step": 8680 }, { "epoch": 1.48, "grad_norm": 17.16179449932119, "learning_rate": 6.002538710716649e-06, "loss": 0.6376, "step": 8685 }, { "epoch": 1.48, "grad_norm": 11.613436166968452, "learning_rate": 5.997693133324244e-06, "loss": 0.6184, "step": 8690 }, { "epoch": 1.48, "grad_norm": 15.000653168372537, "learning_rate": 5.992846579861737e-06, "loss": 0.6334, "step": 8695 }, { "epoch": 1.48, "grad_norm": 21.01047179127842, "learning_rate": 5.98799905507064e-06, "loss": 0.6309, "step": 8700 }, { "epoch": 1.48, "grad_norm": 9.397516068200028, "learning_rate": 5.98315056369342e-06, "loss": 0.6194, "step": 8705 }, { "epoch": 1.48, "grad_norm": 10.549455568952771, "learning_rate": 5.978301110473486e-06, "loss": 0.6266, "step": 8710 }, { "epoch": 1.48, "grad_norm": 8.407050992301864, "learning_rate": 5.973450700155187e-06, "loss": 0.6169, "step": 8715 }, { "epoch": 1.48, "grad_norm": 9.507586579212115, "learning_rate": 5.968599337483814e-06, "loss": 0.6138, "step": 8720 }, { "epoch": 1.48, "grad_norm": 8.604919897561592, "learning_rate": 5.963747027205583e-06, "loss": 0.6308, "step": 8725 }, { "epoch": 1.48, "grad_norm": 8.18989665536131, "learning_rate": 5.95889377406764e-06, "loss": 0.6248, "step": 8730 }, { "epoch": 1.49, "grad_norm": 9.554625835371105, "learning_rate": 5.954039582818053e-06, "loss": 0.6174, "step": 8735 }, { "epoch": 1.49, "grad_norm": 20.92839999126903, "learning_rate": 5.949184458205811e-06, "loss": 0.6188, "step": 8740 }, { "epoch": 1.49, "grad_norm": 13.267413663263957, "learning_rate": 5.944328404980813e-06, "loss": 0.6414, "step": 8745 }, { "epoch": 1.49, "grad_norm": 23.29476861797064, "learning_rate": 5.939471427893862e-06, "loss": 0.6219, "step": 8750 }, { "epoch": 1.49, "grad_norm": 22.204976774549824, "learning_rate": 5.934613531696677e-06, "loss": 0.6137, "step": 8755 }, { "epoch": 1.49, "grad_norm": 38.20480863930992, "learning_rate": 5.929754721141863e-06, "loss": 0.6239, "step": 8760 }, { "epoch": 1.49, "grad_norm": 17.401029185094114, "learning_rate": 5.924895000982929e-06, "loss": 0.6106, "step": 8765 }, { "epoch": 1.49, "grad_norm": 27.96896375943163, "learning_rate": 5.920034375974267e-06, "loss": 0.625, "step": 8770 }, { "epoch": 1.49, "grad_norm": 15.712006261527762, "learning_rate": 5.91517285087116e-06, "loss": 0.6301, "step": 8775 }, { "epoch": 1.49, "grad_norm": 10.332878892952024, "learning_rate": 5.910310430429768e-06, "loss": 0.6277, "step": 8780 }, { "epoch": 1.49, "grad_norm": 8.420058266879154, "learning_rate": 5.905447119407132e-06, "loss": 0.6302, "step": 8785 }, { "epoch": 1.49, "grad_norm": 20.417067516686085, "learning_rate": 5.900582922561155e-06, "loss": 0.6167, "step": 8790 }, { "epoch": 1.5, "grad_norm": 7.297247346641591, "learning_rate": 5.895717844650613e-06, "loss": 0.6135, "step": 8795 }, { "epoch": 1.5, "grad_norm": 31.71322909627282, "learning_rate": 5.890851890435144e-06, "loss": 0.6064, "step": 8800 }, { "epoch": 1.5, "grad_norm": 15.536266619232359, "learning_rate": 5.885985064675243e-06, "loss": 0.609, "step": 8805 }, { "epoch": 1.5, "grad_norm": 40.11580915341827, "learning_rate": 5.881117372132257e-06, "loss": 0.6287, "step": 8810 }, { "epoch": 1.5, "grad_norm": 7.751306111236599, "learning_rate": 5.876248817568379e-06, "loss": 0.6161, "step": 8815 }, { "epoch": 1.5, "grad_norm": 13.192346826103622, "learning_rate": 5.871379405746647e-06, "loss": 0.629, "step": 8820 }, { "epoch": 1.5, "grad_norm": 18.071210970161694, "learning_rate": 5.8665091414309395e-06, "loss": 0.6142, "step": 8825 }, { "epoch": 1.5, "grad_norm": 6.558916531059064, "learning_rate": 5.861638029385969e-06, "loss": 0.6144, "step": 8830 }, { "epoch": 1.5, "grad_norm": 38.805705537435244, "learning_rate": 5.856766074377273e-06, "loss": 0.5987, "step": 8835 }, { "epoch": 1.5, "grad_norm": 29.412936674979335, "learning_rate": 5.851893281171217e-06, "loss": 0.6259, "step": 8840 }, { "epoch": 1.5, "grad_norm": 12.380415844971367, "learning_rate": 5.8470196545349865e-06, "loss": 0.6221, "step": 8845 }, { "epoch": 1.5, "grad_norm": 21.738500373870316, "learning_rate": 5.842145199236583e-06, "loss": 0.6163, "step": 8850 }, { "epoch": 1.51, "grad_norm": 44.689830654455925, "learning_rate": 5.837269920044815e-06, "loss": 0.6088, "step": 8855 }, { "epoch": 1.51, "grad_norm": 22.913294256785697, "learning_rate": 5.832393821729301e-06, "loss": 0.6165, "step": 8860 }, { "epoch": 1.51, "grad_norm": 22.51389955214667, "learning_rate": 5.827516909060459e-06, "loss": 0.5958, "step": 8865 }, { "epoch": 1.51, "grad_norm": 21.03446683194453, "learning_rate": 5.8226391868095064e-06, "loss": 0.6123, "step": 8870 }, { "epoch": 1.51, "grad_norm": 8.297799547031005, "learning_rate": 5.817760659748448e-06, "loss": 0.6207, "step": 8875 }, { "epoch": 1.51, "grad_norm": 18.64278180009031, "learning_rate": 5.812881332650079e-06, "loss": 0.6187, "step": 8880 }, { "epoch": 1.51, "grad_norm": 18.235311973907905, "learning_rate": 5.808001210287978e-06, "loss": 0.6083, "step": 8885 }, { "epoch": 1.51, "grad_norm": 14.63252366698247, "learning_rate": 5.803120297436498e-06, "loss": 0.6127, "step": 8890 }, { "epoch": 1.51, "grad_norm": 21.192227497855644, "learning_rate": 5.7982385988707705e-06, "loss": 0.6142, "step": 8895 }, { "epoch": 1.51, "grad_norm": 19.526216402550123, "learning_rate": 5.793356119366689e-06, "loss": 0.6367, "step": 8900 }, { "epoch": 1.51, "grad_norm": 7.321169117285131, "learning_rate": 5.788472863700918e-06, "loss": 0.6125, "step": 8905 }, { "epoch": 1.51, "grad_norm": 10.52084507199846, "learning_rate": 5.7835888366508785e-06, "loss": 0.6203, "step": 8910 }, { "epoch": 1.52, "grad_norm": 6.1271021467657425, "learning_rate": 5.778704042994744e-06, "loss": 0.6008, "step": 8915 }, { "epoch": 1.52, "grad_norm": 7.254171708462301, "learning_rate": 5.77381848751144e-06, "loss": 0.6138, "step": 8920 }, { "epoch": 1.52, "grad_norm": 11.320024335417965, "learning_rate": 5.76893217498064e-06, "loss": 0.618, "step": 8925 }, { "epoch": 1.52, "grad_norm": 16.131875846221742, "learning_rate": 5.764045110182752e-06, "loss": 0.6103, "step": 8930 }, { "epoch": 1.52, "grad_norm": 10.229969236321889, "learning_rate": 5.759157297898924e-06, "loss": 0.6286, "step": 8935 }, { "epoch": 1.52, "grad_norm": 9.203086200392013, "learning_rate": 5.754268742911037e-06, "loss": 0.6078, "step": 8940 }, { "epoch": 1.52, "grad_norm": 13.337291102785596, "learning_rate": 5.749379450001693e-06, "loss": 0.6291, "step": 8945 }, { "epoch": 1.52, "grad_norm": 6.130884826076313, "learning_rate": 5.74448942395422e-06, "loss": 0.615, "step": 8950 }, { "epoch": 1.52, "grad_norm": 7.161302708063478, "learning_rate": 5.739598669552664e-06, "loss": 0.6025, "step": 8955 }, { "epoch": 1.52, "grad_norm": 10.614457781504585, "learning_rate": 5.73470719158178e-06, "loss": 0.5931, "step": 8960 }, { "epoch": 1.52, "grad_norm": 16.727423296416706, "learning_rate": 5.729814994827034e-06, "loss": 0.6256, "step": 8965 }, { "epoch": 1.52, "grad_norm": 20.883727699858422, "learning_rate": 5.724922084074595e-06, "loss": 0.6195, "step": 8970 }, { "epoch": 1.53, "grad_norm": 12.987766905601958, "learning_rate": 5.720028464111326e-06, "loss": 0.6148, "step": 8975 }, { "epoch": 1.53, "grad_norm": 7.839778190277152, "learning_rate": 5.715134139724792e-06, "loss": 0.5994, "step": 8980 }, { "epoch": 1.53, "grad_norm": 15.502188480226746, "learning_rate": 5.710239115703238e-06, "loss": 0.5999, "step": 8985 }, { "epoch": 1.53, "grad_norm": 16.162420148029522, "learning_rate": 5.705343396835602e-06, "loss": 0.6065, "step": 8990 }, { "epoch": 1.53, "grad_norm": 16.526083835319188, "learning_rate": 5.7004469879114955e-06, "loss": 0.6127, "step": 8995 }, { "epoch": 1.53, "grad_norm": 13.161343365344532, "learning_rate": 5.6955498937212074e-06, "loss": 0.6183, "step": 9000 }, { "epoch": 1.53, "grad_norm": 13.014971463269138, "learning_rate": 5.6906521190556976e-06, "loss": 0.6128, "step": 9005 }, { "epoch": 1.53, "grad_norm": 9.317961414655818, "learning_rate": 5.685753668706591e-06, "loss": 0.6106, "step": 9010 }, { "epoch": 1.53, "grad_norm": 8.993800769863125, "learning_rate": 5.680854547466174e-06, "loss": 0.6173, "step": 9015 }, { "epoch": 1.53, "grad_norm": 18.032312729410066, "learning_rate": 5.67595476012739e-06, "loss": 0.6204, "step": 9020 }, { "epoch": 1.53, "grad_norm": 6.913492906194817, "learning_rate": 5.671054311483833e-06, "loss": 0.5956, "step": 9025 }, { "epoch": 1.54, "grad_norm": 12.011346666019541, "learning_rate": 5.66615320632974e-06, "loss": 0.6032, "step": 9030 }, { "epoch": 1.54, "grad_norm": 7.120145904095579, "learning_rate": 5.66125144946e-06, "loss": 0.6298, "step": 9035 }, { "epoch": 1.54, "grad_norm": 9.41274470876883, "learning_rate": 5.6563490456701296e-06, "loss": 0.587, "step": 9040 }, { "epoch": 1.54, "grad_norm": 10.915343026406369, "learning_rate": 5.6514459997562855e-06, "loss": 0.6216, "step": 9045 }, { "epoch": 1.54, "grad_norm": 5.895671288900355, "learning_rate": 5.64654231651525e-06, "loss": 0.6183, "step": 9050 }, { "epoch": 1.54, "grad_norm": 23.881102306393533, "learning_rate": 5.641638000744425e-06, "loss": 0.6228, "step": 9055 }, { "epoch": 1.54, "grad_norm": 20.546332683774185, "learning_rate": 5.636733057241839e-06, "loss": 0.6191, "step": 9060 }, { "epoch": 1.54, "grad_norm": 14.134020839761392, "learning_rate": 5.631827490806128e-06, "loss": 0.5941, "step": 9065 }, { "epoch": 1.54, "grad_norm": 10.355455888132816, "learning_rate": 5.626921306236541e-06, "loss": 0.5877, "step": 9070 }, { "epoch": 1.54, "grad_norm": 30.435020932897146, "learning_rate": 5.622014508332932e-06, "loss": 0.6148, "step": 9075 }, { "epoch": 1.54, "grad_norm": 16.35724195784234, "learning_rate": 5.617107101895751e-06, "loss": 0.6164, "step": 9080 }, { "epoch": 1.54, "grad_norm": 7.154665214652813, "learning_rate": 5.6121990917260474e-06, "loss": 0.6191, "step": 9085 }, { "epoch": 1.55, "grad_norm": 16.512763141117652, "learning_rate": 5.607290482625461e-06, "loss": 0.6041, "step": 9090 }, { "epoch": 1.55, "grad_norm": 7.269055262135025, "learning_rate": 5.602381279396213e-06, "loss": 0.6069, "step": 9095 }, { "epoch": 1.55, "grad_norm": 18.536300771361045, "learning_rate": 5.5974714868411125e-06, "loss": 0.6053, "step": 9100 }, { "epoch": 1.55, "grad_norm": 10.77546270410342, "learning_rate": 5.592561109763542e-06, "loss": 0.6037, "step": 9105 }, { "epoch": 1.55, "grad_norm": 13.052644342202596, "learning_rate": 5.587650152967454e-06, "loss": 0.609, "step": 9110 }, { "epoch": 1.55, "grad_norm": 11.336728354774031, "learning_rate": 5.582738621257372e-06, "loss": 0.6046, "step": 9115 }, { "epoch": 1.55, "grad_norm": 6.189860501039402, "learning_rate": 5.57782651943838e-06, "loss": 0.6294, "step": 9120 }, { "epoch": 1.55, "grad_norm": 15.104760202629917, "learning_rate": 5.572913852316116e-06, "loss": 0.6019, "step": 9125 }, { "epoch": 1.55, "grad_norm": 10.372469587828709, "learning_rate": 5.56800062469678e-06, "loss": 0.5982, "step": 9130 }, { "epoch": 1.55, "grad_norm": 26.10833699620634, "learning_rate": 5.563086841387111e-06, "loss": 0.5945, "step": 9135 }, { "epoch": 1.55, "grad_norm": 5.931729991395199, "learning_rate": 5.558172507194397e-06, "loss": 0.6159, "step": 9140 }, { "epoch": 1.55, "grad_norm": 9.704437918283011, "learning_rate": 5.5532576269264635e-06, "loss": 0.6167, "step": 9145 }, { "epoch": 1.56, "grad_norm": 9.00692493270044, "learning_rate": 5.5483422053916735e-06, "loss": 0.5915, "step": 9150 }, { "epoch": 1.56, "grad_norm": 9.21951340821045, "learning_rate": 5.543426247398912e-06, "loss": 0.5967, "step": 9155 }, { "epoch": 1.56, "grad_norm": 5.8956292819034815, "learning_rate": 5.538509757757594e-06, "loss": 0.6109, "step": 9160 }, { "epoch": 1.56, "grad_norm": 7.693620673196478, "learning_rate": 5.533592741277658e-06, "loss": 0.6159, "step": 9165 }, { "epoch": 1.56, "grad_norm": 8.139425257113912, "learning_rate": 5.528675202769549e-06, "loss": 0.5993, "step": 9170 }, { "epoch": 1.56, "grad_norm": 9.863801913245817, "learning_rate": 5.52375714704423e-06, "loss": 0.6, "step": 9175 }, { "epoch": 1.56, "grad_norm": 18.185617291207205, "learning_rate": 5.518838578913167e-06, "loss": 0.6063, "step": 9180 }, { "epoch": 1.56, "grad_norm": 7.2860494408934295, "learning_rate": 5.513919503188328e-06, "loss": 0.591, "step": 9185 }, { "epoch": 1.56, "grad_norm": 34.38305181082742, "learning_rate": 5.508999924682178e-06, "loss": 0.6076, "step": 9190 }, { "epoch": 1.56, "grad_norm": 22.92435759386956, "learning_rate": 5.504079848207671e-06, "loss": 0.6167, "step": 9195 }, { "epoch": 1.56, "grad_norm": 12.760529664653179, "learning_rate": 5.499159278578253e-06, "loss": 0.6014, "step": 9200 }, { "epoch": 1.56, "grad_norm": 13.628714969215856, "learning_rate": 5.4942382206078495e-06, "loss": 0.5977, "step": 9205 }, { "epoch": 1.57, "grad_norm": 16.656187450738024, "learning_rate": 5.489316679110864e-06, "loss": 0.5956, "step": 9210 }, { "epoch": 1.57, "grad_norm": 7.2821838796463245, "learning_rate": 5.484394658902173e-06, "loss": 0.6061, "step": 9215 }, { "epoch": 1.57, "grad_norm": 19.831834598197847, "learning_rate": 5.479472164797124e-06, "loss": 0.5959, "step": 9220 }, { "epoch": 1.57, "grad_norm": 10.214689267998624, "learning_rate": 5.474549201611521e-06, "loss": 0.6063, "step": 9225 }, { "epoch": 1.57, "grad_norm": 6.16812247258966, "learning_rate": 5.4696257741616366e-06, "loss": 0.5894, "step": 9230 }, { "epoch": 1.57, "grad_norm": 6.923420658187072, "learning_rate": 5.464701887264188e-06, "loss": 0.6208, "step": 9235 }, { "epoch": 1.57, "grad_norm": 16.284791283442523, "learning_rate": 5.45977754573635e-06, "loss": 0.6002, "step": 9240 }, { "epoch": 1.57, "grad_norm": 9.352219537142107, "learning_rate": 5.454852754395738e-06, "loss": 0.5926, "step": 9245 }, { "epoch": 1.57, "grad_norm": 7.119697378052444, "learning_rate": 5.449927518060407e-06, "loss": 0.5869, "step": 9250 }, { "epoch": 1.57, "grad_norm": 9.266374499933223, "learning_rate": 5.44500184154885e-06, "loss": 0.5999, "step": 9255 }, { "epoch": 1.57, "grad_norm": 7.297673633134746, "learning_rate": 5.44007572967999e-06, "loss": 0.5976, "step": 9260 }, { "epoch": 1.58, "grad_norm": 21.96269904481905, "learning_rate": 5.435149187273172e-06, "loss": 0.6065, "step": 9265 }, { "epoch": 1.58, "grad_norm": 7.492653965184815, "learning_rate": 5.430222219148168e-06, "loss": 0.5974, "step": 9270 }, { "epoch": 1.58, "grad_norm": 23.22740569704164, "learning_rate": 5.4252948301251615e-06, "loss": 0.5934, "step": 9275 }, { "epoch": 1.58, "grad_norm": 8.173788220397405, "learning_rate": 5.420367025024753e-06, "loss": 0.6119, "step": 9280 }, { "epoch": 1.58, "grad_norm": 6.937326719215955, "learning_rate": 5.415438808667944e-06, "loss": 0.6035, "step": 9285 }, { "epoch": 1.58, "grad_norm": 25.668499087828007, "learning_rate": 5.410510185876146e-06, "loss": 0.6221, "step": 9290 }, { "epoch": 1.58, "grad_norm": 8.964394347664827, "learning_rate": 5.405581161471157e-06, "loss": 0.6056, "step": 9295 }, { "epoch": 1.58, "grad_norm": 6.995289947861624, "learning_rate": 5.40065174027518e-06, "loss": 0.5984, "step": 9300 }, { "epoch": 1.58, "grad_norm": 7.214810192286005, "learning_rate": 5.3957219271108e-06, "loss": 0.5886, "step": 9305 }, { "epoch": 1.58, "grad_norm": 11.989591849187818, "learning_rate": 5.390791726800983e-06, "loss": 0.6138, "step": 9310 }, { "epoch": 1.58, "grad_norm": 8.808671808767826, "learning_rate": 5.385861144169081e-06, "loss": 0.6074, "step": 9315 }, { "epoch": 1.58, "grad_norm": 7.561526838673997, "learning_rate": 5.3809301840388126e-06, "loss": 0.5923, "step": 9320 }, { "epoch": 1.59, "grad_norm": 5.999545774089387, "learning_rate": 5.375998851234272e-06, "loss": 0.6118, "step": 9325 }, { "epoch": 1.59, "grad_norm": 18.190251238079696, "learning_rate": 5.371067150579912e-06, "loss": 0.5854, "step": 9330 }, { "epoch": 1.59, "grad_norm": 9.054453094014715, "learning_rate": 5.366135086900552e-06, "loss": 0.5986, "step": 9335 }, { "epoch": 1.59, "grad_norm": 18.541785491546758, "learning_rate": 5.361202665021359e-06, "loss": 0.6017, "step": 9340 }, { "epoch": 1.59, "grad_norm": 15.566548150106643, "learning_rate": 5.356269889767857e-06, "loss": 0.5875, "step": 9345 }, { "epoch": 1.59, "grad_norm": 8.093235129596696, "learning_rate": 5.351336765965913e-06, "loss": 0.6059, "step": 9350 }, { "epoch": 1.59, "grad_norm": 16.523942808346206, "learning_rate": 5.3464032984417345e-06, "loss": 0.5845, "step": 9355 }, { "epoch": 1.59, "grad_norm": 9.225232390971318, "learning_rate": 5.341469492021866e-06, "loss": 0.5858, "step": 9360 }, { "epoch": 1.59, "grad_norm": 10.56203769259536, "learning_rate": 5.336535351533182e-06, "loss": 0.5957, "step": 9365 }, { "epoch": 1.59, "grad_norm": 8.809870152961832, "learning_rate": 5.331600881802887e-06, "loss": 0.5988, "step": 9370 }, { "epoch": 1.59, "grad_norm": 8.809022292311157, "learning_rate": 5.326666087658505e-06, "loss": 0.607, "step": 9375 }, { "epoch": 1.59, "grad_norm": 12.171141932866165, "learning_rate": 5.321730973927879e-06, "loss": 0.6137, "step": 9380 }, { "epoch": 1.6, "grad_norm": 6.533028033705287, "learning_rate": 5.316795545439162e-06, "loss": 0.5943, "step": 9385 }, { "epoch": 1.6, "grad_norm": 11.502217924497344, "learning_rate": 5.31185980702082e-06, "loss": 0.5949, "step": 9390 }, { "epoch": 1.6, "grad_norm": 6.901020453643195, "learning_rate": 5.306923763501616e-06, "loss": 0.6005, "step": 9395 }, { "epoch": 1.6, "grad_norm": 19.593915315383, "learning_rate": 5.301987419710617e-06, "loss": 0.6089, "step": 9400 }, { "epoch": 1.6, "grad_norm": 9.3117371932334, "learning_rate": 5.297050780477179e-06, "loss": 0.5964, "step": 9405 }, { "epoch": 1.6, "grad_norm": 8.826971591181108, "learning_rate": 5.29211385063095e-06, "loss": 0.592, "step": 9410 }, { "epoch": 1.6, "grad_norm": 31.45574299091673, "learning_rate": 5.287176635001863e-06, "loss": 0.6082, "step": 9415 }, { "epoch": 1.6, "grad_norm": 15.088086236450218, "learning_rate": 5.282239138420127e-06, "loss": 0.5993, "step": 9420 }, { "epoch": 1.6, "grad_norm": 42.54559890377032, "learning_rate": 5.277301365716228e-06, "loss": 0.6037, "step": 9425 }, { "epoch": 1.6, "grad_norm": 6.003789314693376, "learning_rate": 5.272363321720926e-06, "loss": 0.6034, "step": 9430 }, { "epoch": 1.6, "grad_norm": 36.70258270216636, "learning_rate": 5.267425011265239e-06, "loss": 0.6, "step": 9435 }, { "epoch": 1.6, "grad_norm": 20.528336862629228, "learning_rate": 5.26248643918045e-06, "loss": 0.5939, "step": 9440 }, { "epoch": 1.61, "grad_norm": 26.2131550907509, "learning_rate": 5.2575476102980995e-06, "loss": 0.5955, "step": 9445 }, { "epoch": 1.61, "grad_norm": 29.015949407973547, "learning_rate": 5.252608529449973e-06, "loss": 0.5867, "step": 9450 }, { "epoch": 1.61, "grad_norm": 5.981498137225529, "learning_rate": 5.2476692014681095e-06, "loss": 0.6001, "step": 9455 }, { "epoch": 1.61, "grad_norm": 28.895944965854035, "learning_rate": 5.242729631184786e-06, "loss": 0.6044, "step": 9460 }, { "epoch": 1.61, "grad_norm": 11.041760611777507, "learning_rate": 5.237789823432517e-06, "loss": 0.5943, "step": 9465 }, { "epoch": 1.61, "grad_norm": 8.521764242332235, "learning_rate": 5.232849783044052e-06, "loss": 0.5826, "step": 9470 }, { "epoch": 1.61, "grad_norm": 10.398708372866475, "learning_rate": 5.227909514852361e-06, "loss": 0.607, "step": 9475 }, { "epoch": 1.61, "grad_norm": 20.215730658188253, "learning_rate": 5.222969023690645e-06, "loss": 0.5931, "step": 9480 }, { "epoch": 1.61, "grad_norm": 8.599945335944277, "learning_rate": 5.218028314392318e-06, "loss": 0.6104, "step": 9485 }, { "epoch": 1.61, "grad_norm": 7.166393268974675, "learning_rate": 5.213087391791013e-06, "loss": 0.5799, "step": 9490 }, { "epoch": 1.61, "grad_norm": 10.588862984883596, "learning_rate": 5.208146260720565e-06, "loss": 0.5951, "step": 9495 }, { "epoch": 1.62, "grad_norm": 17.894396498559342, "learning_rate": 5.203204926015014e-06, "loss": 0.5937, "step": 9500 }, { "epoch": 1.62, "grad_norm": 27.202565212697937, "learning_rate": 5.1982633925086035e-06, "loss": 0.5935, "step": 9505 }, { "epoch": 1.62, "grad_norm": 23.281425496642882, "learning_rate": 5.1933216650357685e-06, "loss": 0.5886, "step": 9510 }, { "epoch": 1.62, "grad_norm": 22.44444843338854, "learning_rate": 5.188379748431135e-06, "loss": 0.6094, "step": 9515 }, { "epoch": 1.62, "grad_norm": 10.051494013021621, "learning_rate": 5.1834376475295126e-06, "loss": 0.5863, "step": 9520 }, { "epoch": 1.62, "grad_norm": 13.514138256595807, "learning_rate": 5.17849536716589e-06, "loss": 0.5763, "step": 9525 }, { "epoch": 1.62, "grad_norm": 7.214611501380019, "learning_rate": 5.173552912175437e-06, "loss": 0.5877, "step": 9530 }, { "epoch": 1.62, "grad_norm": 7.452926704996535, "learning_rate": 5.168610287393489e-06, "loss": 0.5908, "step": 9535 }, { "epoch": 1.62, "grad_norm": 9.43734189421902, "learning_rate": 5.163667497655549e-06, "loss": 0.5944, "step": 9540 }, { "epoch": 1.62, "grad_norm": 5.50824964629719, "learning_rate": 5.15872454779728e-06, "loss": 0.5913, "step": 9545 }, { "epoch": 1.62, "grad_norm": 6.772975708340726, "learning_rate": 5.153781442654505e-06, "loss": 0.5899, "step": 9550 }, { "epoch": 1.62, "grad_norm": 5.791507395852812, "learning_rate": 5.148838187063199e-06, "loss": 0.5664, "step": 9555 }, { "epoch": 1.63, "grad_norm": 9.19245147769233, "learning_rate": 5.143894785859478e-06, "loss": 0.579, "step": 9560 }, { "epoch": 1.63, "grad_norm": 7.188990357721228, "learning_rate": 5.138951243879608e-06, "loss": 0.5956, "step": 9565 }, { "epoch": 1.63, "grad_norm": 6.134172803566005, "learning_rate": 5.134007565959986e-06, "loss": 0.5759, "step": 9570 }, { "epoch": 1.63, "grad_norm": 8.083896517270064, "learning_rate": 5.1290637569371504e-06, "loss": 0.5877, "step": 9575 }, { "epoch": 1.63, "grad_norm": 19.32150938461579, "learning_rate": 5.124119821647759e-06, "loss": 0.5677, "step": 9580 }, { "epoch": 1.63, "grad_norm": 7.6213419594999525, "learning_rate": 5.119175764928599e-06, "loss": 0.5851, "step": 9585 }, { "epoch": 1.63, "grad_norm": 7.008101960171448, "learning_rate": 5.114231591616573e-06, "loss": 0.5703, "step": 9590 }, { "epoch": 1.63, "grad_norm": 5.606662403436859, "learning_rate": 5.1092873065487e-06, "loss": 0.5754, "step": 9595 }, { "epoch": 1.63, "grad_norm": 13.279716551224174, "learning_rate": 5.104342914562107e-06, "loss": 0.5706, "step": 9600 }, { "epoch": 1.63, "grad_norm": 6.533706059181452, "learning_rate": 5.0993984204940265e-06, "loss": 0.5842, "step": 9605 }, { "epoch": 1.63, "grad_norm": 27.84735675215916, "learning_rate": 5.0944538291817904e-06, "loss": 0.569, "step": 9610 }, { "epoch": 1.63, "grad_norm": 5.401334890929006, "learning_rate": 5.089509145462825e-06, "loss": 0.5832, "step": 9615 }, { "epoch": 1.64, "grad_norm": 10.823274768680948, "learning_rate": 5.084564374174649e-06, "loss": 0.5911, "step": 9620 }, { "epoch": 1.64, "grad_norm": 8.325720202355667, "learning_rate": 5.079619520154865e-06, "loss": 0.5927, "step": 9625 }, { "epoch": 1.64, "grad_norm": 12.803797542503668, "learning_rate": 5.074674588241157e-06, "loss": 0.5723, "step": 9630 }, { "epoch": 1.64, "grad_norm": 7.970088435158851, "learning_rate": 5.069729583271285e-06, "loss": 0.5959, "step": 9635 }, { "epoch": 1.64, "grad_norm": 24.213551346003864, "learning_rate": 5.0647845100830805e-06, "loss": 0.5713, "step": 9640 }, { "epoch": 1.64, "grad_norm": 11.224768652073715, "learning_rate": 5.059839373514441e-06, "loss": 0.5854, "step": 9645 }, { "epoch": 1.64, "grad_norm": 5.8733589879068955, "learning_rate": 5.05489417840333e-06, "loss": 0.5768, "step": 9650 }, { "epoch": 1.64, "grad_norm": 8.377198052908513, "learning_rate": 5.049948929587764e-06, "loss": 0.5836, "step": 9655 }, { "epoch": 1.64, "grad_norm": 8.73180875581141, "learning_rate": 5.045003631905813e-06, "loss": 0.5842, "step": 9660 }, { "epoch": 1.64, "grad_norm": 25.008213493513512, "learning_rate": 5.040058290195594e-06, "loss": 0.5947, "step": 9665 }, { "epoch": 1.64, "grad_norm": 6.41992480953314, "learning_rate": 5.0351129092952685e-06, "loss": 0.5729, "step": 9670 }, { "epoch": 1.64, "grad_norm": 10.591693362525524, "learning_rate": 5.030167494043039e-06, "loss": 0.5941, "step": 9675 }, { "epoch": 1.65, "grad_norm": 13.601293521527685, "learning_rate": 5.025222049277136e-06, "loss": 0.5862, "step": 9680 }, { "epoch": 1.65, "grad_norm": 7.893404495312224, "learning_rate": 5.020276579835821e-06, "loss": 0.5965, "step": 9685 }, { "epoch": 1.65, "grad_norm": 13.84532963343138, "learning_rate": 5.0153310905573815e-06, "loss": 0.5938, "step": 9690 }, { "epoch": 1.65, "grad_norm": 7.06878354567659, "learning_rate": 5.0103855862801235e-06, "loss": 0.5921, "step": 9695 }, { "epoch": 1.65, "grad_norm": 31.63918706961253, "learning_rate": 5.005440071842365e-06, "loss": 0.5951, "step": 9700 }, { "epoch": 1.65, "grad_norm": 12.578560024309926, "learning_rate": 5.000494552082437e-06, "loss": 0.5893, "step": 9705 }, { "epoch": 1.65, "grad_norm": 24.101825694531986, "learning_rate": 4.995549031838675e-06, "loss": 0.583, "step": 9710 }, { "epoch": 1.65, "grad_norm": 29.86430102484688, "learning_rate": 4.990603515949416e-06, "loss": 0.5828, "step": 9715 }, { "epoch": 1.65, "grad_norm": 10.822902890432811, "learning_rate": 4.985658009252992e-06, "loss": 0.5617, "step": 9720 }, { "epoch": 1.65, "grad_norm": 26.90223147024694, "learning_rate": 4.980712516587724e-06, "loss": 0.5908, "step": 9725 }, { "epoch": 1.65, "grad_norm": 34.950449323343015, "learning_rate": 4.975767042791921e-06, "loss": 0.5766, "step": 9730 }, { "epoch": 1.66, "grad_norm": 12.257839463549047, "learning_rate": 4.970821592703874e-06, "loss": 0.5859, "step": 9735 }, { "epoch": 1.66, "grad_norm": 8.386917175444268, "learning_rate": 4.965876171161848e-06, "loss": 0.5797, "step": 9740 }, { "epoch": 1.66, "grad_norm": 22.194715608074517, "learning_rate": 4.960930783004085e-06, "loss": 0.5698, "step": 9745 }, { "epoch": 1.66, "grad_norm": 21.670717750103282, "learning_rate": 4.955985433068791e-06, "loss": 0.5858, "step": 9750 }, { "epoch": 1.66, "grad_norm": 24.122686284627108, "learning_rate": 4.951040126194135e-06, "loss": 0.5921, "step": 9755 }, { "epoch": 1.66, "grad_norm": 16.764344436833607, "learning_rate": 4.946094867218243e-06, "loss": 0.5677, "step": 9760 }, { "epoch": 1.66, "grad_norm": 19.31689565138828, "learning_rate": 4.941149660979201e-06, "loss": 0.5904, "step": 9765 }, { "epoch": 1.66, "grad_norm": 11.942338302320184, "learning_rate": 4.936204512315029e-06, "loss": 0.5971, "step": 9770 }, { "epoch": 1.66, "grad_norm": 11.9944235406172, "learning_rate": 4.931259426063704e-06, "loss": 0.5843, "step": 9775 }, { "epoch": 1.66, "grad_norm": 21.039797821647916, "learning_rate": 4.926314407063136e-06, "loss": 0.577, "step": 9780 }, { "epoch": 1.66, "grad_norm": 12.865950004267344, "learning_rate": 4.9213694601511714e-06, "loss": 0.5809, "step": 9785 }, { "epoch": 1.66, "grad_norm": 7.363302596636814, "learning_rate": 4.9164245901655845e-06, "loss": 0.5825, "step": 9790 }, { "epoch": 1.67, "grad_norm": 7.5785200823388505, "learning_rate": 4.911479801944076e-06, "loss": 0.5878, "step": 9795 }, { "epoch": 1.67, "grad_norm": 29.945610829274326, "learning_rate": 4.906535100324264e-06, "loss": 0.5877, "step": 9800 }, { "epoch": 1.67, "grad_norm": 7.813855108965385, "learning_rate": 4.901590490143686e-06, "loss": 0.5727, "step": 9805 }, { "epoch": 1.67, "grad_norm": 6.498703494352979, "learning_rate": 4.896645976239785e-06, "loss": 0.5813, "step": 9810 }, { "epoch": 1.67, "grad_norm": 16.94551286609573, "learning_rate": 4.8917015634499125e-06, "loss": 0.5937, "step": 9815 }, { "epoch": 1.67, "grad_norm": 6.006180458225585, "learning_rate": 4.886757256611323e-06, "loss": 0.5783, "step": 9820 }, { "epoch": 1.67, "grad_norm": 10.992668492922249, "learning_rate": 4.881813060561162e-06, "loss": 0.5793, "step": 9825 }, { "epoch": 1.67, "grad_norm": 7.09319164500253, "learning_rate": 4.876868980136472e-06, "loss": 0.5774, "step": 9830 }, { "epoch": 1.67, "grad_norm": 19.408392042835462, "learning_rate": 4.87192502017418e-06, "loss": 0.5756, "step": 9835 }, { "epoch": 1.67, "grad_norm": 17.818354113938415, "learning_rate": 4.866981185511095e-06, "loss": 0.5755, "step": 9840 }, { "epoch": 1.67, "grad_norm": 8.504576142845876, "learning_rate": 4.862037480983906e-06, "loss": 0.5762, "step": 9845 }, { "epoch": 1.67, "grad_norm": 21.296112438487995, "learning_rate": 4.857093911429169e-06, "loss": 0.5928, "step": 9850 }, { "epoch": 1.68, "grad_norm": 17.99023787677679, "learning_rate": 4.852150481683313e-06, "loss": 0.5809, "step": 9855 }, { "epoch": 1.68, "grad_norm": 6.901437789610188, "learning_rate": 4.847207196582628e-06, "loss": 0.5883, "step": 9860 }, { "epoch": 1.68, "grad_norm": 14.855858691741162, "learning_rate": 4.842264060963265e-06, "loss": 0.5962, "step": 9865 }, { "epoch": 1.68, "grad_norm": 27.968729321502533, "learning_rate": 4.837321079661225e-06, "loss": 0.5897, "step": 9870 }, { "epoch": 1.68, "grad_norm": 10.75320400924953, "learning_rate": 4.83237825751236e-06, "loss": 0.5836, "step": 9875 }, { "epoch": 1.68, "grad_norm": 9.5378797267475, "learning_rate": 4.827435599352367e-06, "loss": 0.5679, "step": 9880 }, { "epoch": 1.68, "grad_norm": 26.342093199593943, "learning_rate": 4.822493110016785e-06, "loss": 0.5743, "step": 9885 }, { "epoch": 1.68, "grad_norm": 5.642984156198046, "learning_rate": 4.817550794340977e-06, "loss": 0.5702, "step": 9890 }, { "epoch": 1.68, "grad_norm": 13.668599007883666, "learning_rate": 4.812608657160149e-06, "loss": 0.5739, "step": 9895 }, { "epoch": 1.68, "grad_norm": 10.331624056006664, "learning_rate": 4.807666703309327e-06, "loss": 0.5671, "step": 9900 }, { "epoch": 1.68, "grad_norm": 25.002481425875065, "learning_rate": 4.802724937623355e-06, "loss": 0.5769, "step": 9905 }, { "epoch": 1.68, "grad_norm": 18.291991609730832, "learning_rate": 4.7977833649369e-06, "loss": 0.5797, "step": 9910 }, { "epoch": 1.69, "grad_norm": 9.706725759701401, "learning_rate": 4.7928419900844316e-06, "loss": 0.5755, "step": 9915 }, { "epoch": 1.69, "grad_norm": 14.17580923262539, "learning_rate": 4.787900817900232e-06, "loss": 0.5749, "step": 9920 }, { "epoch": 1.69, "grad_norm": 7.019499640302362, "learning_rate": 4.782959853218386e-06, "loss": 0.5819, "step": 9925 }, { "epoch": 1.69, "grad_norm": 13.759696126688855, "learning_rate": 4.778019100872767e-06, "loss": 0.5792, "step": 9930 }, { "epoch": 1.69, "grad_norm": 26.831148200227613, "learning_rate": 4.773078565697048e-06, "loss": 0.5565, "step": 9935 }, { "epoch": 1.69, "grad_norm": 6.270356675527533, "learning_rate": 4.76813825252469e-06, "loss": 0.5843, "step": 9940 }, { "epoch": 1.69, "grad_norm": 12.231540027130091, "learning_rate": 4.763198166188933e-06, "loss": 0.5774, "step": 9945 }, { "epoch": 1.69, "grad_norm": 7.9990434864943545, "learning_rate": 4.758258311522798e-06, "loss": 0.5961, "step": 9950 }, { "epoch": 1.69, "grad_norm": 5.583122420995502, "learning_rate": 4.7533186933590766e-06, "loss": 0.5779, "step": 9955 }, { "epoch": 1.69, "grad_norm": 12.375315990982129, "learning_rate": 4.748379316530331e-06, "loss": 0.5557, "step": 9960 }, { "epoch": 1.69, "grad_norm": 33.351291331106125, "learning_rate": 4.743440185868888e-06, "loss": 0.5775, "step": 9965 }, { "epoch": 1.7, "grad_norm": 6.461224217936112, "learning_rate": 4.738501306206831e-06, "loss": 0.5718, "step": 9970 }, { "epoch": 1.7, "grad_norm": 20.359831655936695, "learning_rate": 4.733562682375999e-06, "loss": 0.5673, "step": 9975 }, { "epoch": 1.7, "grad_norm": 12.654800710201974, "learning_rate": 4.728624319207979e-06, "loss": 0.5715, "step": 9980 }, { "epoch": 1.7, "grad_norm": 11.9141816673019, "learning_rate": 4.723686221534109e-06, "loss": 0.5612, "step": 9985 }, { "epoch": 1.7, "grad_norm": 21.915149604282366, "learning_rate": 4.7187483941854615e-06, "loss": 0.5743, "step": 9990 }, { "epoch": 1.7, "grad_norm": 12.012368837366665, "learning_rate": 4.713810841992845e-06, "loss": 0.5684, "step": 9995 }, { "epoch": 1.7, "grad_norm": 23.658206543376345, "learning_rate": 4.708873569786803e-06, "loss": 0.588, "step": 10000 }, { "epoch": 1.7, "grad_norm": 12.610097088990205, "learning_rate": 4.7039365823976e-06, "loss": 0.557, "step": 10005 }, { "epoch": 1.7, "grad_norm": 10.946014682934994, "learning_rate": 4.6989998846552234e-06, "loss": 0.5691, "step": 10010 }, { "epoch": 1.7, "grad_norm": 5.491836666841432, "learning_rate": 4.694063481389377e-06, "loss": 0.5755, "step": 10015 }, { "epoch": 1.7, "grad_norm": 12.281593897023107, "learning_rate": 4.68912737742948e-06, "loss": 0.5824, "step": 10020 }, { "epoch": 1.7, "grad_norm": 6.680821452661381, "learning_rate": 4.684191577604653e-06, "loss": 0.5528, "step": 10025 }, { "epoch": 1.71, "grad_norm": 8.924164351060437, "learning_rate": 4.679256086743725e-06, "loss": 0.5757, "step": 10030 }, { "epoch": 1.71, "grad_norm": 6.566207423001996, "learning_rate": 4.674320909675218e-06, "loss": 0.5593, "step": 10035 }, { "epoch": 1.71, "grad_norm": 9.428569933754627, "learning_rate": 4.66938605122735e-06, "loss": 0.5751, "step": 10040 }, { "epoch": 1.71, "grad_norm": 16.645878410906143, "learning_rate": 4.664451516228027e-06, "loss": 0.5799, "step": 10045 }, { "epoch": 1.71, "grad_norm": 14.705526960881357, "learning_rate": 4.659517309504834e-06, "loss": 0.5736, "step": 10050 }, { "epoch": 1.71, "grad_norm": 17.44097547370709, "learning_rate": 4.6545834358850415e-06, "loss": 0.5816, "step": 10055 }, { "epoch": 1.71, "grad_norm": 6.76221489972865, "learning_rate": 4.649649900195591e-06, "loss": 0.5735, "step": 10060 }, { "epoch": 1.71, "grad_norm": 12.732878022187395, "learning_rate": 4.644716707263091e-06, "loss": 0.579, "step": 10065 }, { "epoch": 1.71, "grad_norm": 5.972228567376378, "learning_rate": 4.6397838619138205e-06, "loss": 0.574, "step": 10070 }, { "epoch": 1.71, "grad_norm": 11.090807111400876, "learning_rate": 4.634851368973713e-06, "loss": 0.5678, "step": 10075 }, { "epoch": 1.71, "grad_norm": 17.469211038126204, "learning_rate": 4.6299192332683605e-06, "loss": 0.5764, "step": 10080 }, { "epoch": 1.71, "grad_norm": 9.812271489373423, "learning_rate": 4.6249874596230056e-06, "loss": 0.5643, "step": 10085 }, { "epoch": 1.72, "grad_norm": 14.540827875393065, "learning_rate": 4.620056052862532e-06, "loss": 0.5597, "step": 10090 }, { "epoch": 1.72, "grad_norm": 9.747842561670529, "learning_rate": 4.615125017811471e-06, "loss": 0.5727, "step": 10095 }, { "epoch": 1.72, "grad_norm": 6.5164902547307815, "learning_rate": 4.6101943592939855e-06, "loss": 0.5644, "step": 10100 }, { "epoch": 1.72, "grad_norm": 27.124478750771498, "learning_rate": 4.605264082133872e-06, "loss": 0.5743, "step": 10105 }, { "epoch": 1.72, "grad_norm": 8.513661237874295, "learning_rate": 4.600334191154554e-06, "loss": 0.555, "step": 10110 }, { "epoch": 1.72, "grad_norm": 15.864940327869741, "learning_rate": 4.595404691179077e-06, "loss": 0.5547, "step": 10115 }, { "epoch": 1.72, "grad_norm": 7.06766107265483, "learning_rate": 4.5904755870301035e-06, "loss": 0.5793, "step": 10120 }, { "epoch": 1.72, "grad_norm": 8.085298717926536, "learning_rate": 4.585546883529911e-06, "loss": 0.578, "step": 10125 }, { "epoch": 1.72, "grad_norm": 7.041429483784615, "learning_rate": 4.5806185855003786e-06, "loss": 0.5677, "step": 10130 }, { "epoch": 1.72, "grad_norm": 5.799211328435109, "learning_rate": 4.575690697762996e-06, "loss": 0.5665, "step": 10135 }, { "epoch": 1.72, "grad_norm": 17.448452244387205, "learning_rate": 4.5707632251388484e-06, "loss": 0.5753, "step": 10140 }, { "epoch": 1.72, "grad_norm": 26.61989099099489, "learning_rate": 4.5658361724486165e-06, "loss": 0.5736, "step": 10145 }, { "epoch": 1.73, "grad_norm": 7.459385864952437, "learning_rate": 4.5609095445125665e-06, "loss": 0.5675, "step": 10150 }, { "epoch": 1.73, "grad_norm": 11.14015548413005, "learning_rate": 4.555983346150551e-06, "loss": 0.552, "step": 10155 }, { "epoch": 1.73, "grad_norm": 17.124694141352393, "learning_rate": 4.551057582182005e-06, "loss": 0.5673, "step": 10160 }, { "epoch": 1.73, "grad_norm": 22.63468479100616, "learning_rate": 4.546132257425939e-06, "loss": 0.5709, "step": 10165 }, { "epoch": 1.73, "grad_norm": 17.668158526720532, "learning_rate": 4.541207376700924e-06, "loss": 0.5714, "step": 10170 }, { "epoch": 1.73, "grad_norm": 9.798478426561271, "learning_rate": 4.5362829448251076e-06, "loss": 0.5721, "step": 10175 }, { "epoch": 1.73, "grad_norm": 5.943369172881516, "learning_rate": 4.5313589666161935e-06, "loss": 0.5763, "step": 10180 }, { "epoch": 1.73, "grad_norm": 5.960859334311488, "learning_rate": 4.5264354468914425e-06, "loss": 0.5775, "step": 10185 }, { "epoch": 1.73, "grad_norm": 15.36159808951015, "learning_rate": 4.521512390467668e-06, "loss": 0.5545, "step": 10190 }, { "epoch": 1.73, "grad_norm": 5.850169100527044, "learning_rate": 4.516589802161228e-06, "loss": 0.5657, "step": 10195 }, { "epoch": 1.73, "grad_norm": 6.8472842610984985, "learning_rate": 4.511667686788022e-06, "loss": 0.5613, "step": 10200 }, { "epoch": 1.73, "grad_norm": 6.381231330134624, "learning_rate": 4.50674604916349e-06, "loss": 0.5597, "step": 10205 }, { "epoch": 1.74, "grad_norm": 5.380650887041463, "learning_rate": 4.501824894102604e-06, "loss": 0.5746, "step": 10210 }, { "epoch": 1.74, "grad_norm": 6.9616226566722625, "learning_rate": 4.49690422641986e-06, "loss": 0.568, "step": 10215 }, { "epoch": 1.74, "grad_norm": 7.128136408250823, "learning_rate": 4.49198405092928e-06, "loss": 0.5635, "step": 10220 }, { "epoch": 1.74, "grad_norm": 7.7778297659247375, "learning_rate": 4.487064372444406e-06, "loss": 0.5661, "step": 10225 }, { "epoch": 1.74, "grad_norm": 9.148750564471536, "learning_rate": 4.4821451957782915e-06, "loss": 0.559, "step": 10230 }, { "epoch": 1.74, "grad_norm": 5.749591740727813, "learning_rate": 4.4772265257435e-06, "loss": 0.5698, "step": 10235 }, { "epoch": 1.74, "grad_norm": 10.949616805563828, "learning_rate": 4.472308367152098e-06, "loss": 0.5518, "step": 10240 }, { "epoch": 1.74, "grad_norm": 10.755053911637036, "learning_rate": 4.467390724815654e-06, "loss": 0.572, "step": 10245 }, { "epoch": 1.74, "grad_norm": 7.60832823711757, "learning_rate": 4.462473603545232e-06, "loss": 0.5794, "step": 10250 }, { "epoch": 1.74, "grad_norm": 14.883077773789456, "learning_rate": 4.457557008151379e-06, "loss": 0.5713, "step": 10255 }, { "epoch": 1.74, "grad_norm": 8.519039378885232, "learning_rate": 4.452640943444137e-06, "loss": 0.5762, "step": 10260 }, { "epoch": 1.75, "grad_norm": 6.12176426140937, "learning_rate": 4.447725414233024e-06, "loss": 0.5575, "step": 10265 }, { "epoch": 1.75, "grad_norm": 9.597445584862303, "learning_rate": 4.442810425327033e-06, "loss": 0.5682, "step": 10270 }, { "epoch": 1.75, "grad_norm": 15.899622508028797, "learning_rate": 4.437895981534632e-06, "loss": 0.5663, "step": 10275 }, { "epoch": 1.75, "grad_norm": 15.23241821477259, "learning_rate": 4.432982087663755e-06, "loss": 0.5706, "step": 10280 }, { "epoch": 1.75, "grad_norm": 10.064482315135152, "learning_rate": 4.428068748521794e-06, "loss": 0.5626, "step": 10285 }, { "epoch": 1.75, "grad_norm": 7.224225338476222, "learning_rate": 4.423155968915605e-06, "loss": 0.5682, "step": 10290 }, { "epoch": 1.75, "grad_norm": 9.104061848723966, "learning_rate": 4.418243753651488e-06, "loss": 0.5513, "step": 10295 }, { "epoch": 1.75, "grad_norm": 7.6983397688120485, "learning_rate": 4.413332107535199e-06, "loss": 0.575, "step": 10300 }, { "epoch": 1.75, "grad_norm": 9.031076235814748, "learning_rate": 4.408421035371932e-06, "loss": 0.5583, "step": 10305 }, { "epoch": 1.75, "grad_norm": 23.865179039994, "learning_rate": 4.4035105419663234e-06, "loss": 0.5685, "step": 10310 }, { "epoch": 1.75, "grad_norm": 9.3402880658064, "learning_rate": 4.39860063212244e-06, "loss": 0.5672, "step": 10315 }, { "epoch": 1.75, "grad_norm": 8.376246146568219, "learning_rate": 4.393691310643779e-06, "loss": 0.5549, "step": 10320 }, { "epoch": 1.76, "grad_norm": 7.5076088706963535, "learning_rate": 4.388782582333263e-06, "loss": 0.5456, "step": 10325 }, { "epoch": 1.76, "grad_norm": 9.076302725867919, "learning_rate": 4.3838744519932345e-06, "loss": 0.5735, "step": 10330 }, { "epoch": 1.76, "grad_norm": 11.155201996360862, "learning_rate": 4.378966924425447e-06, "loss": 0.5672, "step": 10335 }, { "epoch": 1.76, "grad_norm": 18.51270787542137, "learning_rate": 4.3740600044310664e-06, "loss": 0.569, "step": 10340 }, { "epoch": 1.76, "grad_norm": 14.943870912242375, "learning_rate": 4.3691536968106675e-06, "loss": 0.5794, "step": 10345 }, { "epoch": 1.76, "grad_norm": 16.036753964316397, "learning_rate": 4.364248006364222e-06, "loss": 0.5575, "step": 10350 }, { "epoch": 1.76, "grad_norm": 14.337387313826945, "learning_rate": 4.359342937891099e-06, "loss": 0.56, "step": 10355 }, { "epoch": 1.76, "grad_norm": 12.632921011142148, "learning_rate": 4.354438496190061e-06, "loss": 0.5733, "step": 10360 }, { "epoch": 1.76, "grad_norm": 7.001797244074437, "learning_rate": 4.349534686059255e-06, "loss": 0.5841, "step": 10365 }, { "epoch": 1.76, "grad_norm": 9.677734308629427, "learning_rate": 4.344631512296211e-06, "loss": 0.5562, "step": 10370 }, { "epoch": 1.76, "grad_norm": 6.974231493449727, "learning_rate": 4.3397289796978335e-06, "loss": 0.5582, "step": 10375 }, { "epoch": 1.76, "grad_norm": 8.82580270027655, "learning_rate": 4.334827093060406e-06, "loss": 0.5525, "step": 10380 }, { "epoch": 1.77, "grad_norm": 13.300464842483924, "learning_rate": 4.329925857179573e-06, "loss": 0.5677, "step": 10385 }, { "epoch": 1.77, "grad_norm": 9.408626674668492, "learning_rate": 4.325025276850347e-06, "loss": 0.556, "step": 10390 }, { "epoch": 1.77, "grad_norm": 6.281700862386464, "learning_rate": 4.3201253568671e-06, "loss": 0.5579, "step": 10395 }, { "epoch": 1.77, "grad_norm": 6.4899340428012025, "learning_rate": 4.3152261020235516e-06, "loss": 0.5609, "step": 10400 }, { "epoch": 1.77, "grad_norm": 15.976312177486976, "learning_rate": 4.31032751711278e-06, "loss": 0.5598, "step": 10405 }, { "epoch": 1.77, "grad_norm": 17.343807709088885, "learning_rate": 4.305429606927202e-06, "loss": 0.5547, "step": 10410 }, { "epoch": 1.77, "grad_norm": 11.69312788328246, "learning_rate": 4.300532376258571e-06, "loss": 0.5574, "step": 10415 }, { "epoch": 1.77, "grad_norm": 6.87987772594223, "learning_rate": 4.295635829897983e-06, "loss": 0.5494, "step": 10420 }, { "epoch": 1.77, "grad_norm": 13.887701568026129, "learning_rate": 4.2907399726358626e-06, "loss": 0.5559, "step": 10425 }, { "epoch": 1.77, "grad_norm": 24.364887801382608, "learning_rate": 4.285844809261955e-06, "loss": 0.556, "step": 10430 }, { "epoch": 1.77, "grad_norm": 10.924058864204879, "learning_rate": 4.280950344565335e-06, "loss": 0.5576, "step": 10435 }, { "epoch": 1.77, "grad_norm": 6.91510691811666, "learning_rate": 4.276056583334386e-06, "loss": 0.5637, "step": 10440 }, { "epoch": 1.78, "grad_norm": 5.736864177986743, "learning_rate": 4.271163530356808e-06, "loss": 0.5521, "step": 10445 }, { "epoch": 1.78, "grad_norm": 28.15747552724219, "learning_rate": 4.266271190419609e-06, "loss": 0.562, "step": 10450 }, { "epoch": 1.78, "grad_norm": 17.159897130514594, "learning_rate": 4.261379568309093e-06, "loss": 0.574, "step": 10455 }, { "epoch": 1.78, "grad_norm": 5.932452219225763, "learning_rate": 4.256488668810868e-06, "loss": 0.5581, "step": 10460 }, { "epoch": 1.78, "grad_norm": 13.313755617902103, "learning_rate": 4.251598496709832e-06, "loss": 0.5526, "step": 10465 }, { "epoch": 1.78, "grad_norm": 18.111505571206518, "learning_rate": 4.2467090567901735e-06, "loss": 0.5582, "step": 10470 }, { "epoch": 1.78, "grad_norm": 17.77722314473156, "learning_rate": 4.241820353835363e-06, "loss": 0.556, "step": 10475 }, { "epoch": 1.78, "grad_norm": 14.39601648137402, "learning_rate": 4.236932392628149e-06, "loss": 0.5528, "step": 10480 }, { "epoch": 1.78, "grad_norm": 24.56336728851155, "learning_rate": 4.2320451779505575e-06, "loss": 0.5469, "step": 10485 }, { "epoch": 1.78, "grad_norm": 9.487147299657114, "learning_rate": 4.227158714583884e-06, "loss": 0.5588, "step": 10490 }, { "epoch": 1.78, "grad_norm": 16.364733268739467, "learning_rate": 4.222273007308684e-06, "loss": 0.5639, "step": 10495 }, { "epoch": 1.79, "grad_norm": 6.441684490504245, "learning_rate": 4.217388060904778e-06, "loss": 0.5534, "step": 10500 }, { "epoch": 1.79, "grad_norm": 15.84464990205949, "learning_rate": 4.21250388015124e-06, "loss": 0.5473, "step": 10505 }, { "epoch": 1.79, "grad_norm": 11.347143338237716, "learning_rate": 4.207620469826397e-06, "loss": 0.5563, "step": 10510 }, { "epoch": 1.79, "grad_norm": 20.032525340936964, "learning_rate": 4.2027378347078225e-06, "loss": 0.541, "step": 10515 }, { "epoch": 1.79, "grad_norm": 22.353036370097144, "learning_rate": 4.197855979572326e-06, "loss": 0.5461, "step": 10520 }, { "epoch": 1.79, "grad_norm": 6.145827342584123, "learning_rate": 4.192974909195962e-06, "loss": 0.5538, "step": 10525 }, { "epoch": 1.79, "grad_norm": 16.284130529239555, "learning_rate": 4.188094628354013e-06, "loss": 0.5523, "step": 10530 }, { "epoch": 1.79, "grad_norm": 21.944753217572533, "learning_rate": 4.1832151418209865e-06, "loss": 0.5497, "step": 10535 }, { "epoch": 1.79, "grad_norm": 10.726946766485042, "learning_rate": 4.1783364543706165e-06, "loss": 0.5385, "step": 10540 }, { "epoch": 1.79, "grad_norm": 8.947396885772298, "learning_rate": 4.173458570775856e-06, "loss": 0.5655, "step": 10545 }, { "epoch": 1.79, "grad_norm": 10.966857276634125, "learning_rate": 4.1685814958088696e-06, "loss": 0.5756, "step": 10550 }, { "epoch": 1.79, "grad_norm": 7.402810816083218, "learning_rate": 4.1637052342410315e-06, "loss": 0.5623, "step": 10555 }, { "epoch": 1.8, "grad_norm": 5.900704617475461, "learning_rate": 4.1588297908429195e-06, "loss": 0.54, "step": 10560 }, { "epoch": 1.8, "grad_norm": 8.269730671746444, "learning_rate": 4.153955170384312e-06, "loss": 0.5483, "step": 10565 }, { "epoch": 1.8, "grad_norm": 18.570352607344855, "learning_rate": 4.149081377634182e-06, "loss": 0.5409, "step": 10570 }, { "epoch": 1.8, "grad_norm": 9.479458512559505, "learning_rate": 4.14420841736069e-06, "loss": 0.5487, "step": 10575 }, { "epoch": 1.8, "grad_norm": 20.410675325402618, "learning_rate": 4.1393362943311866e-06, "loss": 0.5704, "step": 10580 }, { "epoch": 1.8, "grad_norm": 8.033492396039378, "learning_rate": 4.1344650133122e-06, "loss": 0.5527, "step": 10585 }, { "epoch": 1.8, "grad_norm": 8.608181918412429, "learning_rate": 4.129594579069436e-06, "loss": 0.5503, "step": 10590 }, { "epoch": 1.8, "grad_norm": 16.721543496224996, "learning_rate": 4.1247249963677725e-06, "loss": 0.5527, "step": 10595 }, { "epoch": 1.8, "grad_norm": 8.756527197932947, "learning_rate": 4.119856269971254e-06, "loss": 0.5569, "step": 10600 }, { "epoch": 1.8, "grad_norm": 13.928787184558088, "learning_rate": 4.114988404643086e-06, "loss": 0.551, "step": 10605 }, { "epoch": 1.8, "grad_norm": 27.003024941869466, "learning_rate": 4.110121405145634e-06, "loss": 0.5454, "step": 10610 }, { "epoch": 1.8, "grad_norm": 24.19555970041987, "learning_rate": 4.105255276240413e-06, "loss": 0.5502, "step": 10615 }, { "epoch": 1.81, "grad_norm": 6.339057342338457, "learning_rate": 4.100390022688087e-06, "loss": 0.5623, "step": 10620 }, { "epoch": 1.81, "grad_norm": 11.422037220563851, "learning_rate": 4.095525649248467e-06, "loss": 0.544, "step": 10625 }, { "epoch": 1.81, "grad_norm": 14.684061712602356, "learning_rate": 4.0906621606805e-06, "loss": 0.5611, "step": 10630 }, { "epoch": 1.81, "grad_norm": 19.907732420246848, "learning_rate": 4.085799561742269e-06, "loss": 0.5619, "step": 10635 }, { "epoch": 1.81, "grad_norm": 13.149132567592444, "learning_rate": 4.080937857190984e-06, "loss": 0.5567, "step": 10640 }, { "epoch": 1.81, "grad_norm": 17.153750553470985, "learning_rate": 4.076077051782983e-06, "loss": 0.5652, "step": 10645 }, { "epoch": 1.81, "grad_norm": 5.938875135207251, "learning_rate": 4.0712171502737245e-06, "loss": 0.5475, "step": 10650 }, { "epoch": 1.81, "grad_norm": 28.712317930091356, "learning_rate": 4.0663581574177764e-06, "loss": 0.5492, "step": 10655 }, { "epoch": 1.81, "grad_norm": 6.907991497479526, "learning_rate": 4.061500077968829e-06, "loss": 0.5519, "step": 10660 }, { "epoch": 1.81, "grad_norm": 11.26578908835014, "learning_rate": 4.056642916679666e-06, "loss": 0.5678, "step": 10665 }, { "epoch": 1.81, "grad_norm": 6.9969212767908955, "learning_rate": 4.051786678302182e-06, "loss": 0.5542, "step": 10670 }, { "epoch": 1.81, "grad_norm": 9.811354898565146, "learning_rate": 4.046931367587367e-06, "loss": 0.5547, "step": 10675 }, { "epoch": 1.82, "grad_norm": 5.351676942851948, "learning_rate": 4.042076989285301e-06, "loss": 0.5487, "step": 10680 }, { "epoch": 1.82, "grad_norm": 12.690168235430562, "learning_rate": 4.037223548145155e-06, "loss": 0.5376, "step": 10685 }, { "epoch": 1.82, "grad_norm": 13.10316249163135, "learning_rate": 4.0323710489151816e-06, "loss": 0.5542, "step": 10690 }, { "epoch": 1.82, "grad_norm": 17.238263388190468, "learning_rate": 4.027519496342707e-06, "loss": 0.5477, "step": 10695 }, { "epoch": 1.82, "grad_norm": 7.111697756767847, "learning_rate": 4.0226688951741415e-06, "loss": 0.541, "step": 10700 }, { "epoch": 1.82, "grad_norm": 9.312504651719623, "learning_rate": 4.017819250154957e-06, "loss": 0.5625, "step": 10705 }, { "epoch": 1.82, "grad_norm": 13.855123982730133, "learning_rate": 4.01297056602969e-06, "loss": 0.5456, "step": 10710 }, { "epoch": 1.82, "grad_norm": 11.61601724706946, "learning_rate": 4.008122847541942e-06, "loss": 0.5484, "step": 10715 }, { "epoch": 1.82, "grad_norm": 14.541933682803487, "learning_rate": 4.003276099434365e-06, "loss": 0.5408, "step": 10720 }, { "epoch": 1.82, "grad_norm": 16.02298738803633, "learning_rate": 3.998430326448664e-06, "loss": 0.5623, "step": 10725 }, { "epoch": 1.82, "grad_norm": 17.4550356670551, "learning_rate": 3.993585533325591e-06, "loss": 0.5351, "step": 10730 }, { "epoch": 1.83, "grad_norm": 6.076748410436516, "learning_rate": 3.988741724804935e-06, "loss": 0.5471, "step": 10735 }, { "epoch": 1.83, "grad_norm": 7.060203765038019, "learning_rate": 3.983898905625525e-06, "loss": 0.5606, "step": 10740 }, { "epoch": 1.83, "grad_norm": 6.697863097937426, "learning_rate": 3.979057080525223e-06, "loss": 0.5733, "step": 10745 }, { "epoch": 1.83, "grad_norm": 11.600904312186005, "learning_rate": 3.974216254240917e-06, "loss": 0.5571, "step": 10750 }, { "epoch": 1.83, "grad_norm": 8.41737306070303, "learning_rate": 3.969376431508516e-06, "loss": 0.5434, "step": 10755 }, { "epoch": 1.83, "grad_norm": 7.975078682574957, "learning_rate": 3.964537617062951e-06, "loss": 0.5439, "step": 10760 }, { "epoch": 1.83, "grad_norm": 17.113910948663463, "learning_rate": 3.959699815638163e-06, "loss": 0.55, "step": 10765 }, { "epoch": 1.83, "grad_norm": 9.726618538250287, "learning_rate": 3.954863031967108e-06, "loss": 0.5516, "step": 10770 }, { "epoch": 1.83, "grad_norm": 6.149850327053818, "learning_rate": 3.950027270781736e-06, "loss": 0.5534, "step": 10775 }, { "epoch": 1.83, "grad_norm": 6.751245969017911, "learning_rate": 3.945192536813006e-06, "loss": 0.5286, "step": 10780 }, { "epoch": 1.83, "grad_norm": 9.60655821899745, "learning_rate": 3.940358834790867e-06, "loss": 0.5523, "step": 10785 }, { "epoch": 1.83, "grad_norm": 5.613630439695627, "learning_rate": 3.935526169444261e-06, "loss": 0.5413, "step": 10790 }, { "epoch": 1.84, "grad_norm": 7.455914843357684, "learning_rate": 3.930694545501117e-06, "loss": 0.5377, "step": 10795 }, { "epoch": 1.84, "grad_norm": 8.107483270728077, "learning_rate": 3.925863967688339e-06, "loss": 0.5428, "step": 10800 }, { "epoch": 1.84, "grad_norm": 24.49841838362915, "learning_rate": 3.921034440731813e-06, "loss": 0.5484, "step": 10805 }, { "epoch": 1.84, "grad_norm": 9.02938698091131, "learning_rate": 3.916205969356399e-06, "loss": 0.5291, "step": 10810 }, { "epoch": 1.84, "grad_norm": 11.780566938233218, "learning_rate": 3.911378558285915e-06, "loss": 0.5498, "step": 10815 }, { "epoch": 1.84, "grad_norm": 10.707041202570831, "learning_rate": 3.906552212243151e-06, "loss": 0.558, "step": 10820 }, { "epoch": 1.84, "grad_norm": 12.845285542740227, "learning_rate": 3.90172693594985e-06, "loss": 0.54, "step": 10825 }, { "epoch": 1.84, "grad_norm": 6.30898838434219, "learning_rate": 3.89690273412671e-06, "loss": 0.5421, "step": 10830 }, { "epoch": 1.84, "grad_norm": 6.456838345718717, "learning_rate": 3.892079611493379e-06, "loss": 0.5268, "step": 10835 }, { "epoch": 1.84, "grad_norm": 6.483645969550005, "learning_rate": 3.8872575727684485e-06, "loss": 0.5467, "step": 10840 }, { "epoch": 1.84, "grad_norm": 9.01162771308466, "learning_rate": 3.882436622669447e-06, "loss": 0.5395, "step": 10845 }, { "epoch": 1.84, "grad_norm": 6.646301598055061, "learning_rate": 3.877616765912843e-06, "loss": 0.5301, "step": 10850 }, { "epoch": 1.85, "grad_norm": 11.206351159952076, "learning_rate": 3.872798007214028e-06, "loss": 0.5457, "step": 10855 }, { "epoch": 1.85, "grad_norm": 6.460428079828267, "learning_rate": 3.867980351287326e-06, "loss": 0.5459, "step": 10860 }, { "epoch": 1.85, "grad_norm": 9.058509320670087, "learning_rate": 3.863163802845979e-06, "loss": 0.5513, "step": 10865 }, { "epoch": 1.85, "grad_norm": 8.923932413895969, "learning_rate": 3.858348366602147e-06, "loss": 0.5312, "step": 10870 }, { "epoch": 1.85, "grad_norm": 6.311599242077515, "learning_rate": 3.853534047266902e-06, "loss": 0.5456, "step": 10875 }, { "epoch": 1.85, "grad_norm": 17.53201788074447, "learning_rate": 3.848720849550221e-06, "loss": 0.5415, "step": 10880 }, { "epoch": 1.85, "grad_norm": 6.801911731802575, "learning_rate": 3.843908778160986e-06, "loss": 0.5391, "step": 10885 }, { "epoch": 1.85, "grad_norm": 8.338680249038083, "learning_rate": 3.839097837806977e-06, "loss": 0.5482, "step": 10890 }, { "epoch": 1.85, "grad_norm": 6.535259979212849, "learning_rate": 3.834288033194864e-06, "loss": 0.546, "step": 10895 }, { "epoch": 1.85, "grad_norm": 15.935828201938994, "learning_rate": 3.829479369030211e-06, "loss": 0.5377, "step": 10900 }, { "epoch": 1.85, "grad_norm": 10.688589701889827, "learning_rate": 3.824671850017462e-06, "loss": 0.5372, "step": 10905 }, { "epoch": 1.85, "grad_norm": 31.066244041899697, "learning_rate": 3.819865480859943e-06, "loss": 0.5361, "step": 10910 }, { "epoch": 1.86, "grad_norm": 15.014970505246703, "learning_rate": 3.815060266259856e-06, "loss": 0.5493, "step": 10915 }, { "epoch": 1.86, "grad_norm": 5.526054798739284, "learning_rate": 3.8102562109182713e-06, "loss": 0.543, "step": 10920 }, { "epoch": 1.86, "grad_norm": 13.449964955547665, "learning_rate": 3.805453319535126e-06, "loss": 0.5313, "step": 10925 }, { "epoch": 1.86, "grad_norm": 17.86887071360535, "learning_rate": 3.8006515968092176e-06, "loss": 0.5498, "step": 10930 }, { "epoch": 1.86, "grad_norm": 5.578980091545276, "learning_rate": 3.7958510474382027e-06, "loss": 0.5485, "step": 10935 }, { "epoch": 1.86, "grad_norm": 6.05700161143195, "learning_rate": 3.7910516761185864e-06, "loss": 0.5416, "step": 10940 }, { "epoch": 1.86, "grad_norm": 5.476537423347814, "learning_rate": 3.7862534875457226e-06, "loss": 0.539, "step": 10945 }, { "epoch": 1.86, "grad_norm": 12.67141676921221, "learning_rate": 3.781456486413809e-06, "loss": 0.5176, "step": 10950 }, { "epoch": 1.86, "grad_norm": 11.665045618301248, "learning_rate": 3.7766606774158828e-06, "loss": 0.5338, "step": 10955 }, { "epoch": 1.86, "grad_norm": 8.702942638630219, "learning_rate": 3.7718660652438115e-06, "loss": 0.5382, "step": 10960 }, { "epoch": 1.86, "grad_norm": 6.780251465298803, "learning_rate": 3.7670726545882945e-06, "loss": 0.5342, "step": 10965 }, { "epoch": 1.87, "grad_norm": 5.284822222671517, "learning_rate": 3.7622804501388554e-06, "loss": 0.5461, "step": 10970 }, { "epoch": 1.87, "grad_norm": 7.327317512329649, "learning_rate": 3.7574894565838364e-06, "loss": 0.5494, "step": 10975 }, { "epoch": 1.87, "grad_norm": 8.938292463573944, "learning_rate": 3.752699678610395e-06, "loss": 0.5337, "step": 10980 }, { "epoch": 1.87, "grad_norm": 6.505501291362319, "learning_rate": 3.747911120904501e-06, "loss": 0.5291, "step": 10985 }, { "epoch": 1.87, "grad_norm": 6.4303775627203015, "learning_rate": 3.7431237881509287e-06, "loss": 0.5246, "step": 10990 }, { "epoch": 1.87, "grad_norm": 11.991768485557685, "learning_rate": 3.7383376850332546e-06, "loss": 0.5243, "step": 10995 }, { "epoch": 1.87, "grad_norm": 5.495221718797736, "learning_rate": 3.733552816233854e-06, "loss": 0.5337, "step": 11000 }, { "epoch": 1.87, "grad_norm": 5.162826956882016, "learning_rate": 3.7287691864338926e-06, "loss": 0.5384, "step": 11005 }, { "epoch": 1.87, "grad_norm": 5.199569881998118, "learning_rate": 3.723986800313324e-06, "loss": 0.5195, "step": 11010 }, { "epoch": 1.87, "grad_norm": 9.756550802893384, "learning_rate": 3.7192056625508877e-06, "loss": 0.539, "step": 11015 }, { "epoch": 1.87, "grad_norm": 6.743213887283496, "learning_rate": 3.7144257778240955e-06, "loss": 0.5365, "step": 11020 }, { "epoch": 1.87, "grad_norm": 11.84475921976795, "learning_rate": 3.70964715080924e-06, "loss": 0.5423, "step": 11025 }, { "epoch": 1.88, "grad_norm": 8.338435642943008, "learning_rate": 3.704869786181382e-06, "loss": 0.5352, "step": 11030 }, { "epoch": 1.88, "grad_norm": 5.387321434056955, "learning_rate": 3.700093688614344e-06, "loss": 0.5361, "step": 11035 }, { "epoch": 1.88, "grad_norm": 5.6140088905394885, "learning_rate": 3.695318862780712e-06, "loss": 0.5336, "step": 11040 }, { "epoch": 1.88, "grad_norm": 5.222058289128649, "learning_rate": 3.6905453133518266e-06, "loss": 0.5342, "step": 11045 }, { "epoch": 1.88, "grad_norm": 9.083708298443856, "learning_rate": 3.6857730449977807e-06, "loss": 0.532, "step": 11050 }, { "epoch": 1.88, "grad_norm": 10.372511390623997, "learning_rate": 3.6810020623874143e-06, "loss": 0.5339, "step": 11055 }, { "epoch": 1.88, "grad_norm": 7.828966228991336, "learning_rate": 3.676232370188305e-06, "loss": 0.5449, "step": 11060 }, { "epoch": 1.88, "grad_norm": 5.061506476661475, "learning_rate": 3.6714639730667733e-06, "loss": 0.5439, "step": 11065 }, { "epoch": 1.88, "grad_norm": 6.986951778483274, "learning_rate": 3.6666968756878706e-06, "loss": 0.5391, "step": 11070 }, { "epoch": 1.88, "grad_norm": 6.697073681510214, "learning_rate": 3.6619310827153777e-06, "loss": 0.5255, "step": 11075 }, { "epoch": 1.88, "grad_norm": 7.48917727035367, "learning_rate": 3.6571665988117964e-06, "loss": 0.5353, "step": 11080 }, { "epoch": 1.88, "grad_norm": 14.95409608824707, "learning_rate": 3.6524034286383512e-06, "loss": 0.5388, "step": 11085 }, { "epoch": 1.89, "grad_norm": 8.739997106110412, "learning_rate": 3.647641576854979e-06, "loss": 0.5397, "step": 11090 }, { "epoch": 1.89, "grad_norm": 5.702913793269424, "learning_rate": 3.6428810481203314e-06, "loss": 0.5392, "step": 11095 }, { "epoch": 1.89, "grad_norm": 10.152523882009815, "learning_rate": 3.6381218470917566e-06, "loss": 0.5367, "step": 11100 }, { "epoch": 1.89, "grad_norm": 14.924401718875243, "learning_rate": 3.6333639784253116e-06, "loss": 0.5328, "step": 11105 }, { "epoch": 1.89, "grad_norm": 11.220084367718904, "learning_rate": 3.6286074467757488e-06, "loss": 0.5378, "step": 11110 }, { "epoch": 1.89, "grad_norm": 7.6355010730653685, "learning_rate": 3.623852256796511e-06, "loss": 0.5417, "step": 11115 }, { "epoch": 1.89, "grad_norm": 16.697080845793394, "learning_rate": 3.6190984131397277e-06, "loss": 0.5458, "step": 11120 }, { "epoch": 1.89, "grad_norm": 7.4948154448340105, "learning_rate": 3.6143459204562128e-06, "loss": 0.529, "step": 11125 }, { "epoch": 1.89, "grad_norm": 16.98852882390966, "learning_rate": 3.609594783395458e-06, "loss": 0.5299, "step": 11130 }, { "epoch": 1.89, "grad_norm": 5.254750013758175, "learning_rate": 3.604845006605632e-06, "loss": 0.5268, "step": 11135 }, { "epoch": 1.89, "grad_norm": 9.746954504175433, "learning_rate": 3.600096594733564e-06, "loss": 0.5483, "step": 11140 }, { "epoch": 1.89, "grad_norm": 7.365857124321258, "learning_rate": 3.5953495524247573e-06, "loss": 0.5281, "step": 11145 }, { "epoch": 1.9, "grad_norm": 12.329693543819747, "learning_rate": 3.5906038843233693e-06, "loss": 0.5374, "step": 11150 }, { "epoch": 1.9, "grad_norm": 5.801535909625722, "learning_rate": 3.585859595072216e-06, "loss": 0.5276, "step": 11155 }, { "epoch": 1.9, "grad_norm": 12.397119396349337, "learning_rate": 3.5811166893127646e-06, "loss": 0.5316, "step": 11160 }, { "epoch": 1.9, "grad_norm": 21.842813538079348, "learning_rate": 3.576375171685126e-06, "loss": 0.5408, "step": 11165 }, { "epoch": 1.9, "grad_norm": 12.231416398872692, "learning_rate": 3.5716350468280553e-06, "loss": 0.5301, "step": 11170 }, { "epoch": 1.9, "grad_norm": 5.519093114896531, "learning_rate": 3.566896319378947e-06, "loss": 0.5304, "step": 11175 }, { "epoch": 1.9, "grad_norm": 5.940488754631659, "learning_rate": 3.562158993973821e-06, "loss": 0.5465, "step": 11180 }, { "epoch": 1.9, "grad_norm": 5.299602282158513, "learning_rate": 3.5574230752473336e-06, "loss": 0.5242, "step": 11185 }, { "epoch": 1.9, "grad_norm": 5.554576788811912, "learning_rate": 3.5526885678327617e-06, "loss": 0.5384, "step": 11190 }, { "epoch": 1.9, "grad_norm": 6.904279904285887, "learning_rate": 3.5479554763620016e-06, "loss": 0.5238, "step": 11195 }, { "epoch": 1.9, "grad_norm": 9.20256437712205, "learning_rate": 3.5432238054655633e-06, "loss": 0.5232, "step": 11200 }, { "epoch": 1.9, "grad_norm": 8.122344904169903, "learning_rate": 3.53849355977257e-06, "loss": 0.5497, "step": 11205 }, { "epoch": 1.91, "grad_norm": 7.152097367658841, "learning_rate": 3.533764743910747e-06, "loss": 0.5314, "step": 11210 }, { "epoch": 1.91, "grad_norm": 7.84948937139005, "learning_rate": 3.529037362506424e-06, "loss": 0.537, "step": 11215 }, { "epoch": 1.91, "grad_norm": 5.577212182155189, "learning_rate": 3.5243114201845242e-06, "loss": 0.5112, "step": 11220 }, { "epoch": 1.91, "grad_norm": 6.170134880436668, "learning_rate": 3.519586921568564e-06, "loss": 0.5338, "step": 11225 }, { "epoch": 1.91, "grad_norm": 8.220419192071601, "learning_rate": 3.5148638712806486e-06, "loss": 0.5082, "step": 11230 }, { "epoch": 1.91, "grad_norm": 6.7340417910071455, "learning_rate": 3.5101422739414657e-06, "loss": 0.5183, "step": 11235 }, { "epoch": 1.91, "grad_norm": 6.07802322303307, "learning_rate": 3.5054221341702815e-06, "loss": 0.5209, "step": 11240 }, { "epoch": 1.91, "grad_norm": 10.700269535527903, "learning_rate": 3.500703456584935e-06, "loss": 0.5133, "step": 11245 }, { "epoch": 1.91, "grad_norm": 19.52570025614485, "learning_rate": 3.495986245801839e-06, "loss": 0.5262, "step": 11250 }, { "epoch": 1.91, "grad_norm": 18.298198955013202, "learning_rate": 3.4912705064359643e-06, "loss": 0.5205, "step": 11255 }, { "epoch": 1.91, "grad_norm": 10.119842025529667, "learning_rate": 3.486556243100847e-06, "loss": 0.5261, "step": 11260 }, { "epoch": 1.92, "grad_norm": 8.108706346417808, "learning_rate": 3.481843460408579e-06, "loss": 0.5328, "step": 11265 }, { "epoch": 1.92, "grad_norm": 6.859265643783456, "learning_rate": 3.4771321629698008e-06, "loss": 0.5452, "step": 11270 }, { "epoch": 1.92, "grad_norm": 8.659912383492827, "learning_rate": 3.472422355393703e-06, "loss": 0.5287, "step": 11275 }, { "epoch": 1.92, "grad_norm": 6.979094507974204, "learning_rate": 3.4677140422880172e-06, "loss": 0.5408, "step": 11280 }, { "epoch": 1.92, "grad_norm": 6.489033753103027, "learning_rate": 3.4630072282590135e-06, "loss": 0.5448, "step": 11285 }, { "epoch": 1.92, "grad_norm": 6.576397068629151, "learning_rate": 3.4583019179114948e-06, "loss": 0.5268, "step": 11290 }, { "epoch": 1.92, "grad_norm": 7.561886265766148, "learning_rate": 3.453598115848795e-06, "loss": 0.5308, "step": 11295 }, { "epoch": 1.92, "grad_norm": 6.515198041434377, "learning_rate": 3.448895826672767e-06, "loss": 0.5279, "step": 11300 }, { "epoch": 1.92, "grad_norm": 6.760420913784969, "learning_rate": 3.444195054983788e-06, "loss": 0.5204, "step": 11305 }, { "epoch": 1.92, "grad_norm": 20.619225924917643, "learning_rate": 3.439495805380752e-06, "loss": 0.5316, "step": 11310 }, { "epoch": 1.92, "grad_norm": 9.939695464936968, "learning_rate": 3.4347980824610593e-06, "loss": 0.5282, "step": 11315 }, { "epoch": 1.92, "grad_norm": 8.492607689047357, "learning_rate": 3.4301018908206198e-06, "loss": 0.5282, "step": 11320 }, { "epoch": 1.93, "grad_norm": 9.5726352155456, "learning_rate": 3.4254072350538437e-06, "loss": 0.5154, "step": 11325 }, { "epoch": 1.93, "grad_norm": 7.521845531151163, "learning_rate": 3.420714119753641e-06, "loss": 0.5391, "step": 11330 }, { "epoch": 1.93, "grad_norm": 6.394116452417119, "learning_rate": 3.4160225495114134e-06, "loss": 0.541, "step": 11335 }, { "epoch": 1.93, "grad_norm": 11.445030342660791, "learning_rate": 3.4113325289170475e-06, "loss": 0.5275, "step": 11340 }, { "epoch": 1.93, "grad_norm": 18.940504419644416, "learning_rate": 3.4066440625589186e-06, "loss": 0.5179, "step": 11345 }, { "epoch": 1.93, "grad_norm": 5.6345428352537965, "learning_rate": 3.4019571550238816e-06, "loss": 0.5142, "step": 11350 }, { "epoch": 1.93, "grad_norm": 9.461369428184476, "learning_rate": 3.3972718108972612e-06, "loss": 0.5224, "step": 11355 }, { "epoch": 1.93, "grad_norm": 29.354729919652407, "learning_rate": 3.3925880347628577e-06, "loss": 0.5264, "step": 11360 }, { "epoch": 1.93, "grad_norm": 14.403518167501197, "learning_rate": 3.3879058312029354e-06, "loss": 0.5282, "step": 11365 }, { "epoch": 1.93, "grad_norm": 5.988468122093689, "learning_rate": 3.3832252047982206e-06, "loss": 0.5311, "step": 11370 }, { "epoch": 1.93, "grad_norm": 6.384055884376355, "learning_rate": 3.378546160127899e-06, "loss": 0.5235, "step": 11375 }, { "epoch": 1.93, "grad_norm": 5.98755000776099, "learning_rate": 3.3738687017696004e-06, "loss": 0.5422, "step": 11380 }, { "epoch": 1.94, "grad_norm": 5.865728926427982, "learning_rate": 3.3691928342994117e-06, "loss": 0.5299, "step": 11385 }, { "epoch": 1.94, "grad_norm": 7.669535823097867, "learning_rate": 3.364518562291861e-06, "loss": 0.5216, "step": 11390 }, { "epoch": 1.94, "grad_norm": 20.042979966523028, "learning_rate": 3.359845890319914e-06, "loss": 0.5181, "step": 11395 }, { "epoch": 1.94, "grad_norm": 7.242460329586381, "learning_rate": 3.3551748229549695e-06, "loss": 0.5314, "step": 11400 }, { "epoch": 1.94, "grad_norm": 6.409967046664902, "learning_rate": 3.3505053647668616e-06, "loss": 0.5403, "step": 11405 }, { "epoch": 1.94, "grad_norm": 15.280372712285226, "learning_rate": 3.3458375203238456e-06, "loss": 0.5252, "step": 11410 }, { "epoch": 1.94, "grad_norm": 19.565929616107933, "learning_rate": 3.3411712941926027e-06, "loss": 0.5301, "step": 11415 }, { "epoch": 1.94, "grad_norm": 7.77112520924579, "learning_rate": 3.3365066909382233e-06, "loss": 0.5281, "step": 11420 }, { "epoch": 1.94, "grad_norm": 8.77064494485486, "learning_rate": 3.331843715124216e-06, "loss": 0.5188, "step": 11425 }, { "epoch": 1.94, "grad_norm": 5.3463404607000555, "learning_rate": 3.3271823713124973e-06, "loss": 0.5279, "step": 11430 }, { "epoch": 1.94, "grad_norm": 7.202088692534696, "learning_rate": 3.3225226640633835e-06, "loss": 0.5118, "step": 11435 }, { "epoch": 1.94, "grad_norm": 5.539992983116485, "learning_rate": 3.317864597935595e-06, "loss": 0.5225, "step": 11440 }, { "epoch": 1.95, "grad_norm": 9.612631868756232, "learning_rate": 3.3132081774862403e-06, "loss": 0.527, "step": 11445 }, { "epoch": 1.95, "grad_norm": 7.363407207718291, "learning_rate": 3.308553407270822e-06, "loss": 0.5433, "step": 11450 }, { "epoch": 1.95, "grad_norm": 5.753353395071274, "learning_rate": 3.30390029184323e-06, "loss": 0.5056, "step": 11455 }, { "epoch": 1.95, "grad_norm": 6.972212324066545, "learning_rate": 3.299248835755728e-06, "loss": 0.5234, "step": 11460 }, { "epoch": 1.95, "grad_norm": 5.2989976391151785, "learning_rate": 3.2945990435589636e-06, "loss": 0.5205, "step": 11465 }, { "epoch": 1.95, "grad_norm": 14.382665579425526, "learning_rate": 3.289950919801954e-06, "loss": 0.5344, "step": 11470 }, { "epoch": 1.95, "grad_norm": 5.614165948682854, "learning_rate": 3.2853044690320836e-06, "loss": 0.5335, "step": 11475 }, { "epoch": 1.95, "grad_norm": 18.54799855202085, "learning_rate": 3.2806596957951003e-06, "loss": 0.518, "step": 11480 }, { "epoch": 1.95, "grad_norm": 11.260869030494261, "learning_rate": 3.2760166046351127e-06, "loss": 0.5177, "step": 11485 }, { "epoch": 1.95, "grad_norm": 8.396344032296104, "learning_rate": 3.2713752000945792e-06, "loss": 0.5346, "step": 11490 }, { "epoch": 1.95, "grad_norm": 23.880420866969953, "learning_rate": 3.266735486714314e-06, "loss": 0.5279, "step": 11495 }, { "epoch": 1.96, "grad_norm": 22.70728514331903, "learning_rate": 3.2620974690334723e-06, "loss": 0.5239, "step": 11500 }, { "epoch": 1.96, "grad_norm": 16.493266956279886, "learning_rate": 3.257461151589551e-06, "loss": 0.5259, "step": 11505 }, { "epoch": 1.96, "grad_norm": 22.029387258063146, "learning_rate": 3.2528265389183857e-06, "loss": 0.5322, "step": 11510 }, { "epoch": 1.96, "grad_norm": 28.678050417980476, "learning_rate": 3.2481936355541425e-06, "loss": 0.5215, "step": 11515 }, { "epoch": 1.96, "grad_norm": 12.50147938948209, "learning_rate": 3.2435624460293163e-06, "loss": 0.5234, "step": 11520 }, { "epoch": 1.96, "grad_norm": 14.973628043093376, "learning_rate": 3.2389329748747246e-06, "loss": 0.5102, "step": 11525 }, { "epoch": 1.96, "grad_norm": 9.999565283031597, "learning_rate": 3.2343052266195044e-06, "loss": 0.5101, "step": 11530 }, { "epoch": 1.96, "grad_norm": 5.374175195674296, "learning_rate": 3.2296792057911064e-06, "loss": 0.5122, "step": 11535 }, { "epoch": 1.96, "grad_norm": 6.887472400804496, "learning_rate": 3.22505491691529e-06, "loss": 0.5198, "step": 11540 }, { "epoch": 1.96, "grad_norm": 11.888601482195552, "learning_rate": 3.220432364516124e-06, "loss": 0.5243, "step": 11545 }, { "epoch": 1.96, "grad_norm": 5.117889813946124, "learning_rate": 3.215811553115974e-06, "loss": 0.517, "step": 11550 }, { "epoch": 1.96, "grad_norm": 6.489191260544375, "learning_rate": 3.2111924872355055e-06, "loss": 0.5341, "step": 11555 }, { "epoch": 1.97, "grad_norm": 10.108146934501852, "learning_rate": 3.2065751713936757e-06, "loss": 0.5316, "step": 11560 }, { "epoch": 1.97, "grad_norm": 8.939846904829047, "learning_rate": 3.2019596101077276e-06, "loss": 0.5276, "step": 11565 }, { "epoch": 1.97, "grad_norm": 5.798886561943347, "learning_rate": 3.197345807893191e-06, "loss": 0.5185, "step": 11570 }, { "epoch": 1.97, "grad_norm": 5.562157928123866, "learning_rate": 3.192733769263874e-06, "loss": 0.5167, "step": 11575 }, { "epoch": 1.97, "grad_norm": 8.255025145423334, "learning_rate": 3.1881234987318554e-06, "loss": 0.5287, "step": 11580 }, { "epoch": 1.97, "grad_norm": 9.00519837851638, "learning_rate": 3.183515000807488e-06, "loss": 0.5323, "step": 11585 }, { "epoch": 1.97, "grad_norm": 10.33740503879071, "learning_rate": 3.178908279999392e-06, "loss": 0.5257, "step": 11590 }, { "epoch": 1.97, "grad_norm": 8.941774823430713, "learning_rate": 3.174303340814443e-06, "loss": 0.5405, "step": 11595 }, { "epoch": 1.97, "grad_norm": 5.764515099338116, "learning_rate": 3.16970018775778e-06, "loss": 0.5194, "step": 11600 }, { "epoch": 1.97, "grad_norm": 7.769381791545381, "learning_rate": 3.1650988253327906e-06, "loss": 0.5261, "step": 11605 }, { "epoch": 1.97, "grad_norm": 5.430163930611821, "learning_rate": 3.160499258041112e-06, "loss": 0.5204, "step": 11610 }, { "epoch": 1.97, "grad_norm": 5.8678313815022545, "learning_rate": 3.1559014903826245e-06, "loss": 0.5267, "step": 11615 }, { "epoch": 1.98, "grad_norm": 6.961306154968712, "learning_rate": 3.1513055268554518e-06, "loss": 0.5123, "step": 11620 }, { "epoch": 1.98, "grad_norm": 6.386787947968249, "learning_rate": 3.146711371955943e-06, "loss": 0.5293, "step": 11625 }, { "epoch": 1.98, "grad_norm": 13.380022599514302, "learning_rate": 3.142119030178688e-06, "loss": 0.519, "step": 11630 }, { "epoch": 1.98, "grad_norm": 15.601956854275004, "learning_rate": 3.1375285060164963e-06, "loss": 0.5218, "step": 11635 }, { "epoch": 1.98, "grad_norm": 9.956209340415413, "learning_rate": 3.132939803960402e-06, "loss": 0.5152, "step": 11640 }, { "epoch": 1.98, "grad_norm": 14.5091717789471, "learning_rate": 3.128352928499657e-06, "loss": 0.5064, "step": 11645 }, { "epoch": 1.98, "grad_norm": 27.95353269895987, "learning_rate": 3.123767884121725e-06, "loss": 0.5348, "step": 11650 }, { "epoch": 1.98, "grad_norm": 16.97543978293298, "learning_rate": 3.1191846753122783e-06, "loss": 0.5094, "step": 11655 }, { "epoch": 1.98, "grad_norm": 14.329302870828528, "learning_rate": 3.1146033065551964e-06, "loss": 0.5126, "step": 11660 }, { "epoch": 1.98, "grad_norm": 26.946136861360525, "learning_rate": 3.11002378233255e-06, "loss": 0.5222, "step": 11665 }, { "epoch": 1.98, "grad_norm": 26.85683438822377, "learning_rate": 3.1054461071246155e-06, "loss": 0.5176, "step": 11670 }, { "epoch": 1.98, "grad_norm": 32.54303271050792, "learning_rate": 3.100870285409856e-06, "loss": 0.5285, "step": 11675 }, { "epoch": 1.99, "grad_norm": 22.70343510365221, "learning_rate": 3.0962963216649196e-06, "loss": 0.5281, "step": 11680 }, { "epoch": 1.99, "grad_norm": 13.917301804129767, "learning_rate": 3.0917242203646385e-06, "loss": 0.5043, "step": 11685 }, { "epoch": 1.99, "grad_norm": 5.110336385901124, "learning_rate": 3.087153985982024e-06, "loss": 0.5175, "step": 11690 }, { "epoch": 1.99, "grad_norm": 5.677973496169243, "learning_rate": 3.0825856229882584e-06, "loss": 0.5124, "step": 11695 }, { "epoch": 1.99, "grad_norm": 6.415213598801499, "learning_rate": 3.078019135852698e-06, "loss": 0.5211, "step": 11700 }, { "epoch": 1.99, "grad_norm": 16.720758651500194, "learning_rate": 3.073454529042854e-06, "loss": 0.5154, "step": 11705 }, { "epoch": 1.99, "grad_norm": 5.50117824566899, "learning_rate": 3.068891807024409e-06, "loss": 0.52, "step": 11710 }, { "epoch": 1.99, "grad_norm": 5.441597888328433, "learning_rate": 3.064330974261196e-06, "loss": 0.5222, "step": 11715 }, { "epoch": 1.99, "grad_norm": 12.022956215814487, "learning_rate": 3.0597720352152004e-06, "loss": 0.5113, "step": 11720 }, { "epoch": 1.99, "grad_norm": 11.45410754613074, "learning_rate": 3.0552149943465554e-06, "loss": 0.5313, "step": 11725 }, { "epoch": 1.99, "grad_norm": 9.656188542011064, "learning_rate": 3.0506598561135362e-06, "loss": 0.5154, "step": 11730 }, { "epoch": 2.0, "grad_norm": 6.627915183745318, "learning_rate": 3.0461066249725584e-06, "loss": 0.5188, "step": 11735 }, { "epoch": 2.0, "grad_norm": 5.616229006704577, "learning_rate": 3.0415553053781725e-06, "loss": 0.5208, "step": 11740 }, { "epoch": 2.0, "grad_norm": 7.0943460014009, "learning_rate": 3.037005901783053e-06, "loss": 0.5194, "step": 11745 }, { "epoch": 2.0, "grad_norm": 7.190068268493544, "learning_rate": 3.032458418638008e-06, "loss": 0.5208, "step": 11750 }, { "epoch": 2.0, "grad_norm": 5.1286362517199136, "learning_rate": 3.0279128603919593e-06, "loss": 0.5125, "step": 11755 }, { "epoch": 2.0, "grad_norm": 6.220220298533826, "learning_rate": 3.0233692314919525e-06, "loss": 0.5235, "step": 11760 }, { "epoch": 2.0, "eval_loss": 0.3905054032802582, "eval_runtime": 75.0128, "eval_samples_per_second": 4.826, "eval_steps_per_second": 0.613, "step": 11764 }, { "epoch": 2.0, "grad_norm": 8.611507507714085, "learning_rate": 3.018827536383142e-06, "loss": 0.4829, "step": 11765 }, { "epoch": 2.0, "grad_norm": 5.752742447036074, "learning_rate": 3.0142877795087876e-06, "loss": 0.3723, "step": 11770 }, { "epoch": 2.0, "grad_norm": 8.804207868987879, "learning_rate": 3.009749965310259e-06, "loss": 0.3751, "step": 11775 }, { "epoch": 2.0, "grad_norm": 6.48577340760751, "learning_rate": 3.0052140982270228e-06, "loss": 0.3786, "step": 11780 }, { "epoch": 2.0, "grad_norm": 5.96000499515076, "learning_rate": 3.0006801826966366e-06, "loss": 0.3715, "step": 11785 }, { "epoch": 2.0, "grad_norm": 5.683037451115184, "learning_rate": 2.9961482231547535e-06, "loss": 0.3728, "step": 11790 }, { "epoch": 2.01, "grad_norm": 4.947056755410368, "learning_rate": 2.9916182240351132e-06, "loss": 0.3658, "step": 11795 }, { "epoch": 2.01, "grad_norm": 9.175381671925878, "learning_rate": 2.987090189769535e-06, "loss": 0.3672, "step": 11800 }, { "epoch": 2.01, "grad_norm": 7.202043240807192, "learning_rate": 2.9825641247879167e-06, "loss": 0.3792, "step": 11805 }, { "epoch": 2.01, "grad_norm": 10.919930890072514, "learning_rate": 2.9780400335182312e-06, "loss": 0.357, "step": 11810 }, { "epoch": 2.01, "grad_norm": 5.458484845692553, "learning_rate": 2.973517920386517e-06, "loss": 0.3642, "step": 11815 }, { "epoch": 2.01, "grad_norm": 12.605985113201717, "learning_rate": 2.968997789816882e-06, "loss": 0.366, "step": 11820 }, { "epoch": 2.01, "grad_norm": 10.17730548605672, "learning_rate": 2.9644796462314897e-06, "loss": 0.3685, "step": 11825 }, { "epoch": 2.01, "grad_norm": 10.988272278395259, "learning_rate": 2.959963494050562e-06, "loss": 0.3653, "step": 11830 }, { "epoch": 2.01, "grad_norm": 11.730790106634236, "learning_rate": 2.955449337692372e-06, "loss": 0.3605, "step": 11835 }, { "epoch": 2.01, "grad_norm": 5.492211689650247, "learning_rate": 2.9509371815732415e-06, "loss": 0.3645, "step": 11840 }, { "epoch": 2.01, "grad_norm": 7.540976427148941, "learning_rate": 2.946427030107534e-06, "loss": 0.3642, "step": 11845 }, { "epoch": 2.01, "grad_norm": 4.579453176000582, "learning_rate": 2.9419188877076534e-06, "loss": 0.3693, "step": 11850 }, { "epoch": 2.02, "grad_norm": 5.304416580656278, "learning_rate": 2.9374127587840373e-06, "loss": 0.3522, "step": 11855 }, { "epoch": 2.02, "grad_norm": 5.3593333215489, "learning_rate": 2.932908647745152e-06, "loss": 0.367, "step": 11860 }, { "epoch": 2.02, "grad_norm": 12.497643274627988, "learning_rate": 2.9284065589974915e-06, "loss": 0.3633, "step": 11865 }, { "epoch": 2.02, "grad_norm": 13.575720144681346, "learning_rate": 2.9239064969455686e-06, "loss": 0.3656, "step": 11870 }, { "epoch": 2.02, "grad_norm": 5.418062852100978, "learning_rate": 2.9194084659919176e-06, "loss": 0.3624, "step": 11875 }, { "epoch": 2.02, "grad_norm": 8.713970180416974, "learning_rate": 2.914912470537081e-06, "loss": 0.3605, "step": 11880 }, { "epoch": 2.02, "grad_norm": 6.3675790522641655, "learning_rate": 2.9104185149796166e-06, "loss": 0.3577, "step": 11885 }, { "epoch": 2.02, "grad_norm": 5.380851351218553, "learning_rate": 2.9059266037160804e-06, "loss": 0.3642, "step": 11890 }, { "epoch": 2.02, "grad_norm": 5.881021953784742, "learning_rate": 2.9014367411410272e-06, "loss": 0.3689, "step": 11895 }, { "epoch": 2.02, "grad_norm": 5.224788190005951, "learning_rate": 2.896948931647018e-06, "loss": 0.3706, "step": 11900 }, { "epoch": 2.02, "grad_norm": 7.267290416239944, "learning_rate": 2.8924631796245896e-06, "loss": 0.3626, "step": 11905 }, { "epoch": 2.02, "grad_norm": 5.722158276142785, "learning_rate": 2.8879794894622794e-06, "loss": 0.3549, "step": 11910 }, { "epoch": 2.03, "grad_norm": 6.665683638849933, "learning_rate": 2.883497865546599e-06, "loss": 0.3582, "step": 11915 }, { "epoch": 2.03, "grad_norm": 11.717660943671277, "learning_rate": 2.8790183122620455e-06, "loss": 0.3557, "step": 11920 }, { "epoch": 2.03, "grad_norm": 6.744844839312123, "learning_rate": 2.8745408339910857e-06, "loss": 0.3629, "step": 11925 }, { "epoch": 2.03, "grad_norm": 5.254584277873432, "learning_rate": 2.8700654351141546e-06, "loss": 0.3626, "step": 11930 }, { "epoch": 2.03, "grad_norm": 5.076849011568416, "learning_rate": 2.865592120009659e-06, "loss": 0.3665, "step": 11935 }, { "epoch": 2.03, "grad_norm": 9.89105922641013, "learning_rate": 2.8611208930539635e-06, "loss": 0.3748, "step": 11940 }, { "epoch": 2.03, "grad_norm": 8.278339030031562, "learning_rate": 2.8566517586213895e-06, "loss": 0.3752, "step": 11945 }, { "epoch": 2.03, "grad_norm": 5.055266207685965, "learning_rate": 2.852184721084208e-06, "loss": 0.3645, "step": 11950 }, { "epoch": 2.03, "grad_norm": 7.148044998103296, "learning_rate": 2.847719784812648e-06, "loss": 0.3722, "step": 11955 }, { "epoch": 2.03, "grad_norm": 8.54289037896895, "learning_rate": 2.8432569541748728e-06, "loss": 0.3609, "step": 11960 }, { "epoch": 2.03, "grad_norm": 8.931371433064498, "learning_rate": 2.8387962335369935e-06, "loss": 0.3658, "step": 11965 }, { "epoch": 2.04, "grad_norm": 13.930370778361347, "learning_rate": 2.8343376272630524e-06, "loss": 0.3487, "step": 11970 }, { "epoch": 2.04, "grad_norm": 10.26844641650417, "learning_rate": 2.8298811397150217e-06, "loss": 0.3588, "step": 11975 }, { "epoch": 2.04, "grad_norm": 9.382054822171709, "learning_rate": 2.825426775252806e-06, "loss": 0.3578, "step": 11980 }, { "epoch": 2.04, "grad_norm": 5.611521064546941, "learning_rate": 2.8209745382342312e-06, "loss": 0.3558, "step": 11985 }, { "epoch": 2.04, "grad_norm": 5.068563602349961, "learning_rate": 2.8165244330150383e-06, "loss": 0.364, "step": 11990 }, { "epoch": 2.04, "grad_norm": 4.357350781394905, "learning_rate": 2.812076463948884e-06, "loss": 0.3672, "step": 11995 }, { "epoch": 2.04, "grad_norm": 6.591605468812917, "learning_rate": 2.8076306353873413e-06, "loss": 0.3651, "step": 12000 }, { "epoch": 2.04, "grad_norm": 5.106233634138435, "learning_rate": 2.8031869516798794e-06, "loss": 0.3523, "step": 12005 }, { "epoch": 2.04, "grad_norm": 5.718458446762907, "learning_rate": 2.798745417173877e-06, "loss": 0.3632, "step": 12010 }, { "epoch": 2.04, "grad_norm": 6.2380488137176675, "learning_rate": 2.7943060362146068e-06, "loss": 0.357, "step": 12015 }, { "epoch": 2.04, "grad_norm": 13.238093997370028, "learning_rate": 2.7898688131452344e-06, "loss": 0.3616, "step": 12020 }, { "epoch": 2.04, "grad_norm": 4.566161035481949, "learning_rate": 2.785433752306812e-06, "loss": 0.3568, "step": 12025 }, { "epoch": 2.05, "grad_norm": 5.614923482223124, "learning_rate": 2.7810008580382843e-06, "loss": 0.358, "step": 12030 }, { "epoch": 2.05, "grad_norm": 4.801101666626508, "learning_rate": 2.776570134676469e-06, "loss": 0.3627, "step": 12035 }, { "epoch": 2.05, "grad_norm": 5.401932712886612, "learning_rate": 2.77214158655606e-06, "loss": 0.3577, "step": 12040 }, { "epoch": 2.05, "grad_norm": 5.629093893600516, "learning_rate": 2.76771521800963e-06, "loss": 0.3602, "step": 12045 }, { "epoch": 2.05, "grad_norm": 8.84615246777452, "learning_rate": 2.76329103336761e-06, "loss": 0.3629, "step": 12050 }, { "epoch": 2.05, "grad_norm": 6.493679745481419, "learning_rate": 2.7588690369583025e-06, "loss": 0.3584, "step": 12055 }, { "epoch": 2.05, "grad_norm": 4.650688020597786, "learning_rate": 2.7544492331078667e-06, "loss": 0.3573, "step": 12060 }, { "epoch": 2.05, "grad_norm": 4.875759976969528, "learning_rate": 2.750031626140313e-06, "loss": 0.3633, "step": 12065 }, { "epoch": 2.05, "grad_norm": 12.228597460057165, "learning_rate": 2.745616220377504e-06, "loss": 0.3627, "step": 12070 }, { "epoch": 2.05, "grad_norm": 12.359671405062235, "learning_rate": 2.7412030201391553e-06, "loss": 0.3614, "step": 12075 }, { "epoch": 2.05, "grad_norm": 6.22622924385044, "learning_rate": 2.7367920297428174e-06, "loss": 0.3591, "step": 12080 }, { "epoch": 2.05, "grad_norm": 9.388816381649836, "learning_rate": 2.7323832535038787e-06, "loss": 0.3547, "step": 12085 }, { "epoch": 2.06, "grad_norm": 13.70592439616937, "learning_rate": 2.727976695735568e-06, "loss": 0.3613, "step": 12090 }, { "epoch": 2.06, "grad_norm": 8.429611413044588, "learning_rate": 2.7235723607489357e-06, "loss": 0.3639, "step": 12095 }, { "epoch": 2.06, "grad_norm": 5.446747254918879, "learning_rate": 2.719170252852868e-06, "loss": 0.3565, "step": 12100 }, { "epoch": 2.06, "grad_norm": 9.694603382556856, "learning_rate": 2.7147703763540567e-06, "loss": 0.3575, "step": 12105 }, { "epoch": 2.06, "grad_norm": 10.344482002591908, "learning_rate": 2.7103727355570277e-06, "loss": 0.3599, "step": 12110 }, { "epoch": 2.06, "grad_norm": 7.230043253110429, "learning_rate": 2.7059773347641048e-06, "loss": 0.3783, "step": 12115 }, { "epoch": 2.06, "grad_norm": 4.911178835416952, "learning_rate": 2.701584178275433e-06, "loss": 0.3507, "step": 12120 }, { "epoch": 2.06, "grad_norm": 7.060856281038126, "learning_rate": 2.6971932703889534e-06, "loss": 0.3577, "step": 12125 }, { "epoch": 2.06, "grad_norm": 5.095825633746809, "learning_rate": 2.6928046154004083e-06, "loss": 0.348, "step": 12130 }, { "epoch": 2.06, "grad_norm": 5.33022918116732, "learning_rate": 2.6884182176033397e-06, "loss": 0.3597, "step": 12135 }, { "epoch": 2.06, "grad_norm": 7.14854869257719, "learning_rate": 2.684034081289078e-06, "loss": 0.3654, "step": 12140 }, { "epoch": 2.06, "grad_norm": 7.175699068398426, "learning_rate": 2.6796522107467417e-06, "loss": 0.3471, "step": 12145 }, { "epoch": 2.07, "grad_norm": 11.87073571076359, "learning_rate": 2.6752726102632307e-06, "loss": 0.3646, "step": 12150 }, { "epoch": 2.07, "grad_norm": 5.265772381937061, "learning_rate": 2.670895284123231e-06, "loss": 0.3583, "step": 12155 }, { "epoch": 2.07, "grad_norm": 9.51482295186804, "learning_rate": 2.666520236609196e-06, "loss": 0.3474, "step": 12160 }, { "epoch": 2.07, "grad_norm": 8.52752360844317, "learning_rate": 2.662147472001352e-06, "loss": 0.3687, "step": 12165 }, { "epoch": 2.07, "grad_norm": 5.657318428452209, "learning_rate": 2.6577769945776942e-06, "loss": 0.3708, "step": 12170 }, { "epoch": 2.07, "grad_norm": 14.585897856994992, "learning_rate": 2.653408808613977e-06, "loss": 0.3748, "step": 12175 }, { "epoch": 2.07, "grad_norm": 12.455744504231912, "learning_rate": 2.6490429183837195e-06, "loss": 0.3574, "step": 12180 }, { "epoch": 2.07, "grad_norm": 6.3858811970077705, "learning_rate": 2.6446793281581815e-06, "loss": 0.356, "step": 12185 }, { "epoch": 2.07, "grad_norm": 13.118397592546135, "learning_rate": 2.640318042206387e-06, "loss": 0.3654, "step": 12190 }, { "epoch": 2.07, "grad_norm": 9.898867934283263, "learning_rate": 2.635959064795097e-06, "loss": 0.3513, "step": 12195 }, { "epoch": 2.07, "grad_norm": 5.454220363751431, "learning_rate": 2.6316024001888195e-06, "loss": 0.3684, "step": 12200 }, { "epoch": 2.07, "grad_norm": 5.251292384038412, "learning_rate": 2.6272480526497952e-06, "loss": 0.3517, "step": 12205 }, { "epoch": 2.08, "grad_norm": 9.71362624669713, "learning_rate": 2.622896026437998e-06, "loss": 0.3635, "step": 12210 }, { "epoch": 2.08, "grad_norm": 6.455864586011419, "learning_rate": 2.6185463258111355e-06, "loss": 0.3568, "step": 12215 }, { "epoch": 2.08, "grad_norm": 6.345032911435628, "learning_rate": 2.614198955024637e-06, "loss": 0.3621, "step": 12220 }, { "epoch": 2.08, "grad_norm": 7.085325440928504, "learning_rate": 2.6098539183316508e-06, "loss": 0.3572, "step": 12225 }, { "epoch": 2.08, "grad_norm": 8.755306263476994, "learning_rate": 2.6055112199830423e-06, "loss": 0.3632, "step": 12230 }, { "epoch": 2.08, "grad_norm": 4.37675672449101, "learning_rate": 2.601170864227395e-06, "loss": 0.362, "step": 12235 }, { "epoch": 2.08, "grad_norm": 10.004487280961618, "learning_rate": 2.5968328553109912e-06, "loss": 0.3607, "step": 12240 }, { "epoch": 2.08, "grad_norm": 11.383232716403265, "learning_rate": 2.5924971974778257e-06, "loss": 0.3695, "step": 12245 }, { "epoch": 2.08, "grad_norm": 6.952935057045326, "learning_rate": 2.5881638949695886e-06, "loss": 0.3528, "step": 12250 }, { "epoch": 2.08, "grad_norm": 6.056211499559283, "learning_rate": 2.5838329520256645e-06, "loss": 0.3605, "step": 12255 }, { "epoch": 2.08, "grad_norm": 8.277901307181128, "learning_rate": 2.579504372883134e-06, "loss": 0.3646, "step": 12260 }, { "epoch": 2.09, "grad_norm": 6.69486470616986, "learning_rate": 2.575178161776763e-06, "loss": 0.3647, "step": 12265 }, { "epoch": 2.09, "grad_norm": 9.170144636951997, "learning_rate": 2.5708543229389995e-06, "loss": 0.3556, "step": 12270 }, { "epoch": 2.09, "grad_norm": 4.872922231980344, "learning_rate": 2.5665328605999696e-06, "loss": 0.3726, "step": 12275 }, { "epoch": 2.09, "grad_norm": 7.288978068586903, "learning_rate": 2.5622137789874803e-06, "loss": 0.3619, "step": 12280 }, { "epoch": 2.09, "grad_norm": 5.131810848831009, "learning_rate": 2.557897082327002e-06, "loss": 0.3658, "step": 12285 }, { "epoch": 2.09, "grad_norm": 4.919657846426334, "learning_rate": 2.5535827748416797e-06, "loss": 0.3698, "step": 12290 }, { "epoch": 2.09, "grad_norm": 5.168193650629757, "learning_rate": 2.5492708607523144e-06, "loss": 0.3661, "step": 12295 }, { "epoch": 2.09, "grad_norm": 10.285402235676456, "learning_rate": 2.544961344277368e-06, "loss": 0.3745, "step": 12300 }, { "epoch": 2.09, "grad_norm": 10.514162836792714, "learning_rate": 2.540654229632955e-06, "loss": 0.3721, "step": 12305 }, { "epoch": 2.09, "grad_norm": 6.014041099796996, "learning_rate": 2.536349521032846e-06, "loss": 0.3486, "step": 12310 }, { "epoch": 2.09, "grad_norm": 11.404746919512409, "learning_rate": 2.5320472226884506e-06, "loss": 0.3553, "step": 12315 }, { "epoch": 2.09, "grad_norm": 9.926602435491768, "learning_rate": 2.527747338808822e-06, "loss": 0.3705, "step": 12320 }, { "epoch": 2.1, "grad_norm": 7.43243287500068, "learning_rate": 2.5234498736006563e-06, "loss": 0.3758, "step": 12325 }, { "epoch": 2.1, "grad_norm": 4.5893380700500135, "learning_rate": 2.5191548312682758e-06, "loss": 0.3685, "step": 12330 }, { "epoch": 2.1, "grad_norm": 5.739196866534467, "learning_rate": 2.5148622160136406e-06, "loss": 0.3644, "step": 12335 }, { "epoch": 2.1, "grad_norm": 7.223888941673534, "learning_rate": 2.5105720320363287e-06, "loss": 0.3566, "step": 12340 }, { "epoch": 2.1, "grad_norm": 4.760901049999837, "learning_rate": 2.5062842835335442e-06, "loss": 0.3517, "step": 12345 }, { "epoch": 2.1, "grad_norm": 8.717510780154683, "learning_rate": 2.5019989747001043e-06, "loss": 0.3576, "step": 12350 }, { "epoch": 2.1, "grad_norm": 9.274190474502552, "learning_rate": 2.4977161097284468e-06, "loss": 0.3555, "step": 12355 }, { "epoch": 2.1, "grad_norm": 5.661456821603926, "learning_rate": 2.4934356928086124e-06, "loss": 0.3526, "step": 12360 }, { "epoch": 2.1, "grad_norm": 13.14206177144619, "learning_rate": 2.489157728128245e-06, "loss": 0.3536, "step": 12365 }, { "epoch": 2.1, "grad_norm": 4.8359452582871425, "learning_rate": 2.4848822198725974e-06, "loss": 0.3545, "step": 12370 }, { "epoch": 2.1, "grad_norm": 5.562661402570011, "learning_rate": 2.4806091722245114e-06, "loss": 0.3557, "step": 12375 }, { "epoch": 2.1, "grad_norm": 7.216730692799468, "learning_rate": 2.4763385893644283e-06, "loss": 0.3527, "step": 12380 }, { "epoch": 2.11, "grad_norm": 6.175993021978491, "learning_rate": 2.4720704754703715e-06, "loss": 0.3705, "step": 12385 }, { "epoch": 2.11, "grad_norm": 11.558130455617105, "learning_rate": 2.467804834717954e-06, "loss": 0.3611, "step": 12390 }, { "epoch": 2.11, "grad_norm": 12.155852876216986, "learning_rate": 2.4635416712803635e-06, "loss": 0.3584, "step": 12395 }, { "epoch": 2.11, "grad_norm": 5.056660813979677, "learning_rate": 2.4592809893283725e-06, "loss": 0.3523, "step": 12400 }, { "epoch": 2.11, "grad_norm": 12.155316694768354, "learning_rate": 2.455022793030319e-06, "loss": 0.351, "step": 12405 }, { "epoch": 2.11, "grad_norm": 6.137254627367353, "learning_rate": 2.4507670865521093e-06, "loss": 0.3555, "step": 12410 }, { "epoch": 2.11, "grad_norm": 11.335378484644814, "learning_rate": 2.446513874057219e-06, "loss": 0.3475, "step": 12415 }, { "epoch": 2.11, "grad_norm": 7.202345783395633, "learning_rate": 2.442263159706678e-06, "loss": 0.3627, "step": 12420 }, { "epoch": 2.11, "grad_norm": 5.040729939592325, "learning_rate": 2.4380149476590805e-06, "loss": 0.362, "step": 12425 }, { "epoch": 2.11, "grad_norm": 14.663934834177535, "learning_rate": 2.4337692420705578e-06, "loss": 0.352, "step": 12430 }, { "epoch": 2.11, "grad_norm": 5.6804959838619205, "learning_rate": 2.4295260470948058e-06, "loss": 0.3551, "step": 12435 }, { "epoch": 2.11, "grad_norm": 8.268339570358487, "learning_rate": 2.425285366883053e-06, "loss": 0.3601, "step": 12440 }, { "epoch": 2.12, "grad_norm": 6.822741932012296, "learning_rate": 2.42104720558407e-06, "loss": 0.3484, "step": 12445 }, { "epoch": 2.12, "grad_norm": 9.813642685372308, "learning_rate": 2.416811567344169e-06, "loss": 0.3679, "step": 12450 }, { "epoch": 2.12, "grad_norm": 10.837042565036516, "learning_rate": 2.4125784563071843e-06, "loss": 0.3476, "step": 12455 }, { "epoch": 2.12, "grad_norm": 4.782218813101773, "learning_rate": 2.4083478766144863e-06, "loss": 0.3606, "step": 12460 }, { "epoch": 2.12, "grad_norm": 11.059837450405126, "learning_rate": 2.4041198324049634e-06, "loss": 0.3672, "step": 12465 }, { "epoch": 2.12, "grad_norm": 8.250155523400553, "learning_rate": 2.3998943278150265e-06, "loss": 0.3653, "step": 12470 }, { "epoch": 2.12, "grad_norm": 8.595823286695708, "learning_rate": 2.3956713669785974e-06, "loss": 0.3572, "step": 12475 }, { "epoch": 2.12, "grad_norm": 9.43108384835417, "learning_rate": 2.391450954027117e-06, "loss": 0.3578, "step": 12480 }, { "epoch": 2.12, "grad_norm": 4.888909316789247, "learning_rate": 2.387233093089527e-06, "loss": 0.3716, "step": 12485 }, { "epoch": 2.12, "grad_norm": 4.484941394571873, "learning_rate": 2.383017788292273e-06, "loss": 0.3568, "step": 12490 }, { "epoch": 2.12, "grad_norm": 4.746259770066021, "learning_rate": 2.3788050437593042e-06, "loss": 0.3651, "step": 12495 }, { "epoch": 2.13, "grad_norm": 4.9129131917140505, "learning_rate": 2.37459486361206e-06, "loss": 0.3515, "step": 12500 }, { "epoch": 2.13, "grad_norm": 6.071685633633804, "learning_rate": 2.370387251969477e-06, "loss": 0.3692, "step": 12505 }, { "epoch": 2.13, "grad_norm": 4.804804003375496, "learning_rate": 2.366182212947969e-06, "loss": 0.3534, "step": 12510 }, { "epoch": 2.13, "grad_norm": 8.700029141854401, "learning_rate": 2.3619797506614447e-06, "loss": 0.3575, "step": 12515 }, { "epoch": 2.13, "grad_norm": 4.871791337356873, "learning_rate": 2.3577798692212817e-06, "loss": 0.3479, "step": 12520 }, { "epoch": 2.13, "grad_norm": 9.445874880209733, "learning_rate": 2.353582572736342e-06, "loss": 0.3593, "step": 12525 }, { "epoch": 2.13, "grad_norm": 6.388800020380725, "learning_rate": 2.349387865312951e-06, "loss": 0.3607, "step": 12530 }, { "epoch": 2.13, "grad_norm": 7.061201502997077, "learning_rate": 2.3451957510549034e-06, "loss": 0.3547, "step": 12535 }, { "epoch": 2.13, "grad_norm": 4.40790926491953, "learning_rate": 2.34100623406346e-06, "loss": 0.3569, "step": 12540 }, { "epoch": 2.13, "grad_norm": 6.112629901368969, "learning_rate": 2.336819318437338e-06, "loss": 0.3571, "step": 12545 }, { "epoch": 2.13, "grad_norm": 5.5893900154448986, "learning_rate": 2.3326350082727093e-06, "loss": 0.3549, "step": 12550 }, { "epoch": 2.13, "grad_norm": 5.773355334844262, "learning_rate": 2.3284533076631954e-06, "loss": 0.3628, "step": 12555 }, { "epoch": 2.14, "grad_norm": 5.176848576237044, "learning_rate": 2.3242742206998703e-06, "loss": 0.3525, "step": 12560 }, { "epoch": 2.14, "grad_norm": 5.064208261078508, "learning_rate": 2.3200977514712434e-06, "loss": 0.3646, "step": 12565 }, { "epoch": 2.14, "grad_norm": 8.572635391436043, "learning_rate": 2.3159239040632725e-06, "loss": 0.3496, "step": 12570 }, { "epoch": 2.14, "grad_norm": 6.203301780474356, "learning_rate": 2.3117526825593417e-06, "loss": 0.3509, "step": 12575 }, { "epoch": 2.14, "grad_norm": 9.019304670820222, "learning_rate": 2.307584091040268e-06, "loss": 0.3532, "step": 12580 }, { "epoch": 2.14, "grad_norm": 11.743250315644167, "learning_rate": 2.303418133584301e-06, "loss": 0.3549, "step": 12585 }, { "epoch": 2.14, "grad_norm": 4.598745561581672, "learning_rate": 2.299254814267107e-06, "loss": 0.3494, "step": 12590 }, { "epoch": 2.14, "grad_norm": 10.03454149898803, "learning_rate": 2.295094137161774e-06, "loss": 0.3588, "step": 12595 }, { "epoch": 2.14, "grad_norm": 4.63513008315468, "learning_rate": 2.2909361063388024e-06, "loss": 0.3514, "step": 12600 }, { "epoch": 2.14, "grad_norm": 12.429596505739113, "learning_rate": 2.28678072586611e-06, "loss": 0.3504, "step": 12605 }, { "epoch": 2.14, "grad_norm": 7.747186379508115, "learning_rate": 2.282627999809014e-06, "loss": 0.3517, "step": 12610 }, { "epoch": 2.14, "grad_norm": 11.766026429105853, "learning_rate": 2.2784779322302408e-06, "loss": 0.3515, "step": 12615 }, { "epoch": 2.15, "grad_norm": 8.19796998489907, "learning_rate": 2.274330527189913e-06, "loss": 0.3563, "step": 12620 }, { "epoch": 2.15, "grad_norm": 5.687175841710557, "learning_rate": 2.2701857887455482e-06, "loss": 0.3553, "step": 12625 }, { "epoch": 2.15, "grad_norm": 5.492044808621296, "learning_rate": 2.266043720952054e-06, "loss": 0.346, "step": 12630 }, { "epoch": 2.15, "grad_norm": 8.698079597137065, "learning_rate": 2.2619043278617307e-06, "loss": 0.355, "step": 12635 }, { "epoch": 2.15, "grad_norm": 6.647526337896694, "learning_rate": 2.2577676135242566e-06, "loss": 0.356, "step": 12640 }, { "epoch": 2.15, "grad_norm": 6.7027116634400175, "learning_rate": 2.253633581986689e-06, "loss": 0.3624, "step": 12645 }, { "epoch": 2.15, "grad_norm": 5.394766540445826, "learning_rate": 2.249502237293466e-06, "loss": 0.3621, "step": 12650 }, { "epoch": 2.15, "grad_norm": 5.1860900535938, "learning_rate": 2.2453735834863897e-06, "loss": 0.3479, "step": 12655 }, { "epoch": 2.15, "grad_norm": 10.678739729089573, "learning_rate": 2.2412476246046377e-06, "loss": 0.3508, "step": 12660 }, { "epoch": 2.15, "grad_norm": 7.542513089344192, "learning_rate": 2.2371243646847444e-06, "loss": 0.3579, "step": 12665 }, { "epoch": 2.15, "grad_norm": 5.907712371792205, "learning_rate": 2.233003807760607e-06, "loss": 0.3553, "step": 12670 }, { "epoch": 2.15, "grad_norm": 7.918275784958399, "learning_rate": 2.228885957863477e-06, "loss": 0.3488, "step": 12675 }, { "epoch": 2.16, "grad_norm": 5.0345251496741605, "learning_rate": 2.224770819021956e-06, "loss": 0.3516, "step": 12680 }, { "epoch": 2.16, "grad_norm": 6.360579403001685, "learning_rate": 2.220658395261999e-06, "loss": 0.3505, "step": 12685 }, { "epoch": 2.16, "grad_norm": 5.071208825906468, "learning_rate": 2.216548690606898e-06, "loss": 0.3503, "step": 12690 }, { "epoch": 2.16, "grad_norm": 7.3340027233657255, "learning_rate": 2.2124417090772903e-06, "loss": 0.3498, "step": 12695 }, { "epoch": 2.16, "grad_norm": 5.058112027557419, "learning_rate": 2.2083374546911444e-06, "loss": 0.364, "step": 12700 }, { "epoch": 2.16, "grad_norm": 4.4348235788957675, "learning_rate": 2.2042359314637683e-06, "loss": 0.3618, "step": 12705 }, { "epoch": 2.16, "grad_norm": 9.226367642118475, "learning_rate": 2.200137143407785e-06, "loss": 0.3644, "step": 12710 }, { "epoch": 2.16, "grad_norm": 4.612977427099218, "learning_rate": 2.196041094533155e-06, "loss": 0.3588, "step": 12715 }, { "epoch": 2.16, "grad_norm": 6.7396878974458385, "learning_rate": 2.1919477888471522e-06, "loss": 0.3558, "step": 12720 }, { "epoch": 2.16, "grad_norm": 4.955526064834778, "learning_rate": 2.1878572303543655e-06, "loss": 0.3601, "step": 12725 }, { "epoch": 2.16, "grad_norm": 14.447660951252212, "learning_rate": 2.183769423056702e-06, "loss": 0.3562, "step": 12730 }, { "epoch": 2.17, "grad_norm": 7.579752616290158, "learning_rate": 2.179684370953371e-06, "loss": 0.3577, "step": 12735 }, { "epoch": 2.17, "grad_norm": 5.6468210027675445, "learning_rate": 2.175602078040892e-06, "loss": 0.352, "step": 12740 }, { "epoch": 2.17, "grad_norm": 4.578563303804438, "learning_rate": 2.1715225483130815e-06, "loss": 0.3456, "step": 12745 }, { "epoch": 2.17, "grad_norm": 5.508536921531665, "learning_rate": 2.167445785761052e-06, "loss": 0.358, "step": 12750 }, { "epoch": 2.17, "grad_norm": 4.501753862781778, "learning_rate": 2.1633717943732098e-06, "loss": 0.3413, "step": 12755 }, { "epoch": 2.17, "grad_norm": 6.196928032996506, "learning_rate": 2.159300578135254e-06, "loss": 0.3581, "step": 12760 }, { "epoch": 2.17, "grad_norm": 6.074066358859239, "learning_rate": 2.1552321410301626e-06, "loss": 0.3529, "step": 12765 }, { "epoch": 2.17, "grad_norm": 8.809753435779445, "learning_rate": 2.1511664870381956e-06, "loss": 0.3591, "step": 12770 }, { "epoch": 2.17, "grad_norm": 4.725861909837628, "learning_rate": 2.1471036201368968e-06, "loss": 0.352, "step": 12775 }, { "epoch": 2.17, "grad_norm": 5.314575776197405, "learning_rate": 2.1430435443010733e-06, "loss": 0.3479, "step": 12780 }, { "epoch": 2.17, "grad_norm": 4.603148181328639, "learning_rate": 2.1389862635028136e-06, "loss": 0.352, "step": 12785 }, { "epoch": 2.17, "grad_norm": 6.357563224881423, "learning_rate": 2.134931781711457e-06, "loss": 0.3524, "step": 12790 }, { "epoch": 2.18, "grad_norm": 5.679609149108052, "learning_rate": 2.130880102893618e-06, "loss": 0.3547, "step": 12795 }, { "epoch": 2.18, "grad_norm": 4.653289583363171, "learning_rate": 2.126831231013159e-06, "loss": 0.3433, "step": 12800 }, { "epoch": 2.18, "grad_norm": 5.150465354689677, "learning_rate": 2.122785170031205e-06, "loss": 0.3553, "step": 12805 }, { "epoch": 2.18, "grad_norm": 6.49152039269724, "learning_rate": 2.118741923906125e-06, "loss": 0.3545, "step": 12810 }, { "epoch": 2.18, "grad_norm": 4.968638433195927, "learning_rate": 2.1147014965935327e-06, "loss": 0.3511, "step": 12815 }, { "epoch": 2.18, "grad_norm": 15.476562767724293, "learning_rate": 2.110663892046292e-06, "loss": 0.3475, "step": 12820 }, { "epoch": 2.18, "grad_norm": 5.193980384967873, "learning_rate": 2.1066291142144978e-06, "loss": 0.3561, "step": 12825 }, { "epoch": 2.18, "grad_norm": 20.28983778856591, "learning_rate": 2.1025971670454827e-06, "loss": 0.3646, "step": 12830 }, { "epoch": 2.18, "grad_norm": 4.287610844059453, "learning_rate": 2.098568054483807e-06, "loss": 0.3655, "step": 12835 }, { "epoch": 2.18, "grad_norm": 21.597390203424133, "learning_rate": 2.094541780471264e-06, "loss": 0.3608, "step": 12840 }, { "epoch": 2.18, "grad_norm": 13.785556614757647, "learning_rate": 2.0905183489468623e-06, "loss": 0.3641, "step": 12845 }, { "epoch": 2.18, "grad_norm": 7.368036043761198, "learning_rate": 2.0864977638468376e-06, "loss": 0.3596, "step": 12850 }, { "epoch": 2.19, "grad_norm": 5.858919450916035, "learning_rate": 2.0824800291046347e-06, "loss": 0.371, "step": 12855 }, { "epoch": 2.19, "grad_norm": 13.328796201165607, "learning_rate": 2.07846514865091e-06, "loss": 0.3434, "step": 12860 }, { "epoch": 2.19, "grad_norm": 4.384829402670883, "learning_rate": 2.0744531264135327e-06, "loss": 0.3554, "step": 12865 }, { "epoch": 2.19, "grad_norm": 12.8962393661322, "learning_rate": 2.0704439663175714e-06, "loss": 0.3517, "step": 12870 }, { "epoch": 2.19, "grad_norm": 4.3887445041630295, "learning_rate": 2.0664376722852948e-06, "loss": 0.3555, "step": 12875 }, { "epoch": 2.19, "grad_norm": 19.531962902066855, "learning_rate": 2.0624342482361664e-06, "loss": 0.3501, "step": 12880 }, { "epoch": 2.19, "grad_norm": 5.102304932018561, "learning_rate": 2.058433698086848e-06, "loss": 0.3382, "step": 12885 }, { "epoch": 2.19, "grad_norm": 14.280312009210922, "learning_rate": 2.0544360257511826e-06, "loss": 0.3578, "step": 12890 }, { "epoch": 2.19, "grad_norm": 5.072495666599368, "learning_rate": 2.050441235140203e-06, "loss": 0.3507, "step": 12895 }, { "epoch": 2.19, "grad_norm": 4.7319798260058095, "learning_rate": 2.046449330162121e-06, "loss": 0.3427, "step": 12900 }, { "epoch": 2.19, "grad_norm": 4.997556168618728, "learning_rate": 2.0424603147223228e-06, "loss": 0.3487, "step": 12905 }, { "epoch": 2.19, "grad_norm": 4.668830722747984, "learning_rate": 2.0384741927233687e-06, "loss": 0.3471, "step": 12910 }, { "epoch": 2.2, "grad_norm": 4.498365158807844, "learning_rate": 2.0344909680649937e-06, "loss": 0.3444, "step": 12915 }, { "epoch": 2.2, "grad_norm": 4.035491184131442, "learning_rate": 2.030510644644091e-06, "loss": 0.3531, "step": 12920 }, { "epoch": 2.2, "grad_norm": 4.3573911283481035, "learning_rate": 2.0265332263547175e-06, "loss": 0.3546, "step": 12925 }, { "epoch": 2.2, "grad_norm": 12.465513201544523, "learning_rate": 2.022558717088092e-06, "loss": 0.351, "step": 12930 }, { "epoch": 2.2, "grad_norm": 4.656423916603554, "learning_rate": 2.01858712073258e-06, "loss": 0.3478, "step": 12935 }, { "epoch": 2.2, "grad_norm": 5.118832543339823, "learning_rate": 2.0146184411737057e-06, "loss": 0.3521, "step": 12940 }, { "epoch": 2.2, "grad_norm": 5.423815170561722, "learning_rate": 2.0106526822941336e-06, "loss": 0.3537, "step": 12945 }, { "epoch": 2.2, "grad_norm": 4.497609982111329, "learning_rate": 2.006689847973672e-06, "loss": 0.362, "step": 12950 }, { "epoch": 2.2, "grad_norm": 8.724143543231287, "learning_rate": 2.0027299420892687e-06, "loss": 0.3605, "step": 12955 }, { "epoch": 2.2, "grad_norm": 6.545635461939133, "learning_rate": 1.9987729685150054e-06, "loss": 0.3477, "step": 12960 }, { "epoch": 2.2, "grad_norm": 8.329174384014586, "learning_rate": 1.994818931122099e-06, "loss": 0.353, "step": 12965 }, { "epoch": 2.21, "grad_norm": 5.631646120301006, "learning_rate": 1.9908678337788866e-06, "loss": 0.3623, "step": 12970 }, { "epoch": 2.21, "grad_norm": 4.587745944032032, "learning_rate": 1.9869196803508383e-06, "loss": 0.3512, "step": 12975 }, { "epoch": 2.21, "grad_norm": 4.614550001666569, "learning_rate": 1.9829744747005355e-06, "loss": 0.3486, "step": 12980 }, { "epoch": 2.21, "grad_norm": 6.188568299003901, "learning_rate": 1.979032220687683e-06, "loss": 0.3576, "step": 12985 }, { "epoch": 2.21, "grad_norm": 5.165804827286981, "learning_rate": 1.975092922169089e-06, "loss": 0.3532, "step": 12990 }, { "epoch": 2.21, "grad_norm": 5.097043978976538, "learning_rate": 1.9711565829986795e-06, "loss": 0.3489, "step": 12995 }, { "epoch": 2.21, "grad_norm": 4.939939440373387, "learning_rate": 1.9672232070274803e-06, "loss": 0.3582, "step": 13000 }, { "epoch": 2.21, "grad_norm": 5.601542152733169, "learning_rate": 1.963292798103617e-06, "loss": 0.3549, "step": 13005 }, { "epoch": 2.21, "grad_norm": 10.330609981982413, "learning_rate": 1.9593653600723184e-06, "loss": 0.3459, "step": 13010 }, { "epoch": 2.21, "grad_norm": 5.109084999537791, "learning_rate": 1.9554408967758996e-06, "loss": 0.3554, "step": 13015 }, { "epoch": 2.21, "grad_norm": 5.586639449236359, "learning_rate": 1.951519412053772e-06, "loss": 0.3496, "step": 13020 }, { "epoch": 2.21, "grad_norm": 8.917383245650884, "learning_rate": 1.947600909742427e-06, "loss": 0.3497, "step": 13025 }, { "epoch": 2.22, "grad_norm": 5.944441252305898, "learning_rate": 1.9436853936754456e-06, "loss": 0.3571, "step": 13030 }, { "epoch": 2.22, "grad_norm": 15.326786665243585, "learning_rate": 1.9397728676834772e-06, "loss": 0.3525, "step": 13035 }, { "epoch": 2.22, "grad_norm": 5.722319121428556, "learning_rate": 1.9358633355942547e-06, "loss": 0.3523, "step": 13040 }, { "epoch": 2.22, "grad_norm": 4.987853647705775, "learning_rate": 1.9319568012325785e-06, "loss": 0.346, "step": 13045 }, { "epoch": 2.22, "grad_norm": 4.821900932918403, "learning_rate": 1.928053268420314e-06, "loss": 0.3535, "step": 13050 }, { "epoch": 2.22, "grad_norm": 5.759054880221154, "learning_rate": 1.924152740976397e-06, "loss": 0.3507, "step": 13055 }, { "epoch": 2.22, "grad_norm": 4.184172045514366, "learning_rate": 1.920255222716815e-06, "loss": 0.3547, "step": 13060 }, { "epoch": 2.22, "grad_norm": 8.550200708516813, "learning_rate": 1.916360717454618e-06, "loss": 0.3586, "step": 13065 }, { "epoch": 2.22, "grad_norm": 7.201643974584935, "learning_rate": 1.9124692289999043e-06, "loss": 0.3534, "step": 13070 }, { "epoch": 2.22, "grad_norm": 8.145930153650735, "learning_rate": 1.908580761159822e-06, "loss": 0.3493, "step": 13075 }, { "epoch": 2.22, "grad_norm": 6.265935915615525, "learning_rate": 1.9046953177385623e-06, "loss": 0.3474, "step": 13080 }, { "epoch": 2.22, "grad_norm": 5.750015111410342, "learning_rate": 1.9008129025373629e-06, "loss": 0.3522, "step": 13085 }, { "epoch": 2.23, "grad_norm": 4.615266133527854, "learning_rate": 1.896933519354493e-06, "loss": 0.3465, "step": 13090 }, { "epoch": 2.23, "grad_norm": 5.064289801459999, "learning_rate": 1.893057171985257e-06, "loss": 0.3433, "step": 13095 }, { "epoch": 2.23, "grad_norm": 4.437705557970718, "learning_rate": 1.889183864221993e-06, "loss": 0.3432, "step": 13100 }, { "epoch": 2.23, "grad_norm": 4.890599420576279, "learning_rate": 1.885313599854059e-06, "loss": 0.3489, "step": 13105 }, { "epoch": 2.23, "grad_norm": 5.3124975455109755, "learning_rate": 1.8814463826678442e-06, "loss": 0.3511, "step": 13110 }, { "epoch": 2.23, "grad_norm": 9.871965635586927, "learning_rate": 1.877582216446745e-06, "loss": 0.3502, "step": 13115 }, { "epoch": 2.23, "grad_norm": 4.646315847922393, "learning_rate": 1.873721104971184e-06, "loss": 0.3505, "step": 13120 }, { "epoch": 2.23, "grad_norm": 6.027378772777941, "learning_rate": 1.8698630520185874e-06, "loss": 0.3479, "step": 13125 }, { "epoch": 2.23, "grad_norm": 5.038666599099896, "learning_rate": 1.8660080613633963e-06, "loss": 0.347, "step": 13130 }, { "epoch": 2.23, "grad_norm": 5.010673079253129, "learning_rate": 1.8621561367770497e-06, "loss": 0.3529, "step": 13135 }, { "epoch": 2.23, "grad_norm": 8.602362535722602, "learning_rate": 1.8583072820279885e-06, "loss": 0.3444, "step": 13140 }, { "epoch": 2.23, "grad_norm": 4.896038325823834, "learning_rate": 1.8544615008816536e-06, "loss": 0.3442, "step": 13145 }, { "epoch": 2.24, "grad_norm": 5.876220617037956, "learning_rate": 1.8506187971004753e-06, "loss": 0.3434, "step": 13150 }, { "epoch": 2.24, "grad_norm": 5.195201460399498, "learning_rate": 1.8467791744438735e-06, "loss": 0.3522, "step": 13155 }, { "epoch": 2.24, "grad_norm": 5.816129760139536, "learning_rate": 1.842942636668254e-06, "loss": 0.3514, "step": 13160 }, { "epoch": 2.24, "grad_norm": 4.810474650882271, "learning_rate": 1.8391091875270083e-06, "loss": 0.3541, "step": 13165 }, { "epoch": 2.24, "grad_norm": 5.278313176164161, "learning_rate": 1.8352788307704994e-06, "loss": 0.3551, "step": 13170 }, { "epoch": 2.24, "grad_norm": 5.719003770806738, "learning_rate": 1.8314515701460728e-06, "loss": 0.3526, "step": 13175 }, { "epoch": 2.24, "grad_norm": 4.4429678197703275, "learning_rate": 1.8276274093980378e-06, "loss": 0.3465, "step": 13180 }, { "epoch": 2.24, "grad_norm": 8.14256452509486, "learning_rate": 1.823806352267673e-06, "loss": 0.3431, "step": 13185 }, { "epoch": 2.24, "grad_norm": 4.746051066675634, "learning_rate": 1.8199884024932269e-06, "loss": 0.3499, "step": 13190 }, { "epoch": 2.24, "grad_norm": 7.986311949760354, "learning_rate": 1.8161735638098954e-06, "loss": 0.3448, "step": 13195 }, { "epoch": 2.24, "grad_norm": 6.914570318914096, "learning_rate": 1.8123618399498443e-06, "loss": 0.3507, "step": 13200 }, { "epoch": 2.24, "grad_norm": 7.1579141005865, "learning_rate": 1.8085532346421813e-06, "loss": 0.3493, "step": 13205 }, { "epoch": 2.25, "grad_norm": 6.884851366646707, "learning_rate": 1.8047477516129714e-06, "loss": 0.3462, "step": 13210 }, { "epoch": 2.25, "grad_norm": 4.922576484520524, "learning_rate": 1.800945394585218e-06, "loss": 0.3484, "step": 13215 }, { "epoch": 2.25, "grad_norm": 5.022729298673702, "learning_rate": 1.797146167278873e-06, "loss": 0.3559, "step": 13220 }, { "epoch": 2.25, "grad_norm": 10.15404561937025, "learning_rate": 1.7933500734108217e-06, "loss": 0.3461, "step": 13225 }, { "epoch": 2.25, "grad_norm": 4.677057476274528, "learning_rate": 1.7895571166948839e-06, "loss": 0.3531, "step": 13230 }, { "epoch": 2.25, "grad_norm": 16.877583132023236, "learning_rate": 1.7857673008418126e-06, "loss": 0.3499, "step": 13235 }, { "epoch": 2.25, "grad_norm": 5.918368999806285, "learning_rate": 1.7819806295592846e-06, "loss": 0.3479, "step": 13240 }, { "epoch": 2.25, "grad_norm": 13.074015583450297, "learning_rate": 1.778197106551906e-06, "loss": 0.3469, "step": 13245 }, { "epoch": 2.25, "grad_norm": 5.204338553008846, "learning_rate": 1.7744167355211967e-06, "loss": 0.3437, "step": 13250 }, { "epoch": 2.25, "grad_norm": 11.941548788313813, "learning_rate": 1.770639520165598e-06, "loss": 0.3347, "step": 13255 }, { "epoch": 2.25, "grad_norm": 4.311730600707324, "learning_rate": 1.7668654641804583e-06, "loss": 0.3517, "step": 13260 }, { "epoch": 2.26, "grad_norm": 6.954675892706177, "learning_rate": 1.7630945712580427e-06, "loss": 0.3657, "step": 13265 }, { "epoch": 2.26, "grad_norm": 4.517930593178529, "learning_rate": 1.7593268450875145e-06, "loss": 0.3505, "step": 13270 }, { "epoch": 2.26, "grad_norm": 10.857689798245556, "learning_rate": 1.7555622893549429e-06, "loss": 0.3502, "step": 13275 }, { "epoch": 2.26, "grad_norm": 6.640722053004046, "learning_rate": 1.751800907743294e-06, "loss": 0.3531, "step": 13280 }, { "epoch": 2.26, "grad_norm": 7.540364031508203, "learning_rate": 1.7480427039324266e-06, "loss": 0.3543, "step": 13285 }, { "epoch": 2.26, "grad_norm": 6.191867604546337, "learning_rate": 1.7442876815990972e-06, "loss": 0.3579, "step": 13290 }, { "epoch": 2.26, "grad_norm": 10.730074278228338, "learning_rate": 1.7405358444169413e-06, "loss": 0.3585, "step": 13295 }, { "epoch": 2.26, "grad_norm": 6.655593162182091, "learning_rate": 1.7367871960564865e-06, "loss": 0.3635, "step": 13300 }, { "epoch": 2.26, "grad_norm": 4.232812680915814, "learning_rate": 1.7330417401851317e-06, "loss": 0.3463, "step": 13305 }, { "epoch": 2.26, "grad_norm": 4.764333836098204, "learning_rate": 1.7292994804671648e-06, "loss": 0.3489, "step": 13310 }, { "epoch": 2.26, "grad_norm": 4.560495803368249, "learning_rate": 1.7255604205637305e-06, "loss": 0.3567, "step": 13315 }, { "epoch": 2.26, "grad_norm": 6.599711992803756, "learning_rate": 1.7218245641328585e-06, "loss": 0.3439, "step": 13320 }, { "epoch": 2.27, "grad_norm": 6.5763943704902665, "learning_rate": 1.7180919148294356e-06, "loss": 0.3478, "step": 13325 }, { "epoch": 2.27, "grad_norm": 6.192559204919637, "learning_rate": 1.7143624763052113e-06, "loss": 0.3555, "step": 13330 }, { "epoch": 2.27, "grad_norm": 6.530792324752998, "learning_rate": 1.7106362522088e-06, "loss": 0.3484, "step": 13335 }, { "epoch": 2.27, "grad_norm": 5.030676755391766, "learning_rate": 1.7069132461856636e-06, "loss": 0.3502, "step": 13340 }, { "epoch": 2.27, "grad_norm": 5.3288817707798675, "learning_rate": 1.703193461878122e-06, "loss": 0.3426, "step": 13345 }, { "epoch": 2.27, "grad_norm": 6.86473762505183, "learning_rate": 1.69947690292534e-06, "loss": 0.3425, "step": 13350 }, { "epoch": 2.27, "grad_norm": 6.179694085709815, "learning_rate": 1.6957635729633265e-06, "loss": 0.3411, "step": 13355 }, { "epoch": 2.27, "grad_norm": 4.590706960514583, "learning_rate": 1.6920534756249313e-06, "loss": 0.3386, "step": 13360 }, { "epoch": 2.27, "grad_norm": 6.085792274530593, "learning_rate": 1.6883466145398458e-06, "loss": 0.3517, "step": 13365 }, { "epoch": 2.27, "grad_norm": 9.457605435397486, "learning_rate": 1.6846429933345909e-06, "loss": 0.3608, "step": 13370 }, { "epoch": 2.27, "grad_norm": 4.232165753807701, "learning_rate": 1.6809426156325165e-06, "loss": 0.3487, "step": 13375 }, { "epoch": 2.27, "grad_norm": 5.420720267383381, "learning_rate": 1.6772454850538062e-06, "loss": 0.3393, "step": 13380 }, { "epoch": 2.28, "grad_norm": 7.003590199085304, "learning_rate": 1.6735516052154581e-06, "loss": 0.3444, "step": 13385 }, { "epoch": 2.28, "grad_norm": 4.89964111274835, "learning_rate": 1.6698609797313015e-06, "loss": 0.3614, "step": 13390 }, { "epoch": 2.28, "grad_norm": 4.51153800199699, "learning_rate": 1.666173612211966e-06, "loss": 0.3566, "step": 13395 }, { "epoch": 2.28, "grad_norm": 4.191882587472384, "learning_rate": 1.6624895062649087e-06, "loss": 0.3313, "step": 13400 }, { "epoch": 2.28, "grad_norm": 5.991928299959772, "learning_rate": 1.658808665494387e-06, "loss": 0.3519, "step": 13405 }, { "epoch": 2.28, "grad_norm": 4.8209245667833525, "learning_rate": 1.6551310935014686e-06, "loss": 0.3414, "step": 13410 }, { "epoch": 2.28, "grad_norm": 8.351080819575392, "learning_rate": 1.6514567938840215e-06, "loss": 0.3518, "step": 13415 }, { "epoch": 2.28, "grad_norm": 5.231585925437265, "learning_rate": 1.6477857702367088e-06, "loss": 0.3517, "step": 13420 }, { "epoch": 2.28, "grad_norm": 6.649497456792598, "learning_rate": 1.644118026150996e-06, "loss": 0.3473, "step": 13425 }, { "epoch": 2.28, "grad_norm": 6.598271100066865, "learning_rate": 1.640453565215135e-06, "loss": 0.3471, "step": 13430 }, { "epoch": 2.28, "grad_norm": 4.560216547673767, "learning_rate": 1.636792391014166e-06, "loss": 0.3402, "step": 13435 }, { "epoch": 2.28, "grad_norm": 4.525873620362791, "learning_rate": 1.6331345071299126e-06, "loss": 0.3444, "step": 13440 }, { "epoch": 2.29, "grad_norm": 4.65064534196253, "learning_rate": 1.6294799171409847e-06, "loss": 0.3358, "step": 13445 }, { "epoch": 2.29, "grad_norm": 4.884507828410077, "learning_rate": 1.6258286246227639e-06, "loss": 0.3444, "step": 13450 }, { "epoch": 2.29, "grad_norm": 5.0941968873682155, "learning_rate": 1.6221806331474105e-06, "loss": 0.3451, "step": 13455 }, { "epoch": 2.29, "grad_norm": 5.836567699217456, "learning_rate": 1.6185359462838517e-06, "loss": 0.342, "step": 13460 }, { "epoch": 2.29, "grad_norm": 6.717508618110533, "learning_rate": 1.614894567597781e-06, "loss": 0.3408, "step": 13465 }, { "epoch": 2.29, "grad_norm": 4.417072529234567, "learning_rate": 1.6112565006516628e-06, "loss": 0.348, "step": 13470 }, { "epoch": 2.29, "grad_norm": 8.232586277623032, "learning_rate": 1.6076217490047092e-06, "loss": 0.3471, "step": 13475 }, { "epoch": 2.29, "grad_norm": 12.738691740688402, "learning_rate": 1.6039903162129005e-06, "loss": 0.346, "step": 13480 }, { "epoch": 2.29, "grad_norm": 4.476783163143757, "learning_rate": 1.6003622058289625e-06, "loss": 0.339, "step": 13485 }, { "epoch": 2.29, "grad_norm": 6.1786516010413735, "learning_rate": 1.5967374214023767e-06, "loss": 0.3426, "step": 13490 }, { "epoch": 2.29, "grad_norm": 9.240119993457492, "learning_rate": 1.5931159664793638e-06, "loss": 0.3576, "step": 13495 }, { "epoch": 2.3, "grad_norm": 4.617265515344786, "learning_rate": 1.5894978446028948e-06, "loss": 0.3571, "step": 13500 }, { "epoch": 2.3, "grad_norm": 8.100899996233863, "learning_rate": 1.5858830593126733e-06, "loss": 0.3476, "step": 13505 }, { "epoch": 2.3, "grad_norm": 5.180130314731739, "learning_rate": 1.582271614145142e-06, "loss": 0.3479, "step": 13510 }, { "epoch": 2.3, "grad_norm": 4.490433584866408, "learning_rate": 1.5786635126334748e-06, "loss": 0.3493, "step": 13515 }, { "epoch": 2.3, "grad_norm": 4.815477303917275, "learning_rate": 1.5750587583075732e-06, "loss": 0.3326, "step": 13520 }, { "epoch": 2.3, "grad_norm": 4.583652687710298, "learning_rate": 1.5714573546940692e-06, "loss": 0.3457, "step": 13525 }, { "epoch": 2.3, "grad_norm": 4.454232968768451, "learning_rate": 1.5678593053163093e-06, "loss": 0.3405, "step": 13530 }, { "epoch": 2.3, "grad_norm": 8.43772017874306, "learning_rate": 1.5642646136943657e-06, "loss": 0.3352, "step": 13535 }, { "epoch": 2.3, "grad_norm": 7.122617314101993, "learning_rate": 1.5606732833450189e-06, "loss": 0.3398, "step": 13540 }, { "epoch": 2.3, "grad_norm": 4.665676662010156, "learning_rate": 1.5570853177817675e-06, "loss": 0.3396, "step": 13545 }, { "epoch": 2.3, "grad_norm": 4.511840143186782, "learning_rate": 1.5535007205148134e-06, "loss": 0.3554, "step": 13550 }, { "epoch": 2.3, "grad_norm": 4.798262124458812, "learning_rate": 1.549919495051065e-06, "loss": 0.3454, "step": 13555 }, { "epoch": 2.31, "grad_norm": 9.802067833224381, "learning_rate": 1.546341644894131e-06, "loss": 0.3409, "step": 13560 }, { "epoch": 2.31, "grad_norm": 6.21854918317779, "learning_rate": 1.5427671735443179e-06, "loss": 0.3396, "step": 13565 }, { "epoch": 2.31, "grad_norm": 9.027154433511654, "learning_rate": 1.5391960844986303e-06, "loss": 0.3347, "step": 13570 }, { "epoch": 2.31, "grad_norm": 5.9085323414991215, "learning_rate": 1.5356283812507583e-06, "loss": 0.3556, "step": 13575 }, { "epoch": 2.31, "grad_norm": 5.97029483532059, "learning_rate": 1.5320640672910847e-06, "loss": 0.3553, "step": 13580 }, { "epoch": 2.31, "grad_norm": 4.560186552875563, "learning_rate": 1.5285031461066707e-06, "loss": 0.3388, "step": 13585 }, { "epoch": 2.31, "grad_norm": 4.753874546064374, "learning_rate": 1.524945621181267e-06, "loss": 0.3467, "step": 13590 }, { "epoch": 2.31, "grad_norm": 5.343057377128513, "learning_rate": 1.521391495995289e-06, "loss": 0.3394, "step": 13595 }, { "epoch": 2.31, "grad_norm": 4.353250063424364, "learning_rate": 1.517840774025839e-06, "loss": 0.3498, "step": 13600 }, { "epoch": 2.31, "grad_norm": 7.217358273512686, "learning_rate": 1.5142934587466818e-06, "loss": 0.3369, "step": 13605 }, { "epoch": 2.31, "grad_norm": 5.135377518107784, "learning_rate": 1.5107495536282501e-06, "loss": 0.3443, "step": 13610 }, { "epoch": 2.31, "grad_norm": 4.732260437297867, "learning_rate": 1.507209062137645e-06, "loss": 0.3451, "step": 13615 }, { "epoch": 2.32, "grad_norm": 4.850580660514156, "learning_rate": 1.5036719877386219e-06, "loss": 0.3456, "step": 13620 }, { "epoch": 2.32, "grad_norm": 6.058081962272013, "learning_rate": 1.5001383338915992e-06, "loss": 0.3438, "step": 13625 }, { "epoch": 2.32, "grad_norm": 5.18034489260238, "learning_rate": 1.496608104053644e-06, "loss": 0.342, "step": 13630 }, { "epoch": 2.32, "grad_norm": 4.738934622372447, "learning_rate": 1.4930813016784757e-06, "loss": 0.3525, "step": 13635 }, { "epoch": 2.32, "grad_norm": 7.446377218869251, "learning_rate": 1.4895579302164582e-06, "loss": 0.344, "step": 13640 }, { "epoch": 2.32, "grad_norm": 5.134281629105313, "learning_rate": 1.486037993114604e-06, "loss": 0.3424, "step": 13645 }, { "epoch": 2.32, "grad_norm": 6.422097460058927, "learning_rate": 1.482521493816561e-06, "loss": 0.3414, "step": 13650 }, { "epoch": 2.32, "grad_norm": 8.43992691411807, "learning_rate": 1.4790084357626144e-06, "loss": 0.3343, "step": 13655 }, { "epoch": 2.32, "grad_norm": 6.523273549487832, "learning_rate": 1.475498822389686e-06, "loss": 0.3338, "step": 13660 }, { "epoch": 2.32, "grad_norm": 5.435185890787804, "learning_rate": 1.4719926571313225e-06, "loss": 0.3386, "step": 13665 }, { "epoch": 2.32, "grad_norm": 8.036234796730652, "learning_rate": 1.4684899434177042e-06, "loss": 0.3418, "step": 13670 }, { "epoch": 2.32, "grad_norm": 4.985559920161983, "learning_rate": 1.4649906846756246e-06, "loss": 0.3483, "step": 13675 }, { "epoch": 2.33, "grad_norm": 9.198127442793323, "learning_rate": 1.4614948843285075e-06, "loss": 0.3383, "step": 13680 }, { "epoch": 2.33, "grad_norm": 4.405724182538725, "learning_rate": 1.4580025457963853e-06, "loss": 0.3441, "step": 13685 }, { "epoch": 2.33, "grad_norm": 9.447408149158544, "learning_rate": 1.4545136724959103e-06, "loss": 0.3393, "step": 13690 }, { "epoch": 2.33, "grad_norm": 4.682546730853727, "learning_rate": 1.4510282678403398e-06, "loss": 0.3432, "step": 13695 }, { "epoch": 2.33, "grad_norm": 5.311866051573056, "learning_rate": 1.4475463352395375e-06, "loss": 0.3409, "step": 13700 }, { "epoch": 2.33, "grad_norm": 4.960718718240998, "learning_rate": 1.4440678780999755e-06, "loss": 0.3392, "step": 13705 }, { "epoch": 2.33, "grad_norm": 5.020210423031231, "learning_rate": 1.4405928998247198e-06, "loss": 0.3457, "step": 13710 }, { "epoch": 2.33, "grad_norm": 7.037044644399787, "learning_rate": 1.4371214038134369e-06, "loss": 0.3529, "step": 13715 }, { "epoch": 2.33, "grad_norm": 6.904175449827646, "learning_rate": 1.4336533934623815e-06, "loss": 0.3381, "step": 13720 }, { "epoch": 2.33, "grad_norm": 10.645039334534665, "learning_rate": 1.4301888721644059e-06, "loss": 0.3474, "step": 13725 }, { "epoch": 2.33, "grad_norm": 4.999952519704578, "learning_rate": 1.426727843308942e-06, "loss": 0.3431, "step": 13730 }, { "epoch": 2.34, "grad_norm": 5.703475806749105, "learning_rate": 1.42327031028201e-06, "loss": 0.3368, "step": 13735 }, { "epoch": 2.34, "grad_norm": 7.277357316094131, "learning_rate": 1.419816276466206e-06, "loss": 0.338, "step": 13740 }, { "epoch": 2.34, "grad_norm": 5.857459866364857, "learning_rate": 1.4163657452407037e-06, "loss": 0.3473, "step": 13745 }, { "epoch": 2.34, "grad_norm": 5.60983307528463, "learning_rate": 1.4129187199812539e-06, "loss": 0.341, "step": 13750 }, { "epoch": 2.34, "grad_norm": 5.131631743151706, "learning_rate": 1.4094752040601722e-06, "loss": 0.3404, "step": 13755 }, { "epoch": 2.34, "grad_norm": 5.013967935247216, "learning_rate": 1.4060352008463434e-06, "loss": 0.3361, "step": 13760 }, { "epoch": 2.34, "grad_norm": 4.727283192259992, "learning_rate": 1.4025987137052138e-06, "loss": 0.3404, "step": 13765 }, { "epoch": 2.34, "grad_norm": 4.777124384197018, "learning_rate": 1.399165745998795e-06, "loss": 0.3486, "step": 13770 }, { "epoch": 2.34, "grad_norm": 4.506027592709238, "learning_rate": 1.3957363010856485e-06, "loss": 0.3348, "step": 13775 }, { "epoch": 2.34, "grad_norm": 5.325554245222836, "learning_rate": 1.3923103823208956e-06, "loss": 0.3505, "step": 13780 }, { "epoch": 2.34, "grad_norm": 5.141132954617194, "learning_rate": 1.3888879930562033e-06, "loss": 0.3359, "step": 13785 }, { "epoch": 2.34, "grad_norm": 4.567212775700831, "learning_rate": 1.3854691366397866e-06, "loss": 0.3457, "step": 13790 }, { "epoch": 2.35, "grad_norm": 5.297810518315015, "learning_rate": 1.3820538164164093e-06, "loss": 0.3438, "step": 13795 }, { "epoch": 2.35, "grad_norm": 9.281204002436738, "learning_rate": 1.378642035727365e-06, "loss": 0.3369, "step": 13800 }, { "epoch": 2.35, "grad_norm": 4.36525808599186, "learning_rate": 1.375233797910495e-06, "loss": 0.3365, "step": 13805 }, { "epoch": 2.35, "grad_norm": 4.4490501640437765, "learning_rate": 1.3718291063001682e-06, "loss": 0.3398, "step": 13810 }, { "epoch": 2.35, "grad_norm": 5.715687665200049, "learning_rate": 1.3684279642272885e-06, "loss": 0.3455, "step": 13815 }, { "epoch": 2.35, "grad_norm": 6.08889261493712, "learning_rate": 1.3650303750192817e-06, "loss": 0.344, "step": 13820 }, { "epoch": 2.35, "grad_norm": 8.533607736470023, "learning_rate": 1.361636342000105e-06, "loss": 0.3381, "step": 13825 }, { "epoch": 2.35, "grad_norm": 9.119921821708115, "learning_rate": 1.35824586849023e-06, "loss": 0.3522, "step": 13830 }, { "epoch": 2.35, "grad_norm": 7.88098686226356, "learning_rate": 1.3548589578066483e-06, "loss": 0.3347, "step": 13835 }, { "epoch": 2.35, "grad_norm": 8.405121362717537, "learning_rate": 1.351475613262867e-06, "loss": 0.343, "step": 13840 }, { "epoch": 2.35, "grad_norm": 5.427892201945487, "learning_rate": 1.3480958381689007e-06, "loss": 0.3356, "step": 13845 }, { "epoch": 2.35, "grad_norm": 5.109650437688848, "learning_rate": 1.3447196358312785e-06, "loss": 0.336, "step": 13850 }, { "epoch": 2.36, "grad_norm": 6.899595498191738, "learning_rate": 1.3413470095530267e-06, "loss": 0.3448, "step": 13855 }, { "epoch": 2.36, "grad_norm": 13.105610717701218, "learning_rate": 1.3379779626336792e-06, "loss": 0.3493, "step": 13860 }, { "epoch": 2.36, "grad_norm": 4.477048054743193, "learning_rate": 1.3346124983692633e-06, "loss": 0.3459, "step": 13865 }, { "epoch": 2.36, "grad_norm": 7.3026896755353565, "learning_rate": 1.3312506200523056e-06, "loss": 0.3442, "step": 13870 }, { "epoch": 2.36, "grad_norm": 4.830324581300535, "learning_rate": 1.3278923309718216e-06, "loss": 0.3424, "step": 13875 }, { "epoch": 2.36, "grad_norm": 5.589760157400933, "learning_rate": 1.3245376344133154e-06, "loss": 0.3367, "step": 13880 }, { "epoch": 2.36, "grad_norm": 7.4284197300354275, "learning_rate": 1.321186533658778e-06, "loss": 0.345, "step": 13885 }, { "epoch": 2.36, "grad_norm": 7.245485052632024, "learning_rate": 1.3178390319866796e-06, "loss": 0.343, "step": 13890 }, { "epoch": 2.36, "grad_norm": 7.468765278758668, "learning_rate": 1.314495132671974e-06, "loss": 0.3358, "step": 13895 }, { "epoch": 2.36, "grad_norm": 6.730884937980976, "learning_rate": 1.3111548389860856e-06, "loss": 0.3532, "step": 13900 }, { "epoch": 2.36, "grad_norm": 6.02533076184626, "learning_rate": 1.307818154196917e-06, "loss": 0.3387, "step": 13905 }, { "epoch": 2.36, "grad_norm": 5.535237415331175, "learning_rate": 1.3044850815688336e-06, "loss": 0.3372, "step": 13910 }, { "epoch": 2.37, "grad_norm": 6.078079559136438, "learning_rate": 1.3011556243626744e-06, "loss": 0.3395, "step": 13915 }, { "epoch": 2.37, "grad_norm": 5.862215623470651, "learning_rate": 1.2978297858357319e-06, "loss": 0.3286, "step": 13920 }, { "epoch": 2.37, "grad_norm": 5.075655997097684, "learning_rate": 1.294507569241768e-06, "loss": 0.3476, "step": 13925 }, { "epoch": 2.37, "grad_norm": 6.601342646915005, "learning_rate": 1.291188977830995e-06, "loss": 0.3412, "step": 13930 }, { "epoch": 2.37, "grad_norm": 6.377658288362509, "learning_rate": 1.2878740148500784e-06, "loss": 0.3469, "step": 13935 }, { "epoch": 2.37, "grad_norm": 10.988578510111449, "learning_rate": 1.2845626835421405e-06, "loss": 0.3386, "step": 13940 }, { "epoch": 2.37, "grad_norm": 5.089807389276085, "learning_rate": 1.2812549871467417e-06, "loss": 0.3514, "step": 13945 }, { "epoch": 2.37, "grad_norm": 8.255751972220425, "learning_rate": 1.2779509288998937e-06, "loss": 0.3441, "step": 13950 }, { "epoch": 2.37, "grad_norm": 7.210664216579693, "learning_rate": 1.2746505120340447e-06, "loss": 0.3439, "step": 13955 }, { "epoch": 2.37, "grad_norm": 8.13247241723326, "learning_rate": 1.271353739778081e-06, "loss": 0.3329, "step": 13960 }, { "epoch": 2.37, "grad_norm": 4.318434472721774, "learning_rate": 1.2680606153573233e-06, "loss": 0.3416, "step": 13965 }, { "epoch": 2.38, "grad_norm": 7.922606543380108, "learning_rate": 1.264771141993526e-06, "loss": 0.3374, "step": 13970 }, { "epoch": 2.38, "grad_norm": 7.009083560944203, "learning_rate": 1.261485322904869e-06, "loss": 0.3347, "step": 13975 }, { "epoch": 2.38, "grad_norm": 5.43461490473705, "learning_rate": 1.2582031613059553e-06, "loss": 0.3399, "step": 13980 }, { "epoch": 2.38, "grad_norm": 7.245591319250889, "learning_rate": 1.2549246604078164e-06, "loss": 0.3403, "step": 13985 }, { "epoch": 2.38, "grad_norm": 8.551751274701207, "learning_rate": 1.2516498234178937e-06, "loss": 0.3415, "step": 13990 }, { "epoch": 2.38, "grad_norm": 4.776017027952166, "learning_rate": 1.2483786535400538e-06, "loss": 0.3407, "step": 13995 }, { "epoch": 2.38, "grad_norm": 5.242890436893526, "learning_rate": 1.2451111539745646e-06, "loss": 0.3426, "step": 14000 }, { "epoch": 2.38, "grad_norm": 5.3470389354152426, "learning_rate": 1.2418473279181132e-06, "loss": 0.3338, "step": 14005 }, { "epoch": 2.38, "grad_norm": 6.534546174432391, "learning_rate": 1.238587178563786e-06, "loss": 0.3398, "step": 14010 }, { "epoch": 2.38, "grad_norm": 6.389675063098023, "learning_rate": 1.2353307091010775e-06, "loss": 0.3376, "step": 14015 }, { "epoch": 2.38, "grad_norm": 6.168606843433336, "learning_rate": 1.2320779227158786e-06, "loss": 0.3459, "step": 14020 }, { "epoch": 2.38, "grad_norm": 4.196101054526336, "learning_rate": 1.2288288225904766e-06, "loss": 0.3403, "step": 14025 }, { "epoch": 2.39, "grad_norm": 4.274286352545161, "learning_rate": 1.225583411903556e-06, "loss": 0.3325, "step": 14030 }, { "epoch": 2.39, "grad_norm": 4.637182685870529, "learning_rate": 1.2223416938301885e-06, "loss": 0.3291, "step": 14035 }, { "epoch": 2.39, "grad_norm": 4.649448721624295, "learning_rate": 1.2191036715418347e-06, "loss": 0.3358, "step": 14040 }, { "epoch": 2.39, "grad_norm": 4.74179880034857, "learning_rate": 1.2158693482063377e-06, "loss": 0.3353, "step": 14045 }, { "epoch": 2.39, "grad_norm": 6.631733274197324, "learning_rate": 1.2126387269879254e-06, "loss": 0.3378, "step": 14050 }, { "epoch": 2.39, "grad_norm": 4.935487631839041, "learning_rate": 1.2094118110471998e-06, "loss": 0.3474, "step": 14055 }, { "epoch": 2.39, "grad_norm": 3.9412089332004983, "learning_rate": 1.2061886035411412e-06, "loss": 0.3331, "step": 14060 }, { "epoch": 2.39, "grad_norm": 5.5029694305110555, "learning_rate": 1.202969107623101e-06, "loss": 0.3372, "step": 14065 }, { "epoch": 2.39, "grad_norm": 5.074013228108049, "learning_rate": 1.1997533264427958e-06, "loss": 0.3297, "step": 14070 }, { "epoch": 2.39, "grad_norm": 6.3155285236036915, "learning_rate": 1.1965412631463164e-06, "loss": 0.3376, "step": 14075 }, { "epoch": 2.39, "grad_norm": 4.655360501282674, "learning_rate": 1.193332920876104e-06, "loss": 0.3341, "step": 14080 }, { "epoch": 2.39, "grad_norm": 6.8202273238215465, "learning_rate": 1.190128302770972e-06, "loss": 0.3317, "step": 14085 }, { "epoch": 2.4, "grad_norm": 5.661250004762066, "learning_rate": 1.1869274119660818e-06, "loss": 0.3341, "step": 14090 }, { "epoch": 2.4, "grad_norm": 4.750711228093705, "learning_rate": 1.1837302515929526e-06, "loss": 0.3319, "step": 14095 }, { "epoch": 2.4, "grad_norm": 5.636421776212692, "learning_rate": 1.180536824779452e-06, "loss": 0.3328, "step": 14100 }, { "epoch": 2.4, "grad_norm": 7.944300055635633, "learning_rate": 1.177347134649796e-06, "loss": 0.35, "step": 14105 }, { "epoch": 2.4, "grad_norm": 5.783967696727534, "learning_rate": 1.1741611843245448e-06, "loss": 0.3384, "step": 14110 }, { "epoch": 2.4, "grad_norm": 6.422475390958299, "learning_rate": 1.1709789769205993e-06, "loss": 0.3407, "step": 14115 }, { "epoch": 2.4, "grad_norm": 6.936647200414145, "learning_rate": 1.1678005155511984e-06, "loss": 0.3398, "step": 14120 }, { "epoch": 2.4, "grad_norm": 6.250589249776812, "learning_rate": 1.164625803325915e-06, "loss": 0.3458, "step": 14125 }, { "epoch": 2.4, "grad_norm": 4.970889235761936, "learning_rate": 1.1614548433506596e-06, "loss": 0.3436, "step": 14130 }, { "epoch": 2.4, "grad_norm": 4.836692761132451, "learning_rate": 1.1582876387276636e-06, "loss": 0.336, "step": 14135 }, { "epoch": 2.4, "grad_norm": 9.944435936043519, "learning_rate": 1.1551241925554923e-06, "loss": 0.3367, "step": 14140 }, { "epoch": 2.4, "grad_norm": 6.236576529492092, "learning_rate": 1.1519645079290277e-06, "loss": 0.3384, "step": 14145 }, { "epoch": 2.41, "grad_norm": 4.470189006998729, "learning_rate": 1.1488085879394773e-06, "loss": 0.3485, "step": 14150 }, { "epoch": 2.41, "grad_norm": 4.5188328073800585, "learning_rate": 1.145656435674361e-06, "loss": 0.3329, "step": 14155 }, { "epoch": 2.41, "grad_norm": 4.352260040907254, "learning_rate": 1.1425080542175143e-06, "loss": 0.3341, "step": 14160 }, { "epoch": 2.41, "grad_norm": 5.822775962292289, "learning_rate": 1.1393634466490843e-06, "loss": 0.3405, "step": 14165 }, { "epoch": 2.41, "grad_norm": 4.361876934910723, "learning_rate": 1.1362226160455237e-06, "loss": 0.3344, "step": 14170 }, { "epoch": 2.41, "grad_norm": 8.88786874074134, "learning_rate": 1.1330855654795948e-06, "loss": 0.3389, "step": 14175 }, { "epoch": 2.41, "grad_norm": 14.262895017022414, "learning_rate": 1.1299522980203554e-06, "loss": 0.3286, "step": 14180 }, { "epoch": 2.41, "grad_norm": 5.149422771083544, "learning_rate": 1.1268228167331686e-06, "loss": 0.3461, "step": 14185 }, { "epoch": 2.41, "grad_norm": 4.574946646085239, "learning_rate": 1.123697124679688e-06, "loss": 0.349, "step": 14190 }, { "epoch": 2.41, "grad_norm": 16.266679457171783, "learning_rate": 1.120575224917866e-06, "loss": 0.3411, "step": 14195 }, { "epoch": 2.41, "grad_norm": 9.663923228409695, "learning_rate": 1.1174571205019358e-06, "loss": 0.3334, "step": 14200 }, { "epoch": 2.41, "grad_norm": 6.356748299028564, "learning_rate": 1.114342814482428e-06, "loss": 0.3316, "step": 14205 }, { "epoch": 2.42, "grad_norm": 6.616836478636501, "learning_rate": 1.11123230990615e-06, "loss": 0.3516, "step": 14210 }, { "epoch": 2.42, "grad_norm": 4.507223327833415, "learning_rate": 1.1081256098161913e-06, "loss": 0.3408, "step": 14215 }, { "epoch": 2.42, "grad_norm": 7.034187350742189, "learning_rate": 1.1050227172519234e-06, "loss": 0.3415, "step": 14220 }, { "epoch": 2.42, "grad_norm": 15.689685827911726, "learning_rate": 1.1019236352489865e-06, "loss": 0.3412, "step": 14225 }, { "epoch": 2.42, "grad_norm": 7.8734984726878245, "learning_rate": 1.098828366839299e-06, "loss": 0.3357, "step": 14230 }, { "epoch": 2.42, "grad_norm": 4.382933277091549, "learning_rate": 1.0957369150510445e-06, "loss": 0.3475, "step": 14235 }, { "epoch": 2.42, "grad_norm": 6.1833242818970815, "learning_rate": 1.0926492829086728e-06, "loss": 0.3384, "step": 14240 }, { "epoch": 2.42, "grad_norm": 7.661831755769597, "learning_rate": 1.089565473432897e-06, "loss": 0.3314, "step": 14245 }, { "epoch": 2.42, "grad_norm": 5.278366302398714, "learning_rate": 1.086485489640694e-06, "loss": 0.3407, "step": 14250 }, { "epoch": 2.42, "grad_norm": 9.140543659429433, "learning_rate": 1.0834093345452934e-06, "loss": 0.3349, "step": 14255 }, { "epoch": 2.42, "grad_norm": 4.1394738008804595, "learning_rate": 1.0803370111561789e-06, "loss": 0.3335, "step": 14260 }, { "epoch": 2.43, "grad_norm": 4.4178291423734315, "learning_rate": 1.0772685224790907e-06, "loss": 0.3406, "step": 14265 }, { "epoch": 2.43, "grad_norm": 6.519217722729064, "learning_rate": 1.0742038715160108e-06, "loss": 0.3403, "step": 14270 }, { "epoch": 2.43, "grad_norm": 4.799302187562816, "learning_rate": 1.0711430612651747e-06, "loss": 0.3458, "step": 14275 }, { "epoch": 2.43, "grad_norm": 4.851722917122433, "learning_rate": 1.0680860947210492e-06, "loss": 0.3329, "step": 14280 }, { "epoch": 2.43, "grad_norm": 7.937356964578476, "learning_rate": 1.065032974874352e-06, "loss": 0.332, "step": 14285 }, { "epoch": 2.43, "grad_norm": 5.096391120098633, "learning_rate": 1.0619837047120296e-06, "loss": 0.3375, "step": 14290 }, { "epoch": 2.43, "grad_norm": 4.967495395371477, "learning_rate": 1.0589382872172682e-06, "loss": 0.3434, "step": 14295 }, { "epoch": 2.43, "grad_norm": 6.419841773328182, "learning_rate": 1.0558967253694802e-06, "loss": 0.336, "step": 14300 }, { "epoch": 2.43, "grad_norm": 6.0973018817246984, "learning_rate": 1.0528590221443064e-06, "loss": 0.3383, "step": 14305 }, { "epoch": 2.43, "grad_norm": 5.074301199329289, "learning_rate": 1.0498251805136162e-06, "loss": 0.3361, "step": 14310 }, { "epoch": 2.43, "grad_norm": 4.976987817402853, "learning_rate": 1.0467952034454976e-06, "loss": 0.334, "step": 14315 }, { "epoch": 2.43, "grad_norm": 6.122772139785621, "learning_rate": 1.0437690939042594e-06, "loss": 0.3376, "step": 14320 }, { "epoch": 2.44, "grad_norm": 4.564482802942002, "learning_rate": 1.0407468548504234e-06, "loss": 0.3302, "step": 14325 }, { "epoch": 2.44, "grad_norm": 6.250238575968118, "learning_rate": 1.0377284892407318e-06, "loss": 0.3333, "step": 14330 }, { "epoch": 2.44, "grad_norm": 5.218686261207107, "learning_rate": 1.0347140000281297e-06, "loss": 0.3353, "step": 14335 }, { "epoch": 2.44, "grad_norm": 5.063182793310301, "learning_rate": 1.0317033901617763e-06, "loss": 0.3419, "step": 14340 }, { "epoch": 2.44, "grad_norm": 4.656996118444409, "learning_rate": 1.0286966625870304e-06, "loss": 0.3317, "step": 14345 }, { "epoch": 2.44, "grad_norm": 5.4213939092521874, "learning_rate": 1.0256938202454536e-06, "loss": 0.3429, "step": 14350 }, { "epoch": 2.44, "grad_norm": 8.364522415132814, "learning_rate": 1.022694866074812e-06, "loss": 0.334, "step": 14355 }, { "epoch": 2.44, "grad_norm": 4.928770765768932, "learning_rate": 1.0196998030090577e-06, "loss": 0.33, "step": 14360 }, { "epoch": 2.44, "grad_norm": 6.018321446051179, "learning_rate": 1.0167086339783455e-06, "loss": 0.3488, "step": 14365 }, { "epoch": 2.44, "grad_norm": 5.015085898456728, "learning_rate": 1.0137213619090142e-06, "loss": 0.336, "step": 14370 }, { "epoch": 2.44, "grad_norm": 10.398286045146108, "learning_rate": 1.0107379897235959e-06, "loss": 0.356, "step": 14375 }, { "epoch": 2.44, "grad_norm": 4.891907399568669, "learning_rate": 1.0077585203408003e-06, "loss": 0.3352, "step": 14380 }, { "epoch": 2.45, "grad_norm": 6.407190738784242, "learning_rate": 1.0047829566755262e-06, "loss": 0.3306, "step": 14385 }, { "epoch": 2.45, "grad_norm": 8.082744545042239, "learning_rate": 1.001811301638846e-06, "loss": 0.3399, "step": 14390 }, { "epoch": 2.45, "grad_norm": 4.318854243071567, "learning_rate": 9.988435581380102e-07, "loss": 0.3395, "step": 14395 }, { "epoch": 2.45, "grad_norm": 4.723240011964577, "learning_rate": 9.95879729076442e-07, "loss": 0.3308, "step": 14400 }, { "epoch": 2.45, "grad_norm": 5.3405520836823985, "learning_rate": 9.929198173537346e-07, "loss": 0.3298, "step": 14405 }, { "epoch": 2.45, "grad_norm": 5.770996377634228, "learning_rate": 9.899638258656518e-07, "loss": 0.3364, "step": 14410 }, { "epoch": 2.45, "grad_norm": 4.911279512730939, "learning_rate": 9.870117575041172e-07, "loss": 0.3254, "step": 14415 }, { "epoch": 2.45, "grad_norm": 5.785308442014827, "learning_rate": 9.840636151572215e-07, "loss": 0.3395, "step": 14420 }, { "epoch": 2.45, "grad_norm": 4.7477711398773295, "learning_rate": 9.811194017092086e-07, "loss": 0.3422, "step": 14425 }, { "epoch": 2.45, "grad_norm": 4.243548036699693, "learning_rate": 9.781791200404855e-07, "loss": 0.3408, "step": 14430 }, { "epoch": 2.45, "grad_norm": 4.289453405665378, "learning_rate": 9.752427730276076e-07, "loss": 0.3391, "step": 14435 }, { "epoch": 2.45, "grad_norm": 5.724613198135481, "learning_rate": 9.723103635432823e-07, "loss": 0.3361, "step": 14440 }, { "epoch": 2.46, "grad_norm": 8.191455303424572, "learning_rate": 9.693818944563644e-07, "loss": 0.341, "step": 14445 }, { "epoch": 2.46, "grad_norm": 11.469298515326782, "learning_rate": 9.664573686318535e-07, "loss": 0.3313, "step": 14450 }, { "epoch": 2.46, "grad_norm": 5.138448172334449, "learning_rate": 9.635367889308945e-07, "loss": 0.3467, "step": 14455 }, { "epoch": 2.46, "grad_norm": 4.280952234714455, "learning_rate": 9.606201582107666e-07, "loss": 0.3464, "step": 14460 }, { "epoch": 2.46, "grad_norm": 4.780709911013356, "learning_rate": 9.577074793248908e-07, "loss": 0.3312, "step": 14465 }, { "epoch": 2.46, "grad_norm": 4.421025545234076, "learning_rate": 9.547987551228172e-07, "loss": 0.3322, "step": 14470 }, { "epoch": 2.46, "grad_norm": 4.595092239009826, "learning_rate": 9.518939884502315e-07, "loss": 0.3373, "step": 14475 }, { "epoch": 2.46, "grad_norm": 6.702879081922312, "learning_rate": 9.489931821489439e-07, "loss": 0.3409, "step": 14480 }, { "epoch": 2.46, "grad_norm": 5.4970994508148445, "learning_rate": 9.460963390568922e-07, "loss": 0.3347, "step": 14485 }, { "epoch": 2.46, "grad_norm": 4.376402649743197, "learning_rate": 9.432034620081349e-07, "loss": 0.3371, "step": 14490 }, { "epoch": 2.46, "grad_norm": 4.8537603318498395, "learning_rate": 9.403145538328512e-07, "loss": 0.3202, "step": 14495 }, { "epoch": 2.47, "grad_norm": 4.55627346536457, "learning_rate": 9.3742961735734e-07, "loss": 0.3382, "step": 14500 }, { "epoch": 2.47, "grad_norm": 4.337405894436932, "learning_rate": 9.3454865540401e-07, "loss": 0.3389, "step": 14505 }, { "epoch": 2.47, "grad_norm": 5.558365503858435, "learning_rate": 9.31671670791387e-07, "loss": 0.3355, "step": 14510 }, { "epoch": 2.47, "grad_norm": 4.468843945765404, "learning_rate": 9.287986663340998e-07, "loss": 0.3321, "step": 14515 }, { "epoch": 2.47, "grad_norm": 8.908580862152249, "learning_rate": 9.259296448428895e-07, "loss": 0.3367, "step": 14520 }, { "epoch": 2.47, "grad_norm": 5.877623694657907, "learning_rate": 9.230646091245932e-07, "loss": 0.3399, "step": 14525 }, { "epoch": 2.47, "grad_norm": 4.82838509264817, "learning_rate": 9.202035619821553e-07, "loss": 0.3391, "step": 14530 }, { "epoch": 2.47, "grad_norm": 7.028694803195866, "learning_rate": 9.173465062146148e-07, "loss": 0.3369, "step": 14535 }, { "epoch": 2.47, "grad_norm": 4.237706071931036, "learning_rate": 9.14493444617105e-07, "loss": 0.334, "step": 14540 }, { "epoch": 2.47, "grad_norm": 4.825383076615841, "learning_rate": 9.11644379980855e-07, "loss": 0.3352, "step": 14545 }, { "epoch": 2.47, "grad_norm": 4.316499817256464, "learning_rate": 9.087993150931801e-07, "loss": 0.3326, "step": 14550 }, { "epoch": 2.47, "grad_norm": 4.911143231324706, "learning_rate": 9.05958252737486e-07, "loss": 0.3298, "step": 14555 }, { "epoch": 2.48, "grad_norm": 4.6358832021455445, "learning_rate": 9.03121195693259e-07, "loss": 0.332, "step": 14560 }, { "epoch": 2.48, "grad_norm": 4.947430264832714, "learning_rate": 9.002881467360692e-07, "loss": 0.3344, "step": 14565 }, { "epoch": 2.48, "grad_norm": 7.248449927175377, "learning_rate": 8.974591086375634e-07, "loss": 0.335, "step": 14570 }, { "epoch": 2.48, "grad_norm": 5.458948458123161, "learning_rate": 8.946340841654677e-07, "loss": 0.331, "step": 14575 }, { "epoch": 2.48, "grad_norm": 5.198133276647935, "learning_rate": 8.918130760835797e-07, "loss": 0.329, "step": 14580 }, { "epoch": 2.48, "grad_norm": 4.712618679236314, "learning_rate": 8.88996087151765e-07, "loss": 0.3355, "step": 14585 }, { "epoch": 2.48, "grad_norm": 5.002139102868113, "learning_rate": 8.861831201259635e-07, "loss": 0.3386, "step": 14590 }, { "epoch": 2.48, "grad_norm": 5.133980115384712, "learning_rate": 8.833741777581739e-07, "loss": 0.3274, "step": 14595 }, { "epoch": 2.48, "grad_norm": 8.734894342249994, "learning_rate": 8.80569262796464e-07, "loss": 0.333, "step": 14600 }, { "epoch": 2.48, "grad_norm": 4.053567103499762, "learning_rate": 8.777683779849527e-07, "loss": 0.3423, "step": 14605 }, { "epoch": 2.48, "grad_norm": 4.718447412100042, "learning_rate": 8.749715260638247e-07, "loss": 0.3273, "step": 14610 }, { "epoch": 2.48, "grad_norm": 6.513812256605959, "learning_rate": 8.721787097693141e-07, "loss": 0.3336, "step": 14615 }, { "epoch": 2.49, "grad_norm": 4.621238267112457, "learning_rate": 8.693899318337095e-07, "loss": 0.3302, "step": 14620 }, { "epoch": 2.49, "grad_norm": 4.445122668669881, "learning_rate": 8.666051949853472e-07, "loss": 0.3257, "step": 14625 }, { "epoch": 2.49, "grad_norm": 4.709037015599132, "learning_rate": 8.638245019486091e-07, "loss": 0.3338, "step": 14630 }, { "epoch": 2.49, "grad_norm": 4.360940980024568, "learning_rate": 8.610478554439244e-07, "loss": 0.3343, "step": 14635 }, { "epoch": 2.49, "grad_norm": 4.6143899100962, "learning_rate": 8.582752581877607e-07, "loss": 0.3326, "step": 14640 }, { "epoch": 2.49, "grad_norm": 6.7097884600896, "learning_rate": 8.555067128926236e-07, "loss": 0.3355, "step": 14645 }, { "epoch": 2.49, "grad_norm": 4.298828685434539, "learning_rate": 8.52742222267055e-07, "loss": 0.3291, "step": 14650 }, { "epoch": 2.49, "grad_norm": 4.260336253602938, "learning_rate": 8.499817890156331e-07, "loss": 0.3317, "step": 14655 }, { "epoch": 2.49, "grad_norm": 5.064517398086245, "learning_rate": 8.47225415838962e-07, "loss": 0.3272, "step": 14660 }, { "epoch": 2.49, "grad_norm": 4.483713586833253, "learning_rate": 8.44473105433678e-07, "loss": 0.3346, "step": 14665 }, { "epoch": 2.49, "grad_norm": 4.264668270536011, "learning_rate": 8.417248604924394e-07, "loss": 0.3399, "step": 14670 }, { "epoch": 2.49, "grad_norm": 5.172403867722749, "learning_rate": 8.389806837039272e-07, "loss": 0.3337, "step": 14675 }, { "epoch": 2.5, "grad_norm": 4.8912479936973945, "learning_rate": 8.362405777528471e-07, "loss": 0.3433, "step": 14680 }, { "epoch": 2.5, "grad_norm": 5.494072929459606, "learning_rate": 8.335045453199142e-07, "loss": 0.3418, "step": 14685 }, { "epoch": 2.5, "grad_norm": 5.405357055704501, "learning_rate": 8.307725890818658e-07, "loss": 0.3388, "step": 14690 }, { "epoch": 2.5, "grad_norm": 7.187527389254743, "learning_rate": 8.280447117114465e-07, "loss": 0.3323, "step": 14695 }, { "epoch": 2.5, "grad_norm": 4.665466163749549, "learning_rate": 8.25320915877415e-07, "loss": 0.3339, "step": 14700 }, { "epoch": 2.5, "grad_norm": 4.553292854497034, "learning_rate": 8.226012042445308e-07, "loss": 0.3361, "step": 14705 }, { "epoch": 2.5, "grad_norm": 6.328542620083238, "learning_rate": 8.198855794735644e-07, "loss": 0.3329, "step": 14710 }, { "epoch": 2.5, "grad_norm": 5.279384402377715, "learning_rate": 8.171740442212833e-07, "loss": 0.3492, "step": 14715 }, { "epoch": 2.5, "grad_norm": 5.645996678991626, "learning_rate": 8.144666011404556e-07, "loss": 0.3399, "step": 14720 }, { "epoch": 2.5, "grad_norm": 4.832297637021295, "learning_rate": 8.117632528798458e-07, "loss": 0.3485, "step": 14725 }, { "epoch": 2.5, "grad_norm": 4.637719837256791, "learning_rate": 8.090640020842117e-07, "loss": 0.3327, "step": 14730 }, { "epoch": 2.51, "grad_norm": 4.937432153179189, "learning_rate": 8.063688513943046e-07, "loss": 0.3298, "step": 14735 }, { "epoch": 2.51, "grad_norm": 5.247228681295939, "learning_rate": 8.036778034468617e-07, "loss": 0.3348, "step": 14740 }, { "epoch": 2.51, "grad_norm": 4.601109205992861, "learning_rate": 8.009908608746097e-07, "loss": 0.3265, "step": 14745 }, { "epoch": 2.51, "grad_norm": 7.610534797221197, "learning_rate": 7.983080263062542e-07, "loss": 0.3333, "step": 14750 }, { "epoch": 2.51, "grad_norm": 6.139040766800028, "learning_rate": 7.956293023664879e-07, "loss": 0.3309, "step": 14755 }, { "epoch": 2.51, "grad_norm": 11.412769842768002, "learning_rate": 7.929546916759772e-07, "loss": 0.3306, "step": 14760 }, { "epoch": 2.51, "grad_norm": 7.884387501900604, "learning_rate": 7.902841968513652e-07, "loss": 0.3341, "step": 14765 }, { "epoch": 2.51, "grad_norm": 6.846825276989522, "learning_rate": 7.876178205052698e-07, "loss": 0.3415, "step": 14770 }, { "epoch": 2.51, "grad_norm": 7.665868376155784, "learning_rate": 7.849555652462775e-07, "loss": 0.3269, "step": 14775 }, { "epoch": 2.51, "grad_norm": 4.423483816559562, "learning_rate": 7.822974336789468e-07, "loss": 0.3184, "step": 14780 }, { "epoch": 2.51, "grad_norm": 5.625052189667723, "learning_rate": 7.796434284037973e-07, "loss": 0.3375, "step": 14785 }, { "epoch": 2.51, "grad_norm": 4.130904432799836, "learning_rate": 7.769935520173155e-07, "loss": 0.3402, "step": 14790 }, { "epoch": 2.52, "grad_norm": 4.298938169660414, "learning_rate": 7.743478071119459e-07, "loss": 0.3269, "step": 14795 }, { "epoch": 2.52, "grad_norm": 4.9781642622490025, "learning_rate": 7.717061962760947e-07, "loss": 0.3334, "step": 14800 }, { "epoch": 2.52, "grad_norm": 4.914812405158503, "learning_rate": 7.690687220941162e-07, "loss": 0.3292, "step": 14805 }, { "epoch": 2.52, "grad_norm": 4.047822614780733, "learning_rate": 7.664353871463264e-07, "loss": 0.3182, "step": 14810 }, { "epoch": 2.52, "grad_norm": 7.91907198905064, "learning_rate": 7.638061940089875e-07, "loss": 0.3363, "step": 14815 }, { "epoch": 2.52, "grad_norm": 5.3497987908934865, "learning_rate": 7.611811452543072e-07, "loss": 0.331, "step": 14820 }, { "epoch": 2.52, "grad_norm": 4.267298014785521, "learning_rate": 7.585602434504453e-07, "loss": 0.3287, "step": 14825 }, { "epoch": 2.52, "grad_norm": 4.802029820505902, "learning_rate": 7.559434911614977e-07, "loss": 0.3252, "step": 14830 }, { "epoch": 2.52, "grad_norm": 4.32890062588132, "learning_rate": 7.533308909475068e-07, "loss": 0.3271, "step": 14835 }, { "epoch": 2.52, "grad_norm": 4.268375713285703, "learning_rate": 7.507224453644474e-07, "loss": 0.3405, "step": 14840 }, { "epoch": 2.52, "grad_norm": 4.399286170087374, "learning_rate": 7.481181569642332e-07, "loss": 0.3243, "step": 14845 }, { "epoch": 2.52, "grad_norm": 4.304404015063495, "learning_rate": 7.455180282947083e-07, "loss": 0.3337, "step": 14850 }, { "epoch": 2.53, "grad_norm": 5.029073052501918, "learning_rate": 7.429220618996507e-07, "loss": 0.3262, "step": 14855 }, { "epoch": 2.53, "grad_norm": 6.378391554889924, "learning_rate": 7.40330260318764e-07, "loss": 0.3384, "step": 14860 }, { "epoch": 2.53, "grad_norm": 4.621059063167957, "learning_rate": 7.377426260876757e-07, "loss": 0.3335, "step": 14865 }, { "epoch": 2.53, "grad_norm": 5.791322359845037, "learning_rate": 7.351591617379411e-07, "loss": 0.3324, "step": 14870 }, { "epoch": 2.53, "grad_norm": 4.678482618842939, "learning_rate": 7.325798697970305e-07, "loss": 0.3365, "step": 14875 }, { "epoch": 2.53, "grad_norm": 4.583563067785552, "learning_rate": 7.300047527883375e-07, "loss": 0.3338, "step": 14880 }, { "epoch": 2.53, "grad_norm": 5.391075538129845, "learning_rate": 7.274338132311653e-07, "loss": 0.3414, "step": 14885 }, { "epoch": 2.53, "grad_norm": 4.478843806589986, "learning_rate": 7.248670536407354e-07, "loss": 0.3129, "step": 14890 }, { "epoch": 2.53, "grad_norm": 4.5267513455016415, "learning_rate": 7.223044765281767e-07, "loss": 0.3326, "step": 14895 }, { "epoch": 2.53, "grad_norm": 5.660762723698685, "learning_rate": 7.197460844005294e-07, "loss": 0.3379, "step": 14900 }, { "epoch": 2.53, "grad_norm": 8.65434649620914, "learning_rate": 7.171918797607369e-07, "loss": 0.3294, "step": 14905 }, { "epoch": 2.53, "grad_norm": 7.1632545699544945, "learning_rate": 7.146418651076443e-07, "loss": 0.3294, "step": 14910 }, { "epoch": 2.54, "grad_norm": 10.121338816522481, "learning_rate": 7.12096042936003e-07, "loss": 0.3395, "step": 14915 }, { "epoch": 2.54, "grad_norm": 8.581526830547222, "learning_rate": 7.095544157364575e-07, "loss": 0.3359, "step": 14920 }, { "epoch": 2.54, "grad_norm": 5.044604807753416, "learning_rate": 7.070169859955506e-07, "loss": 0.335, "step": 14925 }, { "epoch": 2.54, "grad_norm": 7.000469258632066, "learning_rate": 7.04483756195718e-07, "loss": 0.3355, "step": 14930 }, { "epoch": 2.54, "grad_norm": 6.017536310415441, "learning_rate": 7.019547288152872e-07, "loss": 0.3344, "step": 14935 }, { "epoch": 2.54, "grad_norm": 7.900616115213088, "learning_rate": 6.994299063284738e-07, "loss": 0.3161, "step": 14940 }, { "epoch": 2.54, "grad_norm": 5.694735372740905, "learning_rate": 6.969092912053798e-07, "loss": 0.3301, "step": 14945 }, { "epoch": 2.54, "grad_norm": 4.03599084217576, "learning_rate": 6.943928859119914e-07, "loss": 0.3233, "step": 14950 }, { "epoch": 2.54, "grad_norm": 7.050873466380064, "learning_rate": 6.91880692910174e-07, "loss": 0.3257, "step": 14955 }, { "epoch": 2.54, "grad_norm": 4.478973666964457, "learning_rate": 6.893727146576773e-07, "loss": 0.3301, "step": 14960 }, { "epoch": 2.54, "grad_norm": 5.442444582301273, "learning_rate": 6.868689536081197e-07, "loss": 0.3403, "step": 14965 }, { "epoch": 2.55, "grad_norm": 4.26838730409746, "learning_rate": 6.843694122110017e-07, "loss": 0.3344, "step": 14970 }, { "epoch": 2.55, "grad_norm": 6.4244297663026595, "learning_rate": 6.81874092911689e-07, "loss": 0.327, "step": 14975 }, { "epoch": 2.55, "grad_norm": 6.237606573282461, "learning_rate": 6.793829981514228e-07, "loss": 0.3373, "step": 14980 }, { "epoch": 2.55, "grad_norm": 6.296940542734641, "learning_rate": 6.768961303673055e-07, "loss": 0.318, "step": 14985 }, { "epoch": 2.55, "grad_norm": 4.660218223419286, "learning_rate": 6.744134919923096e-07, "loss": 0.3253, "step": 14990 }, { "epoch": 2.55, "grad_norm": 7.067592424361357, "learning_rate": 6.719350854552659e-07, "loss": 0.3245, "step": 14995 }, { "epoch": 2.55, "grad_norm": 4.552954424557108, "learning_rate": 6.694609131808666e-07, "loss": 0.3287, "step": 15000 }, { "epoch": 2.55, "grad_norm": 4.652184851420003, "learning_rate": 6.669909775896605e-07, "loss": 0.322, "step": 15005 }, { "epoch": 2.55, "grad_norm": 4.855649009523, "learning_rate": 6.645252810980519e-07, "loss": 0.3314, "step": 15010 }, { "epoch": 2.55, "grad_norm": 6.9922125059213265, "learning_rate": 6.620638261182998e-07, "loss": 0.3279, "step": 15015 }, { "epoch": 2.55, "grad_norm": 4.399204658913523, "learning_rate": 6.596066150585107e-07, "loss": 0.3338, "step": 15020 }, { "epoch": 2.55, "grad_norm": 4.694809045991095, "learning_rate": 6.571536503226411e-07, "loss": 0.3288, "step": 15025 }, { "epoch": 2.56, "grad_norm": 7.276032527676246, "learning_rate": 6.547049343104916e-07, "loss": 0.342, "step": 15030 }, { "epoch": 2.56, "grad_norm": 5.683686403735051, "learning_rate": 6.522604694177093e-07, "loss": 0.3233, "step": 15035 }, { "epoch": 2.56, "grad_norm": 9.314664074593344, "learning_rate": 6.498202580357788e-07, "loss": 0.3339, "step": 15040 }, { "epoch": 2.56, "grad_norm": 7.271181418939168, "learning_rate": 6.473843025520243e-07, "loss": 0.3413, "step": 15045 }, { "epoch": 2.56, "grad_norm": 9.026745251495655, "learning_rate": 6.449526053496069e-07, "loss": 0.3275, "step": 15050 }, { "epoch": 2.56, "grad_norm": 4.973143978132485, "learning_rate": 6.425251688075212e-07, "loss": 0.3409, "step": 15055 }, { "epoch": 2.56, "grad_norm": 4.734733913002473, "learning_rate": 6.401019953005949e-07, "loss": 0.3299, "step": 15060 }, { "epoch": 2.56, "grad_norm": 4.264578374363782, "learning_rate": 6.376830871994827e-07, "loss": 0.3257, "step": 15065 }, { "epoch": 2.56, "grad_norm": 6.804642974562434, "learning_rate": 6.352684468706699e-07, "loss": 0.3347, "step": 15070 }, { "epoch": 2.56, "grad_norm": 4.75835234659942, "learning_rate": 6.328580766764613e-07, "loss": 0.3276, "step": 15075 }, { "epoch": 2.56, "grad_norm": 4.707073653371905, "learning_rate": 6.304519789749907e-07, "loss": 0.3185, "step": 15080 }, { "epoch": 2.56, "grad_norm": 4.670681759892701, "learning_rate": 6.28050156120204e-07, "loss": 0.3323, "step": 15085 }, { "epoch": 2.57, "grad_norm": 4.666206453580839, "learning_rate": 6.256526104618732e-07, "loss": 0.338, "step": 15090 }, { "epoch": 2.57, "grad_norm": 4.61513244052899, "learning_rate": 6.232593443455797e-07, "loss": 0.3252, "step": 15095 }, { "epoch": 2.57, "grad_norm": 5.2786063959854825, "learning_rate": 6.208703601127198e-07, "loss": 0.3361, "step": 15100 }, { "epoch": 2.57, "grad_norm": 4.458489925474581, "learning_rate": 6.184856601005035e-07, "loss": 0.3347, "step": 15105 }, { "epoch": 2.57, "grad_norm": 7.942362062095153, "learning_rate": 6.161052466419449e-07, "loss": 0.3317, "step": 15110 }, { "epoch": 2.57, "grad_norm": 7.201628485037839, "learning_rate": 6.137291220658687e-07, "loss": 0.3323, "step": 15115 }, { "epoch": 2.57, "grad_norm": 6.107208916924716, "learning_rate": 6.113572886968994e-07, "loss": 0.3258, "step": 15120 }, { "epoch": 2.57, "grad_norm": 5.377864924772755, "learning_rate": 6.089897488554685e-07, "loss": 0.3304, "step": 15125 }, { "epoch": 2.57, "grad_norm": 4.212508677166943, "learning_rate": 6.066265048578007e-07, "loss": 0.3345, "step": 15130 }, { "epoch": 2.57, "grad_norm": 4.611635469346788, "learning_rate": 6.042675590159241e-07, "loss": 0.3303, "step": 15135 }, { "epoch": 2.57, "grad_norm": 4.553090232084375, "learning_rate": 6.019129136376578e-07, "loss": 0.3343, "step": 15140 }, { "epoch": 2.57, "grad_norm": 5.034051982014956, "learning_rate": 5.99562571026614e-07, "loss": 0.3354, "step": 15145 }, { "epoch": 2.58, "grad_norm": 4.272555573157402, "learning_rate": 5.972165334821983e-07, "loss": 0.3366, "step": 15150 }, { "epoch": 2.58, "grad_norm": 6.82950133715513, "learning_rate": 5.94874803299601e-07, "loss": 0.3234, "step": 15155 }, { "epoch": 2.58, "grad_norm": 6.959497100574585, "learning_rate": 5.925373827698011e-07, "loss": 0.3163, "step": 15160 }, { "epoch": 2.58, "grad_norm": 5.261785061241256, "learning_rate": 5.902042741795594e-07, "loss": 0.3275, "step": 15165 }, { "epoch": 2.58, "grad_norm": 5.01198603476133, "learning_rate": 5.878754798114189e-07, "loss": 0.3256, "step": 15170 }, { "epoch": 2.58, "grad_norm": 5.558766408563095, "learning_rate": 5.855510019437011e-07, "loss": 0.327, "step": 15175 }, { "epoch": 2.58, "grad_norm": 4.556461907057693, "learning_rate": 5.83230842850508e-07, "loss": 0.3342, "step": 15180 }, { "epoch": 2.58, "grad_norm": 8.951444850543265, "learning_rate": 5.809150048017115e-07, "loss": 0.3355, "step": 15185 }, { "epoch": 2.58, "grad_norm": 4.21513520852271, "learning_rate": 5.786034900629584e-07, "loss": 0.3265, "step": 15190 }, { "epoch": 2.58, "grad_norm": 4.822438591492039, "learning_rate": 5.762963008956674e-07, "loss": 0.3196, "step": 15195 }, { "epoch": 2.58, "grad_norm": 4.088036776916842, "learning_rate": 5.739934395570224e-07, "loss": 0.329, "step": 15200 }, { "epoch": 2.59, "grad_norm": 5.32928136338505, "learning_rate": 5.716949082999773e-07, "loss": 0.3319, "step": 15205 }, { "epoch": 2.59, "grad_norm": 4.406320074819915, "learning_rate": 5.694007093732434e-07, "loss": 0.3246, "step": 15210 }, { "epoch": 2.59, "grad_norm": 4.283054897769128, "learning_rate": 5.671108450213009e-07, "loss": 0.3255, "step": 15215 }, { "epoch": 2.59, "grad_norm": 5.293700654067463, "learning_rate": 5.648253174843826e-07, "loss": 0.3305, "step": 15220 }, { "epoch": 2.59, "grad_norm": 6.174784098397865, "learning_rate": 5.625441289984851e-07, "loss": 0.309, "step": 15225 }, { "epoch": 2.59, "grad_norm": 4.554691783757481, "learning_rate": 5.602672817953547e-07, "loss": 0.3309, "step": 15230 }, { "epoch": 2.59, "grad_norm": 7.632898096662614, "learning_rate": 5.579947781024919e-07, "loss": 0.326, "step": 15235 }, { "epoch": 2.59, "grad_norm": 6.394787708139916, "learning_rate": 5.5572662014315e-07, "loss": 0.3218, "step": 15240 }, { "epoch": 2.59, "grad_norm": 8.185992925087373, "learning_rate": 5.534628101363287e-07, "loss": 0.3383, "step": 15245 }, { "epoch": 2.59, "grad_norm": 4.80027617699584, "learning_rate": 5.51203350296774e-07, "loss": 0.3289, "step": 15250 }, { "epoch": 2.59, "grad_norm": 5.719033975776043, "learning_rate": 5.489482428349751e-07, "loss": 0.3349, "step": 15255 }, { "epoch": 2.59, "grad_norm": 4.394999674061311, "learning_rate": 5.46697489957167e-07, "loss": 0.3406, "step": 15260 }, { "epoch": 2.6, "grad_norm": 5.365908251012121, "learning_rate": 5.444510938653191e-07, "loss": 0.3295, "step": 15265 }, { "epoch": 2.6, "grad_norm": 4.531697620960981, "learning_rate": 5.422090567571448e-07, "loss": 0.3352, "step": 15270 }, { "epoch": 2.6, "grad_norm": 4.818715998208998, "learning_rate": 5.399713808260871e-07, "loss": 0.332, "step": 15275 }, { "epoch": 2.6, "grad_norm": 4.722096102165326, "learning_rate": 5.377380682613243e-07, "loss": 0.3303, "step": 15280 }, { "epoch": 2.6, "grad_norm": 4.677021215940097, "learning_rate": 5.355091212477693e-07, "loss": 0.3251, "step": 15285 }, { "epoch": 2.6, "grad_norm": 5.33303885654041, "learning_rate": 5.33284541966057e-07, "loss": 0.3324, "step": 15290 }, { "epoch": 2.6, "grad_norm": 5.959350957878013, "learning_rate": 5.310643325925563e-07, "loss": 0.3139, "step": 15295 }, { "epoch": 2.6, "grad_norm": 7.711470113011544, "learning_rate": 5.288484952993556e-07, "loss": 0.3306, "step": 15300 }, { "epoch": 2.6, "grad_norm": 4.881691636537076, "learning_rate": 5.266370322542713e-07, "loss": 0.3299, "step": 15305 }, { "epoch": 2.6, "grad_norm": 4.341716160517968, "learning_rate": 5.244299456208341e-07, "loss": 0.3282, "step": 15310 }, { "epoch": 2.6, "grad_norm": 4.6246624806682375, "learning_rate": 5.222272375582993e-07, "loss": 0.3269, "step": 15315 }, { "epoch": 2.6, "grad_norm": 4.538722876182721, "learning_rate": 5.200289102216338e-07, "loss": 0.3291, "step": 15320 }, { "epoch": 2.61, "grad_norm": 4.1239510295431305, "learning_rate": 5.178349657615217e-07, "loss": 0.3253, "step": 15325 }, { "epoch": 2.61, "grad_norm": 4.21239231566736, "learning_rate": 5.156454063243566e-07, "loss": 0.3269, "step": 15330 }, { "epoch": 2.61, "grad_norm": 6.697072352286349, "learning_rate": 5.134602340522437e-07, "loss": 0.328, "step": 15335 }, { "epoch": 2.61, "grad_norm": 6.146424287472719, "learning_rate": 5.112794510829977e-07, "loss": 0.3259, "step": 15340 }, { "epoch": 2.61, "grad_norm": 4.001634187370734, "learning_rate": 5.091030595501351e-07, "loss": 0.3314, "step": 15345 }, { "epoch": 2.61, "grad_norm": 4.675020804818011, "learning_rate": 5.069310615828804e-07, "loss": 0.3298, "step": 15350 }, { "epoch": 2.61, "grad_norm": 5.884773853738382, "learning_rate": 5.047634593061562e-07, "loss": 0.341, "step": 15355 }, { "epoch": 2.61, "grad_norm": 4.686820504059619, "learning_rate": 5.026002548405878e-07, "loss": 0.3222, "step": 15360 }, { "epoch": 2.61, "grad_norm": 6.570517269507593, "learning_rate": 5.004414503024962e-07, "loss": 0.3336, "step": 15365 }, { "epoch": 2.61, "grad_norm": 4.382963602034246, "learning_rate": 4.982870478038976e-07, "loss": 0.3315, "step": 15370 }, { "epoch": 2.61, "grad_norm": 4.8333082592597085, "learning_rate": 4.96137049452502e-07, "loss": 0.3108, "step": 15375 }, { "epoch": 2.61, "grad_norm": 4.703537645888523, "learning_rate": 4.939914573517097e-07, "loss": 0.33, "step": 15380 }, { "epoch": 2.62, "grad_norm": 4.055508354992322, "learning_rate": 4.918502736006136e-07, "loss": 0.3272, "step": 15385 }, { "epoch": 2.62, "grad_norm": 5.140679591165281, "learning_rate": 4.897135002939896e-07, "loss": 0.316, "step": 15390 }, { "epoch": 2.62, "grad_norm": 4.6297789890950165, "learning_rate": 4.875811395223023e-07, "loss": 0.317, "step": 15395 }, { "epoch": 2.62, "grad_norm": 4.887885997760296, "learning_rate": 4.85453193371696e-07, "loss": 0.33, "step": 15400 }, { "epoch": 2.62, "grad_norm": 5.376128690566044, "learning_rate": 4.83329663924001e-07, "loss": 0.3245, "step": 15405 }, { "epoch": 2.62, "grad_norm": 4.8938060055908865, "learning_rate": 4.812105532567191e-07, "loss": 0.3427, "step": 15410 }, { "epoch": 2.62, "grad_norm": 5.11132863259498, "learning_rate": 4.790958634430365e-07, "loss": 0.3317, "step": 15415 }, { "epoch": 2.62, "grad_norm": 4.724591336463593, "learning_rate": 4.7698559655181e-07, "loss": 0.3357, "step": 15420 }, { "epoch": 2.62, "grad_norm": 4.951037262704773, "learning_rate": 4.748797546475703e-07, "loss": 0.3268, "step": 15425 }, { "epoch": 2.62, "grad_norm": 4.8802443891193565, "learning_rate": 4.727783397905211e-07, "loss": 0.3331, "step": 15430 }, { "epoch": 2.62, "grad_norm": 4.373028538851771, "learning_rate": 4.706813540365313e-07, "loss": 0.3267, "step": 15435 }, { "epoch": 2.62, "grad_norm": 6.122718060013959, "learning_rate": 4.6858879943713965e-07, "loss": 0.3286, "step": 15440 }, { "epoch": 2.63, "grad_norm": 5.987546920660024, "learning_rate": 4.665006780395492e-07, "loss": 0.3243, "step": 15445 }, { "epoch": 2.63, "grad_norm": 5.851157448444356, "learning_rate": 4.6441699188662424e-07, "loss": 0.3309, "step": 15450 }, { "epoch": 2.63, "grad_norm": 4.222443952757895, "learning_rate": 4.623377430168913e-07, "loss": 0.325, "step": 15455 }, { "epoch": 2.63, "grad_norm": 3.9519127965546392, "learning_rate": 4.6026293346453644e-07, "loss": 0.3299, "step": 15460 }, { "epoch": 2.63, "grad_norm": 4.281245218736686, "learning_rate": 4.581925652594016e-07, "loss": 0.3293, "step": 15465 }, { "epoch": 2.63, "grad_norm": 5.425900579865196, "learning_rate": 4.561266404269826e-07, "loss": 0.3161, "step": 15470 }, { "epoch": 2.63, "grad_norm": 4.723935686904716, "learning_rate": 4.5406516098843166e-07, "loss": 0.326, "step": 15475 }, { "epoch": 2.63, "grad_norm": 4.551245390291361, "learning_rate": 4.5200812896054714e-07, "loss": 0.3285, "step": 15480 }, { "epoch": 2.63, "grad_norm": 4.385884864216109, "learning_rate": 4.49955546355782e-07, "loss": 0.3308, "step": 15485 }, { "epoch": 2.63, "grad_norm": 5.182454178384056, "learning_rate": 4.479074151822299e-07, "loss": 0.337, "step": 15490 }, { "epoch": 2.63, "grad_norm": 4.427294600601199, "learning_rate": 4.458637374436353e-07, "loss": 0.3282, "step": 15495 }, { "epoch": 2.64, "grad_norm": 4.293405692501016, "learning_rate": 4.4382451513938163e-07, "loss": 0.3333, "step": 15500 }, { "epoch": 2.64, "grad_norm": 4.942125262624205, "learning_rate": 4.4178975026449634e-07, "loss": 0.3322, "step": 15505 }, { "epoch": 2.64, "grad_norm": 4.20532593695376, "learning_rate": 4.397594448096448e-07, "loss": 0.3221, "step": 15510 }, { "epoch": 2.64, "grad_norm": 4.226395874757872, "learning_rate": 4.377336007611277e-07, "loss": 0.3227, "step": 15515 }, { "epoch": 2.64, "grad_norm": 4.300060759903124, "learning_rate": 4.357122201008851e-07, "loss": 0.3342, "step": 15520 }, { "epoch": 2.64, "grad_norm": 4.533745505806048, "learning_rate": 4.3369530480648737e-07, "loss": 0.332, "step": 15525 }, { "epoch": 2.64, "grad_norm": 5.335895542748602, "learning_rate": 4.316828568511372e-07, "loss": 0.3218, "step": 15530 }, { "epoch": 2.64, "grad_norm": 5.288348245563574, "learning_rate": 4.296748782036658e-07, "loss": 0.3345, "step": 15535 }, { "epoch": 2.64, "grad_norm": 5.385575084440829, "learning_rate": 4.276713708285346e-07, "loss": 0.3355, "step": 15540 }, { "epoch": 2.64, "grad_norm": 5.16265164571498, "learning_rate": 4.256723366858267e-07, "loss": 0.3326, "step": 15545 }, { "epoch": 2.64, "grad_norm": 4.65564953851326, "learning_rate": 4.236777777312534e-07, "loss": 0.3206, "step": 15550 }, { "epoch": 2.64, "grad_norm": 4.706592827325074, "learning_rate": 4.2168769591614476e-07, "loss": 0.3363, "step": 15555 }, { "epoch": 2.65, "grad_norm": 4.220505773528335, "learning_rate": 4.197020931874507e-07, "loss": 0.3274, "step": 15560 }, { "epoch": 2.65, "grad_norm": 4.213911131499332, "learning_rate": 4.1772097148774173e-07, "loss": 0.3255, "step": 15565 }, { "epoch": 2.65, "grad_norm": 4.282918448441257, "learning_rate": 4.1574433275519963e-07, "loss": 0.3278, "step": 15570 }, { "epoch": 2.65, "grad_norm": 4.404275303777677, "learning_rate": 4.1377217892362653e-07, "loss": 0.3296, "step": 15575 }, { "epoch": 2.65, "grad_norm": 5.876173248789661, "learning_rate": 4.118045119224312e-07, "loss": 0.3306, "step": 15580 }, { "epoch": 2.65, "grad_norm": 5.255389054786569, "learning_rate": 4.0984133367663717e-07, "loss": 0.3213, "step": 15585 }, { "epoch": 2.65, "grad_norm": 5.024576780909882, "learning_rate": 4.07882646106873e-07, "loss": 0.3324, "step": 15590 }, { "epoch": 2.65, "grad_norm": 4.443562631028897, "learning_rate": 4.0592845112937764e-07, "loss": 0.3287, "step": 15595 }, { "epoch": 2.65, "grad_norm": 5.128444756291535, "learning_rate": 4.0397875065599225e-07, "loss": 0.3266, "step": 15600 }, { "epoch": 2.65, "grad_norm": 4.473835014555687, "learning_rate": 4.0203354659415995e-07, "loss": 0.3269, "step": 15605 }, { "epoch": 2.65, "grad_norm": 5.302270282731905, "learning_rate": 4.0009284084692734e-07, "loss": 0.3195, "step": 15610 }, { "epoch": 2.65, "grad_norm": 4.244008416918633, "learning_rate": 3.981566353129385e-07, "loss": 0.3298, "step": 15615 }, { "epoch": 2.66, "grad_norm": 4.511728239678076, "learning_rate": 3.9622493188643695e-07, "loss": 0.3235, "step": 15620 }, { "epoch": 2.66, "grad_norm": 4.7468068473137235, "learning_rate": 3.9429773245725836e-07, "loss": 0.33, "step": 15625 }, { "epoch": 2.66, "grad_norm": 4.532017615851619, "learning_rate": 3.9237503891083604e-07, "loss": 0.3328, "step": 15630 }, { "epoch": 2.66, "grad_norm": 3.9984381654002403, "learning_rate": 3.904568531281905e-07, "loss": 0.3259, "step": 15635 }, { "epoch": 2.66, "grad_norm": 4.369737981053874, "learning_rate": 3.8854317698593713e-07, "loss": 0.3219, "step": 15640 }, { "epoch": 2.66, "grad_norm": 4.891949730049542, "learning_rate": 3.866340123562756e-07, "loss": 0.3275, "step": 15645 }, { "epoch": 2.66, "grad_norm": 4.777212770846676, "learning_rate": 3.8472936110699354e-07, "loss": 0.3311, "step": 15650 }, { "epoch": 2.66, "grad_norm": 4.511407488239449, "learning_rate": 3.8282922510146257e-07, "loss": 0.3275, "step": 15655 }, { "epoch": 2.66, "grad_norm": 4.363462161440987, "learning_rate": 3.8093360619863575e-07, "loss": 0.3244, "step": 15660 }, { "epoch": 2.66, "grad_norm": 4.1267355991414805, "learning_rate": 3.7904250625305006e-07, "loss": 0.3266, "step": 15665 }, { "epoch": 2.66, "grad_norm": 4.521972951609363, "learning_rate": 3.771559271148184e-07, "loss": 0.333, "step": 15670 }, { "epoch": 2.66, "grad_norm": 4.264437267832434, "learning_rate": 3.7527387062963274e-07, "loss": 0.3327, "step": 15675 }, { "epoch": 2.67, "grad_norm": 6.003377017199365, "learning_rate": 3.7339633863875956e-07, "loss": 0.3262, "step": 15680 }, { "epoch": 2.67, "grad_norm": 6.080670004347522, "learning_rate": 3.715233329790391e-07, "loss": 0.3284, "step": 15685 }, { "epoch": 2.67, "grad_norm": 4.331208989720204, "learning_rate": 3.6965485548288217e-07, "loss": 0.3187, "step": 15690 }, { "epoch": 2.67, "grad_norm": 4.463432793538451, "learning_rate": 3.677909079782721e-07, "loss": 0.3214, "step": 15695 }, { "epoch": 2.67, "grad_norm": 4.765134561944942, "learning_rate": 3.6593149228875915e-07, "loss": 0.3294, "step": 15700 }, { "epoch": 2.67, "grad_norm": 5.288035996501678, "learning_rate": 3.640766102334581e-07, "loss": 0.3286, "step": 15705 }, { "epoch": 2.67, "grad_norm": 6.418751421483361, "learning_rate": 3.622262636270518e-07, "loss": 0.3216, "step": 15710 }, { "epoch": 2.67, "grad_norm": 4.867108747967822, "learning_rate": 3.603804542797829e-07, "loss": 0.3272, "step": 15715 }, { "epoch": 2.67, "grad_norm": 5.276748456421316, "learning_rate": 3.5853918399745836e-07, "loss": 0.3212, "step": 15720 }, { "epoch": 2.67, "grad_norm": 4.2089463813160854, "learning_rate": 3.567024545814413e-07, "loss": 0.3262, "step": 15725 }, { "epoch": 2.67, "grad_norm": 5.083727820870298, "learning_rate": 3.548702678286542e-07, "loss": 0.3294, "step": 15730 }, { "epoch": 2.68, "grad_norm": 4.290004487380496, "learning_rate": 3.5304262553157277e-07, "loss": 0.3369, "step": 15735 }, { "epoch": 2.68, "grad_norm": 6.074764534438417, "learning_rate": 3.5121952947823166e-07, "loss": 0.3298, "step": 15740 }, { "epoch": 2.68, "grad_norm": 4.5570105710259305, "learning_rate": 3.494009814522137e-07, "loss": 0.3286, "step": 15745 }, { "epoch": 2.68, "grad_norm": 6.788132016382686, "learning_rate": 3.475869832326523e-07, "loss": 0.3233, "step": 15750 }, { "epoch": 2.68, "grad_norm": 4.219825053242776, "learning_rate": 3.4577753659423287e-07, "loss": 0.3308, "step": 15755 }, { "epoch": 2.68, "grad_norm": 4.414709813806513, "learning_rate": 3.4397264330718437e-07, "loss": 0.3344, "step": 15760 }, { "epoch": 2.68, "grad_norm": 4.2904459457604265, "learning_rate": 3.421723051372844e-07, "loss": 0.3254, "step": 15765 }, { "epoch": 2.68, "grad_norm": 4.503347436179465, "learning_rate": 3.403765238458495e-07, "loss": 0.3218, "step": 15770 }, { "epoch": 2.68, "grad_norm": 5.221171414137864, "learning_rate": 3.385853011897433e-07, "loss": 0.3319, "step": 15775 }, { "epoch": 2.68, "grad_norm": 5.93086889960561, "learning_rate": 3.367986389213662e-07, "loss": 0.327, "step": 15780 }, { "epoch": 2.68, "grad_norm": 4.106430303910091, "learning_rate": 3.350165387886584e-07, "loss": 0.3215, "step": 15785 }, { "epoch": 2.68, "grad_norm": 5.28094196659452, "learning_rate": 3.3323900253509736e-07, "loss": 0.3275, "step": 15790 }, { "epoch": 2.69, "grad_norm": 4.613510431621033, "learning_rate": 3.314660318996921e-07, "loss": 0.3171, "step": 15795 }, { "epoch": 2.69, "grad_norm": 5.309412604149584, "learning_rate": 3.296976286169906e-07, "loss": 0.3258, "step": 15800 }, { "epoch": 2.69, "grad_norm": 4.364148651082251, "learning_rate": 3.2793379441706854e-07, "loss": 0.333, "step": 15805 }, { "epoch": 2.69, "grad_norm": 9.300943879556085, "learning_rate": 3.261745310255321e-07, "loss": 0.3151, "step": 15810 }, { "epoch": 2.69, "grad_norm": 7.040209421392553, "learning_rate": 3.244198401635157e-07, "loss": 0.3209, "step": 15815 }, { "epoch": 2.69, "grad_norm": 4.794991959613715, "learning_rate": 3.226697235476817e-07, "loss": 0.306, "step": 15820 }, { "epoch": 2.69, "grad_norm": 4.935248036664671, "learning_rate": 3.2092418289021487e-07, "loss": 0.3253, "step": 15825 }, { "epoch": 2.69, "grad_norm": 6.056519209770571, "learning_rate": 3.1918321989882706e-07, "loss": 0.318, "step": 15830 }, { "epoch": 2.69, "grad_norm": 5.269083367129021, "learning_rate": 3.17446836276748e-07, "loss": 0.3314, "step": 15835 }, { "epoch": 2.69, "grad_norm": 4.326862720235322, "learning_rate": 3.157150337227277e-07, "loss": 0.3167, "step": 15840 }, { "epoch": 2.69, "grad_norm": 4.069723672961724, "learning_rate": 3.1398781393103763e-07, "loss": 0.3294, "step": 15845 }, { "epoch": 2.69, "grad_norm": 5.8522878283056725, "learning_rate": 3.1226517859146157e-07, "loss": 0.3302, "step": 15850 }, { "epoch": 2.7, "grad_norm": 4.231814147547882, "learning_rate": 3.105471293893009e-07, "loss": 0.3266, "step": 15855 }, { "epoch": 2.7, "grad_norm": 4.909474166584724, "learning_rate": 3.088336680053688e-07, "loss": 0.3243, "step": 15860 }, { "epoch": 2.7, "grad_norm": 5.235909628911115, "learning_rate": 3.071247961159918e-07, "loss": 0.324, "step": 15865 }, { "epoch": 2.7, "grad_norm": 4.734972101490973, "learning_rate": 3.0542051539300455e-07, "loss": 0.3275, "step": 15870 }, { "epoch": 2.7, "grad_norm": 3.9506257702838843, "learning_rate": 3.037208275037512e-07, "loss": 0.3251, "step": 15875 }, { "epoch": 2.7, "grad_norm": 4.538223913166323, "learning_rate": 3.020257341110827e-07, "loss": 0.3264, "step": 15880 }, { "epoch": 2.7, "grad_norm": 4.358203367211378, "learning_rate": 3.0033523687335363e-07, "loss": 0.3279, "step": 15885 }, { "epoch": 2.7, "grad_norm": 4.231501878147074, "learning_rate": 2.986493374444244e-07, "loss": 0.3239, "step": 15890 }, { "epoch": 2.7, "grad_norm": 4.837316439950837, "learning_rate": 2.969680374736539e-07, "loss": 0.3238, "step": 15895 }, { "epoch": 2.7, "grad_norm": 4.018421603392979, "learning_rate": 2.952913386059053e-07, "loss": 0.318, "step": 15900 }, { "epoch": 2.7, "grad_norm": 4.1435525006903475, "learning_rate": 2.936192424815365e-07, "loss": 0.3175, "step": 15905 }, { "epoch": 2.7, "grad_norm": 4.12970076162308, "learning_rate": 2.919517507364056e-07, "loss": 0.3218, "step": 15910 }, { "epoch": 2.71, "grad_norm": 4.27748140244397, "learning_rate": 2.902888650018648e-07, "loss": 0.3274, "step": 15915 }, { "epoch": 2.71, "grad_norm": 4.885317669031329, "learning_rate": 2.886305869047584e-07, "loss": 0.3287, "step": 15920 }, { "epoch": 2.71, "grad_norm": 4.28260729481659, "learning_rate": 2.86976918067427e-07, "loss": 0.3251, "step": 15925 }, { "epoch": 2.71, "grad_norm": 5.475073429347808, "learning_rate": 2.8532786010769753e-07, "loss": 0.3391, "step": 15930 }, { "epoch": 2.71, "grad_norm": 4.821053704563675, "learning_rate": 2.8368341463888895e-07, "loss": 0.3192, "step": 15935 }, { "epoch": 2.71, "grad_norm": 5.130502760793864, "learning_rate": 2.8204358326980543e-07, "loss": 0.3255, "step": 15940 }, { "epoch": 2.71, "grad_norm": 4.893758489623744, "learning_rate": 2.8040836760473977e-07, "loss": 0.3357, "step": 15945 }, { "epoch": 2.71, "grad_norm": 4.2516531067849845, "learning_rate": 2.7877776924346625e-07, "loss": 0.3132, "step": 15950 }, { "epoch": 2.71, "grad_norm": 7.406973652341834, "learning_rate": 2.771517897812437e-07, "loss": 0.3293, "step": 15955 }, { "epoch": 2.71, "grad_norm": 5.322483066999841, "learning_rate": 2.755304308088125e-07, "loss": 0.3166, "step": 15960 }, { "epoch": 2.71, "grad_norm": 5.474306932264857, "learning_rate": 2.7391369391239043e-07, "loss": 0.3288, "step": 15965 }, { "epoch": 2.72, "grad_norm": 4.20725319692026, "learning_rate": 2.723015806736756e-07, "loss": 0.3171, "step": 15970 }, { "epoch": 2.72, "grad_norm": 4.535491629522536, "learning_rate": 2.7069409266984204e-07, "loss": 0.3264, "step": 15975 }, { "epoch": 2.72, "grad_norm": 5.761847179835859, "learning_rate": 2.690912314735383e-07, "loss": 0.3269, "step": 15980 }, { "epoch": 2.72, "grad_norm": 5.109385173462457, "learning_rate": 2.6749299865288626e-07, "loss": 0.3224, "step": 15985 }, { "epoch": 2.72, "grad_norm": 4.879185729585172, "learning_rate": 2.6589939577148115e-07, "loss": 0.3297, "step": 15990 }, { "epoch": 2.72, "grad_norm": 4.056409679988566, "learning_rate": 2.6431042438838707e-07, "loss": 0.322, "step": 15995 }, { "epoch": 2.72, "grad_norm": 5.058301298589732, "learning_rate": 2.6272608605813766e-07, "loss": 0.3372, "step": 16000 }, { "epoch": 2.72, "grad_norm": 4.0829892398529095, "learning_rate": 2.611463823307342e-07, "loss": 0.3256, "step": 16005 }, { "epoch": 2.72, "grad_norm": 4.781469546114506, "learning_rate": 2.595713147516432e-07, "loss": 0.3281, "step": 16010 }, { "epoch": 2.72, "grad_norm": 5.154008309359028, "learning_rate": 2.5800088486179545e-07, "loss": 0.326, "step": 16015 }, { "epoch": 2.72, "grad_norm": 4.4988103210311365, "learning_rate": 2.564350941975852e-07, "loss": 0.3319, "step": 16020 }, { "epoch": 2.72, "grad_norm": 4.230648636823393, "learning_rate": 2.5487394429086764e-07, "loss": 0.328, "step": 16025 }, { "epoch": 2.73, "grad_norm": 5.220354766605433, "learning_rate": 2.5331743666895725e-07, "loss": 0.3251, "step": 16030 }, { "epoch": 2.73, "grad_norm": 4.181949627869047, "learning_rate": 2.517655728546287e-07, "loss": 0.3202, "step": 16035 }, { "epoch": 2.73, "grad_norm": 4.18793309691398, "learning_rate": 2.5021835436611076e-07, "loss": 0.3219, "step": 16040 }, { "epoch": 2.73, "grad_norm": 3.7932237688490047, "learning_rate": 2.486757827170905e-07, "loss": 0.3205, "step": 16045 }, { "epoch": 2.73, "grad_norm": 4.637007418320029, "learning_rate": 2.471378594167062e-07, "loss": 0.3255, "step": 16050 }, { "epoch": 2.73, "grad_norm": 4.274747694027581, "learning_rate": 2.456045859695505e-07, "loss": 0.318, "step": 16055 }, { "epoch": 2.73, "grad_norm": 3.888514536811012, "learning_rate": 2.440759638756651e-07, "loss": 0.3147, "step": 16060 }, { "epoch": 2.73, "grad_norm": 4.070119120643892, "learning_rate": 2.425519946305438e-07, "loss": 0.3255, "step": 16065 }, { "epoch": 2.73, "grad_norm": 4.318133851055312, "learning_rate": 2.4103267972512554e-07, "loss": 0.3202, "step": 16070 }, { "epoch": 2.73, "grad_norm": 4.812137184792789, "learning_rate": 2.39518020645797e-07, "loss": 0.3237, "step": 16075 }, { "epoch": 2.73, "grad_norm": 4.288701026818577, "learning_rate": 2.3800801887439106e-07, "loss": 0.3229, "step": 16080 }, { "epoch": 2.73, "grad_norm": 5.333033708892183, "learning_rate": 2.365026758881822e-07, "loss": 0.3267, "step": 16085 }, { "epoch": 2.74, "grad_norm": 4.423311250484052, "learning_rate": 2.350019931598896e-07, "loss": 0.3196, "step": 16090 }, { "epoch": 2.74, "grad_norm": 4.8143452561896725, "learning_rate": 2.3350597215766878e-07, "loss": 0.3212, "step": 16095 }, { "epoch": 2.74, "grad_norm": 4.9134689439462935, "learning_rate": 2.3201461434512075e-07, "loss": 0.3289, "step": 16100 }, { "epoch": 2.74, "grad_norm": 4.438047747163149, "learning_rate": 2.305279211812783e-07, "loss": 0.3161, "step": 16105 }, { "epoch": 2.74, "grad_norm": 4.079736872305593, "learning_rate": 2.2904589412061528e-07, "loss": 0.3229, "step": 16110 }, { "epoch": 2.74, "grad_norm": 4.72236932999239, "learning_rate": 2.2756853461303797e-07, "loss": 0.33, "step": 16115 }, { "epoch": 2.74, "grad_norm": 6.102576174544622, "learning_rate": 2.2609584410388685e-07, "loss": 0.3234, "step": 16120 }, { "epoch": 2.74, "grad_norm": 6.018929984105898, "learning_rate": 2.2462782403393557e-07, "loss": 0.3098, "step": 16125 }, { "epoch": 2.74, "grad_norm": 4.505430379967904, "learning_rate": 2.2316447583938694e-07, "loss": 0.3351, "step": 16130 }, { "epoch": 2.74, "grad_norm": 4.084487341521747, "learning_rate": 2.2170580095187466e-07, "loss": 0.3242, "step": 16135 }, { "epoch": 2.74, "grad_norm": 6.836398225817673, "learning_rate": 2.2025180079845886e-07, "loss": 0.3267, "step": 16140 }, { "epoch": 2.74, "grad_norm": 4.944041701120263, "learning_rate": 2.1880247680162836e-07, "loss": 0.331, "step": 16145 }, { "epoch": 2.75, "grad_norm": 4.132936779080271, "learning_rate": 2.1735783037929391e-07, "loss": 0.3283, "step": 16150 }, { "epoch": 2.75, "grad_norm": 6.9944360578145135, "learning_rate": 2.1591786294479444e-07, "loss": 0.3258, "step": 16155 }, { "epoch": 2.75, "grad_norm": 4.352649165025649, "learning_rate": 2.1448257590688747e-07, "loss": 0.3259, "step": 16160 }, { "epoch": 2.75, "grad_norm": 4.013748515607696, "learning_rate": 2.1305197066975315e-07, "loss": 0.3219, "step": 16165 }, { "epoch": 2.75, "grad_norm": 5.852245030863292, "learning_rate": 2.1162604863299241e-07, "loss": 0.3263, "step": 16170 }, { "epoch": 2.75, "grad_norm": 4.875041183822977, "learning_rate": 2.1020481119162106e-07, "loss": 0.3197, "step": 16175 }, { "epoch": 2.75, "grad_norm": 4.902276796757562, "learning_rate": 2.0878825973607575e-07, "loss": 0.3186, "step": 16180 }, { "epoch": 2.75, "grad_norm": 4.382093906560683, "learning_rate": 2.0737639565220568e-07, "loss": 0.3271, "step": 16185 }, { "epoch": 2.75, "grad_norm": 7.1592131547023605, "learning_rate": 2.059692203212771e-07, "loss": 0.3238, "step": 16190 }, { "epoch": 2.75, "grad_norm": 5.699480539733266, "learning_rate": 2.0456673511996705e-07, "loss": 0.3365, "step": 16195 }, { "epoch": 2.75, "grad_norm": 4.58969484654952, "learning_rate": 2.0316894142036303e-07, "loss": 0.3269, "step": 16200 }, { "epoch": 2.76, "grad_norm": 5.709577048665052, "learning_rate": 2.0177584058996667e-07, "loss": 0.3193, "step": 16205 }, { "epoch": 2.76, "grad_norm": 4.366505950527198, "learning_rate": 2.0038743399168504e-07, "loss": 0.3215, "step": 16210 }, { "epoch": 2.76, "grad_norm": 4.957699551759988, "learning_rate": 1.9900372298383374e-07, "loss": 0.323, "step": 16215 }, { "epoch": 2.76, "grad_norm": 4.719601013061237, "learning_rate": 1.976247089201344e-07, "loss": 0.3297, "step": 16220 }, { "epoch": 2.76, "grad_norm": 3.935689059330765, "learning_rate": 1.9625039314971394e-07, "loss": 0.3357, "step": 16225 }, { "epoch": 2.76, "grad_norm": 6.791720105209688, "learning_rate": 1.9488077701710238e-07, "loss": 0.3336, "step": 16230 }, { "epoch": 2.76, "grad_norm": 5.143417367694795, "learning_rate": 1.9351586186223237e-07, "loss": 0.3238, "step": 16235 }, { "epoch": 2.76, "grad_norm": 8.216116273198738, "learning_rate": 1.9215564902043738e-07, "loss": 0.3154, "step": 16240 }, { "epoch": 2.76, "grad_norm": 5.107020221296381, "learning_rate": 1.908001398224496e-07, "loss": 0.3225, "step": 16245 }, { "epoch": 2.76, "grad_norm": 4.588168833266675, "learning_rate": 1.8944933559440105e-07, "loss": 0.3143, "step": 16250 }, { "epoch": 2.76, "grad_norm": 4.79229475914689, "learning_rate": 1.8810323765781956e-07, "loss": 0.3296, "step": 16255 }, { "epoch": 2.76, "grad_norm": 4.36897448726827, "learning_rate": 1.8676184732962953e-07, "loss": 0.3276, "step": 16260 }, { "epoch": 2.77, "grad_norm": 4.567793253702501, "learning_rate": 1.8542516592214788e-07, "loss": 0.3323, "step": 16265 }, { "epoch": 2.77, "grad_norm": 5.50180730085157, "learning_rate": 1.840931947430874e-07, "loss": 0.3236, "step": 16270 }, { "epoch": 2.77, "grad_norm": 5.784739469539132, "learning_rate": 1.8276593509555073e-07, "loss": 0.3254, "step": 16275 }, { "epoch": 2.77, "grad_norm": 5.093679564094168, "learning_rate": 1.8144338827803253e-07, "loss": 0.3212, "step": 16280 }, { "epoch": 2.77, "grad_norm": 4.188116061505996, "learning_rate": 1.8012555558441558e-07, "loss": 0.3268, "step": 16285 }, { "epoch": 2.77, "grad_norm": 4.188884125901071, "learning_rate": 1.7881243830397133e-07, "loss": 0.3254, "step": 16290 }, { "epoch": 2.77, "grad_norm": 4.3028610933926466, "learning_rate": 1.7750403772135716e-07, "loss": 0.3221, "step": 16295 }, { "epoch": 2.77, "grad_norm": 4.748550442739596, "learning_rate": 1.7620035511661748e-07, "loss": 0.3197, "step": 16300 }, { "epoch": 2.77, "grad_norm": 4.34403152927218, "learning_rate": 1.749013917651804e-07, "loss": 0.317, "step": 16305 }, { "epoch": 2.77, "grad_norm": 4.615831309228839, "learning_rate": 1.736071489378549e-07, "loss": 0.3144, "step": 16310 }, { "epoch": 2.77, "grad_norm": 5.699662803614298, "learning_rate": 1.7231762790083594e-07, "loss": 0.3213, "step": 16315 }, { "epoch": 2.77, "grad_norm": 4.770633996070142, "learning_rate": 1.7103282991569548e-07, "loss": 0.3267, "step": 16320 }, { "epoch": 2.78, "grad_norm": 4.955789013505318, "learning_rate": 1.6975275623938637e-07, "loss": 0.3246, "step": 16325 }, { "epoch": 2.78, "grad_norm": 4.521376850056406, "learning_rate": 1.6847740812423962e-07, "loss": 0.3397, "step": 16330 }, { "epoch": 2.78, "grad_norm": 5.013497701337801, "learning_rate": 1.6720678681796165e-07, "loss": 0.3107, "step": 16335 }, { "epoch": 2.78, "grad_norm": 5.053988036087268, "learning_rate": 1.659408935636364e-07, "loss": 0.3183, "step": 16340 }, { "epoch": 2.78, "grad_norm": 5.341405350563486, "learning_rate": 1.6467972959972102e-07, "loss": 0.3217, "step": 16345 }, { "epoch": 2.78, "grad_norm": 3.9646969352659425, "learning_rate": 1.6342329616004683e-07, "loss": 0.3227, "step": 16350 }, { "epoch": 2.78, "grad_norm": 4.1896122643170335, "learning_rate": 1.6217159447381502e-07, "loss": 0.3202, "step": 16355 }, { "epoch": 2.78, "grad_norm": 4.565785467430683, "learning_rate": 1.609246257656011e-07, "loss": 0.3202, "step": 16360 }, { "epoch": 2.78, "grad_norm": 6.4492287907440105, "learning_rate": 1.596823912553469e-07, "loss": 0.3148, "step": 16365 }, { "epoch": 2.78, "grad_norm": 4.0934813034293605, "learning_rate": 1.584448921583648e-07, "loss": 0.3248, "step": 16370 }, { "epoch": 2.78, "grad_norm": 4.659745455359364, "learning_rate": 1.5721212968533238e-07, "loss": 0.3165, "step": 16375 }, { "epoch": 2.78, "grad_norm": 4.000420361796629, "learning_rate": 1.5598410504229554e-07, "loss": 0.3213, "step": 16380 }, { "epoch": 2.79, "grad_norm": 4.169494550177258, "learning_rate": 1.5476081943066268e-07, "loss": 0.3222, "step": 16385 }, { "epoch": 2.79, "grad_norm": 4.52313110624559, "learning_rate": 1.5354227404720867e-07, "loss": 0.3212, "step": 16390 }, { "epoch": 2.79, "grad_norm": 5.40933196464466, "learning_rate": 1.523284700840688e-07, "loss": 0.3264, "step": 16395 }, { "epoch": 2.79, "grad_norm": 5.64794423861692, "learning_rate": 1.511194087287393e-07, "loss": 0.3221, "step": 16400 }, { "epoch": 2.79, "grad_norm": 4.240945621396869, "learning_rate": 1.4991509116407842e-07, "loss": 0.3204, "step": 16405 }, { "epoch": 2.79, "grad_norm": 5.319882401349022, "learning_rate": 1.4871551856830259e-07, "loss": 0.332, "step": 16410 }, { "epoch": 2.79, "grad_norm": 4.365765429318108, "learning_rate": 1.475206921149852e-07, "loss": 0.3272, "step": 16415 }, { "epoch": 2.79, "grad_norm": 4.018989346460705, "learning_rate": 1.4633061297305796e-07, "loss": 0.318, "step": 16420 }, { "epoch": 2.79, "grad_norm": 4.323980559831332, "learning_rate": 1.4514528230680726e-07, "loss": 0.3299, "step": 16425 }, { "epoch": 2.79, "grad_norm": 5.19585151471915, "learning_rate": 1.4396470127587382e-07, "loss": 0.3181, "step": 16430 }, { "epoch": 2.79, "grad_norm": 6.836398734241736, "learning_rate": 1.4278887103525153e-07, "loss": 0.3227, "step": 16435 }, { "epoch": 2.79, "grad_norm": 4.60566893832175, "learning_rate": 1.4161779273528797e-07, "loss": 0.3225, "step": 16440 }, { "epoch": 2.8, "grad_norm": 4.06213953867403, "learning_rate": 1.4045146752167948e-07, "loss": 0.3299, "step": 16445 }, { "epoch": 2.8, "grad_norm": 5.8865681200680235, "learning_rate": 1.3928989653547498e-07, "loss": 0.3276, "step": 16450 }, { "epoch": 2.8, "grad_norm": 4.753344877185966, "learning_rate": 1.3813308091306876e-07, "loss": 0.3342, "step": 16455 }, { "epoch": 2.8, "grad_norm": 5.38252543437366, "learning_rate": 1.3698102178620664e-07, "loss": 0.3261, "step": 16460 }, { "epoch": 2.8, "grad_norm": 5.443970363526169, "learning_rate": 1.3583372028197704e-07, "loss": 0.326, "step": 16465 }, { "epoch": 2.8, "grad_norm": 4.581110482318744, "learning_rate": 1.3469117752281767e-07, "loss": 0.3259, "step": 16470 }, { "epoch": 2.8, "grad_norm": 5.142538575844111, "learning_rate": 1.335533946265083e-07, "loss": 0.3255, "step": 16475 }, { "epoch": 2.8, "grad_norm": 4.4172771144012435, "learning_rate": 1.3242037270617292e-07, "loss": 0.3406, "step": 16480 }, { "epoch": 2.8, "grad_norm": 6.996640252062232, "learning_rate": 1.312921128702771e-07, "loss": 0.3238, "step": 16485 }, { "epoch": 2.8, "grad_norm": 4.147013677924641, "learning_rate": 1.3016861622262788e-07, "loss": 0.3222, "step": 16490 }, { "epoch": 2.8, "grad_norm": 3.969175635638045, "learning_rate": 1.2904988386237272e-07, "loss": 0.3211, "step": 16495 }, { "epoch": 2.81, "grad_norm": 4.304735817041214, "learning_rate": 1.2793591688399665e-07, "loss": 0.3154, "step": 16500 }, { "epoch": 2.81, "grad_norm": 4.417624275374276, "learning_rate": 1.2682671637732512e-07, "loss": 0.3281, "step": 16505 }, { "epoch": 2.81, "grad_norm": 4.3438042106250485, "learning_rate": 1.2572228342751737e-07, "loss": 0.324, "step": 16510 }, { "epoch": 2.81, "grad_norm": 5.222667005638983, "learning_rate": 1.2462261911507124e-07, "loss": 0.3226, "step": 16515 }, { "epoch": 2.81, "grad_norm": 4.666061908982401, "learning_rate": 1.2352772451581784e-07, "loss": 0.318, "step": 16520 }, { "epoch": 2.81, "grad_norm": 4.977480571113075, "learning_rate": 1.2243760070092093e-07, "loss": 0.3197, "step": 16525 }, { "epoch": 2.81, "grad_norm": 4.020839554052547, "learning_rate": 1.21352248736879e-07, "loss": 0.3143, "step": 16530 }, { "epoch": 2.81, "grad_norm": 4.9117798883194865, "learning_rate": 1.2027166968552163e-07, "loss": 0.3185, "step": 16535 }, { "epoch": 2.81, "grad_norm": 5.00269934718407, "learning_rate": 1.19195864604007e-07, "loss": 0.3221, "step": 16540 }, { "epoch": 2.81, "grad_norm": 6.2316602057613455, "learning_rate": 1.1812483454482493e-07, "loss": 0.3251, "step": 16545 }, { "epoch": 2.81, "grad_norm": 3.958986638942587, "learning_rate": 1.1705858055579389e-07, "loss": 0.316, "step": 16550 }, { "epoch": 2.81, "grad_norm": 4.43777934051335, "learning_rate": 1.1599710368005723e-07, "loss": 0.3128, "step": 16555 }, { "epoch": 2.82, "grad_norm": 4.872643950834902, "learning_rate": 1.149404049560876e-07, "loss": 0.3172, "step": 16560 }, { "epoch": 2.82, "grad_norm": 5.0242774912669885, "learning_rate": 1.1388848541768193e-07, "loss": 0.3302, "step": 16565 }, { "epoch": 2.82, "grad_norm": 4.6394760087647064, "learning_rate": 1.1284134609396091e-07, "loss": 0.3134, "step": 16570 }, { "epoch": 2.82, "grad_norm": 4.20248467236056, "learning_rate": 1.117989880093695e-07, "loss": 0.3242, "step": 16575 }, { "epoch": 2.82, "grad_norm": 4.993861326186423, "learning_rate": 1.107614121836742e-07, "loss": 0.3232, "step": 16580 }, { "epoch": 2.82, "grad_norm": 4.140204526928821, "learning_rate": 1.0972861963196469e-07, "loss": 0.3131, "step": 16585 }, { "epoch": 2.82, "grad_norm": 6.362294713393221, "learning_rate": 1.0870061136464772e-07, "loss": 0.3183, "step": 16590 }, { "epoch": 2.82, "grad_norm": 6.061443962737918, "learning_rate": 1.0767738838745379e-07, "loss": 0.32, "step": 16595 }, { "epoch": 2.82, "grad_norm": 5.021965056921146, "learning_rate": 1.066589517014277e-07, "loss": 0.3165, "step": 16600 }, { "epoch": 2.82, "grad_norm": 4.388055243217996, "learning_rate": 1.0564530230293468e-07, "loss": 0.3165, "step": 16605 }, { "epoch": 2.82, "grad_norm": 5.35685916039647, "learning_rate": 1.0463644118365535e-07, "loss": 0.3169, "step": 16610 }, { "epoch": 2.82, "grad_norm": 5.082810603055631, "learning_rate": 1.0363236933058462e-07, "loss": 0.3223, "step": 16615 }, { "epoch": 2.83, "grad_norm": 4.12496359069897, "learning_rate": 1.0263308772603397e-07, "loss": 0.3259, "step": 16620 }, { "epoch": 2.83, "grad_norm": 4.348108139298327, "learning_rate": 1.0163859734762749e-07, "loss": 0.3187, "step": 16625 }, { "epoch": 2.83, "grad_norm": 4.1193485171849975, "learning_rate": 1.006488991683019e-07, "loss": 0.3202, "step": 16630 }, { "epoch": 2.83, "grad_norm": 4.8436774397202775, "learning_rate": 9.96639941563049e-08, "loss": 0.3316, "step": 16635 }, { "epoch": 2.83, "grad_norm": 4.415829203215659, "learning_rate": 9.868388327519685e-08, "loss": 0.3238, "step": 16640 }, { "epoch": 2.83, "grad_norm": 4.197213779335068, "learning_rate": 9.770856748384516e-08, "loss": 0.3248, "step": 16645 }, { "epoch": 2.83, "grad_norm": 4.196155455714435, "learning_rate": 9.673804773642936e-08, "loss": 0.3249, "step": 16650 }, { "epoch": 2.83, "grad_norm": 7.855173996053146, "learning_rate": 9.577232498243383e-08, "loss": 0.3245, "step": 16655 }, { "epoch": 2.83, "grad_norm": 5.120386816838393, "learning_rate": 9.481140016665169e-08, "loss": 0.3151, "step": 16660 }, { "epoch": 2.83, "grad_norm": 6.56032471761875, "learning_rate": 9.385527422918095e-08, "loss": 0.3141, "step": 16665 }, { "epoch": 2.83, "grad_norm": 4.838102733097684, "learning_rate": 9.290394810542669e-08, "loss": 0.319, "step": 16670 }, { "epoch": 2.83, "grad_norm": 4.237242674037893, "learning_rate": 9.195742272609609e-08, "loss": 0.3299, "step": 16675 }, { "epoch": 2.84, "grad_norm": 5.069975175557291, "learning_rate": 9.101569901719953e-08, "loss": 0.3269, "step": 16680 }, { "epoch": 2.84, "grad_norm": 4.70395556339215, "learning_rate": 9.007877790005281e-08, "loss": 0.3216, "step": 16685 }, { "epoch": 2.84, "grad_norm": 4.484240354831281, "learning_rate": 8.91466602912694e-08, "loss": 0.3258, "step": 16690 }, { "epoch": 2.84, "grad_norm": 4.619820251785469, "learning_rate": 8.821934710276648e-08, "loss": 0.3264, "step": 16695 }, { "epoch": 2.84, "grad_norm": 4.306695765403448, "learning_rate": 8.72968392417578e-08, "loss": 0.3186, "step": 16700 }, { "epoch": 2.84, "grad_norm": 4.678091335257884, "learning_rate": 8.637913761075922e-08, "loss": 0.3136, "step": 16705 }, { "epoch": 2.84, "grad_norm": 4.140977332989994, "learning_rate": 8.546624310758256e-08, "loss": 0.319, "step": 16710 }, { "epoch": 2.84, "grad_norm": 4.245543097098709, "learning_rate": 8.455815662533617e-08, "loss": 0.3191, "step": 16715 }, { "epoch": 2.84, "grad_norm": 4.2871480871847005, "learning_rate": 8.36548790524272e-08, "loss": 0.3154, "step": 16720 }, { "epoch": 2.84, "grad_norm": 5.821588086606466, "learning_rate": 8.27564112725554e-08, "loss": 0.3282, "step": 16725 }, { "epoch": 2.84, "grad_norm": 4.331167692980463, "learning_rate": 8.186275416471656e-08, "loss": 0.3108, "step": 16730 }, { "epoch": 2.85, "grad_norm": 4.137864192771517, "learning_rate": 8.097390860319909e-08, "loss": 0.3191, "step": 16735 }, { "epoch": 2.85, "grad_norm": 6.561135191481652, "learning_rate": 8.008987545758518e-08, "loss": 0.3168, "step": 16740 }, { "epoch": 2.85, "grad_norm": 5.209127320403764, "learning_rate": 7.921065559274688e-08, "loss": 0.3311, "step": 16745 }, { "epoch": 2.85, "grad_norm": 4.808937295788496, "learning_rate": 7.833624986885058e-08, "loss": 0.3204, "step": 16750 }, { "epoch": 2.85, "grad_norm": 4.061855686895446, "learning_rate": 7.746665914134977e-08, "loss": 0.3224, "step": 16755 }, { "epoch": 2.85, "grad_norm": 6.742392831215071, "learning_rate": 7.66018842609889e-08, "loss": 0.3245, "step": 16760 }, { "epoch": 2.85, "grad_norm": 4.374524764337193, "learning_rate": 7.574192607380071e-08, "loss": 0.3318, "step": 16765 }, { "epoch": 2.85, "grad_norm": 4.019406843271415, "learning_rate": 7.488678542110495e-08, "loss": 0.3212, "step": 16770 }, { "epoch": 2.85, "grad_norm": 4.926694113312759, "learning_rate": 7.403646313950962e-08, "loss": 0.3328, "step": 16775 }, { "epoch": 2.85, "grad_norm": 4.566359108289269, "learning_rate": 7.319096006090654e-08, "loss": 0.3108, "step": 16780 }, { "epoch": 2.85, "grad_norm": 4.408310731840014, "learning_rate": 7.235027701247621e-08, "loss": 0.3212, "step": 16785 }, { "epoch": 2.85, "grad_norm": 4.152290333016591, "learning_rate": 7.151441481667965e-08, "loss": 0.3355, "step": 16790 }, { "epoch": 2.86, "grad_norm": 4.565863091344188, "learning_rate": 7.068337429126437e-08, "loss": 0.3247, "step": 16795 }, { "epoch": 2.86, "grad_norm": 4.721947965138385, "learning_rate": 6.985715624925948e-08, "loss": 0.3219, "step": 16800 }, { "epoch": 2.86, "grad_norm": 4.5162074092895645, "learning_rate": 6.903576149897617e-08, "loss": 0.3204, "step": 16805 }, { "epoch": 2.86, "grad_norm": 4.236228619641246, "learning_rate": 6.821919084400774e-08, "loss": 0.3308, "step": 16810 }, { "epoch": 2.86, "grad_norm": 4.141813607931332, "learning_rate": 6.740744508322683e-08, "loss": 0.3135, "step": 16815 }, { "epoch": 2.86, "grad_norm": 5.665322840238465, "learning_rate": 6.660052501078596e-08, "loss": 0.3252, "step": 16820 }, { "epoch": 2.86, "grad_norm": 4.767201979259087, "learning_rate": 6.579843141611697e-08, "loss": 0.3273, "step": 16825 }, { "epoch": 2.86, "grad_norm": 4.165531389035974, "learning_rate": 6.50011650839305e-08, "loss": 0.3223, "step": 16830 }, { "epoch": 2.86, "grad_norm": 4.192803078576282, "learning_rate": 6.420872679421208e-08, "loss": 0.3151, "step": 16835 }, { "epoch": 2.86, "grad_norm": 5.385516489304573, "learning_rate": 6.342111732222655e-08, "loss": 0.3328, "step": 16840 }, { "epoch": 2.86, "grad_norm": 5.425070913092375, "learning_rate": 6.263833743851367e-08, "loss": 0.3204, "step": 16845 }, { "epoch": 2.86, "grad_norm": 4.471432430744481, "learning_rate": 6.186038790888749e-08, "loss": 0.3248, "step": 16850 }, { "epoch": 2.87, "grad_norm": 4.3856903583224875, "learning_rate": 6.108726949443756e-08, "loss": 0.3201, "step": 16855 }, { "epoch": 2.87, "grad_norm": 4.589501882050007, "learning_rate": 6.031898295152605e-08, "loss": 0.3224, "step": 16860 }, { "epoch": 2.87, "grad_norm": 5.674711932036349, "learning_rate": 5.955552903178896e-08, "loss": 0.3267, "step": 16865 }, { "epoch": 2.87, "grad_norm": 4.10918736978263, "learning_rate": 5.8796908482132706e-08, "loss": 0.3152, "step": 16870 }, { "epoch": 2.87, "grad_norm": 4.9623222898979105, "learning_rate": 5.80431220447375e-08, "loss": 0.3255, "step": 16875 }, { "epoch": 2.87, "grad_norm": 4.83341224963951, "learning_rate": 5.7294170457052326e-08, "loss": 0.3129, "step": 16880 }, { "epoch": 2.87, "grad_norm": 5.207285877907312, "learning_rate": 5.655005445179662e-08, "loss": 0.3302, "step": 16885 }, { "epoch": 2.87, "grad_norm": 4.267072915271972, "learning_rate": 5.581077475695973e-08, "loss": 0.3282, "step": 16890 }, { "epoch": 2.87, "grad_norm": 4.213948223184893, "learning_rate": 5.50763320957981e-08, "loss": 0.328, "step": 16895 }, { "epoch": 2.87, "grad_norm": 4.630031411426699, "learning_rate": 5.4346727186837534e-08, "loss": 0.3341, "step": 16900 }, { "epoch": 2.87, "grad_norm": 6.480499974246501, "learning_rate": 5.362196074386983e-08, "loss": 0.3305, "step": 16905 }, { "epoch": 2.87, "grad_norm": 5.359901605698903, "learning_rate": 5.290203347595335e-08, "loss": 0.3232, "step": 16910 }, { "epoch": 2.88, "grad_norm": 4.488707476211648, "learning_rate": 5.218694608741304e-08, "loss": 0.3237, "step": 16915 }, { "epoch": 2.88, "grad_norm": 4.183021393567609, "learning_rate": 5.1476699277837605e-08, "loss": 0.3186, "step": 16920 }, { "epoch": 2.88, "grad_norm": 5.804647675972266, "learning_rate": 5.077129374208012e-08, "loss": 0.3257, "step": 16925 }, { "epoch": 2.88, "grad_norm": 4.494163540567883, "learning_rate": 5.007073017025965e-08, "loss": 0.3314, "step": 16930 }, { "epoch": 2.88, "grad_norm": 5.1840909862780515, "learning_rate": 4.9375009247754626e-08, "loss": 0.3279, "step": 16935 }, { "epoch": 2.88, "grad_norm": 4.572678380765821, "learning_rate": 4.8684131655208353e-08, "loss": 0.3116, "step": 16940 }, { "epoch": 2.88, "grad_norm": 4.196403827581241, "learning_rate": 4.799809806852518e-08, "loss": 0.3184, "step": 16945 }, { "epoch": 2.88, "grad_norm": 4.357098767447156, "learning_rate": 4.7316909158869884e-08, "loss": 0.3297, "step": 16950 }, { "epoch": 2.88, "grad_norm": 5.343916649756927, "learning_rate": 4.6640565592668276e-08, "loss": 0.311, "step": 16955 }, { "epoch": 2.88, "grad_norm": 4.291618378770368, "learning_rate": 4.5969068031604945e-08, "loss": 0.329, "step": 16960 }, { "epoch": 2.88, "grad_norm": 4.25102189320903, "learning_rate": 4.530241713262495e-08, "loss": 0.3209, "step": 16965 }, { "epoch": 2.89, "grad_norm": 4.134754275534955, "learning_rate": 4.4640613547929925e-08, "loss": 0.3101, "step": 16970 }, { "epoch": 2.89, "grad_norm": 4.545599286611421, "learning_rate": 4.398365792498083e-08, "loss": 0.3242, "step": 16975 }, { "epoch": 2.89, "grad_norm": 5.196296248690342, "learning_rate": 4.3331550906494656e-08, "loss": 0.3239, "step": 16980 }, { "epoch": 2.89, "grad_norm": 4.364358485774829, "learning_rate": 4.268429313044553e-08, "loss": 0.3226, "step": 16985 }, { "epoch": 2.89, "grad_norm": 5.446202143056948, "learning_rate": 4.204188523006303e-08, "loss": 0.3247, "step": 16990 }, { "epoch": 2.89, "grad_norm": 4.79378156017565, "learning_rate": 4.140432783383219e-08, "loss": 0.3221, "step": 16995 }, { "epoch": 2.89, "grad_norm": 5.010838753010564, "learning_rate": 4.077162156549297e-08, "loss": 0.3324, "step": 17000 }, { "epoch": 2.89, "grad_norm": 3.9813207535154596, "learning_rate": 4.0143767044038554e-08, "loss": 0.3231, "step": 17005 }, { "epoch": 2.89, "grad_norm": 5.901540079957885, "learning_rate": 3.9520764883715924e-08, "loss": 0.3275, "step": 17010 }, { "epoch": 2.89, "grad_norm": 4.208544291007741, "learning_rate": 3.8902615694025313e-08, "loss": 0.3186, "step": 17015 }, { "epoch": 2.89, "grad_norm": 4.042927808490137, "learning_rate": 3.828932007971797e-08, "loss": 0.3151, "step": 17020 }, { "epoch": 2.89, "grad_norm": 4.514916487809097, "learning_rate": 3.768087864079839e-08, "loss": 0.3234, "step": 17025 }, { "epoch": 2.9, "grad_norm": 5.432387190050278, "learning_rate": 3.707729197252097e-08, "loss": 0.3174, "step": 17030 }, { "epoch": 2.9, "grad_norm": 6.9894227760113425, "learning_rate": 3.6478560665390574e-08, "loss": 0.33, "step": 17035 }, { "epoch": 2.9, "grad_norm": 4.295747928089492, "learning_rate": 3.588468530516198e-08, "loss": 0.3266, "step": 17040 }, { "epoch": 2.9, "grad_norm": 4.407466468401592, "learning_rate": 3.529566647284044e-08, "loss": 0.3257, "step": 17045 }, { "epoch": 2.9, "grad_norm": 4.747434671821218, "learning_rate": 3.471150474467777e-08, "loss": 0.3323, "step": 17050 }, { "epoch": 2.9, "grad_norm": 6.148672477395743, "learning_rate": 3.413220069217627e-08, "loss": 0.3232, "step": 17055 }, { "epoch": 2.9, "grad_norm": 4.212980371901559, "learning_rate": 3.355775488208368e-08, "loss": 0.3205, "step": 17060 }, { "epoch": 2.9, "grad_norm": 4.595464952249602, "learning_rate": 3.298816787639714e-08, "loss": 0.3308, "step": 17065 }, { "epoch": 2.9, "grad_norm": 4.261453414240882, "learning_rate": 3.242344023235755e-08, "loss": 0.3235, "step": 17070 }, { "epoch": 2.9, "grad_norm": 4.140376013153457, "learning_rate": 3.186357250245409e-08, "loss": 0.3192, "step": 17075 }, { "epoch": 2.9, "grad_norm": 4.201478258105787, "learning_rate": 3.1308565234420275e-08, "loss": 0.3211, "step": 17080 }, { "epoch": 2.9, "grad_norm": 5.146822624316797, "learning_rate": 3.0758418971233995e-08, "loss": 0.3154, "step": 17085 }, { "epoch": 2.91, "grad_norm": 4.174234028597232, "learning_rate": 3.0213134251119716e-08, "loss": 0.3183, "step": 17090 }, { "epoch": 2.91, "grad_norm": 4.298240807790573, "learning_rate": 2.967271160754237e-08, "loss": 0.3268, "step": 17095 }, { "epoch": 2.91, "grad_norm": 4.227795333228735, "learning_rate": 2.9137151569213486e-08, "loss": 0.3214, "step": 17100 }, { "epoch": 2.91, "grad_norm": 4.210920820302368, "learning_rate": 2.8606454660085047e-08, "loss": 0.3201, "step": 17105 }, { "epoch": 2.91, "grad_norm": 4.688694831097193, "learning_rate": 2.8080621399352857e-08, "loss": 0.3274, "step": 17110 }, { "epoch": 2.91, "grad_norm": 4.016657299349907, "learning_rate": 2.7559652301452632e-08, "loss": 0.3165, "step": 17115 }, { "epoch": 2.91, "grad_norm": 4.427423155416518, "learning_rate": 2.704354787606389e-08, "loss": 0.3265, "step": 17120 }, { "epoch": 2.91, "grad_norm": 4.048773944525423, "learning_rate": 2.653230862810441e-08, "loss": 0.316, "step": 17125 }, { "epoch": 2.91, "grad_norm": 4.363179533603372, "learning_rate": 2.602593505773354e-08, "loss": 0.3294, "step": 17130 }, { "epoch": 2.91, "grad_norm": 4.343244247482888, "learning_rate": 2.5524427660351125e-08, "loss": 0.3286, "step": 17135 }, { "epoch": 2.91, "grad_norm": 5.1011993397027995, "learning_rate": 2.5027786926594132e-08, "loss": 0.3294, "step": 17140 }, { "epoch": 2.91, "grad_norm": 5.095024825445343, "learning_rate": 2.453601334234057e-08, "loss": 0.3184, "step": 17145 }, { "epoch": 2.92, "grad_norm": 5.533694904679112, "learning_rate": 2.4049107388705028e-08, "loss": 0.3165, "step": 17150 }, { "epoch": 2.92, "grad_norm": 4.338434029875889, "learning_rate": 2.356706954204091e-08, "loss": 0.3194, "step": 17155 }, { "epoch": 2.92, "grad_norm": 4.59201757614292, "learning_rate": 2.3089900273938758e-08, "loss": 0.3288, "step": 17160 }, { "epoch": 2.92, "grad_norm": 4.1761127099824575, "learning_rate": 2.2617600051226818e-08, "loss": 0.3285, "step": 17165 }, { "epoch": 2.92, "grad_norm": 4.518604552105859, "learning_rate": 2.2150169335968807e-08, "loss": 0.322, "step": 17170 }, { "epoch": 2.92, "grad_norm": 4.641904310325613, "learning_rate": 2.168760858546448e-08, "loss": 0.3237, "step": 17175 }, { "epoch": 2.92, "grad_norm": 4.752248201594694, "learning_rate": 2.1229918252249627e-08, "loss": 0.3242, "step": 17180 }, { "epoch": 2.92, "grad_norm": 4.765539723924974, "learning_rate": 2.0777098784095507e-08, "loss": 0.3281, "step": 17185 }, { "epoch": 2.92, "grad_norm": 4.333491188039683, "learning_rate": 2.0329150624006645e-08, "loss": 0.3343, "step": 17190 }, { "epoch": 2.92, "grad_norm": 4.9243081578917645, "learning_rate": 1.9886074210223592e-08, "loss": 0.3285, "step": 17195 }, { "epoch": 2.92, "grad_norm": 4.142304534229642, "learning_rate": 1.9447869976220167e-08, "loss": 0.3264, "step": 17200 }, { "epoch": 2.93, "grad_norm": 4.05735236631641, "learning_rate": 1.901453835070233e-08, "loss": 0.3115, "step": 17205 }, { "epoch": 2.93, "grad_norm": 3.95977328598609, "learning_rate": 1.858607975761095e-08, "loss": 0.3274, "step": 17210 }, { "epoch": 2.93, "grad_norm": 4.0736743171757075, "learning_rate": 1.816249461611852e-08, "loss": 0.3267, "step": 17215 }, { "epoch": 2.93, "grad_norm": 4.349859105884953, "learning_rate": 1.774378334062965e-08, "loss": 0.3272, "step": 17220 }, { "epoch": 2.93, "grad_norm": 4.296848237552809, "learning_rate": 1.732994634078111e-08, "loss": 0.3187, "step": 17225 }, { "epoch": 2.93, "grad_norm": 4.696438474912808, "learning_rate": 1.692098402144071e-08, "loss": 0.331, "step": 17230 }, { "epoch": 2.93, "grad_norm": 4.615718612464431, "learning_rate": 1.6516896782706736e-08, "loss": 0.3345, "step": 17235 }, { "epoch": 2.93, "grad_norm": 4.568930088531357, "learning_rate": 1.6117685019909623e-08, "loss": 0.322, "step": 17240 }, { "epoch": 2.93, "grad_norm": 5.404550793819637, "learning_rate": 1.5723349123608067e-08, "loss": 0.3163, "step": 17245 }, { "epoch": 2.93, "grad_norm": 5.629118831077295, "learning_rate": 1.5333889479592356e-08, "loss": 0.3237, "step": 17250 }, { "epoch": 2.93, "grad_norm": 4.603765706275918, "learning_rate": 1.4949306468880486e-08, "loss": 0.3176, "step": 17255 }, { "epoch": 2.93, "grad_norm": 4.365997654720079, "learning_rate": 1.456960046772149e-08, "loss": 0.3255, "step": 17260 }, { "epoch": 2.94, "grad_norm": 4.130421217693394, "learning_rate": 1.4194771847590994e-08, "loss": 0.3187, "step": 17265 }, { "epoch": 2.94, "grad_norm": 4.305110035082924, "learning_rate": 1.3824820975194553e-08, "loss": 0.3094, "step": 17270 }, { "epoch": 2.94, "grad_norm": 4.422436622511799, "learning_rate": 1.3459748212464318e-08, "loss": 0.3225, "step": 17275 }, { "epoch": 2.94, "grad_norm": 4.431422170606385, "learning_rate": 1.3099553916561813e-08, "loss": 0.3287, "step": 17280 }, { "epoch": 2.94, "grad_norm": 4.241567806730011, "learning_rate": 1.2744238439874046e-08, "loss": 0.3261, "step": 17285 }, { "epoch": 2.94, "grad_norm": 4.461893699555121, "learning_rate": 1.239380213001684e-08, "loss": 0.3182, "step": 17290 }, { "epoch": 2.94, "grad_norm": 5.755366473966939, "learning_rate": 1.2048245329829844e-08, "loss": 0.3261, "step": 17295 }, { "epoch": 2.94, "grad_norm": 4.086374403285044, "learning_rate": 1.1707568377382072e-08, "loss": 0.3237, "step": 17300 }, { "epoch": 2.94, "grad_norm": 4.37192574562722, "learning_rate": 1.137177160596581e-08, "loss": 0.3209, "step": 17305 }, { "epoch": 2.94, "grad_norm": 4.113981142802513, "learning_rate": 1.1040855344101043e-08, "loss": 0.3163, "step": 17310 }, { "epoch": 2.94, "grad_norm": 4.4901520195537765, "learning_rate": 1.0714819915531582e-08, "loss": 0.3317, "step": 17315 }, { "epoch": 2.94, "grad_norm": 4.06515915220569, "learning_rate": 1.0393665639226724e-08, "loss": 0.3286, "step": 17320 }, { "epoch": 2.95, "grad_norm": 4.305528353338569, "learning_rate": 1.007739282938014e-08, "loss": 0.3205, "step": 17325 }, { "epoch": 2.95, "grad_norm": 5.283052479749407, "learning_rate": 9.766001795410984e-09, "loss": 0.3217, "step": 17330 }, { "epoch": 2.95, "grad_norm": 4.007913866226925, "learning_rate": 9.459492841960572e-09, "loss": 0.319, "step": 17335 }, { "epoch": 2.95, "grad_norm": 4.892225816928856, "learning_rate": 9.157866268895144e-09, "loss": 0.3195, "step": 17340 }, { "epoch": 2.95, "grad_norm": 4.2700543147223975, "learning_rate": 8.861122371303654e-09, "loss": 0.3251, "step": 17345 }, { "epoch": 2.95, "grad_norm": 4.746315215212394, "learning_rate": 8.569261439499432e-09, "loss": 0.3291, "step": 17350 }, { "epoch": 2.95, "grad_norm": 4.227497950738878, "learning_rate": 8.282283759017962e-09, "loss": 0.3251, "step": 17355 }, { "epoch": 2.95, "grad_norm": 4.526948346687582, "learning_rate": 8.000189610616883e-09, "loss": 0.3167, "step": 17360 }, { "epoch": 2.95, "grad_norm": 4.397402173863833, "learning_rate": 7.722979270275988e-09, "loss": 0.3139, "step": 17365 }, { "epoch": 2.95, "grad_norm": 4.019686407837929, "learning_rate": 7.450653009198338e-09, "loss": 0.3203, "step": 17370 }, { "epoch": 2.95, "grad_norm": 4.113894086863957, "learning_rate": 7.18321109380804e-09, "loss": 0.3153, "step": 17375 }, { "epoch": 2.95, "grad_norm": 3.9802828861675468, "learning_rate": 6.920653785750797e-09, "loss": 0.3123, "step": 17380 }, { "epoch": 2.96, "grad_norm": 7.397341279147858, "learning_rate": 6.662981341892805e-09, "loss": 0.3222, "step": 17385 }, { "epoch": 2.96, "grad_norm": 4.2876178038487325, "learning_rate": 6.410194014322413e-09, "loss": 0.3254, "step": 17390 }, { "epoch": 2.96, "grad_norm": 6.362407584057065, "learning_rate": 6.162292050348462e-09, "loss": 0.3215, "step": 17395 }, { "epoch": 2.96, "grad_norm": 5.260197573800328, "learning_rate": 5.919275692500281e-09, "loss": 0.3342, "step": 17400 }, { "epoch": 2.96, "grad_norm": 5.652129288543425, "learning_rate": 5.681145178526581e-09, "loss": 0.3266, "step": 17405 }, { "epoch": 2.96, "grad_norm": 4.0329427262401225, "learning_rate": 5.4479007413976715e-09, "loss": 0.3261, "step": 17410 }, { "epoch": 2.96, "grad_norm": 4.212095217891655, "learning_rate": 5.21954260930213e-09, "loss": 0.3189, "step": 17415 }, { "epoch": 2.96, "grad_norm": 4.388186784150905, "learning_rate": 4.996071005649583e-09, "loss": 0.3258, "step": 17420 }, { "epoch": 2.96, "grad_norm": 4.403577402430027, "learning_rate": 4.777486149067923e-09, "loss": 0.3229, "step": 17425 }, { "epoch": 2.96, "grad_norm": 5.372666014770122, "learning_rate": 4.563788253404422e-09, "loss": 0.3168, "step": 17430 }, { "epoch": 2.96, "grad_norm": 5.920216648474848, "learning_rate": 4.3549775277262895e-09, "loss": 0.3219, "step": 17435 }, { "epoch": 2.96, "grad_norm": 4.75224480939319, "learning_rate": 4.151054176317337e-09, "loss": 0.3224, "step": 17440 }, { "epoch": 2.97, "grad_norm": 5.17217886620779, "learning_rate": 3.9520183986829776e-09, "loss": 0.3291, "step": 17445 }, { "epoch": 2.97, "grad_norm": 4.521520496183765, "learning_rate": 3.757870389544116e-09, "loss": 0.3193, "step": 17450 }, { "epoch": 2.97, "grad_norm": 3.9911833599850017, "learning_rate": 3.5686103388410385e-09, "loss": 0.3261, "step": 17455 }, { "epoch": 2.97, "grad_norm": 5.8047227129063055, "learning_rate": 3.384238431732301e-09, "loss": 0.3251, "step": 17460 }, { "epoch": 2.97, "grad_norm": 4.3984464082370645, "learning_rate": 3.2047548485941714e-09, "loss": 0.3188, "step": 17465 }, { "epoch": 2.97, "grad_norm": 4.579762521361893, "learning_rate": 3.0301597650195247e-09, "loss": 0.3259, "step": 17470 }, { "epoch": 2.97, "grad_norm": 4.880006797112499, "learning_rate": 2.8604533518200585e-09, "loss": 0.3245, "step": 17475 }, { "epoch": 2.97, "grad_norm": 4.29310428512058, "learning_rate": 2.6956357750235195e-09, "loss": 0.3288, "step": 17480 }, { "epoch": 2.97, "grad_norm": 5.429366982118829, "learning_rate": 2.53570719587648e-09, "loss": 0.3261, "step": 17485 }, { "epoch": 2.97, "grad_norm": 5.47672322500388, "learning_rate": 2.3806677708398952e-09, "loss": 0.326, "step": 17490 }, { "epoch": 2.97, "grad_norm": 4.315788135319219, "learning_rate": 2.230517651594655e-09, "loss": 0.3145, "step": 17495 }, { "epoch": 2.98, "grad_norm": 4.145232204854416, "learning_rate": 2.0852569850354778e-09, "loss": 0.3167, "step": 17500 }, { "epoch": 2.98, "grad_norm": 4.61449092787545, "learning_rate": 1.9448859132747965e-09, "loss": 0.3093, "step": 17505 }, { "epoch": 2.98, "grad_norm": 6.349955321675779, "learning_rate": 1.809404573642204e-09, "loss": 0.3289, "step": 17510 }, { "epoch": 2.98, "grad_norm": 5.081659098663104, "learning_rate": 1.6788130986816754e-09, "loss": 0.3171, "step": 17515 }, { "epoch": 2.98, "grad_norm": 5.59987534077367, "learning_rate": 1.553111616155456e-09, "loss": 0.3243, "step": 17520 }, { "epoch": 2.98, "grad_norm": 4.241357555063181, "learning_rate": 1.432300249040175e-09, "loss": 0.3163, "step": 17525 }, { "epoch": 2.98, "grad_norm": 4.650346457651398, "learning_rate": 1.316379115529065e-09, "loss": 0.3216, "step": 17530 }, { "epoch": 2.98, "grad_norm": 4.413006568192071, "learning_rate": 1.2053483290308533e-09, "loss": 0.3165, "step": 17535 }, { "epoch": 2.98, "grad_norm": 5.247458222002462, "learning_rate": 1.099207998169205e-09, "loss": 0.3255, "step": 17540 }, { "epoch": 2.98, "grad_norm": 4.332420786778957, "learning_rate": 9.979582267855004e-10, "loss": 0.3211, "step": 17545 }, { "epoch": 2.98, "grad_norm": 4.774136612916247, "learning_rate": 9.015991139338376e-10, "loss": 0.3266, "step": 17550 }, { "epoch": 2.98, "grad_norm": 4.276016406258255, "learning_rate": 8.101307538854741e-10, "loss": 0.3287, "step": 17555 }, { "epoch": 2.99, "grad_norm": 3.9933204057622826, "learning_rate": 7.235532361266062e-10, "loss": 0.3267, "step": 17560 }, { "epoch": 2.99, "grad_norm": 4.015434379642638, "learning_rate": 6.418666453578137e-10, "loss": 0.3279, "step": 17565 }, { "epoch": 2.99, "grad_norm": 4.3100411946404105, "learning_rate": 5.650710614957255e-10, "loss": 0.3261, "step": 17570 }, { "epoch": 2.99, "grad_norm": 5.620246737157527, "learning_rate": 4.931665596713542e-10, "loss": 0.3089, "step": 17575 }, { "epoch": 2.99, "grad_norm": 4.131919436844451, "learning_rate": 4.2615321023065094e-10, "loss": 0.329, "step": 17580 }, { "epoch": 2.99, "grad_norm": 5.213045629785342, "learning_rate": 3.6403107873450583e-10, "loss": 0.3256, "step": 17585 }, { "epoch": 2.99, "grad_norm": 4.629428548039462, "learning_rate": 3.068002259593028e-10, "loss": 0.3195, "step": 17590 }, { "epoch": 2.99, "grad_norm": 4.332419507807568, "learning_rate": 2.5446070789525437e-10, "loss": 0.3204, "step": 17595 }, { "epoch": 2.99, "grad_norm": 5.240834030496161, "learning_rate": 2.0701257574751165e-10, "loss": 0.3203, "step": 17600 }, { "epoch": 2.99, "grad_norm": 4.572494727034468, "learning_rate": 1.6445587593560964e-10, "loss": 0.327, "step": 17605 }, { "epoch": 2.99, "grad_norm": 4.2630889633032805, "learning_rate": 1.267906500940219e-10, "loss": 0.326, "step": 17610 }, { "epoch": 2.99, "grad_norm": 4.181006413141019, "learning_rate": 9.40169350716058e-11, "loss": 0.3255, "step": 17615 }, { "epoch": 3.0, "grad_norm": 4.117134318017797, "learning_rate": 6.613476293160226e-11, "loss": 0.3272, "step": 17620 }, { "epoch": 3.0, "grad_norm": 4.00361578124429, "learning_rate": 4.3144160952746096e-11, "loss": 0.334, "step": 17625 }, { "epoch": 3.0, "grad_norm": 4.3128277913830155, "learning_rate": 2.5045151626490462e-11, "loss": 0.3257, "step": 17630 }, { "epoch": 3.0, "grad_norm": 4.347718473004982, "learning_rate": 1.1837752659782375e-11, "loss": 0.3184, "step": 17635 }, { "epoch": 3.0, "grad_norm": 4.857624199795862, "learning_rate": 3.5219769745076237e-12, "loss": 0.3295, "step": 17640 }, { "epoch": 3.0, "grad_norm": 4.70428450020991, "learning_rate": 9.783270471519502e-14, "loss": 0.3309, "step": 17645 }, { "epoch": 3.0, "eval_loss": 0.273702472448349, "eval_runtime": 75.049, "eval_samples_per_second": 4.824, "eval_steps_per_second": 0.613, "step": 17646 }, { "epoch": 3.0, "step": 17646, "total_flos": 607478932832256.0, "train_loss": 0.7695021375204772, "train_runtime": 155405.9935, "train_samples_per_second": 1.817, "train_steps_per_second": 0.114 } ], "logging_steps": 5, "max_steps": 17646, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 607478932832256.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }