{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.6557956322798859, "learning_rate": 5.802707930367505e-09, "loss": 1.2371, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.555537955912783, "learning_rate": 2.9013539651837526e-08, "loss": 1.2332, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.6560886812503646, "learning_rate": 5.802707930367505e-08, "loss": 1.2786, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.6387105093397438, "learning_rate": 8.704061895551257e-08, "loss": 1.2475, "step": 15 }, { "epoch": 0.0, "grad_norm": 0.6532524825579088, "learning_rate": 1.160541586073501e-07, "loss": 1.2885, "step": 20 }, { "epoch": 0.0, "grad_norm": 0.7627038914530329, "learning_rate": 1.450676982591876e-07, "loss": 1.2707, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.6865350725464494, "learning_rate": 1.7408123791102514e-07, "loss": 1.2573, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.5944185772651999, "learning_rate": 2.0309477756286268e-07, "loss": 1.2236, "step": 35 }, { "epoch": 0.01, "grad_norm": 0.570802653009915, "learning_rate": 2.321083172147002e-07, "loss": 1.2354, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.6254702727850132, "learning_rate": 2.6112185686653774e-07, "loss": 1.2396, "step": 45 }, { "epoch": 0.01, "grad_norm": 0.6505985921117032, "learning_rate": 2.901353965183752e-07, "loss": 1.2415, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.5645977078030876, "learning_rate": 3.1914893617021275e-07, "loss": 1.2242, "step": 55 }, { "epoch": 0.01, "grad_norm": 0.5679797693766294, "learning_rate": 3.481624758220503e-07, "loss": 1.2021, "step": 60 }, { "epoch": 0.01, "grad_norm": 0.5881688854072822, "learning_rate": 3.771760154738878e-07, "loss": 1.241, "step": 65 }, { "epoch": 0.01, "grad_norm": 0.6171671681192435, "learning_rate": 4.0618955512572535e-07, "loss": 1.238, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.6251626743581414, "learning_rate": 4.3520309477756283e-07, "loss": 1.2623, "step": 75 }, { "epoch": 0.02, "grad_norm": 0.536899552637251, "learning_rate": 4.642166344294004e-07, "loss": 1.2525, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.6363284385208114, "learning_rate": 4.93230174081238e-07, "loss": 1.2807, "step": 85 }, { "epoch": 0.02, "grad_norm": 0.7156337106339976, "learning_rate": 5.222437137330755e-07, "loss": 1.233, "step": 90 }, { "epoch": 0.02, "grad_norm": 0.4922672527647211, "learning_rate": 5.512572533849129e-07, "loss": 1.2296, "step": 95 }, { "epoch": 0.02, "grad_norm": 0.5995169385614093, "learning_rate": 5.802707930367504e-07, "loss": 1.2263, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.5270868824616388, "learning_rate": 6.092843326885881e-07, "loss": 1.256, "step": 105 }, { "epoch": 0.02, "grad_norm": 0.45317247546954, "learning_rate": 6.382978723404255e-07, "loss": 1.2344, "step": 110 }, { "epoch": 0.02, "grad_norm": 0.43848067309021005, "learning_rate": 6.67311411992263e-07, "loss": 1.2667, "step": 115 }, { "epoch": 0.02, "grad_norm": 0.4920996149570544, "learning_rate": 6.963249516441006e-07, "loss": 1.2048, "step": 120 }, { "epoch": 0.02, "grad_norm": 0.5458024812690938, "learning_rate": 7.253384912959381e-07, "loss": 1.221, "step": 125 }, { "epoch": 0.03, "grad_norm": 0.49239446969985223, "learning_rate": 7.543520309477756e-07, "loss": 1.2122, "step": 130 }, { "epoch": 0.03, "grad_norm": 0.6153580588728397, "learning_rate": 7.833655705996132e-07, "loss": 1.254, "step": 135 }, { "epoch": 0.03, "grad_norm": 0.5652181830271799, "learning_rate": 8.123791102514507e-07, "loss": 1.2144, "step": 140 }, { "epoch": 0.03, "grad_norm": 0.5688846265610772, "learning_rate": 8.413926499032881e-07, "loss": 1.2215, "step": 145 }, { "epoch": 0.03, "grad_norm": 0.43147864423001453, "learning_rate": 8.704061895551257e-07, "loss": 1.2489, "step": 150 }, { "epoch": 0.03, "grad_norm": 0.41719278261688, "learning_rate": 8.994197292069632e-07, "loss": 1.2424, "step": 155 }, { "epoch": 0.03, "grad_norm": 0.4879569605970493, "learning_rate": 9.284332688588008e-07, "loss": 1.2103, "step": 160 }, { "epoch": 0.03, "grad_norm": 0.5016955054762027, "learning_rate": 9.574468085106384e-07, "loss": 1.2229, "step": 165 }, { "epoch": 0.03, "grad_norm": 0.41471316258825075, "learning_rate": 9.86460348162476e-07, "loss": 1.2301, "step": 170 }, { "epoch": 0.03, "grad_norm": 0.37515142480035085, "learning_rate": 1.0154738878143134e-06, "loss": 1.2339, "step": 175 }, { "epoch": 0.03, "grad_norm": 0.5357151850842313, "learning_rate": 1.044487427466151e-06, "loss": 1.1782, "step": 180 }, { "epoch": 0.04, "grad_norm": 0.3653264458948328, "learning_rate": 1.0735009671179885e-06, "loss": 1.1319, "step": 185 }, { "epoch": 0.04, "grad_norm": 0.4383207715794612, "learning_rate": 1.1025145067698258e-06, "loss": 1.2032, "step": 190 }, { "epoch": 0.04, "grad_norm": 0.519104554154961, "learning_rate": 1.1315280464216634e-06, "loss": 1.2016, "step": 195 }, { "epoch": 0.04, "grad_norm": 0.3108713036635766, "learning_rate": 1.1605415860735009e-06, "loss": 1.2205, "step": 200 }, { "epoch": 0.04, "grad_norm": 0.39072663996333046, "learning_rate": 1.1895551257253386e-06, "loss": 1.178, "step": 205 }, { "epoch": 0.04, "grad_norm": 0.4159099519388948, "learning_rate": 1.2185686653771762e-06, "loss": 1.196, "step": 210 }, { "epoch": 0.04, "grad_norm": 0.40315240562925386, "learning_rate": 1.2475822050290137e-06, "loss": 1.2217, "step": 215 }, { "epoch": 0.04, "grad_norm": 0.381069833836644, "learning_rate": 1.276595744680851e-06, "loss": 1.1392, "step": 220 }, { "epoch": 0.04, "grad_norm": 0.45265751317200875, "learning_rate": 1.3056092843326885e-06, "loss": 1.1618, "step": 225 }, { "epoch": 0.04, "grad_norm": 0.4277031654442431, "learning_rate": 1.334622823984526e-06, "loss": 1.1457, "step": 230 }, { "epoch": 0.05, "grad_norm": 0.3444956221080674, "learning_rate": 1.3636363636363636e-06, "loss": 1.2013, "step": 235 }, { "epoch": 0.05, "grad_norm": 0.3421518714382308, "learning_rate": 1.3926499032882011e-06, "loss": 1.1327, "step": 240 }, { "epoch": 0.05, "grad_norm": 0.36336912925802345, "learning_rate": 1.4216634429400387e-06, "loss": 1.1283, "step": 245 }, { "epoch": 0.05, "grad_norm": 0.29948215466081957, "learning_rate": 1.4506769825918762e-06, "loss": 1.1337, "step": 250 }, { "epoch": 0.05, "grad_norm": 0.32776001109162717, "learning_rate": 1.4796905222437137e-06, "loss": 1.1058, "step": 255 }, { "epoch": 0.05, "grad_norm": 0.2866893939255288, "learning_rate": 1.5087040618955513e-06, "loss": 1.1276, "step": 260 }, { "epoch": 0.05, "grad_norm": 0.33107588796069914, "learning_rate": 1.5377176015473888e-06, "loss": 1.1665, "step": 265 }, { "epoch": 0.05, "grad_norm": 0.2771016817248348, "learning_rate": 1.5667311411992263e-06, "loss": 1.1298, "step": 270 }, { "epoch": 0.05, "grad_norm": 0.2950279622080215, "learning_rate": 1.5957446808510639e-06, "loss": 1.1179, "step": 275 }, { "epoch": 0.05, "grad_norm": 0.28628767272935396, "learning_rate": 1.6247582205029014e-06, "loss": 1.121, "step": 280 }, { "epoch": 0.06, "grad_norm": 0.32121632735389827, "learning_rate": 1.653771760154739e-06, "loss": 1.1191, "step": 285 }, { "epoch": 0.06, "grad_norm": 0.2673147970392464, "learning_rate": 1.6827852998065763e-06, "loss": 1.1308, "step": 290 }, { "epoch": 0.06, "grad_norm": 0.25690056840881836, "learning_rate": 1.7117988394584138e-06, "loss": 1.1028, "step": 295 }, { "epoch": 0.06, "grad_norm": 0.34733553615110824, "learning_rate": 1.7408123791102513e-06, "loss": 1.1204, "step": 300 }, { "epoch": 0.06, "grad_norm": 0.21641858619604487, "learning_rate": 1.7698259187620889e-06, "loss": 1.0929, "step": 305 }, { "epoch": 0.06, "grad_norm": 0.2671526741250324, "learning_rate": 1.7988394584139264e-06, "loss": 1.106, "step": 310 }, { "epoch": 0.06, "grad_norm": 0.2418205867189562, "learning_rate": 1.8278529980657641e-06, "loss": 1.0815, "step": 315 }, { "epoch": 0.06, "grad_norm": 0.2864594215791563, "learning_rate": 1.8568665377176017e-06, "loss": 1.0888, "step": 320 }, { "epoch": 0.06, "grad_norm": 0.22573020842814867, "learning_rate": 1.8858800773694392e-06, "loss": 1.108, "step": 325 }, { "epoch": 0.06, "grad_norm": 0.18929290310787175, "learning_rate": 1.9148936170212767e-06, "loss": 1.1112, "step": 330 }, { "epoch": 0.06, "grad_norm": 0.229402778930268, "learning_rate": 1.943907156673114e-06, "loss": 1.0799, "step": 335 }, { "epoch": 0.07, "grad_norm": 0.2459230628386859, "learning_rate": 1.972920696324952e-06, "loss": 1.1307, "step": 340 }, { "epoch": 0.07, "grad_norm": 0.207680539285622, "learning_rate": 2.001934235976789e-06, "loss": 1.0809, "step": 345 }, { "epoch": 0.07, "grad_norm": 0.2175863700443701, "learning_rate": 2.030947775628627e-06, "loss": 1.057, "step": 350 }, { "epoch": 0.07, "grad_norm": 0.23383978650057566, "learning_rate": 2.059961315280464e-06, "loss": 1.0926, "step": 355 }, { "epoch": 0.07, "grad_norm": 0.2200995259512634, "learning_rate": 2.088974854932302e-06, "loss": 1.0747, "step": 360 }, { "epoch": 0.07, "grad_norm": 0.21194139151827074, "learning_rate": 2.1179883945841393e-06, "loss": 1.0342, "step": 365 }, { "epoch": 0.07, "grad_norm": 0.30994391618458367, "learning_rate": 2.147001934235977e-06, "loss": 1.0625, "step": 370 }, { "epoch": 0.07, "grad_norm": 0.24113685123230963, "learning_rate": 2.1760154738878143e-06, "loss": 1.0649, "step": 375 }, { "epoch": 0.07, "grad_norm": 0.18918994099116018, "learning_rate": 2.2050290135396516e-06, "loss": 1.0749, "step": 380 }, { "epoch": 0.07, "grad_norm": 0.21256820119734426, "learning_rate": 2.2340425531914894e-06, "loss": 1.0562, "step": 385 }, { "epoch": 0.08, "grad_norm": 0.17510665051403598, "learning_rate": 2.2630560928433267e-06, "loss": 1.0538, "step": 390 }, { "epoch": 0.08, "grad_norm": 0.2008673882220927, "learning_rate": 2.2920696324951644e-06, "loss": 1.0571, "step": 395 }, { "epoch": 0.08, "grad_norm": 0.23508712912756172, "learning_rate": 2.3210831721470018e-06, "loss": 1.0505, "step": 400 }, { "epoch": 0.08, "grad_norm": 0.20306310295149868, "learning_rate": 2.3500967117988395e-06, "loss": 1.0454, "step": 405 }, { "epoch": 0.08, "grad_norm": 0.21068331150828368, "learning_rate": 2.3791102514506773e-06, "loss": 1.0551, "step": 410 }, { "epoch": 0.08, "grad_norm": 0.13689273954962194, "learning_rate": 2.4081237911025146e-06, "loss": 1.0474, "step": 415 }, { "epoch": 0.08, "grad_norm": 0.16160461077870894, "learning_rate": 2.4371373307543523e-06, "loss": 1.0478, "step": 420 }, { "epoch": 0.08, "grad_norm": 0.17395359723345014, "learning_rate": 2.4661508704061896e-06, "loss": 1.0525, "step": 425 }, { "epoch": 0.08, "grad_norm": 0.18279794025145507, "learning_rate": 2.4951644100580274e-06, "loss": 0.9952, "step": 430 }, { "epoch": 0.08, "grad_norm": 0.15844978893507106, "learning_rate": 2.5241779497098647e-06, "loss": 1.0216, "step": 435 }, { "epoch": 0.09, "grad_norm": 0.1436621940675145, "learning_rate": 2.553191489361702e-06, "loss": 1.0133, "step": 440 }, { "epoch": 0.09, "grad_norm": 0.1746824119521344, "learning_rate": 2.5822050290135398e-06, "loss": 1.0086, "step": 445 }, { "epoch": 0.09, "grad_norm": 0.16843680061281097, "learning_rate": 2.611218568665377e-06, "loss": 1.0124, "step": 450 }, { "epoch": 0.09, "grad_norm": 0.13899776670000386, "learning_rate": 2.640232108317215e-06, "loss": 1.0258, "step": 455 }, { "epoch": 0.09, "grad_norm": 0.16885171252583628, "learning_rate": 2.669245647969052e-06, "loss": 1.0025, "step": 460 }, { "epoch": 0.09, "grad_norm": 0.13947774165225663, "learning_rate": 2.69825918762089e-06, "loss": 1.0209, "step": 465 }, { "epoch": 0.09, "grad_norm": 0.16016862657644082, "learning_rate": 2.7272727272727272e-06, "loss": 1.0202, "step": 470 }, { "epoch": 0.09, "grad_norm": 0.1560774813153456, "learning_rate": 2.7562862669245645e-06, "loss": 1.0094, "step": 475 }, { "epoch": 0.09, "grad_norm": 0.15349128775811013, "learning_rate": 2.7852998065764023e-06, "loss": 1.0222, "step": 480 }, { "epoch": 0.09, "grad_norm": 0.1581693949842608, "learning_rate": 2.8143133462282396e-06, "loss": 1.0099, "step": 485 }, { "epoch": 0.09, "grad_norm": 0.1338165501774845, "learning_rate": 2.8433268858800774e-06, "loss": 0.9919, "step": 490 }, { "epoch": 0.1, "grad_norm": 0.16833331660791553, "learning_rate": 2.872340425531915e-06, "loss": 1.0109, "step": 495 }, { "epoch": 0.1, "grad_norm": 0.12418854228845702, "learning_rate": 2.9013539651837524e-06, "loss": 1.0114, "step": 500 }, { "epoch": 0.1, "grad_norm": 0.17269898089870625, "learning_rate": 2.93036750483559e-06, "loss": 1.0116, "step": 505 }, { "epoch": 0.1, "grad_norm": 0.1398155290290283, "learning_rate": 2.9593810444874275e-06, "loss": 0.9834, "step": 510 }, { "epoch": 0.1, "grad_norm": 0.13812113362847925, "learning_rate": 2.9883945841392652e-06, "loss": 1.0278, "step": 515 }, { "epoch": 0.1, "grad_norm": 0.1304950913697068, "learning_rate": 2.9999969229307894e-06, "loss": 0.994, "step": 520 }, { "epoch": 0.1, "grad_norm": 0.12161480167420421, "learning_rate": 2.999978118664665e-06, "loss": 0.9931, "step": 525 }, { "epoch": 0.1, "grad_norm": 0.12896036742563166, "learning_rate": 2.9999422198293556e-06, "loss": 1.0097, "step": 530 }, { "epoch": 0.1, "grad_norm": 0.13140405926118037, "learning_rate": 2.9998892268339835e-06, "loss": 1.004, "step": 535 }, { "epoch": 0.1, "grad_norm": 0.12459210092265861, "learning_rate": 2.999819140282485e-06, "loss": 0.9933, "step": 540 }, { "epoch": 0.11, "grad_norm": 0.11304692947321707, "learning_rate": 2.9997319609736057e-06, "loss": 1.024, "step": 545 }, { "epoch": 0.11, "grad_norm": 0.10712457456439482, "learning_rate": 2.9996276899008886e-06, "loss": 0.997, "step": 550 }, { "epoch": 0.11, "grad_norm": 0.12723199655873965, "learning_rate": 2.9995063282526635e-06, "loss": 0.999, "step": 555 }, { "epoch": 0.11, "grad_norm": 0.1117511114286072, "learning_rate": 2.9993678774120335e-06, "loss": 1.0005, "step": 560 }, { "epoch": 0.11, "grad_norm": 0.11921831176513668, "learning_rate": 2.9992123389568606e-06, "loss": 1.0128, "step": 565 }, { "epoch": 0.11, "grad_norm": 0.13264814163917732, "learning_rate": 2.9990397146597453e-06, "loss": 0.9958, "step": 570 }, { "epoch": 0.11, "grad_norm": 0.10853923751603145, "learning_rate": 2.998850006488009e-06, "loss": 1.008, "step": 575 }, { "epoch": 0.11, "grad_norm": 0.11972246590833577, "learning_rate": 2.9986432166036694e-06, "loss": 0.984, "step": 580 }, { "epoch": 0.11, "grad_norm": 0.11545852087538627, "learning_rate": 2.9984193473634165e-06, "loss": 0.9846, "step": 585 }, { "epoch": 0.11, "grad_norm": 0.10903802682217484, "learning_rate": 2.998178401318586e-06, "loss": 1.0103, "step": 590 }, { "epoch": 0.12, "grad_norm": 0.1056100500869663, "learning_rate": 2.9979203812151314e-06, "loss": 0.9846, "step": 595 }, { "epoch": 0.12, "grad_norm": 0.1086522268106536, "learning_rate": 2.9976452899935897e-06, "loss": 1.012, "step": 600 }, { "epoch": 0.12, "grad_norm": 0.11349091913892485, "learning_rate": 2.997353130789052e-06, "loss": 0.9868, "step": 605 }, { "epoch": 0.12, "grad_norm": 0.11520557146606587, "learning_rate": 2.9970439069311227e-06, "loss": 0.9859, "step": 610 }, { "epoch": 0.12, "grad_norm": 0.10731896545642554, "learning_rate": 2.996717621943886e-06, "loss": 0.9677, "step": 615 }, { "epoch": 0.12, "grad_norm": 0.10199258578885077, "learning_rate": 2.9963742795458634e-06, "loss": 0.9912, "step": 620 }, { "epoch": 0.12, "grad_norm": 0.1004784901885883, "learning_rate": 2.9960138836499727e-06, "loss": 0.974, "step": 625 }, { "epoch": 0.12, "grad_norm": 0.11816705487740833, "learning_rate": 2.9956364383634826e-06, "loss": 1.0011, "step": 630 }, { "epoch": 0.12, "grad_norm": 0.10828490126824755, "learning_rate": 2.9952419479879643e-06, "loss": 1.0004, "step": 635 }, { "epoch": 0.12, "grad_norm": 0.1064264302560214, "learning_rate": 2.9948304170192465e-06, "loss": 0.9906, "step": 640 }, { "epoch": 0.12, "grad_norm": 0.1111563680571522, "learning_rate": 2.99440185014736e-06, "loss": 0.9785, "step": 645 }, { "epoch": 0.13, "grad_norm": 0.10140413194917164, "learning_rate": 2.9939562522564877e-06, "loss": 1.0137, "step": 650 }, { "epoch": 0.13, "grad_norm": 0.10517761570743359, "learning_rate": 2.9934936284249047e-06, "loss": 0.9954, "step": 655 }, { "epoch": 0.13, "grad_norm": 0.10621130418725885, "learning_rate": 2.993013983924926e-06, "loss": 0.9724, "step": 660 }, { "epoch": 0.13, "grad_norm": 0.10215078910100184, "learning_rate": 2.992517324222842e-06, "loss": 0.9902, "step": 665 }, { "epoch": 0.13, "grad_norm": 0.10412671370027254, "learning_rate": 2.9920036549788573e-06, "loss": 0.9809, "step": 670 }, { "epoch": 0.13, "grad_norm": 0.11732922386471892, "learning_rate": 2.991472982047027e-06, "loss": 0.9623, "step": 675 }, { "epoch": 0.13, "grad_norm": 0.09628074134733036, "learning_rate": 2.990925311475189e-06, "loss": 0.9882, "step": 680 }, { "epoch": 0.13, "grad_norm": 0.10783922143897923, "learning_rate": 2.9903606495048965e-06, "loss": 0.983, "step": 685 }, { "epoch": 0.13, "grad_norm": 0.10412363645704524, "learning_rate": 2.9897790025713453e-06, "loss": 1.0016, "step": 690 }, { "epoch": 0.13, "grad_norm": 0.10425594495538223, "learning_rate": 2.9891803773033017e-06, "loss": 0.9834, "step": 695 }, { "epoch": 0.14, "grad_norm": 0.10570028300258567, "learning_rate": 2.9885647805230253e-06, "loss": 0.9608, "step": 700 }, { "epoch": 0.14, "grad_norm": 0.1156869590258465, "learning_rate": 2.987932219246193e-06, "loss": 1.0092, "step": 705 }, { "epoch": 0.14, "grad_norm": 0.1055268544146359, "learning_rate": 2.987282700681819e-06, "loss": 0.9927, "step": 710 }, { "epoch": 0.14, "grad_norm": 0.0962316020072262, "learning_rate": 2.9866162322321704e-06, "loss": 0.9824, "step": 715 }, { "epoch": 0.14, "grad_norm": 0.09521663449081877, "learning_rate": 2.9859328214926856e-06, "loss": 0.9623, "step": 720 }, { "epoch": 0.14, "grad_norm": 0.10471356087898716, "learning_rate": 2.9852324762518867e-06, "loss": 1.0006, "step": 725 }, { "epoch": 0.14, "grad_norm": 0.10041568704144976, "learning_rate": 2.98451520449129e-06, "loss": 0.9904, "step": 730 }, { "epoch": 0.14, "grad_norm": 0.11321291186855274, "learning_rate": 2.9837810143853162e-06, "loss": 1.0015, "step": 735 }, { "epoch": 0.14, "grad_norm": 0.09570436916323284, "learning_rate": 2.9830299143011955e-06, "loss": 0.9659, "step": 740 }, { "epoch": 0.14, "grad_norm": 0.11085995422039718, "learning_rate": 2.982261912798876e-06, "loss": 0.9865, "step": 745 }, { "epoch": 0.15, "grad_norm": 0.09744417943549286, "learning_rate": 2.9814770186309197e-06, "loss": 0.9662, "step": 750 }, { "epoch": 0.15, "grad_norm": 0.10429458744917756, "learning_rate": 2.980675240742411e-06, "loss": 0.9846, "step": 755 }, { "epoch": 0.15, "grad_norm": 0.10446461600654952, "learning_rate": 2.979856588270846e-06, "loss": 0.9822, "step": 760 }, { "epoch": 0.15, "grad_norm": 0.10542142436501455, "learning_rate": 2.979021070546038e-06, "loss": 0.9805, "step": 765 }, { "epoch": 0.15, "grad_norm": 0.10172022425622489, "learning_rate": 2.9781686970899998e-06, "loss": 0.9702, "step": 770 }, { "epoch": 0.15, "grad_norm": 0.11458755826070066, "learning_rate": 2.9772994776168466e-06, "loss": 0.9773, "step": 775 }, { "epoch": 0.15, "grad_norm": 0.09277700475991077, "learning_rate": 2.976413422032677e-06, "loss": 0.9767, "step": 780 }, { "epoch": 0.15, "grad_norm": 0.10884138178056307, "learning_rate": 2.9755105404354637e-06, "loss": 0.9742, "step": 785 }, { "epoch": 0.15, "grad_norm": 0.09160224444355052, "learning_rate": 2.974590843114939e-06, "loss": 0.9874, "step": 790 }, { "epoch": 0.15, "grad_norm": 0.09983015498372462, "learning_rate": 2.9736543405524747e-06, "loss": 0.9689, "step": 795 }, { "epoch": 0.15, "grad_norm": 0.10901476562057139, "learning_rate": 2.9727010434209652e-06, "loss": 0.9591, "step": 800 }, { "epoch": 0.16, "grad_norm": 0.0945610375498226, "learning_rate": 2.9717309625847053e-06, "loss": 0.9997, "step": 805 }, { "epoch": 0.16, "grad_norm": 0.1063437137043428, "learning_rate": 2.970744109099265e-06, "loss": 0.9787, "step": 810 }, { "epoch": 0.16, "grad_norm": 0.10165659250994069, "learning_rate": 2.9697404942113655e-06, "loss": 0.9559, "step": 815 }, { "epoch": 0.16, "grad_norm": 0.09899486787838711, "learning_rate": 2.9687201293587495e-06, "loss": 0.9515, "step": 820 }, { "epoch": 0.16, "grad_norm": 0.10160491144724951, "learning_rate": 2.967683026170052e-06, "loss": 0.9478, "step": 825 }, { "epoch": 0.16, "grad_norm": 0.10508036334433964, "learning_rate": 2.9666291964646663e-06, "loss": 0.966, "step": 830 }, { "epoch": 0.16, "grad_norm": 0.10340523865343762, "learning_rate": 2.9655586522526115e-06, "loss": 0.9757, "step": 835 }, { "epoch": 0.16, "grad_norm": 0.10542777862775049, "learning_rate": 2.9644714057343925e-06, "loss": 0.9753, "step": 840 }, { "epoch": 0.16, "grad_norm": 0.10247584372465218, "learning_rate": 2.9633674693008656e-06, "loss": 0.9607, "step": 845 }, { "epoch": 0.16, "grad_norm": 0.09993848260863396, "learning_rate": 2.9622468555330916e-06, "loss": 0.9775, "step": 850 }, { "epoch": 0.17, "grad_norm": 0.10504556965317913, "learning_rate": 2.961109577202197e-06, "loss": 0.9727, "step": 855 }, { "epoch": 0.17, "grad_norm": 0.10286749907745354, "learning_rate": 2.9599556472692262e-06, "loss": 0.9796, "step": 860 }, { "epoch": 0.17, "grad_norm": 0.0946833657296664, "learning_rate": 2.9587850788849942e-06, "loss": 0.9667, "step": 865 }, { "epoch": 0.17, "grad_norm": 0.10526928030356757, "learning_rate": 2.9575978853899377e-06, "loss": 0.9623, "step": 870 }, { "epoch": 0.17, "grad_norm": 0.0967614781879407, "learning_rate": 2.9563940803139607e-06, "loss": 0.9607, "step": 875 }, { "epoch": 0.17, "grad_norm": 0.10162924582712457, "learning_rate": 2.955173677376284e-06, "loss": 0.9698, "step": 880 }, { "epoch": 0.17, "grad_norm": 0.1144304223101541, "learning_rate": 2.9539366904852843e-06, "loss": 0.9852, "step": 885 }, { "epoch": 0.17, "grad_norm": 0.10636310716558237, "learning_rate": 2.9526831337383394e-06, "loss": 0.9606, "step": 890 }, { "epoch": 0.17, "grad_norm": 0.112534519091396, "learning_rate": 2.9514130214216665e-06, "loss": 0.9736, "step": 895 }, { "epoch": 0.17, "grad_norm": 0.10324814326836546, "learning_rate": 2.9501263680101588e-06, "loss": 0.9816, "step": 900 }, { "epoch": 0.18, "grad_norm": 0.10818625130621462, "learning_rate": 2.9488231881672203e-06, "loss": 0.9326, "step": 905 }, { "epoch": 0.18, "grad_norm": 0.10965885593819354, "learning_rate": 2.9475034967445993e-06, "loss": 0.9767, "step": 910 }, { "epoch": 0.18, "grad_norm": 0.10090790956890588, "learning_rate": 2.9461673087822204e-06, "loss": 0.9706, "step": 915 }, { "epoch": 0.18, "grad_norm": 0.10727272176169492, "learning_rate": 2.94481463950801e-06, "loss": 0.9647, "step": 920 }, { "epoch": 0.18, "grad_norm": 0.10704584677647615, "learning_rate": 2.9434455043377255e-06, "loss": 0.9683, "step": 925 }, { "epoch": 0.18, "grad_norm": 0.09401780473717516, "learning_rate": 2.9420599188747786e-06, "loss": 0.9499, "step": 930 }, { "epoch": 0.18, "grad_norm": 0.10562202866274173, "learning_rate": 2.9406578989100573e-06, "loss": 0.9814, "step": 935 }, { "epoch": 0.18, "grad_norm": 0.11078753036018767, "learning_rate": 2.9392394604217463e-06, "loss": 0.9522, "step": 940 }, { "epoch": 0.18, "grad_norm": 0.10599183497345623, "learning_rate": 2.937804619575144e-06, "loss": 0.9785, "step": 945 }, { "epoch": 0.18, "grad_norm": 0.10821166303387769, "learning_rate": 2.936353392722481e-06, "loss": 0.9484, "step": 950 }, { "epoch": 0.18, "grad_norm": 0.10164561197403676, "learning_rate": 2.934885796402729e-06, "loss": 0.9695, "step": 955 }, { "epoch": 0.19, "grad_norm": 0.0963480746857516, "learning_rate": 2.933401847341417e-06, "loss": 0.9704, "step": 960 }, { "epoch": 0.19, "grad_norm": 0.11877783442156198, "learning_rate": 2.931901562450439e-06, "loss": 0.9727, "step": 965 }, { "epoch": 0.19, "grad_norm": 0.11235009932152866, "learning_rate": 2.93038495882786e-06, "loss": 0.9836, "step": 970 }, { "epoch": 0.19, "grad_norm": 0.10489582071240841, "learning_rate": 2.9288520537577223e-06, "loss": 0.9715, "step": 975 }, { "epoch": 0.19, "grad_norm": 0.10530019662564404, "learning_rate": 2.927302864709848e-06, "loss": 0.947, "step": 980 }, { "epoch": 0.19, "grad_norm": 0.10948590499675388, "learning_rate": 2.9257374093396423e-06, "loss": 0.9544, "step": 985 }, { "epoch": 0.19, "grad_norm": 0.1101428849431525, "learning_rate": 2.9241557054878876e-06, "loss": 0.9736, "step": 990 }, { "epoch": 0.19, "grad_norm": 0.10161055478860785, "learning_rate": 2.9225577711805446e-06, "loss": 0.9579, "step": 995 }, { "epoch": 0.19, "grad_norm": 0.10234104818115608, "learning_rate": 2.920943624628545e-06, "loss": 0.9494, "step": 1000 }, { "epoch": 0.19, "grad_norm": 0.11125410578921856, "learning_rate": 2.9193132842275834e-06, "loss": 0.9665, "step": 1005 }, { "epoch": 0.2, "grad_norm": 0.10448226196682155, "learning_rate": 2.917666768557908e-06, "loss": 0.9492, "step": 1010 }, { "epoch": 0.2, "grad_norm": 0.10464379124965918, "learning_rate": 2.916004096384112e-06, "loss": 0.9485, "step": 1015 }, { "epoch": 0.2, "grad_norm": 0.11240399703260856, "learning_rate": 2.9143252866549126e-06, "loss": 0.9805, "step": 1020 }, { "epoch": 0.2, "grad_norm": 0.10262467247255205, "learning_rate": 2.9126303585029424e-06, "loss": 0.9533, "step": 1025 }, { "epoch": 0.2, "grad_norm": 0.10660529098219367, "learning_rate": 2.9109193312445277e-06, "loss": 0.9797, "step": 1030 }, { "epoch": 0.2, "grad_norm": 0.10233872211834552, "learning_rate": 2.909192224379469e-06, "loss": 0.9755, "step": 1035 }, { "epoch": 0.2, "grad_norm": 0.10675971763004717, "learning_rate": 2.907449057590818e-06, "loss": 0.958, "step": 1040 }, { "epoch": 0.2, "grad_norm": 0.11494064560394811, "learning_rate": 2.9056898507446553e-06, "loss": 0.9426, "step": 1045 }, { "epoch": 0.2, "grad_norm": 0.11375236977081475, "learning_rate": 2.9039146238898615e-06, "loss": 0.9438, "step": 1050 }, { "epoch": 0.2, "grad_norm": 0.10951341713317103, "learning_rate": 2.9021233972578917e-06, "loss": 0.954, "step": 1055 }, { "epoch": 0.21, "grad_norm": 0.11289886578757521, "learning_rate": 2.9003161912625412e-06, "loss": 0.9651, "step": 1060 }, { "epoch": 0.21, "grad_norm": 0.10761927389343565, "learning_rate": 2.8984930264997153e-06, "loss": 0.9855, "step": 1065 }, { "epoch": 0.21, "grad_norm": 0.11084863010339359, "learning_rate": 2.8966539237471957e-06, "loss": 0.9749, "step": 1070 }, { "epoch": 0.21, "grad_norm": 0.10914255215103161, "learning_rate": 2.8947989039644e-06, "loss": 0.9434, "step": 1075 }, { "epoch": 0.21, "grad_norm": 0.11264654500852792, "learning_rate": 2.8929279882921465e-06, "loss": 0.9776, "step": 1080 }, { "epoch": 0.21, "grad_norm": 0.11132127023457686, "learning_rate": 2.891041198052411e-06, "loss": 0.9507, "step": 1085 }, { "epoch": 0.21, "grad_norm": 0.10653706131717319, "learning_rate": 2.8891385547480846e-06, "loss": 0.9535, "step": 1090 }, { "epoch": 0.21, "grad_norm": 0.1151077199781543, "learning_rate": 2.887220080062729e-06, "loss": 0.9761, "step": 1095 }, { "epoch": 0.21, "grad_norm": 0.10470041169674428, "learning_rate": 2.8852857958603284e-06, "loss": 0.9736, "step": 1100 }, { "epoch": 0.21, "grad_norm": 0.11918031326179271, "learning_rate": 2.883335724185041e-06, "loss": 0.9437, "step": 1105 }, { "epoch": 0.21, "grad_norm": 0.1107590462985888, "learning_rate": 2.8813698872609478e-06, "loss": 0.9461, "step": 1110 }, { "epoch": 0.22, "grad_norm": 0.11545803285412702, "learning_rate": 2.8793883074917996e-06, "loss": 0.9741, "step": 1115 }, { "epoch": 0.22, "grad_norm": 0.10989642132296704, "learning_rate": 2.8773910074607604e-06, "loss": 0.9375, "step": 1120 }, { "epoch": 0.22, "grad_norm": 0.11350061884840995, "learning_rate": 2.875378009930151e-06, "loss": 0.9762, "step": 1125 }, { "epoch": 0.22, "grad_norm": 0.10670246968609752, "learning_rate": 2.8733493378411908e-06, "loss": 0.9611, "step": 1130 }, { "epoch": 0.22, "grad_norm": 0.11315693520491527, "learning_rate": 2.8713050143137327e-06, "loss": 0.9574, "step": 1135 }, { "epoch": 0.22, "grad_norm": 0.10792619360055242, "learning_rate": 2.869245062646004e-06, "loss": 0.9755, "step": 1140 }, { "epoch": 0.22, "grad_norm": 0.11860109794592168, "learning_rate": 2.8671695063143373e-06, "loss": 0.9682, "step": 1145 }, { "epoch": 0.22, "grad_norm": 0.11295073321194798, "learning_rate": 2.865078368972907e-06, "loss": 0.9732, "step": 1150 }, { "epoch": 0.22, "grad_norm": 0.10877608828125572, "learning_rate": 2.862971674453453e-06, "loss": 0.9319, "step": 1155 }, { "epoch": 0.22, "grad_norm": 0.11905716007142109, "learning_rate": 2.860849446765017e-06, "loss": 0.9563, "step": 1160 }, { "epoch": 0.23, "grad_norm": 0.108659995668165, "learning_rate": 2.8587117100936642e-06, "loss": 0.9323, "step": 1165 }, { "epoch": 0.23, "grad_norm": 0.11330495373273691, "learning_rate": 2.856558488802207e-06, "loss": 0.9461, "step": 1170 }, { "epoch": 0.23, "grad_norm": 0.11183891702192725, "learning_rate": 2.854389807429932e-06, "loss": 0.937, "step": 1175 }, { "epoch": 0.23, "grad_norm": 0.10647343953458478, "learning_rate": 2.8522056906923136e-06, "loss": 0.9432, "step": 1180 }, { "epoch": 0.23, "grad_norm": 0.10989413716941382, "learning_rate": 2.8500061634807397e-06, "loss": 0.9434, "step": 1185 }, { "epoch": 0.23, "grad_norm": 0.10609789647222649, "learning_rate": 2.847791250862222e-06, "loss": 0.9708, "step": 1190 }, { "epoch": 0.23, "grad_norm": 0.11449356746375824, "learning_rate": 2.845560978079113e-06, "loss": 0.9493, "step": 1195 }, { "epoch": 0.23, "grad_norm": 0.11863005755900938, "learning_rate": 2.843315370548819e-06, "loss": 0.9402, "step": 1200 }, { "epoch": 0.23, "grad_norm": 0.11152572810815058, "learning_rate": 2.8410544538635086e-06, "loss": 0.9669, "step": 1205 }, { "epoch": 0.23, "grad_norm": 0.1118716690063177, "learning_rate": 2.838778253789822e-06, "loss": 0.9469, "step": 1210 }, { "epoch": 0.24, "grad_norm": 0.11513622367346048, "learning_rate": 2.8364867962685775e-06, "loss": 0.9732, "step": 1215 }, { "epoch": 0.24, "grad_norm": 0.11916471698468781, "learning_rate": 2.834180107414476e-06, "loss": 0.9588, "step": 1220 }, { "epoch": 0.24, "grad_norm": 0.10636597317515512, "learning_rate": 2.831858213515802e-06, "loss": 0.9781, "step": 1225 }, { "epoch": 0.24, "grad_norm": 0.1144580288076685, "learning_rate": 2.829521141034125e-06, "loss": 0.956, "step": 1230 }, { "epoch": 0.24, "grad_norm": 0.11413870856691348, "learning_rate": 2.8271689166039986e-06, "loss": 0.9568, "step": 1235 }, { "epoch": 0.24, "grad_norm": 0.11086368743242728, "learning_rate": 2.8248015670326564e-06, "loss": 0.9455, "step": 1240 }, { "epoch": 0.24, "grad_norm": 0.10920372063922966, "learning_rate": 2.822419119299706e-06, "loss": 0.9435, "step": 1245 }, { "epoch": 0.24, "grad_norm": 0.11758849733693692, "learning_rate": 2.8200216005568218e-06, "loss": 0.9421, "step": 1250 }, { "epoch": 0.24, "grad_norm": 0.10936170827027436, "learning_rate": 2.817609038127435e-06, "loss": 0.9538, "step": 1255 }, { "epoch": 0.24, "grad_norm": 0.11188798595384854, "learning_rate": 2.815181459506425e-06, "loss": 0.9823, "step": 1260 }, { "epoch": 0.24, "grad_norm": 0.14295692634361193, "learning_rate": 2.8127388923598008e-06, "loss": 0.9533, "step": 1265 }, { "epoch": 0.25, "grad_norm": 0.1147686682077821, "learning_rate": 2.810281364524392e-06, "loss": 0.9714, "step": 1270 }, { "epoch": 0.25, "grad_norm": 0.11140244758844407, "learning_rate": 2.807808904007526e-06, "loss": 0.9554, "step": 1275 }, { "epoch": 0.25, "grad_norm": 0.10882014916760172, "learning_rate": 2.805321538986713e-06, "loss": 0.9445, "step": 1280 }, { "epoch": 0.25, "grad_norm": 0.11197943958186041, "learning_rate": 2.802819297809321e-06, "loss": 0.9433, "step": 1285 }, { "epoch": 0.25, "grad_norm": 0.1127674218112967, "learning_rate": 2.8003022089922564e-06, "loss": 0.9612, "step": 1290 }, { "epoch": 0.25, "grad_norm": 0.11516015506964294, "learning_rate": 2.7977703012216375e-06, "loss": 0.9562, "step": 1295 }, { "epoch": 0.25, "grad_norm": 0.12293491236553014, "learning_rate": 2.7952236033524658e-06, "loss": 0.9593, "step": 1300 }, { "epoch": 0.25, "grad_norm": 0.12143251214899849, "learning_rate": 2.7926621444083015e-06, "loss": 0.9569, "step": 1305 }, { "epoch": 0.25, "grad_norm": 0.1094629806227622, "learning_rate": 2.790085953580927e-06, "loss": 0.9568, "step": 1310 }, { "epoch": 0.25, "grad_norm": 0.11455799113079224, "learning_rate": 2.7874950602300197e-06, "loss": 0.953, "step": 1315 }, { "epoch": 0.26, "grad_norm": 0.11535461499254474, "learning_rate": 2.7848894938828134e-06, "loss": 0.9035, "step": 1320 }, { "epoch": 0.26, "grad_norm": 0.1149148628451183, "learning_rate": 2.7822692842337654e-06, "loss": 0.9709, "step": 1325 }, { "epoch": 0.26, "grad_norm": 0.10799586189243776, "learning_rate": 2.7796344611442133e-06, "loss": 0.9492, "step": 1330 }, { "epoch": 0.26, "grad_norm": 0.11235503423781165, "learning_rate": 2.7769850546420396e-06, "loss": 1.0031, "step": 1335 }, { "epoch": 0.26, "grad_norm": 0.11596192217593441, "learning_rate": 2.774321094921326e-06, "loss": 0.9478, "step": 1340 }, { "epoch": 0.26, "grad_norm": 0.11386850099809975, "learning_rate": 2.7716426123420114e-06, "loss": 0.9464, "step": 1345 }, { "epoch": 0.26, "grad_norm": 0.12212660771198781, "learning_rate": 2.768949637429546e-06, "loss": 0.9588, "step": 1350 }, { "epoch": 0.26, "grad_norm": 0.11252131716640058, "learning_rate": 2.76624220087454e-06, "loss": 0.942, "step": 1355 }, { "epoch": 0.26, "grad_norm": 0.11804484758605481, "learning_rate": 2.7635203335324185e-06, "loss": 0.9492, "step": 1360 }, { "epoch": 0.26, "grad_norm": 0.11757492161038868, "learning_rate": 2.7607840664230674e-06, "loss": 0.9664, "step": 1365 }, { "epoch": 0.26, "grad_norm": 0.11458358844876558, "learning_rate": 2.758033430730479e-06, "loss": 0.9495, "step": 1370 }, { "epoch": 0.27, "grad_norm": 0.1228068142781927, "learning_rate": 2.7552684578023998e-06, "loss": 0.9473, "step": 1375 }, { "epoch": 0.27, "grad_norm": 0.1167571702347808, "learning_rate": 2.752489179149969e-06, "loss": 0.9743, "step": 1380 }, { "epoch": 0.27, "grad_norm": 0.11259236811273615, "learning_rate": 2.7496956264473635e-06, "loss": 0.9517, "step": 1385 }, { "epoch": 0.27, "grad_norm": 0.114209224633214, "learning_rate": 2.746887831531434e-06, "loss": 0.9608, "step": 1390 }, { "epoch": 0.27, "grad_norm": 0.12180028064162973, "learning_rate": 2.744065826401344e-06, "loss": 0.9357, "step": 1395 }, { "epoch": 0.27, "grad_norm": 0.11724624103676141, "learning_rate": 2.7412296432182035e-06, "loss": 0.955, "step": 1400 }, { "epoch": 0.27, "grad_norm": 0.11928864261797452, "learning_rate": 2.738379314304704e-06, "loss": 0.938, "step": 1405 }, { "epoch": 0.27, "grad_norm": 0.1183836619102445, "learning_rate": 2.735514872144749e-06, "loss": 0.9638, "step": 1410 }, { "epoch": 0.27, "grad_norm": 0.11343899459393017, "learning_rate": 2.732636349383085e-06, "loss": 0.9648, "step": 1415 }, { "epoch": 0.27, "grad_norm": 0.11856005446375929, "learning_rate": 2.7297437788249276e-06, "loss": 0.9663, "step": 1420 }, { "epoch": 0.28, "grad_norm": 0.1502591765852686, "learning_rate": 2.72683719343559e-06, "loss": 0.9361, "step": 1425 }, { "epoch": 0.28, "grad_norm": 0.11914573111945241, "learning_rate": 2.7239166263401056e-06, "loss": 0.9595, "step": 1430 }, { "epoch": 0.28, "grad_norm": 0.1230018369983442, "learning_rate": 2.7209821108228497e-06, "loss": 0.9565, "step": 1435 }, { "epoch": 0.28, "grad_norm": 0.12071540798615119, "learning_rate": 2.718033680327163e-06, "loss": 0.9737, "step": 1440 }, { "epoch": 0.28, "grad_norm": 0.11960934492681863, "learning_rate": 2.715071368454969e-06, "loss": 0.9185, "step": 1445 }, { "epoch": 0.28, "grad_norm": 0.11733580889296377, "learning_rate": 2.7120952089663894e-06, "loss": 0.9414, "step": 1450 }, { "epoch": 0.28, "grad_norm": 0.11053633322216715, "learning_rate": 2.7091052357793627e-06, "loss": 0.9349, "step": 1455 }, { "epoch": 0.28, "grad_norm": 0.12986099128088718, "learning_rate": 2.7061014829692546e-06, "loss": 0.9807, "step": 1460 }, { "epoch": 0.28, "grad_norm": 0.12933111225425914, "learning_rate": 2.703083984768471e-06, "loss": 0.9442, "step": 1465 }, { "epoch": 0.28, "grad_norm": 0.115955353513501, "learning_rate": 2.7000527755660684e-06, "loss": 0.948, "step": 1470 }, { "epoch": 0.29, "grad_norm": 0.11943463657143313, "learning_rate": 2.697007889907361e-06, "loss": 0.9573, "step": 1475 }, { "epoch": 0.29, "grad_norm": 0.11673257150242644, "learning_rate": 2.693949362493527e-06, "loss": 0.9387, "step": 1480 }, { "epoch": 0.29, "grad_norm": 0.11899065882144028, "learning_rate": 2.690877228181215e-06, "loss": 0.9493, "step": 1485 }, { "epoch": 0.29, "grad_norm": 0.12183609939386461, "learning_rate": 2.6877915219821427e-06, "loss": 0.9539, "step": 1490 }, { "epoch": 0.29, "grad_norm": 0.11127194999506593, "learning_rate": 2.6846922790627024e-06, "loss": 0.9443, "step": 1495 }, { "epoch": 0.29, "grad_norm": 0.11553816505121053, "learning_rate": 2.6815795347435577e-06, "loss": 0.9298, "step": 1500 }, { "epoch": 0.29, "grad_norm": 0.11447251373564948, "learning_rate": 2.6784533244992416e-06, "loss": 0.9375, "step": 1505 }, { "epoch": 0.29, "grad_norm": 0.12166179843418228, "learning_rate": 2.6753136839577522e-06, "loss": 0.9349, "step": 1510 }, { "epoch": 0.29, "grad_norm": 0.12100641500283442, "learning_rate": 2.6721606489001457e-06, "loss": 0.9293, "step": 1515 }, { "epoch": 0.29, "grad_norm": 0.12359155363514698, "learning_rate": 2.668994255260131e-06, "loss": 0.9624, "step": 1520 }, { "epoch": 0.29, "grad_norm": 0.11179542251492335, "learning_rate": 2.6658145391236574e-06, "loss": 0.9375, "step": 1525 }, { "epoch": 0.3, "grad_norm": 0.1113458165161423, "learning_rate": 2.6626215367285054e-06, "loss": 0.92, "step": 1530 }, { "epoch": 0.3, "grad_norm": 0.11826530596830692, "learning_rate": 2.659415284463873e-06, "loss": 0.9829, "step": 1535 }, { "epoch": 0.3, "grad_norm": 0.1223957965290031, "learning_rate": 2.6561958188699604e-06, "loss": 0.9485, "step": 1540 }, { "epoch": 0.3, "grad_norm": 0.12481799756632796, "learning_rate": 2.6529631766375546e-06, "loss": 0.9532, "step": 1545 }, { "epoch": 0.3, "grad_norm": 0.11360313455180103, "learning_rate": 2.6497173946076098e-06, "loss": 0.9648, "step": 1550 }, { "epoch": 0.3, "grad_norm": 0.11832505401191586, "learning_rate": 2.64645850977083e-06, "loss": 0.9353, "step": 1555 }, { "epoch": 0.3, "grad_norm": 0.11917615522222746, "learning_rate": 2.643186559267245e-06, "loss": 0.9453, "step": 1560 }, { "epoch": 0.3, "grad_norm": 0.12184287945841704, "learning_rate": 2.6399015803857885e-06, "loss": 0.9543, "step": 1565 }, { "epoch": 0.3, "grad_norm": 0.11651535092179631, "learning_rate": 2.636603610563872e-06, "loss": 0.946, "step": 1570 }, { "epoch": 0.3, "grad_norm": 0.13089416464535625, "learning_rate": 2.6332926873869595e-06, "loss": 0.9612, "step": 1575 }, { "epoch": 0.31, "grad_norm": 0.12640696317783878, "learning_rate": 2.629968848588138e-06, "loss": 0.9485, "step": 1580 }, { "epoch": 0.31, "grad_norm": 0.12467989812698095, "learning_rate": 2.6266321320476893e-06, "loss": 0.9467, "step": 1585 }, { "epoch": 0.31, "grad_norm": 0.12292162295288515, "learning_rate": 2.6232825757926555e-06, "loss": 0.9526, "step": 1590 }, { "epoch": 0.31, "grad_norm": 0.12693069162671494, "learning_rate": 2.6199202179964064e-06, "loss": 0.9495, "step": 1595 }, { "epoch": 0.31, "grad_norm": 0.12251580300224744, "learning_rate": 2.6165450969782074e-06, "loss": 0.9479, "step": 1600 }, { "epoch": 0.31, "grad_norm": 0.12239346691673264, "learning_rate": 2.61315725120278e-06, "loss": 0.9592, "step": 1605 }, { "epoch": 0.31, "grad_norm": 0.11659453736794827, "learning_rate": 2.609756719279862e-06, "loss": 0.9378, "step": 1610 }, { "epoch": 0.31, "grad_norm": 0.12683066622391057, "learning_rate": 2.606343539963772e-06, "loss": 0.9412, "step": 1615 }, { "epoch": 0.31, "grad_norm": 0.11735665618288187, "learning_rate": 2.6029177521529633e-06, "loss": 0.936, "step": 1620 }, { "epoch": 0.31, "grad_norm": 0.1216598234634421, "learning_rate": 2.5994793948895835e-06, "loss": 0.9627, "step": 1625 }, { "epoch": 0.32, "grad_norm": 0.12601877660770533, "learning_rate": 2.596028507359029e-06, "loss": 0.9529, "step": 1630 }, { "epoch": 0.32, "grad_norm": 0.12702458316754647, "learning_rate": 2.5925651288894965e-06, "loss": 0.9515, "step": 1635 }, { "epoch": 0.32, "grad_norm": 0.12058379659459599, "learning_rate": 2.5890892989515367e-06, "loss": 0.9298, "step": 1640 }, { "epoch": 0.32, "grad_norm": 0.12573278202145702, "learning_rate": 2.585601057157605e-06, "loss": 0.9575, "step": 1645 }, { "epoch": 0.32, "grad_norm": 0.11676843442815175, "learning_rate": 2.582100443261609e-06, "loss": 0.9466, "step": 1650 }, { "epoch": 0.32, "grad_norm": 0.12406638621197374, "learning_rate": 2.5785874971584536e-06, "loss": 0.9403, "step": 1655 }, { "epoch": 0.32, "grad_norm": 0.12389135267465634, "learning_rate": 2.5750622588835903e-06, "loss": 0.9423, "step": 1660 }, { "epoch": 0.32, "grad_norm": 0.12180646520632062, "learning_rate": 2.571524768612558e-06, "loss": 0.9223, "step": 1665 }, { "epoch": 0.32, "grad_norm": 0.11486707403126087, "learning_rate": 2.567975066660527e-06, "loss": 0.9275, "step": 1670 }, { "epoch": 0.32, "grad_norm": 0.12857636220545796, "learning_rate": 2.564413193481837e-06, "loss": 0.9749, "step": 1675 }, { "epoch": 0.32, "grad_norm": 0.12086931508695424, "learning_rate": 2.5608391896695388e-06, "loss": 0.9439, "step": 1680 }, { "epoch": 0.33, "grad_norm": 0.12178686326127208, "learning_rate": 2.55725309595493e-06, "loss": 0.954, "step": 1685 }, { "epoch": 0.33, "grad_norm": 0.12960869330311783, "learning_rate": 2.5536549532070913e-06, "loss": 0.9352, "step": 1690 }, { "epoch": 0.33, "grad_norm": 0.12553474416457935, "learning_rate": 2.550044802432422e-06, "loss": 0.9442, "step": 1695 }, { "epoch": 0.33, "grad_norm": 0.12732282668760914, "learning_rate": 2.5464226847741695e-06, "loss": 0.9314, "step": 1700 }, { "epoch": 0.33, "grad_norm": 0.13055875843349435, "learning_rate": 2.5427886415119635e-06, "loss": 0.9186, "step": 1705 }, { "epoch": 0.33, "grad_norm": 0.12814219216348366, "learning_rate": 2.539142714061344e-06, "loss": 0.93, "step": 1710 }, { "epoch": 0.33, "grad_norm": 0.13703362060653562, "learning_rate": 2.5354849439732902e-06, "loss": 0.9353, "step": 1715 }, { "epoch": 0.33, "grad_norm": 0.11733228892071898, "learning_rate": 2.5318153729337457e-06, "loss": 0.9549, "step": 1720 }, { "epoch": 0.33, "grad_norm": 0.12639938357266184, "learning_rate": 2.5281340427631445e-06, "loss": 0.9479, "step": 1725 }, { "epoch": 0.33, "grad_norm": 0.12858912657134408, "learning_rate": 2.5244409954159343e-06, "loss": 0.9157, "step": 1730 }, { "epoch": 0.34, "grad_norm": 0.13765344027585624, "learning_rate": 2.5207362729800986e-06, "loss": 0.9567, "step": 1735 }, { "epoch": 0.34, "grad_norm": 0.1188666008027966, "learning_rate": 2.5170199176766746e-06, "loss": 0.9454, "step": 1740 }, { "epoch": 0.34, "grad_norm": 0.12528858240136181, "learning_rate": 2.5132919718592767e-06, "loss": 0.9445, "step": 1745 }, { "epoch": 0.34, "grad_norm": 0.12298871563801664, "learning_rate": 2.5095524780136096e-06, "loss": 0.9543, "step": 1750 }, { "epoch": 0.34, "grad_norm": 0.1311433270714553, "learning_rate": 2.5058014787569847e-06, "loss": 0.9501, "step": 1755 }, { "epoch": 0.34, "grad_norm": 0.12625986029021932, "learning_rate": 2.5020390168378376e-06, "loss": 0.991, "step": 1760 }, { "epoch": 0.34, "grad_norm": 0.12627600348385226, "learning_rate": 2.498265135135237e-06, "loss": 0.9804, "step": 1765 }, { "epoch": 0.34, "grad_norm": 0.12480939156448727, "learning_rate": 2.4944798766583986e-06, "loss": 0.9575, "step": 1770 }, { "epoch": 0.34, "grad_norm": 0.12814473985468958, "learning_rate": 2.490683284546193e-06, "loss": 0.94, "step": 1775 }, { "epoch": 0.34, "grad_norm": 0.12354291356370957, "learning_rate": 2.4868754020666566e-06, "loss": 0.9441, "step": 1780 }, { "epoch": 0.35, "grad_norm": 0.1230166173419696, "learning_rate": 2.4830562726164958e-06, "loss": 0.9207, "step": 1785 }, { "epoch": 0.35, "grad_norm": 0.11599834288712259, "learning_rate": 2.479225939720593e-06, "loss": 0.9233, "step": 1790 }, { "epoch": 0.35, "grad_norm": 0.12460890724939186, "learning_rate": 2.4753844470315135e-06, "loss": 0.938, "step": 1795 }, { "epoch": 0.35, "grad_norm": 0.1235331336241235, "learning_rate": 2.4715318383290037e-06, "loss": 0.9638, "step": 1800 }, { "epoch": 0.35, "grad_norm": 0.12749668661162603, "learning_rate": 2.4676681575194943e-06, "loss": 0.9297, "step": 1805 }, { "epoch": 0.35, "grad_norm": 0.13092231220069622, "learning_rate": 2.4637934486356012e-06, "loss": 0.9482, "step": 1810 }, { "epoch": 0.35, "grad_norm": 0.12567362421402142, "learning_rate": 2.4599077558356207e-06, "loss": 0.9716, "step": 1815 }, { "epoch": 0.35, "grad_norm": 0.12291260255078236, "learning_rate": 2.456011123403028e-06, "loss": 0.9442, "step": 1820 }, { "epoch": 0.35, "grad_norm": 0.13018458909985667, "learning_rate": 2.452103595745974e-06, "loss": 0.9583, "step": 1825 }, { "epoch": 0.35, "grad_norm": 0.12359082787357942, "learning_rate": 2.4481852173967746e-06, "loss": 0.9143, "step": 1830 }, { "epoch": 0.35, "grad_norm": 0.12792177515126044, "learning_rate": 2.4442560330114092e-06, "loss": 0.9359, "step": 1835 }, { "epoch": 0.36, "grad_norm": 0.12085993870314579, "learning_rate": 2.4403160873690063e-06, "loss": 0.9397, "step": 1840 }, { "epoch": 0.36, "grad_norm": 0.12401548468347032, "learning_rate": 2.436365425371337e-06, "loss": 0.8997, "step": 1845 }, { "epoch": 0.36, "grad_norm": 0.13214018330862026, "learning_rate": 2.432404092042301e-06, "loss": 0.927, "step": 1850 }, { "epoch": 0.36, "grad_norm": 0.12399112060015242, "learning_rate": 2.4284321325274144e-06, "loss": 0.9359, "step": 1855 }, { "epoch": 0.36, "grad_norm": 0.1251239358952118, "learning_rate": 2.424449592093296e-06, "loss": 0.9526, "step": 1860 }, { "epoch": 0.36, "grad_norm": 0.1255660262761407, "learning_rate": 2.42045651612715e-06, "loss": 0.9569, "step": 1865 }, { "epoch": 0.36, "grad_norm": 0.12365762191881352, "learning_rate": 2.416452950136248e-06, "loss": 0.9303, "step": 1870 }, { "epoch": 0.36, "grad_norm": 0.12535472693272393, "learning_rate": 2.412438939747414e-06, "loss": 0.9374, "step": 1875 }, { "epoch": 0.36, "grad_norm": 0.1339666694324748, "learning_rate": 2.4084145307065e-06, "loss": 0.9214, "step": 1880 }, { "epoch": 0.36, "grad_norm": 0.12465441927649695, "learning_rate": 2.404379768877868e-06, "loss": 0.9258, "step": 1885 }, { "epoch": 0.37, "grad_norm": 0.13785224280245373, "learning_rate": 2.4003347002438657e-06, "loss": 0.9534, "step": 1890 }, { "epoch": 0.37, "grad_norm": 0.12897658276955143, "learning_rate": 2.396279370904303e-06, "loss": 0.9378, "step": 1895 }, { "epoch": 0.37, "grad_norm": 0.1342231606408141, "learning_rate": 2.3922138270759247e-06, "loss": 0.9313, "step": 1900 }, { "epoch": 0.37, "grad_norm": 0.12478455394570859, "learning_rate": 2.388138115091888e-06, "loss": 0.9715, "step": 1905 }, { "epoch": 0.37, "grad_norm": 0.12400147036199631, "learning_rate": 2.3840522814012304e-06, "loss": 0.9335, "step": 1910 }, { "epoch": 0.37, "grad_norm": 0.12614825735019372, "learning_rate": 2.379956372568343e-06, "loss": 0.9389, "step": 1915 }, { "epoch": 0.37, "grad_norm": 0.12819460837673466, "learning_rate": 2.375850435272437e-06, "loss": 0.9298, "step": 1920 }, { "epoch": 0.37, "grad_norm": 0.13218189606634853, "learning_rate": 2.371734516307015e-06, "loss": 0.9271, "step": 1925 }, { "epoch": 0.37, "grad_norm": 0.12031632571886923, "learning_rate": 2.3676086625793353e-06, "loss": 0.9191, "step": 1930 }, { "epoch": 0.37, "grad_norm": 0.1368420193828202, "learning_rate": 2.3634729211098786e-06, "loss": 0.9335, "step": 1935 }, { "epoch": 0.38, "grad_norm": 0.12612432940530838, "learning_rate": 2.3593273390318118e-06, "loss": 0.9505, "step": 1940 }, { "epoch": 0.38, "grad_norm": 0.1291015620161155, "learning_rate": 2.355171963590451e-06, "loss": 0.9072, "step": 1945 }, { "epoch": 0.38, "grad_norm": 0.13561142947536237, "learning_rate": 2.3510068421427205e-06, "loss": 0.9557, "step": 1950 }, { "epoch": 0.38, "grad_norm": 0.1304501468364583, "learning_rate": 2.3468320221566194e-06, "loss": 0.9606, "step": 1955 }, { "epoch": 0.38, "grad_norm": 0.12763130154749866, "learning_rate": 2.3426475512106737e-06, "loss": 0.9699, "step": 1960 }, { "epoch": 0.38, "grad_norm": 0.13527714632992727, "learning_rate": 2.3384534769933968e-06, "loss": 0.9303, "step": 1965 }, { "epoch": 0.38, "grad_norm": 0.12632208652934207, "learning_rate": 2.3342498473027487e-06, "loss": 0.9403, "step": 1970 }, { "epoch": 0.38, "grad_norm": 0.13322198624906814, "learning_rate": 2.3300367100455857e-06, "loss": 0.946, "step": 1975 }, { "epoch": 0.38, "grad_norm": 0.12936391117162524, "learning_rate": 2.3258141132371215e-06, "loss": 0.9489, "step": 1980 }, { "epoch": 0.38, "grad_norm": 0.13012323743231977, "learning_rate": 2.321582105000371e-06, "loss": 0.9474, "step": 1985 }, { "epoch": 0.38, "grad_norm": 0.12787846971167063, "learning_rate": 2.317340733565611e-06, "loss": 0.9546, "step": 1990 }, { "epoch": 0.39, "grad_norm": 0.13369044731603097, "learning_rate": 2.3130900472698252e-06, "loss": 0.9638, "step": 1995 }, { "epoch": 0.39, "grad_norm": 0.13019183472993442, "learning_rate": 2.308830094556153e-06, "loss": 0.9474, "step": 2000 }, { "epoch": 0.39, "grad_norm": 0.13265098197617997, "learning_rate": 2.30456092397334e-06, "loss": 0.9323, "step": 2005 }, { "epoch": 0.39, "grad_norm": 0.1302535176783885, "learning_rate": 2.300282584175186e-06, "loss": 0.9167, "step": 2010 }, { "epoch": 0.39, "grad_norm": 0.14336847605116843, "learning_rate": 2.2959951239199844e-06, "loss": 0.9724, "step": 2015 }, { "epoch": 0.39, "grad_norm": 0.1314846076011854, "learning_rate": 2.291698592069972e-06, "loss": 0.9379, "step": 2020 }, { "epoch": 0.39, "grad_norm": 0.12542537335155546, "learning_rate": 2.2873930375907707e-06, "loss": 0.9416, "step": 2025 }, { "epoch": 0.39, "grad_norm": 0.13451062144224887, "learning_rate": 2.283078509550829e-06, "loss": 0.9423, "step": 2030 }, { "epoch": 0.39, "grad_norm": 0.12982420568281253, "learning_rate": 2.278755057120863e-06, "loss": 0.9643, "step": 2035 }, { "epoch": 0.39, "grad_norm": 0.1323171694681192, "learning_rate": 2.2744227295732956e-06, "loss": 0.9301, "step": 2040 }, { "epoch": 0.4, "grad_norm": 0.12532510813835535, "learning_rate": 2.270081576281696e-06, "loss": 0.9423, "step": 2045 }, { "epoch": 0.4, "grad_norm": 0.13571475473397304, "learning_rate": 2.2657316467202156e-06, "loss": 0.9503, "step": 2050 }, { "epoch": 0.4, "grad_norm": 0.1375461995901152, "learning_rate": 2.2613729904630256e-06, "loss": 0.9081, "step": 2055 }, { "epoch": 0.4, "grad_norm": 0.12670725904405272, "learning_rate": 2.257005657183752e-06, "loss": 0.9642, "step": 2060 }, { "epoch": 0.4, "grad_norm": 0.11945786027175435, "learning_rate": 2.2526296966549072e-06, "loss": 0.9197, "step": 2065 }, { "epoch": 0.4, "grad_norm": 0.14286568243399034, "learning_rate": 2.2482451587473258e-06, "loss": 0.9399, "step": 2070 }, { "epoch": 0.4, "grad_norm": 0.13710025446972535, "learning_rate": 2.2438520934295943e-06, "loss": 0.9213, "step": 2075 }, { "epoch": 0.4, "grad_norm": 0.13298225052401855, "learning_rate": 2.2394505507674825e-06, "loss": 0.9547, "step": 2080 }, { "epoch": 0.4, "grad_norm": 0.1314181279581931, "learning_rate": 2.2350405809233722e-06, "loss": 0.9401, "step": 2085 }, { "epoch": 0.4, "grad_norm": 0.1350139369080771, "learning_rate": 2.2306222341556866e-06, "loss": 0.9255, "step": 2090 }, { "epoch": 0.41, "grad_norm": 0.125979705316961, "learning_rate": 2.226195560818317e-06, "loss": 0.9196, "step": 2095 }, { "epoch": 0.41, "grad_norm": 0.13645001654584013, "learning_rate": 2.221760611360048e-06, "loss": 0.9383, "step": 2100 }, { "epoch": 0.41, "grad_norm": 0.13497646785844908, "learning_rate": 2.217317436323983e-06, "loss": 0.9438, "step": 2105 }, { "epoch": 0.41, "grad_norm": 0.13373081145156018, "learning_rate": 2.212866086346971e-06, "loss": 0.9498, "step": 2110 }, { "epoch": 0.41, "grad_norm": 0.15418672754446455, "learning_rate": 2.2084066121590242e-06, "loss": 0.9542, "step": 2115 }, { "epoch": 0.41, "grad_norm": 0.13946090813340417, "learning_rate": 2.2039390645827443e-06, "loss": 0.9182, "step": 2120 }, { "epoch": 0.41, "grad_norm": 0.13403421008347952, "learning_rate": 2.1994634945327416e-06, "loss": 0.9411, "step": 2125 }, { "epoch": 0.41, "grad_norm": 0.14217560114276748, "learning_rate": 2.1949799530150545e-06, "loss": 0.9449, "step": 2130 }, { "epoch": 0.41, "grad_norm": 0.13116778692015293, "learning_rate": 2.1904884911265695e-06, "loss": 0.9236, "step": 2135 }, { "epoch": 0.41, "grad_norm": 0.14745228268417065, "learning_rate": 2.185989160054436e-06, "loss": 0.9564, "step": 2140 }, { "epoch": 0.41, "grad_norm": 0.1321060448025065, "learning_rate": 2.1814820110754874e-06, "loss": 0.9392, "step": 2145 }, { "epoch": 0.42, "grad_norm": 0.12064961504005225, "learning_rate": 2.1769670955556526e-06, "loss": 0.9381, "step": 2150 }, { "epoch": 0.42, "grad_norm": 0.19148426920556538, "learning_rate": 2.1724444649493733e-06, "loss": 0.9465, "step": 2155 }, { "epoch": 0.42, "grad_norm": 0.14149357097999177, "learning_rate": 2.167914170799014e-06, "loss": 0.9536, "step": 2160 }, { "epoch": 0.42, "grad_norm": 0.13696368177795465, "learning_rate": 2.163376264734281e-06, "loss": 0.9426, "step": 2165 }, { "epoch": 0.42, "grad_norm": 0.132327168385185, "learning_rate": 2.1588307984716276e-06, "loss": 0.9415, "step": 2170 }, { "epoch": 0.42, "grad_norm": 0.1226548149068113, "learning_rate": 2.154277823813668e-06, "loss": 0.9126, "step": 2175 }, { "epoch": 0.42, "grad_norm": 0.13412829227143383, "learning_rate": 2.1497173926485853e-06, "loss": 0.9263, "step": 2180 }, { "epoch": 0.42, "grad_norm": 0.1370416338042778, "learning_rate": 2.145149556949542e-06, "loss": 0.9222, "step": 2185 }, { "epoch": 0.42, "grad_norm": 0.1348834947967263, "learning_rate": 2.1405743687740865e-06, "loss": 0.9143, "step": 2190 }, { "epoch": 0.42, "grad_norm": 0.14200964631669566, "learning_rate": 2.13599188026356e-06, "loss": 0.8973, "step": 2195 }, { "epoch": 0.43, "grad_norm": 0.14200012930084902, "learning_rate": 2.1314021436425027e-06, "loss": 0.9438, "step": 2200 }, { "epoch": 0.43, "grad_norm": 0.14116621082081662, "learning_rate": 2.126805211218057e-06, "loss": 0.9604, "step": 2205 }, { "epoch": 0.43, "grad_norm": 0.1262029892604575, "learning_rate": 2.1222011353793735e-06, "loss": 0.9436, "step": 2210 }, { "epoch": 0.43, "grad_norm": 0.13476560349631844, "learning_rate": 2.1175899685970133e-06, "loss": 0.958, "step": 2215 }, { "epoch": 0.43, "grad_norm": 0.13243928557585383, "learning_rate": 2.112971763422349e-06, "loss": 0.9356, "step": 2220 }, { "epoch": 0.43, "grad_norm": 0.14102330703687754, "learning_rate": 2.1083465724869675e-06, "loss": 0.9183, "step": 2225 }, { "epoch": 0.43, "grad_norm": 0.13195228822616697, "learning_rate": 2.1037144485020684e-06, "loss": 0.9225, "step": 2230 }, { "epoch": 0.43, "grad_norm": 0.14110146966641385, "learning_rate": 2.0990754442578637e-06, "loss": 0.9396, "step": 2235 }, { "epoch": 0.43, "grad_norm": 0.1281526405819837, "learning_rate": 2.0944296126229784e-06, "loss": 0.9115, "step": 2240 }, { "epoch": 0.43, "grad_norm": 0.1309793127327286, "learning_rate": 2.0897770065438444e-06, "loss": 0.9408, "step": 2245 }, { "epoch": 0.44, "grad_norm": 0.13497364113187624, "learning_rate": 2.0851176790440995e-06, "loss": 0.8897, "step": 2250 }, { "epoch": 0.44, "grad_norm": 0.1306605766376586, "learning_rate": 2.080451683223983e-06, "loss": 0.9038, "step": 2255 }, { "epoch": 0.44, "grad_norm": 0.1355543202117501, "learning_rate": 2.075779072259729e-06, "loss": 0.9391, "step": 2260 }, { "epoch": 0.44, "grad_norm": 0.12860928764170376, "learning_rate": 2.0710998994029625e-06, "loss": 0.9426, "step": 2265 }, { "epoch": 0.44, "grad_norm": 0.144696140607215, "learning_rate": 2.0664142179800904e-06, "loss": 0.9302, "step": 2270 }, { "epoch": 0.44, "grad_norm": 0.1417303172042183, "learning_rate": 2.061722081391695e-06, "loss": 0.9168, "step": 2275 }, { "epoch": 0.44, "grad_norm": 0.1413230349204647, "learning_rate": 2.057023543111926e-06, "loss": 0.936, "step": 2280 }, { "epoch": 0.44, "grad_norm": 0.14177967145771603, "learning_rate": 2.052318656687889e-06, "loss": 0.9258, "step": 2285 }, { "epoch": 0.44, "grad_norm": 0.13957587807590546, "learning_rate": 2.0476074757390377e-06, "loss": 0.9244, "step": 2290 }, { "epoch": 0.44, "grad_norm": 0.14172401928885273, "learning_rate": 2.042890053956561e-06, "loss": 0.9325, "step": 2295 }, { "epoch": 0.44, "grad_norm": 0.13902009470334928, "learning_rate": 2.0381664451027717e-06, "loss": 0.9226, "step": 2300 }, { "epoch": 0.45, "grad_norm": 0.1342038780728634, "learning_rate": 2.0334367030104936e-06, "loss": 0.9549, "step": 2305 }, { "epoch": 0.45, "grad_norm": 0.1355958677734405, "learning_rate": 2.0287008815824494e-06, "loss": 0.924, "step": 2310 }, { "epoch": 0.45, "grad_norm": 0.13040433649479655, "learning_rate": 2.023959034790644e-06, "loss": 0.94, "step": 2315 }, { "epoch": 0.45, "grad_norm": 0.13624869289738803, "learning_rate": 2.019211216675751e-06, "loss": 0.9112, "step": 2320 }, { "epoch": 0.45, "grad_norm": 0.14498884964787695, "learning_rate": 2.0144574813464972e-06, "loss": 0.9188, "step": 2325 }, { "epoch": 0.45, "grad_norm": 0.14355069485318295, "learning_rate": 2.009697882979044e-06, "loss": 0.9434, "step": 2330 }, { "epoch": 0.45, "grad_norm": 0.13722263129525908, "learning_rate": 2.0049324758163714e-06, "loss": 0.9304, "step": 2335 }, { "epoch": 0.45, "grad_norm": 0.13150966633376412, "learning_rate": 2.000161314167661e-06, "loss": 0.9359, "step": 2340 }, { "epoch": 0.45, "grad_norm": 0.13841892885164878, "learning_rate": 1.995384452407673e-06, "loss": 0.9394, "step": 2345 }, { "epoch": 0.45, "grad_norm": 0.13321843330058455, "learning_rate": 1.990601944976133e-06, "loss": 0.9711, "step": 2350 }, { "epoch": 0.46, "grad_norm": 0.15061604889591293, "learning_rate": 1.985813846377103e-06, "loss": 0.9272, "step": 2355 }, { "epoch": 0.46, "grad_norm": 0.1361929007088866, "learning_rate": 1.9810202111783694e-06, "loss": 0.9525, "step": 2360 }, { "epoch": 0.46, "grad_norm": 0.13830562465905946, "learning_rate": 1.976221094010814e-06, "loss": 0.9283, "step": 2365 }, { "epoch": 0.46, "grad_norm": 0.1279081357272712, "learning_rate": 1.9714165495677955e-06, "loss": 0.9431, "step": 2370 }, { "epoch": 0.46, "grad_norm": 0.14809572862407092, "learning_rate": 1.9666066326045235e-06, "loss": 0.9341, "step": 2375 }, { "epoch": 0.46, "grad_norm": 0.13408718583428794, "learning_rate": 1.961791397937437e-06, "loss": 0.9423, "step": 2380 }, { "epoch": 0.46, "grad_norm": 0.13271952301063553, "learning_rate": 1.9569709004435776e-06, "loss": 0.9167, "step": 2385 }, { "epoch": 0.46, "grad_norm": 0.14683108447360405, "learning_rate": 1.9521451950599658e-06, "loss": 0.929, "step": 2390 }, { "epoch": 0.46, "grad_norm": 0.14319313513188295, "learning_rate": 1.947314336782973e-06, "loss": 0.9152, "step": 2395 }, { "epoch": 0.46, "grad_norm": 0.14548075342960598, "learning_rate": 1.942478380667697e-06, "loss": 0.9561, "step": 2400 }, { "epoch": 0.47, "grad_norm": 0.14674027808693163, "learning_rate": 1.937637381827332e-06, "loss": 0.9176, "step": 2405 }, { "epoch": 0.47, "grad_norm": 0.13901704473297072, "learning_rate": 1.932791395432543e-06, "loss": 0.943, "step": 2410 }, { "epoch": 0.47, "grad_norm": 0.14173457335906417, "learning_rate": 1.927940476710836e-06, "loss": 0.974, "step": 2415 }, { "epoch": 0.47, "grad_norm": 0.14520506983186532, "learning_rate": 1.9230846809459268e-06, "loss": 0.9347, "step": 2420 }, { "epoch": 0.47, "grad_norm": 0.14661866708240862, "learning_rate": 1.918224063477114e-06, "loss": 0.9229, "step": 2425 }, { "epoch": 0.47, "grad_norm": 0.13549663100208073, "learning_rate": 1.9133586796986475e-06, "loss": 0.9021, "step": 2430 }, { "epoch": 0.47, "grad_norm": 0.13619817098434184, "learning_rate": 1.9084885850590945e-06, "loss": 0.9563, "step": 2435 }, { "epoch": 0.47, "grad_norm": 0.14561195607002267, "learning_rate": 1.9036138350607125e-06, "loss": 0.9473, "step": 2440 }, { "epoch": 0.47, "grad_norm": 0.13462906219833434, "learning_rate": 1.8987344852588126e-06, "loss": 0.9247, "step": 2445 }, { "epoch": 0.47, "grad_norm": 0.15242666999590032, "learning_rate": 1.893850591261127e-06, "loss": 0.9364, "step": 2450 }, { "epoch": 0.47, "grad_norm": 0.13889983715691157, "learning_rate": 1.8889622087271771e-06, "loss": 0.9413, "step": 2455 }, { "epoch": 0.48, "grad_norm": 0.14135800831918405, "learning_rate": 1.8840693933676378e-06, "loss": 0.9207, "step": 2460 }, { "epoch": 0.48, "grad_norm": 0.1372780862087748, "learning_rate": 1.879172200943704e-06, "loss": 0.9331, "step": 2465 }, { "epoch": 0.48, "grad_norm": 0.15017799031764617, "learning_rate": 1.8742706872664516e-06, "loss": 0.9336, "step": 2470 }, { "epoch": 0.48, "grad_norm": 0.13301418405514617, "learning_rate": 1.8693649081962059e-06, "loss": 0.9575, "step": 2475 }, { "epoch": 0.48, "grad_norm": 0.13920099695451857, "learning_rate": 1.864454919641902e-06, "loss": 0.9452, "step": 2480 }, { "epoch": 0.48, "grad_norm": 0.13582823833343818, "learning_rate": 1.8595407775604495e-06, "loss": 0.914, "step": 2485 }, { "epoch": 0.48, "grad_norm": 0.14431368387268362, "learning_rate": 1.8546225379560928e-06, "loss": 0.9199, "step": 2490 }, { "epoch": 0.48, "grad_norm": 0.14026316815195494, "learning_rate": 1.8497002568797739e-06, "loss": 0.9411, "step": 2495 }, { "epoch": 0.48, "grad_norm": 0.13878672097268965, "learning_rate": 1.844773990428495e-06, "loss": 0.9208, "step": 2500 }, { "epoch": 0.48, "grad_norm": 0.13541879797436218, "learning_rate": 1.839843794744676e-06, "loss": 0.9554, "step": 2505 }, { "epoch": 0.49, "grad_norm": 0.14283432429319542, "learning_rate": 1.8349097260155178e-06, "loss": 0.941, "step": 2510 }, { "epoch": 0.49, "grad_norm": 0.14515720243880362, "learning_rate": 1.8299718404723604e-06, "loss": 0.9102, "step": 2515 }, { "epoch": 0.49, "grad_norm": 0.14050528252451772, "learning_rate": 1.8250301943900415e-06, "loss": 0.9124, "step": 2520 }, { "epoch": 0.49, "grad_norm": 0.1435086593604132, "learning_rate": 1.8200848440862568e-06, "loss": 0.9384, "step": 2525 }, { "epoch": 0.49, "grad_norm": 0.14004434383038292, "learning_rate": 1.8151358459209168e-06, "loss": 0.9256, "step": 2530 }, { "epoch": 0.49, "grad_norm": 0.1504347210308783, "learning_rate": 1.810183256295506e-06, "loss": 0.9181, "step": 2535 }, { "epoch": 0.49, "grad_norm": 0.13844797279531637, "learning_rate": 1.805227131652438e-06, "loss": 0.9286, "step": 2540 }, { "epoch": 0.49, "grad_norm": 0.13788813272277844, "learning_rate": 1.800267528474414e-06, "loss": 0.9098, "step": 2545 }, { "epoch": 0.49, "grad_norm": 0.14105382541615677, "learning_rate": 1.7953045032837773e-06, "loss": 0.9289, "step": 2550 }, { "epoch": 0.49, "grad_norm": 0.1376343316669543, "learning_rate": 1.7903381126418725e-06, "loss": 0.9147, "step": 2555 }, { "epoch": 0.5, "grad_norm": 0.1393057886246714, "learning_rate": 1.7853684131483972e-06, "loss": 0.9583, "step": 2560 }, { "epoch": 0.5, "grad_norm": 0.1347402653981868, "learning_rate": 1.7803954614407588e-06, "loss": 0.956, "step": 2565 }, { "epoch": 0.5, "grad_norm": 0.1420224811227815, "learning_rate": 1.7754193141934286e-06, "loss": 0.9288, "step": 2570 }, { "epoch": 0.5, "grad_norm": 0.14376583147030975, "learning_rate": 1.7704400281172962e-06, "loss": 0.9195, "step": 2575 }, { "epoch": 0.5, "grad_norm": 0.13126659878484417, "learning_rate": 1.7654576599590229e-06, "loss": 0.9468, "step": 2580 }, { "epoch": 0.5, "grad_norm": 0.13581121757423928, "learning_rate": 1.7604722665003958e-06, "loss": 0.906, "step": 2585 }, { "epoch": 0.5, "grad_norm": 0.15390158532500306, "learning_rate": 1.7554839045576778e-06, "loss": 0.9699, "step": 2590 }, { "epoch": 0.5, "grad_norm": 0.14405781787739771, "learning_rate": 1.7504926309809655e-06, "loss": 0.9174, "step": 2595 }, { "epoch": 0.5, "grad_norm": 0.13696698824707879, "learning_rate": 1.7454985026535348e-06, "loss": 0.9178, "step": 2600 }, { "epoch": 0.5, "grad_norm": 0.14183018142151826, "learning_rate": 1.7405015764911985e-06, "loss": 0.93, "step": 2605 }, { "epoch": 0.5, "grad_norm": 0.14244315668176377, "learning_rate": 1.735501909441654e-06, "loss": 0.9081, "step": 2610 }, { "epoch": 0.51, "grad_norm": 0.13715525306632836, "learning_rate": 1.7304995584838346e-06, "loss": 0.9453, "step": 2615 }, { "epoch": 0.51, "grad_norm": 0.14544225825185408, "learning_rate": 1.7254945806272619e-06, "loss": 0.9377, "step": 2620 }, { "epoch": 0.51, "grad_norm": 0.12862157823453402, "learning_rate": 1.7204870329113952e-06, "loss": 0.9111, "step": 2625 }, { "epoch": 0.51, "grad_norm": 0.14387593948992988, "learning_rate": 1.7154769724049805e-06, "loss": 0.9179, "step": 2630 }, { "epoch": 0.51, "grad_norm": 0.14310554120599442, "learning_rate": 1.7104644562054017e-06, "loss": 0.9264, "step": 2635 }, { "epoch": 0.51, "grad_norm": 0.14369268756275277, "learning_rate": 1.705449541438028e-06, "loss": 0.9179, "step": 2640 }, { "epoch": 0.51, "grad_norm": 0.14156019346421533, "learning_rate": 1.7004322852555657e-06, "loss": 0.9411, "step": 2645 }, { "epoch": 0.51, "grad_norm": 0.13431638177331276, "learning_rate": 1.6954127448374036e-06, "loss": 0.9211, "step": 2650 }, { "epoch": 0.51, "grad_norm": 0.14619505394748813, "learning_rate": 1.6903909773889638e-06, "loss": 0.9272, "step": 2655 }, { "epoch": 0.51, "grad_norm": 0.14836272472317252, "learning_rate": 1.6853670401410484e-06, "loss": 0.9343, "step": 2660 }, { "epoch": 0.52, "grad_norm": 0.1373985024055969, "learning_rate": 1.6803409903491877e-06, "loss": 0.9318, "step": 2665 }, { "epoch": 0.52, "grad_norm": 0.15326454301403541, "learning_rate": 1.6753128852929884e-06, "loss": 0.9578, "step": 2670 }, { "epoch": 0.52, "grad_norm": 0.14559815110391214, "learning_rate": 1.6702827822754788e-06, "loss": 0.9272, "step": 2675 }, { "epoch": 0.52, "grad_norm": 0.13744648077417837, "learning_rate": 1.6652507386224587e-06, "loss": 0.8995, "step": 2680 }, { "epoch": 0.52, "grad_norm": 0.14647628387598488, "learning_rate": 1.6602168116818428e-06, "loss": 0.9162, "step": 2685 }, { "epoch": 0.52, "grad_norm": 0.14404142802195286, "learning_rate": 1.65518105882301e-06, "loss": 0.9242, "step": 2690 }, { "epoch": 0.52, "grad_norm": 0.15243037059220865, "learning_rate": 1.6501435374361478e-06, "loss": 0.93, "step": 2695 }, { "epoch": 0.52, "grad_norm": 0.14267865323341203, "learning_rate": 1.6451043049315989e-06, "loss": 0.9137, "step": 2700 }, { "epoch": 0.52, "grad_norm": 0.1481814130813317, "learning_rate": 1.6400634187392068e-06, "loss": 0.9295, "step": 2705 }, { "epoch": 0.52, "grad_norm": 0.14103202841048518, "learning_rate": 1.635020936307662e-06, "loss": 0.9286, "step": 2710 }, { "epoch": 0.53, "grad_norm": 0.14180132297439638, "learning_rate": 1.629976915103845e-06, "loss": 0.9472, "step": 2715 }, { "epoch": 0.53, "grad_norm": 0.14440535406295116, "learning_rate": 1.6249314126121743e-06, "loss": 0.916, "step": 2720 }, { "epoch": 0.53, "grad_norm": 0.14297238889743974, "learning_rate": 1.61988448633395e-06, "loss": 0.9428, "step": 2725 }, { "epoch": 0.53, "grad_norm": 0.14278206678104752, "learning_rate": 1.614836193786698e-06, "loss": 0.9388, "step": 2730 }, { "epoch": 0.53, "grad_norm": 0.14708980236362657, "learning_rate": 1.6097865925035148e-06, "loss": 0.9263, "step": 2735 }, { "epoch": 0.53, "grad_norm": 0.1493782348617741, "learning_rate": 1.6047357400324125e-06, "loss": 0.9453, "step": 2740 }, { "epoch": 0.53, "grad_norm": 0.14792348168682745, "learning_rate": 1.599683693935662e-06, "loss": 0.9471, "step": 2745 }, { "epoch": 0.53, "grad_norm": 0.14644504231264188, "learning_rate": 1.5946305117891372e-06, "loss": 0.9543, "step": 2750 }, { "epoch": 0.53, "grad_norm": 0.15603436515208155, "learning_rate": 1.5895762511816603e-06, "loss": 0.9403, "step": 2755 }, { "epoch": 0.53, "grad_norm": 0.1492183413320477, "learning_rate": 1.5845209697143427e-06, "loss": 0.9347, "step": 2760 }, { "epoch": 0.53, "grad_norm": 0.14310505430645265, "learning_rate": 1.5794647249999302e-06, "loss": 0.9284, "step": 2765 }, { "epoch": 0.54, "grad_norm": 0.15219696170127922, "learning_rate": 1.5744075746621477e-06, "loss": 0.9446, "step": 2770 }, { "epoch": 0.54, "grad_norm": 0.14278980302720323, "learning_rate": 1.5693495763350399e-06, "loss": 0.942, "step": 2775 }, { "epoch": 0.54, "grad_norm": 0.15485157792551277, "learning_rate": 1.5642907876623155e-06, "loss": 0.9495, "step": 2780 }, { "epoch": 0.54, "grad_norm": 0.14415653748935103, "learning_rate": 1.5592312662966912e-06, "loss": 0.95, "step": 2785 }, { "epoch": 0.54, "grad_norm": 0.14626040552803168, "learning_rate": 1.5541710698992333e-06, "loss": 0.9272, "step": 2790 }, { "epoch": 0.54, "grad_norm": 0.14459798185856082, "learning_rate": 1.5491102561387017e-06, "loss": 0.9287, "step": 2795 }, { "epoch": 0.54, "grad_norm": 0.14606201408180883, "learning_rate": 1.5440488826908916e-06, "loss": 0.9093, "step": 2800 }, { "epoch": 0.54, "grad_norm": 0.15371875458355483, "learning_rate": 1.5389870072379764e-06, "loss": 0.9365, "step": 2805 }, { "epoch": 0.54, "grad_norm": 0.1402365522072789, "learning_rate": 1.5339246874678514e-06, "loss": 0.9179, "step": 2810 }, { "epoch": 0.54, "grad_norm": 0.1543389255747757, "learning_rate": 1.528861981073475e-06, "loss": 0.9223, "step": 2815 }, { "epoch": 0.55, "grad_norm": 0.14977412118551237, "learning_rate": 1.523798945752212e-06, "loss": 0.9246, "step": 2820 }, { "epoch": 0.55, "grad_norm": 0.15214617356605256, "learning_rate": 1.5187356392051763e-06, "loss": 0.9199, "step": 2825 }, { "epoch": 0.55, "grad_norm": 0.14138993564510816, "learning_rate": 1.5136721191365722e-06, "loss": 0.9678, "step": 2830 }, { "epoch": 0.55, "grad_norm": 0.14972625540064466, "learning_rate": 1.5086084432530372e-06, "loss": 0.9371, "step": 2835 }, { "epoch": 0.55, "grad_norm": 0.15911745951707099, "learning_rate": 1.5035446692629851e-06, "loss": 0.9264, "step": 2840 }, { "epoch": 0.55, "grad_norm": 0.15920032142731483, "learning_rate": 1.498480854875948e-06, "loss": 0.9483, "step": 2845 }, { "epoch": 0.55, "grad_norm": 0.14963882238441822, "learning_rate": 1.4934170578019175e-06, "loss": 0.9339, "step": 2850 }, { "epoch": 0.55, "grad_norm": 0.15286707979378059, "learning_rate": 1.488353335750689e-06, "loss": 0.9406, "step": 2855 }, { "epoch": 0.55, "grad_norm": 0.1521031280190717, "learning_rate": 1.483289746431202e-06, "loss": 0.9127, "step": 2860 }, { "epoch": 0.55, "grad_norm": 0.15222047385687712, "learning_rate": 1.4782263475508832e-06, "loss": 0.9315, "step": 2865 }, { "epoch": 0.56, "grad_norm": 0.14731189974755135, "learning_rate": 1.4731631968149895e-06, "loss": 0.904, "step": 2870 }, { "epoch": 0.56, "grad_norm": 0.13976065735361923, "learning_rate": 1.4681003519259502e-06, "loss": 0.9117, "step": 2875 }, { "epoch": 0.56, "grad_norm": 0.14696780830304437, "learning_rate": 1.463037870582708e-06, "loss": 0.9206, "step": 2880 }, { "epoch": 0.56, "grad_norm": 0.14710865075058713, "learning_rate": 1.457975810480063e-06, "loss": 0.9188, "step": 2885 }, { "epoch": 0.56, "grad_norm": 0.14110608951947717, "learning_rate": 1.4529142293080148e-06, "loss": 0.9563, "step": 2890 }, { "epoch": 0.56, "grad_norm": 0.15100956168840318, "learning_rate": 1.447853184751104e-06, "loss": 0.9117, "step": 2895 }, { "epoch": 0.56, "grad_norm": 0.14173007606413557, "learning_rate": 1.4427927344877572e-06, "loss": 0.9197, "step": 2900 }, { "epoch": 0.56, "grad_norm": 0.15488805146642348, "learning_rate": 1.437732936189626e-06, "loss": 0.9286, "step": 2905 }, { "epoch": 0.56, "grad_norm": 0.1541544010687315, "learning_rate": 1.4326738475209337e-06, "loss": 0.9599, "step": 2910 }, { "epoch": 0.56, "grad_norm": 0.14095218223371167, "learning_rate": 1.427615526137815e-06, "loss": 0.8989, "step": 2915 }, { "epoch": 0.56, "grad_norm": 0.15293667596322041, "learning_rate": 1.4225580296876608e-06, "loss": 0.9447, "step": 2920 }, { "epoch": 0.57, "grad_norm": 0.14652744921172597, "learning_rate": 1.417501415808461e-06, "loss": 0.9217, "step": 2925 }, { "epoch": 0.57, "grad_norm": 0.1504944288827222, "learning_rate": 1.4124457421281463e-06, "loss": 0.9673, "step": 2930 }, { "epoch": 0.57, "grad_norm": 0.15585236316583084, "learning_rate": 1.4073910662639332e-06, "loss": 0.9065, "step": 2935 }, { "epoch": 0.57, "grad_norm": 0.14742315841540685, "learning_rate": 1.402337445821666e-06, "loss": 0.9411, "step": 2940 }, { "epoch": 0.57, "grad_norm": 0.15430455119028572, "learning_rate": 1.3972849383951611e-06, "loss": 0.9355, "step": 2945 }, { "epoch": 0.57, "grad_norm": 0.1555666818366108, "learning_rate": 1.3922336015655506e-06, "loss": 0.9167, "step": 2950 }, { "epoch": 0.57, "grad_norm": 0.14962961591804877, "learning_rate": 1.3871834929006256e-06, "loss": 0.941, "step": 2955 }, { "epoch": 0.57, "grad_norm": 0.15455497529184967, "learning_rate": 1.3821346699541796e-06, "loss": 0.9192, "step": 2960 }, { "epoch": 0.57, "grad_norm": 0.14982121619688704, "learning_rate": 1.3770871902653545e-06, "loss": 0.9248, "step": 2965 }, { "epoch": 0.57, "grad_norm": 0.14737972139745104, "learning_rate": 1.3720411113579831e-06, "loss": 0.9282, "step": 2970 }, { "epoch": 0.58, "grad_norm": 0.1435094517528824, "learning_rate": 1.3669964907399345e-06, "loss": 0.931, "step": 2975 }, { "epoch": 0.58, "grad_norm": 0.15220182643955849, "learning_rate": 1.361953385902458e-06, "loss": 0.9452, "step": 2980 }, { "epoch": 0.58, "grad_norm": 0.1565960862677695, "learning_rate": 1.3569118543195285e-06, "loss": 0.9265, "step": 2985 }, { "epoch": 0.58, "grad_norm": 0.1468387324121908, "learning_rate": 1.3518719534471912e-06, "loss": 0.929, "step": 2990 }, { "epoch": 0.58, "grad_norm": 0.1603995121000358, "learning_rate": 1.3468337407229064e-06, "loss": 0.9226, "step": 2995 }, { "epoch": 0.58, "grad_norm": 0.1571110585893993, "learning_rate": 1.341797273564896e-06, "loss": 0.929, "step": 3000 }, { "epoch": 0.58, "grad_norm": 0.14317070475253646, "learning_rate": 1.3367626093714884e-06, "loss": 0.9173, "step": 3005 }, { "epoch": 0.58, "grad_norm": 0.14419206921916547, "learning_rate": 1.3317298055204635e-06, "loss": 0.9381, "step": 3010 }, { "epoch": 0.58, "grad_norm": 0.1479165442891011, "learning_rate": 1.3266989193684006e-06, "loss": 0.9229, "step": 3015 }, { "epoch": 0.58, "grad_norm": 0.1509048030581506, "learning_rate": 1.3216700082500238e-06, "loss": 0.9346, "step": 3020 }, { "epoch": 0.59, "grad_norm": 0.1524427495079411, "learning_rate": 1.3166431294775486e-06, "loss": 0.9263, "step": 3025 }, { "epoch": 0.59, "grad_norm": 0.1363749471323768, "learning_rate": 1.3116183403400286e-06, "loss": 0.9233, "step": 3030 }, { "epoch": 0.59, "grad_norm": 0.15571108260671188, "learning_rate": 1.3065956981027027e-06, "loss": 0.9254, "step": 3035 }, { "epoch": 0.59, "grad_norm": 0.14738034590459953, "learning_rate": 1.3015752600063428e-06, "loss": 0.9356, "step": 3040 }, { "epoch": 0.59, "grad_norm": 0.15214039883958294, "learning_rate": 1.2965570832666014e-06, "loss": 0.9479, "step": 3045 }, { "epoch": 0.59, "grad_norm": 0.1367470768499345, "learning_rate": 1.2915412250733592e-06, "loss": 0.9328, "step": 3050 }, { "epoch": 0.59, "grad_norm": 0.14302733162614767, "learning_rate": 1.2865277425900725e-06, "loss": 0.9212, "step": 3055 }, { "epoch": 0.59, "grad_norm": 0.14429292224162268, "learning_rate": 1.2815166929531242e-06, "loss": 0.9071, "step": 3060 }, { "epoch": 0.59, "grad_norm": 0.15632981550136582, "learning_rate": 1.2765081332711703e-06, "loss": 0.9196, "step": 3065 }, { "epoch": 0.59, "grad_norm": 0.16001339885276877, "learning_rate": 1.2715021206244902e-06, "loss": 0.9241, "step": 3070 }, { "epoch": 0.59, "grad_norm": 0.15777057516222137, "learning_rate": 1.266498712064336e-06, "loss": 0.9261, "step": 3075 }, { "epoch": 0.6, "grad_norm": 0.15137232676250217, "learning_rate": 1.2614979646122817e-06, "loss": 0.9437, "step": 3080 }, { "epoch": 0.6, "grad_norm": 0.1509908233769637, "learning_rate": 1.2564999352595746e-06, "loss": 0.9022, "step": 3085 }, { "epoch": 0.6, "grad_norm": 0.1446990823675756, "learning_rate": 1.2515046809664841e-06, "loss": 0.9324, "step": 3090 }, { "epoch": 0.6, "grad_norm": 0.14529842278696345, "learning_rate": 1.2465122586616548e-06, "loss": 0.9186, "step": 3095 }, { "epoch": 0.6, "grad_norm": 0.14890041058005424, "learning_rate": 1.2415227252414555e-06, "loss": 0.8839, "step": 3100 }, { "epoch": 0.6, "grad_norm": 0.15401076762014934, "learning_rate": 1.2365361375693311e-06, "loss": 0.9526, "step": 3105 }, { "epoch": 0.6, "grad_norm": 0.147809786608213, "learning_rate": 1.2315525524751565e-06, "loss": 0.9561, "step": 3110 }, { "epoch": 0.6, "grad_norm": 0.13972074920029123, "learning_rate": 1.226572026754587e-06, "loss": 0.906, "step": 3115 }, { "epoch": 0.6, "grad_norm": 0.1463242760428321, "learning_rate": 1.2215946171684115e-06, "loss": 0.9261, "step": 3120 }, { "epoch": 0.6, "grad_norm": 0.15629569558792494, "learning_rate": 1.216620380441906e-06, "loss": 0.9301, "step": 3125 }, { "epoch": 0.61, "grad_norm": 0.15002654546854, "learning_rate": 1.2116493732641862e-06, "loss": 0.9271, "step": 3130 }, { "epoch": 0.61, "grad_norm": 0.154693913708028, "learning_rate": 1.2066816522875634e-06, "loss": 0.9603, "step": 3135 }, { "epoch": 0.61, "grad_norm": 0.15302292304097528, "learning_rate": 1.2017172741268962e-06, "loss": 0.9562, "step": 3140 }, { "epoch": 0.61, "grad_norm": 0.15744416055939195, "learning_rate": 1.1967562953589479e-06, "loss": 0.9249, "step": 3145 }, { "epoch": 0.61, "grad_norm": 0.15274452723176532, "learning_rate": 1.1917987725217386e-06, "loss": 0.9098, "step": 3150 }, { "epoch": 0.61, "grad_norm": 0.14903013683191468, "learning_rate": 1.1868447621139045e-06, "loss": 0.9341, "step": 3155 }, { "epoch": 0.61, "grad_norm": 0.14670691399106886, "learning_rate": 1.181894320594052e-06, "loss": 0.9349, "step": 3160 }, { "epoch": 0.61, "grad_norm": 0.1408976877727933, "learning_rate": 1.1769475043801133e-06, "loss": 0.9112, "step": 3165 }, { "epoch": 0.61, "grad_norm": 0.14431525425160555, "learning_rate": 1.1720043698487063e-06, "loss": 0.9384, "step": 3170 }, { "epoch": 0.61, "grad_norm": 0.1524691711633023, "learning_rate": 1.167064973334489e-06, "loss": 0.9309, "step": 3175 }, { "epoch": 0.62, "grad_norm": 0.1496938194214256, "learning_rate": 1.16212937112952e-06, "loss": 0.9498, "step": 3180 }, { "epoch": 0.62, "grad_norm": 0.158618142809653, "learning_rate": 1.157197619482615e-06, "loss": 0.9136, "step": 3185 }, { "epoch": 0.62, "grad_norm": 0.14386034744502232, "learning_rate": 1.1522697745987075e-06, "loss": 0.9168, "step": 3190 }, { "epoch": 0.62, "grad_norm": 0.13819775275169346, "learning_rate": 1.147345892638207e-06, "loss": 0.9169, "step": 3195 }, { "epoch": 0.62, "grad_norm": 0.1463761110381215, "learning_rate": 1.1424260297163588e-06, "loss": 0.9229, "step": 3200 }, { "epoch": 0.62, "grad_norm": 0.15981042409351381, "learning_rate": 1.1375102419026054e-06, "loss": 0.9111, "step": 3205 }, { "epoch": 0.62, "grad_norm": 0.15444315539373016, "learning_rate": 1.132598585219948e-06, "loss": 0.9368, "step": 3210 }, { "epoch": 0.62, "grad_norm": 0.1523026328416241, "learning_rate": 1.1276911156443059e-06, "loss": 0.9424, "step": 3215 }, { "epoch": 0.62, "grad_norm": 0.1420446227672182, "learning_rate": 1.122787889103881e-06, "loss": 0.9238, "step": 3220 }, { "epoch": 0.62, "grad_norm": 0.1523202126073691, "learning_rate": 1.117888961478518e-06, "loss": 0.9231, "step": 3225 }, { "epoch": 0.62, "grad_norm": 0.14814643719132345, "learning_rate": 1.1129943885990697e-06, "loss": 0.9214, "step": 3230 }, { "epoch": 0.63, "grad_norm": 0.14772444209447835, "learning_rate": 1.10810422624676e-06, "loss": 0.9151, "step": 3235 }, { "epoch": 0.63, "grad_norm": 0.14630970085839903, "learning_rate": 1.103218530152548e-06, "loss": 0.8945, "step": 3240 }, { "epoch": 0.63, "grad_norm": 0.15164841898736303, "learning_rate": 1.098337355996491e-06, "loss": 0.9372, "step": 3245 }, { "epoch": 0.63, "grad_norm": 0.1544725525584187, "learning_rate": 1.0934607594071146e-06, "loss": 0.9416, "step": 3250 }, { "epoch": 0.63, "grad_norm": 0.14788884792842535, "learning_rate": 1.0885887959607744e-06, "loss": 0.9274, "step": 3255 }, { "epoch": 0.63, "grad_norm": 0.1574408320339308, "learning_rate": 1.0837215211810242e-06, "loss": 0.929, "step": 3260 }, { "epoch": 0.63, "grad_norm": 0.1613973917431587, "learning_rate": 1.078858990537984e-06, "loss": 0.8949, "step": 3265 }, { "epoch": 0.63, "grad_norm": 0.1598506012172115, "learning_rate": 1.074001259447706e-06, "loss": 0.9188, "step": 3270 }, { "epoch": 0.63, "grad_norm": 0.1588121231918017, "learning_rate": 1.0691483832715451e-06, "loss": 0.9245, "step": 3275 }, { "epoch": 0.63, "grad_norm": 0.1649194556317263, "learning_rate": 1.0643004173155262e-06, "loss": 0.9288, "step": 3280 }, { "epoch": 0.64, "grad_norm": 0.14006308330926057, "learning_rate": 1.059457416829715e-06, "loss": 0.964, "step": 3285 }, { "epoch": 0.64, "grad_norm": 0.1485669272917807, "learning_rate": 1.0546194370075883e-06, "loss": 0.9181, "step": 3290 }, { "epoch": 0.64, "grad_norm": 0.14858509037038672, "learning_rate": 1.049786532985403e-06, "loss": 0.9272, "step": 3295 }, { "epoch": 0.64, "grad_norm": 0.14487395597716682, "learning_rate": 1.0449587598415714e-06, "loss": 0.917, "step": 3300 }, { "epoch": 0.64, "grad_norm": 0.14710872415034287, "learning_rate": 1.040136172596031e-06, "loss": 0.9247, "step": 3305 }, { "epoch": 0.64, "grad_norm": 0.1626479302951273, "learning_rate": 1.0353188262096175e-06, "loss": 0.9275, "step": 3310 }, { "epoch": 0.64, "grad_norm": 0.15781073519135003, "learning_rate": 1.0305067755834393e-06, "loss": 0.9253, "step": 3315 }, { "epoch": 0.64, "grad_norm": 0.14128421978977157, "learning_rate": 1.0257000755582512e-06, "loss": 0.9211, "step": 3320 }, { "epoch": 0.64, "grad_norm": 0.15143104599521942, "learning_rate": 1.0208987809138298e-06, "loss": 0.922, "step": 3325 }, { "epoch": 0.64, "grad_norm": 0.1622702675456415, "learning_rate": 1.0161029463683486e-06, "loss": 0.9305, "step": 3330 }, { "epoch": 0.65, "grad_norm": 0.1438713828814328, "learning_rate": 1.0113126265777563e-06, "loss": 0.9423, "step": 3335 }, { "epoch": 0.65, "grad_norm": 0.16559678050928944, "learning_rate": 1.00652787613515e-06, "loss": 0.9419, "step": 3340 }, { "epoch": 0.65, "grad_norm": 0.15093953285534764, "learning_rate": 1.0017487495701574e-06, "loss": 0.9137, "step": 3345 }, { "epoch": 0.65, "grad_norm": 0.15008677116743874, "learning_rate": 9.969753013483127e-07, "loss": 0.9304, "step": 3350 }, { "epoch": 0.65, "grad_norm": 0.16077001381591202, "learning_rate": 9.922075858704368e-07, "loss": 0.9129, "step": 3355 }, { "epoch": 0.65, "grad_norm": 0.14718070657922894, "learning_rate": 9.87445657472017e-07, "loss": 0.9213, "step": 3360 }, { "epoch": 0.65, "grad_norm": 0.14856763126559924, "learning_rate": 9.82689570422588e-07, "loss": 0.9165, "step": 3365 }, { "epoch": 0.65, "grad_norm": 0.17239452738903066, "learning_rate": 9.779393789251134e-07, "loss": 0.9234, "step": 3370 }, { "epoch": 0.65, "grad_norm": 0.1432746904130774, "learning_rate": 9.731951371153675e-07, "loss": 0.9329, "step": 3375 }, { "epoch": 0.65, "grad_norm": 0.17149057728657496, "learning_rate": 9.684568990613192e-07, "loss": 0.9489, "step": 3380 }, { "epoch": 0.65, "grad_norm": 0.15393538189231964, "learning_rate": 9.637247187625146e-07, "loss": 0.916, "step": 3385 }, { "epoch": 0.66, "grad_norm": 0.15174331734813315, "learning_rate": 9.58998650149463e-07, "loss": 0.9205, "step": 3390 }, { "epoch": 0.66, "grad_norm": 0.14922998507146534, "learning_rate": 9.542787470830209e-07, "loss": 0.9343, "step": 3395 }, { "epoch": 0.66, "grad_norm": 0.1500848629292273, "learning_rate": 9.4956506335378e-07, "loss": 0.9241, "step": 3400 }, { "epoch": 0.66, "grad_norm": 0.14949618673786386, "learning_rate": 9.44857652681452e-07, "loss": 0.9327, "step": 3405 }, { "epoch": 0.66, "grad_norm": 0.15397347493957886, "learning_rate": 9.401565687142579e-07, "loss": 0.9407, "step": 3410 }, { "epoch": 0.66, "grad_norm": 0.1533729596254089, "learning_rate": 9.354618650283159e-07, "loss": 0.9226, "step": 3415 }, { "epoch": 0.66, "grad_norm": 0.15161923850474765, "learning_rate": 9.307735951270313e-07, "loss": 0.9279, "step": 3420 }, { "epoch": 0.66, "grad_norm": 0.1522671281157681, "learning_rate": 9.260918124404861e-07, "loss": 0.9145, "step": 3425 }, { "epoch": 0.66, "grad_norm": 0.15106036750963397, "learning_rate": 9.214165703248314e-07, "loss": 0.8922, "step": 3430 }, { "epoch": 0.66, "grad_norm": 0.1508508419060107, "learning_rate": 9.167479220616762e-07, "loss": 0.9096, "step": 3435 }, { "epoch": 0.67, "grad_norm": 0.15691718553565023, "learning_rate": 9.120859208574848e-07, "loss": 0.9276, "step": 3440 }, { "epoch": 0.67, "grad_norm": 0.14818102813867765, "learning_rate": 9.074306198429669e-07, "loss": 0.9062, "step": 3445 }, { "epoch": 0.67, "grad_norm": 0.1473880251613391, "learning_rate": 9.02782072072473e-07, "loss": 0.9197, "step": 3450 }, { "epoch": 0.67, "grad_norm": 0.1526172026124484, "learning_rate": 8.981403305233904e-07, "loss": 0.9514, "step": 3455 }, { "epoch": 0.67, "grad_norm": 0.17512228057747395, "learning_rate": 8.935054480955389e-07, "loss": 0.9107, "step": 3460 }, { "epoch": 0.67, "grad_norm": 0.15716161285598004, "learning_rate": 8.888774776105679e-07, "loss": 0.8967, "step": 3465 }, { "epoch": 0.67, "grad_norm": 0.14818527302391432, "learning_rate": 8.842564718113546e-07, "loss": 0.9309, "step": 3470 }, { "epoch": 0.67, "grad_norm": 0.1575709023231356, "learning_rate": 8.796424833614026e-07, "loss": 0.9042, "step": 3475 }, { "epoch": 0.67, "grad_norm": 0.16168153914116115, "learning_rate": 8.750355648442425e-07, "loss": 0.9109, "step": 3480 }, { "epoch": 0.67, "grad_norm": 0.156594691500565, "learning_rate": 8.704357687628317e-07, "loss": 0.9162, "step": 3485 }, { "epoch": 0.68, "grad_norm": 0.14861481291797754, "learning_rate": 8.658431475389554e-07, "loss": 0.9169, "step": 3490 }, { "epoch": 0.68, "grad_norm": 0.14500543908075786, "learning_rate": 8.612577535126329e-07, "loss": 0.9372, "step": 3495 }, { "epoch": 0.68, "grad_norm": 0.15940891480295918, "learning_rate": 8.566796389415154e-07, "loss": 0.9415, "step": 3500 }, { "epoch": 0.68, "grad_norm": 0.15562117644201193, "learning_rate": 8.521088560002961e-07, "loss": 0.9133, "step": 3505 }, { "epoch": 0.68, "grad_norm": 0.15740195784796523, "learning_rate": 8.475454567801106e-07, "loss": 0.9177, "step": 3510 }, { "epoch": 0.68, "grad_norm": 0.15359832007067548, "learning_rate": 8.429894932879477e-07, "loss": 0.9243, "step": 3515 }, { "epoch": 0.68, "grad_norm": 0.16314606057976921, "learning_rate": 8.384410174460524e-07, "loss": 0.9526, "step": 3520 }, { "epoch": 0.68, "grad_norm": 0.14174905302998128, "learning_rate": 8.339000810913388e-07, "loss": 0.9268, "step": 3525 }, { "epoch": 0.68, "grad_norm": 0.1508355501376482, "learning_rate": 8.293667359747949e-07, "loss": 0.9111, "step": 3530 }, { "epoch": 0.68, "grad_norm": 0.15620625232845353, "learning_rate": 8.248410337608957e-07, "loss": 0.9258, "step": 3535 }, { "epoch": 0.68, "grad_norm": 0.15836435275133684, "learning_rate": 8.203230260270127e-07, "loss": 0.9202, "step": 3540 }, { "epoch": 0.69, "grad_norm": 0.1490971119373064, "learning_rate": 8.158127642628285e-07, "loss": 0.93, "step": 3545 }, { "epoch": 0.69, "grad_norm": 0.1498758710125973, "learning_rate": 8.113102998697464e-07, "loss": 0.9332, "step": 3550 }, { "epoch": 0.69, "grad_norm": 0.15427230559339164, "learning_rate": 8.068156841603089e-07, "loss": 0.9393, "step": 3555 }, { "epoch": 0.69, "grad_norm": 0.1554483829700364, "learning_rate": 8.02328968357608e-07, "loss": 0.9365, "step": 3560 }, { "epoch": 0.69, "grad_norm": 0.16829536457561103, "learning_rate": 7.978502035947067e-07, "loss": 0.9185, "step": 3565 }, { "epoch": 0.69, "grad_norm": 0.15319450100142196, "learning_rate": 7.933794409140512e-07, "loss": 0.9302, "step": 3570 }, { "epoch": 0.69, "grad_norm": 0.15517496668151076, "learning_rate": 7.889167312668937e-07, "loss": 0.962, "step": 3575 }, { "epoch": 0.69, "grad_norm": 0.1496297874459852, "learning_rate": 7.844621255127083e-07, "loss": 0.9217, "step": 3580 }, { "epoch": 0.69, "grad_norm": 0.15476953612136735, "learning_rate": 7.800156744186124e-07, "loss": 0.9519, "step": 3585 }, { "epoch": 0.69, "grad_norm": 0.15224644017556324, "learning_rate": 7.755774286587901e-07, "loss": 0.932, "step": 3590 }, { "epoch": 0.7, "grad_norm": 0.14505520530000182, "learning_rate": 7.711474388139111e-07, "loss": 0.9125, "step": 3595 }, { "epoch": 0.7, "grad_norm": 0.16828167804824415, "learning_rate": 7.667257553705584e-07, "loss": 0.9132, "step": 3600 }, { "epoch": 0.7, "grad_norm": 0.16605220073103116, "learning_rate": 7.623124287206483e-07, "loss": 0.9549, "step": 3605 }, { "epoch": 0.7, "grad_norm": 0.16228786623012695, "learning_rate": 7.579075091608605e-07, "loss": 0.9203, "step": 3610 }, { "epoch": 0.7, "grad_norm": 0.1599479558357973, "learning_rate": 7.535110468920611e-07, "loss": 0.9262, "step": 3615 }, { "epoch": 0.7, "grad_norm": 0.1573093783507514, "learning_rate": 7.491230920187344e-07, "loss": 0.9366, "step": 3620 }, { "epoch": 0.7, "grad_norm": 0.15963712113833328, "learning_rate": 7.447436945484082e-07, "loss": 0.9013, "step": 3625 }, { "epoch": 0.7, "grad_norm": 0.1399827930339598, "learning_rate": 7.40372904391086e-07, "loss": 0.9457, "step": 3630 }, { "epoch": 0.7, "grad_norm": 0.1562062988863496, "learning_rate": 7.360107713586768e-07, "loss": 0.9352, "step": 3635 }, { "epoch": 0.7, "grad_norm": 0.1488352670738681, "learning_rate": 7.316573451644303e-07, "loss": 0.8734, "step": 3640 }, { "epoch": 0.71, "grad_norm": 0.1552420073883167, "learning_rate": 7.27312675422366e-07, "loss": 0.9386, "step": 3645 }, { "epoch": 0.71, "grad_norm": 0.15531261843265345, "learning_rate": 7.229768116467124e-07, "loss": 0.929, "step": 3650 }, { "epoch": 0.71, "grad_norm": 0.15795986251543662, "learning_rate": 7.186498032513378e-07, "loss": 0.9157, "step": 3655 }, { "epoch": 0.71, "grad_norm": 0.14791527959203932, "learning_rate": 7.143316995491923e-07, "loss": 0.9391, "step": 3660 }, { "epoch": 0.71, "grad_norm": 0.1552589054762009, "learning_rate": 7.100225497517415e-07, "loss": 0.906, "step": 3665 }, { "epoch": 0.71, "grad_norm": 0.15429494696285626, "learning_rate": 7.05722402968409e-07, "loss": 0.9301, "step": 3670 }, { "epoch": 0.71, "grad_norm": 0.1539770736197306, "learning_rate": 7.014313082060122e-07, "loss": 0.9409, "step": 3675 }, { "epoch": 0.71, "grad_norm": 0.15378389305689893, "learning_rate": 6.971493143682105e-07, "loss": 0.9536, "step": 3680 }, { "epoch": 0.71, "grad_norm": 0.14841643591153145, "learning_rate": 6.928764702549411e-07, "loss": 0.9455, "step": 3685 }, { "epoch": 0.71, "grad_norm": 0.1424702517238874, "learning_rate": 6.886128245618684e-07, "loss": 0.9177, "step": 3690 }, { "epoch": 0.71, "grad_norm": 0.1559018526800424, "learning_rate": 6.843584258798242e-07, "loss": 0.9376, "step": 3695 }, { "epoch": 0.72, "grad_norm": 0.15722991689607704, "learning_rate": 6.801133226942587e-07, "loss": 0.9208, "step": 3700 }, { "epoch": 0.72, "grad_norm": 0.14610737260386278, "learning_rate": 6.758775633846834e-07, "loss": 0.9095, "step": 3705 }, { "epoch": 0.72, "grad_norm": 0.16090010441145305, "learning_rate": 6.716511962241237e-07, "loss": 0.929, "step": 3710 }, { "epoch": 0.72, "grad_norm": 0.1554441965831577, "learning_rate": 6.674342693785651e-07, "loss": 0.9394, "step": 3715 }, { "epoch": 0.72, "grad_norm": 0.1546491123373585, "learning_rate": 6.632268309064086e-07, "loss": 0.9409, "step": 3720 }, { "epoch": 0.72, "grad_norm": 0.14715256425800258, "learning_rate": 6.590289287579178e-07, "loss": 0.9055, "step": 3725 }, { "epoch": 0.72, "grad_norm": 0.14599984212897624, "learning_rate": 6.548406107746771e-07, "loss": 0.9433, "step": 3730 }, { "epoch": 0.72, "grad_norm": 0.16347085408735626, "learning_rate": 6.506619246890428e-07, "loss": 0.9548, "step": 3735 }, { "epoch": 0.72, "grad_norm": 0.15831789964538523, "learning_rate": 6.464929181236033e-07, "loss": 0.9386, "step": 3740 }, { "epoch": 0.72, "grad_norm": 0.15355363351088241, "learning_rate": 6.423336385906309e-07, "loss": 0.9344, "step": 3745 }, { "epoch": 0.73, "grad_norm": 0.1501784569255906, "learning_rate": 6.381841334915464e-07, "loss": 0.9293, "step": 3750 }, { "epoch": 0.73, "grad_norm": 0.16042288291190004, "learning_rate": 6.340444501163731e-07, "loss": 0.9393, "step": 3755 }, { "epoch": 0.73, "grad_norm": 0.16316861059946366, "learning_rate": 6.29914635643203e-07, "loss": 0.929, "step": 3760 }, { "epoch": 0.73, "grad_norm": 0.1643610541161199, "learning_rate": 6.257947371376546e-07, "loss": 0.9426, "step": 3765 }, { "epoch": 0.73, "grad_norm": 0.14616311834478152, "learning_rate": 6.216848015523392e-07, "loss": 0.9377, "step": 3770 }, { "epoch": 0.73, "grad_norm": 0.16152029615425262, "learning_rate": 6.175848757263268e-07, "loss": 0.9429, "step": 3775 }, { "epoch": 0.73, "grad_norm": 0.15699642229578464, "learning_rate": 6.134950063846083e-07, "loss": 0.9199, "step": 3780 }, { "epoch": 0.73, "grad_norm": 0.1530620485910159, "learning_rate": 6.094152401375673e-07, "loss": 0.922, "step": 3785 }, { "epoch": 0.73, "grad_norm": 0.15712637436393728, "learning_rate": 6.053456234804455e-07, "loss": 0.9433, "step": 3790 }, { "epoch": 0.73, "grad_norm": 0.15422389473184495, "learning_rate": 6.012862027928163e-07, "loss": 0.888, "step": 3795 }, { "epoch": 0.74, "grad_norm": 0.1566886231926118, "learning_rate": 5.972370243380519e-07, "loss": 0.925, "step": 3800 }, { "epoch": 0.74, "grad_norm": 0.159378149867113, "learning_rate": 5.931981342628009e-07, "loss": 0.9244, "step": 3805 }, { "epoch": 0.74, "grad_norm": 0.1633607022367415, "learning_rate": 5.891695785964572e-07, "loss": 0.905, "step": 3810 }, { "epoch": 0.74, "grad_norm": 0.16458140405159416, "learning_rate": 5.851514032506414e-07, "loss": 0.9371, "step": 3815 }, { "epoch": 0.74, "grad_norm": 0.17783212358538342, "learning_rate": 5.811436540186702e-07, "loss": 0.9275, "step": 3820 }, { "epoch": 0.74, "grad_norm": 0.1557140157487442, "learning_rate": 5.771463765750429e-07, "loss": 0.9483, "step": 3825 }, { "epoch": 0.74, "grad_norm": 0.16047316591282632, "learning_rate": 5.731596164749129e-07, "loss": 0.9286, "step": 3830 }, { "epoch": 0.74, "grad_norm": 0.1562396765552942, "learning_rate": 5.691834191535754e-07, "loss": 0.9419, "step": 3835 }, { "epoch": 0.74, "grad_norm": 0.16172641388514977, "learning_rate": 5.652178299259437e-07, "loss": 0.952, "step": 3840 }, { "epoch": 0.74, "grad_norm": 0.16113805975363696, "learning_rate": 5.612628939860378e-07, "loss": 0.9351, "step": 3845 }, { "epoch": 0.74, "grad_norm": 0.1437287491294515, "learning_rate": 5.573186564064649e-07, "loss": 0.9505, "step": 3850 }, { "epoch": 0.75, "grad_norm": 0.16834817212399983, "learning_rate": 5.533851621379097e-07, "loss": 0.959, "step": 3855 }, { "epoch": 0.75, "grad_norm": 0.1616903254935569, "learning_rate": 5.494624560086189e-07, "loss": 0.9197, "step": 3860 }, { "epoch": 0.75, "grad_norm": 0.15641360380963865, "learning_rate": 5.455505827238926e-07, "loss": 0.902, "step": 3865 }, { "epoch": 0.75, "grad_norm": 0.16461127069550324, "learning_rate": 5.416495868655723e-07, "loss": 0.9054, "step": 3870 }, { "epoch": 0.75, "grad_norm": 0.15407955424345696, "learning_rate": 5.377595128915371e-07, "loss": 0.9383, "step": 3875 }, { "epoch": 0.75, "grad_norm": 0.16169022882100972, "learning_rate": 5.338804051351918e-07, "loss": 0.9203, "step": 3880 }, { "epoch": 0.75, "grad_norm": 0.1671241995582254, "learning_rate": 5.30012307804966e-07, "loss": 0.9165, "step": 3885 }, { "epoch": 0.75, "grad_norm": 0.1474324575569325, "learning_rate": 5.261552649838068e-07, "loss": 0.9235, "step": 3890 }, { "epoch": 0.75, "grad_norm": 0.1585059938742927, "learning_rate": 5.223093206286801e-07, "loss": 0.9214, "step": 3895 }, { "epoch": 0.75, "grad_norm": 0.1497680366174517, "learning_rate": 5.184745185700654e-07, "loss": 0.9314, "step": 3900 }, { "epoch": 0.76, "grad_norm": 0.1546850610459381, "learning_rate": 5.146509025114608e-07, "loss": 0.9235, "step": 3905 }, { "epoch": 0.76, "grad_norm": 0.15045132406714182, "learning_rate": 5.108385160288809e-07, "loss": 0.9202, "step": 3910 }, { "epoch": 0.76, "grad_norm": 0.1583744842728306, "learning_rate": 5.070374025703618e-07, "loss": 0.9146, "step": 3915 }, { "epoch": 0.76, "grad_norm": 0.1562840274285093, "learning_rate": 5.032476054554679e-07, "loss": 0.9302, "step": 3920 }, { "epoch": 0.76, "grad_norm": 0.15846456988668864, "learning_rate": 4.994691678747944e-07, "loss": 0.9339, "step": 3925 }, { "epoch": 0.76, "grad_norm": 0.16307391364816642, "learning_rate": 4.957021328894786e-07, "loss": 0.9005, "step": 3930 }, { "epoch": 0.76, "grad_norm": 0.16346422226751806, "learning_rate": 4.919465434307062e-07, "loss": 0.9345, "step": 3935 }, { "epoch": 0.76, "grad_norm": 0.1695222160282978, "learning_rate": 4.882024422992248e-07, "loss": 0.9234, "step": 3940 }, { "epoch": 0.76, "grad_norm": 0.16278959029884632, "learning_rate": 4.844698721648531e-07, "loss": 0.9196, "step": 3945 }, { "epoch": 0.76, "grad_norm": 0.15957302783445512, "learning_rate": 4.807488755659985e-07, "loss": 0.9413, "step": 3950 }, { "epoch": 0.76, "grad_norm": 0.1560040784890354, "learning_rate": 4.770394949091678e-07, "loss": 0.9355, "step": 3955 }, { "epoch": 0.77, "grad_norm": 0.1413278988058616, "learning_rate": 4.7334177246848794e-07, "loss": 0.9295, "step": 3960 }, { "epoch": 0.77, "grad_norm": 0.1605581810941067, "learning_rate": 4.6965575038522055e-07, "loss": 0.9239, "step": 3965 }, { "epoch": 0.77, "grad_norm": 0.15990736616330292, "learning_rate": 4.6598147066728613e-07, "loss": 0.927, "step": 3970 }, { "epoch": 0.77, "grad_norm": 0.1523970977431131, "learning_rate": 4.6231897518878015e-07, "loss": 0.9268, "step": 3975 }, { "epoch": 0.77, "grad_norm": 0.15895125148362393, "learning_rate": 4.5866830568950103e-07, "loss": 0.9083, "step": 3980 }, { "epoch": 0.77, "grad_norm": 0.16109242763716947, "learning_rate": 4.550295037744694e-07, "loss": 0.9295, "step": 3985 }, { "epoch": 0.77, "grad_norm": 0.15788410886290616, "learning_rate": 4.5140261091345867e-07, "loss": 0.9355, "step": 3990 }, { "epoch": 0.77, "grad_norm": 0.16134673966779897, "learning_rate": 4.4778766844051793e-07, "loss": 0.884, "step": 3995 }, { "epoch": 0.77, "grad_norm": 0.1532078082485744, "learning_rate": 4.4418471755350544e-07, "loss": 0.9288, "step": 4000 }, { "epoch": 0.77, "grad_norm": 0.15779718366905426, "learning_rate": 4.405937993136151e-07, "loss": 0.9344, "step": 4005 }, { "epoch": 0.78, "grad_norm": 0.14884341139812074, "learning_rate": 4.370149546449109e-07, "loss": 0.891, "step": 4010 }, { "epoch": 0.78, "grad_norm": 0.1600941628867109, "learning_rate": 4.3344822433385896e-07, "loss": 0.9287, "step": 4015 }, { "epoch": 0.78, "grad_norm": 0.1617161511554377, "learning_rate": 4.2989364902886545e-07, "loss": 0.94, "step": 4020 }, { "epoch": 0.78, "grad_norm": 0.15247804429393655, "learning_rate": 4.263512692398091e-07, "loss": 0.9213, "step": 4025 }, { "epoch": 0.78, "grad_norm": 0.16018944449095257, "learning_rate": 4.228211253375843e-07, "loss": 0.9024, "step": 4030 }, { "epoch": 0.78, "grad_norm": 0.16007374852429948, "learning_rate": 4.193032575536363e-07, "loss": 0.9241, "step": 4035 }, { "epoch": 0.78, "grad_norm": 0.15889603417134412, "learning_rate": 4.1579770597950693e-07, "loss": 0.9239, "step": 4040 }, { "epoch": 0.78, "grad_norm": 0.17459525180727348, "learning_rate": 4.123045105663743e-07, "loss": 0.8917, "step": 4045 }, { "epoch": 0.78, "grad_norm": 0.15763900800425762, "learning_rate": 4.088237111246e-07, "loss": 0.9211, "step": 4050 }, { "epoch": 0.78, "grad_norm": 0.1575161337518218, "learning_rate": 4.053553473232742e-07, "loss": 0.915, "step": 4055 }, { "epoch": 0.79, "grad_norm": 0.15300282276303448, "learning_rate": 4.018994586897624e-07, "loss": 0.9249, "step": 4060 }, { "epoch": 0.79, "grad_norm": 0.1504264330652952, "learning_rate": 3.9845608460925854e-07, "loss": 0.9508, "step": 4065 }, { "epoch": 0.79, "grad_norm": 0.14959628504165817, "learning_rate": 3.950252643243317e-07, "loss": 0.9095, "step": 4070 }, { "epoch": 0.79, "grad_norm": 0.1630583733534405, "learning_rate": 3.916070369344831e-07, "loss": 0.927, "step": 4075 }, { "epoch": 0.79, "grad_norm": 0.15434075528005992, "learning_rate": 3.8820144139569635e-07, "loss": 0.9441, "step": 4080 }, { "epoch": 0.79, "grad_norm": 0.15652793092715717, "learning_rate": 3.8480851651999785e-07, "loss": 0.9061, "step": 4085 }, { "epoch": 0.79, "grad_norm": 0.24690420142064057, "learning_rate": 3.814283009750098e-07, "loss": 0.9291, "step": 4090 }, { "epoch": 0.79, "grad_norm": 0.1561055095998655, "learning_rate": 3.7806083328351425e-07, "loss": 0.9141, "step": 4095 }, { "epoch": 0.79, "grad_norm": 0.15547212939941615, "learning_rate": 3.7470615182301005e-07, "loss": 0.936, "step": 4100 }, { "epoch": 0.79, "grad_norm": 0.1398346443024953, "learning_rate": 3.713642948252779e-07, "loss": 0.9173, "step": 4105 }, { "epoch": 0.79, "grad_norm": 0.1596932224744843, "learning_rate": 3.680353003759433e-07, "loss": 0.9354, "step": 4110 }, { "epoch": 0.8, "grad_norm": 0.15975646326696724, "learning_rate": 3.6471920641404466e-07, "loss": 0.9448, "step": 4115 }, { "epoch": 0.8, "grad_norm": 0.1508038761650183, "learning_rate": 3.614160507315973e-07, "loss": 0.9207, "step": 4120 }, { "epoch": 0.8, "grad_norm": 0.1569010196095641, "learning_rate": 3.581258709731671e-07, "loss": 0.9152, "step": 4125 }, { "epoch": 0.8, "grad_norm": 0.15735074242891792, "learning_rate": 3.548487046354368e-07, "loss": 0.9048, "step": 4130 }, { "epoch": 0.8, "grad_norm": 0.1696586295135802, "learning_rate": 3.515845890667835e-07, "loss": 0.9265, "step": 4135 }, { "epoch": 0.8, "grad_norm": 0.16094178127192826, "learning_rate": 3.4833356146684856e-07, "loss": 0.9095, "step": 4140 }, { "epoch": 0.8, "grad_norm": 0.15722063022161345, "learning_rate": 3.450956588861173e-07, "loss": 0.8987, "step": 4145 }, { "epoch": 0.8, "grad_norm": 0.15767024672708221, "learning_rate": 3.418709182254943e-07, "loss": 0.9444, "step": 4150 }, { "epoch": 0.8, "grad_norm": 0.15337104954670655, "learning_rate": 3.3865937623588354e-07, "loss": 0.9231, "step": 4155 }, { "epoch": 0.8, "grad_norm": 0.16566851753448866, "learning_rate": 3.3546106951776993e-07, "loss": 0.9007, "step": 4160 }, { "epoch": 0.81, "grad_norm": 0.16192142530048215, "learning_rate": 3.322760345208031e-07, "loss": 0.9421, "step": 4165 }, { "epoch": 0.81, "grad_norm": 0.163107039234031, "learning_rate": 3.2910430754337874e-07, "loss": 0.9318, "step": 4170 }, { "epoch": 0.81, "grad_norm": 0.1545452516991764, "learning_rate": 3.259459247322295e-07, "loss": 0.9136, "step": 4175 }, { "epoch": 0.81, "grad_norm": 0.15663230748229626, "learning_rate": 3.2280092208200853e-07, "loss": 0.8954, "step": 4180 }, { "epoch": 0.81, "grad_norm": 0.15500795050421287, "learning_rate": 3.19669335434883e-07, "loss": 0.941, "step": 4185 }, { "epoch": 0.81, "grad_norm": 0.15928222838638134, "learning_rate": 3.1655120048012244e-07, "loss": 0.9408, "step": 4190 }, { "epoch": 0.81, "grad_norm": 0.15639857359171666, "learning_rate": 3.1344655275369524e-07, "loss": 0.9159, "step": 4195 }, { "epoch": 0.81, "grad_norm": 0.15912741439338757, "learning_rate": 3.1035542763786077e-07, "loss": 0.9083, "step": 4200 }, { "epoch": 0.81, "grad_norm": 0.15518719515657925, "learning_rate": 3.072778603607672e-07, "loss": 0.8945, "step": 4205 }, { "epoch": 0.81, "grad_norm": 0.15102896207837074, "learning_rate": 3.0421388599605167e-07, "loss": 0.9241, "step": 4210 }, { "epoch": 0.82, "grad_norm": 0.15953041281656985, "learning_rate": 3.0116353946243717e-07, "loss": 0.9552, "step": 4215 }, { "epoch": 0.82, "grad_norm": 0.15914050285439954, "learning_rate": 2.981268555233376e-07, "loss": 0.9346, "step": 4220 }, { "epoch": 0.82, "grad_norm": 0.1535411640949839, "learning_rate": 2.9510386878646066e-07, "loss": 0.9208, "step": 4225 }, { "epoch": 0.82, "grad_norm": 0.1624383805682274, "learning_rate": 2.920946137034121e-07, "loss": 0.9115, "step": 4230 }, { "epoch": 0.82, "grad_norm": 0.1576525021299245, "learning_rate": 2.890991245693059e-07, "loss": 0.9175, "step": 4235 }, { "epoch": 0.82, "grad_norm": 0.1689987920094538, "learning_rate": 2.861174355223702e-07, "loss": 0.9467, "step": 4240 }, { "epoch": 0.82, "grad_norm": 0.15702484189102198, "learning_rate": 2.8314958054356106e-07, "loss": 0.9432, "step": 4245 }, { "epoch": 0.82, "grad_norm": 0.16865544192759432, "learning_rate": 2.801955934561731e-07, "loss": 0.9287, "step": 4250 }, { "epoch": 0.82, "grad_norm": 0.15716123855295028, "learning_rate": 2.772555079254547e-07, "loss": 0.9393, "step": 4255 }, { "epoch": 0.82, "grad_norm": 0.15715960298271278, "learning_rate": 2.74329357458226e-07, "loss": 0.9396, "step": 4260 }, { "epoch": 0.82, "grad_norm": 0.1567680825612272, "learning_rate": 2.714171754024935e-07, "loss": 0.9387, "step": 4265 }, { "epoch": 0.83, "grad_norm": 0.1583254982488342, "learning_rate": 2.6851899494707397e-07, "loss": 0.9149, "step": 4270 }, { "epoch": 0.83, "grad_norm": 0.1528791273519717, "learning_rate": 2.6563484912121284e-07, "loss": 0.9263, "step": 4275 }, { "epoch": 0.83, "grad_norm": 0.1548488356579178, "learning_rate": 2.627647707942103e-07, "loss": 0.9125, "step": 4280 }, { "epoch": 0.83, "grad_norm": 0.16277999785485375, "learning_rate": 2.5990879267504456e-07, "loss": 0.9203, "step": 4285 }, { "epoch": 0.83, "grad_norm": 0.15633370731860066, "learning_rate": 2.5706694731200194e-07, "loss": 0.8966, "step": 4290 }, { "epoch": 0.83, "grad_norm": 0.16163084541833064, "learning_rate": 2.542392670923014e-07, "loss": 0.9185, "step": 4295 }, { "epoch": 0.83, "grad_norm": 0.1507577049675187, "learning_rate": 2.5142578424173116e-07, "loss": 0.931, "step": 4300 }, { "epoch": 0.83, "grad_norm": 0.15543875551416902, "learning_rate": 2.486265308242761e-07, "loss": 0.9197, "step": 4305 }, { "epoch": 0.83, "grad_norm": 0.16140974809909986, "learning_rate": 2.458415387417565e-07, "loss": 0.9098, "step": 4310 }, { "epoch": 0.83, "grad_norm": 0.1541006388383221, "learning_rate": 2.4307083973346144e-07, "loss": 0.9075, "step": 4315 }, { "epoch": 0.84, "grad_norm": 0.1536331376849194, "learning_rate": 2.403144653757892e-07, "loss": 0.9226, "step": 4320 }, { "epoch": 0.84, "grad_norm": 0.16664523654188895, "learning_rate": 2.3757244708188557e-07, "loss": 0.929, "step": 4325 }, { "epoch": 0.84, "grad_norm": 0.15068817418063912, "learning_rate": 2.3484481610128815e-07, "loss": 0.9422, "step": 4330 }, { "epoch": 0.84, "grad_norm": 0.15114736535904352, "learning_rate": 2.3213160351956725e-07, "loss": 0.8884, "step": 4335 }, { "epoch": 0.84, "grad_norm": 0.16457615699636646, "learning_rate": 2.2943284025797523e-07, "loss": 0.9331, "step": 4340 }, { "epoch": 0.84, "grad_norm": 0.15618814241892395, "learning_rate": 2.2674855707308938e-07, "loss": 0.9194, "step": 4345 }, { "epoch": 0.84, "grad_norm": 0.15796367715432505, "learning_rate": 2.2407878455646667e-07, "loss": 0.9318, "step": 4350 }, { "epoch": 0.84, "grad_norm": 0.14282593168610988, "learning_rate": 2.2142355313429136e-07, "loss": 0.9244, "step": 4355 }, { "epoch": 0.84, "grad_norm": 0.15601800726624074, "learning_rate": 2.1878289306702986e-07, "loss": 0.9152, "step": 4360 }, { "epoch": 0.84, "grad_norm": 0.157516630029107, "learning_rate": 2.1615683444908517e-07, "loss": 0.9228, "step": 4365 }, { "epoch": 0.85, "grad_norm": 0.16141550481008685, "learning_rate": 2.1354540720845456e-07, "loss": 0.9437, "step": 4370 }, { "epoch": 0.85, "grad_norm": 0.15114290632124544, "learning_rate": 2.1094864110638746e-07, "loss": 0.9113, "step": 4375 }, { "epoch": 0.85, "grad_norm": 0.1629233031052345, "learning_rate": 2.0836656573704817e-07, "loss": 0.9359, "step": 4380 }, { "epoch": 0.85, "grad_norm": 0.1540979875498575, "learning_rate": 2.057992105271762e-07, "loss": 0.9232, "step": 4385 }, { "epoch": 0.85, "grad_norm": 0.16470643668441212, "learning_rate": 2.0324660473575218e-07, "loss": 0.9267, "step": 4390 }, { "epoch": 0.85, "grad_norm": 0.1552333426235452, "learning_rate": 2.0070877745366546e-07, "loss": 0.9037, "step": 4395 }, { "epoch": 0.85, "grad_norm": 0.15933287223217704, "learning_rate": 1.9818575760337991e-07, "loss": 0.9572, "step": 4400 }, { "epoch": 0.85, "grad_norm": 0.1543328823907667, "learning_rate": 1.9567757393860735e-07, "loss": 0.9204, "step": 4405 }, { "epoch": 0.85, "grad_norm": 0.15475180185924467, "learning_rate": 1.9318425504397675e-07, "loss": 0.9289, "step": 4410 }, { "epoch": 0.85, "grad_norm": 0.15563510159853952, "learning_rate": 1.9070582933471158e-07, "loss": 0.9104, "step": 4415 }, { "epoch": 0.85, "grad_norm": 0.1553271103735474, "learning_rate": 1.88242325056303e-07, "loss": 0.938, "step": 4420 }, { "epoch": 0.86, "grad_norm": 0.15980736761542, "learning_rate": 1.8579377028419082e-07, "loss": 0.9622, "step": 4425 }, { "epoch": 0.86, "grad_norm": 0.15927747684592666, "learning_rate": 1.833601929234406e-07, "loss": 0.9094, "step": 4430 }, { "epoch": 0.86, "grad_norm": 0.160839016361983, "learning_rate": 1.809416207084293e-07, "loss": 0.9395, "step": 4435 }, { "epoch": 0.86, "grad_norm": 0.15946588975620996, "learning_rate": 1.7853808120252403e-07, "loss": 0.9223, "step": 4440 }, { "epoch": 0.86, "grad_norm": 0.15940399215075765, "learning_rate": 1.7614960179777373e-07, "loss": 0.9353, "step": 4445 }, { "epoch": 0.86, "grad_norm": 0.15770387513942463, "learning_rate": 1.7377620971459251e-07, "loss": 0.9348, "step": 4450 }, { "epoch": 0.86, "grad_norm": 0.15059096100398262, "learning_rate": 1.7141793200145234e-07, "loss": 0.9414, "step": 4455 }, { "epoch": 0.86, "grad_norm": 0.14981443053828966, "learning_rate": 1.6907479553457228e-07, "loss": 0.9561, "step": 4460 }, { "epoch": 0.86, "grad_norm": 0.14989095698958532, "learning_rate": 1.6674682701761496e-07, "loss": 0.9482, "step": 4465 }, { "epoch": 0.86, "grad_norm": 0.16037252571389166, "learning_rate": 1.644340529813791e-07, "loss": 0.9025, "step": 4470 }, { "epoch": 0.87, "grad_norm": 0.15692439644356643, "learning_rate": 1.6213649978350042e-07, "loss": 0.9276, "step": 4475 }, { "epoch": 0.87, "grad_norm": 0.15736372136885854, "learning_rate": 1.5985419360814878e-07, "loss": 0.903, "step": 4480 }, { "epoch": 0.87, "grad_norm": 0.16863539575661857, "learning_rate": 1.5758716046573068e-07, "loss": 0.9147, "step": 4485 }, { "epoch": 0.87, "grad_norm": 0.16040567859365207, "learning_rate": 1.553354261925925e-07, "loss": 0.9162, "step": 4490 }, { "epoch": 0.87, "grad_norm": 0.157140629226284, "learning_rate": 1.5309901645072777e-07, "loss": 0.948, "step": 4495 }, { "epoch": 0.87, "grad_norm": 0.1608315729686174, "learning_rate": 1.5087795672748156e-07, "loss": 0.9321, "step": 4500 }, { "epoch": 0.87, "grad_norm": 0.167819710766768, "learning_rate": 1.4867227233526303e-07, "loss": 0.9123, "step": 4505 }, { "epoch": 0.87, "grad_norm": 0.15821303219930233, "learning_rate": 1.4648198841125454e-07, "loss": 0.9211, "step": 4510 }, { "epoch": 0.87, "grad_norm": 0.1551506251400254, "learning_rate": 1.443071299171278e-07, "loss": 0.921, "step": 4515 }, { "epoch": 0.87, "grad_norm": 0.16270472915600528, "learning_rate": 1.4214772163875618e-07, "loss": 0.9476, "step": 4520 }, { "epoch": 0.88, "grad_norm": 0.16257841775735968, "learning_rate": 1.4000378818593534e-07, "loss": 0.9233, "step": 4525 }, { "epoch": 0.88, "grad_norm": 0.156562288683159, "learning_rate": 1.3787535399210094e-07, "loss": 0.9182, "step": 4530 }, { "epoch": 0.88, "grad_norm": 0.15476777543631168, "learning_rate": 1.3576244331404987e-07, "loss": 0.9282, "step": 4535 }, { "epoch": 0.88, "grad_norm": 0.1526747916838123, "learning_rate": 1.3366508023166618e-07, "loss": 0.9109, "step": 4540 }, { "epoch": 0.88, "grad_norm": 0.15868273523439957, "learning_rate": 1.3158328864764325e-07, "loss": 0.9183, "step": 4545 }, { "epoch": 0.88, "grad_norm": 0.157997397996658, "learning_rate": 1.2951709228721466e-07, "loss": 0.8927, "step": 4550 }, { "epoch": 0.88, "grad_norm": 0.15965300631556217, "learning_rate": 1.274665146978812e-07, "loss": 0.9422, "step": 4555 }, { "epoch": 0.88, "grad_norm": 0.16102069291165114, "learning_rate": 1.2543157924914451e-07, "loss": 0.9136, "step": 4560 }, { "epoch": 0.88, "grad_norm": 0.16644483473793495, "learning_rate": 1.234123091322389e-07, "loss": 0.9182, "step": 4565 }, { "epoch": 0.88, "grad_norm": 0.16785461200909674, "learning_rate": 1.2140872735986908e-07, "loss": 0.9019, "step": 4570 }, { "epoch": 0.88, "grad_norm": 0.15102979016575213, "learning_rate": 1.1942085676594617e-07, "loss": 0.9242, "step": 4575 }, { "epoch": 0.89, "grad_norm": 0.15019421611211958, "learning_rate": 1.1744872000532814e-07, "loss": 0.8977, "step": 4580 }, { "epoch": 0.89, "grad_norm": 0.16805390366486953, "learning_rate": 1.1549233955356143e-07, "loss": 0.9281, "step": 4585 }, { "epoch": 0.89, "grad_norm": 0.15062889503591012, "learning_rate": 1.1355173770662592e-07, "loss": 0.9197, "step": 4590 }, { "epoch": 0.89, "grad_norm": 0.15940944783864333, "learning_rate": 1.1162693658067852e-07, "loss": 0.8694, "step": 4595 }, { "epoch": 0.89, "grad_norm": 0.1594448371528468, "learning_rate": 1.0971795811180402e-07, "loss": 0.9173, "step": 4600 }, { "epoch": 0.89, "grad_norm": 0.15723697995020128, "learning_rate": 1.0782482405576194e-07, "loss": 0.9331, "step": 4605 }, { "epoch": 0.89, "grad_norm": 0.16818241136639087, "learning_rate": 1.0594755598774192e-07, "loss": 0.9224, "step": 4610 }, { "epoch": 0.89, "grad_norm": 0.16505122113793874, "learning_rate": 1.0408617530211473e-07, "loss": 0.9146, "step": 4615 }, { "epoch": 0.89, "grad_norm": 0.16226442869385957, "learning_rate": 1.0224070321219065e-07, "loss": 0.9163, "step": 4620 }, { "epoch": 0.89, "grad_norm": 0.15760858471916925, "learning_rate": 1.004111607499768e-07, "loss": 0.9125, "step": 4625 }, { "epoch": 0.9, "grad_norm": 0.167789388282492, "learning_rate": 9.859756876593723e-08, "loss": 0.953, "step": 4630 }, { "epoch": 0.9, "grad_norm": 0.1589666026077562, "learning_rate": 9.679994792875585e-08, "loss": 0.9142, "step": 4635 }, { "epoch": 0.9, "grad_norm": 0.1603705114000376, "learning_rate": 9.501831872510086e-08, "loss": 0.9343, "step": 4640 }, { "epoch": 0.9, "grad_norm": 0.16565499499646644, "learning_rate": 9.325270145939075e-08, "loss": 0.9568, "step": 4645 }, { "epoch": 0.9, "grad_norm": 0.16897573366166183, "learning_rate": 9.150311625356378e-08, "loss": 0.9335, "step": 4650 }, { "epoch": 0.9, "grad_norm": 0.15687694406207942, "learning_rate": 8.976958304684707e-08, "loss": 0.913, "step": 4655 }, { "epoch": 0.9, "grad_norm": 0.1560825616023689, "learning_rate": 8.805212159553171e-08, "loss": 0.9184, "step": 4660 }, { "epoch": 0.9, "grad_norm": 0.1517859392958044, "learning_rate": 8.635075147274501e-08, "loss": 0.9127, "step": 4665 }, { "epoch": 0.9, "grad_norm": 0.15438722856001558, "learning_rate": 8.466549206822993e-08, "loss": 0.9096, "step": 4670 }, { "epoch": 0.9, "grad_norm": 0.1652750261831842, "learning_rate": 8.299636258812199e-08, "loss": 0.9247, "step": 4675 }, { "epoch": 0.91, "grad_norm": 0.16015267448347073, "learning_rate": 8.134338205473124e-08, "loss": 0.9228, "step": 4680 }, { "epoch": 0.91, "grad_norm": 0.16415589317028237, "learning_rate": 7.970656930632663e-08, "loss": 0.9351, "step": 4685 }, { "epoch": 0.91, "grad_norm": 0.1603710732217631, "learning_rate": 7.808594299691902e-08, "loss": 0.9308, "step": 4690 }, { "epoch": 0.91, "grad_norm": 0.16313461612356253, "learning_rate": 7.64815215960501e-08, "loss": 0.9093, "step": 4695 }, { "epoch": 0.91, "grad_norm": 0.16350768777439237, "learning_rate": 7.489332338858202e-08, "loss": 0.9133, "step": 4700 }, { "epoch": 0.91, "grad_norm": 0.15832339009257615, "learning_rate": 7.332136647448795e-08, "loss": 0.9108, "step": 4705 }, { "epoch": 0.91, "grad_norm": 0.15056874945041787, "learning_rate": 7.176566876864699e-08, "loss": 0.9266, "step": 4710 }, { "epoch": 0.91, "grad_norm": 0.15648829717155813, "learning_rate": 7.022624800063876e-08, "loss": 0.924, "step": 4715 }, { "epoch": 0.91, "grad_norm": 0.1621955135948283, "learning_rate": 6.870312171454296e-08, "loss": 0.9451, "step": 4720 }, { "epoch": 0.91, "grad_norm": 0.1604060029409816, "learning_rate": 6.719630726873748e-08, "loss": 0.9418, "step": 4725 }, { "epoch": 0.91, "grad_norm": 0.1676076936878336, "learning_rate": 6.570582183570211e-08, "loss": 0.9424, "step": 4730 }, { "epoch": 0.92, "grad_norm": 0.16378443128060385, "learning_rate": 6.42316824018223e-08, "loss": 0.925, "step": 4735 }, { "epoch": 0.92, "grad_norm": 0.1541488328010328, "learning_rate": 6.277390576719538e-08, "loss": 0.9308, "step": 4740 }, { "epoch": 0.92, "grad_norm": 0.15020602128889035, "learning_rate": 6.133250854543948e-08, "loss": 0.9044, "step": 4745 }, { "epoch": 0.92, "grad_norm": 0.15152489639770436, "learning_rate": 5.990750716350374e-08, "loss": 0.9107, "step": 4750 }, { "epoch": 0.92, "grad_norm": 0.15521047884723635, "learning_rate": 5.849891786148193e-08, "loss": 0.9013, "step": 4755 }, { "epoch": 0.92, "grad_norm": 0.1516663880407599, "learning_rate": 5.710675669242577e-08, "loss": 0.9183, "step": 4760 }, { "epoch": 0.92, "grad_norm": 0.176609659206763, "learning_rate": 5.573103952216457e-08, "loss": 0.9266, "step": 4765 }, { "epoch": 0.92, "grad_norm": 0.16607598187822306, "learning_rate": 5.4371782029121074e-08, "loss": 0.9317, "step": 4770 }, { "epoch": 0.92, "grad_norm": 0.16280029991460063, "learning_rate": 5.302899970413588e-08, "loss": 0.9407, "step": 4775 }, { "epoch": 0.92, "grad_norm": 0.1501137796362083, "learning_rate": 5.17027078502888e-08, "loss": 0.9098, "step": 4780 }, { "epoch": 0.93, "grad_norm": 0.16436918493854402, "learning_rate": 5.039292158272596e-08, "loss": 0.9244, "step": 4785 }, { "epoch": 0.93, "grad_norm": 0.16169601675540524, "learning_rate": 4.909965582848614e-08, "loss": 0.8792, "step": 4790 }, { "epoch": 0.93, "grad_norm": 0.1600927235212448, "learning_rate": 4.782292532633187e-08, "loss": 0.953, "step": 4795 }, { "epoch": 0.93, "grad_norm": 0.16263427667936725, "learning_rate": 4.656274462658028e-08, "loss": 0.9308, "step": 4800 }, { "epoch": 0.93, "grad_norm": 0.16315939755816827, "learning_rate": 4.5319128090938686e-08, "loss": 0.9051, "step": 4805 }, { "epoch": 0.93, "grad_norm": 0.1612104752152402, "learning_rate": 4.409208989233943e-08, "loss": 0.9317, "step": 4810 }, { "epoch": 0.93, "grad_norm": 0.1654098368858686, "learning_rate": 4.288164401477995e-08, "loss": 0.9066, "step": 4815 }, { "epoch": 0.93, "grad_norm": 0.1609318647136051, "learning_rate": 4.1687804253161485e-08, "loss": 0.9053, "step": 4820 }, { "epoch": 0.93, "grad_norm": 0.16875575692294012, "learning_rate": 4.05105842131338e-08, "loss": 0.9519, "step": 4825 }, { "epoch": 0.93, "grad_norm": 0.1549606486338682, "learning_rate": 3.934999731093852e-08, "loss": 0.9307, "step": 4830 }, { "epoch": 0.94, "grad_norm": 0.1676676232013085, "learning_rate": 3.820605677325756e-08, "loss": 0.9626, "step": 4835 }, { "epoch": 0.94, "grad_norm": 0.15773160804940514, "learning_rate": 3.707877563706158e-08, "loss": 0.9165, "step": 4840 }, { "epoch": 0.94, "grad_norm": 0.15826127121512837, "learning_rate": 3.5968166749461463e-08, "loss": 0.8953, "step": 4845 }, { "epoch": 0.94, "grad_norm": 0.1544058879678145, "learning_rate": 3.487424276756207e-08, "loss": 0.9007, "step": 4850 }, { "epoch": 0.94, "grad_norm": 0.1735946591788913, "learning_rate": 3.379701615831837e-08, "loss": 0.9368, "step": 4855 }, { "epoch": 0.94, "grad_norm": 0.16629351979756113, "learning_rate": 3.273649919839239e-08, "loss": 0.9366, "step": 4860 }, { "epoch": 0.94, "grad_norm": 0.1670530577981281, "learning_rate": 3.16927039740143e-08, "loss": 0.8889, "step": 4865 }, { "epoch": 0.94, "grad_norm": 0.18047483077210333, "learning_rate": 3.06656423808439e-08, "loss": 0.9581, "step": 4870 }, { "epoch": 0.94, "grad_norm": 0.14862297706656663, "learning_rate": 2.9655326123835702e-08, "loss": 0.9082, "step": 4875 }, { "epoch": 0.94, "grad_norm": 0.14156600846882356, "learning_rate": 2.866176671710502e-08, "loss": 0.9334, "step": 4880 }, { "epoch": 0.94, "grad_norm": 0.152720300985099, "learning_rate": 2.7684975483797113e-08, "loss": 0.9098, "step": 4885 }, { "epoch": 0.95, "grad_norm": 0.1541159122179482, "learning_rate": 2.6724963555957937e-08, "loss": 0.9086, "step": 4890 }, { "epoch": 0.95, "grad_norm": 0.1639110432246253, "learning_rate": 2.5781741874407073e-08, "loss": 0.9278, "step": 4895 }, { "epoch": 0.95, "grad_norm": 0.15764633567026584, "learning_rate": 2.4855321188614e-08, "loss": 0.9199, "step": 4900 }, { "epoch": 0.95, "grad_norm": 0.16367449606999576, "learning_rate": 2.3945712056573866e-08, "loss": 0.9218, "step": 4905 }, { "epoch": 0.95, "grad_norm": 0.1556361631328831, "learning_rate": 2.3052924844689237e-08, "loss": 0.9185, "step": 4910 }, { "epoch": 0.95, "grad_norm": 0.14651345358604176, "learning_rate": 2.2176969727650043e-08, "loss": 0.8805, "step": 4915 }, { "epoch": 0.95, "grad_norm": 0.16911174215759373, "learning_rate": 2.1317856688318815e-08, "loss": 0.9463, "step": 4920 }, { "epoch": 0.95, "grad_norm": 0.16222471379625428, "learning_rate": 2.0475595517616465e-08, "loss": 0.9126, "step": 4925 }, { "epoch": 0.95, "grad_norm": 0.15781798751754259, "learning_rate": 1.9650195814411353e-08, "loss": 0.9225, "step": 4930 }, { "epoch": 0.95, "grad_norm": 0.14707569100008253, "learning_rate": 1.8841666985408568e-08, "loss": 0.8883, "step": 4935 }, { "epoch": 0.96, "grad_norm": 0.1714960810767451, "learning_rate": 1.8050018245043987e-08, "loss": 0.9226, "step": 4940 }, { "epoch": 0.96, "grad_norm": 0.16621777369986387, "learning_rate": 1.7275258615378377e-08, "loss": 0.9245, "step": 4945 }, { "epoch": 0.96, "grad_norm": 0.16812621541501718, "learning_rate": 1.65173969259958e-08, "loss": 0.9409, "step": 4950 }, { "epoch": 0.96, "grad_norm": 0.1587153310627031, "learning_rate": 1.5776441813901197e-08, "loss": 0.9004, "step": 4955 }, { "epoch": 0.96, "grad_norm": 0.16493984218007582, "learning_rate": 1.5052401723423815e-08, "loss": 0.9166, "step": 4960 }, { "epoch": 0.96, "grad_norm": 0.15890567241661818, "learning_rate": 1.4345284906119082e-08, "loss": 0.9117, "step": 4965 }, { "epoch": 0.96, "grad_norm": 0.15091338059065335, "learning_rate": 1.3655099420676553e-08, "loss": 0.9404, "step": 4970 }, { "epoch": 0.96, "grad_norm": 0.15944974898000105, "learning_rate": 1.2981853132826293e-08, "loss": 0.9531, "step": 4975 }, { "epoch": 0.96, "grad_norm": 0.15818858463324417, "learning_rate": 1.2325553715250792e-08, "loss": 0.912, "step": 4980 }, { "epoch": 0.96, "grad_norm": 0.15569962439666524, "learning_rate": 1.1686208647496032e-08, "loss": 0.8903, "step": 4985 }, { "epoch": 0.97, "grad_norm": 0.1783511934328571, "learning_rate": 1.1063825215887557e-08, "loss": 0.9388, "step": 4990 }, { "epoch": 0.97, "grad_norm": 0.16371221965885455, "learning_rate": 1.0458410513446203e-08, "loss": 0.9171, "step": 4995 }, { "epoch": 0.97, "grad_norm": 0.16608474399678907, "learning_rate": 9.869971439808834e-09, "loss": 0.924, "step": 5000 }, { "epoch": 0.97, "grad_norm": 0.16264204123747195, "learning_rate": 9.298514701147897e-09, "loss": 0.932, "step": 5005 }, { "epoch": 0.97, "grad_norm": 0.15931886887708344, "learning_rate": 8.744046810096329e-09, "loss": 0.9317, "step": 5010 }, { "epoch": 0.97, "grad_norm": 0.16017670612884605, "learning_rate": 8.206574085672769e-09, "loss": 0.9398, "step": 5015 }, { "epoch": 0.97, "grad_norm": 0.15982177239936074, "learning_rate": 7.68610265320946e-09, "loss": 0.9272, "step": 5020 }, { "epoch": 0.97, "grad_norm": 0.15118097697397376, "learning_rate": 7.182638444283296e-09, "loss": 0.9019, "step": 5025 }, { "epoch": 0.97, "grad_norm": 0.16365399699832117, "learning_rate": 6.6961871966470525e-09, "loss": 0.9341, "step": 5030 }, { "epoch": 0.97, "grad_norm": 0.16106966499050715, "learning_rate": 6.2267544541642625e-09, "loss": 0.9142, "step": 5035 }, { "epoch": 0.97, "grad_norm": 0.16356964266490379, "learning_rate": 5.774345566746942e-09, "loss": 0.9136, "step": 5040 }, { "epoch": 0.98, "grad_norm": 0.16839366533706696, "learning_rate": 5.338965690293795e-09, "loss": 0.9341, "step": 5045 }, { "epoch": 0.98, "grad_norm": 0.15742905820134248, "learning_rate": 4.920619786630942e-09, "loss": 0.9209, "step": 5050 }, { "epoch": 0.98, "grad_norm": 0.14496950871622408, "learning_rate": 4.519312623457117e-09, "loss": 0.9016, "step": 5055 }, { "epoch": 0.98, "grad_norm": 0.15117212300765606, "learning_rate": 4.135048774287553e-09, "loss": 0.9103, "step": 5060 }, { "epoch": 0.98, "grad_norm": 0.16671405192275532, "learning_rate": 3.767832618402689e-09, "loss": 0.9248, "step": 5065 }, { "epoch": 0.98, "grad_norm": 0.1602778426913734, "learning_rate": 3.4176683407983744e-09, "loss": 0.9405, "step": 5070 }, { "epoch": 0.98, "grad_norm": 0.14293417532619934, "learning_rate": 3.0845599321377427e-09, "loss": 0.8966, "step": 5075 }, { "epoch": 0.98, "grad_norm": 0.1513599972047693, "learning_rate": 2.7685111887059133e-09, "loss": 0.9002, "step": 5080 }, { "epoch": 0.98, "grad_norm": 0.15441045322521502, "learning_rate": 2.4695257123668602e-09, "loss": 0.8972, "step": 5085 }, { "epoch": 0.98, "grad_norm": 0.15337187659541607, "learning_rate": 2.1876069105224437e-09, "loss": 0.9051, "step": 5090 }, { "epoch": 0.99, "grad_norm": 0.15842607503539186, "learning_rate": 1.9227579960729434e-09, "loss": 0.9358, "step": 5095 }, { "epoch": 0.99, "grad_norm": 0.16236135287127654, "learning_rate": 1.6749819873810857e-09, "loss": 0.9403, "step": 5100 }, { "epoch": 0.99, "grad_norm": 0.15112590057516626, "learning_rate": 1.4442817082377379e-09, "loss": 0.8968, "step": 5105 }, { "epoch": 0.99, "grad_norm": 0.16919776377500648, "learning_rate": 1.2306597878289361e-09, "loss": 0.9206, "step": 5110 }, { "epoch": 0.99, "grad_norm": 0.16262851154680527, "learning_rate": 1.03411866070674e-09, "loss": 0.9096, "step": 5115 }, { "epoch": 0.99, "grad_norm": 0.14577473193606455, "learning_rate": 8.546605667610896e-10, "loss": 0.9482, "step": 5120 }, { "epoch": 0.99, "grad_norm": 0.15883938592062852, "learning_rate": 6.922875511943261e-10, "loss": 0.9248, "step": 5125 }, { "epoch": 0.99, "grad_norm": 0.15113652144655165, "learning_rate": 5.470014644980426e-10, "loss": 0.9265, "step": 5130 }, { "epoch": 0.99, "grad_norm": 0.1510997478676236, "learning_rate": 4.18803962431602e-10, "loss": 0.8814, "step": 5135 }, { "epoch": 0.99, "grad_norm": 0.15722861475429373, "learning_rate": 3.076965060038184e-10, "loss": 0.9215, "step": 5140 }, { "epoch": 1.0, "grad_norm": 0.15915656709758227, "learning_rate": 2.1368036145597013e-10, "loss": 0.9631, "step": 5145 }, { "epoch": 1.0, "grad_norm": 0.14617851831272438, "learning_rate": 1.3675660024714541e-10, "loss": 0.9366, "step": 5150 }, { "epoch": 1.0, "grad_norm": 0.15902826549810223, "learning_rate": 7.692609904258463e-11, "loss": 0.9326, "step": 5155 }, { "epoch": 1.0, "grad_norm": 0.16030959960966326, "learning_rate": 3.4189539703355364e-11, "loss": 0.8914, "step": 5160 }, { "epoch": 1.0, "grad_norm": 0.16538320040136756, "learning_rate": 8.547409278525376e-12, "loss": 0.9356, "step": 5165 }, { "epoch": 1.0, "grad_norm": 0.1617655412465529, "learning_rate": 0.0, "loss": 0.918, "step": 5170 }, { "epoch": 1.0, "eval_loss": 0.9631486535072327, "eval_runtime": 5393.193, "eval_samples_per_second": 5.747, "eval_steps_per_second": 0.12, "step": 5170 }, { "epoch": 1.0, "step": 5170, "total_flos": 1.360193862500352e+16, "train_loss": 0.9572911218241059, "train_runtime": 140901.3648, "train_samples_per_second": 1.761, "train_steps_per_second": 0.037 } ], "logging_steps": 5, "max_steps": 5170, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 1.360193862500352e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }