{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 83265, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005404431633939831, "grad_norm": 0.9054504632949829, "learning_rate": 3.602738080941516e-06, "loss": 1.2514, "step": 150 }, { "epoch": 0.010808863267879661, "grad_norm": 0.6858287453651428, "learning_rate": 7.205476161883032e-06, "loss": 1.1276, "step": 300 }, { "epoch": 0.016213294901819494, "grad_norm": 0.6737526655197144, "learning_rate": 1.0808214242824548e-05, "loss": 1.0866, "step": 450 }, { "epoch": 0.021617726535759323, "grad_norm": 0.6372509002685547, "learning_rate": 1.4410952323766064e-05, "loss": 1.0705, "step": 600 }, { "epoch": 0.027022158169699155, "grad_norm": 0.5196628570556641, "learning_rate": 1.8013690404707578e-05, "loss": 1.0525, "step": 750 }, { "epoch": 0.03242658980363899, "grad_norm": 0.4922332167625427, "learning_rate": 2.1616428485649097e-05, "loss": 1.0418, "step": 900 }, { "epoch": 0.03783102143757881, "grad_norm": 0.5018568634986877, "learning_rate": 2.521916656659061e-05, "loss": 1.0359, "step": 1050 }, { "epoch": 0.043235453071518645, "grad_norm": 0.4413062334060669, "learning_rate": 2.8821904647532128e-05, "loss": 1.037, "step": 1200 }, { "epoch": 0.04863988470545848, "grad_norm": 0.3888317048549652, "learning_rate": 3.242464272847364e-05, "loss": 1.0232, "step": 1350 }, { "epoch": 0.05404431633939831, "grad_norm": 0.43577057123184204, "learning_rate": 3.6027380809415156e-05, "loss": 1.0124, "step": 1500 }, { "epoch": 0.059448747973338135, "grad_norm": 0.41379234194755554, "learning_rate": 3.963011889035667e-05, "loss": 1.0225, "step": 1650 }, { "epoch": 0.06485317960727797, "grad_norm": 0.48700177669525146, "learning_rate": 4.3232856971298193e-05, "loss": 1.0138, "step": 1800 }, { "epoch": 0.0702576112412178, "grad_norm": 0.40877047181129456, "learning_rate": 4.683559505223971e-05, "loss": 1.0068, "step": 1950 }, { "epoch": 0.07566204287515763, "grad_norm": 0.37194114923477173, "learning_rate": 5.043833313318122e-05, "loss": 1.0007, "step": 2100 }, { "epoch": 0.08106647450909746, "grad_norm": 0.49839073419570923, "learning_rate": 5.404107121412274e-05, "loss": 1.0038, "step": 2250 }, { "epoch": 0.08647090614303729, "grad_norm": 0.3880678117275238, "learning_rate": 5.7643809295064256e-05, "loss": 0.9996, "step": 2400 }, { "epoch": 0.09187533777697712, "grad_norm": 0.4280707538127899, "learning_rate": 6.124654737600577e-05, "loss": 1.0049, "step": 2550 }, { "epoch": 0.09727976941091696, "grad_norm": 0.4451320469379425, "learning_rate": 6.484928545694728e-05, "loss": 1.0057, "step": 2700 }, { "epoch": 0.10268420104485679, "grad_norm": 0.38181596994400024, "learning_rate": 6.84520235378888e-05, "loss": 1.0019, "step": 2850 }, { "epoch": 0.10808863267879662, "grad_norm": 0.38614770770072937, "learning_rate": 7.205476161883031e-05, "loss": 1.0045, "step": 3000 }, { "epoch": 0.11349306431273644, "grad_norm": 0.3148934543132782, "learning_rate": 7.565749969977183e-05, "loss": 1.0041, "step": 3150 }, { "epoch": 0.11889749594667627, "grad_norm": 0.41060400009155273, "learning_rate": 7.926023778071334e-05, "loss": 1.0001, "step": 3300 }, { "epoch": 0.1243019275806161, "grad_norm": 0.40537866950035095, "learning_rate": 8.286297586165485e-05, "loss": 1.0014, "step": 3450 }, { "epoch": 0.12970635921455595, "grad_norm": 0.3297308683395386, "learning_rate": 8.646571394259639e-05, "loss": 1.0055, "step": 3600 }, { "epoch": 0.13511079084849575, "grad_norm": 0.39976179599761963, "learning_rate": 9.00684520235379e-05, "loss": 0.9993, "step": 3750 }, { "epoch": 0.1405152224824356, "grad_norm": 0.39322683215141296, "learning_rate": 9.367119010447942e-05, "loss": 0.9965, "step": 3900 }, { "epoch": 0.14591965411637542, "grad_norm": 0.45231467485427856, "learning_rate": 9.727392818542093e-05, "loss": 0.9978, "step": 4050 }, { "epoch": 0.15132408575031525, "grad_norm": 0.41241922974586487, "learning_rate": 0.00010087666626636244, "loss": 1.0051, "step": 4200 }, { "epoch": 0.15672851738425508, "grad_norm": 0.5085678100585938, "learning_rate": 0.00010447940434730397, "loss": 0.9971, "step": 4350 }, { "epoch": 0.16213294901819492, "grad_norm": 0.4659586548805237, "learning_rate": 0.00010808214242824548, "loss": 1.0083, "step": 4500 }, { "epoch": 0.16753738065213475, "grad_norm": 0.330456018447876, "learning_rate": 0.00011168488050918699, "loss": 1.0013, "step": 4650 }, { "epoch": 0.17294181228607458, "grad_norm": 0.4083492159843445, "learning_rate": 0.00011528761859012851, "loss": 1.0107, "step": 4800 }, { "epoch": 0.1783462439200144, "grad_norm": 0.5598177909851074, "learning_rate": 0.00011889035667107002, "loss": 0.9992, "step": 4950 }, { "epoch": 0.18375067555395425, "grad_norm": 0.4554787576198578, "learning_rate": 0.00012249309475201154, "loss": 0.9972, "step": 5100 }, { "epoch": 0.18915510718789408, "grad_norm": 0.5599480271339417, "learning_rate": 0.00012609583283295305, "loss": 1.0017, "step": 5250 }, { "epoch": 0.1945595388218339, "grad_norm": 0.4103052318096161, "learning_rate": 0.00012969857091389456, "loss": 1.0075, "step": 5400 }, { "epoch": 0.19996397045577374, "grad_norm": 0.5033989548683167, "learning_rate": 0.0001333013089948361, "loss": 0.9998, "step": 5550 }, { "epoch": 0.20536840208971358, "grad_norm": 0.41184836626052856, "learning_rate": 0.0001369040470757776, "loss": 1.0116, "step": 5700 }, { "epoch": 0.2107728337236534, "grad_norm": 0.4604012370109558, "learning_rate": 0.0001405067851567191, "loss": 1.0144, "step": 5850 }, { "epoch": 0.21617726535759324, "grad_norm": 0.5769256949424744, "learning_rate": 0.00014410952323766062, "loss": 1.0142, "step": 6000 }, { "epoch": 0.22158169699153304, "grad_norm": 0.49323058128356934, "learning_rate": 0.00014771226131860213, "loss": 1.0224, "step": 6150 }, { "epoch": 0.22698612862547288, "grad_norm": 0.4065729081630707, "learning_rate": 0.00015131499939954367, "loss": 1.011, "step": 6300 }, { "epoch": 0.2323905602594127, "grad_norm": 0.4484567642211914, "learning_rate": 0.00015491773748048518, "loss": 1.0135, "step": 6450 }, { "epoch": 0.23779499189335254, "grad_norm": 0.5265558958053589, "learning_rate": 0.00015852047556142668, "loss": 1.0266, "step": 6600 }, { "epoch": 0.24319942352729237, "grad_norm": 0.43009766936302185, "learning_rate": 0.0001621232136423682, "loss": 1.025, "step": 6750 }, { "epoch": 0.2486038551612322, "grad_norm": 0.45328229665756226, "learning_rate": 0.0001657259517233097, "loss": 1.0256, "step": 6900 }, { "epoch": 0.25400828679517207, "grad_norm": 0.4880930781364441, "learning_rate": 0.00016932868980425124, "loss": 1.0268, "step": 7050 }, { "epoch": 0.2594127184291119, "grad_norm": 0.4783656597137451, "learning_rate": 0.00017293142788519277, "loss": 1.0281, "step": 7200 }, { "epoch": 0.2648171500630517, "grad_norm": 0.40857091546058655, "learning_rate": 0.00017653416596613428, "loss": 1.0436, "step": 7350 }, { "epoch": 0.2702215816969915, "grad_norm": 0.5468364953994751, "learning_rate": 0.0001801369040470758, "loss": 1.0431, "step": 7500 }, { "epoch": 0.27562601333093134, "grad_norm": 0.4680778384208679, "learning_rate": 0.0001837396421280173, "loss": 1.0449, "step": 7650 }, { "epoch": 0.2810304449648712, "grad_norm": 0.5532673001289368, "learning_rate": 0.00018734238020895884, "loss": 1.0453, "step": 7800 }, { "epoch": 0.286434876598811, "grad_norm": 0.5404918789863586, "learning_rate": 0.00019094511828990034, "loss": 1.0592, "step": 7950 }, { "epoch": 0.29183930823275084, "grad_norm": 0.5416702628135681, "learning_rate": 0.00019454785637084185, "loss": 1.0541, "step": 8100 }, { "epoch": 0.29724373986669067, "grad_norm": 0.5036255121231079, "learning_rate": 0.00019815059445178336, "loss": 1.0544, "step": 8250 }, { "epoch": 0.3026481715006305, "grad_norm": 0.564854621887207, "learning_rate": 0.00019999953171425823, "loss": 1.0528, "step": 8400 }, { "epoch": 0.30805260313457034, "grad_norm": 0.5236982107162476, "learning_rate": 0.00019999563009378472, "loss": 1.0595, "step": 8550 }, { "epoch": 0.31345703476851017, "grad_norm": 0.5642319917678833, "learning_rate": 0.00019998777428218277, "loss": 1.0733, "step": 8700 }, { "epoch": 0.31886146640245, "grad_norm": 0.5522397756576538, "learning_rate": 0.00019997596459009974, "loss": 1.0685, "step": 8850 }, { "epoch": 0.32426589803638983, "grad_norm": 0.5239744782447815, "learning_rate": 0.00019996020148453384, "loss": 1.068, "step": 9000 }, { "epoch": 0.32967032967032966, "grad_norm": 0.5960803627967834, "learning_rate": 0.00019994048558881562, "loss": 1.0681, "step": 9150 }, { "epoch": 0.3350747613042695, "grad_norm": 0.5771428942680359, "learning_rate": 0.00019991681768258336, "loss": 1.0649, "step": 9300 }, { "epoch": 0.34047919293820933, "grad_norm": 0.5502661466598511, "learning_rate": 0.00019988919870175223, "loss": 1.0632, "step": 9450 }, { "epoch": 0.34588362457214916, "grad_norm": 0.5481303930282593, "learning_rate": 0.0001998576297384772, "loss": 1.0604, "step": 9600 }, { "epoch": 0.351288056206089, "grad_norm": 0.520757257938385, "learning_rate": 0.00019982211204111, "loss": 1.0703, "step": 9750 }, { "epoch": 0.3566924878400288, "grad_norm": 0.5234895348548889, "learning_rate": 0.00019978264701414963, "loss": 1.0693, "step": 9900 }, { "epoch": 0.36209691947396866, "grad_norm": 0.669703483581543, "learning_rate": 0.0001997392362181869, "loss": 1.0706, "step": 10050 }, { "epoch": 0.3675013511079085, "grad_norm": 0.5472550392150879, "learning_rate": 0.00019969188136984267, "loss": 1.0743, "step": 10200 }, { "epoch": 0.3729057827418483, "grad_norm": 0.5862524509429932, "learning_rate": 0.00019964058434169995, "loss": 1.069, "step": 10350 }, { "epoch": 0.37831021437578816, "grad_norm": 0.5793502330780029, "learning_rate": 0.0001995853471622299, "loss": 1.0686, "step": 10500 }, { "epoch": 0.383714646009728, "grad_norm": 0.670881986618042, "learning_rate": 0.0001995261720157117, "loss": 1.0749, "step": 10650 }, { "epoch": 0.3891190776436678, "grad_norm": 0.698593258857727, "learning_rate": 0.00019946306124214594, "loss": 1.0678, "step": 10800 }, { "epoch": 0.39452350927760765, "grad_norm": 0.5866215229034424, "learning_rate": 0.00019939601733716232, "loss": 1.0605, "step": 10950 }, { "epoch": 0.3999279409115475, "grad_norm": 0.5571088790893555, "learning_rate": 0.0001993250429519208, "loss": 1.0732, "step": 11100 }, { "epoch": 0.4053323725454873, "grad_norm": 0.6108280420303345, "learning_rate": 0.0001992501408930069, "loss": 1.0717, "step": 11250 }, { "epoch": 0.41073680417942715, "grad_norm": 0.5834035873413086, "learning_rate": 0.00019917131412232057, "loss": 1.0767, "step": 11400 }, { "epoch": 0.416141235813367, "grad_norm": 0.6449561715126038, "learning_rate": 0.00019908856575695925, "loss": 1.0679, "step": 11550 }, { "epoch": 0.4215456674473068, "grad_norm": 0.6005063652992249, "learning_rate": 0.00019900189906909446, "loss": 1.0697, "step": 11700 }, { "epoch": 0.42695009908124665, "grad_norm": 0.48533475399017334, "learning_rate": 0.0001989113174858424, "loss": 1.0759, "step": 11850 }, { "epoch": 0.4323545307151865, "grad_norm": 0.6543179154396057, "learning_rate": 0.00019881682458912855, "loss": 1.068, "step": 12000 }, { "epoch": 0.43775896234912626, "grad_norm": 0.6233469843864441, "learning_rate": 0.00019871842411554598, "loss": 1.0665, "step": 12150 }, { "epoch": 0.4431633939830661, "grad_norm": 0.5530846118927002, "learning_rate": 0.0001986161199562074, "loss": 1.0759, "step": 12300 }, { "epoch": 0.4485678256170059, "grad_norm": 0.6484875679016113, "learning_rate": 0.00019850991615659173, "loss": 1.0799, "step": 12450 }, { "epoch": 0.45397225725094575, "grad_norm": 0.5916330814361572, "learning_rate": 0.00019839981691638364, "loss": 1.0732, "step": 12600 }, { "epoch": 0.4593766888848856, "grad_norm": 0.6168014407157898, "learning_rate": 0.00019828582658930777, "loss": 1.063, "step": 12750 }, { "epoch": 0.4647811205188254, "grad_norm": 0.7302340269088745, "learning_rate": 0.00019816794968295648, "loss": 1.0694, "step": 12900 }, { "epoch": 0.47018555215276525, "grad_norm": 0.7804449200630188, "learning_rate": 0.00019804619085861172, "loss": 1.0681, "step": 13050 }, { "epoch": 0.4755899837867051, "grad_norm": 0.690500020980835, "learning_rate": 0.00019792055493106042, "loss": 1.0662, "step": 13200 }, { "epoch": 0.4809944154206449, "grad_norm": 0.6514592170715332, "learning_rate": 0.00019779104686840445, "loss": 1.0682, "step": 13350 }, { "epoch": 0.48639884705458475, "grad_norm": 0.7182182669639587, "learning_rate": 0.00019765767179186393, "loss": 1.0761, "step": 13500 }, { "epoch": 0.4918032786885246, "grad_norm": 0.6194586157798767, "learning_rate": 0.00019752043497557473, "loss": 1.0637, "step": 13650 }, { "epoch": 0.4972077103224644, "grad_norm": 0.5965324640274048, "learning_rate": 0.00019737934184638006, "loss": 1.0658, "step": 13800 }, { "epoch": 0.5026121419564042, "grad_norm": 0.6684099435806274, "learning_rate": 0.0001972343979836157, "loss": 1.0788, "step": 13950 }, { "epoch": 0.5080165735903441, "grad_norm": 0.6042500734329224, "learning_rate": 0.00019708560911888947, "loss": 1.0748, "step": 14100 }, { "epoch": 0.5134210052242839, "grad_norm": 0.6769179701805115, "learning_rate": 0.0001969329811358546, "loss": 1.08, "step": 14250 }, { "epoch": 0.5188254368582238, "grad_norm": 0.6137043237686157, "learning_rate": 0.000196776520069977, "loss": 1.0752, "step": 14400 }, { "epoch": 0.5242298684921636, "grad_norm": 0.5905526280403137, "learning_rate": 0.00019661623210829657, "loss": 1.0711, "step": 14550 }, { "epoch": 0.5296343001261034, "grad_norm": 0.5724222660064697, "learning_rate": 0.00019645212358918273, "loss": 1.0665, "step": 14700 }, { "epoch": 0.5350387317600432, "grad_norm": 0.6485213041305542, "learning_rate": 0.00019628420100208354, "loss": 1.075, "step": 14850 }, { "epoch": 0.540443163393983, "grad_norm": 0.6828542351722717, "learning_rate": 0.00019611247098726917, "loss": 1.0742, "step": 15000 }, { "epoch": 0.5458475950279229, "grad_norm": 0.7089459300041199, "learning_rate": 0.00019593694033556944, "loss": 1.0717, "step": 15150 }, { "epoch": 0.5512520266618627, "grad_norm": 0.6180184483528137, "learning_rate": 0.00019575761598810508, "loss": 1.0701, "step": 15300 }, { "epoch": 0.5566564582958026, "grad_norm": 0.6298936605453491, "learning_rate": 0.00019557450503601345, "loss": 1.0693, "step": 15450 }, { "epoch": 0.5620608899297423, "grad_norm": 0.7352581024169922, "learning_rate": 0.00019538761472016796, "loss": 1.0773, "step": 15600 }, { "epoch": 0.5674653215636822, "grad_norm": 0.5634006857872009, "learning_rate": 0.00019519695243089188, "loss": 1.0747, "step": 15750 }, { "epoch": 0.572869753197622, "grad_norm": 0.6061451435089111, "learning_rate": 0.00019500252570766599, "loss": 1.0659, "step": 15900 }, { "epoch": 0.5782741848315619, "grad_norm": 0.7047978043556213, "learning_rate": 0.00019480434223883046, "loss": 1.0695, "step": 16050 }, { "epoch": 0.5836786164655017, "grad_norm": 0.7310365438461304, "learning_rate": 0.00019460240986128095, "loss": 1.074, "step": 16200 }, { "epoch": 0.5890830480994416, "grad_norm": 0.7517262697219849, "learning_rate": 0.00019439673656015857, "loss": 1.0675, "step": 16350 }, { "epoch": 0.5944874797333813, "grad_norm": 0.6441323757171631, "learning_rate": 0.00019418733046853412, "loss": 1.0832, "step": 16500 }, { "epoch": 0.5998919113673212, "grad_norm": 0.7108227014541626, "learning_rate": 0.00019397419986708658, "loss": 1.0702, "step": 16650 }, { "epoch": 0.605296343001261, "grad_norm": 0.7227650284767151, "learning_rate": 0.00019375735318377557, "loss": 1.0676, "step": 16800 }, { "epoch": 0.6107007746352009, "grad_norm": 0.7566308975219727, "learning_rate": 0.00019353679899350814, "loss": 1.076, "step": 16950 }, { "epoch": 0.6161052062691407, "grad_norm": 0.5554959177970886, "learning_rate": 0.00019331254601779959, "loss": 1.0758, "step": 17100 }, { "epoch": 0.6215096379030806, "grad_norm": 0.6587594747543335, "learning_rate": 0.0001930846031244287, "loss": 1.0671, "step": 17250 }, { "epoch": 0.6269140695370203, "grad_norm": 0.7100338339805603, "learning_rate": 0.0001928529793270871, "loss": 1.067, "step": 17400 }, { "epoch": 0.6323185011709602, "grad_norm": 0.6286484003067017, "learning_rate": 0.00019261768378502262, "loss": 1.0668, "step": 17550 }, { "epoch": 0.6377229328049, "grad_norm": 0.7707709670066833, "learning_rate": 0.00019237872580267734, "loss": 1.0672, "step": 17700 }, { "epoch": 0.6431273644388399, "grad_norm": 0.7858836054801941, "learning_rate": 0.00019213611482931953, "loss": 1.0736, "step": 17850 }, { "epoch": 0.6485317960727797, "grad_norm": 0.6796938180923462, "learning_rate": 0.00019188986045866997, "loss": 1.0759, "step": 18000 }, { "epoch": 0.6539362277067196, "grad_norm": 0.6615278124809265, "learning_rate": 0.0001916399724285227, "loss": 1.0713, "step": 18150 }, { "epoch": 0.6593406593406593, "grad_norm": 0.6353105306625366, "learning_rate": 0.00019138646062035982, "loss": 1.0769, "step": 18300 }, { "epoch": 0.6647450909745992, "grad_norm": 0.6170017123222351, "learning_rate": 0.0001911293350589609, "loss": 1.07, "step": 18450 }, { "epoch": 0.670149522608539, "grad_norm": 0.6368488073348999, "learning_rate": 0.00019086860591200632, "loss": 1.0774, "step": 18600 }, { "epoch": 0.6755539542424789, "grad_norm": 0.5853469371795654, "learning_rate": 0.00019060428348967548, "loss": 1.0732, "step": 18750 }, { "epoch": 0.6809583858764187, "grad_norm": 0.7817432880401611, "learning_rate": 0.00019033637824423884, "loss": 1.0732, "step": 18900 }, { "epoch": 0.6863628175103585, "grad_norm": 0.6566998362541199, "learning_rate": 0.00019006490076964487, "loss": 1.0671, "step": 19050 }, { "epoch": 0.6917672491442983, "grad_norm": 0.5824844837188721, "learning_rate": 0.00018978986180110088, "loss": 1.0656, "step": 19200 }, { "epoch": 0.6971716807782381, "grad_norm": 0.5842050909996033, "learning_rate": 0.0001895112722146486, "loss": 1.0646, "step": 19350 }, { "epoch": 0.702576112412178, "grad_norm": 0.6520604491233826, "learning_rate": 0.00018922914302673421, "loss": 1.0745, "step": 19500 }, { "epoch": 0.7079805440461178, "grad_norm": 0.648113489151001, "learning_rate": 0.0001889434853937725, "loss": 1.0711, "step": 19650 }, { "epoch": 0.7133849756800577, "grad_norm": 1.0153329372406006, "learning_rate": 0.00018865431061170588, "loss": 1.0643, "step": 19800 }, { "epoch": 0.7187894073139974, "grad_norm": 0.6522130370140076, "learning_rate": 0.00018836163011555764, "loss": 1.0629, "step": 19950 }, { "epoch": 0.7241938389479373, "grad_norm": 0.6235710978507996, "learning_rate": 0.0001880654554789798, "loss": 1.0637, "step": 20100 }, { "epoch": 0.7295982705818771, "grad_norm": 0.6486189365386963, "learning_rate": 0.00018776579841379528, "loss": 1.0679, "step": 20250 }, { "epoch": 0.735002702215817, "grad_norm": 0.7326012849807739, "learning_rate": 0.00018746267076953505, "loss": 1.0624, "step": 20400 }, { "epoch": 0.7404071338497568, "grad_norm": 0.7451658248901367, "learning_rate": 0.00018715608453296926, "loss": 1.0799, "step": 20550 }, { "epoch": 0.7458115654836966, "grad_norm": 0.5677480101585388, "learning_rate": 0.00018684605182763355, "loss": 1.0665, "step": 20700 }, { "epoch": 0.7512159971176364, "grad_norm": 0.6265568137168884, "learning_rate": 0.00018653258491334933, "loss": 1.0562, "step": 20850 }, { "epoch": 0.7566204287515763, "grad_norm": 0.5560349225997925, "learning_rate": 0.0001862156961857392, "loss": 1.0696, "step": 21000 }, { "epoch": 0.7620248603855161, "grad_norm": 0.7811048626899719, "learning_rate": 0.0001858953981757367, "loss": 1.0713, "step": 21150 }, { "epoch": 0.767429292019456, "grad_norm": 0.8111995458602905, "learning_rate": 0.00018557170354909088, "loss": 1.0641, "step": 21300 }, { "epoch": 0.7728337236533958, "grad_norm": 0.6084979176521301, "learning_rate": 0.0001852446251058652, "loss": 1.0609, "step": 21450 }, { "epoch": 0.7782381552873356, "grad_norm": 0.6472198963165283, "learning_rate": 0.0001849141757799317, "loss": 1.0659, "step": 21600 }, { "epoch": 0.7836425869212754, "grad_norm": 0.6767707467079163, "learning_rate": 0.00018458036863845933, "loss": 1.0687, "step": 21750 }, { "epoch": 0.7890470185552153, "grad_norm": 0.6994395852088928, "learning_rate": 0.00018424321688139729, "loss": 1.0634, "step": 21900 }, { "epoch": 0.7944514501891551, "grad_norm": 0.6968779563903809, "learning_rate": 0.000183902733840953, "loss": 1.0552, "step": 22050 }, { "epoch": 0.799855881823095, "grad_norm": 0.6974983215332031, "learning_rate": 0.0001835589329810651, "loss": 1.0722, "step": 22200 }, { "epoch": 0.8052603134570347, "grad_norm": 0.6921077966690063, "learning_rate": 0.00018321182789687068, "loss": 1.0557, "step": 22350 }, { "epoch": 0.8106647450909746, "grad_norm": 0.6887233257293701, "learning_rate": 0.00018286143231416806, "loss": 1.0633, "step": 22500 }, { "epoch": 0.8160691767249144, "grad_norm": 0.6151506900787354, "learning_rate": 0.00018250776008887375, "loss": 1.0694, "step": 22650 }, { "epoch": 0.8214736083588543, "grad_norm": 0.682551383972168, "learning_rate": 0.00018215082520647467, "loss": 1.0677, "step": 22800 }, { "epoch": 0.8268780399927941, "grad_norm": 0.6813539862632751, "learning_rate": 0.00018179064178147506, "loss": 1.0628, "step": 22950 }, { "epoch": 0.832282471626734, "grad_norm": 0.583910346031189, "learning_rate": 0.00018142722405683839, "loss": 1.0605, "step": 23100 }, { "epoch": 0.8376869032606737, "grad_norm": 0.6265426278114319, "learning_rate": 0.000181060586403424, "loss": 1.0709, "step": 23250 }, { "epoch": 0.8430913348946136, "grad_norm": 0.5985749959945679, "learning_rate": 0.0001806907433194191, "loss": 1.0521, "step": 23400 }, { "epoch": 0.8484957665285534, "grad_norm": 0.6286662220954895, "learning_rate": 0.00018031770942976514, "loss": 1.0648, "step": 23550 }, { "epoch": 0.8539001981624933, "grad_norm": 0.6208794713020325, "learning_rate": 0.00017994149948557975, "loss": 1.0565, "step": 23700 }, { "epoch": 0.8593046297964331, "grad_norm": 0.7522740960121155, "learning_rate": 0.00017956212836357324, "loss": 1.0583, "step": 23850 }, { "epoch": 0.864709061430373, "grad_norm": 0.791959285736084, "learning_rate": 0.0001791796110654604, "loss": 1.0663, "step": 24000 }, { "epoch": 0.8701134930643127, "grad_norm": 0.5950735211372375, "learning_rate": 0.0001787939627173673, "loss": 1.0652, "step": 24150 }, { "epoch": 0.8755179246982525, "grad_norm": 0.6595513820648193, "learning_rate": 0.0001784051985692332, "loss": 1.051, "step": 24300 }, { "epoch": 0.8809223563321924, "grad_norm": 0.6468363404273987, "learning_rate": 0.00017801333399420724, "loss": 1.0465, "step": 24450 }, { "epoch": 0.8863267879661322, "grad_norm": 3.451094150543213, "learning_rate": 0.0001776183844880409, "loss": 1.0534, "step": 24600 }, { "epoch": 0.8917312196000721, "grad_norm": 0.6846780180931091, "learning_rate": 0.00017722036566847495, "loss": 1.0554, "step": 24750 }, { "epoch": 0.8971356512340118, "grad_norm": 0.7100343704223633, "learning_rate": 0.00017681929327462205, "loss": 1.0524, "step": 24900 }, { "epoch": 0.9025400828679517, "grad_norm": 0.5465316772460938, "learning_rate": 0.00017641518316634426, "loss": 1.046, "step": 25050 }, { "epoch": 0.9079445145018915, "grad_norm": 0.7278814911842346, "learning_rate": 0.000176008051323626, "loss": 1.0543, "step": 25200 }, { "epoch": 0.9133489461358314, "grad_norm": 0.6412672996520996, "learning_rate": 0.00017559791384594192, "loss": 1.0477, "step": 25350 }, { "epoch": 0.9187533777697712, "grad_norm": 0.6557443141937256, "learning_rate": 0.00017518478695162056, "loss": 1.0638, "step": 25500 }, { "epoch": 0.9241578094037111, "grad_norm": 0.7106101512908936, "learning_rate": 0.00017476868697720278, "loss": 1.0588, "step": 25650 }, { "epoch": 0.9295622410376508, "grad_norm": 0.6246557235717773, "learning_rate": 0.00017434963037679592, "loss": 1.054, "step": 25800 }, { "epoch": 0.9349666726715907, "grad_norm": 0.6114718914031982, "learning_rate": 0.000173927633721423, "loss": 1.0504, "step": 25950 }, { "epoch": 0.9403711043055305, "grad_norm": 0.7704567909240723, "learning_rate": 0.0001735027136983676, "loss": 1.0537, "step": 26100 }, { "epoch": 0.9457755359394704, "grad_norm": 0.6341020464897156, "learning_rate": 0.0001730748871105138, "loss": 1.0493, "step": 26250 }, { "epoch": 0.9511799675734102, "grad_norm": 0.5861644148826599, "learning_rate": 0.00017264417087568189, "loss": 1.052, "step": 26400 }, { "epoch": 0.9565843992073501, "grad_norm": 0.5983610153198242, "learning_rate": 0.00017221058202595928, "loss": 1.052, "step": 26550 }, { "epoch": 0.9619888308412898, "grad_norm": 0.6839273571968079, "learning_rate": 0.0001717741377070271, "loss": 1.0632, "step": 26700 }, { "epoch": 0.9673932624752297, "grad_norm": 0.7345322966575623, "learning_rate": 0.000171334855177482, "loss": 1.0416, "step": 26850 }, { "epoch": 0.9727976941091695, "grad_norm": 0.6669878363609314, "learning_rate": 0.00017089275180815394, "loss": 1.0499, "step": 27000 }, { "epoch": 0.9782021257431094, "grad_norm": 0.5807615518569946, "learning_rate": 0.0001704478450814191, "loss": 1.0469, "step": 27150 }, { "epoch": 0.9836065573770492, "grad_norm": 0.6089076399803162, "learning_rate": 0.00017000015259050855, "loss": 1.0403, "step": 27300 }, { "epoch": 0.989010989010989, "grad_norm": 0.6615424156188965, "learning_rate": 0.00016954969203881272, "loss": 1.0492, "step": 27450 }, { "epoch": 0.9944154206449288, "grad_norm": 0.660163164138794, "learning_rate": 0.00016909648123918116, "loss": 1.0543, "step": 27600 }, { "epoch": 0.9998198522788687, "grad_norm": 0.631686806678772, "learning_rate": 0.0001686405381132183, "loss": 1.0474, "step": 27750 }, { "epoch": 1.0052242839128085, "grad_norm": 0.7013711333274841, "learning_rate": 0.00016818188069057458, "loss": 0.9965, "step": 27900 }, { "epoch": 1.0106287155467484, "grad_norm": 0.76506507396698, "learning_rate": 0.00016772052710823374, "loss": 0.9981, "step": 28050 }, { "epoch": 1.0160331471806883, "grad_norm": 0.8097601532936096, "learning_rate": 0.00016725649560979546, "loss": 0.9995, "step": 28200 }, { "epoch": 1.021437578814628, "grad_norm": 0.795626163482666, "learning_rate": 0.00016678980454475385, "loss": 0.9983, "step": 28350 }, { "epoch": 1.0268420104485678, "grad_norm": 0.6494497060775757, "learning_rate": 0.00016632047236777214, "loss": 1.0075, "step": 28500 }, { "epoch": 1.0322464420825077, "grad_norm": 0.7171606421470642, "learning_rate": 0.00016584851763795262, "loss": 0.9972, "step": 28650 }, { "epoch": 1.0376508737164474, "grad_norm": 0.604192316532135, "learning_rate": 0.00016537395901810288, "loss": 0.9943, "step": 28800 }, { "epoch": 1.0430553053503873, "grad_norm": 0.6858931183815002, "learning_rate": 0.0001648968152739978, "loss": 1.0092, "step": 28950 }, { "epoch": 1.0484597369843272, "grad_norm": 0.685265839099884, "learning_rate": 0.00016441710527363753, "loss": 0.9936, "step": 29100 }, { "epoch": 1.053864168618267, "grad_norm": 0.6720730066299438, "learning_rate": 0.00016393484798650132, "loss": 0.993, "step": 29250 }, { "epoch": 1.0592686002522067, "grad_norm": 0.7085748314857483, "learning_rate": 0.0001634500624827973, "loss": 1.0083, "step": 29400 }, { "epoch": 1.0646730318861466, "grad_norm": 0.6460698843002319, "learning_rate": 0.00016296276793270864, "loss": 0.9952, "step": 29550 }, { "epoch": 1.0700774635200865, "grad_norm": 0.6689881086349487, "learning_rate": 0.0001624729836056352, "loss": 0.9958, "step": 29700 }, { "epoch": 1.0754818951540264, "grad_norm": 0.7271780967712402, "learning_rate": 0.00016198072886943181, "loss": 0.9954, "step": 29850 }, { "epoch": 1.080886326787966, "grad_norm": 0.5559628009796143, "learning_rate": 0.0001614860231896422, "loss": 0.9984, "step": 30000 }, { "epoch": 1.086290758421906, "grad_norm": 0.6752548813819885, "learning_rate": 0.0001609888861287293, "loss": 1.0019, "step": 30150 }, { "epoch": 1.0916951900558458, "grad_norm": 0.7046670913696289, "learning_rate": 0.0001604893373453017, "loss": 0.9936, "step": 30300 }, { "epoch": 1.0970996216897857, "grad_norm": 0.6102576851844788, "learning_rate": 0.00015998739659333638, "loss": 1.0061, "step": 30450 }, { "epoch": 1.1025040533237254, "grad_norm": 0.7669439911842346, "learning_rate": 0.00015948308372139739, "loss": 1.0017, "step": 30600 }, { "epoch": 1.1079084849576653, "grad_norm": 0.7437514662742615, "learning_rate": 0.00015897641867185092, "loss": 0.9947, "step": 30750 }, { "epoch": 1.1133129165916051, "grad_norm": 0.7851073741912842, "learning_rate": 0.0001584674214800771, "loss": 1.0026, "step": 30900 }, { "epoch": 1.118717348225545, "grad_norm": 0.7046276926994324, "learning_rate": 0.0001579561122736772, "loss": 0.9893, "step": 31050 }, { "epoch": 1.1241217798594847, "grad_norm": 0.8143602013587952, "learning_rate": 0.000157442511271678, "loss": 1.0013, "step": 31200 }, { "epoch": 1.1295262114934246, "grad_norm": 1.2338451147079468, "learning_rate": 0.0001569266387837324, "loss": 1.002, "step": 31350 }, { "epoch": 1.1349306431273645, "grad_norm": 0.7588093876838684, "learning_rate": 0.00015640851520931588, "loss": 1.0064, "step": 31500 }, { "epoch": 1.1403350747613044, "grad_norm": 0.7656028270721436, "learning_rate": 0.00015588816103692023, "loss": 0.9963, "step": 31650 }, { "epoch": 1.145739506395244, "grad_norm": 0.82599937915802, "learning_rate": 0.00015536559684324315, "loss": 0.9961, "step": 31800 }, { "epoch": 1.151143938029184, "grad_norm": 0.6491279006004333, "learning_rate": 0.0001548408432923746, "loss": 0.9946, "step": 31950 }, { "epoch": 1.1565483696631238, "grad_norm": 0.49154847860336304, "learning_rate": 0.00015431392113497979, "loss": 1.0035, "step": 32100 }, { "epoch": 1.1619528012970637, "grad_norm": 0.5830157399177551, "learning_rate": 0.00015378485120747835, "loss": 0.9978, "step": 32250 }, { "epoch": 1.1673572329310034, "grad_norm": 0.6672685146331787, "learning_rate": 0.00015325365443122078, "loss": 1.0079, "step": 32400 }, { "epoch": 1.1727616645649432, "grad_norm": 0.7243463397026062, "learning_rate": 0.00015272035181166066, "loss": 1.0023, "step": 32550 }, { "epoch": 1.1781660961988831, "grad_norm": 0.6492652893066406, "learning_rate": 0.00015218496443752456, "loss": 0.9972, "step": 32700 }, { "epoch": 1.1835705278328228, "grad_norm": 0.6047407388687134, "learning_rate": 0.00015164751347997762, "loss": 0.9864, "step": 32850 }, { "epoch": 1.1889749594667627, "grad_norm": 0.6448661088943481, "learning_rate": 0.00015110802019178661, "loss": 1.0046, "step": 33000 }, { "epoch": 1.1943793911007026, "grad_norm": 0.7006458044052124, "learning_rate": 0.0001505665059064796, "loss": 1.0018, "step": 33150 }, { "epoch": 1.1997838227346425, "grad_norm": 0.6918825507164001, "learning_rate": 0.00015002299203750212, "loss": 0.991, "step": 33300 }, { "epoch": 1.2051882543685823, "grad_norm": 0.6090679168701172, "learning_rate": 0.00014947750007737062, "loss": 0.9939, "step": 33450 }, { "epoch": 1.210592686002522, "grad_norm": 0.718387246131897, "learning_rate": 0.00014893005159682233, "loss": 0.9873, "step": 33600 }, { "epoch": 1.215997117636462, "grad_norm": 0.6664546132087708, "learning_rate": 0.00014838066824396256, "loss": 0.9926, "step": 33750 }, { "epoch": 1.2214015492704018, "grad_norm": 0.6758761405944824, "learning_rate": 0.00014782937174340845, "loss": 0.9924, "step": 33900 }, { "epoch": 1.2268059809043415, "grad_norm": 0.5241803526878357, "learning_rate": 0.00014727618389542995, "loss": 0.9935, "step": 34050 }, { "epoch": 1.2322104125382813, "grad_norm": 0.6897122859954834, "learning_rate": 0.00014672112657508778, "loss": 0.9859, "step": 34200 }, { "epoch": 1.2376148441722212, "grad_norm": 0.6511486172676086, "learning_rate": 0.00014616422173136846, "loss": 0.9905, "step": 34350 }, { "epoch": 1.2430192758061611, "grad_norm": 0.8631020784378052, "learning_rate": 0.00014560549138631617, "loss": 0.9996, "step": 34500 }, { "epoch": 1.248423707440101, "grad_norm": 0.5925600528717041, "learning_rate": 0.00014504495763416225, "loss": 0.9961, "step": 34650 }, { "epoch": 1.2538281390740407, "grad_norm": 0.6121050715446472, "learning_rate": 0.00014448264264045114, "loss": 1.0039, "step": 34800 }, { "epoch": 1.2592325707079806, "grad_norm": 0.628056526184082, "learning_rate": 0.00014391856864116414, "loss": 1.0004, "step": 34950 }, { "epoch": 1.2646370023419204, "grad_norm": 0.6576303243637085, "learning_rate": 0.00014335275794184003, "loss": 0.9978, "step": 35100 }, { "epoch": 1.2700414339758601, "grad_norm": 0.5684065222740173, "learning_rate": 0.00014278523291669302, "loss": 0.9874, "step": 35250 }, { "epoch": 1.2754458656098, "grad_norm": 0.8131369352340698, "learning_rate": 0.000142216016007728, "loss": 1.0006, "step": 35400 }, { "epoch": 1.2808502972437399, "grad_norm": 0.6513379216194153, "learning_rate": 0.00014164512972385306, "loss": 0.9817, "step": 35550 }, { "epoch": 1.2862547288776798, "grad_norm": 0.6244243383407593, "learning_rate": 0.0001410725966399896, "loss": 0.9805, "step": 35700 }, { "epoch": 1.2916591605116197, "grad_norm": 0.760666012763977, "learning_rate": 0.00014049843939617924, "loss": 0.9889, "step": 35850 }, { "epoch": 1.2970635921455593, "grad_norm": 0.7188459634780884, "learning_rate": 0.00013992268069668904, "loss": 0.9895, "step": 36000 }, { "epoch": 1.3024680237794992, "grad_norm": 0.6034685969352722, "learning_rate": 0.0001393453433091133, "loss": 0.9882, "step": 36150 }, { "epoch": 1.307872455413439, "grad_norm": 0.6076464653015137, "learning_rate": 0.0001387664500634734, "loss": 0.9823, "step": 36300 }, { "epoch": 1.3132768870473788, "grad_norm": 0.6652275323867798, "learning_rate": 0.00013818602385131512, "loss": 0.9784, "step": 36450 }, { "epoch": 1.3186813186813187, "grad_norm": 0.6014280319213867, "learning_rate": 0.00013760408762480316, "loss": 0.9812, "step": 36600 }, { "epoch": 1.3240857503152585, "grad_norm": 0.6998510360717773, "learning_rate": 0.00013702066439581382, "loss": 0.9886, "step": 36750 }, { "epoch": 1.3294901819491982, "grad_norm": 0.5891895294189453, "learning_rate": 0.00013643577723502476, "loss": 0.9873, "step": 36900 }, { "epoch": 1.334894613583138, "grad_norm": 0.7246126532554626, "learning_rate": 0.00013584944927100298, "loss": 0.9859, "step": 37050 }, { "epoch": 1.340299045217078, "grad_norm": 0.664380669593811, "learning_rate": 0.00013526170368928993, "loss": 0.9793, "step": 37200 }, { "epoch": 1.3457034768510179, "grad_norm": 0.6437602639198303, "learning_rate": 0.00013467256373148496, "loss": 0.9853, "step": 37350 }, { "epoch": 1.3511079084849578, "grad_norm": 0.6728150844573975, "learning_rate": 0.000134082052694326, "loss": 0.9792, "step": 37500 }, { "epoch": 1.3565123401188974, "grad_norm": 0.8101018071174622, "learning_rate": 0.00013349019392876858, "loss": 0.9791, "step": 37650 }, { "epoch": 1.3619167717528373, "grad_norm": 0.6081525683403015, "learning_rate": 0.00013289701083906214, "loss": 0.9825, "step": 37800 }, { "epoch": 1.3673212033867772, "grad_norm": 0.6776862740516663, "learning_rate": 0.00013230252688182497, "loss": 0.9693, "step": 37950 }, { "epoch": 1.3727256350207169, "grad_norm": 0.6200093030929565, "learning_rate": 0.0001317067655651161, "loss": 0.9677, "step": 38100 }, { "epoch": 1.3781300666546568, "grad_norm": 0.7349710464477539, "learning_rate": 0.00013110975044750621, "loss": 0.9714, "step": 38250 }, { "epoch": 1.3835344982885966, "grad_norm": 0.5907526612281799, "learning_rate": 0.0001305115051371458, "loss": 0.9779, "step": 38400 }, { "epoch": 1.3889389299225365, "grad_norm": 0.6219062805175781, "learning_rate": 0.0001299120532908316, "loss": 0.9647, "step": 38550 }, { "epoch": 1.3943433615564764, "grad_norm": 0.777947723865509, "learning_rate": 0.0001293114186130712, "loss": 0.97, "step": 38700 }, { "epoch": 1.399747793190416, "grad_norm": 0.686892569065094, "learning_rate": 0.00012870962485514567, "loss": 0.9683, "step": 38850 }, { "epoch": 1.405152224824356, "grad_norm": 0.6655575633049011, "learning_rate": 0.00012810669581417032, "loss": 0.9674, "step": 39000 }, { "epoch": 1.4105566564582959, "grad_norm": 0.679595947265625, "learning_rate": 0.0001275026553321536, "loss": 0.9725, "step": 39150 }, { "epoch": 1.4159610880922355, "grad_norm": 0.6671122312545776, "learning_rate": 0.00012689752729505457, "loss": 0.9677, "step": 39300 }, { "epoch": 1.4213655197261754, "grad_norm": 0.6357312202453613, "learning_rate": 0.00012629133563183797, "loss": 0.9651, "step": 39450 }, { "epoch": 1.4267699513601153, "grad_norm": 0.7441504001617432, "learning_rate": 0.0001256841043135283, "loss": 0.9704, "step": 39600 }, { "epoch": 1.4321743829940552, "grad_norm": 0.5487176179885864, "learning_rate": 0.00012507585735226185, "loss": 0.9714, "step": 39750 }, { "epoch": 1.437578814627995, "grad_norm": 0.6709308624267578, "learning_rate": 0.00012446661880033698, "loss": 0.9587, "step": 39900 }, { "epoch": 1.4429832462619347, "grad_norm": 0.638081431388855, "learning_rate": 0.00012385641274926328, "loss": 0.9631, "step": 40050 }, { "epoch": 1.4483876778958746, "grad_norm": 0.6448566913604736, "learning_rate": 0.00012324526332880867, "loss": 0.9634, "step": 40200 }, { "epoch": 1.4537921095298145, "grad_norm": 0.7188845872879028, "learning_rate": 0.0001226331947060455, "loss": 0.9669, "step": 40350 }, { "epoch": 1.4591965411637542, "grad_norm": 0.5700541138648987, "learning_rate": 0.00012202023108439455, "loss": 0.9598, "step": 40500 }, { "epoch": 1.464600972797694, "grad_norm": 0.6200810670852661, "learning_rate": 0.0001214063967026682, "loss": 0.9651, "step": 40650 }, { "epoch": 1.470005404431634, "grad_norm": 0.6882332563400269, "learning_rate": 0.00012079171583411184, "loss": 0.9649, "step": 40800 }, { "epoch": 1.4754098360655736, "grad_norm": 0.6133975982666016, "learning_rate": 0.00012017621278544402, "loss": 0.9495, "step": 40950 }, { "epoch": 1.4808142676995135, "grad_norm": 0.8365902304649353, "learning_rate": 0.00011955991189589526, "loss": 0.95, "step": 41100 }, { "epoch": 1.4862186993334534, "grad_norm": 0.5351865887641907, "learning_rate": 0.0001189428375362457, "loss": 0.9579, "step": 41250 }, { "epoch": 1.4916231309673933, "grad_norm": 0.6488143801689148, "learning_rate": 0.00011832501410786116, "loss": 0.9513, "step": 41400 }, { "epoch": 1.4970275626013332, "grad_norm": 0.6101202964782715, "learning_rate": 0.0001177064660417285, "loss": 0.9573, "step": 41550 }, { "epoch": 1.5024319942352728, "grad_norm": 0.7013749480247498, "learning_rate": 0.00011708721779748933, "loss": 0.9508, "step": 41700 }, { "epoch": 1.5078364258692127, "grad_norm": 0.5707131028175354, "learning_rate": 0.00011646729386247286, "loss": 0.9486, "step": 41850 }, { "epoch": 1.5132408575031526, "grad_norm": 0.6973045468330383, "learning_rate": 0.00011584671875072757, "loss": 0.962, "step": 42000 }, { "epoch": 1.5186452891370923, "grad_norm": 0.6686086654663086, "learning_rate": 0.00011522551700205184, "loss": 0.9606, "step": 42150 }, { "epoch": 1.5240497207710324, "grad_norm": 0.5340304970741272, "learning_rate": 0.00011460371318102358, "loss": 0.9584, "step": 42300 }, { "epoch": 1.529454152404972, "grad_norm": 0.6170547008514404, "learning_rate": 0.00011398133187602873, "loss": 0.947, "step": 42450 }, { "epoch": 1.534858584038912, "grad_norm": 0.5485740900039673, "learning_rate": 0.00011335839769828924, "loss": 0.961, "step": 42600 }, { "epoch": 1.5402630156728518, "grad_norm": 0.6151200532913208, "learning_rate": 0.00011273493528088945, "loss": 0.9531, "step": 42750 }, { "epoch": 1.5456674473067915, "grad_norm": 0.6902984976768494, "learning_rate": 0.00011211096927780236, "loss": 0.9418, "step": 42900 }, { "epoch": 1.5510718789407314, "grad_norm": 0.7150260806083679, "learning_rate": 0.00011148652436291451, "loss": 0.948, "step": 43050 }, { "epoch": 1.5564763105746713, "grad_norm": 0.6931044459342957, "learning_rate": 0.0001108616252290504, "loss": 0.9571, "step": 43200 }, { "epoch": 1.561880742208611, "grad_norm": 0.641190230846405, "learning_rate": 0.00011023629658699596, "loss": 0.9412, "step": 43350 }, { "epoch": 1.5672851738425508, "grad_norm": 0.6901960968971252, "learning_rate": 0.00010961056316452145, "loss": 0.954, "step": 43500 }, { "epoch": 1.5726896054764907, "grad_norm": 0.6115658283233643, "learning_rate": 0.00010898444970540372, "loss": 0.952, "step": 43650 }, { "epoch": 1.5780940371104304, "grad_norm": 0.7072962522506714, "learning_rate": 0.00010835798096844743, "loss": 0.9484, "step": 43800 }, { "epoch": 1.5834984687443705, "grad_norm": 0.5898342728614807, "learning_rate": 0.00010773118172650643, "loss": 0.9421, "step": 43950 }, { "epoch": 1.5889029003783102, "grad_norm": 0.503633439540863, "learning_rate": 0.00010710407676550382, "loss": 0.935, "step": 44100 }, { "epoch": 1.59430733201225, "grad_norm": 0.5756278038024902, "learning_rate": 0.00010647669088345204, "loss": 0.9514, "step": 44250 }, { "epoch": 1.59971176364619, "grad_norm": 0.6327024102210999, "learning_rate": 0.00010584904888947204, "loss": 0.9398, "step": 44400 }, { "epoch": 1.6051161952801296, "grad_norm": 0.6922555565834045, "learning_rate": 0.00010522117560281251, "loss": 0.9411, "step": 44550 }, { "epoch": 1.6105206269140695, "grad_norm": 0.7153000235557556, "learning_rate": 0.00010459309585186818, "loss": 0.9437, "step": 44700 }, { "epoch": 1.6159250585480094, "grad_norm": 0.7171802520751953, "learning_rate": 0.0001039648344731982, "loss": 0.9305, "step": 44850 }, { "epoch": 1.621329490181949, "grad_norm": 0.5943671464920044, "learning_rate": 0.00010333641631054391, "loss": 0.938, "step": 45000 }, { "epoch": 1.6267339218158892, "grad_norm": 0.7467085123062134, "learning_rate": 0.00010270786621384645, "loss": 0.9416, "step": 45150 }, { "epoch": 1.6321383534498288, "grad_norm": 0.6827779412269592, "learning_rate": 0.00010207920903826415, "loss": 0.9381, "step": 45300 }, { "epoch": 1.6375427850837687, "grad_norm": 0.6708967089653015, "learning_rate": 0.00010145046964318963, "loss": 0.9495, "step": 45450 }, { "epoch": 1.6429472167177086, "grad_norm": 0.6415010094642639, "learning_rate": 0.00010082167289126672, "loss": 0.9312, "step": 45600 }, { "epoch": 1.6483516483516483, "grad_norm": 0.695865273475647, "learning_rate": 0.00010019284364740731, "loss": 0.9309, "step": 45750 }, { "epoch": 1.6537560799855882, "grad_norm": 0.6317395567893982, "learning_rate": 9.956400677780833e-05, "loss": 0.941, "step": 45900 }, { "epoch": 1.659160511619528, "grad_norm": 0.6181449294090271, "learning_rate": 9.893518714896805e-05, "loss": 0.9295, "step": 46050 }, { "epoch": 1.6645649432534677, "grad_norm": 0.5777118802070618, "learning_rate": 9.830640962670306e-05, "loss": 0.9264, "step": 46200 }, { "epoch": 1.6699693748874078, "grad_norm": 0.6352208852767944, "learning_rate": 9.767769907516495e-05, "loss": 0.9311, "step": 46350 }, { "epoch": 1.6753738065213475, "grad_norm": 0.6197606325149536, "learning_rate": 9.704908035585692e-05, "loss": 0.9302, "step": 46500 }, { "epoch": 1.6807782381552874, "grad_norm": 0.6172420382499695, "learning_rate": 9.642057832665095e-05, "loss": 0.9253, "step": 46650 }, { "epoch": 1.6861826697892273, "grad_norm": 0.6538959741592407, "learning_rate": 9.579221784080455e-05, "loss": 0.9376, "step": 46800 }, { "epoch": 1.691587101423167, "grad_norm": 0.6067585945129395, "learning_rate": 9.516402374597812e-05, "loss": 0.927, "step": 46950 }, { "epoch": 1.6969915330571068, "grad_norm": 0.5777443647384644, "learning_rate": 9.453602088325234e-05, "loss": 0.9289, "step": 47100 }, { "epoch": 1.7023959646910467, "grad_norm": 0.5103596448898315, "learning_rate": 9.390823408614598e-05, "loss": 0.9137, "step": 47250 }, { "epoch": 1.7078003963249864, "grad_norm": 0.624183714389801, "learning_rate": 9.328068817963359e-05, "loss": 0.9236, "step": 47400 }, { "epoch": 1.7132048279589265, "grad_norm": 0.5513512492179871, "learning_rate": 9.265340797916421e-05, "loss": 0.918, "step": 47550 }, { "epoch": 1.7186092595928661, "grad_norm": 0.7002034187316895, "learning_rate": 9.202641828967985e-05, "loss": 0.9149, "step": 47700 }, { "epoch": 1.724013691226806, "grad_norm": 0.5479480028152466, "learning_rate": 9.139974390463459e-05, "loss": 0.9265, "step": 47850 }, { "epoch": 1.729418122860746, "grad_norm": 0.570182204246521, "learning_rate": 9.077340960501425e-05, "loss": 0.9079, "step": 48000 }, { "epoch": 1.7348225544946856, "grad_norm": 0.6392347812652588, "learning_rate": 9.014744015835656e-05, "loss": 0.911, "step": 48150 }, { "epoch": 1.7402269861286255, "grad_norm": 0.6063001751899719, "learning_rate": 8.952186031777144e-05, "loss": 0.9113, "step": 48300 }, { "epoch": 1.7456314177625654, "grad_norm": 0.6585242748260498, "learning_rate": 8.88966948209625e-05, "loss": 0.9137, "step": 48450 }, { "epoch": 1.751035849396505, "grad_norm": 0.5171977281570435, "learning_rate": 8.827196838924867e-05, "loss": 0.9211, "step": 48600 }, { "epoch": 1.756440281030445, "grad_norm": 0.6493880152702332, "learning_rate": 8.764770572658655e-05, "loss": 0.9056, "step": 48750 }, { "epoch": 1.7618447126643848, "grad_norm": 0.8104442954063416, "learning_rate": 8.70239315185938e-05, "loss": 0.9045, "step": 48900 }, { "epoch": 1.7672491442983245, "grad_norm": 0.5967045426368713, "learning_rate": 8.64006704315727e-05, "loss": 0.9164, "step": 49050 }, { "epoch": 1.7726535759322646, "grad_norm": 0.6888705492019653, "learning_rate": 8.577794711153479e-05, "loss": 0.9111, "step": 49200 }, { "epoch": 1.7780580075662042, "grad_norm": 0.5948097705841064, "learning_rate": 8.515578618322648e-05, "loss": 0.9095, "step": 49350 }, { "epoch": 1.7834624392001441, "grad_norm": 0.6458430886268616, "learning_rate": 8.453421224915511e-05, "loss": 0.9029, "step": 49500 }, { "epoch": 1.788866870834084, "grad_norm": 0.8202154040336609, "learning_rate": 8.391324988861611e-05, "loss": 0.9168, "step": 49650 }, { "epoch": 1.7942713024680237, "grad_norm": 0.5799959897994995, "learning_rate": 8.32929236567211e-05, "loss": 0.9005, "step": 49800 }, { "epoch": 1.7996757341019636, "grad_norm": 0.7229143381118774, "learning_rate": 8.267325808342685e-05, "loss": 0.897, "step": 49950 }, { "epoch": 1.8050801657359035, "grad_norm": 0.5912762880325317, "learning_rate": 8.205427767256524e-05, "loss": 0.9015, "step": 50100 }, { "epoch": 1.8104845973698431, "grad_norm": 0.6438339352607727, "learning_rate": 8.143600690087443e-05, "loss": 0.9137, "step": 50250 }, { "epoch": 1.8158890290037832, "grad_norm": 0.5374941229820251, "learning_rate": 8.08184702170308e-05, "loss": 0.9008, "step": 50400 }, { "epoch": 1.821293460637723, "grad_norm": 0.5253046751022339, "learning_rate": 8.020169204068219e-05, "loss": 0.9015, "step": 50550 }, { "epoch": 1.8266978922716628, "grad_norm": 0.6589975357055664, "learning_rate": 7.958569676148234e-05, "loss": 0.9117, "step": 50700 }, { "epoch": 1.8321023239056027, "grad_norm": 0.5939854979515076, "learning_rate": 7.897050873812647e-05, "loss": 0.9024, "step": 50850 }, { "epoch": 1.8375067555395423, "grad_norm": 0.6179183721542358, "learning_rate": 7.835615229738775e-05, "loss": 0.9111, "step": 51000 }, { "epoch": 1.8429111871734822, "grad_norm": 0.6526548266410828, "learning_rate": 7.774265173315581e-05, "loss": 0.9002, "step": 51150 }, { "epoch": 1.8483156188074221, "grad_norm": 0.5846490263938904, "learning_rate": 7.713003130547556e-05, "loss": 0.8889, "step": 51300 }, { "epoch": 1.8537200504413618, "grad_norm": 0.5639694333076477, "learning_rate": 7.651831523958827e-05, "loss": 0.896, "step": 51450 }, { "epoch": 1.859124482075302, "grad_norm": 0.5969030857086182, "learning_rate": 7.590752772497345e-05, "loss": 0.8899, "step": 51600 }, { "epoch": 1.8645289137092416, "grad_norm": 0.57610023021698, "learning_rate": 7.529769291439216e-05, "loss": 0.8908, "step": 51750 }, { "epoch": 1.8699333453431815, "grad_norm": 0.7263045907020569, "learning_rate": 7.468883492293228e-05, "loss": 0.8956, "step": 51900 }, { "epoch": 1.8753377769771213, "grad_norm": 0.5964723825454712, "learning_rate": 7.40809778270546e-05, "loss": 0.8944, "step": 52050 }, { "epoch": 1.880742208611061, "grad_norm": 0.6026207804679871, "learning_rate": 7.347414566364085e-05, "loss": 0.8892, "step": 52200 }, { "epoch": 1.886146640245001, "grad_norm": 0.6354103684425354, "learning_rate": 7.28683624290432e-05, "loss": 0.8972, "step": 52350 }, { "epoch": 1.8915510718789408, "grad_norm": 0.6123978495597839, "learning_rate": 7.226365207813542e-05, "loss": 0.8951, "step": 52500 }, { "epoch": 1.8969555035128804, "grad_norm": 0.7344669699668884, "learning_rate": 7.166003852336548e-05, "loss": 0.8825, "step": 52650 }, { "epoch": 1.9023599351468206, "grad_norm": 0.5727975368499756, "learning_rate": 7.105754563381006e-05, "loss": 0.8815, "step": 52800 }, { "epoch": 1.9077643667807602, "grad_norm": 0.5696874856948853, "learning_rate": 7.045619723423072e-05, "loss": 0.8868, "step": 52950 }, { "epoch": 1.9131687984147, "grad_norm": 0.6967275142669678, "learning_rate": 6.985601710413158e-05, "loss": 0.8845, "step": 53100 }, { "epoch": 1.91857323004864, "grad_norm": 0.64991295337677, "learning_rate": 6.92570289768193e-05, "loss": 0.8824, "step": 53250 }, { "epoch": 1.9239776616825797, "grad_norm": 0.6261005997657776, "learning_rate": 6.865925653846432e-05, "loss": 0.881, "step": 53400 }, { "epoch": 1.9293820933165196, "grad_norm": 0.6127173900604248, "learning_rate": 6.806272342716431e-05, "loss": 0.8878, "step": 53550 }, { "epoch": 1.9347865249504594, "grad_norm": 0.552493691444397, "learning_rate": 6.746745323200943e-05, "loss": 0.888, "step": 53700 }, { "epoch": 1.940190956584399, "grad_norm": 0.641351580619812, "learning_rate": 6.687346949214966e-05, "loss": 0.8834, "step": 53850 }, { "epoch": 1.945595388218339, "grad_norm": 0.5708601474761963, "learning_rate": 6.628079569586365e-05, "loss": 0.8901, "step": 54000 }, { "epoch": 1.9509998198522789, "grad_norm": 0.5919014811515808, "learning_rate": 6.56894552796303e-05, "loss": 0.8833, "step": 54150 }, { "epoch": 1.9564042514862185, "grad_norm": 0.5352922677993774, "learning_rate": 6.509947162720172e-05, "loss": 0.8762, "step": 54300 }, { "epoch": 1.9618086831201587, "grad_norm": 0.5126431584358215, "learning_rate": 6.451086806867864e-05, "loss": 0.8719, "step": 54450 }, { "epoch": 1.9672131147540983, "grad_norm": 0.6120204329490662, "learning_rate": 6.392366787958786e-05, "loss": 0.882, "step": 54600 }, { "epoch": 1.9726175463880382, "grad_norm": 0.641154408454895, "learning_rate": 6.333789427996191e-05, "loss": 0.8743, "step": 54750 }, { "epoch": 1.978021978021978, "grad_norm": 0.648558497428894, "learning_rate": 6.275357043342069e-05, "loss": 0.8645, "step": 54900 }, { "epoch": 1.9834264096559178, "grad_norm": 0.6066434979438782, "learning_rate": 6.217071944625562e-05, "loss": 0.8622, "step": 55050 }, { "epoch": 1.9888308412898577, "grad_norm": 0.5739848613739014, "learning_rate": 6.158936436651593e-05, "loss": 0.8718, "step": 55200 }, { "epoch": 1.9942352729237975, "grad_norm": 0.5929279923439026, "learning_rate": 6.100952818309715e-05, "loss": 0.8686, "step": 55350 }, { "epoch": 1.9996397045577372, "grad_norm": 0.5922086238861084, "learning_rate": 6.043123382483224e-05, "loss": 0.8753, "step": 55500 }, { "epoch": 2.0050441361916773, "grad_norm": 0.6458303332328796, "learning_rate": 5.98545041595847e-05, "loss": 0.791, "step": 55650 }, { "epoch": 2.010448567825617, "grad_norm": 0.5965596437454224, "learning_rate": 5.927936199334435e-05, "loss": 0.7904, "step": 55800 }, { "epoch": 2.0158529994595566, "grad_norm": 0.523539125919342, "learning_rate": 5.8705830069325566e-05, "loss": 0.7859, "step": 55950 }, { "epoch": 2.0212574310934968, "grad_norm": 0.5941675305366516, "learning_rate": 5.813393106706795e-05, "loss": 0.7907, "step": 56100 }, { "epoch": 2.0266618627274364, "grad_norm": 0.5710470080375671, "learning_rate": 5.7563687601539276e-05, "loss": 0.787, "step": 56250 }, { "epoch": 2.0320662943613765, "grad_norm": 0.7543295621871948, "learning_rate": 5.699512222224148e-05, "loss": 0.7925, "step": 56400 }, { "epoch": 2.037470725995316, "grad_norm": 0.7011525630950928, "learning_rate": 5.642825741231889e-05, "loss": 0.7863, "step": 56550 }, { "epoch": 2.042875157629256, "grad_norm": 0.7366952300071716, "learning_rate": 5.586311558766908e-05, "loss": 0.7845, "step": 56700 }, { "epoch": 2.048279589263196, "grad_norm": 0.5936063528060913, "learning_rate": 5.5299719096056444e-05, "loss": 0.7878, "step": 56850 }, { "epoch": 2.0536840208971356, "grad_norm": 0.6049606800079346, "learning_rate": 5.4738090216228724e-05, "loss": 0.7856, "step": 57000 }, { "epoch": 2.0590884525310753, "grad_norm": 0.6939170360565186, "learning_rate": 5.4178251157035675e-05, "loss": 0.7886, "step": 57150 }, { "epoch": 2.0644928841650154, "grad_norm": 0.5444577932357788, "learning_rate": 5.3620224056551224e-05, "loss": 0.7806, "step": 57300 }, { "epoch": 2.069897315798955, "grad_norm": 0.6011742949485779, "learning_rate": 5.30640309811977e-05, "loss": 0.7852, "step": 57450 }, { "epoch": 2.0753017474328948, "grad_norm": 0.6152522563934326, "learning_rate": 5.250969392487343e-05, "loss": 0.7777, "step": 57600 }, { "epoch": 2.080706179066835, "grad_norm": 0.4750346839427948, "learning_rate": 5.195723480808309e-05, "loss": 0.7735, "step": 57750 }, { "epoch": 2.0861106107007745, "grad_norm": 0.5713702440261841, "learning_rate": 5.140667547707064e-05, "loss": 0.7874, "step": 57900 }, { "epoch": 2.0915150423347146, "grad_norm": 0.5541932582855225, "learning_rate": 5.085803770295579e-05, "loss": 0.789, "step": 58050 }, { "epoch": 2.0969194739686543, "grad_norm": 0.571283221244812, "learning_rate": 5.03113431808727e-05, "loss": 0.789, "step": 58200 }, { "epoch": 2.102323905602594, "grad_norm": 0.6038793325424194, "learning_rate": 4.976661352911237e-05, "loss": 0.7887, "step": 58350 }, { "epoch": 2.107728337236534, "grad_norm": 0.6276759505271912, "learning_rate": 4.922387028826768e-05, "loss": 0.7858, "step": 58500 }, { "epoch": 2.1131327688704737, "grad_norm": 0.6171843409538269, "learning_rate": 4.8683134920381665e-05, "loss": 0.7813, "step": 58650 }, { "epoch": 2.1185372005044134, "grad_norm": 0.6076928973197937, "learning_rate": 4.814442880809853e-05, "loss": 0.7871, "step": 58800 }, { "epoch": 2.1239416321383535, "grad_norm": 0.6066181063652039, "learning_rate": 4.760777325381852e-05, "loss": 0.7793, "step": 58950 }, { "epoch": 2.129346063772293, "grad_norm": 0.6619130373001099, "learning_rate": 4.707318947885537e-05, "loss": 0.7842, "step": 59100 }, { "epoch": 2.1347504954062333, "grad_norm": 0.6103502511978149, "learning_rate": 4.6540698622597e-05, "loss": 0.7858, "step": 59250 }, { "epoch": 2.140154927040173, "grad_norm": 0.6459470391273499, "learning_rate": 4.6010321741669726e-05, "loss": 0.7817, "step": 59400 }, { "epoch": 2.1455593586741126, "grad_norm": 0.643363356590271, "learning_rate": 4.5482079809105704e-05, "loss": 0.7743, "step": 59550 }, { "epoch": 2.1509637903080527, "grad_norm": 0.518678605556488, "learning_rate": 4.495599371351331e-05, "loss": 0.7826, "step": 59700 }, { "epoch": 2.1563682219419924, "grad_norm": 0.5462015867233276, "learning_rate": 4.4432084258251415e-05, "loss": 0.7729, "step": 59850 }, { "epoch": 2.161772653575932, "grad_norm": 0.5519649982452393, "learning_rate": 4.39103721606065e-05, "loss": 0.7765, "step": 60000 }, { "epoch": 2.167177085209872, "grad_norm": 0.672087550163269, "learning_rate": 4.3390878050973573e-05, "loss": 0.7808, "step": 60150 }, { "epoch": 2.172581516843812, "grad_norm": 0.5825379490852356, "learning_rate": 4.287362247204033e-05, "loss": 0.7711, "step": 60300 }, { "epoch": 2.177985948477752, "grad_norm": 0.6448932886123657, "learning_rate": 4.2358625877974864e-05, "loss": 0.7767, "step": 60450 }, { "epoch": 2.1833903801116916, "grad_norm": 0.60658860206604, "learning_rate": 4.1845908633616695e-05, "loss": 0.772, "step": 60600 }, { "epoch": 2.1887948117456313, "grad_norm": 0.6476044058799744, "learning_rate": 4.1335491013671565e-05, "loss": 0.7784, "step": 60750 }, { "epoch": 2.1941992433795714, "grad_norm": 0.7101139426231384, "learning_rate": 4.0827393201909794e-05, "loss": 0.7727, "step": 60900 }, { "epoch": 2.199603675013511, "grad_norm": 0.7003293633460999, "learning_rate": 4.032163529036792e-05, "loss": 0.7806, "step": 61050 }, { "epoch": 2.2050081066474507, "grad_norm": 0.5855246782302856, "learning_rate": 3.981823727855444e-05, "loss": 0.7814, "step": 61200 }, { "epoch": 2.210412538281391, "grad_norm": 0.5075130462646484, "learning_rate": 3.9317219072658726e-05, "loss": 0.7689, "step": 61350 }, { "epoch": 2.2158169699153305, "grad_norm": 0.5855611562728882, "learning_rate": 3.881860048476396e-05, "loss": 0.7777, "step": 61500 }, { "epoch": 2.22122140154927, "grad_norm": 0.5581937432289124, "learning_rate": 3.8322401232063765e-05, "loss": 0.7845, "step": 61650 }, { "epoch": 2.2266258331832103, "grad_norm": 0.5910426378250122, "learning_rate": 3.782864093608245e-05, "loss": 0.7792, "step": 61800 }, { "epoch": 2.23203026481715, "grad_norm": 0.5566779971122742, "learning_rate": 3.733733912189903e-05, "loss": 0.7711, "step": 61950 }, { "epoch": 2.23743469645109, "grad_norm": 0.5984916090965271, "learning_rate": 3.68485152173752e-05, "loss": 0.7675, "step": 62100 }, { "epoch": 2.2428391280850297, "grad_norm": 0.5687974095344543, "learning_rate": 3.6362188552387186e-05, "loss": 0.7752, "step": 62250 }, { "epoch": 2.2482435597189694, "grad_norm": 0.5997481942176819, "learning_rate": 3.587837835806116e-05, "loss": 0.7762, "step": 62400 }, { "epoch": 2.2536479913529095, "grad_norm": 0.6333452463150024, "learning_rate": 3.539710376601299e-05, "loss": 0.776, "step": 62550 }, { "epoch": 2.259052422986849, "grad_norm": 0.49814724922180176, "learning_rate": 3.4918383807591516e-05, "loss": 0.7704, "step": 62700 }, { "epoch": 2.2644568546207893, "grad_norm": 0.6359221935272217, "learning_rate": 3.444223741312608e-05, "loss": 0.7749, "step": 62850 }, { "epoch": 2.269861286254729, "grad_norm": 0.5802394151687622, "learning_rate": 3.396868341117798e-05, "loss": 0.7755, "step": 63000 }, { "epoch": 2.2752657178886686, "grad_norm": 0.6383761763572693, "learning_rate": 3.3497740527795905e-05, "loss": 0.775, "step": 63150 }, { "epoch": 2.2806701495226087, "grad_norm": 0.5394207835197449, "learning_rate": 3.3029427385775335e-05, "loss": 0.7755, "step": 63300 }, { "epoch": 2.2860745811565484, "grad_norm": 0.5275822877883911, "learning_rate": 3.25637625039222e-05, "loss": 0.7728, "step": 63450 }, { "epoch": 2.291479012790488, "grad_norm": 0.5123447775840759, "learning_rate": 3.21007642963207e-05, "loss": 0.7721, "step": 63600 }, { "epoch": 2.296883444424428, "grad_norm": 0.586459755897522, "learning_rate": 3.164045107160487e-05, "loss": 0.7708, "step": 63750 }, { "epoch": 2.302287876058368, "grad_norm": 0.6412725448608398, "learning_rate": 3.1182841032234924e-05, "loss": 0.7695, "step": 63900 }, { "epoch": 2.3076923076923075, "grad_norm": 0.5762320160865784, "learning_rate": 3.072795227377716e-05, "loss": 0.7602, "step": 64050 }, { "epoch": 2.3130967393262476, "grad_norm": 0.5541566014289856, "learning_rate": 3.027580278418852e-05, "loss": 0.7649, "step": 64200 }, { "epoch": 2.3185011709601873, "grad_norm": 0.5710071921348572, "learning_rate": 2.9826410443105422e-05, "loss": 0.7643, "step": 64350 }, { "epoch": 2.3239056025941274, "grad_norm": 0.6665874719619751, "learning_rate": 2.9379793021136427e-05, "loss": 0.7619, "step": 64500 }, { "epoch": 2.329310034228067, "grad_norm": 0.5459585189819336, "learning_rate": 2.8935968179159843e-05, "loss": 0.7503, "step": 64650 }, { "epoch": 2.3347144658620067, "grad_norm": 0.6013796925544739, "learning_rate": 2.8494953467625107e-05, "loss": 0.7616, "step": 64800 }, { "epoch": 2.340118897495947, "grad_norm": 0.6519309282302856, "learning_rate": 2.8056766325858863e-05, "loss": 0.7582, "step": 64950 }, { "epoch": 2.3455233291298865, "grad_norm": 0.6198135614395142, "learning_rate": 2.7621424081375423e-05, "loss": 0.7538, "step": 65100 }, { "epoch": 2.350927760763826, "grad_norm": 0.580227792263031, "learning_rate": 2.718894394919155e-05, "loss": 0.7604, "step": 65250 }, { "epoch": 2.3563321923977663, "grad_norm": 0.5496440529823303, "learning_rate": 2.6759343031145467e-05, "loss": 0.7629, "step": 65400 }, { "epoch": 2.361736624031706, "grad_norm": 0.6118148565292358, "learning_rate": 2.633263831522098e-05, "loss": 0.7543, "step": 65550 }, { "epoch": 2.3671410556656456, "grad_norm": 0.5903668403625488, "learning_rate": 2.5908846674875497e-05, "loss": 0.7626, "step": 65700 }, { "epoch": 2.3725454872995857, "grad_norm": 0.5964175462722778, "learning_rate": 2.548798486837276e-05, "loss": 0.7584, "step": 65850 }, { "epoch": 2.3779499189335254, "grad_norm": 0.6447151899337769, "learning_rate": 2.5070069538120212e-05, "loss": 0.7659, "step": 66000 }, { "epoch": 2.3833543505674655, "grad_norm": 0.5526403188705444, "learning_rate": 2.465511721001098e-05, "loss": 0.7528, "step": 66150 }, { "epoch": 2.388758782201405, "grad_norm": 0.6118183732032776, "learning_rate": 2.4243144292770215e-05, "loss": 0.7447, "step": 66300 }, { "epoch": 2.394163213835345, "grad_norm": 0.5308869481086731, "learning_rate": 2.383416707730637e-05, "loss": 0.7593, "step": 66450 }, { "epoch": 2.399567645469285, "grad_norm": 0.6109766364097595, "learning_rate": 2.3428201736067003e-05, "loss": 0.761, "step": 66600 }, { "epoch": 2.4049720771032246, "grad_norm": 0.6102012991905212, "learning_rate": 2.302526432239902e-05, "loss": 0.7533, "step": 66750 }, { "epoch": 2.4103765087371647, "grad_norm": 0.5869913697242737, "learning_rate": 2.2625370769914233e-05, "loss": 0.7514, "step": 66900 }, { "epoch": 2.4157809403711044, "grad_norm": 0.5591433644294739, "learning_rate": 2.2228536891859063e-05, "loss": 0.7608, "step": 67050 }, { "epoch": 2.421185372005044, "grad_norm": 0.48755505681037903, "learning_rate": 2.183477838048923e-05, "loss": 0.7581, "step": 67200 }, { "epoch": 2.426589803638984, "grad_norm": 0.5120564103126526, "learning_rate": 2.144411080644925e-05, "loss": 0.7609, "step": 67350 }, { "epoch": 2.431994235272924, "grad_norm": 0.5482677221298218, "learning_rate": 2.1056549618156796e-05, "loss": 0.7618, "step": 67500 }, { "epoch": 2.4373986669068635, "grad_norm": 0.6918262243270874, "learning_rate": 2.067211014119168e-05, "loss": 0.757, "step": 67650 }, { "epoch": 2.4428030985408036, "grad_norm": 0.455586701631546, "learning_rate": 2.029080757768994e-05, "loss": 0.7446, "step": 67800 }, { "epoch": 2.4482075301747432, "grad_norm": 0.5845438838005066, "learning_rate": 1.9912657005742608e-05, "loss": 0.7558, "step": 67950 }, { "epoch": 2.453611961808683, "grad_norm": 0.6255479454994202, "learning_rate": 1.953767337879947e-05, "loss": 0.7426, "step": 68100 }, { "epoch": 2.459016393442623, "grad_norm": 0.5470909476280212, "learning_rate": 1.9165871525077828e-05, "loss": 0.7597, "step": 68250 }, { "epoch": 2.4644208250765627, "grad_norm": 0.5875541567802429, "learning_rate": 1.879726614697612e-05, "loss": 0.7491, "step": 68400 }, { "epoch": 2.469825256710503, "grad_norm": 0.6186181306838989, "learning_rate": 1.843187182049244e-05, "loss": 0.7556, "step": 68550 }, { "epoch": 2.4752296883444425, "grad_norm": 0.6414260268211365, "learning_rate": 1.8069702994648208e-05, "loss": 0.7534, "step": 68700 }, { "epoch": 2.480634119978382, "grad_norm": 0.5647196173667908, "learning_rate": 1.7710773990916885e-05, "loss": 0.7467, "step": 68850 }, { "epoch": 2.4860385516123222, "grad_norm": 0.5534460544586182, "learning_rate": 1.7355099002657495e-05, "loss": 0.7591, "step": 69000 }, { "epoch": 2.491442983246262, "grad_norm": 0.5535364151000977, "learning_rate": 1.7002692094553506e-05, "loss": 0.7497, "step": 69150 }, { "epoch": 2.496847414880202, "grad_norm": 0.5928584337234497, "learning_rate": 1.6653567202056585e-05, "loss": 0.7496, "step": 69300 }, { "epoch": 2.5022518465141417, "grad_norm": 0.5369604825973511, "learning_rate": 1.6307738130835515e-05, "loss": 0.761, "step": 69450 }, { "epoch": 2.5076562781480813, "grad_norm": 0.6959002614021301, "learning_rate": 1.5965218556230375e-05, "loss": 0.7461, "step": 69600 }, { "epoch": 2.513060709782021, "grad_norm": 0.6277987360954285, "learning_rate": 1.5626022022711694e-05, "loss": 0.7467, "step": 69750 }, { "epoch": 2.518465141415961, "grad_norm": 0.6087015867233276, "learning_rate": 1.529016194334484e-05, "loss": 0.7556, "step": 69900 }, { "epoch": 2.523869573049901, "grad_norm": 0.5043054819107056, "learning_rate": 1.4957651599259615e-05, "loss": 0.7397, "step": 70050 }, { "epoch": 2.529274004683841, "grad_norm": 0.6836428642272949, "learning_rate": 1.4628504139125177e-05, "loss": 0.741, "step": 70200 }, { "epoch": 2.5346784363177806, "grad_norm": 0.5704199075698853, "learning_rate": 1.4302732578629918e-05, "loss": 0.7513, "step": 70350 }, { "epoch": 2.5400828679517202, "grad_norm": 0.5928525328636169, "learning_rate": 1.3980349799966985e-05, "loss": 0.7485, "step": 70500 }, { "epoch": 2.5454872995856603, "grad_norm": 0.6592413783073425, "learning_rate": 1.3661368551324648e-05, "loss": 0.7452, "step": 70650 }, { "epoch": 2.5508917312196, "grad_norm": 0.5700178146362305, "learning_rate": 1.3345801446382344e-05, "loss": 0.7496, "step": 70800 }, { "epoch": 2.55629616285354, "grad_norm": 0.5675559043884277, "learning_rate": 1.3033660963811878e-05, "loss": 0.7488, "step": 70950 }, { "epoch": 2.5617005944874798, "grad_norm": 0.5796085596084595, "learning_rate": 1.2724959446783868e-05, "loss": 0.7454, "step": 71100 }, { "epoch": 2.5671050261214194, "grad_norm": 0.6384360194206238, "learning_rate": 1.2419709102479804e-05, "loss": 0.7387, "step": 71250 }, { "epoch": 2.5725094577553596, "grad_norm": 0.5239229798316956, "learning_rate": 1.2117922001609173e-05, "loss": 0.7371, "step": 71400 }, { "epoch": 2.577913889389299, "grad_norm": 0.5770368576049805, "learning_rate": 1.181961007793222e-05, "loss": 0.7451, "step": 71550 }, { "epoch": 2.5833183210232393, "grad_norm": 0.5493025779724121, "learning_rate": 1.1524785127788074e-05, "loss": 0.7396, "step": 71700 }, { "epoch": 2.588722752657179, "grad_norm": 0.5658043622970581, "learning_rate": 1.123345880962826e-05, "loss": 0.7448, "step": 71850 }, { "epoch": 2.5941271842911187, "grad_norm": 0.5434427857398987, "learning_rate": 1.0945642643555542e-05, "loss": 0.7471, "step": 72000 }, { "epoch": 2.5995316159250583, "grad_norm": 0.5109556913375854, "learning_rate": 1.066134801086862e-05, "loss": 0.7434, "step": 72150 }, { "epoch": 2.6049360475589984, "grad_norm": 0.5859112739562988, "learning_rate": 1.0380586153611926e-05, "loss": 0.7391, "step": 72300 }, { "epoch": 2.610340479192938, "grad_norm": 0.5381293296813965, "learning_rate": 1.0103368174131044e-05, "loss": 0.7402, "step": 72450 }, { "epoch": 2.615744910826878, "grad_norm": 0.5799181461334229, "learning_rate": 9.829705034633763e-06, "loss": 0.746, "step": 72600 }, { "epoch": 2.621149342460818, "grad_norm": 0.5245427489280701, "learning_rate": 9.559607556756589e-06, "loss": 0.7374, "step": 72750 }, { "epoch": 2.6265537740947575, "grad_norm": 0.5755253434181213, "learning_rate": 9.29308642113672e-06, "loss": 0.7335, "step": 72900 }, { "epoch": 2.6319582057286977, "grad_norm": 0.5702092051506042, "learning_rate": 9.030152166989848e-06, "loss": 0.7441, "step": 73050 }, { "epoch": 2.6373626373626373, "grad_norm": 0.5722294449806213, "learning_rate": 8.770815191693294e-06, "loss": 0.745, "step": 73200 }, { "epoch": 2.6427670689965774, "grad_norm": 0.5095585584640503, "learning_rate": 8.515085750374819e-06, "loss": 0.7399, "step": 73350 }, { "epoch": 2.648171500630517, "grad_norm": 0.7061243057250977, "learning_rate": 8.262973955507213e-06, "loss": 0.7317, "step": 73500 }, { "epoch": 2.6535759322644568, "grad_norm": 0.6071792244911194, "learning_rate": 8.014489776508406e-06, "loss": 0.7457, "step": 73650 }, { "epoch": 2.6589803638983964, "grad_norm": 0.6209822297096252, "learning_rate": 7.769643039347118e-06, "loss": 0.7304, "step": 73800 }, { "epoch": 2.6643847955323365, "grad_norm": 0.5465585589408875, "learning_rate": 7.528443426154386e-06, "loss": 0.7348, "step": 73950 }, { "epoch": 2.669789227166276, "grad_norm": 0.5735740661621094, "learning_rate": 7.290900474840745e-06, "loss": 0.7509, "step": 74100 }, { "epoch": 2.6751936588002163, "grad_norm": 0.5864896178245544, "learning_rate": 7.0570235787189575e-06, "loss": 0.7422, "step": 74250 }, { "epoch": 2.680598090434156, "grad_norm": 0.5019831657409668, "learning_rate": 6.82682198613267e-06, "loss": 0.74, "step": 74400 }, { "epoch": 2.6860025220680956, "grad_norm": 0.4947664141654968, "learning_rate": 6.600304800090629e-06, "loss": 0.7424, "step": 74550 }, { "epoch": 2.6914069537020358, "grad_norm": 0.5284778475761414, "learning_rate": 6.3774809779066914e-06, "loss": 0.741, "step": 74700 }, { "epoch": 2.6968113853359754, "grad_norm": 0.5382539629936218, "learning_rate": 6.158359330845742e-06, "loss": 0.7384, "step": 74850 }, { "epoch": 2.7022158169699155, "grad_norm": 0.6098785996437073, "learning_rate": 5.942948523775172e-06, "loss": 0.732, "step": 75000 }, { "epoch": 2.707620248603855, "grad_norm": 0.5111733675003052, "learning_rate": 5.731257074822227e-06, "loss": 0.7401, "step": 75150 }, { "epoch": 2.713024680237795, "grad_norm": 0.563735842704773, "learning_rate": 5.523293355037174e-06, "loss": 0.7373, "step": 75300 }, { "epoch": 2.718429111871735, "grad_norm": 0.48581522703170776, "learning_rate": 5.319065588062389e-06, "loss": 0.7355, "step": 75450 }, { "epoch": 2.7238335435056746, "grad_norm": 0.6022956371307373, "learning_rate": 5.118581849806991e-06, "loss": 0.752, "step": 75600 }, { "epoch": 2.7292379751396147, "grad_norm": 0.5350160002708435, "learning_rate": 4.92185006812762e-06, "loss": 0.7302, "step": 75750 }, { "epoch": 2.7346424067735544, "grad_norm": 0.5559709668159485, "learning_rate": 4.728878022514904e-06, "loss": 0.7258, "step": 75900 }, { "epoch": 2.740046838407494, "grad_norm": 0.5401473045349121, "learning_rate": 4.5396733437857885e-06, "loss": 0.7485, "step": 76050 }, { "epoch": 2.7454512700414337, "grad_norm": 0.5016641020774841, "learning_rate": 4.354243513781841e-06, "loss": 0.7257, "step": 76200 }, { "epoch": 2.750855701675374, "grad_norm": 0.5274752974510193, "learning_rate": 4.172595865073414e-06, "loss": 0.7307, "step": 76350 }, { "epoch": 2.7562601333093135, "grad_norm": 0.5795451402664185, "learning_rate": 3.994737580669572e-06, "loss": 0.7431, "step": 76500 }, { "epoch": 2.7616645649432536, "grad_norm": 0.584701418876648, "learning_rate": 3.820675693734166e-06, "loss": 0.7333, "step": 76650 }, { "epoch": 2.7670689965771933, "grad_norm": 0.5679466724395752, "learning_rate": 3.6504170873076894e-06, "loss": 0.7457, "step": 76800 }, { "epoch": 2.772473428211133, "grad_norm": 0.5592213869094849, "learning_rate": 3.483968494035039e-06, "loss": 0.7438, "step": 76950 }, { "epoch": 2.777877859845073, "grad_norm": 0.6507932543754578, "learning_rate": 3.3213364958993633e-06, "loss": 0.7332, "step": 77100 }, { "epoch": 2.7832822914790127, "grad_norm": 0.5836296081542969, "learning_rate": 3.1625275239617447e-06, "loss": 0.7341, "step": 77250 }, { "epoch": 2.788686723112953, "grad_norm": 0.6291818618774414, "learning_rate": 3.0075478581068517e-06, "loss": 0.7391, "step": 77400 }, { "epoch": 2.7940911547468925, "grad_norm": 0.59623783826828, "learning_rate": 2.8564036267947347e-06, "loss": 0.7281, "step": 77550 }, { "epoch": 2.799495586380832, "grad_norm": 0.5835798978805542, "learning_rate": 2.7091008068183323e-06, "loss": 0.7385, "step": 77700 }, { "epoch": 2.804900018014772, "grad_norm": 0.5502892732620239, "learning_rate": 2.565645223067237e-06, "loss": 0.7441, "step": 77850 }, { "epoch": 2.810304449648712, "grad_norm": 0.5453166365623474, "learning_rate": 2.4260425482973025e-06, "loss": 0.7338, "step": 78000 }, { "epoch": 2.8157088812826516, "grad_norm": 0.5541927814483643, "learning_rate": 2.2902983029063463e-06, "loss": 0.7325, "step": 78150 }, { "epoch": 2.8211133129165917, "grad_norm": 0.5624451041221619, "learning_rate": 2.158417854715844e-06, "loss": 0.7311, "step": 78300 }, { "epoch": 2.8265177445505314, "grad_norm": 0.6407118439674377, "learning_rate": 2.0304064187587012e-06, "loss": 0.7343, "step": 78450 }, { "epoch": 2.831922176184471, "grad_norm": 0.6349582076072693, "learning_rate": 1.906269057072918e-06, "loss": 0.7289, "step": 78600 }, { "epoch": 2.837326607818411, "grad_norm": 0.511360764503479, "learning_rate": 1.7860106785015707e-06, "loss": 0.7362, "step": 78750 }, { "epoch": 2.842731039452351, "grad_norm": 0.6116952300071716, "learning_rate": 1.669636038498612e-06, "loss": 0.7357, "step": 78900 }, { "epoch": 2.848135471086291, "grad_norm": 0.5288776159286499, "learning_rate": 1.5571497389408218e-06, "loss": 0.7377, "step": 79050 }, { "epoch": 2.8535399027202306, "grad_norm": 0.5661271810531616, "learning_rate": 1.4485562279458742e-06, "loss": 0.7335, "step": 79200 }, { "epoch": 2.8589443343541703, "grad_norm": 0.46028730273246765, "learning_rate": 1.3438597996963675e-06, "loss": 0.7306, "step": 79350 }, { "epoch": 2.8643487659881104, "grad_norm": 0.5887011289596558, "learning_rate": 1.243064594270127e-06, "loss": 0.7348, "step": 79500 }, { "epoch": 2.86975319762205, "grad_norm": 0.5686684846878052, "learning_rate": 1.1461745974763682e-06, "loss": 0.7305, "step": 79650 }, { "epoch": 2.87515762925599, "grad_norm": 0.5735449194908142, "learning_rate": 1.0531936406982247e-06, "loss": 0.726, "step": 79800 }, { "epoch": 2.88056206088993, "grad_norm": 0.6428796648979187, "learning_rate": 9.64125400741056e-07, "loss": 0.7288, "step": 79950 }, { "epoch": 2.8859664925238695, "grad_norm": 0.6176515817642212, "learning_rate": 8.789733996872551e-07, "loss": 0.7345, "step": 80100 }, { "epoch": 2.891370924157809, "grad_norm": 0.5095422267913818, "learning_rate": 7.977410047568246e-07, "loss": 0.7419, "step": 80250 }, { "epoch": 2.8967753557917493, "grad_norm": 0.5800315141677856, "learning_rate": 7.204314281742952e-07, "loss": 0.7375, "step": 80400 }, { "epoch": 2.902179787425689, "grad_norm": 0.5727178454399109, "learning_rate": 6.470477270416719e-07, "loss": 0.7356, "step": 80550 }, { "epoch": 2.907584219059629, "grad_norm": 0.5594687461853027, "learning_rate": 5.775928032175637e-07, "loss": 0.7363, "step": 80700 }, { "epoch": 2.9129886506935687, "grad_norm": 0.6071078777313232, "learning_rate": 5.120694032024309e-07, "loss": 0.7491, "step": 80850 }, { "epoch": 2.9183930823275084, "grad_norm": 0.6253530383110046, "learning_rate": 4.5048011802997226e-07, "loss": 0.7495, "step": 81000 }, { "epoch": 2.9237975139614485, "grad_norm": 0.7043154835700989, "learning_rate": 3.928273831646512e-07, "loss": 0.7349, "step": 81150 }, { "epoch": 2.929201945595388, "grad_norm": 0.5901583433151245, "learning_rate": 3.391134784054284e-07, "loss": 0.7388, "step": 81300 }, { "epoch": 2.9346063772293283, "grad_norm": 0.5171722173690796, "learning_rate": 2.8934052779558965e-07, "loss": 0.7357, "step": 81450 }, { "epoch": 2.940010808863268, "grad_norm": 0.5885277986526489, "learning_rate": 2.4351049953872386e-07, "loss": 0.7294, "step": 81600 }, { "epoch": 2.9454152404972076, "grad_norm": 0.5369580388069153, "learning_rate": 2.0162520592095225e-07, "loss": 0.724, "step": 81750 }, { "epoch": 2.9508196721311473, "grad_norm": 0.505922794342041, "learning_rate": 1.6368630323920776e-07, "loss": 0.7376, "step": 81900 }, { "epoch": 2.9562241037650874, "grad_norm": 0.5709424018859863, "learning_rate": 1.2969529173577633e-07, "loss": 0.7273, "step": 82050 }, { "epoch": 2.961628535399027, "grad_norm": 0.5696266293525696, "learning_rate": 9.965351553895552e-08, "loss": 0.7358, "step": 82200 }, { "epoch": 2.967032967032967, "grad_norm": 0.6568360924720764, "learning_rate": 7.356216260990811e-08, "loss": 0.7337, "step": 82350 }, { "epoch": 2.972437398666907, "grad_norm": 0.6210362911224365, "learning_rate": 5.142226469568856e-08, "loss": 0.7301, "step": 82500 }, { "epoch": 2.9778418303008465, "grad_norm": 0.5563607811927795, "learning_rate": 3.32346972884312e-08, "loss": 0.7311, "step": 82650 }, { "epoch": 2.9832462619347866, "grad_norm": 0.6156190633773804, "learning_rate": 1.9000179590733525e-08, "loss": 0.7248, "step": 82800 }, { "epoch": 2.9886506935687263, "grad_norm": 0.6303669810295105, "learning_rate": 8.719274487245522e-09, "loss": 0.7412, "step": 82950 }, { "epoch": 2.9940551252026664, "grad_norm": 0.4844772517681122, "learning_rate": 2.392388522343136e-09, "loss": 0.7329, "step": 83100 }, { "epoch": 2.999459556836606, "grad_norm": 0.5367130041122437, "learning_rate": 1.977188415214215e-11, "loss": 0.7302, "step": 83250 } ], "logging_steps": 150, "max_steps": 83265, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.661509740266363e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }