{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.7809745229100065, "eval_steps": 1000000, "global_step": 140000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024217766153250025, "grad_norm": 1.516142725944519, "learning_rate": 9.997578223384676e-06, "loss": 9.1668, "step": 500 }, { "epoch": 0.04843553230650005, "grad_norm": 1.0418092012405396, "learning_rate": 9.995156446769351e-06, "loss": 7.7891, "step": 1000 }, { "epoch": 0.07265329845975008, "grad_norm": 0.9371763467788696, "learning_rate": 9.992734670154026e-06, "loss": 7.1939, "step": 1500 }, { "epoch": 0.0968710646130001, "grad_norm": 1.1670751571655273, "learning_rate": 9.990312893538701e-06, "loss": 6.9189, "step": 2000 }, { "epoch": 0.12108883076625013, "grad_norm": 1.3695101737976074, "learning_rate": 9.987891116923376e-06, "loss": 6.7278, "step": 2500 }, { "epoch": 0.14530659691950015, "grad_norm": 1.796486735343933, "learning_rate": 9.985469340308051e-06, "loss": 6.5867, "step": 3000 }, { "epoch": 0.16952436307275018, "grad_norm": 1.509717583656311, "learning_rate": 9.983047563692726e-06, "loss": 6.4508, "step": 3500 }, { "epoch": 0.1937421292260002, "grad_norm": 1.8329906463623047, "learning_rate": 9.9806257870774e-06, "loss": 6.3551, "step": 4000 }, { "epoch": 0.21795989537925023, "grad_norm": 1.5139986276626587, "learning_rate": 9.978204010462076e-06, "loss": 6.2743, "step": 4500 }, { "epoch": 0.24217766153250025, "grad_norm": 2.2407052516937256, "learning_rate": 9.97578223384675e-06, "loss": 6.2001, "step": 5000 }, { "epoch": 0.26639542768575025, "grad_norm": 2.087357521057129, "learning_rate": 9.973360457231426e-06, "loss": 6.1334, "step": 5500 }, { "epoch": 0.2906131938390003, "grad_norm": 2.0182762145996094, "learning_rate": 9.970938680616102e-06, "loss": 6.0656, "step": 6000 }, { "epoch": 0.3148309599922503, "grad_norm": 1.9544531106948853, "learning_rate": 9.968516904000775e-06, "loss": 6.0134, "step": 6500 }, { "epoch": 0.33904872614550036, "grad_norm": 2.3156166076660156, "learning_rate": 9.966095127385452e-06, "loss": 5.9546, "step": 7000 }, { "epoch": 0.36326649229875035, "grad_norm": 2.5564098358154297, "learning_rate": 9.963673350770125e-06, "loss": 5.9063, "step": 7500 }, { "epoch": 0.3874842584520004, "grad_norm": 2.191112518310547, "learning_rate": 9.961251574154802e-06, "loss": 5.8564, "step": 8000 }, { "epoch": 0.4117020246052504, "grad_norm": 2.1813371181488037, "learning_rate": 9.958829797539475e-06, "loss": 5.8053, "step": 8500 }, { "epoch": 0.43591979075850046, "grad_norm": 1.9756942987442017, "learning_rate": 9.95640802092415e-06, "loss": 5.7669, "step": 9000 }, { "epoch": 0.46013755691175046, "grad_norm": 2.2932822704315186, "learning_rate": 9.953986244308825e-06, "loss": 5.7218, "step": 9500 }, { "epoch": 0.4843553230650005, "grad_norm": 2.218536376953125, "learning_rate": 9.9515644676935e-06, "loss": 5.6818, "step": 10000 }, { "epoch": 0.5085730892182505, "grad_norm": 2.3896877765655518, "learning_rate": 9.949142691078175e-06, "loss": 5.6481, "step": 10500 }, { "epoch": 0.5327908553715005, "grad_norm": 2.5433712005615234, "learning_rate": 9.94672091446285e-06, "loss": 5.6124, "step": 11000 }, { "epoch": 0.5570086215247505, "grad_norm": 2.5442490577697754, "learning_rate": 9.944299137847525e-06, "loss": 5.5728, "step": 11500 }, { "epoch": 0.5812263876780006, "grad_norm": 2.327425241470337, "learning_rate": 9.9418773612322e-06, "loss": 5.5406, "step": 12000 }, { "epoch": 0.6054441538312506, "grad_norm": 2.290090799331665, "learning_rate": 9.939455584616876e-06, "loss": 5.5121, "step": 12500 }, { "epoch": 0.6296619199845006, "grad_norm": 3.161325216293335, "learning_rate": 9.93703380800155e-06, "loss": 5.4739, "step": 13000 }, { "epoch": 0.6538796861377506, "grad_norm": 2.6134533882141113, "learning_rate": 9.934612031386226e-06, "loss": 5.4384, "step": 13500 }, { "epoch": 0.6780974522910007, "grad_norm": 2.674760580062866, "learning_rate": 9.9321902547709e-06, "loss": 5.413, "step": 14000 }, { "epoch": 0.7023152184442507, "grad_norm": 2.431614398956299, "learning_rate": 9.929768478155576e-06, "loss": 5.3903, "step": 14500 }, { "epoch": 0.7265329845975007, "grad_norm": 2.4028687477111816, "learning_rate": 9.927346701540251e-06, "loss": 5.3637, "step": 15000 }, { "epoch": 0.7507507507507507, "grad_norm": 2.4807944297790527, "learning_rate": 9.924924924924926e-06, "loss": 5.3279, "step": 15500 }, { "epoch": 0.7749685169040008, "grad_norm": 2.9065611362457275, "learning_rate": 9.922503148309601e-06, "loss": 5.3098, "step": 16000 }, { "epoch": 0.7991862830572508, "grad_norm": 2.359736204147339, "learning_rate": 9.920081371694276e-06, "loss": 5.2858, "step": 16500 }, { "epoch": 0.8234040492105008, "grad_norm": 2.642854690551758, "learning_rate": 9.917659595078951e-06, "loss": 5.2518, "step": 17000 }, { "epoch": 0.8476218153637508, "grad_norm": 3.2326414585113525, "learning_rate": 9.915237818463626e-06, "loss": 5.2354, "step": 17500 }, { "epoch": 0.8718395815170009, "grad_norm": 2.285203218460083, "learning_rate": 9.912816041848301e-06, "loss": 5.2117, "step": 18000 }, { "epoch": 0.8960573476702509, "grad_norm": 2.551164388656616, "learning_rate": 9.910394265232976e-06, "loss": 5.1941, "step": 18500 }, { "epoch": 0.9202751138235009, "grad_norm": 2.678759813308716, "learning_rate": 9.907972488617651e-06, "loss": 5.1706, "step": 19000 }, { "epoch": 0.9444928799767509, "grad_norm": 2.6895062923431396, "learning_rate": 9.905550712002325e-06, "loss": 5.1499, "step": 19500 }, { "epoch": 0.968710646130001, "grad_norm": 2.554659128189087, "learning_rate": 9.903128935387001e-06, "loss": 5.1276, "step": 20000 }, { "epoch": 0.992928412283251, "grad_norm": 2.785282850265503, "learning_rate": 9.900707158771675e-06, "loss": 5.1079, "step": 20500 }, { "epoch": 1.017146178436501, "grad_norm": 2.7283270359039307, "learning_rate": 9.89828538215635e-06, "loss": 5.0726, "step": 21000 }, { "epoch": 1.0413639445897511, "grad_norm": 2.654245615005493, "learning_rate": 9.895863605541027e-06, "loss": 5.0654, "step": 21500 }, { "epoch": 1.065581710743001, "grad_norm": 2.563713550567627, "learning_rate": 9.8934418289257e-06, "loss": 5.0436, "step": 22000 }, { "epoch": 1.0897994768962511, "grad_norm": 2.6896631717681885, "learning_rate": 9.891020052310377e-06, "loss": 5.0161, "step": 22500 }, { "epoch": 1.114017243049501, "grad_norm": 2.8477983474731445, "learning_rate": 9.88859827569505e-06, "loss": 5.008, "step": 23000 }, { "epoch": 1.1382350092027511, "grad_norm": 2.6253600120544434, "learning_rate": 9.886176499079725e-06, "loss": 4.987, "step": 23500 }, { "epoch": 1.1624527753560012, "grad_norm": 2.7618229389190674, "learning_rate": 9.8837547224644e-06, "loss": 4.9655, "step": 24000 }, { "epoch": 1.186670541509251, "grad_norm": 2.7631571292877197, "learning_rate": 9.881332945849075e-06, "loss": 4.9426, "step": 24500 }, { "epoch": 1.2108883076625012, "grad_norm": 3.108574390411377, "learning_rate": 9.87891116923375e-06, "loss": 4.9264, "step": 25000 }, { "epoch": 1.2351060738157513, "grad_norm": 2.5930752754211426, "learning_rate": 9.876489392618425e-06, "loss": 4.9068, "step": 25500 }, { "epoch": 1.2593238399690012, "grad_norm": 2.4590559005737305, "learning_rate": 9.8740676160031e-06, "loss": 4.8908, "step": 26000 }, { "epoch": 1.2835416061222513, "grad_norm": 2.7004990577697754, "learning_rate": 9.871645839387776e-06, "loss": 4.8767, "step": 26500 }, { "epoch": 1.3077593722755014, "grad_norm": 2.5023412704467773, "learning_rate": 9.86922406277245e-06, "loss": 4.8543, "step": 27000 }, { "epoch": 1.3319771384287513, "grad_norm": 3.338123083114624, "learning_rate": 9.866802286157126e-06, "loss": 4.8324, "step": 27500 }, { "epoch": 1.3561949045820014, "grad_norm": 2.871856689453125, "learning_rate": 9.8643805095418e-06, "loss": 4.8138, "step": 28000 }, { "epoch": 1.3804126707352513, "grad_norm": 3.148714303970337, "learning_rate": 9.861958732926476e-06, "loss": 4.7991, "step": 28500 }, { "epoch": 1.4046304368885014, "grad_norm": 2.986448287963867, "learning_rate": 9.85953695631115e-06, "loss": 4.781, "step": 29000 }, { "epoch": 1.4288482030417513, "grad_norm": 2.5939040184020996, "learning_rate": 9.857115179695826e-06, "loss": 4.7634, "step": 29500 }, { "epoch": 1.4530659691950014, "grad_norm": 2.674027442932129, "learning_rate": 9.854693403080501e-06, "loss": 4.7446, "step": 30000 }, { "epoch": 1.4772837353482515, "grad_norm": 3.018937826156616, "learning_rate": 9.852271626465176e-06, "loss": 4.7332, "step": 30500 }, { "epoch": 1.5015015015015014, "grad_norm": 2.862410306930542, "learning_rate": 9.849849849849851e-06, "loss": 4.7151, "step": 31000 }, { "epoch": 1.5257192676547515, "grad_norm": 2.9605488777160645, "learning_rate": 9.847428073234524e-06, "loss": 4.7023, "step": 31500 }, { "epoch": 1.5499370338080016, "grad_norm": 3.116225242614746, "learning_rate": 9.845006296619201e-06, "loss": 4.6834, "step": 32000 }, { "epoch": 1.5741547999612515, "grad_norm": 3.074164390563965, "learning_rate": 9.842584520003876e-06, "loss": 4.6676, "step": 32500 }, { "epoch": 1.5983725661145016, "grad_norm": 2.677706003189087, "learning_rate": 9.840162743388551e-06, "loss": 4.6547, "step": 33000 }, { "epoch": 1.6225903322677517, "grad_norm": 2.832223653793335, "learning_rate": 9.837740966773226e-06, "loss": 4.6402, "step": 33500 }, { "epoch": 1.6468080984210016, "grad_norm": 3.1041297912597656, "learning_rate": 9.8353191901579e-06, "loss": 4.6271, "step": 34000 }, { "epoch": 1.6710258645742515, "grad_norm": 2.883216381072998, "learning_rate": 9.832897413542576e-06, "loss": 4.6141, "step": 34500 }, { "epoch": 1.6952436307275018, "grad_norm": 2.894000291824341, "learning_rate": 9.83047563692725e-06, "loss": 4.6031, "step": 35000 }, { "epoch": 1.7194613968807517, "grad_norm": 2.9335453510284424, "learning_rate": 9.828053860311927e-06, "loss": 4.5911, "step": 35500 }, { "epoch": 1.7436791630340016, "grad_norm": 2.7511613368988037, "learning_rate": 9.8256320836966e-06, "loss": 4.5824, "step": 36000 }, { "epoch": 1.7678969291872517, "grad_norm": 2.8148419857025146, "learning_rate": 9.823210307081275e-06, "loss": 4.5693, "step": 36500 }, { "epoch": 1.7921146953405018, "grad_norm": 2.8832480907440186, "learning_rate": 9.820788530465952e-06, "loss": 4.5622, "step": 37000 }, { "epoch": 1.8163324614937517, "grad_norm": 2.9674079418182373, "learning_rate": 9.818366753850625e-06, "loss": 4.5473, "step": 37500 }, { "epoch": 1.8405502276470018, "grad_norm": 2.971090793609619, "learning_rate": 9.815944977235302e-06, "loss": 4.538, "step": 38000 }, { "epoch": 1.864767993800252, "grad_norm": 2.785881996154785, "learning_rate": 9.813523200619975e-06, "loss": 4.5327, "step": 38500 }, { "epoch": 1.8889857599535018, "grad_norm": 2.9853248596191406, "learning_rate": 9.81110142400465e-06, "loss": 4.5078, "step": 39000 }, { "epoch": 1.913203526106752, "grad_norm": 2.899179697036743, "learning_rate": 9.808679647389325e-06, "loss": 4.5002, "step": 39500 }, { "epoch": 1.937421292260002, "grad_norm": 2.5843992233276367, "learning_rate": 9.806257870774e-06, "loss": 4.4928, "step": 40000 }, { "epoch": 1.961639058413252, "grad_norm": 2.8425755500793457, "learning_rate": 9.803836094158675e-06, "loss": 4.4881, "step": 40500 }, { "epoch": 1.985856824566502, "grad_norm": 2.87211275100708, "learning_rate": 9.80141431754335e-06, "loss": 4.4822, "step": 41000 }, { "epoch": 2.010074590719752, "grad_norm": 3.0297703742980957, "learning_rate": 9.798992540928026e-06, "loss": 4.4619, "step": 41500 }, { "epoch": 2.034292356873002, "grad_norm": 2.9869863986968994, "learning_rate": 9.7965707643127e-06, "loss": 4.4526, "step": 42000 }, { "epoch": 2.058510123026252, "grad_norm": 2.777209520339966, "learning_rate": 9.794148987697376e-06, "loss": 4.432, "step": 42500 }, { "epoch": 2.0827278891795022, "grad_norm": 3.0258235931396484, "learning_rate": 9.79172721108205e-06, "loss": 4.4386, "step": 43000 }, { "epoch": 2.106945655332752, "grad_norm": 2.8184220790863037, "learning_rate": 9.789305434466726e-06, "loss": 4.4254, "step": 43500 }, { "epoch": 2.131163421486002, "grad_norm": 2.9428908824920654, "learning_rate": 9.7868836578514e-06, "loss": 4.4172, "step": 44000 }, { "epoch": 2.1553811876392523, "grad_norm": 3.1215102672576904, "learning_rate": 9.784461881236076e-06, "loss": 4.4078, "step": 44500 }, { "epoch": 2.1795989537925022, "grad_norm": 3.032611846923828, "learning_rate": 9.782040104620751e-06, "loss": 4.4036, "step": 45000 }, { "epoch": 2.203816719945752, "grad_norm": 2.9431488513946533, "learning_rate": 9.779618328005426e-06, "loss": 4.3997, "step": 45500 }, { "epoch": 2.228034486099002, "grad_norm": 2.9058682918548584, "learning_rate": 9.7771965513901e-06, "loss": 4.389, "step": 46000 }, { "epoch": 2.2522522522522523, "grad_norm": 2.703967809677124, "learning_rate": 9.774774774774776e-06, "loss": 4.3753, "step": 46500 }, { "epoch": 2.2764700184055022, "grad_norm": 2.764721155166626, "learning_rate": 9.77235299815945e-06, "loss": 4.3658, "step": 47000 }, { "epoch": 2.300687784558752, "grad_norm": 2.834578514099121, "learning_rate": 9.769931221544126e-06, "loss": 4.3577, "step": 47500 }, { "epoch": 2.3249055507120024, "grad_norm": 2.9823198318481445, "learning_rate": 9.767509444928801e-06, "loss": 4.3531, "step": 48000 }, { "epoch": 2.3491233168652523, "grad_norm": 2.8373069763183594, "learning_rate": 9.765087668313475e-06, "loss": 4.3515, "step": 48500 }, { "epoch": 2.373341083018502, "grad_norm": 2.6971516609191895, "learning_rate": 9.762665891698151e-06, "loss": 4.3367, "step": 49000 }, { "epoch": 2.3975588491717525, "grad_norm": 2.8022115230560303, "learning_rate": 9.760244115082825e-06, "loss": 4.3302, "step": 49500 }, { "epoch": 2.4217766153250024, "grad_norm": 2.9047532081604004, "learning_rate": 9.757822338467502e-06, "loss": 4.3202, "step": 50000 }, { "epoch": 2.4459943814782523, "grad_norm": 2.81803297996521, "learning_rate": 9.755400561852175e-06, "loss": 4.3184, "step": 50500 }, { "epoch": 2.4702121476315027, "grad_norm": 2.9668848514556885, "learning_rate": 9.75297878523685e-06, "loss": 4.3093, "step": 51000 }, { "epoch": 2.4944299137847525, "grad_norm": 3.0008721351623535, "learning_rate": 9.750557008621525e-06, "loss": 4.3089, "step": 51500 }, { "epoch": 2.5186476799380024, "grad_norm": 2.76766300201416, "learning_rate": 9.7481352320062e-06, "loss": 4.2961, "step": 52000 }, { "epoch": 2.5428654460912528, "grad_norm": 2.961453914642334, "learning_rate": 9.745713455390875e-06, "loss": 4.284, "step": 52500 }, { "epoch": 2.5670832122445026, "grad_norm": 3.030158758163452, "learning_rate": 9.74329167877555e-06, "loss": 4.2849, "step": 53000 }, { "epoch": 2.5913009783977525, "grad_norm": 2.9656057357788086, "learning_rate": 9.740869902160225e-06, "loss": 4.2712, "step": 53500 }, { "epoch": 2.615518744551003, "grad_norm": 3.3482959270477295, "learning_rate": 9.7384481255449e-06, "loss": 4.2833, "step": 54000 }, { "epoch": 2.6397365107042527, "grad_norm": 2.8142096996307373, "learning_rate": 9.736026348929575e-06, "loss": 4.2652, "step": 54500 }, { "epoch": 2.6639542768575026, "grad_norm": 2.776679277420044, "learning_rate": 9.73360457231425e-06, "loss": 4.2653, "step": 55000 }, { "epoch": 2.688172043010753, "grad_norm": 2.7612788677215576, "learning_rate": 9.731182795698925e-06, "loss": 4.2562, "step": 55500 }, { "epoch": 2.712389809164003, "grad_norm": 2.959991931915283, "learning_rate": 9.7287610190836e-06, "loss": 4.2515, "step": 56000 }, { "epoch": 2.7366075753172527, "grad_norm": 2.969061851501465, "learning_rate": 9.726339242468276e-06, "loss": 4.2378, "step": 56500 }, { "epoch": 2.7608253414705026, "grad_norm": 3.1710784435272217, "learning_rate": 9.72391746585295e-06, "loss": 4.2408, "step": 57000 }, { "epoch": 2.7850431076237525, "grad_norm": 2.9343762397766113, "learning_rate": 9.721495689237626e-06, "loss": 4.2316, "step": 57500 }, { "epoch": 2.809260873777003, "grad_norm": 2.98744535446167, "learning_rate": 9.7190739126223e-06, "loss": 4.2302, "step": 58000 }, { "epoch": 2.8334786399302527, "grad_norm": 2.8376593589782715, "learning_rate": 9.716652136006976e-06, "loss": 4.2229, "step": 58500 }, { "epoch": 2.8576964060835026, "grad_norm": 2.7830283641815186, "learning_rate": 9.714230359391651e-06, "loss": 4.2138, "step": 59000 }, { "epoch": 2.881914172236753, "grad_norm": 2.824352741241455, "learning_rate": 9.711808582776326e-06, "loss": 4.2039, "step": 59500 }, { "epoch": 2.906131938390003, "grad_norm": 2.8537116050720215, "learning_rate": 9.709386806161001e-06, "loss": 4.2063, "step": 60000 }, { "epoch": 2.9303497045432527, "grad_norm": 3.004157543182373, "learning_rate": 9.706965029545674e-06, "loss": 4.1983, "step": 60500 }, { "epoch": 2.954567470696503, "grad_norm": 2.8163509368896484, "learning_rate": 9.704543252930351e-06, "loss": 4.1938, "step": 61000 }, { "epoch": 2.978785236849753, "grad_norm": 2.8276596069335938, "learning_rate": 9.702121476315024e-06, "loss": 4.1915, "step": 61500 }, { "epoch": 3.003003003003003, "grad_norm": 2.7849977016448975, "learning_rate": 9.699699699699701e-06, "loss": 4.1942, "step": 62000 }, { "epoch": 3.027220769156253, "grad_norm": 2.782846212387085, "learning_rate": 9.697277923084375e-06, "loss": 4.1741, "step": 62500 }, { "epoch": 3.051438535309503, "grad_norm": 2.906552314758301, "learning_rate": 9.69485614646905e-06, "loss": 4.1767, "step": 63000 }, { "epoch": 3.075656301462753, "grad_norm": 3.0256595611572266, "learning_rate": 9.692434369853726e-06, "loss": 4.1665, "step": 63500 }, { "epoch": 3.0998740676160033, "grad_norm": 2.847698450088501, "learning_rate": 9.6900125932384e-06, "loss": 4.1642, "step": 64000 }, { "epoch": 3.124091833769253, "grad_norm": 2.8021674156188965, "learning_rate": 9.687590816623077e-06, "loss": 4.1663, "step": 64500 }, { "epoch": 3.148309599922503, "grad_norm": 2.784911632537842, "learning_rate": 9.68516904000775e-06, "loss": 4.1546, "step": 65000 }, { "epoch": 3.1725273660757534, "grad_norm": 3.019435167312622, "learning_rate": 9.682747263392425e-06, "loss": 4.1443, "step": 65500 }, { "epoch": 3.1967451322290033, "grad_norm": 2.60965895652771, "learning_rate": 9.6803254867771e-06, "loss": 4.1465, "step": 66000 }, { "epoch": 3.220962898382253, "grad_norm": 2.740164041519165, "learning_rate": 9.677903710161775e-06, "loss": 4.1345, "step": 66500 }, { "epoch": 3.2451806645355035, "grad_norm": 2.862274646759033, "learning_rate": 9.67548193354645e-06, "loss": 4.1461, "step": 67000 }, { "epoch": 3.2693984306887534, "grad_norm": 2.8547213077545166, "learning_rate": 9.673060156931125e-06, "loss": 4.137, "step": 67500 }, { "epoch": 3.2936161968420032, "grad_norm": 3.0033137798309326, "learning_rate": 9.6706383803158e-06, "loss": 4.1253, "step": 68000 }, { "epoch": 3.317833962995253, "grad_norm": 2.795989513397217, "learning_rate": 9.668216603700475e-06, "loss": 4.1232, "step": 68500 }, { "epoch": 3.3420517291485035, "grad_norm": 2.8020830154418945, "learning_rate": 9.66579482708515e-06, "loss": 4.1238, "step": 69000 }, { "epoch": 3.3662694953017533, "grad_norm": 2.808565855026245, "learning_rate": 9.663373050469825e-06, "loss": 4.1155, "step": 69500 }, { "epoch": 3.3904872614550032, "grad_norm": 2.7904319763183594, "learning_rate": 9.6609512738545e-06, "loss": 4.1143, "step": 70000 }, { "epoch": 3.4147050276082536, "grad_norm": 2.7850215435028076, "learning_rate": 9.658529497239176e-06, "loss": 4.1102, "step": 70500 }, { "epoch": 3.4389227937615034, "grad_norm": 2.6868176460266113, "learning_rate": 9.65610772062385e-06, "loss": 4.0994, "step": 71000 }, { "epoch": 3.4631405599147533, "grad_norm": 2.862273931503296, "learning_rate": 9.653685944008526e-06, "loss": 4.1054, "step": 71500 }, { "epoch": 3.4873583260680037, "grad_norm": 3.01948881149292, "learning_rate": 9.6512641673932e-06, "loss": 4.0975, "step": 72000 }, { "epoch": 3.5115760922212536, "grad_norm": 2.945227861404419, "learning_rate": 9.648842390777876e-06, "loss": 4.0941, "step": 72500 }, { "epoch": 3.5357938583745034, "grad_norm": 3.265650987625122, "learning_rate": 9.64642061416255e-06, "loss": 4.0854, "step": 73000 }, { "epoch": 3.5600116245277533, "grad_norm": 2.839852809906006, "learning_rate": 9.643998837547224e-06, "loss": 4.0889, "step": 73500 }, { "epoch": 3.5842293906810037, "grad_norm": 3.0958175659179688, "learning_rate": 9.641577060931901e-06, "loss": 4.0793, "step": 74000 }, { "epoch": 3.6084471568342535, "grad_norm": 2.957026481628418, "learning_rate": 9.639155284316576e-06, "loss": 4.0764, "step": 74500 }, { "epoch": 3.6326649229875034, "grad_norm": 3.0738115310668945, "learning_rate": 9.636733507701251e-06, "loss": 4.0742, "step": 75000 }, { "epoch": 3.6568826891407538, "grad_norm": 2.877403736114502, "learning_rate": 9.634311731085926e-06, "loss": 4.0744, "step": 75500 }, { "epoch": 3.6811004552940036, "grad_norm": 3.0667495727539062, "learning_rate": 9.6318899544706e-06, "loss": 4.0715, "step": 76000 }, { "epoch": 3.7053182214472535, "grad_norm": 2.8147807121276855, "learning_rate": 9.629468177855276e-06, "loss": 4.0666, "step": 76500 }, { "epoch": 3.729535987600504, "grad_norm": 2.8717801570892334, "learning_rate": 9.62704640123995e-06, "loss": 4.064, "step": 77000 }, { "epoch": 3.7537537537537538, "grad_norm": 2.7591042518615723, "learning_rate": 9.624624624624626e-06, "loss": 4.0557, "step": 77500 }, { "epoch": 3.7779715199070036, "grad_norm": 2.843806743621826, "learning_rate": 9.6222028480093e-06, "loss": 4.0579, "step": 78000 }, { "epoch": 3.802189286060254, "grad_norm": 2.869080066680908, "learning_rate": 9.619781071393975e-06, "loss": 4.0537, "step": 78500 }, { "epoch": 3.826407052213504, "grad_norm": 2.792863607406616, "learning_rate": 9.617359294778652e-06, "loss": 4.0484, "step": 79000 }, { "epoch": 3.8506248183667537, "grad_norm": 2.991138458251953, "learning_rate": 9.614937518163325e-06, "loss": 4.039, "step": 79500 }, { "epoch": 3.874842584520004, "grad_norm": 2.7616770267486572, "learning_rate": 9.612515741548002e-06, "loss": 4.0438, "step": 80000 }, { "epoch": 3.899060350673254, "grad_norm": 2.718642234802246, "learning_rate": 9.610093964932675e-06, "loss": 4.0333, "step": 80500 }, { "epoch": 3.923278116826504, "grad_norm": 2.8432154655456543, "learning_rate": 9.60767218831735e-06, "loss": 4.0419, "step": 81000 }, { "epoch": 3.947495882979754, "grad_norm": 3.018446683883667, "learning_rate": 9.605250411702025e-06, "loss": 4.0374, "step": 81500 }, { "epoch": 3.971713649133004, "grad_norm": 2.909247636795044, "learning_rate": 9.6028286350867e-06, "loss": 4.0299, "step": 82000 }, { "epoch": 3.995931415286254, "grad_norm": 3.047041654586792, "learning_rate": 9.600406858471375e-06, "loss": 4.0195, "step": 82500 }, { "epoch": 4.020149181439504, "grad_norm": 2.8578057289123535, "learning_rate": 9.59798508185605e-06, "loss": 4.0212, "step": 83000 }, { "epoch": 4.044366947592754, "grad_norm": 2.8038136959075928, "learning_rate": 9.595563305240725e-06, "loss": 4.0196, "step": 83500 }, { "epoch": 4.068584713746004, "grad_norm": 2.879891872406006, "learning_rate": 9.5931415286254e-06, "loss": 4.0127, "step": 84000 }, { "epoch": 4.092802479899254, "grad_norm": 2.875603437423706, "learning_rate": 9.590719752010075e-06, "loss": 4.0142, "step": 84500 }, { "epoch": 4.117020246052504, "grad_norm": 2.975302219390869, "learning_rate": 9.58829797539475e-06, "loss": 4.0002, "step": 85000 }, { "epoch": 4.141238012205754, "grad_norm": 2.9974005222320557, "learning_rate": 9.585876198779426e-06, "loss": 4.0038, "step": 85500 }, { "epoch": 4.1654557783590045, "grad_norm": 2.8580379486083984, "learning_rate": 9.5834544221641e-06, "loss": 4.0003, "step": 86000 }, { "epoch": 4.189673544512254, "grad_norm": 2.987436056137085, "learning_rate": 9.581032645548776e-06, "loss": 4.005, "step": 86500 }, { "epoch": 4.213891310665504, "grad_norm": 2.6872076988220215, "learning_rate": 9.57861086893345e-06, "loss": 3.9909, "step": 87000 }, { "epoch": 4.238109076818755, "grad_norm": 2.991762638092041, "learning_rate": 9.576189092318126e-06, "loss": 3.9897, "step": 87500 }, { "epoch": 4.262326842972004, "grad_norm": 2.8275723457336426, "learning_rate": 9.5737673157028e-06, "loss": 3.996, "step": 88000 }, { "epoch": 4.286544609125254, "grad_norm": 2.892839193344116, "learning_rate": 9.571345539087476e-06, "loss": 3.9946, "step": 88500 }, { "epoch": 4.310762375278505, "grad_norm": 2.8410208225250244, "learning_rate": 9.56892376247215e-06, "loss": 3.9862, "step": 89000 }, { "epoch": 4.334980141431754, "grad_norm": 2.797422409057617, "learning_rate": 9.566501985856826e-06, "loss": 3.9843, "step": 89500 }, { "epoch": 4.3591979075850045, "grad_norm": 2.855832099914551, "learning_rate": 9.564080209241501e-06, "loss": 3.9796, "step": 90000 }, { "epoch": 4.383415673738254, "grad_norm": 3.0120160579681396, "learning_rate": 9.561658432626174e-06, "loss": 3.9768, "step": 90500 }, { "epoch": 4.407633439891504, "grad_norm": 2.7952980995178223, "learning_rate": 9.559236656010851e-06, "loss": 3.9742, "step": 91000 }, { "epoch": 4.431851206044755, "grad_norm": 2.8430566787719727, "learning_rate": 9.556814879395525e-06, "loss": 3.9741, "step": 91500 }, { "epoch": 4.456068972198004, "grad_norm": 2.9674031734466553, "learning_rate": 9.554393102780201e-06, "loss": 3.9697, "step": 92000 }, { "epoch": 4.480286738351254, "grad_norm": 3.0408644676208496, "learning_rate": 9.551971326164875e-06, "loss": 3.9624, "step": 92500 }, { "epoch": 4.504504504504505, "grad_norm": 2.9981327056884766, "learning_rate": 9.54954954954955e-06, "loss": 3.9652, "step": 93000 }, { "epoch": 4.528722270657754, "grad_norm": 2.7843706607818604, "learning_rate": 9.547127772934225e-06, "loss": 3.9722, "step": 93500 }, { "epoch": 4.5529400368110045, "grad_norm": 2.7166874408721924, "learning_rate": 9.5447059963189e-06, "loss": 3.9628, "step": 94000 }, { "epoch": 4.577157802964255, "grad_norm": 2.923854351043701, "learning_rate": 9.542284219703575e-06, "loss": 3.9594, "step": 94500 }, { "epoch": 4.601375569117504, "grad_norm": 2.915800094604492, "learning_rate": 9.53986244308825e-06, "loss": 3.9609, "step": 95000 }, { "epoch": 4.625593335270755, "grad_norm": 2.9524765014648438, "learning_rate": 9.537440666472925e-06, "loss": 3.9587, "step": 95500 }, { "epoch": 4.649811101424005, "grad_norm": 2.898005723953247, "learning_rate": 9.5350188898576e-06, "loss": 3.9498, "step": 96000 }, { "epoch": 4.674028867577254, "grad_norm": 2.9840903282165527, "learning_rate": 9.532597113242275e-06, "loss": 3.9508, "step": 96500 }, { "epoch": 4.698246633730505, "grad_norm": 2.7765541076660156, "learning_rate": 9.53017533662695e-06, "loss": 3.9481, "step": 97000 }, { "epoch": 4.722464399883755, "grad_norm": 2.8900692462921143, "learning_rate": 9.527753560011625e-06, "loss": 3.9365, "step": 97500 }, { "epoch": 4.746682166037004, "grad_norm": 2.8892781734466553, "learning_rate": 9.5253317833963e-06, "loss": 3.9492, "step": 98000 }, { "epoch": 4.770899932190255, "grad_norm": 2.960374355316162, "learning_rate": 9.522910006780975e-06, "loss": 3.942, "step": 98500 }, { "epoch": 4.795117698343505, "grad_norm": 2.7404415607452393, "learning_rate": 9.52048823016565e-06, "loss": 3.938, "step": 99000 }, { "epoch": 4.8193354644967545, "grad_norm": 3.024486780166626, "learning_rate": 9.518066453550326e-06, "loss": 3.9296, "step": 99500 }, { "epoch": 4.843553230650005, "grad_norm": 2.8316361904144287, "learning_rate": 9.515644676935e-06, "loss": 3.9351, "step": 100000 }, { "epoch": 4.867770996803255, "grad_norm": 2.8669049739837646, "learning_rate": 9.513222900319676e-06, "loss": 3.9373, "step": 100500 }, { "epoch": 4.891988762956505, "grad_norm": 2.7152950763702393, "learning_rate": 9.51080112370435e-06, "loss": 3.9163, "step": 101000 }, { "epoch": 4.916206529109755, "grad_norm": 2.7430613040924072, "learning_rate": 9.508379347089026e-06, "loss": 3.9273, "step": 101500 }, { "epoch": 4.940424295263005, "grad_norm": 3.0171566009521484, "learning_rate": 9.5059575704737e-06, "loss": 3.9247, "step": 102000 }, { "epoch": 4.964642061416255, "grad_norm": 2.833829164505005, "learning_rate": 9.503535793858374e-06, "loss": 3.9307, "step": 102500 }, { "epoch": 4.988859827569505, "grad_norm": 2.7739973068237305, "learning_rate": 9.501114017243051e-06, "loss": 3.9195, "step": 103000 }, { "epoch": 5.013077593722755, "grad_norm": 2.774411201477051, "learning_rate": 9.498692240627724e-06, "loss": 3.9116, "step": 103500 }, { "epoch": 5.037295359876005, "grad_norm": 2.851175546646118, "learning_rate": 9.496270464012401e-06, "loss": 3.9113, "step": 104000 }, { "epoch": 5.061513126029255, "grad_norm": 2.8700265884399414, "learning_rate": 9.493848687397074e-06, "loss": 3.9058, "step": 104500 }, { "epoch": 5.0857308921825055, "grad_norm": 2.8087737560272217, "learning_rate": 9.49142691078175e-06, "loss": 3.9087, "step": 105000 }, { "epoch": 5.109948658335755, "grad_norm": 2.882826328277588, "learning_rate": 9.489005134166426e-06, "loss": 3.907, "step": 105500 }, { "epoch": 5.134166424489005, "grad_norm": 2.900575637817383, "learning_rate": 9.4865833575511e-06, "loss": 3.9022, "step": 106000 }, { "epoch": 5.158384190642255, "grad_norm": 2.7019128799438477, "learning_rate": 9.484161580935776e-06, "loss": 3.9017, "step": 106500 }, { "epoch": 5.182601956795505, "grad_norm": 2.8361051082611084, "learning_rate": 9.48173980432045e-06, "loss": 3.9108, "step": 107000 }, { "epoch": 5.206819722948755, "grad_norm": 2.741563558578491, "learning_rate": 9.479318027705125e-06, "loss": 3.8919, "step": 107500 }, { "epoch": 5.231037489102005, "grad_norm": 2.967627763748169, "learning_rate": 9.4768962510898e-06, "loss": 3.8967, "step": 108000 }, { "epoch": 5.255255255255255, "grad_norm": 2.8605451583862305, "learning_rate": 9.474474474474475e-06, "loss": 3.897, "step": 108500 }, { "epoch": 5.2794730214085055, "grad_norm": 2.7184574604034424, "learning_rate": 9.47205269785915e-06, "loss": 3.8975, "step": 109000 }, { "epoch": 5.303690787561755, "grad_norm": 2.7433297634124756, "learning_rate": 9.469630921243825e-06, "loss": 3.8954, "step": 109500 }, { "epoch": 5.327908553715005, "grad_norm": 2.7750015258789062, "learning_rate": 9.4672091446285e-06, "loss": 3.8913, "step": 110000 }, { "epoch": 5.352126319868256, "grad_norm": 2.9533851146698, "learning_rate": 9.464787368013175e-06, "loss": 3.8844, "step": 110500 }, { "epoch": 5.376344086021505, "grad_norm": 2.8131632804870605, "learning_rate": 9.46236559139785e-06, "loss": 3.8809, "step": 111000 }, { "epoch": 5.400561852174755, "grad_norm": 2.791193723678589, "learning_rate": 9.459943814782525e-06, "loss": 3.885, "step": 111500 }, { "epoch": 5.424779618328006, "grad_norm": 2.869932174682617, "learning_rate": 9.4575220381672e-06, "loss": 3.8839, "step": 112000 }, { "epoch": 5.448997384481255, "grad_norm": 2.906806707382202, "learning_rate": 9.455100261551875e-06, "loss": 3.8816, "step": 112500 }, { "epoch": 5.4732151506345055, "grad_norm": 2.6837105751037598, "learning_rate": 9.45267848493655e-06, "loss": 3.88, "step": 113000 }, { "epoch": 5.497432916787756, "grad_norm": 2.9571547508239746, "learning_rate": 9.450256708321225e-06, "loss": 3.877, "step": 113500 }, { "epoch": 5.521650682941005, "grad_norm": 2.706204891204834, "learning_rate": 9.4478349317059e-06, "loss": 3.8772, "step": 114000 }, { "epoch": 5.545868449094256, "grad_norm": 2.7605583667755127, "learning_rate": 9.445413155090576e-06, "loss": 3.875, "step": 114500 }, { "epoch": 5.570086215247506, "grad_norm": 2.711221933364868, "learning_rate": 9.44299137847525e-06, "loss": 3.8712, "step": 115000 }, { "epoch": 5.594303981400755, "grad_norm": 2.9056496620178223, "learning_rate": 9.440569601859924e-06, "loss": 3.8639, "step": 115500 }, { "epoch": 5.618521747554006, "grad_norm": 2.7061548233032227, "learning_rate": 9.4381478252446e-06, "loss": 3.8677, "step": 116000 }, { "epoch": 5.642739513707256, "grad_norm": 2.9951186180114746, "learning_rate": 9.435726048629276e-06, "loss": 3.8667, "step": 116500 }, { "epoch": 5.6669572798605055, "grad_norm": 2.753833293914795, "learning_rate": 9.43330427201395e-06, "loss": 3.8669, "step": 117000 }, { "epoch": 5.691175046013756, "grad_norm": 2.8989222049713135, "learning_rate": 9.430882495398626e-06, "loss": 3.8751, "step": 117500 }, { "epoch": 5.715392812167005, "grad_norm": 3.0137453079223633, "learning_rate": 9.4284607187833e-06, "loss": 3.8706, "step": 118000 }, { "epoch": 5.739610578320256, "grad_norm": 2.7698261737823486, "learning_rate": 9.426038942167976e-06, "loss": 3.857, "step": 118500 }, { "epoch": 5.763828344473506, "grad_norm": 2.877211332321167, "learning_rate": 9.42361716555265e-06, "loss": 3.8541, "step": 119000 }, { "epoch": 5.788046110626755, "grad_norm": 2.8494150638580322, "learning_rate": 9.421195388937326e-06, "loss": 3.8594, "step": 119500 }, { "epoch": 5.812263876780006, "grad_norm": 2.72268009185791, "learning_rate": 9.418773612322e-06, "loss": 3.8526, "step": 120000 }, { "epoch": 5.836481642933256, "grad_norm": 3.0423946380615234, "learning_rate": 9.416351835706675e-06, "loss": 3.8558, "step": 120500 }, { "epoch": 5.860699409086505, "grad_norm": 2.7056820392608643, "learning_rate": 9.413930059091351e-06, "loss": 3.8494, "step": 121000 }, { "epoch": 5.884917175239756, "grad_norm": 2.7295594215393066, "learning_rate": 9.411508282476025e-06, "loss": 3.8557, "step": 121500 }, { "epoch": 5.909134941393006, "grad_norm": 2.8661701679229736, "learning_rate": 9.409086505860701e-06, "loss": 3.8528, "step": 122000 }, { "epoch": 5.9333527075462555, "grad_norm": 2.8183608055114746, "learning_rate": 9.406664729245375e-06, "loss": 3.8511, "step": 122500 }, { "epoch": 5.957570473699506, "grad_norm": 2.974858283996582, "learning_rate": 9.40424295263005e-06, "loss": 3.8438, "step": 123000 }, { "epoch": 5.981788239852756, "grad_norm": 2.8071188926696777, "learning_rate": 9.401821176014725e-06, "loss": 3.8338, "step": 123500 }, { "epoch": 6.006006006006006, "grad_norm": 2.679610252380371, "learning_rate": 9.3993993993994e-06, "loss": 3.8432, "step": 124000 }, { "epoch": 6.030223772159256, "grad_norm": 2.7918217182159424, "learning_rate": 9.396977622784075e-06, "loss": 3.8359, "step": 124500 }, { "epoch": 6.054441538312506, "grad_norm": 2.9353878498077393, "learning_rate": 9.39455584616875e-06, "loss": 3.8423, "step": 125000 }, { "epoch": 6.078659304465756, "grad_norm": 2.7717785835266113, "learning_rate": 9.392134069553425e-06, "loss": 3.8362, "step": 125500 }, { "epoch": 6.102877070619006, "grad_norm": 2.8372817039489746, "learning_rate": 9.3897122929381e-06, "loss": 3.8308, "step": 126000 }, { "epoch": 6.127094836772256, "grad_norm": 2.823821544647217, "learning_rate": 9.387290516322775e-06, "loss": 3.8314, "step": 126500 }, { "epoch": 6.151312602925506, "grad_norm": 2.7613956928253174, "learning_rate": 9.38486873970745e-06, "loss": 3.8317, "step": 127000 }, { "epoch": 6.175530369078756, "grad_norm": 2.745297431945801, "learning_rate": 9.382446963092125e-06, "loss": 3.8274, "step": 127500 }, { "epoch": 6.1997481352320065, "grad_norm": 2.7873809337615967, "learning_rate": 9.3800251864768e-06, "loss": 3.8203, "step": 128000 }, { "epoch": 6.223965901385256, "grad_norm": 2.871760606765747, "learning_rate": 9.377603409861475e-06, "loss": 3.8356, "step": 128500 }, { "epoch": 6.248183667538506, "grad_norm": 2.788484811782837, "learning_rate": 9.37518163324615e-06, "loss": 3.8239, "step": 129000 }, { "epoch": 6.272401433691757, "grad_norm": 2.7170634269714355, "learning_rate": 9.372759856630826e-06, "loss": 3.8208, "step": 129500 }, { "epoch": 6.296619199845006, "grad_norm": 2.8259615898132324, "learning_rate": 9.370338080015499e-06, "loss": 3.8241, "step": 130000 }, { "epoch": 6.320836965998256, "grad_norm": 2.9876561164855957, "learning_rate": 9.367916303400176e-06, "loss": 3.8259, "step": 130500 }, { "epoch": 6.345054732151507, "grad_norm": 2.844414710998535, "learning_rate": 9.365494526784849e-06, "loss": 3.8173, "step": 131000 }, { "epoch": 6.369272498304756, "grad_norm": 2.8100152015686035, "learning_rate": 9.363072750169526e-06, "loss": 3.8245, "step": 131500 }, { "epoch": 6.3934902644580065, "grad_norm": 2.8489105701446533, "learning_rate": 9.360650973554201e-06, "loss": 3.8147, "step": 132000 }, { "epoch": 6.417708030611257, "grad_norm": 2.8502120971679688, "learning_rate": 9.358229196938874e-06, "loss": 3.8122, "step": 132500 }, { "epoch": 6.441925796764506, "grad_norm": 2.9556784629821777, "learning_rate": 9.355807420323551e-06, "loss": 3.8131, "step": 133000 }, { "epoch": 6.466143562917757, "grad_norm": 2.762270212173462, "learning_rate": 9.353385643708224e-06, "loss": 3.807, "step": 133500 }, { "epoch": 6.490361329071007, "grad_norm": 2.7611629962921143, "learning_rate": 9.350963867092901e-06, "loss": 3.8113, "step": 134000 }, { "epoch": 6.514579095224256, "grad_norm": 2.738227605819702, "learning_rate": 9.348542090477574e-06, "loss": 3.8137, "step": 134500 }, { "epoch": 6.538796861377507, "grad_norm": 2.822857618331909, "learning_rate": 9.34612031386225e-06, "loss": 3.7986, "step": 135000 }, { "epoch": 6.563014627530757, "grad_norm": 2.731264591217041, "learning_rate": 9.343698537246925e-06, "loss": 3.808, "step": 135500 }, { "epoch": 6.5872323936840065, "grad_norm": 2.699312448501587, "learning_rate": 9.3412767606316e-06, "loss": 3.803, "step": 136000 }, { "epoch": 6.611450159837257, "grad_norm": 2.8332924842834473, "learning_rate": 9.338854984016275e-06, "loss": 3.8153, "step": 136500 }, { "epoch": 6.635667925990506, "grad_norm": 2.8680782318115234, "learning_rate": 9.33643320740095e-06, "loss": 3.7969, "step": 137000 }, { "epoch": 6.659885692143757, "grad_norm": 2.844148635864258, "learning_rate": 9.334011430785625e-06, "loss": 3.8032, "step": 137500 }, { "epoch": 6.684103458297007, "grad_norm": 2.7583229541778564, "learning_rate": 9.3315896541703e-06, "loss": 3.8099, "step": 138000 }, { "epoch": 6.708321224450256, "grad_norm": 2.8120036125183105, "learning_rate": 9.329167877554975e-06, "loss": 3.799, "step": 138500 }, { "epoch": 6.732538990603507, "grad_norm": 2.8804004192352295, "learning_rate": 9.32674610093965e-06, "loss": 3.7951, "step": 139000 }, { "epoch": 6.756756756756757, "grad_norm": 2.8031482696533203, "learning_rate": 9.324324324324325e-06, "loss": 3.7966, "step": 139500 }, { "epoch": 6.7809745229100065, "grad_norm": 2.7140092849731445, "learning_rate": 9.321902547709e-06, "loss": 3.7959, "step": 140000 }, { "epoch": 6.7809745229100065, "step": 140000, "total_flos": 5.861064073050849e+17, "train_loss": 4.399306026785714, "train_runtime": 93598.1238, "train_samples_per_second": 705.829, "train_steps_per_second": 22.058 } ], "logging_steps": 500, "max_steps": 2064600, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.861064073050849e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }