{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.896817068905212, "eval_steps": 500, "global_step": 21000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023318176518596245, "grad_norm": 0.3708130121231079, "learning_rate": 0.0002, "loss": 1.1701, "step": 100 }, { "epoch": 0.04663635303719249, "grad_norm": 0.7055436968803406, "learning_rate": 0.0002, "loss": 0.9527, "step": 200 }, { "epoch": 0.06995452955578874, "grad_norm": 0.310996949672699, "learning_rate": 0.0002, "loss": 0.871, "step": 300 }, { "epoch": 0.09327270607438498, "grad_norm": 0.34611570835113525, "learning_rate": 0.0002, "loss": 0.8128, "step": 400 }, { "epoch": 0.11659088259298123, "grad_norm": 0.2793200612068176, "learning_rate": 0.0002, "loss": 0.8008, "step": 500 }, { "epoch": 0.13990905911157747, "grad_norm": 0.2440558820962906, "learning_rate": 0.0002, "loss": 0.7364, "step": 600 }, { "epoch": 0.16322723563017372, "grad_norm": 0.20660006999969482, "learning_rate": 0.0002, "loss": 0.7016, "step": 700 }, { "epoch": 0.18654541214876996, "grad_norm": 0.3151717782020569, "learning_rate": 0.0002, "loss": 0.6986, "step": 800 }, { "epoch": 0.2098635886673662, "grad_norm": 0.4207448363304138, "learning_rate": 0.0002, "loss": 0.649, "step": 900 }, { "epoch": 0.23318176518596245, "grad_norm": 0.43152570724487305, "learning_rate": 0.0002, "loss": 0.6725, "step": 1000 }, { "epoch": 0.2564999417045587, "grad_norm": 0.31539487838745117, "learning_rate": 0.0002, "loss": 0.6395, "step": 1100 }, { "epoch": 0.27981811822315494, "grad_norm": 0.3349384665489197, "learning_rate": 0.0002, "loss": 0.6033, "step": 1200 }, { "epoch": 0.3031362947417512, "grad_norm": 0.2724147140979767, "learning_rate": 0.0002, "loss": 0.6076, "step": 1300 }, { "epoch": 0.32645447126034743, "grad_norm": 0.2925530970096588, "learning_rate": 0.0002, "loss": 0.585, "step": 1400 }, { "epoch": 0.3497726477789437, "grad_norm": 0.4674293100833893, "learning_rate": 0.0002, "loss": 0.5657, "step": 1500 }, { "epoch": 0.3730908242975399, "grad_norm": 0.3915441930294037, "learning_rate": 0.0002, "loss": 0.5453, "step": 1600 }, { "epoch": 0.39640900081613617, "grad_norm": 0.24304556846618652, "learning_rate": 0.0002, "loss": 0.5198, "step": 1700 }, { "epoch": 0.4197271773347324, "grad_norm": 0.5447902679443359, "learning_rate": 0.0002, "loss": 0.5427, "step": 1800 }, { "epoch": 0.44304535385332866, "grad_norm": 0.4133426547050476, "learning_rate": 0.0002, "loss": 0.5204, "step": 1900 }, { "epoch": 0.4663635303719249, "grad_norm": 0.41733473539352417, "learning_rate": 0.0002, "loss": 0.5204, "step": 2000 }, { "epoch": 0.48968170689052115, "grad_norm": 0.3181161880493164, "learning_rate": 0.0002, "loss": 0.4698, "step": 2100 }, { "epoch": 0.5129998834091174, "grad_norm": 0.34142622351646423, "learning_rate": 0.0002, "loss": 0.4871, "step": 2200 }, { "epoch": 0.5363180599277136, "grad_norm": 0.1926470398902893, "learning_rate": 0.0002, "loss": 0.4649, "step": 2300 }, { "epoch": 0.5596362364463099, "grad_norm": 0.30340591073036194, "learning_rate": 0.0002, "loss": 0.4665, "step": 2400 }, { "epoch": 0.5829544129649061, "grad_norm": 0.3195839524269104, "learning_rate": 0.0002, "loss": 0.4667, "step": 2500 }, { "epoch": 0.6062725894835024, "grad_norm": 0.2145429104566574, "learning_rate": 0.0002, "loss": 0.4463, "step": 2600 }, { "epoch": 0.6295907660020986, "grad_norm": 0.15962275862693787, "learning_rate": 0.0002, "loss": 0.429, "step": 2700 }, { "epoch": 0.6529089425206949, "grad_norm": 0.3597501516342163, "learning_rate": 0.0002, "loss": 0.4277, "step": 2800 }, { "epoch": 0.6762271190392911, "grad_norm": 0.44612497091293335, "learning_rate": 0.0002, "loss": 0.4123, "step": 2900 }, { "epoch": 0.6995452955578874, "grad_norm": 0.21562007069587708, "learning_rate": 0.0002, "loss": 0.4074, "step": 3000 }, { "epoch": 0.7228634720764836, "grad_norm": 0.23217037320137024, "learning_rate": 0.0002, "loss": 0.4037, "step": 3100 }, { "epoch": 0.7461816485950798, "grad_norm": 0.3096787631511688, "learning_rate": 0.0002, "loss": 0.401, "step": 3200 }, { "epoch": 0.7694998251136761, "grad_norm": 0.18558426201343536, "learning_rate": 0.0002, "loss": 0.3983, "step": 3300 }, { "epoch": 0.7928180016322723, "grad_norm": 0.2520066797733307, "learning_rate": 0.0002, "loss": 0.4056, "step": 3400 }, { "epoch": 0.8161361781508686, "grad_norm": 0.41013041138648987, "learning_rate": 0.0002, "loss": 0.3706, "step": 3500 }, { "epoch": 0.8394543546694648, "grad_norm": 0.14811871945858002, "learning_rate": 0.0002, "loss": 0.3829, "step": 3600 }, { "epoch": 0.8627725311880611, "grad_norm": 0.36381468176841736, "learning_rate": 0.0002, "loss": 0.3744, "step": 3700 }, { "epoch": 0.8860907077066573, "grad_norm": 0.28783467411994934, "learning_rate": 0.0002, "loss": 0.3538, "step": 3800 }, { "epoch": 0.9094088842252536, "grad_norm": 0.23508860170841217, "learning_rate": 0.0002, "loss": 0.3277, "step": 3900 }, { "epoch": 0.9327270607438498, "grad_norm": 0.3819214403629303, "learning_rate": 0.0002, "loss": 0.3317, "step": 4000 }, { "epoch": 0.9560452372624461, "grad_norm": 0.298714816570282, "learning_rate": 0.0002, "loss": 0.3329, "step": 4100 }, { "epoch": 0.9793634137810423, "grad_norm": 0.17287446558475494, "learning_rate": 0.0002, "loss": 0.3418, "step": 4200 }, { "epoch": 1.0026815902996387, "grad_norm": 0.3725602328777313, "learning_rate": 0.0002, "loss": 0.3224, "step": 4300 }, { "epoch": 1.0259997668182348, "grad_norm": 0.6124657988548279, "learning_rate": 0.0002, "loss": 0.2589, "step": 4400 }, { "epoch": 1.0493179433368311, "grad_norm": 0.5308946371078491, "learning_rate": 0.0002, "loss": 0.2718, "step": 4500 }, { "epoch": 1.0726361198554273, "grad_norm": 0.3070002496242523, "learning_rate": 0.0002, "loss": 0.2662, "step": 4600 }, { "epoch": 1.0959542963740236, "grad_norm": 0.44111424684524536, "learning_rate": 0.0002, "loss": 0.2516, "step": 4700 }, { "epoch": 1.1192724728926198, "grad_norm": 0.32735341787338257, "learning_rate": 0.0002, "loss": 0.2652, "step": 4800 }, { "epoch": 1.1425906494112161, "grad_norm": 0.3475642800331116, "learning_rate": 0.0002, "loss": 0.2498, "step": 4900 }, { "epoch": 1.1659088259298123, "grad_norm": 0.41938111186027527, "learning_rate": 0.0002, "loss": 0.2577, "step": 5000 }, { "epoch": 1.1892270024484086, "grad_norm": 0.47618812322616577, "learning_rate": 0.0002, "loss": 0.251, "step": 5100 }, { "epoch": 1.2125451789670048, "grad_norm": 0.27327144145965576, "learning_rate": 0.0002, "loss": 0.2511, "step": 5200 }, { "epoch": 1.2358633554856011, "grad_norm": 0.3251878321170807, "learning_rate": 0.0002, "loss": 0.2264, "step": 5300 }, { "epoch": 1.2591815320041972, "grad_norm": 0.5156410336494446, "learning_rate": 0.0002, "loss": 0.2617, "step": 5400 }, { "epoch": 1.2824997085227934, "grad_norm": 0.30861613154411316, "learning_rate": 0.0002, "loss": 0.2441, "step": 5500 }, { "epoch": 1.3058178850413897, "grad_norm": 0.43310919404029846, "learning_rate": 0.0002, "loss": 0.2331, "step": 5600 }, { "epoch": 1.329136061559986, "grad_norm": 0.36176246404647827, "learning_rate": 0.0002, "loss": 0.2431, "step": 5700 }, { "epoch": 1.3524542380785822, "grad_norm": 0.3790377974510193, "learning_rate": 0.0002, "loss": 0.2458, "step": 5800 }, { "epoch": 1.3757724145971786, "grad_norm": 0.4052121341228485, "learning_rate": 0.0002, "loss": 0.2446, "step": 5900 }, { "epoch": 1.3990905911157747, "grad_norm": 0.35783982276916504, "learning_rate": 0.0002, "loss": 0.2465, "step": 6000 }, { "epoch": 1.422408767634371, "grad_norm": 0.35436511039733887, "learning_rate": 0.0002, "loss": 0.2569, "step": 6100 }, { "epoch": 1.4457269441529672, "grad_norm": 0.2950509488582611, "learning_rate": 0.0002, "loss": 0.22, "step": 6200 }, { "epoch": 1.4690451206715636, "grad_norm": 0.36950767040252686, "learning_rate": 0.0002, "loss": 0.2433, "step": 6300 }, { "epoch": 1.4923632971901597, "grad_norm": 0.35253265500068665, "learning_rate": 0.0002, "loss": 0.2269, "step": 6400 }, { "epoch": 1.515681473708756, "grad_norm": 0.3378414213657379, "learning_rate": 0.0002, "loss": 0.2329, "step": 6500 }, { "epoch": 1.5389996502273522, "grad_norm": 0.4102073311805725, "learning_rate": 0.0002, "loss": 0.2404, "step": 6600 }, { "epoch": 1.5623178267459483, "grad_norm": 0.4430312216281891, "learning_rate": 0.0002, "loss": 0.235, "step": 6700 }, { "epoch": 1.5856360032645447, "grad_norm": 0.3363936245441437, "learning_rate": 0.0002, "loss": 0.2288, "step": 6800 }, { "epoch": 1.608954179783141, "grad_norm": 0.3177776634693146, "learning_rate": 0.0002, "loss": 0.2443, "step": 6900 }, { "epoch": 1.6322723563017372, "grad_norm": 0.33283111453056335, "learning_rate": 0.0002, "loss": 0.2267, "step": 7000 }, { "epoch": 1.6555905328203333, "grad_norm": 0.4799099564552307, "learning_rate": 0.0002, "loss": 0.2355, "step": 7100 }, { "epoch": 1.6789087093389297, "grad_norm": 0.38987642526626587, "learning_rate": 0.0002, "loss": 0.2268, "step": 7200 }, { "epoch": 1.702226885857526, "grad_norm": 0.32820141315460205, "learning_rate": 0.0002, "loss": 0.2098, "step": 7300 }, { "epoch": 1.7255450623761222, "grad_norm": 0.4211929142475128, "learning_rate": 0.0002, "loss": 0.2291, "step": 7400 }, { "epoch": 1.7488632388947183, "grad_norm": 0.42743125557899475, "learning_rate": 0.0002, "loss": 0.2192, "step": 7500 }, { "epoch": 1.7721814154133146, "grad_norm": 0.33759135007858276, "learning_rate": 0.0002, "loss": 0.2301, "step": 7600 }, { "epoch": 1.795499591931911, "grad_norm": 0.24578171968460083, "learning_rate": 0.0002, "loss": 0.2233, "step": 7700 }, { "epoch": 1.8188177684505071, "grad_norm": 0.3331544101238251, "learning_rate": 0.0002, "loss": 0.2308, "step": 7800 }, { "epoch": 1.8421359449691033, "grad_norm": 0.4028831720352173, "learning_rate": 0.0002, "loss": 0.2112, "step": 7900 }, { "epoch": 1.8654541214876996, "grad_norm": 0.3874329924583435, "learning_rate": 0.0002, "loss": 0.1998, "step": 8000 }, { "epoch": 1.888772298006296, "grad_norm": 0.30130070447921753, "learning_rate": 0.0002, "loss": 0.203, "step": 8100 }, { "epoch": 1.9120904745248921, "grad_norm": 0.41124048829078674, "learning_rate": 0.0002, "loss": 0.2184, "step": 8200 }, { "epoch": 1.9354086510434882, "grad_norm": 0.3104913532733917, "learning_rate": 0.0002, "loss": 0.2211, "step": 8300 }, { "epoch": 1.9587268275620846, "grad_norm": 0.30567994713783264, "learning_rate": 0.0002, "loss": 0.2039, "step": 8400 }, { "epoch": 1.982045004080681, "grad_norm": 0.3126045763492584, "learning_rate": 0.0002, "loss": 0.2107, "step": 8500 }, { "epoch": 2.0053631805992773, "grad_norm": 0.29460686445236206, "learning_rate": 0.0002, "loss": 0.1901, "step": 8600 }, { "epoch": 2.0286813571178732, "grad_norm": 0.4113939106464386, "learning_rate": 0.0002, "loss": 0.1621, "step": 8700 }, { "epoch": 2.0519995336364696, "grad_norm": 0.33105671405792236, "learning_rate": 0.0002, "loss": 0.1657, "step": 8800 }, { "epoch": 2.075317710155066, "grad_norm": 0.33191269636154175, "learning_rate": 0.0002, "loss": 0.1773, "step": 8900 }, { "epoch": 2.0986358866736623, "grad_norm": 0.3344513475894928, "learning_rate": 0.0002, "loss": 0.1654, "step": 9000 }, { "epoch": 2.121954063192258, "grad_norm": 0.31760096549987793, "learning_rate": 0.0002, "loss": 0.1677, "step": 9100 }, { "epoch": 2.1452722397108546, "grad_norm": 0.32853373885154724, "learning_rate": 0.0002, "loss": 0.1775, "step": 9200 }, { "epoch": 2.168590416229451, "grad_norm": 0.38260915875434875, "learning_rate": 0.0002, "loss": 0.1644, "step": 9300 }, { "epoch": 2.1919085927480473, "grad_norm": 0.3272022604942322, "learning_rate": 0.0002, "loss": 0.1632, "step": 9400 }, { "epoch": 2.215226769266643, "grad_norm": 0.40181514620780945, "learning_rate": 0.0002, "loss": 0.1672, "step": 9500 }, { "epoch": 2.2385449457852395, "grad_norm": 0.285182923078537, "learning_rate": 0.0002, "loss": 0.1695, "step": 9600 }, { "epoch": 2.261863122303836, "grad_norm": 0.3401045799255371, "learning_rate": 0.0002, "loss": 0.1658, "step": 9700 }, { "epoch": 2.2851812988224323, "grad_norm": 0.45088696479797363, "learning_rate": 0.0002, "loss": 0.173, "step": 9800 }, { "epoch": 2.308499475341028, "grad_norm": 0.09891465306282043, "learning_rate": 0.0002, "loss": 0.1725, "step": 9900 }, { "epoch": 2.3318176518596245, "grad_norm": 0.3077000081539154, "learning_rate": 0.0002, "loss": 0.1777, "step": 10000 }, { "epoch": 2.355135828378221, "grad_norm": 0.2650957703590393, "learning_rate": 0.0002, "loss": 0.1606, "step": 10100 }, { "epoch": 2.3784540048968172, "grad_norm": 0.2967466413974762, "learning_rate": 0.0002, "loss": 0.1626, "step": 10200 }, { "epoch": 2.401772181415413, "grad_norm": 0.21177765727043152, "learning_rate": 0.0002, "loss": 0.1762, "step": 10300 }, { "epoch": 2.4250903579340095, "grad_norm": 0.34562838077545166, "learning_rate": 0.0002, "loss": 0.1653, "step": 10400 }, { "epoch": 2.448408534452606, "grad_norm": 0.2537182569503784, "learning_rate": 0.0002, "loss": 0.1722, "step": 10500 }, { "epoch": 2.4717267109712022, "grad_norm": 0.22955211997032166, "learning_rate": 0.0002, "loss": 0.1713, "step": 10600 }, { "epoch": 2.495044887489798, "grad_norm": 0.3709162175655365, "learning_rate": 0.0002, "loss": 0.1679, "step": 10700 }, { "epoch": 2.5183630640083945, "grad_norm": 0.24581150710582733, "learning_rate": 0.0002, "loss": 0.1604, "step": 10800 }, { "epoch": 2.541681240526991, "grad_norm": 0.20854513347148895, "learning_rate": 0.0002, "loss": 0.1687, "step": 10900 }, { "epoch": 2.5649994170455868, "grad_norm": 0.2496633380651474, "learning_rate": 0.0002, "loss": 0.163, "step": 11000 }, { "epoch": 2.588317593564183, "grad_norm": 0.23603980243206024, "learning_rate": 0.0002, "loss": 0.1748, "step": 11100 }, { "epoch": 2.6116357700827795, "grad_norm": 0.36322489380836487, "learning_rate": 0.0002, "loss": 0.1798, "step": 11200 }, { "epoch": 2.634953946601376, "grad_norm": 0.32981303334236145, "learning_rate": 0.0002, "loss": 0.1588, "step": 11300 }, { "epoch": 2.658272123119972, "grad_norm": 0.4760492742061615, "learning_rate": 0.0002, "loss": 0.1723, "step": 11400 }, { "epoch": 2.681590299638568, "grad_norm": 0.22435927391052246, "learning_rate": 0.0002, "loss": 0.1742, "step": 11500 }, { "epoch": 2.7049084761571645, "grad_norm": 0.2695131003856659, "learning_rate": 0.0002, "loss": 0.1602, "step": 11600 }, { "epoch": 2.728226652675761, "grad_norm": 0.16897708177566528, "learning_rate": 0.0002, "loss": 0.1698, "step": 11700 }, { "epoch": 2.751544829194357, "grad_norm": 0.2540949881076813, "learning_rate": 0.0002, "loss": 0.1641, "step": 11800 }, { "epoch": 2.7748630057129535, "grad_norm": 0.40854746103286743, "learning_rate": 0.0002, "loss": 0.1747, "step": 11900 }, { "epoch": 2.7981811822315494, "grad_norm": 0.3012579679489136, "learning_rate": 0.0002, "loss": 0.1619, "step": 12000 }, { "epoch": 2.821499358750146, "grad_norm": 0.18468593060970306, "learning_rate": 0.0002, "loss": 0.1686, "step": 12100 }, { "epoch": 2.844817535268742, "grad_norm": 0.3668818175792694, "learning_rate": 0.0002, "loss": 0.1588, "step": 12200 }, { "epoch": 2.868135711787338, "grad_norm": 0.5856422185897827, "learning_rate": 0.0002, "loss": 0.1784, "step": 12300 }, { "epoch": 2.8914538883059344, "grad_norm": 0.37487712502479553, "learning_rate": 0.0002, "loss": 0.1701, "step": 12400 }, { "epoch": 2.9147720648245308, "grad_norm": 0.29282090067863464, "learning_rate": 0.0002, "loss": 0.1613, "step": 12500 }, { "epoch": 2.938090241343127, "grad_norm": 0.306607186794281, "learning_rate": 0.0002, "loss": 0.1655, "step": 12600 }, { "epoch": 2.9614084178617235, "grad_norm": 0.1990358531475067, "learning_rate": 0.0002, "loss": 0.17, "step": 12700 }, { "epoch": 2.9847265943803194, "grad_norm": 0.4855429232120514, "learning_rate": 0.0002, "loss": 0.1722, "step": 12800 }, { "epoch": 3.0080447708989158, "grad_norm": 0.39795544743537903, "learning_rate": 0.0002, "loss": 0.1548, "step": 12900 }, { "epoch": 3.031362947417512, "grad_norm": 0.3113553524017334, "learning_rate": 0.0002, "loss": 0.1396, "step": 13000 }, { "epoch": 3.054681123936108, "grad_norm": 0.3086554706096649, "learning_rate": 0.0002, "loss": 0.1364, "step": 13100 }, { "epoch": 3.0779993004547044, "grad_norm": 0.24818335473537445, "learning_rate": 0.0002, "loss": 0.1414, "step": 13200 }, { "epoch": 3.1013174769733007, "grad_norm": 0.37954941391944885, "learning_rate": 0.0002, "loss": 0.1388, "step": 13300 }, { "epoch": 3.124635653491897, "grad_norm": 0.2943727672100067, "learning_rate": 0.0002, "loss": 0.1408, "step": 13400 }, { "epoch": 3.147953830010493, "grad_norm": 0.35590696334838867, "learning_rate": 0.0002, "loss": 0.1363, "step": 13500 }, { "epoch": 3.1712720065290894, "grad_norm": 0.19578373432159424, "learning_rate": 0.0002, "loss": 0.137, "step": 13600 }, { "epoch": 3.1945901830476857, "grad_norm": 0.25028303265571594, "learning_rate": 0.0002, "loss": 0.1348, "step": 13700 }, { "epoch": 3.217908359566282, "grad_norm": 0.18405300378799438, "learning_rate": 0.0002, "loss": 0.1372, "step": 13800 }, { "epoch": 3.241226536084878, "grad_norm": 0.31417056918144226, "learning_rate": 0.0002, "loss": 0.1428, "step": 13900 }, { "epoch": 3.2645447126034743, "grad_norm": 0.22496923804283142, "learning_rate": 0.0002, "loss": 0.1378, "step": 14000 }, { "epoch": 3.2878628891220707, "grad_norm": 0.23862232267856598, "learning_rate": 0.0002, "loss": 0.1362, "step": 14100 }, { "epoch": 3.311181065640667, "grad_norm": 0.2142096310853958, "learning_rate": 0.0002, "loss": 0.139, "step": 14200 }, { "epoch": 3.334499242159263, "grad_norm": 0.2794269025325775, "learning_rate": 0.0002, "loss": 0.1376, "step": 14300 }, { "epoch": 3.3578174186778593, "grad_norm": 0.14498618245124817, "learning_rate": 0.0002, "loss": 0.1416, "step": 14400 }, { "epoch": 3.3811355951964557, "grad_norm": 0.2895399332046509, "learning_rate": 0.0002, "loss": 0.1379, "step": 14500 }, { "epoch": 3.404453771715052, "grad_norm": 0.2537992000579834, "learning_rate": 0.0002, "loss": 0.1356, "step": 14600 }, { "epoch": 3.427771948233648, "grad_norm": 0.20395183563232422, "learning_rate": 0.0002, "loss": 0.1424, "step": 14700 }, { "epoch": 3.4510901247522443, "grad_norm": 0.15283405780792236, "learning_rate": 0.0002, "loss": 0.1395, "step": 14800 }, { "epoch": 3.4744083012708407, "grad_norm": 0.4268224537372589, "learning_rate": 0.0002, "loss": 0.1359, "step": 14900 }, { "epoch": 3.497726477789437, "grad_norm": 0.22292669117450714, "learning_rate": 0.0002, "loss": 0.1386, "step": 15000 }, { "epoch": 3.5210446543080334, "grad_norm": 0.11900927871465683, "learning_rate": 0.0002, "loss": 0.1442, "step": 15100 }, { "epoch": 3.5443628308266293, "grad_norm": 0.45133286714553833, "learning_rate": 0.0002, "loss": 0.1365, "step": 15200 }, { "epoch": 3.5676810073452256, "grad_norm": 0.30186957120895386, "learning_rate": 0.0002, "loss": 0.1416, "step": 15300 }, { "epoch": 3.590999183863822, "grad_norm": 0.31408384442329407, "learning_rate": 0.0002, "loss": 0.1387, "step": 15400 }, { "epoch": 3.614317360382418, "grad_norm": 0.36072710156440735, "learning_rate": 0.0002, "loss": 0.1428, "step": 15500 }, { "epoch": 3.6376355369010143, "grad_norm": 0.28984448313713074, "learning_rate": 0.0002, "loss": 0.1393, "step": 15600 }, { "epoch": 3.6609537134196106, "grad_norm": 0.2014656662940979, "learning_rate": 0.0002, "loss": 0.1435, "step": 15700 }, { "epoch": 3.684271889938207, "grad_norm": 0.41273656487464905, "learning_rate": 0.0002, "loss": 0.1369, "step": 15800 }, { "epoch": 3.7075900664568033, "grad_norm": 0.48672163486480713, "learning_rate": 0.0002, "loss": 0.1433, "step": 15900 }, { "epoch": 3.7309082429753992, "grad_norm": 0.19120950996875763, "learning_rate": 0.0002, "loss": 0.1405, "step": 16000 }, { "epoch": 3.7542264194939956, "grad_norm": 0.19792740046977997, "learning_rate": 0.0002, "loss": 0.1451, "step": 16100 }, { "epoch": 3.777544596012592, "grad_norm": 0.14919213950634003, "learning_rate": 0.0002, "loss": 0.1382, "step": 16200 }, { "epoch": 3.800862772531188, "grad_norm": 0.4650104343891144, "learning_rate": 0.0002, "loss": 0.1339, "step": 16300 }, { "epoch": 3.8241809490497842, "grad_norm": 0.3627985417842865, "learning_rate": 0.0002, "loss": 0.1422, "step": 16400 }, { "epoch": 3.8474991255683806, "grad_norm": 0.7782896161079407, "learning_rate": 0.0002, "loss": 0.1432, "step": 16500 }, { "epoch": 3.870817302086977, "grad_norm": 0.2858645021915436, "learning_rate": 0.0002, "loss": 0.1413, "step": 16600 }, { "epoch": 3.8941354786055733, "grad_norm": 0.22150644659996033, "learning_rate": 0.0002, "loss": 0.1437, "step": 16700 }, { "epoch": 3.917453655124169, "grad_norm": 0.3596114218235016, "learning_rate": 0.0002, "loss": 0.1463, "step": 16800 }, { "epoch": 3.9407718316427656, "grad_norm": 0.14949366450309753, "learning_rate": 0.0002, "loss": 0.1449, "step": 16900 }, { "epoch": 3.964090008161362, "grad_norm": 0.32889851927757263, "learning_rate": 0.0002, "loss": 0.1396, "step": 17000 }, { "epoch": 3.987408184679958, "grad_norm": 0.1940721869468689, "learning_rate": 0.0002, "loss": 0.14, "step": 17100 }, { "epoch": 4.010726361198555, "grad_norm": 0.1328798085451126, "learning_rate": 0.0002, "loss": 0.1316, "step": 17200 }, { "epoch": 4.0340445377171505, "grad_norm": 0.09979192912578583, "learning_rate": 0.0002, "loss": 0.1224, "step": 17300 }, { "epoch": 4.0573627142357465, "grad_norm": 0.22828274965286255, "learning_rate": 0.0002, "loss": 0.1184, "step": 17400 }, { "epoch": 4.080680890754343, "grad_norm": 0.1396108716726303, "learning_rate": 0.0002, "loss": 0.1189, "step": 17500 }, { "epoch": 4.103999067272939, "grad_norm": 0.1849929839372635, "learning_rate": 0.0002, "loss": 0.1231, "step": 17600 }, { "epoch": 4.127317243791535, "grad_norm": 0.14947502315044403, "learning_rate": 0.0002, "loss": 0.1158, "step": 17700 }, { "epoch": 4.150635420310132, "grad_norm": 0.3471536934375763, "learning_rate": 0.0002, "loss": 0.1204, "step": 17800 }, { "epoch": 4.173953596828728, "grad_norm": 0.23290419578552246, "learning_rate": 0.0002, "loss": 0.1175, "step": 17900 }, { "epoch": 4.197271773347325, "grad_norm": 0.17477743327617645, "learning_rate": 0.0002, "loss": 0.1205, "step": 18000 }, { "epoch": 4.2205899498659205, "grad_norm": 0.1214243695139885, "learning_rate": 0.0002, "loss": 0.1188, "step": 18100 }, { "epoch": 4.243908126384516, "grad_norm": 0.12706777453422546, "learning_rate": 0.0002, "loss": 0.1196, "step": 18200 }, { "epoch": 4.267226302903113, "grad_norm": 0.18115375936031342, "learning_rate": 0.0002, "loss": 0.1179, "step": 18300 }, { "epoch": 4.290544479421709, "grad_norm": 0.05149231478571892, "learning_rate": 0.0002, "loss": 0.1224, "step": 18400 }, { "epoch": 4.313862655940305, "grad_norm": 0.47274354100227356, "learning_rate": 0.0002, "loss": 0.1192, "step": 18500 }, { "epoch": 4.337180832458902, "grad_norm": 0.218338742852211, "learning_rate": 0.0002, "loss": 0.1244, "step": 18600 }, { "epoch": 4.360499008977498, "grad_norm": 0.1247347891330719, "learning_rate": 0.0002, "loss": 0.1267, "step": 18700 }, { "epoch": 4.383817185496095, "grad_norm": 0.2586764991283417, "learning_rate": 0.0002, "loss": 0.1236, "step": 18800 }, { "epoch": 4.4071353620146905, "grad_norm": 0.11474807560443878, "learning_rate": 0.0002, "loss": 0.1252, "step": 18900 }, { "epoch": 4.430453538533286, "grad_norm": 0.34646329283714294, "learning_rate": 0.0002, "loss": 0.1237, "step": 19000 }, { "epoch": 4.453771715051883, "grad_norm": 0.17445826530456543, "learning_rate": 0.0002, "loss": 0.1183, "step": 19100 }, { "epoch": 4.477089891570479, "grad_norm": 0.3867531716823578, "learning_rate": 0.0002, "loss": 0.1248, "step": 19200 }, { "epoch": 4.500408068089076, "grad_norm": 0.15927106142044067, "learning_rate": 0.0002, "loss": 0.1258, "step": 19300 }, { "epoch": 4.523726244607672, "grad_norm": 0.2284346967935562, "learning_rate": 0.0002, "loss": 0.1244, "step": 19400 }, { "epoch": 4.547044421126268, "grad_norm": 0.3231777250766754, "learning_rate": 0.0002, "loss": 0.1257, "step": 19500 }, { "epoch": 4.5703625976448645, "grad_norm": 0.10116703063249588, "learning_rate": 0.0002, "loss": 0.1293, "step": 19600 }, { "epoch": 4.59368077416346, "grad_norm": 0.2922173738479614, "learning_rate": 0.0002, "loss": 0.1262, "step": 19700 }, { "epoch": 4.616998950682056, "grad_norm": 0.1958065629005432, "learning_rate": 0.0002, "loss": 0.1258, "step": 19800 }, { "epoch": 4.640317127200653, "grad_norm": 0.08755222707986832, "learning_rate": 0.0002, "loss": 0.1293, "step": 19900 }, { "epoch": 4.663635303719249, "grad_norm": 0.1416950523853302, "learning_rate": 0.0002, "loss": 0.1227, "step": 20000 }, { "epoch": 4.686953480237845, "grad_norm": 0.21383579075336456, "learning_rate": 0.0002, "loss": 0.1272, "step": 20100 }, { "epoch": 4.710271656756442, "grad_norm": 0.27910149097442627, "learning_rate": 0.0002, "loss": 0.1298, "step": 20200 }, { "epoch": 4.733589833275038, "grad_norm": 0.07715137302875519, "learning_rate": 0.0002, "loss": 0.1266, "step": 20300 }, { "epoch": 4.7569080097936345, "grad_norm": 0.08127077668905258, "learning_rate": 0.0002, "loss": 0.1269, "step": 20400 }, { "epoch": 4.78022618631223, "grad_norm": 0.3075973391532898, "learning_rate": 0.0002, "loss": 0.1308, "step": 20500 }, { "epoch": 4.803544362830826, "grad_norm": 0.23989351093769073, "learning_rate": 0.0002, "loss": 0.1217, "step": 20600 }, { "epoch": 4.826862539349423, "grad_norm": 0.1361120343208313, "learning_rate": 0.0002, "loss": 0.1237, "step": 20700 }, { "epoch": 4.850180715868019, "grad_norm": 0.3711351156234741, "learning_rate": 0.0002, "loss": 0.1248, "step": 20800 }, { "epoch": 4.873498892386616, "grad_norm": 0.3196912109851837, "learning_rate": 0.0002, "loss": 0.1236, "step": 20900 }, { "epoch": 4.896817068905212, "grad_norm": 0.10089880973100662, "learning_rate": 0.0002, "loss": 0.1248, "step": 21000 } ], "logging_steps": 100, "max_steps": 21440, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9056254817400013e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }