{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.12197420799929033, "eval_steps": 500, "global_step": 11000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.2580924034118652, "learning_rate": 2.2172949002217296e-06, "loss": 10.2933, "step": 50 }, { "epoch": 0.0, "grad_norm": 2.4347386360168457, "learning_rate": 4.434589800443459e-06, "loss": 10.1894, "step": 100 }, { "epoch": 0.0, "grad_norm": 2.3895885944366455, "learning_rate": 6.651884700665188e-06, "loss": 10.1424, "step": 150 }, { "epoch": 0.0, "grad_norm": 2.129647731781006, "learning_rate": 8.869179600886918e-06, "loss": 10.0995, "step": 200 }, { "epoch": 0.0, "grad_norm": 2.3564186096191406, "learning_rate": 1.1086474501108649e-05, "loss": 10.0479, "step": 250 }, { "epoch": 0.0, "grad_norm": 1.830551028251648, "learning_rate": 1.3303769401330377e-05, "loss": 9.9971, "step": 300 }, { "epoch": 0.0, "grad_norm": 2.1173911094665527, "learning_rate": 1.5521064301552106e-05, "loss": 9.9201, "step": 350 }, { "epoch": 0.0, "grad_norm": 1.6636557579040527, "learning_rate": 1.7738359201773837e-05, "loss": 9.8562, "step": 400 }, { "epoch": 0.0, "grad_norm": 2.4503839015960693, "learning_rate": 1.9955654101995567e-05, "loss": 9.7599, "step": 450 }, { "epoch": 0.01, "grad_norm": 1.822424054145813, "learning_rate": 2.2172949002217298e-05, "loss": 9.6608, "step": 500 }, { "epoch": 0.01, "grad_norm": 1.6598998308181763, "learning_rate": 2.4390243902439026e-05, "loss": 9.55, "step": 550 }, { "epoch": 0.01, "grad_norm": 1.8471707105636597, "learning_rate": 2.6607538802660753e-05, "loss": 9.4606, "step": 600 }, { "epoch": 0.01, "grad_norm": 1.4833533763885498, "learning_rate": 2.8824833702882487e-05, "loss": 9.3283, "step": 650 }, { "epoch": 0.01, "grad_norm": 1.688541054725647, "learning_rate": 3.104212860310421e-05, "loss": 9.2229, "step": 700 }, { "epoch": 0.01, "grad_norm": 1.6466543674468994, "learning_rate": 3.325942350332594e-05, "loss": 9.1093, "step": 750 }, { "epoch": 0.01, "grad_norm": 1.4169293642044067, "learning_rate": 3.547671840354767e-05, "loss": 8.9703, "step": 800 }, { "epoch": 0.01, "grad_norm": 1.7079193592071533, "learning_rate": 3.7694013303769404e-05, "loss": 8.8351, "step": 850 }, { "epoch": 0.01, "grad_norm": 1.5513204336166382, "learning_rate": 3.9911308203991135e-05, "loss": 8.7111, "step": 900 }, { "epoch": 0.01, "grad_norm": 1.485573172569275, "learning_rate": 4.212860310421286e-05, "loss": 8.5627, "step": 950 }, { "epoch": 0.01, "grad_norm": 1.511690616607666, "learning_rate": 4.4345898004434597e-05, "loss": 8.5042, "step": 1000 }, { "epoch": 0.01, "grad_norm": 2.1478614807128906, "learning_rate": 4.656319290465632e-05, "loss": 8.3287, "step": 1050 }, { "epoch": 0.01, "grad_norm": 1.4060652256011963, "learning_rate": 4.878048780487805e-05, "loss": 8.2341, "step": 1100 }, { "epoch": 0.01, "grad_norm": 1.3950035572052002, "learning_rate": 5.099778270509978e-05, "loss": 8.1277, "step": 1150 }, { "epoch": 0.01, "grad_norm": 1.5197688341140747, "learning_rate": 5.3215077605321506e-05, "loss": 8.0311, "step": 1200 }, { "epoch": 0.01, "grad_norm": 1.3406693935394287, "learning_rate": 5.543237250554324e-05, "loss": 7.9824, "step": 1250 }, { "epoch": 0.01, "grad_norm": 1.4520119428634644, "learning_rate": 5.7649667405764975e-05, "loss": 7.9948, "step": 1300 }, { "epoch": 0.01, "grad_norm": 1.179124116897583, "learning_rate": 5.98669623059867e-05, "loss": 7.9144, "step": 1350 }, { "epoch": 0.02, "grad_norm": 1.4039533138275146, "learning_rate": 6.208425720620842e-05, "loss": 7.8768, "step": 1400 }, { "epoch": 0.02, "grad_norm": 1.5542700290679932, "learning_rate": 6.430155210643016e-05, "loss": 7.894, "step": 1450 }, { "epoch": 0.02, "grad_norm": 1.4150550365447998, "learning_rate": 6.651884700665188e-05, "loss": 7.8409, "step": 1500 }, { "epoch": 0.02, "grad_norm": 1.6647827625274658, "learning_rate": 6.873614190687362e-05, "loss": 7.91, "step": 1550 }, { "epoch": 0.02, "grad_norm": 1.7795697450637817, "learning_rate": 7.095343680709535e-05, "loss": 7.8256, "step": 1600 }, { "epoch": 0.02, "grad_norm": 1.933110237121582, "learning_rate": 7.317073170731707e-05, "loss": 7.8463, "step": 1650 }, { "epoch": 0.02, "grad_norm": 1.1942570209503174, "learning_rate": 7.538802660753881e-05, "loss": 7.7827, "step": 1700 }, { "epoch": 0.02, "grad_norm": 1.6759297847747803, "learning_rate": 7.760532150776053e-05, "loss": 7.8, "step": 1750 }, { "epoch": 0.02, "grad_norm": 1.093256950378418, "learning_rate": 7.982261640798227e-05, "loss": 7.7461, "step": 1800 }, { "epoch": 0.02, "grad_norm": 1.567872166633606, "learning_rate": 8.2039911308204e-05, "loss": 7.7338, "step": 1850 }, { "epoch": 0.02, "grad_norm": 1.3017679452896118, "learning_rate": 8.425720620842572e-05, "loss": 7.804, "step": 1900 }, { "epoch": 0.02, "grad_norm": 1.7510960102081299, "learning_rate": 8.647450110864746e-05, "loss": 7.7405, "step": 1950 }, { "epoch": 0.02, "grad_norm": 1.7215120792388916, "learning_rate": 8.869179600886919e-05, "loss": 7.7429, "step": 2000 }, { "epoch": 0.02, "grad_norm": 1.6202715635299683, "learning_rate": 9.090909090909092e-05, "loss": 7.6588, "step": 2050 }, { "epoch": 0.02, "grad_norm": 1.5680756568908691, "learning_rate": 9.312638580931264e-05, "loss": 7.6224, "step": 2100 }, { "epoch": 0.02, "grad_norm": 1.462240219116211, "learning_rate": 9.534368070953438e-05, "loss": 7.6851, "step": 2150 }, { "epoch": 0.02, "grad_norm": 2.2018320560455322, "learning_rate": 9.75609756097561e-05, "loss": 7.6443, "step": 2200 }, { "epoch": 0.02, "grad_norm": 1.9520208835601807, "learning_rate": 9.977827050997783e-05, "loss": 7.6456, "step": 2250 }, { "epoch": 0.03, "grad_norm": 1.115421175956726, "learning_rate": 0.00010199556541019956, "loss": 7.5894, "step": 2300 }, { "epoch": 0.03, "grad_norm": 1.6002250909805298, "learning_rate": 0.0001042128603104213, "loss": 7.6017, "step": 2350 }, { "epoch": 0.03, "grad_norm": 1.6516796350479126, "learning_rate": 0.00010643015521064301, "loss": 7.4548, "step": 2400 }, { "epoch": 0.03, "grad_norm": 2.2168257236480713, "learning_rate": 0.00010864745011086475, "loss": 7.5867, "step": 2450 }, { "epoch": 0.03, "grad_norm": 1.5447593927383423, "learning_rate": 0.00011086474501108647, "loss": 7.5317, "step": 2500 }, { "epoch": 0.03, "grad_norm": 1.6840906143188477, "learning_rate": 0.00011308203991130821, "loss": 7.5127, "step": 2550 }, { "epoch": 0.03, "grad_norm": 1.2965503931045532, "learning_rate": 0.00011529933481152995, "loss": 7.4911, "step": 2600 }, { "epoch": 0.03, "grad_norm": 1.643584966659546, "learning_rate": 0.00011751662971175166, "loss": 7.4416, "step": 2650 }, { "epoch": 0.03, "grad_norm": 1.5419111251831055, "learning_rate": 0.0001197339246119734, "loss": 7.4944, "step": 2700 }, { "epoch": 0.03, "grad_norm": 1.7774205207824707, "learning_rate": 0.00012195121951219512, "loss": 7.4244, "step": 2750 }, { "epoch": 0.03, "grad_norm": 2.1709322929382324, "learning_rate": 0.00012416851441241685, "loss": 7.371, "step": 2800 }, { "epoch": 0.03, "grad_norm": 1.5503411293029785, "learning_rate": 0.0001263858093126386, "loss": 7.3031, "step": 2850 }, { "epoch": 0.03, "grad_norm": 1.7744035720825195, "learning_rate": 0.00012860310421286032, "loss": 7.3338, "step": 2900 }, { "epoch": 0.03, "grad_norm": 2.2014000415802, "learning_rate": 0.00013082039911308205, "loss": 7.2962, "step": 2950 }, { "epoch": 0.03, "grad_norm": 1.6716220378875732, "learning_rate": 0.00013303769401330377, "loss": 7.3348, "step": 3000 }, { "epoch": 0.03, "grad_norm": 1.7045074701309204, "learning_rate": 0.0001352549889135255, "loss": 7.2864, "step": 3050 }, { "epoch": 0.03, "grad_norm": 1.8933771848678589, "learning_rate": 0.00013747228381374724, "loss": 7.2744, "step": 3100 }, { "epoch": 0.03, "grad_norm": 2.298779249191284, "learning_rate": 0.00013968957871396897, "loss": 7.2472, "step": 3150 }, { "epoch": 0.04, "grad_norm": 1.3420922756195068, "learning_rate": 0.0001419068736141907, "loss": 7.3019, "step": 3200 }, { "epoch": 0.04, "grad_norm": 1.9339039325714111, "learning_rate": 0.00014412416851441242, "loss": 7.2982, "step": 3250 }, { "epoch": 0.04, "grad_norm": 2.69667387008667, "learning_rate": 0.00014634146341463414, "loss": 7.2851, "step": 3300 }, { "epoch": 0.04, "grad_norm": 2.3124189376831055, "learning_rate": 0.0001485587583148559, "loss": 7.258, "step": 3350 }, { "epoch": 0.04, "grad_norm": 1.975651741027832, "learning_rate": 0.00015077605321507762, "loss": 7.1275, "step": 3400 }, { "epoch": 0.04, "grad_norm": 1.9704022407531738, "learning_rate": 0.00015299334811529934, "loss": 7.1473, "step": 3450 }, { "epoch": 0.04, "grad_norm": 2.5047757625579834, "learning_rate": 0.00015521064301552106, "loss": 7.1096, "step": 3500 }, { "epoch": 0.04, "grad_norm": 1.5465894937515259, "learning_rate": 0.0001574279379157428, "loss": 7.1501, "step": 3550 }, { "epoch": 0.04, "grad_norm": 1.9557933807373047, "learning_rate": 0.00015964523281596454, "loss": 7.2033, "step": 3600 }, { "epoch": 0.04, "grad_norm": 2.420116424560547, "learning_rate": 0.00016186252771618626, "loss": 7.1275, "step": 3650 }, { "epoch": 0.04, "grad_norm": 2.114737033843994, "learning_rate": 0.000164079822616408, "loss": 7.0932, "step": 3700 }, { "epoch": 0.04, "grad_norm": 2.3085389137268066, "learning_rate": 0.00016629711751662974, "loss": 7.0311, "step": 3750 }, { "epoch": 0.04, "grad_norm": 2.5679140090942383, "learning_rate": 0.00016851441241685144, "loss": 6.9168, "step": 3800 }, { "epoch": 0.04, "grad_norm": 1.8611838817596436, "learning_rate": 0.0001707317073170732, "loss": 7.0085, "step": 3850 }, { "epoch": 0.04, "grad_norm": 1.8603994846343994, "learning_rate": 0.0001729490022172949, "loss": 6.9432, "step": 3900 }, { "epoch": 0.04, "grad_norm": 2.4244627952575684, "learning_rate": 0.00017516629711751663, "loss": 6.9333, "step": 3950 }, { "epoch": 0.04, "grad_norm": 2.177870750427246, "learning_rate": 0.00017738359201773839, "loss": 6.9499, "step": 4000 }, { "epoch": 0.04, "grad_norm": 1.9320554733276367, "learning_rate": 0.00017960088691796008, "loss": 6.8204, "step": 4050 }, { "epoch": 0.05, "grad_norm": 1.5062849521636963, "learning_rate": 0.00018181818181818183, "loss": 6.9505, "step": 4100 }, { "epoch": 0.05, "grad_norm": 2.9272422790527344, "learning_rate": 0.00018403547671840356, "loss": 6.8701, "step": 4150 }, { "epoch": 0.05, "grad_norm": 2.0309596061706543, "learning_rate": 0.00018625277161862528, "loss": 6.924, "step": 4200 }, { "epoch": 0.05, "grad_norm": 2.0265886783599854, "learning_rate": 0.00018847006651884703, "loss": 6.9223, "step": 4250 }, { "epoch": 0.05, "grad_norm": 2.5160486698150635, "learning_rate": 0.00019068736141906876, "loss": 6.8708, "step": 4300 }, { "epoch": 0.05, "grad_norm": 2.613301992416382, "learning_rate": 0.00019290465631929045, "loss": 6.8937, "step": 4350 }, { "epoch": 0.05, "grad_norm": 2.3031229972839355, "learning_rate": 0.0001951219512195122, "loss": 6.8337, "step": 4400 }, { "epoch": 0.05, "grad_norm": 2.54779052734375, "learning_rate": 0.00019733924611973393, "loss": 6.8334, "step": 4450 }, { "epoch": 0.05, "grad_norm": 2.8277971744537354, "learning_rate": 0.00019955654101995565, "loss": 6.7925, "step": 4500 }, { "epoch": 0.05, "grad_norm": 2.0113885402679443, "learning_rate": 0.00019999989242739025, "loss": 6.8458, "step": 4550 }, { "epoch": 0.05, "grad_norm": 2.2395377159118652, "learning_rate": 0.00019999945541405976, "loss": 6.6251, "step": 4600 }, { "epoch": 0.05, "grad_norm": 2.445993423461914, "learning_rate": 0.0001999986822381884, "loss": 6.8099, "step": 4650 }, { "epoch": 0.05, "grad_norm": 4.077752590179443, "learning_rate": 0.0001999975729023753, "loss": 6.8053, "step": 4700 }, { "epoch": 0.05, "grad_norm": 3.167569875717163, "learning_rate": 0.00019999612741034963, "loss": 6.7706, "step": 4750 }, { "epoch": 0.05, "grad_norm": 1.893659234046936, "learning_rate": 0.00019999434576697066, "loss": 6.8245, "step": 4800 }, { "epoch": 0.05, "grad_norm": 3.6101326942443848, "learning_rate": 0.00019999222797822762, "loss": 6.7407, "step": 4850 }, { "epoch": 0.05, "grad_norm": 2.2858726978302, "learning_rate": 0.00019998977405123974, "loss": 6.74, "step": 4900 }, { "epoch": 0.05, "grad_norm": 1.9325459003448486, "learning_rate": 0.0001999869839942563, "loss": 6.716, "step": 4950 }, { "epoch": 0.06, "grad_norm": 2.0043437480926514, "learning_rate": 0.00019998385781665643, "loss": 6.6003, "step": 5000 }, { "epoch": 0.06, "grad_norm": 4.151523113250732, "learning_rate": 0.00019998039552894924, "loss": 6.6801, "step": 5050 }, { "epoch": 0.06, "grad_norm": 3.8407771587371826, "learning_rate": 0.00019997659714277372, "loss": 6.608, "step": 5100 }, { "epoch": 0.06, "grad_norm": 2.230713129043579, "learning_rate": 0.00019997246267089867, "loss": 6.6479, "step": 5150 }, { "epoch": 0.06, "grad_norm": 2.2546942234039307, "learning_rate": 0.0001999679921272227, "loss": 6.6548, "step": 5200 }, { "epoch": 0.06, "grad_norm": 3.180986166000366, "learning_rate": 0.00019996318552677425, "loss": 6.6851, "step": 5250 }, { "epoch": 0.06, "grad_norm": 2.341231346130371, "learning_rate": 0.00019995804288571134, "loss": 6.547, "step": 5300 }, { "epoch": 0.06, "grad_norm": 3.1117124557495117, "learning_rate": 0.00019995256422132172, "loss": 6.7072, "step": 5350 }, { "epoch": 0.06, "grad_norm": 2.0082530975341797, "learning_rate": 0.0001999467495520227, "loss": 6.5422, "step": 5400 }, { "epoch": 0.06, "grad_norm": 2.409489870071411, "learning_rate": 0.0001999405988973611, "loss": 6.3716, "step": 5450 }, { "epoch": 0.06, "grad_norm": 2.649052381515503, "learning_rate": 0.00019993411227801328, "loss": 6.6434, "step": 5500 }, { "epoch": 0.06, "grad_norm": 3.081116199493408, "learning_rate": 0.00019992728971578492, "loss": 6.4624, "step": 5550 }, { "epoch": 0.06, "grad_norm": 3.1578280925750732, "learning_rate": 0.00019992013123361102, "loss": 6.5416, "step": 5600 }, { "epoch": 0.06, "grad_norm": 3.7874557971954346, "learning_rate": 0.0001999126368555559, "loss": 6.4512, "step": 5650 }, { "epoch": 0.06, "grad_norm": 2.7693099975585938, "learning_rate": 0.00019990480660681293, "loss": 6.5105, "step": 5700 }, { "epoch": 0.06, "grad_norm": 2.4338185787200928, "learning_rate": 0.00019989680712666593, "loss": 6.5092, "step": 5750 }, { "epoch": 0.06, "grad_norm": 3.656937837600708, "learning_rate": 0.00019988831193270577, "loss": 6.4269, "step": 5800 }, { "epoch": 0.06, "grad_norm": 2.857292652130127, "learning_rate": 0.00019987948094982952, "loss": 6.4387, "step": 5850 }, { "epoch": 0.07, "grad_norm": 3.4963467121124268, "learning_rate": 0.00019987031420772385, "loss": 6.3851, "step": 5900 }, { "epoch": 0.07, "grad_norm": 2.602522611618042, "learning_rate": 0.00019986081173720396, "loss": 6.3413, "step": 5950 }, { "epoch": 0.07, "grad_norm": 2.6455273628234863, "learning_rate": 0.00019985097357021385, "loss": 6.2965, "step": 6000 }, { "epoch": 0.07, "grad_norm": 3.5592167377471924, "learning_rate": 0.0001998407997398259, "loss": 6.4293, "step": 6050 }, { "epoch": 0.07, "grad_norm": 3.6016533374786377, "learning_rate": 0.00019983029028024094, "loss": 6.2897, "step": 6100 }, { "epoch": 0.07, "grad_norm": 2.5536839962005615, "learning_rate": 0.000199819445226788, "loss": 6.3157, "step": 6150 }, { "epoch": 0.07, "grad_norm": 2.0514349937438965, "learning_rate": 0.00019980826461592427, "loss": 6.3847, "step": 6200 }, { "epoch": 0.07, "grad_norm": 2.72495174407959, "learning_rate": 0.00019979674848523505, "loss": 6.3517, "step": 6250 }, { "epoch": 0.07, "grad_norm": 2.4264872074127197, "learning_rate": 0.00019978489687343335, "loss": 6.2533, "step": 6300 }, { "epoch": 0.07, "grad_norm": 2.8361423015594482, "learning_rate": 0.0001997727098203602, "loss": 6.3654, "step": 6350 }, { "epoch": 0.07, "grad_norm": 2.9690892696380615, "learning_rate": 0.00019976018736698404, "loss": 6.3968, "step": 6400 }, { "epoch": 0.07, "grad_norm": 2.6132867336273193, "learning_rate": 0.0001997473295554009, "loss": 6.3444, "step": 6450 }, { "epoch": 0.07, "grad_norm": 4.820697784423828, "learning_rate": 0.00019973413642883424, "loss": 6.2019, "step": 6500 }, { "epoch": 0.07, "grad_norm": 2.2316782474517822, "learning_rate": 0.00019972060803163458, "loss": 6.2049, "step": 6550 }, { "epoch": 0.07, "grad_norm": 3.9528305530548096, "learning_rate": 0.00019970674440927957, "loss": 6.1718, "step": 6600 }, { "epoch": 0.07, "grad_norm": 1.891073226928711, "learning_rate": 0.0001996925456083738, "loss": 6.2393, "step": 6650 }, { "epoch": 0.07, "grad_norm": 2.813270092010498, "learning_rate": 0.00019967801167664853, "loss": 6.2116, "step": 6700 }, { "epoch": 0.07, "grad_norm": 2.2726826667785645, "learning_rate": 0.00019966314266296173, "loss": 6.1521, "step": 6750 }, { "epoch": 0.08, "grad_norm": 2.3895318508148193, "learning_rate": 0.00019964793861729772, "loss": 6.1072, "step": 6800 }, { "epoch": 0.08, "grad_norm": 3.190431833267212, "learning_rate": 0.000199632399590767, "loss": 6.2009, "step": 6850 }, { "epoch": 0.08, "grad_norm": 3.79266095161438, "learning_rate": 0.00019961652563560634, "loss": 6.028, "step": 6900 }, { "epoch": 0.08, "grad_norm": 3.260039806365967, "learning_rate": 0.00019960031680517826, "loss": 6.0733, "step": 6950 }, { "epoch": 0.08, "grad_norm": 3.0739686489105225, "learning_rate": 0.0001995837731539711, "loss": 6.0521, "step": 7000 }, { "epoch": 0.08, "grad_norm": 3.0517771244049072, "learning_rate": 0.00019956689473759872, "loss": 6.0544, "step": 7050 }, { "epoch": 0.08, "grad_norm": 3.9524648189544678, "learning_rate": 0.0001995496816128003, "loss": 6.1326, "step": 7100 }, { "epoch": 0.08, "grad_norm": 4.498497486114502, "learning_rate": 0.00019953213383744033, "loss": 6.236, "step": 7150 }, { "epoch": 0.08, "grad_norm": 4.157576084136963, "learning_rate": 0.00019951425147050807, "loss": 5.9898, "step": 7200 }, { "epoch": 0.08, "grad_norm": 3.9297516345977783, "learning_rate": 0.00019949603457211775, "loss": 6.086, "step": 7250 }, { "epoch": 0.08, "grad_norm": 3.3214786052703857, "learning_rate": 0.00019947748320350804, "loss": 5.9589, "step": 7300 }, { "epoch": 0.08, "grad_norm": 2.8847291469573975, "learning_rate": 0.00019945859742704201, "loss": 6.1931, "step": 7350 }, { "epoch": 0.08, "grad_norm": 3.387896776199341, "learning_rate": 0.00019943937730620702, "loss": 6.0539, "step": 7400 }, { "epoch": 0.08, "grad_norm": 3.1214797496795654, "learning_rate": 0.00019941982290561417, "loss": 6.0288, "step": 7450 }, { "epoch": 0.08, "grad_norm": 3.7995123863220215, "learning_rate": 0.00019939993429099841, "loss": 6.0526, "step": 7500 }, { "epoch": 0.08, "grad_norm": 4.788393974304199, "learning_rate": 0.00019937971152921818, "loss": 5.9799, "step": 7550 }, { "epoch": 0.08, "grad_norm": 4.009220123291016, "learning_rate": 0.0001993591546882552, "loss": 6.1223, "step": 7600 }, { "epoch": 0.08, "grad_norm": 3.5576276779174805, "learning_rate": 0.00019933826383721428, "loss": 5.989, "step": 7650 }, { "epoch": 0.09, "grad_norm": 3.1287412643432617, "learning_rate": 0.00019931703904632294, "loss": 6.0542, "step": 7700 }, { "epoch": 0.09, "grad_norm": 3.6518595218658447, "learning_rate": 0.00019929548038693146, "loss": 6.041, "step": 7750 }, { "epoch": 0.09, "grad_norm": 3.268080472946167, "learning_rate": 0.0001992735879315123, "loss": 5.888, "step": 7800 }, { "epoch": 0.09, "grad_norm": 3.6055593490600586, "learning_rate": 0.00019925136175366007, "loss": 5.913, "step": 7850 }, { "epoch": 0.09, "grad_norm": 4.866463661193848, "learning_rate": 0.00019922880192809137, "loss": 5.9858, "step": 7900 }, { "epoch": 0.09, "grad_norm": 3.44808292388916, "learning_rate": 0.00019920590853064423, "loss": 5.7686, "step": 7950 }, { "epoch": 0.09, "grad_norm": 2.9507765769958496, "learning_rate": 0.00019918268163827808, "loss": 5.8557, "step": 8000 }, { "epoch": 0.09, "grad_norm": 3.441870927810669, "learning_rate": 0.00019915912132907352, "loss": 5.8268, "step": 8050 }, { "epoch": 0.09, "grad_norm": 3.838809013366699, "learning_rate": 0.00019913522768223182, "loss": 5.9833, "step": 8100 }, { "epoch": 0.09, "grad_norm": 4.165487289428711, "learning_rate": 0.00019911100077807498, "loss": 5.7422, "step": 8150 }, { "epoch": 0.09, "grad_norm": 3.5947463512420654, "learning_rate": 0.0001990864406980452, "loss": 5.7479, "step": 8200 }, { "epoch": 0.09, "grad_norm": 4.130446434020996, "learning_rate": 0.00019906154752470472, "loss": 5.7767, "step": 8250 }, { "epoch": 0.09, "grad_norm": 4.866550922393799, "learning_rate": 0.00019903632134173554, "loss": 5.7681, "step": 8300 }, { "epoch": 0.09, "grad_norm": 3.2839725017547607, "learning_rate": 0.00019901076223393903, "loss": 5.6656, "step": 8350 }, { "epoch": 0.09, "grad_norm": 3.0762476921081543, "learning_rate": 0.0001989848702872359, "loss": 5.789, "step": 8400 }, { "epoch": 0.09, "grad_norm": 3.7109107971191406, "learning_rate": 0.00019895864558866556, "loss": 5.773, "step": 8450 }, { "epoch": 0.09, "grad_norm": 5.400998115539551, "learning_rate": 0.00019893208822638618, "loss": 5.7506, "step": 8500 }, { "epoch": 0.09, "grad_norm": 3.3062849044799805, "learning_rate": 0.00019890519828967413, "loss": 5.7515, "step": 8550 }, { "epoch": 0.1, "grad_norm": 4.109920501708984, "learning_rate": 0.00019887797586892373, "loss": 5.7972, "step": 8600 }, { "epoch": 0.1, "grad_norm": 3.4838390350341797, "learning_rate": 0.00019885042105564717, "loss": 5.6753, "step": 8650 }, { "epoch": 0.1, "grad_norm": 4.251760959625244, "learning_rate": 0.00019882253394247381, "loss": 5.6303, "step": 8700 }, { "epoch": 0.1, "grad_norm": 4.042376518249512, "learning_rate": 0.00019879431462315025, "loss": 5.5753, "step": 8750 }, { "epoch": 0.1, "grad_norm": 4.239652633666992, "learning_rate": 0.0001987657631925398, "loss": 5.5335, "step": 8800 }, { "epoch": 0.1, "grad_norm": 5.15481424331665, "learning_rate": 0.00019873687974662215, "loss": 5.5396, "step": 8850 }, { "epoch": 0.1, "grad_norm": 4.36835241317749, "learning_rate": 0.00019870766438249317, "loss": 5.6017, "step": 8900 }, { "epoch": 0.1, "grad_norm": 4.165258407592773, "learning_rate": 0.00019867811719836452, "loss": 5.7228, "step": 8950 }, { "epoch": 0.1, "grad_norm": 4.125988006591797, "learning_rate": 0.0001986482382935633, "loss": 5.5787, "step": 9000 }, { "epoch": 0.1, "grad_norm": 4.177731037139893, "learning_rate": 0.0001986180277685317, "loss": 5.5829, "step": 9050 }, { "epoch": 0.1, "grad_norm": 5.006561279296875, "learning_rate": 0.00019858748572482683, "loss": 5.5466, "step": 9100 }, { "epoch": 0.1, "grad_norm": 4.33070182800293, "learning_rate": 0.00019855661226512007, "loss": 5.5544, "step": 9150 }, { "epoch": 0.1, "grad_norm": 4.358560085296631, "learning_rate": 0.00019852540749319708, "loss": 5.4599, "step": 9200 }, { "epoch": 0.1, "grad_norm": 4.536096096038818, "learning_rate": 0.00019849387151395708, "loss": 5.4983, "step": 9250 }, { "epoch": 0.1, "grad_norm": 4.66163444519043, "learning_rate": 0.0001984620044334129, "loss": 5.4097, "step": 9300 }, { "epoch": 0.1, "grad_norm": 4.4319233894348145, "learning_rate": 0.00019842980635869024, "loss": 5.4093, "step": 9350 }, { "epoch": 0.1, "grad_norm": 4.98419713973999, "learning_rate": 0.0001983972773980276, "loss": 5.4056, "step": 9400 }, { "epoch": 0.1, "grad_norm": 3.6354339122772217, "learning_rate": 0.0001983644176607757, "loss": 5.3171, "step": 9450 }, { "epoch": 0.11, "grad_norm": 4.495342254638672, "learning_rate": 0.00019833122725739736, "loss": 5.4521, "step": 9500 }, { "epoch": 0.11, "grad_norm": 4.5558671951293945, "learning_rate": 0.00019829770629946678, "loss": 5.5158, "step": 9550 }, { "epoch": 0.11, "grad_norm": 3.7165732383728027, "learning_rate": 0.00019826385489966957, "loss": 5.301, "step": 9600 }, { "epoch": 0.11, "grad_norm": 6.030915260314941, "learning_rate": 0.00019822967317180204, "loss": 5.3316, "step": 9650 }, { "epoch": 0.11, "grad_norm": 5.385923385620117, "learning_rate": 0.00019819516123077094, "loss": 5.3844, "step": 9700 }, { "epoch": 0.11, "grad_norm": 4.383516788482666, "learning_rate": 0.00019816101926755305, "loss": 5.2995, "step": 9750 }, { "epoch": 0.11, "grad_norm": 4.446406364440918, "learning_rate": 0.00019812585384780055, "loss": 5.386, "step": 9800 }, { "epoch": 0.11, "grad_norm": 4.345483303070068, "learning_rate": 0.00019809035856388805, "loss": 5.2815, "step": 9850 }, { "epoch": 0.11, "grad_norm": 4.791261672973633, "learning_rate": 0.00019805453353513813, "loss": 5.3757, "step": 9900 }, { "epoch": 0.11, "grad_norm": 5.622151851654053, "learning_rate": 0.00019801837888198172, "loss": 5.4405, "step": 9950 }, { "epoch": 0.11, "grad_norm": 4.934606075286865, "learning_rate": 0.0001979818947259579, "loss": 5.139, "step": 10000 }, { "epoch": 0.11, "grad_norm": 3.9659693241119385, "learning_rate": 0.0001979450811897134, "loss": 5.1726, "step": 10050 }, { "epoch": 0.11, "grad_norm": 5.214992046356201, "learning_rate": 0.00019790793839700226, "loss": 5.2864, "step": 10100 }, { "epoch": 0.11, "grad_norm": 4.5359601974487305, "learning_rate": 0.00019787046647268524, "loss": 5.1443, "step": 10150 }, { "epoch": 0.11, "grad_norm": 4.26462984085083, "learning_rate": 0.00019783266554272962, "loss": 5.0597, "step": 10200 }, { "epoch": 0.11, "grad_norm": 5.053945064544678, "learning_rate": 0.00019779453573420873, "loss": 5.2946, "step": 10250 }, { "epoch": 0.11, "grad_norm": 6.082211494445801, "learning_rate": 0.00019775607717530127, "loss": 5.2075, "step": 10300 }, { "epoch": 0.11, "grad_norm": 4.107390403747559, "learning_rate": 0.00019771728999529132, "loss": 5.1394, "step": 10350 }, { "epoch": 0.12, "grad_norm": 4.58411169052124, "learning_rate": 0.00019767817432456752, "loss": 5.1064, "step": 10400 }, { "epoch": 0.12, "grad_norm": 8.38965892791748, "learning_rate": 0.00019763952239228627, "loss": 5.0808, "step": 10450 }, { "epoch": 0.12, "grad_norm": 3.885803699493408, "learning_rate": 0.00019759975669894338, "loss": 5.0664, "step": 10500 }, { "epoch": 0.12, "grad_norm": 4.1605916023254395, "learning_rate": 0.00019755966290999167, "loss": 5.2469, "step": 10550 }, { "epoch": 0.12, "grad_norm": 4.821887016296387, "learning_rate": 0.00019751924116021225, "loss": 5.2451, "step": 10600 }, { "epoch": 0.12, "grad_norm": 3.865694761276245, "learning_rate": 0.00019747849158548858, "loss": 5.2334, "step": 10650 }, { "epoch": 0.12, "grad_norm": 3.640681028366089, "learning_rate": 0.00019743741432280625, "loss": 5.1206, "step": 10700 }, { "epoch": 0.12, "grad_norm": 4.04166316986084, "learning_rate": 0.00019739600951025236, "loss": 5.0059, "step": 10750 }, { "epoch": 0.12, "grad_norm": 4.637605667114258, "learning_rate": 0.00019735427728701516, "loss": 5.0302, "step": 10800 }, { "epoch": 0.12, "grad_norm": 4.08723783493042, "learning_rate": 0.0001973122177933835, "loss": 5.1551, "step": 10850 }, { "epoch": 0.12, "grad_norm": 3.7944953441619873, "learning_rate": 0.00019726983117074643, "loss": 5.0665, "step": 10900 }, { "epoch": 0.12, "grad_norm": 5.2847371101379395, "learning_rate": 0.00019722711756159266, "loss": 5.2212, "step": 10950 }, { "epoch": 0.12, "grad_norm": 4.109150409698486, "learning_rate": 0.00019718407710951012, "loss": 5.2645, "step": 11000 } ], "logging_steps": 50, "max_steps": 90183, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 3955495796736000.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }