diff --git "a/checkpoints/checkpoint-35000/trainer_state.json" "b/checkpoints/checkpoint-35000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/checkpoint-35000/trainer_state.json" @@ -0,0 +1,4921 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3880997527250147, + "eval_steps": 500, + "global_step": 35000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 3.2580924034118652, + "learning_rate": 2.2172949002217296e-06, + "loss": 10.2933, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 2.4347386360168457, + "learning_rate": 4.434589800443459e-06, + "loss": 10.1894, + "step": 100 + }, + { + "epoch": 0.0, + "grad_norm": 2.3895885944366455, + "learning_rate": 6.651884700665188e-06, + "loss": 10.1424, + "step": 150 + }, + { + "epoch": 0.0, + "grad_norm": 2.129647731781006, + "learning_rate": 8.869179600886918e-06, + "loss": 10.0995, + "step": 200 + }, + { + "epoch": 0.0, + "grad_norm": 2.3564186096191406, + "learning_rate": 1.1086474501108649e-05, + "loss": 10.0479, + "step": 250 + }, + { + "epoch": 0.0, + "grad_norm": 1.830551028251648, + "learning_rate": 1.3303769401330377e-05, + "loss": 9.9971, + "step": 300 + }, + { + "epoch": 0.0, + "grad_norm": 2.1173911094665527, + "learning_rate": 1.5521064301552106e-05, + "loss": 9.9201, + "step": 350 + }, + { + "epoch": 0.0, + "grad_norm": 1.6636557579040527, + "learning_rate": 1.7738359201773837e-05, + "loss": 9.8562, + "step": 400 + }, + { + "epoch": 0.0, + "grad_norm": 2.4503839015960693, + "learning_rate": 1.9955654101995567e-05, + "loss": 9.7599, + "step": 450 + }, + { + "epoch": 0.01, + "grad_norm": 1.822424054145813, + "learning_rate": 2.2172949002217298e-05, + "loss": 9.6608, + "step": 500 + }, + { + "epoch": 0.01, + "grad_norm": 1.6598998308181763, + "learning_rate": 2.4390243902439026e-05, + "loss": 9.55, + "step": 550 + }, + { + "epoch": 0.01, + "grad_norm": 1.8471707105636597, + "learning_rate": 2.6607538802660753e-05, + "loss": 9.4606, + "step": 600 + }, + { + "epoch": 0.01, + "grad_norm": 1.4833533763885498, + "learning_rate": 2.8824833702882487e-05, + "loss": 9.3283, + "step": 650 + }, + { + "epoch": 0.01, + "grad_norm": 1.688541054725647, + "learning_rate": 3.104212860310421e-05, + "loss": 9.2229, + "step": 700 + }, + { + "epoch": 0.01, + "grad_norm": 1.6466543674468994, + "learning_rate": 3.325942350332594e-05, + "loss": 9.1093, + "step": 750 + }, + { + "epoch": 0.01, + "grad_norm": 1.4169293642044067, + "learning_rate": 3.547671840354767e-05, + "loss": 8.9703, + "step": 800 + }, + { + "epoch": 0.01, + "grad_norm": 1.7079193592071533, + "learning_rate": 3.7694013303769404e-05, + "loss": 8.8351, + "step": 850 + }, + { + "epoch": 0.01, + "grad_norm": 1.5513204336166382, + "learning_rate": 3.9911308203991135e-05, + "loss": 8.7111, + "step": 900 + }, + { + "epoch": 0.01, + "grad_norm": 1.485573172569275, + "learning_rate": 4.212860310421286e-05, + "loss": 8.5627, + "step": 950 + }, + { + "epoch": 0.01, + "grad_norm": 1.511690616607666, + "learning_rate": 4.4345898004434597e-05, + "loss": 8.5042, + "step": 1000 + }, + { + "epoch": 0.01, + "grad_norm": 2.1478614807128906, + "learning_rate": 4.656319290465632e-05, + "loss": 8.3287, + "step": 1050 + }, + { + "epoch": 0.01, + "grad_norm": 1.4060652256011963, + "learning_rate": 4.878048780487805e-05, + "loss": 8.2341, + "step": 1100 + }, + { + "epoch": 0.01, + "grad_norm": 1.3950035572052002, + "learning_rate": 5.099778270509978e-05, + "loss": 8.1277, + "step": 1150 + }, + { + "epoch": 0.01, + "grad_norm": 1.5197688341140747, + "learning_rate": 5.3215077605321506e-05, + "loss": 8.0311, + "step": 1200 + }, + { + "epoch": 0.01, + "grad_norm": 1.3406693935394287, + "learning_rate": 5.543237250554324e-05, + "loss": 7.9824, + "step": 1250 + }, + { + "epoch": 0.01, + "grad_norm": 1.4520119428634644, + "learning_rate": 5.7649667405764975e-05, + "loss": 7.9948, + "step": 1300 + }, + { + "epoch": 0.01, + "grad_norm": 1.179124116897583, + "learning_rate": 5.98669623059867e-05, + "loss": 7.9144, + "step": 1350 + }, + { + "epoch": 0.02, + "grad_norm": 1.4039533138275146, + "learning_rate": 6.208425720620842e-05, + "loss": 7.8768, + "step": 1400 + }, + { + "epoch": 0.02, + "grad_norm": 1.5542700290679932, + "learning_rate": 6.430155210643016e-05, + "loss": 7.894, + "step": 1450 + }, + { + "epoch": 0.02, + "grad_norm": 1.4150550365447998, + "learning_rate": 6.651884700665188e-05, + "loss": 7.8409, + "step": 1500 + }, + { + "epoch": 0.02, + "grad_norm": 1.6647827625274658, + "learning_rate": 6.873614190687362e-05, + "loss": 7.91, + "step": 1550 + }, + { + "epoch": 0.02, + "grad_norm": 1.7795697450637817, + "learning_rate": 7.095343680709535e-05, + "loss": 7.8256, + "step": 1600 + }, + { + "epoch": 0.02, + "grad_norm": 1.933110237121582, + "learning_rate": 7.317073170731707e-05, + "loss": 7.8463, + "step": 1650 + }, + { + "epoch": 0.02, + "grad_norm": 1.1942570209503174, + "learning_rate": 7.538802660753881e-05, + "loss": 7.7827, + "step": 1700 + }, + { + "epoch": 0.02, + "grad_norm": 1.6759297847747803, + "learning_rate": 7.760532150776053e-05, + "loss": 7.8, + "step": 1750 + }, + { + "epoch": 0.02, + "grad_norm": 1.093256950378418, + "learning_rate": 7.982261640798227e-05, + "loss": 7.7461, + "step": 1800 + }, + { + "epoch": 0.02, + "grad_norm": 1.567872166633606, + "learning_rate": 8.2039911308204e-05, + "loss": 7.7338, + "step": 1850 + }, + { + "epoch": 0.02, + "grad_norm": 1.3017679452896118, + "learning_rate": 8.425720620842572e-05, + "loss": 7.804, + "step": 1900 + }, + { + "epoch": 0.02, + "grad_norm": 1.7510960102081299, + "learning_rate": 8.647450110864746e-05, + "loss": 7.7405, + "step": 1950 + }, + { + "epoch": 0.02, + "grad_norm": 1.7215120792388916, + "learning_rate": 8.869179600886919e-05, + "loss": 7.7429, + "step": 2000 + }, + { + "epoch": 0.02, + "grad_norm": 1.6202715635299683, + "learning_rate": 9.090909090909092e-05, + "loss": 7.6588, + "step": 2050 + }, + { + "epoch": 0.02, + "grad_norm": 1.5680756568908691, + "learning_rate": 9.312638580931264e-05, + "loss": 7.6224, + "step": 2100 + }, + { + "epoch": 0.02, + "grad_norm": 1.462240219116211, + "learning_rate": 9.534368070953438e-05, + "loss": 7.6851, + "step": 2150 + }, + { + "epoch": 0.02, + "grad_norm": 2.2018320560455322, + "learning_rate": 9.75609756097561e-05, + "loss": 7.6443, + "step": 2200 + }, + { + "epoch": 0.02, + "grad_norm": 1.9520208835601807, + "learning_rate": 9.977827050997783e-05, + "loss": 7.6456, + "step": 2250 + }, + { + "epoch": 0.03, + "grad_norm": 1.115421175956726, + "learning_rate": 0.00010199556541019956, + "loss": 7.5894, + "step": 2300 + }, + { + "epoch": 0.03, + "grad_norm": 1.6002250909805298, + "learning_rate": 0.0001042128603104213, + "loss": 7.6017, + "step": 2350 + }, + { + "epoch": 0.03, + "grad_norm": 1.6516796350479126, + "learning_rate": 0.00010643015521064301, + "loss": 7.4548, + "step": 2400 + }, + { + "epoch": 0.03, + "grad_norm": 2.2168257236480713, + "learning_rate": 0.00010864745011086475, + "loss": 7.5867, + "step": 2450 + }, + { + "epoch": 0.03, + "grad_norm": 1.5447593927383423, + "learning_rate": 0.00011086474501108647, + "loss": 7.5317, + "step": 2500 + }, + { + "epoch": 0.03, + "grad_norm": 1.6840906143188477, + "learning_rate": 0.00011308203991130821, + "loss": 7.5127, + "step": 2550 + }, + { + "epoch": 0.03, + "grad_norm": 1.2965503931045532, + "learning_rate": 0.00011529933481152995, + "loss": 7.4911, + "step": 2600 + }, + { + "epoch": 0.03, + "grad_norm": 1.643584966659546, + "learning_rate": 0.00011751662971175166, + "loss": 7.4416, + "step": 2650 + }, + { + "epoch": 0.03, + "grad_norm": 1.5419111251831055, + "learning_rate": 0.0001197339246119734, + "loss": 7.4944, + "step": 2700 + }, + { + "epoch": 0.03, + "grad_norm": 1.7774205207824707, + "learning_rate": 0.00012195121951219512, + "loss": 7.4244, + "step": 2750 + }, + { + "epoch": 0.03, + "grad_norm": 2.1709322929382324, + "learning_rate": 0.00012416851441241685, + "loss": 7.371, + "step": 2800 + }, + { + "epoch": 0.03, + "grad_norm": 1.5503411293029785, + "learning_rate": 0.0001263858093126386, + "loss": 7.3031, + "step": 2850 + }, + { + "epoch": 0.03, + "grad_norm": 1.7744035720825195, + "learning_rate": 0.00012860310421286032, + "loss": 7.3338, + "step": 2900 + }, + { + "epoch": 0.03, + "grad_norm": 2.2014000415802, + "learning_rate": 0.00013082039911308205, + "loss": 7.2962, + "step": 2950 + }, + { + "epoch": 0.03, + "grad_norm": 1.6716220378875732, + "learning_rate": 0.00013303769401330377, + "loss": 7.3348, + "step": 3000 + }, + { + "epoch": 0.03, + "grad_norm": 1.7045074701309204, + "learning_rate": 0.0001352549889135255, + "loss": 7.2864, + "step": 3050 + }, + { + "epoch": 0.03, + "grad_norm": 1.8933771848678589, + "learning_rate": 0.00013747228381374724, + "loss": 7.2744, + "step": 3100 + }, + { + "epoch": 0.03, + "grad_norm": 2.298779249191284, + "learning_rate": 0.00013968957871396897, + "loss": 7.2472, + "step": 3150 + }, + { + "epoch": 0.04, + "grad_norm": 1.3420922756195068, + "learning_rate": 0.0001419068736141907, + "loss": 7.3019, + "step": 3200 + }, + { + "epoch": 0.04, + "grad_norm": 1.9339039325714111, + "learning_rate": 0.00014412416851441242, + "loss": 7.2982, + "step": 3250 + }, + { + "epoch": 0.04, + "grad_norm": 2.69667387008667, + "learning_rate": 0.00014634146341463414, + "loss": 7.2851, + "step": 3300 + }, + { + "epoch": 0.04, + "grad_norm": 2.3124189376831055, + "learning_rate": 0.0001485587583148559, + "loss": 7.258, + "step": 3350 + }, + { + "epoch": 0.04, + "grad_norm": 1.975651741027832, + "learning_rate": 0.00015077605321507762, + "loss": 7.1275, + "step": 3400 + }, + { + "epoch": 0.04, + "grad_norm": 1.9704022407531738, + "learning_rate": 0.00015299334811529934, + "loss": 7.1473, + "step": 3450 + }, + { + "epoch": 0.04, + "grad_norm": 2.5047757625579834, + "learning_rate": 0.00015521064301552106, + "loss": 7.1096, + "step": 3500 + }, + { + "epoch": 0.04, + "grad_norm": 1.5465894937515259, + "learning_rate": 0.0001574279379157428, + "loss": 7.1501, + "step": 3550 + }, + { + "epoch": 0.04, + "grad_norm": 1.9557933807373047, + "learning_rate": 0.00015964523281596454, + "loss": 7.2033, + "step": 3600 + }, + { + "epoch": 0.04, + "grad_norm": 2.420116424560547, + "learning_rate": 0.00016186252771618626, + "loss": 7.1275, + "step": 3650 + }, + { + "epoch": 0.04, + "grad_norm": 2.114737033843994, + "learning_rate": 0.000164079822616408, + "loss": 7.0932, + "step": 3700 + }, + { + "epoch": 0.04, + "grad_norm": 2.3085389137268066, + "learning_rate": 0.00016629711751662974, + "loss": 7.0311, + "step": 3750 + }, + { + "epoch": 0.04, + "grad_norm": 2.5679140090942383, + "learning_rate": 0.00016851441241685144, + "loss": 6.9168, + "step": 3800 + }, + { + "epoch": 0.04, + "grad_norm": 1.8611838817596436, + "learning_rate": 0.0001707317073170732, + "loss": 7.0085, + "step": 3850 + }, + { + "epoch": 0.04, + "grad_norm": 1.8603994846343994, + "learning_rate": 0.0001729490022172949, + "loss": 6.9432, + "step": 3900 + }, + { + "epoch": 0.04, + "grad_norm": 2.4244627952575684, + "learning_rate": 0.00017516629711751663, + "loss": 6.9333, + "step": 3950 + }, + { + "epoch": 0.04, + "grad_norm": 2.177870750427246, + "learning_rate": 0.00017738359201773839, + "loss": 6.9499, + "step": 4000 + }, + { + "epoch": 0.04, + "grad_norm": 1.9320554733276367, + "learning_rate": 0.00017960088691796008, + "loss": 6.8204, + "step": 4050 + }, + { + "epoch": 0.05, + "grad_norm": 1.5062849521636963, + "learning_rate": 0.00018181818181818183, + "loss": 6.9505, + "step": 4100 + }, + { + "epoch": 0.05, + "grad_norm": 2.9272422790527344, + "learning_rate": 0.00018403547671840356, + "loss": 6.8701, + "step": 4150 + }, + { + "epoch": 0.05, + "grad_norm": 2.0309596061706543, + "learning_rate": 0.00018625277161862528, + "loss": 6.924, + "step": 4200 + }, + { + "epoch": 0.05, + "grad_norm": 2.0265886783599854, + "learning_rate": 0.00018847006651884703, + "loss": 6.9223, + "step": 4250 + }, + { + "epoch": 0.05, + "grad_norm": 2.5160486698150635, + "learning_rate": 0.00019068736141906876, + "loss": 6.8708, + "step": 4300 + }, + { + "epoch": 0.05, + "grad_norm": 2.613301992416382, + "learning_rate": 0.00019290465631929045, + "loss": 6.8937, + "step": 4350 + }, + { + "epoch": 0.05, + "grad_norm": 2.3031229972839355, + "learning_rate": 0.0001951219512195122, + "loss": 6.8337, + "step": 4400 + }, + { + "epoch": 0.05, + "grad_norm": 2.54779052734375, + "learning_rate": 0.00019733924611973393, + "loss": 6.8334, + "step": 4450 + }, + { + "epoch": 0.05, + "grad_norm": 2.8277971744537354, + "learning_rate": 0.00019955654101995565, + "loss": 6.7925, + "step": 4500 + }, + { + "epoch": 0.05, + "grad_norm": 2.0113885402679443, + "learning_rate": 0.00019999989242739025, + "loss": 6.8458, + "step": 4550 + }, + { + "epoch": 0.05, + "grad_norm": 2.2395377159118652, + "learning_rate": 0.00019999945541405976, + "loss": 6.6251, + "step": 4600 + }, + { + "epoch": 0.05, + "grad_norm": 2.445993423461914, + "learning_rate": 0.0001999986822381884, + "loss": 6.8099, + "step": 4650 + }, + { + "epoch": 0.05, + "grad_norm": 4.077752590179443, + "learning_rate": 0.0001999975729023753, + "loss": 6.8053, + "step": 4700 + }, + { + "epoch": 0.05, + "grad_norm": 3.167569875717163, + "learning_rate": 0.00019999612741034963, + "loss": 6.7706, + "step": 4750 + }, + { + "epoch": 0.05, + "grad_norm": 1.893659234046936, + "learning_rate": 0.00019999434576697066, + "loss": 6.8245, + "step": 4800 + }, + { + "epoch": 0.05, + "grad_norm": 3.6101326942443848, + "learning_rate": 0.00019999222797822762, + "loss": 6.7407, + "step": 4850 + }, + { + "epoch": 0.05, + "grad_norm": 2.2858726978302, + "learning_rate": 0.00019998977405123974, + "loss": 6.74, + "step": 4900 + }, + { + "epoch": 0.05, + "grad_norm": 1.9325459003448486, + "learning_rate": 0.0001999869839942563, + "loss": 6.716, + "step": 4950 + }, + { + "epoch": 0.06, + "grad_norm": 2.0043437480926514, + "learning_rate": 0.00019998385781665643, + "loss": 6.6003, + "step": 5000 + }, + { + "epoch": 0.06, + "grad_norm": 4.151523113250732, + "learning_rate": 0.00019998039552894924, + "loss": 6.6801, + "step": 5050 + }, + { + "epoch": 0.06, + "grad_norm": 3.8407771587371826, + "learning_rate": 0.00019997659714277372, + "loss": 6.608, + "step": 5100 + }, + { + "epoch": 0.06, + "grad_norm": 2.230713129043579, + "learning_rate": 0.00019997246267089867, + "loss": 6.6479, + "step": 5150 + }, + { + "epoch": 0.06, + "grad_norm": 2.2546942234039307, + "learning_rate": 0.0001999679921272227, + "loss": 6.6548, + "step": 5200 + }, + { + "epoch": 0.06, + "grad_norm": 3.180986166000366, + "learning_rate": 0.00019996318552677425, + "loss": 6.6851, + "step": 5250 + }, + { + "epoch": 0.06, + "grad_norm": 2.341231346130371, + "learning_rate": 0.00019995804288571134, + "loss": 6.547, + "step": 5300 + }, + { + "epoch": 0.06, + "grad_norm": 3.1117124557495117, + "learning_rate": 0.00019995256422132172, + "loss": 6.7072, + "step": 5350 + }, + { + "epoch": 0.06, + "grad_norm": 2.0082530975341797, + "learning_rate": 0.0001999467495520227, + "loss": 6.5422, + "step": 5400 + }, + { + "epoch": 0.06, + "grad_norm": 2.409489870071411, + "learning_rate": 0.0001999405988973611, + "loss": 6.3716, + "step": 5450 + }, + { + "epoch": 0.06, + "grad_norm": 2.649052381515503, + "learning_rate": 0.00019993411227801328, + "loss": 6.6434, + "step": 5500 + }, + { + "epoch": 0.06, + "grad_norm": 3.081116199493408, + "learning_rate": 0.00019992728971578492, + "loss": 6.4624, + "step": 5550 + }, + { + "epoch": 0.06, + "grad_norm": 3.1578280925750732, + "learning_rate": 0.00019992013123361102, + "loss": 6.5416, + "step": 5600 + }, + { + "epoch": 0.06, + "grad_norm": 3.7874557971954346, + "learning_rate": 0.0001999126368555559, + "loss": 6.4512, + "step": 5650 + }, + { + "epoch": 0.06, + "grad_norm": 2.7693099975585938, + "learning_rate": 0.00019990480660681293, + "loss": 6.5105, + "step": 5700 + }, + { + "epoch": 0.06, + "grad_norm": 2.4338185787200928, + "learning_rate": 0.00019989680712666593, + "loss": 6.5092, + "step": 5750 + }, + { + "epoch": 0.06, + "grad_norm": 3.656937837600708, + "learning_rate": 0.00019988831193270577, + "loss": 6.4269, + "step": 5800 + }, + { + "epoch": 0.06, + "grad_norm": 2.857292652130127, + "learning_rate": 0.00019987948094982952, + "loss": 6.4387, + "step": 5850 + }, + { + "epoch": 0.07, + "grad_norm": 3.4963467121124268, + "learning_rate": 0.00019987031420772385, + "loss": 6.3851, + "step": 5900 + }, + { + "epoch": 0.07, + "grad_norm": 2.602522611618042, + "learning_rate": 0.00019986081173720396, + "loss": 6.3413, + "step": 5950 + }, + { + "epoch": 0.07, + "grad_norm": 2.6455273628234863, + "learning_rate": 0.00019985097357021385, + "loss": 6.2965, + "step": 6000 + }, + { + "epoch": 0.07, + "grad_norm": 3.5592167377471924, + "learning_rate": 0.0001998407997398259, + "loss": 6.4293, + "step": 6050 + }, + { + "epoch": 0.07, + "grad_norm": 3.6016533374786377, + "learning_rate": 0.00019983029028024094, + "loss": 6.2897, + "step": 6100 + }, + { + "epoch": 0.07, + "grad_norm": 2.5536839962005615, + "learning_rate": 0.000199819445226788, + "loss": 6.3157, + "step": 6150 + }, + { + "epoch": 0.07, + "grad_norm": 2.0514349937438965, + "learning_rate": 0.00019980826461592427, + "loss": 6.3847, + "step": 6200 + }, + { + "epoch": 0.07, + "grad_norm": 2.72495174407959, + "learning_rate": 0.00019979674848523505, + "loss": 6.3517, + "step": 6250 + }, + { + "epoch": 0.07, + "grad_norm": 2.4264872074127197, + "learning_rate": 0.00019978489687343335, + "loss": 6.2533, + "step": 6300 + }, + { + "epoch": 0.07, + "grad_norm": 2.8361423015594482, + "learning_rate": 0.0001997727098203602, + "loss": 6.3654, + "step": 6350 + }, + { + "epoch": 0.07, + "grad_norm": 2.9690892696380615, + "learning_rate": 0.00019976018736698404, + "loss": 6.3968, + "step": 6400 + }, + { + "epoch": 0.07, + "grad_norm": 2.6132867336273193, + "learning_rate": 0.0001997473295554009, + "loss": 6.3444, + "step": 6450 + }, + { + "epoch": 0.07, + "grad_norm": 4.820697784423828, + "learning_rate": 0.00019973413642883424, + "loss": 6.2019, + "step": 6500 + }, + { + "epoch": 0.07, + "grad_norm": 2.2316782474517822, + "learning_rate": 0.00019972060803163458, + "loss": 6.2049, + "step": 6550 + }, + { + "epoch": 0.07, + "grad_norm": 3.9528305530548096, + "learning_rate": 0.00019970674440927957, + "loss": 6.1718, + "step": 6600 + }, + { + "epoch": 0.07, + "grad_norm": 1.891073226928711, + "learning_rate": 0.0001996925456083738, + "loss": 6.2393, + "step": 6650 + }, + { + "epoch": 0.07, + "grad_norm": 2.813270092010498, + "learning_rate": 0.00019967801167664853, + "loss": 6.2116, + "step": 6700 + }, + { + "epoch": 0.07, + "grad_norm": 2.2726826667785645, + "learning_rate": 0.00019966314266296173, + "loss": 6.1521, + "step": 6750 + }, + { + "epoch": 0.08, + "grad_norm": 2.3895318508148193, + "learning_rate": 0.00019964793861729772, + "loss": 6.1072, + "step": 6800 + }, + { + "epoch": 0.08, + "grad_norm": 3.190431833267212, + "learning_rate": 0.000199632399590767, + "loss": 6.2009, + "step": 6850 + }, + { + "epoch": 0.08, + "grad_norm": 3.79266095161438, + "learning_rate": 0.00019961652563560634, + "loss": 6.028, + "step": 6900 + }, + { + "epoch": 0.08, + "grad_norm": 3.260039806365967, + "learning_rate": 0.00019960031680517826, + "loss": 6.0733, + "step": 6950 + }, + { + "epoch": 0.08, + "grad_norm": 3.0739686489105225, + "learning_rate": 0.0001995837731539711, + "loss": 6.0521, + "step": 7000 + }, + { + "epoch": 0.08, + "grad_norm": 3.0517771244049072, + "learning_rate": 0.00019956689473759872, + "loss": 6.0544, + "step": 7050 + }, + { + "epoch": 0.08, + "grad_norm": 3.9524648189544678, + "learning_rate": 0.0001995496816128003, + "loss": 6.1326, + "step": 7100 + }, + { + "epoch": 0.08, + "grad_norm": 4.498497486114502, + "learning_rate": 0.00019953213383744033, + "loss": 6.236, + "step": 7150 + }, + { + "epoch": 0.08, + "grad_norm": 4.157576084136963, + "learning_rate": 0.00019951425147050807, + "loss": 5.9898, + "step": 7200 + }, + { + "epoch": 0.08, + "grad_norm": 3.9297516345977783, + "learning_rate": 0.00019949603457211775, + "loss": 6.086, + "step": 7250 + }, + { + "epoch": 0.08, + "grad_norm": 3.3214786052703857, + "learning_rate": 0.00019947748320350804, + "loss": 5.9589, + "step": 7300 + }, + { + "epoch": 0.08, + "grad_norm": 2.8847291469573975, + "learning_rate": 0.00019945859742704201, + "loss": 6.1931, + "step": 7350 + }, + { + "epoch": 0.08, + "grad_norm": 3.387896776199341, + "learning_rate": 0.00019943937730620702, + "loss": 6.0539, + "step": 7400 + }, + { + "epoch": 0.08, + "grad_norm": 3.1214797496795654, + "learning_rate": 0.00019941982290561417, + "loss": 6.0288, + "step": 7450 + }, + { + "epoch": 0.08, + "grad_norm": 3.7995123863220215, + "learning_rate": 0.00019939993429099841, + "loss": 6.0526, + "step": 7500 + }, + { + "epoch": 0.08, + "grad_norm": 4.788393974304199, + "learning_rate": 0.00019937971152921818, + "loss": 5.9799, + "step": 7550 + }, + { + "epoch": 0.08, + "grad_norm": 4.009220123291016, + "learning_rate": 0.0001993591546882552, + "loss": 6.1223, + "step": 7600 + }, + { + "epoch": 0.08, + "grad_norm": 3.5576276779174805, + "learning_rate": 0.00019933826383721428, + "loss": 5.989, + "step": 7650 + }, + { + "epoch": 0.09, + "grad_norm": 3.1287412643432617, + "learning_rate": 0.00019931703904632294, + "loss": 6.0542, + "step": 7700 + }, + { + "epoch": 0.09, + "grad_norm": 3.6518595218658447, + "learning_rate": 0.00019929548038693146, + "loss": 6.041, + "step": 7750 + }, + { + "epoch": 0.09, + "grad_norm": 3.268080472946167, + "learning_rate": 0.0001992735879315123, + "loss": 5.888, + "step": 7800 + }, + { + "epoch": 0.09, + "grad_norm": 3.6055593490600586, + "learning_rate": 0.00019925136175366007, + "loss": 5.913, + "step": 7850 + }, + { + "epoch": 0.09, + "grad_norm": 4.866463661193848, + "learning_rate": 0.00019922880192809137, + "loss": 5.9858, + "step": 7900 + }, + { + "epoch": 0.09, + "grad_norm": 3.44808292388916, + "learning_rate": 0.00019920590853064423, + "loss": 5.7686, + "step": 7950 + }, + { + "epoch": 0.09, + "grad_norm": 2.9507765769958496, + "learning_rate": 0.00019918268163827808, + "loss": 5.8557, + "step": 8000 + }, + { + "epoch": 0.09, + "grad_norm": 3.441870927810669, + "learning_rate": 0.00019915912132907352, + "loss": 5.8268, + "step": 8050 + }, + { + "epoch": 0.09, + "grad_norm": 3.838809013366699, + "learning_rate": 0.00019913522768223182, + "loss": 5.9833, + "step": 8100 + }, + { + "epoch": 0.09, + "grad_norm": 4.165487289428711, + "learning_rate": 0.00019911100077807498, + "loss": 5.7422, + "step": 8150 + }, + { + "epoch": 0.09, + "grad_norm": 3.5947463512420654, + "learning_rate": 0.0001990864406980452, + "loss": 5.7479, + "step": 8200 + }, + { + "epoch": 0.09, + "grad_norm": 4.130446434020996, + "learning_rate": 0.00019906154752470472, + "loss": 5.7767, + "step": 8250 + }, + { + "epoch": 0.09, + "grad_norm": 4.866550922393799, + "learning_rate": 0.00019903632134173554, + "loss": 5.7681, + "step": 8300 + }, + { + "epoch": 0.09, + "grad_norm": 3.2839725017547607, + "learning_rate": 0.00019901076223393903, + "loss": 5.6656, + "step": 8350 + }, + { + "epoch": 0.09, + "grad_norm": 3.0762476921081543, + "learning_rate": 0.0001989848702872359, + "loss": 5.789, + "step": 8400 + }, + { + "epoch": 0.09, + "grad_norm": 3.7109107971191406, + "learning_rate": 0.00019895864558866556, + "loss": 5.773, + "step": 8450 + }, + { + "epoch": 0.09, + "grad_norm": 5.400998115539551, + "learning_rate": 0.00019893208822638618, + "loss": 5.7506, + "step": 8500 + }, + { + "epoch": 0.09, + "grad_norm": 3.3062849044799805, + "learning_rate": 0.00019890519828967413, + "loss": 5.7515, + "step": 8550 + }, + { + "epoch": 0.1, + "grad_norm": 4.109920501708984, + "learning_rate": 0.00019887797586892373, + "loss": 5.7972, + "step": 8600 + }, + { + "epoch": 0.1, + "grad_norm": 3.4838390350341797, + "learning_rate": 0.00019885042105564717, + "loss": 5.6753, + "step": 8650 + }, + { + "epoch": 0.1, + "grad_norm": 4.251760959625244, + "learning_rate": 0.00019882253394247381, + "loss": 5.6303, + "step": 8700 + }, + { + "epoch": 0.1, + "grad_norm": 4.042376518249512, + "learning_rate": 0.00019879431462315025, + "loss": 5.5753, + "step": 8750 + }, + { + "epoch": 0.1, + "grad_norm": 4.239652633666992, + "learning_rate": 0.0001987657631925398, + "loss": 5.5335, + "step": 8800 + }, + { + "epoch": 0.1, + "grad_norm": 5.15481424331665, + "learning_rate": 0.00019873687974662215, + "loss": 5.5396, + "step": 8850 + }, + { + "epoch": 0.1, + "grad_norm": 4.36835241317749, + "learning_rate": 0.00019870766438249317, + "loss": 5.6017, + "step": 8900 + }, + { + "epoch": 0.1, + "grad_norm": 4.165258407592773, + "learning_rate": 0.00019867811719836452, + "loss": 5.7228, + "step": 8950 + }, + { + "epoch": 0.1, + "grad_norm": 4.125988006591797, + "learning_rate": 0.0001986482382935633, + "loss": 5.5787, + "step": 9000 + }, + { + "epoch": 0.1, + "grad_norm": 4.177731037139893, + "learning_rate": 0.0001986180277685317, + "loss": 5.5829, + "step": 9050 + }, + { + "epoch": 0.1, + "grad_norm": 5.006561279296875, + "learning_rate": 0.00019858748572482683, + "loss": 5.5466, + "step": 9100 + }, + { + "epoch": 0.1, + "grad_norm": 4.33070182800293, + "learning_rate": 0.00019855661226512007, + "loss": 5.5544, + "step": 9150 + }, + { + "epoch": 0.1, + "grad_norm": 4.358560085296631, + "learning_rate": 0.00019852540749319708, + "loss": 5.4599, + "step": 9200 + }, + { + "epoch": 0.1, + "grad_norm": 4.536096096038818, + "learning_rate": 0.00019849387151395708, + "loss": 5.4983, + "step": 9250 + }, + { + "epoch": 0.1, + "grad_norm": 4.66163444519043, + "learning_rate": 0.0001984620044334129, + "loss": 5.4097, + "step": 9300 + }, + { + "epoch": 0.1, + "grad_norm": 4.4319233894348145, + "learning_rate": 0.00019842980635869024, + "loss": 5.4093, + "step": 9350 + }, + { + "epoch": 0.1, + "grad_norm": 4.98419713973999, + "learning_rate": 0.0001983972773980276, + "loss": 5.4056, + "step": 9400 + }, + { + "epoch": 0.1, + "grad_norm": 3.6354339122772217, + "learning_rate": 0.0001983644176607757, + "loss": 5.3171, + "step": 9450 + }, + { + "epoch": 0.11, + "grad_norm": 4.495342254638672, + "learning_rate": 0.00019833122725739736, + "loss": 5.4521, + "step": 9500 + }, + { + "epoch": 0.11, + "grad_norm": 4.5558671951293945, + "learning_rate": 0.00019829770629946678, + "loss": 5.5158, + "step": 9550 + }, + { + "epoch": 0.11, + "grad_norm": 3.7165732383728027, + "learning_rate": 0.00019826385489966957, + "loss": 5.301, + "step": 9600 + }, + { + "epoch": 0.11, + "grad_norm": 6.030915260314941, + "learning_rate": 0.00019822967317180204, + "loss": 5.3316, + "step": 9650 + }, + { + "epoch": 0.11, + "grad_norm": 5.385923385620117, + "learning_rate": 0.00019819516123077094, + "loss": 5.3844, + "step": 9700 + }, + { + "epoch": 0.11, + "grad_norm": 4.383516788482666, + "learning_rate": 0.00019816101926755305, + "loss": 5.2995, + "step": 9750 + }, + { + "epoch": 0.11, + "grad_norm": 4.446406364440918, + "learning_rate": 0.00019812585384780055, + "loss": 5.386, + "step": 9800 + }, + { + "epoch": 0.11, + "grad_norm": 4.345483303070068, + "learning_rate": 0.00019809035856388805, + "loss": 5.2815, + "step": 9850 + }, + { + "epoch": 0.11, + "grad_norm": 4.791261672973633, + "learning_rate": 0.00019805453353513813, + "loss": 5.3757, + "step": 9900 + }, + { + "epoch": 0.11, + "grad_norm": 5.622151851654053, + "learning_rate": 0.00019801837888198172, + "loss": 5.4405, + "step": 9950 + }, + { + "epoch": 0.11, + "grad_norm": 4.934606075286865, + "learning_rate": 0.0001979818947259579, + "loss": 5.139, + "step": 10000 + }, + { + "epoch": 0.11, + "grad_norm": 3.9659693241119385, + "learning_rate": 0.0001979450811897134, + "loss": 5.1726, + "step": 10050 + }, + { + "epoch": 0.11, + "grad_norm": 5.214992046356201, + "learning_rate": 0.00019790793839700226, + "loss": 5.2864, + "step": 10100 + }, + { + "epoch": 0.11, + "grad_norm": 4.5359601974487305, + "learning_rate": 0.00019787046647268524, + "loss": 5.1443, + "step": 10150 + }, + { + "epoch": 0.11, + "grad_norm": 4.26462984085083, + "learning_rate": 0.00019783266554272962, + "loss": 5.0597, + "step": 10200 + }, + { + "epoch": 0.11, + "grad_norm": 5.053945064544678, + "learning_rate": 0.00019779453573420873, + "loss": 5.2946, + "step": 10250 + }, + { + "epoch": 0.11, + "grad_norm": 6.082211494445801, + "learning_rate": 0.00019775607717530127, + "loss": 5.2075, + "step": 10300 + }, + { + "epoch": 0.11, + "grad_norm": 4.107390403747559, + "learning_rate": 0.00019771728999529132, + "loss": 5.1394, + "step": 10350 + }, + { + "epoch": 0.12, + "grad_norm": 4.58411169052124, + "learning_rate": 0.00019767817432456752, + "loss": 5.1064, + "step": 10400 + }, + { + "epoch": 0.12, + "grad_norm": 8.38965892791748, + "learning_rate": 0.00019763952239228627, + "loss": 5.0808, + "step": 10450 + }, + { + "epoch": 0.12, + "grad_norm": 3.885803699493408, + "learning_rate": 0.00019759975669894338, + "loss": 5.0664, + "step": 10500 + }, + { + "epoch": 0.12, + "grad_norm": 4.1605916023254395, + "learning_rate": 0.00019755966290999167, + "loss": 5.2469, + "step": 10550 + }, + { + "epoch": 0.12, + "grad_norm": 4.821887016296387, + "learning_rate": 0.00019751924116021225, + "loss": 5.2451, + "step": 10600 + }, + { + "epoch": 0.12, + "grad_norm": 3.865694761276245, + "learning_rate": 0.00019747849158548858, + "loss": 5.2334, + "step": 10650 + }, + { + "epoch": 0.12, + "grad_norm": 3.640681028366089, + "learning_rate": 0.00019743741432280625, + "loss": 5.1206, + "step": 10700 + }, + { + "epoch": 0.12, + "grad_norm": 4.04166316986084, + "learning_rate": 0.00019739600951025236, + "loss": 5.0059, + "step": 10750 + }, + { + "epoch": 0.12, + "grad_norm": 4.637605667114258, + "learning_rate": 0.00019735427728701516, + "loss": 5.0302, + "step": 10800 + }, + { + "epoch": 0.12, + "grad_norm": 4.08723783493042, + "learning_rate": 0.0001973122177933835, + "loss": 5.1551, + "step": 10850 + }, + { + "epoch": 0.12, + "grad_norm": 3.7944953441619873, + "learning_rate": 0.00019726983117074643, + "loss": 5.0665, + "step": 10900 + }, + { + "epoch": 0.12, + "grad_norm": 5.2847371101379395, + "learning_rate": 0.00019722711756159266, + "loss": 5.2212, + "step": 10950 + }, + { + "epoch": 0.12, + "grad_norm": 4.109150409698486, + "learning_rate": 0.00019718407710951012, + "loss": 5.2645, + "step": 11000 + }, + { + "epoch": 0.12, + "grad_norm": 4.127768039703369, + "learning_rate": 0.0001971407099591855, + "loss": 5.0395, + "step": 11050 + }, + { + "epoch": 0.12, + "grad_norm": 5.058667182922363, + "learning_rate": 0.00019709701625640367, + "loss": 5.0247, + "step": 11100 + }, + { + "epoch": 0.12, + "grad_norm": 5.4407267570495605, + "learning_rate": 0.00019705299614804732, + "loss": 4.9935, + "step": 11150 + }, + { + "epoch": 0.12, + "grad_norm": 3.7877707481384277, + "learning_rate": 0.00019700864978209636, + "loss": 5.074, + "step": 11200 + }, + { + "epoch": 0.12, + "grad_norm": 3.777330160140991, + "learning_rate": 0.00019696397730762746, + "loss": 5.0458, + "step": 11250 + }, + { + "epoch": 0.13, + "grad_norm": 4.143067836761475, + "learning_rate": 0.0001969189788748136, + "loss": 4.9375, + "step": 11300 + }, + { + "epoch": 0.13, + "grad_norm": 5.560107231140137, + "learning_rate": 0.00019687365463492344, + "loss": 4.8285, + "step": 11350 + }, + { + "epoch": 0.13, + "grad_norm": 4.057905197143555, + "learning_rate": 0.00019682800474032095, + "loss": 4.9753, + "step": 11400 + }, + { + "epoch": 0.13, + "grad_norm": 3.835442066192627, + "learning_rate": 0.00019678202934446482, + "loss": 4.9368, + "step": 11450 + }, + { + "epoch": 0.13, + "grad_norm": 5.135551929473877, + "learning_rate": 0.0001967357286019079, + "loss": 4.9994, + "step": 11500 + }, + { + "epoch": 0.13, + "grad_norm": 4.615053653717041, + "learning_rate": 0.00019668910266829685, + "loss": 5.0182, + "step": 11550 + }, + { + "epoch": 0.13, + "grad_norm": 4.474258899688721, + "learning_rate": 0.0001966421517003714, + "loss": 4.8704, + "step": 11600 + }, + { + "epoch": 0.13, + "grad_norm": 4.264945030212402, + "learning_rate": 0.00019659487585596406, + "loss": 4.9076, + "step": 11650 + }, + { + "epoch": 0.13, + "grad_norm": 4.091209411621094, + "learning_rate": 0.00019654727529399925, + "loss": 4.7135, + "step": 11700 + }, + { + "epoch": 0.13, + "grad_norm": 4.154038429260254, + "learning_rate": 0.00019649935017449318, + "loss": 4.8239, + "step": 11750 + }, + { + "epoch": 0.13, + "grad_norm": 3.697162628173828, + "learning_rate": 0.00019645110065855305, + "loss": 4.9972, + "step": 11800 + }, + { + "epoch": 0.13, + "grad_norm": 4.0024847984313965, + "learning_rate": 0.00019640252690837645, + "loss": 4.8854, + "step": 11850 + }, + { + "epoch": 0.13, + "grad_norm": 3.9416885375976562, + "learning_rate": 0.0001963536290872511, + "loss": 4.8547, + "step": 11900 + }, + { + "epoch": 0.13, + "grad_norm": 3.978651285171509, + "learning_rate": 0.000196304407359554, + "loss": 4.7873, + "step": 11950 + }, + { + "epoch": 0.13, + "grad_norm": 4.435175895690918, + "learning_rate": 0.0001962548618907511, + "loss": 4.8124, + "step": 12000 + }, + { + "epoch": 0.13, + "grad_norm": 3.8776824474334717, + "learning_rate": 0.00019620499284739662, + "loss": 4.8896, + "step": 12050 + }, + { + "epoch": 0.13, + "grad_norm": 5.041496276855469, + "learning_rate": 0.00019615480039713248, + "loss": 4.8343, + "step": 12100 + }, + { + "epoch": 0.13, + "grad_norm": 4.18281888961792, + "learning_rate": 0.00019610428470868784, + "loss": 4.8559, + "step": 12150 + }, + { + "epoch": 0.14, + "grad_norm": 4.223630905151367, + "learning_rate": 0.00019605344595187844, + "loss": 4.8153, + "step": 12200 + }, + { + "epoch": 0.14, + "grad_norm": 4.63677453994751, + "learning_rate": 0.0001960022842976061, + "loss": 4.7951, + "step": 12250 + }, + { + "epoch": 0.14, + "grad_norm": 4.188296794891357, + "learning_rate": 0.00019595079991785802, + "loss": 4.8904, + "step": 12300 + }, + { + "epoch": 0.14, + "grad_norm": 4.402559280395508, + "learning_rate": 0.00019589899298570634, + "loss": 4.7851, + "step": 12350 + }, + { + "epoch": 0.14, + "grad_norm": 5.976877212524414, + "learning_rate": 0.00019584686367530755, + "loss": 4.6431, + "step": 12400 + }, + { + "epoch": 0.14, + "grad_norm": 4.849298477172852, + "learning_rate": 0.0001957944121619018, + "loss": 4.7544, + "step": 12450 + }, + { + "epoch": 0.14, + "grad_norm": 4.932714462280273, + "learning_rate": 0.0001957416386218124, + "loss": 4.6811, + "step": 12500 + }, + { + "epoch": 0.14, + "grad_norm": 4.682474136352539, + "learning_rate": 0.00019568854323244515, + "loss": 4.799, + "step": 12550 + }, + { + "epoch": 0.14, + "grad_norm": 5.228520393371582, + "learning_rate": 0.00019563619766470511, + "loss": 4.7622, + "step": 12600 + }, + { + "epoch": 0.14, + "grad_norm": 4.093870162963867, + "learning_rate": 0.00019558246554138458, + "loss": 4.7369, + "step": 12650 + }, + { + "epoch": 0.14, + "grad_norm": 5.248356342315674, + "learning_rate": 0.0001955284121038694, + "loss": 4.7519, + "step": 12700 + }, + { + "epoch": 0.14, + "grad_norm": 3.924299955368042, + "learning_rate": 0.00019547403753386803, + "loss": 4.6441, + "step": 12750 + }, + { + "epoch": 0.14, + "grad_norm": 4.972569942474365, + "learning_rate": 0.00019542043906868188, + "loss": 4.7192, + "step": 12800 + }, + { + "epoch": 0.14, + "grad_norm": 5.033604145050049, + "learning_rate": 0.00019536542919665846, + "loss": 4.6397, + "step": 12850 + }, + { + "epoch": 0.14, + "grad_norm": 5.222695350646973, + "learning_rate": 0.00019531009874003928, + "loss": 4.6309, + "step": 12900 + }, + { + "epoch": 0.14, + "grad_norm": 3.810999631881714, + "learning_rate": 0.00019525444788482562, + "loss": 4.6513, + "step": 12950 + }, + { + "epoch": 0.14, + "grad_norm": 5.272600173950195, + "learning_rate": 0.00019519847681809585, + "loss": 4.8001, + "step": 13000 + }, + { + "epoch": 0.14, + "grad_norm": 4.836308002471924, + "learning_rate": 0.00019514218572800468, + "loss": 4.7101, + "step": 13050 + }, + { + "epoch": 0.15, + "grad_norm": 4.598148345947266, + "learning_rate": 0.00019508557480378276, + "loss": 4.5578, + "step": 13100 + }, + { + "epoch": 0.15, + "grad_norm": 3.910820722579956, + "learning_rate": 0.0001950286442357358, + "loss": 4.7124, + "step": 13150 + }, + { + "epoch": 0.15, + "grad_norm": 3.856081962585449, + "learning_rate": 0.00019497139421524416, + "loss": 4.7563, + "step": 13200 + }, + { + "epoch": 0.15, + "grad_norm": 4.151907920837402, + "learning_rate": 0.00019491382493476195, + "loss": 4.6726, + "step": 13250 + }, + { + "epoch": 0.15, + "grad_norm": 4.349935054779053, + "learning_rate": 0.0001948559365878166, + "loss": 4.6341, + "step": 13300 + }, + { + "epoch": 0.15, + "grad_norm": 3.8229756355285645, + "learning_rate": 0.00019479772936900811, + "loss": 4.6183, + "step": 13350 + }, + { + "epoch": 0.15, + "grad_norm": 5.495506286621094, + "learning_rate": 0.0001947392034740084, + "loss": 4.6608, + "step": 13400 + }, + { + "epoch": 0.15, + "grad_norm": 4.307513236999512, + "learning_rate": 0.00019468035909956072, + "loss": 4.6805, + "step": 13450 + }, + { + "epoch": 0.15, + "grad_norm": 3.939659595489502, + "learning_rate": 0.0001946211964434788, + "loss": 4.679, + "step": 13500 + }, + { + "epoch": 0.15, + "grad_norm": 5.444967269897461, + "learning_rate": 0.00019456171570464653, + "loss": 4.7195, + "step": 13550 + }, + { + "epoch": 0.15, + "grad_norm": 4.513270854949951, + "learning_rate": 0.00019450191708301687, + "loss": 4.5367, + "step": 13600 + }, + { + "epoch": 0.15, + "grad_norm": 4.617405414581299, + "learning_rate": 0.00019444180077961146, + "loss": 4.5742, + "step": 13650 + }, + { + "epoch": 0.15, + "grad_norm": 4.580646991729736, + "learning_rate": 0.00019438136699652001, + "loss": 4.4936, + "step": 13700 + }, + { + "epoch": 0.15, + "grad_norm": 4.657532691955566, + "learning_rate": 0.00019432061593689927, + "loss": 4.6877, + "step": 13750 + }, + { + "epoch": 0.15, + "grad_norm": 5.374803066253662, + "learning_rate": 0.0001942595478049727, + "loss": 4.6101, + "step": 13800 + }, + { + "epoch": 0.15, + "grad_norm": 5.1111650466918945, + "learning_rate": 0.00019419816280602962, + "loss": 4.6185, + "step": 13850 + }, + { + "epoch": 0.15, + "grad_norm": 5.18306303024292, + "learning_rate": 0.00019413646114642446, + "loss": 4.5524, + "step": 13900 + }, + { + "epoch": 0.15, + "grad_norm": 4.411191463470459, + "learning_rate": 0.00019407444303357624, + "loss": 4.4346, + "step": 13950 + }, + { + "epoch": 0.16, + "grad_norm": 4.161925792694092, + "learning_rate": 0.0001940121086759678, + "loss": 4.3702, + "step": 14000 + }, + { + "epoch": 0.16, + "grad_norm": 5.059813022613525, + "learning_rate": 0.000193949458283145, + "loss": 4.5351, + "step": 14050 + }, + { + "epoch": 0.16, + "grad_norm": 5.563150882720947, + "learning_rate": 0.00019388649206571616, + "loss": 4.477, + "step": 14100 + }, + { + "epoch": 0.16, + "grad_norm": 5.1144609451293945, + "learning_rate": 0.00019382321023535127, + "loss": 4.6033, + "step": 14150 + }, + { + "epoch": 0.16, + "grad_norm": 4.734794616699219, + "learning_rate": 0.00019375961300478127, + "loss": 4.5287, + "step": 14200 + }, + { + "epoch": 0.16, + "grad_norm": 4.543684959411621, + "learning_rate": 0.00019369570058779743, + "loss": 4.4474, + "step": 14250 + }, + { + "epoch": 0.16, + "grad_norm": 5.4647979736328125, + "learning_rate": 0.00019363147319925047, + "loss": 4.3806, + "step": 14300 + }, + { + "epoch": 0.16, + "grad_norm": 5.058681964874268, + "learning_rate": 0.00019356693105505006, + "loss": 4.4998, + "step": 14350 + }, + { + "epoch": 0.16, + "grad_norm": 5.494804859161377, + "learning_rate": 0.00019350207437216386, + "loss": 4.3911, + "step": 14400 + }, + { + "epoch": 0.16, + "grad_norm": 5.227470397949219, + "learning_rate": 0.00019343690336861687, + "loss": 4.2557, + "step": 14450 + }, + { + "epoch": 0.16, + "grad_norm": 3.7686829566955566, + "learning_rate": 0.00019337141826349092, + "loss": 4.313, + "step": 14500 + }, + { + "epoch": 0.16, + "grad_norm": 4.975152492523193, + "learning_rate": 0.00019330561927692345, + "loss": 4.2914, + "step": 14550 + }, + { + "epoch": 0.16, + "grad_norm": 5.811885356903076, + "learning_rate": 0.00019323950663010733, + "loss": 4.3566, + "step": 14600 + }, + { + "epoch": 0.16, + "grad_norm": 5.566829204559326, + "learning_rate": 0.00019317308054528966, + "loss": 4.2847, + "step": 14650 + }, + { + "epoch": 0.16, + "grad_norm": 5.977478504180908, + "learning_rate": 0.0001931063412457713, + "loss": 4.3034, + "step": 14700 + }, + { + "epoch": 0.16, + "grad_norm": 4.601086616516113, + "learning_rate": 0.00019303928895590596, + "loss": 4.1929, + "step": 14750 + }, + { + "epoch": 0.16, + "grad_norm": 5.051478385925293, + "learning_rate": 0.0001929719239010996, + "loss": 4.2749, + "step": 14800 + }, + { + "epoch": 0.16, + "grad_norm": 6.248847961425781, + "learning_rate": 0.00019290424630780947, + "loss": 4.3419, + "step": 14850 + }, + { + "epoch": 0.17, + "grad_norm": 5.392062664031982, + "learning_rate": 0.0001928362564035436, + "loss": 4.4038, + "step": 14900 + }, + { + "epoch": 0.17, + "grad_norm": 5.6346211433410645, + "learning_rate": 0.00019276795441685975, + "loss": 4.3403, + "step": 14950 + }, + { + "epoch": 0.17, + "grad_norm": 5.646982192993164, + "learning_rate": 0.00019269934057736493, + "loss": 4.252, + "step": 15000 + }, + { + "epoch": 0.17, + "grad_norm": 5.455059051513672, + "learning_rate": 0.00019263041511571438, + "loss": 4.3809, + "step": 15050 + }, + { + "epoch": 0.17, + "grad_norm": 5.478726387023926, + "learning_rate": 0.00019256117826361096, + "loss": 4.1885, + "step": 15100 + }, + { + "epoch": 0.17, + "grad_norm": 5.029292106628418, + "learning_rate": 0.0001924916302538043, + "loss": 4.2615, + "step": 15150 + }, + { + "epoch": 0.17, + "grad_norm": 5.6447978019714355, + "learning_rate": 0.00019242177132009, + "loss": 4.268, + "step": 15200 + }, + { + "epoch": 0.17, + "grad_norm": 5.165138244628906, + "learning_rate": 0.00019235160169730895, + "loss": 4.3222, + "step": 15250 + }, + { + "epoch": 0.17, + "grad_norm": 5.661884784698486, + "learning_rate": 0.00019228112162134641, + "loss": 4.3179, + "step": 15300 + }, + { + "epoch": 0.17, + "grad_norm": 6.117990493774414, + "learning_rate": 0.0001922103313291313, + "loss": 4.2241, + "step": 15350 + }, + { + "epoch": 0.17, + "grad_norm": 4.299765110015869, + "learning_rate": 0.0001921392310586353, + "loss": 4.2602, + "step": 15400 + }, + { + "epoch": 0.17, + "grad_norm": 5.798460483551025, + "learning_rate": 0.00019206782104887223, + "loss": 4.3096, + "step": 15450 + }, + { + "epoch": 0.17, + "grad_norm": 5.016506671905518, + "learning_rate": 0.00019199610153989712, + "loss": 4.2073, + "step": 15500 + }, + { + "epoch": 0.17, + "grad_norm": 9.708767890930176, + "learning_rate": 0.0001919240727728054, + "loss": 4.2099, + "step": 15550 + }, + { + "epoch": 0.17, + "grad_norm": 4.904361248016357, + "learning_rate": 0.00019185173498973204, + "loss": 4.2461, + "step": 15600 + }, + { + "epoch": 0.17, + "grad_norm": 5.290199279785156, + "learning_rate": 0.00019177908843385103, + "loss": 4.115, + "step": 15650 + }, + { + "epoch": 0.17, + "grad_norm": 6.290179252624512, + "learning_rate": 0.00019170613334937406, + "loss": 4.3295, + "step": 15700 + }, + { + "epoch": 0.17, + "grad_norm": 5.071104526519775, + "learning_rate": 0.00019163286998155027, + "loss": 4.1532, + "step": 15750 + }, + { + "epoch": 0.18, + "grad_norm": 4.5464067459106445, + "learning_rate": 0.00019155929857666494, + "loss": 4.0761, + "step": 15800 + }, + { + "epoch": 0.18, + "grad_norm": 4.664229393005371, + "learning_rate": 0.0001914854193820389, + "loss": 4.1371, + "step": 15850 + }, + { + "epoch": 0.18, + "grad_norm": 7.168484210968018, + "learning_rate": 0.0001914112326460277, + "loss": 4.178, + "step": 15900 + }, + { + "epoch": 0.18, + "grad_norm": 6.570041179656982, + "learning_rate": 0.0001913367386180207, + "loss": 4.1536, + "step": 15950 + }, + { + "epoch": 0.18, + "grad_norm": 5.298222064971924, + "learning_rate": 0.00019126193754844036, + "loss": 4.2089, + "step": 16000 + }, + { + "epoch": 0.18, + "grad_norm": 7.139255523681641, + "learning_rate": 0.0001911868296887411, + "loss": 4.1362, + "step": 16050 + }, + { + "epoch": 0.18, + "grad_norm": 5.763050556182861, + "learning_rate": 0.00019111141529140887, + "loss": 4.1106, + "step": 16100 + }, + { + "epoch": 0.18, + "grad_norm": 6.586143493652344, + "learning_rate": 0.00019103569460995998, + "loss": 3.9519, + "step": 16150 + }, + { + "epoch": 0.18, + "grad_norm": 5.827348232269287, + "learning_rate": 0.00019095966789894038, + "loss": 3.9598, + "step": 16200 + }, + { + "epoch": 0.18, + "grad_norm": 5.121611595153809, + "learning_rate": 0.00019088333541392478, + "loss": 4.1347, + "step": 16250 + }, + { + "epoch": 0.18, + "grad_norm": 5.110377788543701, + "learning_rate": 0.00019080669741151581, + "loss": 4.0088, + "step": 16300 + }, + { + "epoch": 0.18, + "grad_norm": 6.672893047332764, + "learning_rate": 0.00019072975414934318, + "loss": 4.0916, + "step": 16350 + }, + { + "epoch": 0.18, + "grad_norm": 5.667397499084473, + "learning_rate": 0.00019065250588606262, + "loss": 4.0695, + "step": 16400 + }, + { + "epoch": 0.18, + "grad_norm": 6.404243469238281, + "learning_rate": 0.0001905749528813553, + "loss": 3.9728, + "step": 16450 + }, + { + "epoch": 0.18, + "grad_norm": 6.912601470947266, + "learning_rate": 0.00019049709539592686, + "loss": 4.029, + "step": 16500 + }, + { + "epoch": 0.18, + "grad_norm": 5.015479564666748, + "learning_rate": 0.00019041893369150636, + "loss": 4.0268, + "step": 16550 + }, + { + "epoch": 0.18, + "grad_norm": 6.656422138214111, + "learning_rate": 0.00019034046803084563, + "loss": 4.0393, + "step": 16600 + }, + { + "epoch": 0.18, + "grad_norm": 4.685242176055908, + "learning_rate": 0.00019026169867771825, + "loss": 4.1104, + "step": 16650 + }, + { + "epoch": 0.19, + "grad_norm": 6.503780364990234, + "learning_rate": 0.00019018262589691874, + "loss": 4.0344, + "step": 16700 + }, + { + "epoch": 0.19, + "grad_norm": 4.73757266998291, + "learning_rate": 0.00019010324995426156, + "loss": 4.1114, + "step": 16750 + }, + { + "epoch": 0.19, + "grad_norm": 7.276214122772217, + "learning_rate": 0.0001900235711165804, + "loss": 3.8838, + "step": 16800 + }, + { + "epoch": 0.19, + "grad_norm": 6.2224273681640625, + "learning_rate": 0.00018994358965172717, + "loss": 3.9479, + "step": 16850 + }, + { + "epoch": 0.19, + "grad_norm": 6.4751996994018555, + "learning_rate": 0.00018986330582857096, + "loss": 4.0079, + "step": 16900 + }, + { + "epoch": 0.19, + "grad_norm": 4.874088764190674, + "learning_rate": 0.00018978271991699743, + "loss": 4.1664, + "step": 16950 + }, + { + "epoch": 0.19, + "grad_norm": 7.713326454162598, + "learning_rate": 0.0001897018321879077, + "loss": 3.9646, + "step": 17000 + }, + { + "epoch": 0.19, + "grad_norm": 5.753252029418945, + "learning_rate": 0.00018962064291321747, + "loss": 3.8574, + "step": 17050 + }, + { + "epoch": 0.19, + "grad_norm": 5.962434768676758, + "learning_rate": 0.0001895391523658562, + "loss": 3.9757, + "step": 17100 + }, + { + "epoch": 0.19, + "grad_norm": 5.875513553619385, + "learning_rate": 0.00018945736081976607, + "loss": 4.0424, + "step": 17150 + }, + { + "epoch": 0.19, + "grad_norm": 6.298293590545654, + "learning_rate": 0.00018937526854990108, + "loss": 3.958, + "step": 17200 + }, + { + "epoch": 0.19, + "grad_norm": 4.98872184753418, + "learning_rate": 0.00018929287583222625, + "loss": 3.9225, + "step": 17250 + }, + { + "epoch": 0.19, + "grad_norm": 6.467836380004883, + "learning_rate": 0.00018921018294371645, + "loss": 3.9369, + "step": 17300 + }, + { + "epoch": 0.19, + "grad_norm": 5.920988082885742, + "learning_rate": 0.0001891271901623558, + "loss": 3.975, + "step": 17350 + }, + { + "epoch": 0.19, + "grad_norm": 5.652931213378906, + "learning_rate": 0.00018904389776713641, + "loss": 3.9067, + "step": 17400 + }, + { + "epoch": 0.19, + "grad_norm": 5.372093200683594, + "learning_rate": 0.00018896030603805767, + "loss": 3.9267, + "step": 17450 + }, + { + "epoch": 0.19, + "grad_norm": 5.743618965148926, + "learning_rate": 0.00018887641525612518, + "loss": 3.8912, + "step": 17500 + }, + { + "epoch": 0.19, + "grad_norm": 8.207468032836914, + "learning_rate": 0.00018879222570334985, + "loss": 3.9101, + "step": 17550 + }, + { + "epoch": 0.2, + "grad_norm": 6.930370807647705, + "learning_rate": 0.00018870773766274697, + "loss": 3.8817, + "step": 17600 + }, + { + "epoch": 0.2, + "grad_norm": 6.367077350616455, + "learning_rate": 0.00018862295141833523, + "loss": 3.8931, + "step": 17650 + }, + { + "epoch": 0.2, + "grad_norm": 6.587210178375244, + "learning_rate": 0.00018853786725513575, + "loss": 3.9393, + "step": 17700 + }, + { + "epoch": 0.2, + "grad_norm": 5.502545356750488, + "learning_rate": 0.0001884524854591712, + "loss": 3.8489, + "step": 17750 + }, + { + "epoch": 0.2, + "grad_norm": 6.352043628692627, + "learning_rate": 0.00018836680631746476, + "loss": 3.8162, + "step": 17800 + }, + { + "epoch": 0.2, + "grad_norm": 5.686196804046631, + "learning_rate": 0.00018828083011803917, + "loss": 3.9476, + "step": 17850 + }, + { + "epoch": 0.2, + "grad_norm": 6.225170612335205, + "learning_rate": 0.00018819455714991578, + "loss": 3.9404, + "step": 17900 + }, + { + "epoch": 0.2, + "grad_norm": 7.1347150802612305, + "learning_rate": 0.0001881079877031136, + "loss": 3.9798, + "step": 17950 + }, + { + "epoch": 0.2, + "grad_norm": 5.343573093414307, + "learning_rate": 0.0001880211220686482, + "loss": 3.9038, + "step": 18000 + }, + { + "epoch": 0.2, + "grad_norm": 6.858921051025391, + "learning_rate": 0.00018793396053853098, + "loss": 3.8792, + "step": 18050 + }, + { + "epoch": 0.2, + "grad_norm": 6.721033573150635, + "learning_rate": 0.0001878482554434291, + "loss": 3.8421, + "step": 18100 + }, + { + "epoch": 0.2, + "grad_norm": 6.173632621765137, + "learning_rate": 0.00018776050890530516, + "loss": 4.0233, + "step": 18150 + }, + { + "epoch": 0.2, + "grad_norm": 5.996013164520264, + "learning_rate": 0.00018767246734761796, + "loss": 3.8057, + "step": 18200 + }, + { + "epoch": 0.2, + "grad_norm": 5.707641124725342, + "learning_rate": 0.00018758413106633186, + "loss": 3.8299, + "step": 18250 + }, + { + "epoch": 0.2, + "grad_norm": 7.221241474151611, + "learning_rate": 0.00018749550035840193, + "loss": 3.8828, + "step": 18300 + }, + { + "epoch": 0.2, + "grad_norm": 5.554357528686523, + "learning_rate": 0.00018740657552177305, + "loss": 3.8553, + "step": 18350 + }, + { + "epoch": 0.2, + "grad_norm": 4.664674282073975, + "learning_rate": 0.00018731735685537885, + "loss": 3.8838, + "step": 18400 + }, + { + "epoch": 0.2, + "grad_norm": 5.485450267791748, + "learning_rate": 0.00018722784465914071, + "loss": 3.8165, + "step": 18450 + }, + { + "epoch": 0.21, + "grad_norm": 5.825826644897461, + "learning_rate": 0.00018713803923396668, + "loss": 3.7588, + "step": 18500 + }, + { + "epoch": 0.21, + "grad_norm": 5.392491817474365, + "learning_rate": 0.0001870479408817507, + "loss": 3.8001, + "step": 18550 + }, + { + "epoch": 0.21, + "grad_norm": 5.493740081787109, + "learning_rate": 0.00018695754990537123, + "loss": 3.9735, + "step": 18600 + }, + { + "epoch": 0.21, + "grad_norm": 5.905117511749268, + "learning_rate": 0.00018686686660869062, + "loss": 3.7334, + "step": 18650 + }, + { + "epoch": 0.21, + "grad_norm": 5.598316192626953, + "learning_rate": 0.0001867758912965537, + "loss": 3.8269, + "step": 18700 + }, + { + "epoch": 0.21, + "grad_norm": 5.979629039764404, + "learning_rate": 0.00018668462427478714, + "loss": 3.8713, + "step": 18750 + }, + { + "epoch": 0.21, + "grad_norm": 6.480854511260986, + "learning_rate": 0.00018659306585019813, + "loss": 3.7792, + "step": 18800 + }, + { + "epoch": 0.21, + "grad_norm": 5.820549488067627, + "learning_rate": 0.00018650121633057346, + "loss": 3.6656, + "step": 18850 + }, + { + "epoch": 0.21, + "grad_norm": 6.579679012298584, + "learning_rate": 0.0001864090760246785, + "loss": 3.9109, + "step": 18900 + }, + { + "epoch": 0.21, + "grad_norm": 5.669819355010986, + "learning_rate": 0.00018631664524225615, + "loss": 3.7815, + "step": 18950 + }, + { + "epoch": 0.21, + "grad_norm": 6.644351005554199, + "learning_rate": 0.0001862239242940257, + "loss": 3.7529, + "step": 19000 + }, + { + "epoch": 0.21, + "grad_norm": 5.022332191467285, + "learning_rate": 0.00018613091349168205, + "loss": 3.7001, + "step": 19050 + }, + { + "epoch": 0.21, + "grad_norm": 6.376641273498535, + "learning_rate": 0.00018603761314789425, + "loss": 3.6871, + "step": 19100 + }, + { + "epoch": 0.21, + "grad_norm": 6.298123359680176, + "learning_rate": 0.00018594402357630495, + "loss": 3.8095, + "step": 19150 + }, + { + "epoch": 0.21, + "grad_norm": 4.590997695922852, + "learning_rate": 0.00018585014509152882, + "loss": 3.8069, + "step": 19200 + }, + { + "epoch": 0.21, + "grad_norm": 6.710943222045898, + "learning_rate": 0.00018575597800915198, + "loss": 3.8547, + "step": 19250 + }, + { + "epoch": 0.21, + "grad_norm": 5.5094499588012695, + "learning_rate": 0.0001856615226457305, + "loss": 3.7314, + "step": 19300 + }, + { + "epoch": 0.21, + "grad_norm": 5.584799766540527, + "learning_rate": 0.0001855667793187898, + "loss": 3.7514, + "step": 19350 + }, + { + "epoch": 0.22, + "grad_norm": 6.0391154289245605, + "learning_rate": 0.00018547174834682308, + "loss": 3.6231, + "step": 19400 + }, + { + "epoch": 0.22, + "grad_norm": 5.92927885055542, + "learning_rate": 0.00018537643004929067, + "loss": 3.7008, + "step": 19450 + }, + { + "epoch": 0.22, + "grad_norm": 6.359600067138672, + "learning_rate": 0.00018528082474661867, + "loss": 3.798, + "step": 19500 + }, + { + "epoch": 0.22, + "grad_norm": 5.198579788208008, + "learning_rate": 0.0001851849327601981, + "loss": 3.7187, + "step": 19550 + }, + { + "epoch": 0.22, + "grad_norm": 6.796758651733398, + "learning_rate": 0.00018508875441238364, + "loss": 3.7086, + "step": 19600 + }, + { + "epoch": 0.22, + "grad_norm": 4.889728546142578, + "learning_rate": 0.00018499229002649258, + "loss": 3.7387, + "step": 19650 + }, + { + "epoch": 0.22, + "grad_norm": 6.382203102111816, + "learning_rate": 0.0001848955399268039, + "loss": 3.5992, + "step": 19700 + }, + { + "epoch": 0.22, + "grad_norm": 6.061376571655273, + "learning_rate": 0.00018479850443855686, + "loss": 3.6865, + "step": 19750 + }, + { + "epoch": 0.22, + "grad_norm": 5.2180681228637695, + "learning_rate": 0.0001847011838879503, + "loss": 3.7467, + "step": 19800 + }, + { + "epoch": 0.22, + "grad_norm": 5.063679218292236, + "learning_rate": 0.0001846035786021412, + "loss": 3.6894, + "step": 19850 + }, + { + "epoch": 0.22, + "grad_norm": 5.036098480224609, + "learning_rate": 0.00018450568890924373, + "loss": 3.6412, + "step": 19900 + }, + { + "epoch": 0.22, + "grad_norm": 4.86781644821167, + "learning_rate": 0.00018440751513832822, + "loss": 3.637, + "step": 19950 + }, + { + "epoch": 0.22, + "grad_norm": 5.41668176651001, + "learning_rate": 0.00018430905761941983, + "loss": 3.6814, + "step": 20000 + }, + { + "epoch": 0.22, + "grad_norm": 6.117024898529053, + "learning_rate": 0.00018421031668349773, + "loss": 3.6257, + "step": 20050 + }, + { + "epoch": 0.22, + "grad_norm": 7.368699073791504, + "learning_rate": 0.00018411129266249373, + "loss": 3.7111, + "step": 20100 + }, + { + "epoch": 0.22, + "grad_norm": 4.378394603729248, + "learning_rate": 0.0001840119858892913, + "loss": 3.7197, + "step": 20150 + }, + { + "epoch": 0.22, + "grad_norm": 7.029990196228027, + "learning_rate": 0.0001839123966977245, + "loss": 3.7267, + "step": 20200 + }, + { + "epoch": 0.22, + "grad_norm": 9.922813415527344, + "learning_rate": 0.00018381252542257662, + "loss": 3.7203, + "step": 20250 + }, + { + "epoch": 0.23, + "grad_norm": 4.9374518394470215, + "learning_rate": 0.00018371237239957932, + "loss": 3.6876, + "step": 20300 + }, + { + "epoch": 0.23, + "grad_norm": 5.682550430297852, + "learning_rate": 0.00018361193796541142, + "loss": 3.6862, + "step": 20350 + }, + { + "epoch": 0.23, + "grad_norm": 5.477772235870361, + "learning_rate": 0.00018351122245769771, + "loss": 3.5982, + "step": 20400 + }, + { + "epoch": 0.23, + "grad_norm": 7.745680332183838, + "learning_rate": 0.00018341224888886997, + "loss": 3.6978, + "step": 20450 + }, + { + "epoch": 0.23, + "grad_norm": 6.691402912139893, + "learning_rate": 0.0001833109778552932, + "loss": 3.6693, + "step": 20500 + }, + { + "epoch": 0.23, + "grad_norm": 6.229629993438721, + "learning_rate": 0.00018320942675989125, + "loss": 3.6327, + "step": 20550 + }, + { + "epoch": 0.23, + "grad_norm": 5.655289649963379, + "learning_rate": 0.0001831075959440427, + "loss": 3.6032, + "step": 20600 + }, + { + "epoch": 0.23, + "grad_norm": 5.4868927001953125, + "learning_rate": 0.00018300548575006658, + "loss": 3.7059, + "step": 20650 + }, + { + "epoch": 0.23, + "grad_norm": 7.387706756591797, + "learning_rate": 0.00018290309652122083, + "loss": 3.6838, + "step": 20700 + }, + { + "epoch": 0.23, + "grad_norm": 4.884798049926758, + "learning_rate": 0.00018280042860170168, + "loss": 3.665, + "step": 20750 + }, + { + "epoch": 0.23, + "grad_norm": 7.185595512390137, + "learning_rate": 0.00018269748233664204, + "loss": 3.6057, + "step": 20800 + }, + { + "epoch": 0.23, + "grad_norm": 6.449123382568359, + "learning_rate": 0.0001825942580721106, + "loss": 3.6262, + "step": 20850 + }, + { + "epoch": 0.23, + "grad_norm": 5.469310283660889, + "learning_rate": 0.00018249075615511053, + "loss": 3.522, + "step": 20900 + }, + { + "epoch": 0.23, + "grad_norm": 5.678877353668213, + "learning_rate": 0.0001823869769335784, + "loss": 3.6757, + "step": 20950 + }, + { + "epoch": 0.23, + "grad_norm": 5.033955097198486, + "learning_rate": 0.000182282920756383, + "loss": 3.7316, + "step": 21000 + }, + { + "epoch": 0.23, + "grad_norm": 6.790628433227539, + "learning_rate": 0.00018217858797332413, + "loss": 3.545, + "step": 21050 + }, + { + "epoch": 0.23, + "grad_norm": 6.205599308013916, + "learning_rate": 0.00018207397893513143, + "loss": 3.6035, + "step": 21100 + }, + { + "epoch": 0.23, + "grad_norm": 5.7604804039001465, + "learning_rate": 0.00018196909399346316, + "loss": 3.6869, + "step": 21150 + }, + { + "epoch": 0.24, + "grad_norm": 5.528883457183838, + "learning_rate": 0.0001818639335009052, + "loss": 3.6493, + "step": 21200 + }, + { + "epoch": 0.24, + "grad_norm": 6.46929407119751, + "learning_rate": 0.00018175849781096966, + "loss": 3.639, + "step": 21250 + }, + { + "epoch": 0.24, + "grad_norm": 5.487035274505615, + "learning_rate": 0.00018165278727809368, + "loss": 3.5755, + "step": 21300 + }, + { + "epoch": 0.24, + "grad_norm": 6.251669406890869, + "learning_rate": 0.00018154680225763848, + "loss": 3.704, + "step": 21350 + }, + { + "epoch": 0.24, + "grad_norm": 6.204404830932617, + "learning_rate": 0.00018144054310588792, + "loss": 3.6071, + "step": 21400 + }, + { + "epoch": 0.24, + "grad_norm": 5.7311482429504395, + "learning_rate": 0.00018133401018004743, + "loss": 3.5395, + "step": 21450 + }, + { + "epoch": 0.24, + "grad_norm": 7.110382556915283, + "learning_rate": 0.00018122720383824273, + "loss": 3.6643, + "step": 21500 + }, + { + "epoch": 0.24, + "grad_norm": 5.991401672363281, + "learning_rate": 0.0001811201244395187, + "loss": 3.6752, + "step": 21550 + }, + { + "epoch": 0.24, + "grad_norm": 5.788415431976318, + "learning_rate": 0.0001810127723438381, + "loss": 3.6362, + "step": 21600 + }, + { + "epoch": 0.24, + "grad_norm": 4.827778339385986, + "learning_rate": 0.00018090514791208043, + "loss": 3.7298, + "step": 21650 + }, + { + "epoch": 0.24, + "grad_norm": 5.7845916748046875, + "learning_rate": 0.0001807972515060407, + "loss": 3.543, + "step": 21700 + }, + { + "epoch": 0.24, + "grad_norm": 5.071081638336182, + "learning_rate": 0.00018068908348842818, + "loss": 3.5706, + "step": 21750 + }, + { + "epoch": 0.24, + "grad_norm": 5.189342021942139, + "learning_rate": 0.00018058064422286525, + "loss": 3.667, + "step": 21800 + }, + { + "epoch": 0.24, + "grad_norm": 7.787344455718994, + "learning_rate": 0.00018047193407388603, + "loss": 3.4985, + "step": 21850 + }, + { + "epoch": 0.24, + "grad_norm": 4.411252975463867, + "learning_rate": 0.00018036295340693531, + "loss": 3.6719, + "step": 21900 + }, + { + "epoch": 0.24, + "grad_norm": 5.700460433959961, + "learning_rate": 0.00018025370258836732, + "loss": 3.5075, + "step": 21950 + }, + { + "epoch": 0.24, + "grad_norm": 5.121459007263184, + "learning_rate": 0.00018014418198544432, + "loss": 3.5511, + "step": 22000 + }, + { + "epoch": 0.24, + "grad_norm": 5.29133415222168, + "learning_rate": 0.0001800343919663356, + "loss": 3.7063, + "step": 22050 + }, + { + "epoch": 0.25, + "grad_norm": 6.53157377243042, + "learning_rate": 0.00017992433290011604, + "loss": 3.5146, + "step": 22100 + }, + { + "epoch": 0.25, + "grad_norm": 6.442373275756836, + "learning_rate": 0.00017981400515676508, + "loss": 3.5431, + "step": 22150 + }, + { + "epoch": 0.25, + "grad_norm": 5.247061729431152, + "learning_rate": 0.00017970340910716522, + "loss": 3.604, + "step": 22200 + }, + { + "epoch": 0.25, + "grad_norm": 6.570899486541748, + "learning_rate": 0.000179592545123101, + "loss": 3.6034, + "step": 22250 + }, + { + "epoch": 0.25, + "grad_norm": 6.012238502502441, + "learning_rate": 0.00017948141357725764, + "loss": 3.4793, + "step": 22300 + }, + { + "epoch": 0.25, + "grad_norm": 5.5325422286987305, + "learning_rate": 0.0001793700148432198, + "loss": 3.563, + "step": 22350 + }, + { + "epoch": 0.25, + "grad_norm": 4.889975070953369, + "learning_rate": 0.00017925834929547035, + "loss": 3.5512, + "step": 22400 + }, + { + "epoch": 0.25, + "grad_norm": 5.225555896759033, + "learning_rate": 0.00017914641730938907, + "loss": 3.5521, + "step": 22450 + }, + { + "epoch": 0.25, + "grad_norm": 5.430109024047852, + "learning_rate": 0.0001790342192612514, + "loss": 3.4549, + "step": 22500 + }, + { + "epoch": 0.25, + "grad_norm": 5.8808274269104, + "learning_rate": 0.00017892175552822716, + "loss": 3.5518, + "step": 22550 + }, + { + "epoch": 0.25, + "grad_norm": 6.657894134521484, + "learning_rate": 0.00017880902648837946, + "loss": 3.4643, + "step": 22600 + }, + { + "epoch": 0.25, + "grad_norm": 4.968985557556152, + "learning_rate": 0.00017869603252066308, + "loss": 3.5022, + "step": 22650 + }, + { + "epoch": 0.25, + "grad_norm": 5.369678497314453, + "learning_rate": 0.00017858277400492357, + "loss": 3.6906, + "step": 22700 + }, + { + "epoch": 0.25, + "grad_norm": 6.433826446533203, + "learning_rate": 0.0001784692513218956, + "loss": 3.4281, + "step": 22750 + }, + { + "epoch": 0.25, + "grad_norm": 5.364591121673584, + "learning_rate": 0.00017835546485320202, + "loss": 3.6194, + "step": 22800 + }, + { + "epoch": 0.25, + "grad_norm": 5.889247894287109, + "learning_rate": 0.00017824141498135244, + "loss": 3.7013, + "step": 22850 + }, + { + "epoch": 0.25, + "grad_norm": 5.254469394683838, + "learning_rate": 0.0001781271020897419, + "loss": 3.4107, + "step": 22900 + }, + { + "epoch": 0.25, + "grad_norm": 5.486823558807373, + "learning_rate": 0.0001780125265626495, + "loss": 3.5453, + "step": 22950 + }, + { + "epoch": 0.26, + "grad_norm": 5.4713311195373535, + "learning_rate": 0.0001778976887852375, + "loss": 3.5482, + "step": 23000 + }, + { + "epoch": 0.26, + "grad_norm": 6.2519731521606445, + "learning_rate": 0.00017778258914354946, + "loss": 3.6251, + "step": 23050 + }, + { + "epoch": 0.26, + "grad_norm": 5.657818794250488, + "learning_rate": 0.00017766722802450944, + "loss": 3.5081, + "step": 23100 + }, + { + "epoch": 0.26, + "grad_norm": 6.176442623138428, + "learning_rate": 0.0001775516058159204, + "loss": 3.45, + "step": 23150 + }, + { + "epoch": 0.26, + "grad_norm": 5.838647365570068, + "learning_rate": 0.00017743572290646303, + "loss": 3.419, + "step": 23200 + }, + { + "epoch": 0.26, + "grad_norm": 6.912227630615234, + "learning_rate": 0.00017731957968569436, + "loss": 3.4892, + "step": 23250 + }, + { + "epoch": 0.26, + "grad_norm": 7.276485919952393, + "learning_rate": 0.0001772031765440465, + "loss": 3.5143, + "step": 23300 + }, + { + "epoch": 0.26, + "grad_norm": 5.351586818695068, + "learning_rate": 0.0001770865138728254, + "loss": 3.5467, + "step": 23350 + }, + { + "epoch": 0.26, + "grad_norm": 5.220416069030762, + "learning_rate": 0.00017696959206420937, + "loss": 3.4736, + "step": 23400 + }, + { + "epoch": 0.26, + "grad_norm": 5.282609462738037, + "learning_rate": 0.00017685241151124781, + "loss": 3.4181, + "step": 23450 + }, + { + "epoch": 0.26, + "grad_norm": 4.958062171936035, + "learning_rate": 0.00017673497260786006, + "loss": 3.4309, + "step": 23500 + }, + { + "epoch": 0.26, + "grad_norm": 6.3785929679870605, + "learning_rate": 0.00017661727574883388, + "loss": 3.3805, + "step": 23550 + }, + { + "epoch": 0.26, + "grad_norm": 5.702798366546631, + "learning_rate": 0.00017649932132982415, + "loss": 3.5371, + "step": 23600 + }, + { + "epoch": 0.26, + "grad_norm": 6.496365547180176, + "learning_rate": 0.0001763811097473516, + "loss": 3.4107, + "step": 23650 + }, + { + "epoch": 0.26, + "grad_norm": 5.093421936035156, + "learning_rate": 0.00017626264139880148, + "loss": 3.5514, + "step": 23700 + }, + { + "epoch": 0.26, + "grad_norm": 5.6509480476379395, + "learning_rate": 0.0001761439166824221, + "loss": 3.5612, + "step": 23750 + }, + { + "epoch": 0.26, + "grad_norm": 5.662957191467285, + "learning_rate": 0.00017602493599732372, + "loss": 3.5515, + "step": 23800 + }, + { + "epoch": 0.26, + "grad_norm": 7.548245429992676, + "learning_rate": 0.000175905699743477, + "loss": 3.5552, + "step": 23850 + }, + { + "epoch": 0.27, + "grad_norm": 5.2797112464904785, + "learning_rate": 0.00017578620832171173, + "loss": 3.4159, + "step": 23900 + }, + { + "epoch": 0.27, + "grad_norm": 6.431013584136963, + "learning_rate": 0.0001756664621337155, + "loss": 3.4257, + "step": 23950 + }, + { + "epoch": 0.27, + "grad_norm": 8.88436222076416, + "learning_rate": 0.00017554646158203236, + "loss": 3.5517, + "step": 24000 + }, + { + "epoch": 0.27, + "grad_norm": 5.538012981414795, + "learning_rate": 0.00017542620707006136, + "loss": 3.4451, + "step": 24050 + }, + { + "epoch": 0.27, + "grad_norm": 6.702478408813477, + "learning_rate": 0.00017530569900205538, + "loss": 3.5453, + "step": 24100 + }, + { + "epoch": 0.27, + "grad_norm": 5.236027240753174, + "learning_rate": 0.00017518493778311957, + "loss": 3.4483, + "step": 24150 + }, + { + "epoch": 0.27, + "grad_norm": 5.824537754058838, + "learning_rate": 0.00017506392381921014, + "loss": 3.507, + "step": 24200 + }, + { + "epoch": 0.27, + "grad_norm": 6.682642459869385, + "learning_rate": 0.0001749426575171329, + "loss": 3.4624, + "step": 24250 + }, + { + "epoch": 0.27, + "grad_norm": 6.02097225189209, + "learning_rate": 0.00017482113928454196, + "loss": 3.4782, + "step": 24300 + }, + { + "epoch": 0.27, + "grad_norm": 5.951188087463379, + "learning_rate": 0.00017469936952993834, + "loss": 3.5305, + "step": 24350 + }, + { + "epoch": 0.27, + "grad_norm": 5.47694730758667, + "learning_rate": 0.00017457734866266854, + "loss": 3.4653, + "step": 24400 + }, + { + "epoch": 0.27, + "grad_norm": 5.073057651519775, + "learning_rate": 0.0001744575249785453, + "loss": 3.4969, + "step": 24450 + }, + { + "epoch": 0.27, + "grad_norm": 6.532285690307617, + "learning_rate": 0.00017433500811915326, + "loss": 3.3932, + "step": 24500 + }, + { + "epoch": 0.27, + "grad_norm": 6.016458988189697, + "learning_rate": 0.00017421224137194837, + "loss": 3.4828, + "step": 24550 + }, + { + "epoch": 0.27, + "grad_norm": 7.032898902893066, + "learning_rate": 0.0001740892251496286, + "loss": 3.4347, + "step": 24600 + }, + { + "epoch": 0.27, + "grad_norm": 6.3446431159973145, + "learning_rate": 0.00017396595986573065, + "loss": 3.4101, + "step": 24650 + }, + { + "epoch": 0.27, + "grad_norm": 5.952356815338135, + "learning_rate": 0.00017384244593462859, + "loss": 3.4296, + "step": 24700 + }, + { + "epoch": 0.27, + "grad_norm": 5.403810501098633, + "learning_rate": 0.00017371868377153216, + "loss": 3.4264, + "step": 24750 + }, + { + "epoch": 0.27, + "grad_norm": 6.429996490478516, + "learning_rate": 0.00017359467379248568, + "loss": 3.4341, + "step": 24800 + }, + { + "epoch": 0.28, + "grad_norm": 5.618744850158691, + "learning_rate": 0.00017347041641436653, + "loss": 3.3357, + "step": 24850 + }, + { + "epoch": 0.28, + "grad_norm": 6.539459705352783, + "learning_rate": 0.00017334840455978504, + "loss": 3.5718, + "step": 24900 + }, + { + "epoch": 0.28, + "grad_norm": 4.993662357330322, + "learning_rate": 0.00017322365856462736, + "loss": 3.4774, + "step": 24950 + }, + { + "epoch": 0.28, + "grad_norm": 5.6996002197265625, + "learning_rate": 0.00017309866641761798, + "loss": 3.311, + "step": 25000 + }, + { + "epoch": 0.28, + "grad_norm": 4.32814884185791, + "learning_rate": 0.00017297342853893604, + "loss": 3.5558, + "step": 25050 + }, + { + "epoch": 0.28, + "grad_norm": 5.538712501525879, + "learning_rate": 0.0001728479453495866, + "loss": 3.3261, + "step": 25100 + }, + { + "epoch": 0.28, + "grad_norm": 4.975490093231201, + "learning_rate": 0.00017272221727139946, + "loss": 3.5, + "step": 25150 + }, + { + "epoch": 0.28, + "grad_norm": 4.377697467803955, + "learning_rate": 0.00017259624472702764, + "loss": 3.4562, + "step": 25200 + }, + { + "epoch": 0.28, + "grad_norm": 6.686251163482666, + "learning_rate": 0.00017247002813994592, + "loss": 3.3968, + "step": 25250 + }, + { + "epoch": 0.28, + "grad_norm": 7.116434097290039, + "learning_rate": 0.00017234356793444954, + "loss": 3.3161, + "step": 25300 + }, + { + "epoch": 0.28, + "grad_norm": 5.755252838134766, + "learning_rate": 0.0001722168645356526, + "loss": 3.4195, + "step": 25350 + }, + { + "epoch": 0.28, + "grad_norm": 6.647252559661865, + "learning_rate": 0.00017208991836948685, + "loss": 3.1887, + "step": 25400 + }, + { + "epoch": 0.28, + "grad_norm": 4.997719764709473, + "learning_rate": 0.0001719627298627, + "loss": 3.4098, + "step": 25450 + }, + { + "epoch": 0.28, + "grad_norm": 6.054971218109131, + "learning_rate": 0.00017183529944285456, + "loss": 3.4159, + "step": 25500 + }, + { + "epoch": 0.28, + "grad_norm": 4.706241130828857, + "learning_rate": 0.00017170762753832615, + "loss": 3.4024, + "step": 25550 + }, + { + "epoch": 0.28, + "grad_norm": 7.701054096221924, + "learning_rate": 0.00017157971457830226, + "loss": 3.3564, + "step": 25600 + }, + { + "epoch": 0.28, + "grad_norm": 5.441225528717041, + "learning_rate": 0.00017145156099278067, + "loss": 3.5887, + "step": 25650 + }, + { + "epoch": 0.28, + "grad_norm": 5.631026268005371, + "learning_rate": 0.0001713231672125681, + "loss": 3.352, + "step": 25700 + }, + { + "epoch": 0.29, + "grad_norm": 4.974308967590332, + "learning_rate": 0.0001711945336692786, + "loss": 3.3959, + "step": 25750 + }, + { + "epoch": 0.29, + "grad_norm": 7.063317775726318, + "learning_rate": 0.00017106566079533246, + "loss": 3.3942, + "step": 25800 + }, + { + "epoch": 0.29, + "grad_norm": 6.312389850616455, + "learning_rate": 0.0001709365490239543, + "loss": 3.3928, + "step": 25850 + }, + { + "epoch": 0.29, + "grad_norm": 5.022332668304443, + "learning_rate": 0.00017080719878917182, + "loss": 3.4401, + "step": 25900 + }, + { + "epoch": 0.29, + "grad_norm": 4.356366157531738, + "learning_rate": 0.00017067761052581455, + "loss": 3.4353, + "step": 25950 + }, + { + "epoch": 0.29, + "grad_norm": 5.611413478851318, + "learning_rate": 0.00017054778466951196, + "loss": 3.2737, + "step": 26000 + }, + { + "epoch": 0.29, + "grad_norm": 7.244396686553955, + "learning_rate": 0.0001704177216566924, + "loss": 3.2309, + "step": 26050 + }, + { + "epoch": 0.29, + "grad_norm": 6.024662017822266, + "learning_rate": 0.00017028742192458132, + "loss": 3.3593, + "step": 26100 + }, + { + "epoch": 0.29, + "grad_norm": 7.399158954620361, + "learning_rate": 0.00017015688591120006, + "loss": 3.2026, + "step": 26150 + }, + { + "epoch": 0.29, + "grad_norm": 7.96980619430542, + "learning_rate": 0.00017002611405536413, + "loss": 3.4413, + "step": 26200 + }, + { + "epoch": 0.29, + "grad_norm": 5.538659572601318, + "learning_rate": 0.00016989510679668194, + "loss": 3.3497, + "step": 26250 + }, + { + "epoch": 0.29, + "grad_norm": 5.8960394859313965, + "learning_rate": 0.00016976386457555323, + "loss": 3.3708, + "step": 26300 + }, + { + "epoch": 0.29, + "grad_norm": 5.486491680145264, + "learning_rate": 0.00016963238783316754, + "loss": 3.4697, + "step": 26350 + }, + { + "epoch": 0.29, + "grad_norm": 5.217641353607178, + "learning_rate": 0.0001695006770115029, + "loss": 3.4249, + "step": 26400 + }, + { + "epoch": 0.29, + "grad_norm": 5.6906938552856445, + "learning_rate": 0.00016936873255332413, + "loss": 3.5343, + "step": 26450 + }, + { + "epoch": 0.29, + "grad_norm": 6.275619983673096, + "learning_rate": 0.00016923655490218149, + "loss": 3.3991, + "step": 26500 + }, + { + "epoch": 0.29, + "grad_norm": 5.75913667678833, + "learning_rate": 0.00016910414450240917, + "loss": 3.3861, + "step": 26550 + }, + { + "epoch": 0.29, + "grad_norm": 5.206583499908447, + "learning_rate": 0.0001689715017991237, + "loss": 3.369, + "step": 26600 + }, + { + "epoch": 0.3, + "grad_norm": 5.569302082061768, + "learning_rate": 0.0001688386272382227, + "loss": 3.4837, + "step": 26650 + }, + { + "epoch": 0.3, + "grad_norm": 5.360637187957764, + "learning_rate": 0.00016870552126638298, + "loss": 3.3299, + "step": 26700 + }, + { + "epoch": 0.3, + "grad_norm": 5.168808937072754, + "learning_rate": 0.00016857218433105945, + "loss": 3.3613, + "step": 26750 + }, + { + "epoch": 0.3, + "grad_norm": 5.515918731689453, + "learning_rate": 0.0001684386168804834, + "loss": 3.159, + "step": 26800 + }, + { + "epoch": 0.3, + "grad_norm": 5.885009765625, + "learning_rate": 0.000168304819363661, + "loss": 3.3029, + "step": 26850 + }, + { + "epoch": 0.3, + "grad_norm": 5.174962997436523, + "learning_rate": 0.0001681707922303718, + "loss": 3.3289, + "step": 26900 + }, + { + "epoch": 0.3, + "grad_norm": 4.86044454574585, + "learning_rate": 0.0001680365359311673, + "loss": 3.3132, + "step": 26950 + }, + { + "epoch": 0.3, + "grad_norm": 6.267008304595947, + "learning_rate": 0.00016790205091736935, + "loss": 3.3649, + "step": 27000 + }, + { + "epoch": 0.3, + "grad_norm": 6.221423625946045, + "learning_rate": 0.00016776733764106862, + "loss": 3.3311, + "step": 27050 + }, + { + "epoch": 0.3, + "grad_norm": 5.069894790649414, + "learning_rate": 0.00016763239655512318, + "loss": 3.3157, + "step": 27100 + }, + { + "epoch": 0.3, + "grad_norm": 8.425812721252441, + "learning_rate": 0.00016749722811315688, + "loss": 3.2714, + "step": 27150 + }, + { + "epoch": 0.3, + "grad_norm": 5.80504846572876, + "learning_rate": 0.00016736183276955783, + "loss": 3.3274, + "step": 27200 + }, + { + "epoch": 0.3, + "grad_norm": 7.617208003997803, + "learning_rate": 0.00016722621097947697, + "loss": 3.1857, + "step": 27250 + }, + { + "epoch": 0.3, + "grad_norm": 5.363246440887451, + "learning_rate": 0.00016709036319882646, + "loss": 3.4673, + "step": 27300 + }, + { + "epoch": 0.3, + "grad_norm": 6.790156364440918, + "learning_rate": 0.00016695428988427807, + "loss": 3.3016, + "step": 27350 + }, + { + "epoch": 0.3, + "grad_norm": 4.9824981689453125, + "learning_rate": 0.00016681799149326185, + "loss": 3.4103, + "step": 27400 + }, + { + "epoch": 0.3, + "grad_norm": 4.915642738342285, + "learning_rate": 0.00016668146848396442, + "loss": 3.4356, + "step": 27450 + }, + { + "epoch": 0.3, + "grad_norm": 4.938210487365723, + "learning_rate": 0.0001665447213153275, + "loss": 3.299, + "step": 27500 + }, + { + "epoch": 0.31, + "grad_norm": 5.116371154785156, + "learning_rate": 0.00016640775044704634, + "loss": 3.3231, + "step": 27550 + }, + { + "epoch": 0.31, + "grad_norm": 6.796716213226318, + "learning_rate": 0.0001662705563395682, + "loss": 3.3685, + "step": 27600 + }, + { + "epoch": 0.31, + "grad_norm": 9.196764945983887, + "learning_rate": 0.0001661331394540908, + "loss": 3.2807, + "step": 27650 + }, + { + "epoch": 0.31, + "grad_norm": 4.64096736907959, + "learning_rate": 0.00016599550025256076, + "loss": 3.2909, + "step": 27700 + }, + { + "epoch": 0.31, + "grad_norm": 5.615271091461182, + "learning_rate": 0.000165857639197672, + "loss": 3.2044, + "step": 27750 + }, + { + "epoch": 0.31, + "grad_norm": 6.112679481506348, + "learning_rate": 0.0001657195567528643, + "loss": 3.2377, + "step": 27800 + }, + { + "epoch": 0.31, + "grad_norm": 4.882411003112793, + "learning_rate": 0.0001655812533823216, + "loss": 3.4462, + "step": 27850 + }, + { + "epoch": 0.31, + "grad_norm": 4.716900825500488, + "learning_rate": 0.00016544272955097063, + "loss": 3.3563, + "step": 27900 + }, + { + "epoch": 0.31, + "grad_norm": 5.415688991546631, + "learning_rate": 0.0001653039857244791, + "loss": 3.2475, + "step": 27950 + }, + { + "epoch": 0.31, + "grad_norm": 5.752101421356201, + "learning_rate": 0.00016516502236925434, + "loss": 3.3646, + "step": 28000 + }, + { + "epoch": 0.31, + "grad_norm": 7.952321529388428, + "learning_rate": 0.00016502583995244163, + "loss": 3.2835, + "step": 28050 + }, + { + "epoch": 0.31, + "grad_norm": 5.233190536499023, + "learning_rate": 0.00016488643894192268, + "loss": 3.3653, + "step": 28100 + }, + { + "epoch": 0.31, + "grad_norm": 5.358859062194824, + "learning_rate": 0.00016474681980631402, + "loss": 3.2425, + "step": 28150 + }, + { + "epoch": 0.31, + "grad_norm": 5.584342002868652, + "learning_rate": 0.0001646069830149654, + "loss": 3.3139, + "step": 28200 + }, + { + "epoch": 0.31, + "grad_norm": 6.155908584594727, + "learning_rate": 0.00016446692903795837, + "loss": 3.2732, + "step": 28250 + }, + { + "epoch": 0.31, + "grad_norm": 6.454020023345947, + "learning_rate": 0.00016432665834610445, + "loss": 3.2121, + "step": 28300 + }, + { + "epoch": 0.31, + "grad_norm": 5.337937831878662, + "learning_rate": 0.00016418617141094374, + "loss": 3.3123, + "step": 28350 + }, + { + "epoch": 0.31, + "grad_norm": 4.727194786071777, + "learning_rate": 0.00016404546870474324, + "loss": 3.2558, + "step": 28400 + }, + { + "epoch": 0.32, + "grad_norm": 5.887204647064209, + "learning_rate": 0.00016390455070049536, + "loss": 3.377, + "step": 28450 + }, + { + "epoch": 0.32, + "grad_norm": 5.2247724533081055, + "learning_rate": 0.0001637634178719162, + "loss": 3.1277, + "step": 28500 + }, + { + "epoch": 0.32, + "grad_norm": 5.014094829559326, + "learning_rate": 0.00016362207069344403, + "loss": 3.142, + "step": 28550 + }, + { + "epoch": 0.32, + "grad_norm": 7.58336067199707, + "learning_rate": 0.00016348050964023773, + "loss": 3.3156, + "step": 28600 + }, + { + "epoch": 0.32, + "grad_norm": 4.806646823883057, + "learning_rate": 0.00016333873518817514, + "loss": 3.2814, + "step": 28650 + }, + { + "epoch": 0.32, + "grad_norm": 5.387387275695801, + "learning_rate": 0.00016319674781385143, + "loss": 3.3977, + "step": 28700 + }, + { + "epoch": 0.32, + "grad_norm": 5.4912638664245605, + "learning_rate": 0.00016305454799457755, + "loss": 3.2323, + "step": 28750 + }, + { + "epoch": 0.32, + "grad_norm": 4.692640781402588, + "learning_rate": 0.00016291213620837867, + "loss": 3.3033, + "step": 28800 + }, + { + "epoch": 0.32, + "grad_norm": 5.484092712402344, + "learning_rate": 0.0001627695129339924, + "loss": 3.1466, + "step": 28850 + }, + { + "epoch": 0.32, + "grad_norm": 5.305532932281494, + "learning_rate": 0.00016262667865086746, + "loss": 3.4111, + "step": 28900 + }, + { + "epoch": 0.32, + "grad_norm": 5.511746883392334, + "learning_rate": 0.00016248363383916182, + "loss": 3.2535, + "step": 28950 + }, + { + "epoch": 0.32, + "grad_norm": 7.3723649978637695, + "learning_rate": 0.00016234037897974108, + "loss": 3.3265, + "step": 29000 + }, + { + "epoch": 0.32, + "grad_norm": 5.085361003875732, + "learning_rate": 0.0001621997858933184, + "loss": 3.3003, + "step": 29050 + }, + { + "epoch": 0.32, + "grad_norm": 4.863938331604004, + "learning_rate": 0.000162056116560834, + "loss": 3.2411, + "step": 29100 + }, + { + "epoch": 0.32, + "grad_norm": 5.351004123687744, + "learning_rate": 0.00016191223861779529, + "loss": 3.2409, + "step": 29150 + }, + { + "epoch": 0.32, + "grad_norm": 8.316920280456543, + "learning_rate": 0.0001617681525478687, + "loss": 3.0881, + "step": 29200 + }, + { + "epoch": 0.32, + "grad_norm": 7.093586444854736, + "learning_rate": 0.0001616238588354203, + "loss": 3.2573, + "step": 29250 + }, + { + "epoch": 0.32, + "grad_norm": 6.5853705406188965, + "learning_rate": 0.00016147935796551405, + "loss": 3.3215, + "step": 29300 + }, + { + "epoch": 0.33, + "grad_norm": 4.667483329772949, + "learning_rate": 0.00016133465042391046, + "loss": 3.3032, + "step": 29350 + }, + { + "epoch": 0.33, + "grad_norm": 5.287956237792969, + "learning_rate": 0.00016118973669706468, + "loss": 3.2255, + "step": 29400 + }, + { + "epoch": 0.33, + "grad_norm": 5.676483631134033, + "learning_rate": 0.0001610446172721251, + "loss": 3.3663, + "step": 29450 + }, + { + "epoch": 0.33, + "grad_norm": 6.739033222198486, + "learning_rate": 0.00016089929263693144, + "loss": 3.2537, + "step": 29500 + }, + { + "epoch": 0.33, + "grad_norm": 5.158905982971191, + "learning_rate": 0.00016075376328001344, + "loss": 3.2336, + "step": 29550 + }, + { + "epoch": 0.33, + "grad_norm": 5.2512526512146, + "learning_rate": 0.00016060802969058885, + "loss": 3.2982, + "step": 29600 + }, + { + "epoch": 0.33, + "grad_norm": 8.520125389099121, + "learning_rate": 0.00016046209235856212, + "loss": 3.3153, + "step": 29650 + }, + { + "epoch": 0.33, + "grad_norm": 6.461225509643555, + "learning_rate": 0.00016031595177452257, + "loss": 3.2629, + "step": 29700 + }, + { + "epoch": 0.33, + "grad_norm": 6.117869853973389, + "learning_rate": 0.00016016960842974278, + "loss": 3.2225, + "step": 29750 + }, + { + "epoch": 0.33, + "grad_norm": 4.035212993621826, + "learning_rate": 0.00016002306281617692, + "loss": 3.3866, + "step": 29800 + }, + { + "epoch": 0.33, + "grad_norm": 5.271117687225342, + "learning_rate": 0.00015987631542645913, + "loss": 3.2602, + "step": 29850 + }, + { + "epoch": 0.33, + "grad_norm": 4.860154151916504, + "learning_rate": 0.00015972936675390185, + "loss": 3.288, + "step": 29900 + }, + { + "epoch": 0.33, + "grad_norm": 5.154600143432617, + "learning_rate": 0.0001595822172924942, + "loss": 3.1941, + "step": 29950 + }, + { + "epoch": 0.33, + "grad_norm": 6.897374153137207, + "learning_rate": 0.00015943486753690017, + "loss": 3.2323, + "step": 30000 + }, + { + "epoch": 0.33, + "grad_norm": 6.063130855560303, + "learning_rate": 0.00015928731798245721, + "loss": 3.1718, + "step": 30050 + }, + { + "epoch": 0.33, + "grad_norm": 5.736262321472168, + "learning_rate": 0.00015913956912517432, + "loss": 3.3035, + "step": 30100 + }, + { + "epoch": 0.33, + "grad_norm": 5.386317253112793, + "learning_rate": 0.00015899162146173053, + "loss": 3.2879, + "step": 30150 + }, + { + "epoch": 0.33, + "grad_norm": 5.535543441772461, + "learning_rate": 0.00015884347548947314, + "loss": 3.2266, + "step": 30200 + }, + { + "epoch": 0.34, + "grad_norm": 6.057496070861816, + "learning_rate": 0.00015869513170641616, + "loss": 3.1668, + "step": 30250 + }, + { + "epoch": 0.34, + "grad_norm": 5.6912055015563965, + "learning_rate": 0.00015854659061123854, + "loss": 3.1562, + "step": 30300 + }, + { + "epoch": 0.34, + "grad_norm": 5.563050270080566, + "learning_rate": 0.0001583978527032825, + "loss": 3.1819, + "step": 30350 + }, + { + "epoch": 0.34, + "grad_norm": 6.75504732131958, + "learning_rate": 0.0001582489184825519, + "loss": 3.1891, + "step": 30400 + }, + { + "epoch": 0.34, + "grad_norm": 5.390425205230713, + "learning_rate": 0.00015809978844971053, + "loss": 3.1856, + "step": 30450 + }, + { + "epoch": 0.34, + "grad_norm": 6.183398246765137, + "learning_rate": 0.0001579504631060804, + "loss": 3.3115, + "step": 30500 + }, + { + "epoch": 0.34, + "grad_norm": 5.40380859375, + "learning_rate": 0.00015780094295364015, + "loss": 3.162, + "step": 30550 + }, + { + "epoch": 0.34, + "grad_norm": 5.078804016113281, + "learning_rate": 0.00015765122849502325, + "loss": 3.2046, + "step": 30600 + }, + { + "epoch": 0.34, + "grad_norm": 6.183681964874268, + "learning_rate": 0.00015750132023351638, + "loss": 3.0689, + "step": 30650 + }, + { + "epoch": 0.34, + "grad_norm": 12.730826377868652, + "learning_rate": 0.00015735121867305768, + "loss": 3.2468, + "step": 30700 + }, + { + "epoch": 0.34, + "grad_norm": 6.127053260803223, + "learning_rate": 0.00015720092431823515, + "loss": 3.1628, + "step": 30750 + }, + { + "epoch": 0.34, + "grad_norm": 5.8310089111328125, + "learning_rate": 0.00015705043767428483, + "loss": 3.2047, + "step": 30800 + }, + { + "epoch": 0.34, + "grad_norm": 7.505776882171631, + "learning_rate": 0.0001568997592470892, + "loss": 3.2827, + "step": 30850 + }, + { + "epoch": 0.34, + "grad_norm": 5.399072170257568, + "learning_rate": 0.00015674888954317549, + "loss": 3.1483, + "step": 30900 + }, + { + "epoch": 0.34, + "grad_norm": 5.224669456481934, + "learning_rate": 0.00015659782906971383, + "loss": 3.2698, + "step": 30950 + }, + { + "epoch": 0.34, + "grad_norm": 7.958742618560791, + "learning_rate": 0.00015644657833451577, + "loss": 3.0145, + "step": 31000 + }, + { + "epoch": 0.34, + "grad_norm": 6.20373010635376, + "learning_rate": 0.0001562981685120925, + "loss": 3.1598, + "step": 31050 + }, + { + "epoch": 0.34, + "grad_norm": 5.2080159187316895, + "learning_rate": 0.00015614654255930347, + "loss": 3.2801, + "step": 31100 + }, + { + "epoch": 0.35, + "grad_norm": 4.195250988006592, + "learning_rate": 0.00015599472786184245, + "loss": 3.156, + "step": 31150 + }, + { + "epoch": 0.35, + "grad_norm": 5.0389204025268555, + "learning_rate": 0.00015584272493005642, + "loss": 3.1345, + "step": 31200 + }, + { + "epoch": 0.35, + "grad_norm": 7.003210544586182, + "learning_rate": 0.00015569053427492505, + "loss": 3.2186, + "step": 31250 + }, + { + "epoch": 0.35, + "grad_norm": 5.47674036026001, + "learning_rate": 0.00015553815640805907, + "loss": 3.3211, + "step": 31300 + }, + { + "epoch": 0.35, + "grad_norm": 5.2981648445129395, + "learning_rate": 0.00015538559184169863, + "loss": 3.2454, + "step": 31350 + }, + { + "epoch": 0.35, + "grad_norm": 5.528575420379639, + "learning_rate": 0.00015523284108871142, + "loss": 3.1963, + "step": 31400 + }, + { + "epoch": 0.35, + "grad_norm": 5.553009033203125, + "learning_rate": 0.0001550799046625911, + "loss": 3.1682, + "step": 31450 + }, + { + "epoch": 0.35, + "grad_norm": 5.608404636383057, + "learning_rate": 0.0001549267830774553, + "loss": 3.1461, + "step": 31500 + }, + { + "epoch": 0.35, + "grad_norm": 6.462625503540039, + "learning_rate": 0.00015477347684804445, + "loss": 3.2772, + "step": 31550 + }, + { + "epoch": 0.35, + "grad_norm": 5.327962875366211, + "learning_rate": 0.00015461998648971928, + "loss": 3.2144, + "step": 31600 + }, + { + "epoch": 0.35, + "grad_norm": 6.94124174118042, + "learning_rate": 0.00015446631251845978, + "loss": 3.2227, + "step": 31650 + }, + { + "epoch": 0.35, + "grad_norm": 5.351782321929932, + "learning_rate": 0.00015431245545086307, + "loss": 3.2687, + "step": 31700 + }, + { + "epoch": 0.35, + "grad_norm": 4.562844276428223, + "learning_rate": 0.00015415841580414185, + "loss": 3.1332, + "step": 31750 + }, + { + "epoch": 0.35, + "grad_norm": 5.023700714111328, + "learning_rate": 0.00015400419409612243, + "loss": 3.2272, + "step": 31800 + }, + { + "epoch": 0.35, + "grad_norm": 5.127398490905762, + "learning_rate": 0.0001538497908452433, + "loss": 3.2843, + "step": 31850 + }, + { + "epoch": 0.35, + "grad_norm": 5.577905178070068, + "learning_rate": 0.0001536952065705532, + "loss": 3.2635, + "step": 31900 + }, + { + "epoch": 0.35, + "grad_norm": 6.119299411773682, + "learning_rate": 0.00015354044179170933, + "loss": 3.126, + "step": 31950 + }, + { + "epoch": 0.35, + "grad_norm": 4.827983856201172, + "learning_rate": 0.0001533854970289758, + "loss": 3.2345, + "step": 32000 + }, + { + "epoch": 0.36, + "grad_norm": 4.499656677246094, + "learning_rate": 0.00015323037280322166, + "loss": 3.0808, + "step": 32050 + }, + { + "epoch": 0.36, + "grad_norm": 5.260239601135254, + "learning_rate": 0.00015307506963591923, + "loss": 3.1234, + "step": 32100 + }, + { + "epoch": 0.36, + "grad_norm": 5.486075401306152, + "learning_rate": 0.00015291958804914256, + "loss": 3.1769, + "step": 32150 + }, + { + "epoch": 0.36, + "grad_norm": 4.572110176086426, + "learning_rate": 0.00015276392856556527, + "loss": 3.2166, + "step": 32200 + }, + { + "epoch": 0.36, + "grad_norm": 5.288125991821289, + "learning_rate": 0.0001526080917084591, + "loss": 3.0781, + "step": 32250 + }, + { + "epoch": 0.36, + "grad_norm": 6.380829334259033, + "learning_rate": 0.000152452078001692, + "loss": 3.1178, + "step": 32300 + }, + { + "epoch": 0.36, + "grad_norm": 5.039462566375732, + "learning_rate": 0.00015229588796972652, + "loss": 3.2808, + "step": 32350 + }, + { + "epoch": 0.36, + "grad_norm": 7.323626518249512, + "learning_rate": 0.00015213952213761787, + "loss": 3.1391, + "step": 32400 + }, + { + "epoch": 0.36, + "grad_norm": 5.912395000457764, + "learning_rate": 0.00015198298103101228, + "loss": 3.1744, + "step": 32450 + }, + { + "epoch": 0.36, + "grad_norm": 5.694441795349121, + "learning_rate": 0.00015182626517614518, + "loss": 3.0576, + "step": 32500 + }, + { + "epoch": 0.36, + "grad_norm": 6.467188358306885, + "learning_rate": 0.00015166937509983943, + "loss": 3.2361, + "step": 32550 + }, + { + "epoch": 0.36, + "grad_norm": 5.292226791381836, + "learning_rate": 0.00015151231132950357, + "loss": 3.1376, + "step": 32600 + }, + { + "epoch": 0.36, + "grad_norm": 6.369929313659668, + "learning_rate": 0.00015135507439313005, + "loss": 3.1406, + "step": 32650 + }, + { + "epoch": 0.36, + "grad_norm": 5.252573013305664, + "learning_rate": 0.00015119766481929342, + "loss": 3.123, + "step": 32700 + }, + { + "epoch": 0.36, + "grad_norm": 6.9053730964660645, + "learning_rate": 0.00015104008313714858, + "loss": 3.0018, + "step": 32750 + }, + { + "epoch": 0.36, + "grad_norm": 6.080839157104492, + "learning_rate": 0.00015088232987642898, + "loss": 3.2106, + "step": 32800 + }, + { + "epoch": 0.36, + "grad_norm": 5.92653226852417, + "learning_rate": 0.00015072440556744492, + "loss": 3.2095, + "step": 32850 + }, + { + "epoch": 0.36, + "grad_norm": 5.627429008483887, + "learning_rate": 0.00015056631074108166, + "loss": 3.089, + "step": 32900 + }, + { + "epoch": 0.37, + "grad_norm": 5.694194793701172, + "learning_rate": 0.00015040804592879762, + "loss": 3.0885, + "step": 32950 + }, + { + "epoch": 0.37, + "grad_norm": 5.04107666015625, + "learning_rate": 0.00015024961166262276, + "loss": 3.0906, + "step": 33000 + }, + { + "epoch": 0.37, + "grad_norm": 5.280002117156982, + "learning_rate": 0.0001500910084751567, + "loss": 3.2142, + "step": 33050 + }, + { + "epoch": 0.37, + "grad_norm": 4.803068161010742, + "learning_rate": 0.00014993223689956672, + "loss": 3.2014, + "step": 33100 + }, + { + "epoch": 0.37, + "grad_norm": 5.611780643463135, + "learning_rate": 0.00014977329746958636, + "loss": 3.2491, + "step": 33150 + }, + { + "epoch": 0.37, + "grad_norm": 5.6669020652771, + "learning_rate": 0.00014961737449079314, + "loss": 3.2, + "step": 33200 + }, + { + "epoch": 0.37, + "grad_norm": 5.894138336181641, + "learning_rate": 0.00014945810428594703, + "loss": 3.0321, + "step": 33250 + }, + { + "epoch": 0.37, + "grad_norm": 6.354518413543701, + "learning_rate": 0.0001492986678205755, + "loss": 3.1314, + "step": 33300 + }, + { + "epoch": 0.37, + "grad_norm": 4.786489009857178, + "learning_rate": 0.00014913906563064706, + "loss": 3.1937, + "step": 33350 + }, + { + "epoch": 0.37, + "grad_norm": 8.759417533874512, + "learning_rate": 0.00014897929825268745, + "loss": 3.1069, + "step": 33400 + }, + { + "epoch": 0.37, + "grad_norm": 5.354910850524902, + "learning_rate": 0.00014881936622377766, + "loss": 3.1519, + "step": 33450 + }, + { + "epoch": 0.37, + "grad_norm": 5.169478416442871, + "learning_rate": 0.0001486592700815522, + "loss": 3.1414, + "step": 33500 + }, + { + "epoch": 0.37, + "grad_norm": 6.09418249130249, + "learning_rate": 0.00014849901036419723, + "loss": 3.0954, + "step": 33550 + }, + { + "epoch": 0.37, + "grad_norm": 4.986037731170654, + "learning_rate": 0.00014833858761044883, + "loss": 3.2445, + "step": 33600 + }, + { + "epoch": 0.37, + "grad_norm": 12.803654670715332, + "learning_rate": 0.00014817800235959118, + "loss": 3.0699, + "step": 33650 + }, + { + "epoch": 0.37, + "grad_norm": 6.191990852355957, + "learning_rate": 0.00014801725515145467, + "loss": 3.2574, + "step": 33700 + }, + { + "epoch": 0.37, + "grad_norm": 5.233778476715088, + "learning_rate": 0.00014785634652641412, + "loss": 3.1152, + "step": 33750 + }, + { + "epoch": 0.37, + "grad_norm": 6.065474987030029, + "learning_rate": 0.000147695277025387, + "loss": 3.1178, + "step": 33800 + }, + { + "epoch": 0.38, + "grad_norm": 5.4664435386657715, + "learning_rate": 0.00014753404718983158, + "loss": 3.0627, + "step": 33850 + }, + { + "epoch": 0.38, + "grad_norm": 4.926270484924316, + "learning_rate": 0.00014737265756174515, + "loss": 3.0182, + "step": 33900 + }, + { + "epoch": 0.38, + "grad_norm": 6.013931751251221, + "learning_rate": 0.0001472111086836621, + "loss": 3.0801, + "step": 33950 + }, + { + "epoch": 0.38, + "grad_norm": 6.25607442855835, + "learning_rate": 0.00014704940109865224, + "loss": 3.1227, + "step": 34000 + }, + { + "epoch": 0.38, + "grad_norm": 5.084470272064209, + "learning_rate": 0.00014688753535031882, + "loss": 3.1786, + "step": 34050 + }, + { + "epoch": 0.38, + "grad_norm": 5.16444730758667, + "learning_rate": 0.00014672551198279687, + "loss": 3.0651, + "step": 34100 + }, + { + "epoch": 0.38, + "grad_norm": 5.316379070281982, + "learning_rate": 0.00014656333154075118, + "loss": 2.9925, + "step": 34150 + }, + { + "epoch": 0.38, + "grad_norm": 5.331335067749023, + "learning_rate": 0.00014640099456937462, + "loss": 3.097, + "step": 34200 + }, + { + "epoch": 0.38, + "grad_norm": 4.452579498291016, + "learning_rate": 0.00014623850161438626, + "loss": 3.1222, + "step": 34250 + }, + { + "epoch": 0.38, + "grad_norm": 5.550851345062256, + "learning_rate": 0.00014607585322202953, + "loss": 3.2343, + "step": 34300 + }, + { + "epoch": 0.38, + "grad_norm": 6.5701375007629395, + "learning_rate": 0.00014591304993907033, + "loss": 3.1558, + "step": 34350 + }, + { + "epoch": 0.38, + "grad_norm": 5.140341758728027, + "learning_rate": 0.00014575009231279534, + "loss": 3.0036, + "step": 34400 + }, + { + "epoch": 0.38, + "grad_norm": 4.767757415771484, + "learning_rate": 0.00014558698089101003, + "loss": 3.1355, + "step": 34450 + }, + { + "epoch": 0.38, + "grad_norm": 6.189707279205322, + "learning_rate": 0.00014542371622203689, + "loss": 3.0721, + "step": 34500 + }, + { + "epoch": 0.38, + "grad_norm": 4.492117881774902, + "learning_rate": 0.00014526029885471355, + "loss": 3.1083, + "step": 34550 + }, + { + "epoch": 0.38, + "grad_norm": 5.021632671356201, + "learning_rate": 0.000145096729338391, + "loss": 3.1291, + "step": 34600 + }, + { + "epoch": 0.38, + "grad_norm": 5.501243591308594, + "learning_rate": 0.00014493300822293164, + "loss": 3.0654, + "step": 34650 + }, + { + "epoch": 0.38, + "grad_norm": 5.056375026702881, + "learning_rate": 0.0001447691360587076, + "loss": 3.054, + "step": 34700 + }, + { + "epoch": 0.39, + "grad_norm": 5.866014003753662, + "learning_rate": 0.0001446051133965986, + "loss": 3.1111, + "step": 34750 + }, + { + "epoch": 0.39, + "grad_norm": 5.377662658691406, + "learning_rate": 0.0001444409407879905, + "loss": 3.2083, + "step": 34800 + }, + { + "epoch": 0.39, + "grad_norm": 5.692511558532715, + "learning_rate": 0.00014427661878477305, + "loss": 3.1613, + "step": 34850 + }, + { + "epoch": 0.39, + "grad_norm": 5.763862609863281, + "learning_rate": 0.0001441121479393383, + "loss": 3.1422, + "step": 34900 + }, + { + "epoch": 0.39, + "grad_norm": 6.247575283050537, + "learning_rate": 0.00014394752880457867, + "loss": 3.1826, + "step": 34950 + }, + { + "epoch": 0.39, + "grad_norm": 6.18945837020874, + "learning_rate": 0.00014378276193388498, + "loss": 3.0887, + "step": 35000 + } + ], + "logging_steps": 50, + "max_steps": 90183, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 1.258566844416e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}