{ "best_metric": 0.35886332392692566, "best_model_checkpoint": "mikhail_panzo/zlm_b128_le5_s12000/checkpoint-8500", "epoch": 14.243769633507853, "eval_steps": 500, "global_step": 8500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08376963350785341, "grad_norm": 6.650441646575928, "learning_rate": 2.4500000000000004e-07, "loss": 1.1088, "step": 50 }, { "epoch": 0.16753926701570682, "grad_norm": 13.608625411987305, "learning_rate": 4.95e-07, "loss": 1.1077, "step": 100 }, { "epoch": 0.2513089005235602, "grad_norm": 2.7265217304229736, "learning_rate": 7.450000000000001e-07, "loss": 0.9759, "step": 150 }, { "epoch": 0.33507853403141363, "grad_norm": 5.083759307861328, "learning_rate": 9.950000000000002e-07, "loss": 0.8705, "step": 200 }, { "epoch": 0.418848167539267, "grad_norm": 1.8121678829193115, "learning_rate": 1.2450000000000002e-06, "loss": 0.8185, "step": 250 }, { "epoch": 0.5026178010471204, "grad_norm": 2.275899648666382, "learning_rate": 1.495e-06, "loss": 0.784, "step": 300 }, { "epoch": 0.5863874345549738, "grad_norm": 2.2861170768737793, "learning_rate": 1.745e-06, "loss": 0.7545, "step": 350 }, { "epoch": 0.6701570680628273, "grad_norm": 3.1991238594055176, "learning_rate": 1.9950000000000004e-06, "loss": 0.7227, "step": 400 }, { "epoch": 0.7539267015706806, "grad_norm": 1.843092679977417, "learning_rate": 2.245e-06, "loss": 0.6829, "step": 450 }, { "epoch": 0.837696335078534, "grad_norm": 1.4700403213500977, "learning_rate": 2.4950000000000003e-06, "loss": 0.6644, "step": 500 }, { "epoch": 0.837696335078534, "eval_loss": 0.5727962851524353, "eval_runtime": 293.5061, "eval_samples_per_second": 28.923, "eval_steps_per_second": 3.618, "step": 500 }, { "epoch": 0.9214659685863874, "grad_norm": 1.4362367391586304, "learning_rate": 2.7450000000000004e-06, "loss": 0.6487, "step": 550 }, { "epoch": 1.0052356020942408, "grad_norm": 1.8555901050567627, "learning_rate": 2.995e-06, "loss": 0.6301, "step": 600 }, { "epoch": 1.0890052356020943, "grad_norm": 1.9496757984161377, "learning_rate": 3.2450000000000003e-06, "loss": 0.6237, "step": 650 }, { "epoch": 1.1727748691099475, "grad_norm": 3.0705084800720215, "learning_rate": 3.495e-06, "loss": 0.6156, "step": 700 }, { "epoch": 1.256544502617801, "grad_norm": 2.2215688228607178, "learning_rate": 3.745e-06, "loss": 0.5945, "step": 750 }, { "epoch": 1.3403141361256545, "grad_norm": 2.021375894546509, "learning_rate": 3.995000000000001e-06, "loss": 0.5891, "step": 800 }, { "epoch": 1.4240837696335078, "grad_norm": 2.0354769229888916, "learning_rate": 4.245e-06, "loss": 0.5781, "step": 850 }, { "epoch": 1.5078534031413613, "grad_norm": 1.8674426078796387, "learning_rate": 4.495e-06, "loss": 0.5677, "step": 900 }, { "epoch": 1.5916230366492146, "grad_norm": 2.9264817237854004, "learning_rate": 4.745e-06, "loss": 0.5576, "step": 950 }, { "epoch": 1.675392670157068, "grad_norm": 1.9513416290283203, "learning_rate": 4.9950000000000005e-06, "loss": 0.5594, "step": 1000 }, { "epoch": 1.675392670157068, "eval_loss": 0.48248979449272156, "eval_runtime": 294.2627, "eval_samples_per_second": 28.848, "eval_steps_per_second": 3.609, "step": 1000 }, { "epoch": 1.7591623036649215, "grad_norm": 1.934941291809082, "learning_rate": 5.245e-06, "loss": 0.5571, "step": 1050 }, { "epoch": 1.8429319371727748, "grad_norm": 1.7669196128845215, "learning_rate": 5.495000000000001e-06, "loss": 0.5477, "step": 1100 }, { "epoch": 1.9267015706806283, "grad_norm": 3.954806089401245, "learning_rate": 5.745000000000001e-06, "loss": 0.5294, "step": 1150 }, { "epoch": 2.0104712041884816, "grad_norm": 2.8323569297790527, "learning_rate": 5.995000000000001e-06, "loss": 0.5261, "step": 1200 }, { "epoch": 2.094240837696335, "grad_norm": 2.193530797958374, "learning_rate": 6.245000000000001e-06, "loss": 0.5262, "step": 1250 }, { "epoch": 2.1780104712041886, "grad_norm": 1.5814998149871826, "learning_rate": 6.4950000000000005e-06, "loss": 0.5204, "step": 1300 }, { "epoch": 2.261780104712042, "grad_norm": 5.6610918045043945, "learning_rate": 6.745000000000001e-06, "loss": 0.5157, "step": 1350 }, { "epoch": 2.345549738219895, "grad_norm": 9.156899452209473, "learning_rate": 6.995000000000001e-06, "loss": 0.5178, "step": 1400 }, { "epoch": 2.4293193717277486, "grad_norm": 2.3170149326324463, "learning_rate": 7.245000000000001e-06, "loss": 0.5163, "step": 1450 }, { "epoch": 2.513089005235602, "grad_norm": 2.1565325260162354, "learning_rate": 7.495000000000001e-06, "loss": 0.5042, "step": 1500 }, { "epoch": 2.513089005235602, "eval_loss": 0.4464746415615082, "eval_runtime": 293.8295, "eval_samples_per_second": 28.891, "eval_steps_per_second": 3.614, "step": 1500 }, { "epoch": 2.5997905759162303, "grad_norm": 2.4758172035217285, "learning_rate": 7.745e-06, "loss": 0.5031, "step": 1550 }, { "epoch": 2.683560209424084, "grad_norm": 2.1877381801605225, "learning_rate": 7.990000000000001e-06, "loss": 0.4994, "step": 1600 }, { "epoch": 2.7673298429319373, "grad_norm": 2.4486210346221924, "learning_rate": 8.24e-06, "loss": 0.4965, "step": 1650 }, { "epoch": 2.8510994764397903, "grad_norm": 2.596200704574585, "learning_rate": 8.49e-06, "loss": 0.4988, "step": 1700 }, { "epoch": 2.934869109947644, "grad_norm": 1.7787096500396729, "learning_rate": 8.740000000000001e-06, "loss": 0.4947, "step": 1750 }, { "epoch": 3.0186387434554973, "grad_norm": 2.09403133392334, "learning_rate": 8.99e-06, "loss": 0.4845, "step": 1800 }, { "epoch": 3.102408376963351, "grad_norm": 2.0056636333465576, "learning_rate": 9.240000000000001e-06, "loss": 0.4833, "step": 1850 }, { "epoch": 3.1861780104712043, "grad_norm": 3.200199842453003, "learning_rate": 9.49e-06, "loss": 0.484, "step": 1900 }, { "epoch": 3.269947643979058, "grad_norm": 2.5462379455566406, "learning_rate": 9.74e-06, "loss": 0.4726, "step": 1950 }, { "epoch": 3.353717277486911, "grad_norm": 1.3994622230529785, "learning_rate": 9.990000000000001e-06, "loss": 0.4795, "step": 2000 }, { "epoch": 3.353717277486911, "eval_loss": 0.42617055773735046, "eval_runtime": 269.3627, "eval_samples_per_second": 31.515, "eval_steps_per_second": 3.943, "step": 2000 }, { "epoch": 3.4374869109947643, "grad_norm": 1.8979076147079468, "learning_rate": 9.952e-06, "loss": 0.4792, "step": 2050 }, { "epoch": 3.521256544502618, "grad_norm": 1.4493324756622314, "learning_rate": 9.902000000000001e-06, "loss": 0.4751, "step": 2100 }, { "epoch": 3.6050261780104713, "grad_norm": 3.3374929428100586, "learning_rate": 9.852e-06, "loss": 0.4736, "step": 2150 }, { "epoch": 3.6887958115183244, "grad_norm": 3.5062992572784424, "learning_rate": 9.802e-06, "loss": 0.4675, "step": 2200 }, { "epoch": 3.772565445026178, "grad_norm": 2.250505208969116, "learning_rate": 9.752e-06, "loss": 0.4611, "step": 2250 }, { "epoch": 3.8563350785340313, "grad_norm": 1.793270468711853, "learning_rate": 9.702e-06, "loss": 0.4605, "step": 2300 }, { "epoch": 3.940104712041885, "grad_norm": 1.663677453994751, "learning_rate": 9.652e-06, "loss": 0.4693, "step": 2350 }, { "epoch": 4.023874345549738, "grad_norm": 2.1321282386779785, "learning_rate": 9.602e-06, "loss": 0.4587, "step": 2400 }, { "epoch": 4.107643979057592, "grad_norm": 1.7361410856246948, "learning_rate": 9.552000000000001e-06, "loss": 0.4611, "step": 2450 }, { "epoch": 4.191413612565445, "grad_norm": 2.167386770248413, "learning_rate": 9.502000000000002e-06, "loss": 0.455, "step": 2500 }, { "epoch": 4.191413612565445, "eval_loss": 0.40905508399009705, "eval_runtime": 269.8231, "eval_samples_per_second": 31.461, "eval_steps_per_second": 3.936, "step": 2500 }, { "epoch": 4.275183246073299, "grad_norm": 1.430746078491211, "learning_rate": 9.452000000000002e-06, "loss": 0.4581, "step": 2550 }, { "epoch": 4.358952879581151, "grad_norm": 2.1168527603149414, "learning_rate": 9.402e-06, "loss": 0.4516, "step": 2600 }, { "epoch": 4.442722513089005, "grad_norm": 2.1330721378326416, "learning_rate": 9.353000000000002e-06, "loss": 0.4505, "step": 2650 }, { "epoch": 4.526492146596858, "grad_norm": 1.274557113647461, "learning_rate": 9.303e-06, "loss": 0.4575, "step": 2700 }, { "epoch": 4.610261780104712, "grad_norm": 1.835204839706421, "learning_rate": 9.253000000000001e-06, "loss": 0.4491, "step": 2750 }, { "epoch": 4.694031413612565, "grad_norm": 2.0255746841430664, "learning_rate": 9.203000000000002e-06, "loss": 0.4484, "step": 2800 }, { "epoch": 4.777801047120419, "grad_norm": 2.4793522357940674, "learning_rate": 9.153e-06, "loss": 0.4489, "step": 2850 }, { "epoch": 4.861570680628272, "grad_norm": 1.7192201614379883, "learning_rate": 9.103e-06, "loss": 0.4502, "step": 2900 }, { "epoch": 4.945340314136126, "grad_norm": 1.9378846883773804, "learning_rate": 9.053000000000001e-06, "loss": 0.4483, "step": 2950 }, { "epoch": 5.029109947643979, "grad_norm": 2.4576103687286377, "learning_rate": 9.003e-06, "loss": 0.4474, "step": 3000 }, { "epoch": 5.029109947643979, "eval_loss": 0.39903518557548523, "eval_runtime": 269.2955, "eval_samples_per_second": 31.523, "eval_steps_per_second": 3.944, "step": 3000 }, { "epoch": 5.112879581151833, "grad_norm": 1.4718371629714966, "learning_rate": 8.953e-06, "loss": 0.4428, "step": 3050 }, { "epoch": 5.196649214659685, "grad_norm": 2.4222004413604736, "learning_rate": 8.903000000000001e-06, "loss": 0.4417, "step": 3100 }, { "epoch": 5.280418848167539, "grad_norm": 1.5284855365753174, "learning_rate": 8.853e-06, "loss": 0.4395, "step": 3150 }, { "epoch": 5.364188481675392, "grad_norm": 1.7470890283584595, "learning_rate": 8.803e-06, "loss": 0.4359, "step": 3200 }, { "epoch": 5.447958115183246, "grad_norm": 2.1919684410095215, "learning_rate": 8.753e-06, "loss": 0.4389, "step": 3250 }, { "epoch": 5.531727748691099, "grad_norm": 1.6179628372192383, "learning_rate": 8.703e-06, "loss": 0.4423, "step": 3300 }, { "epoch": 5.615497382198953, "grad_norm": 1.420087218284607, "learning_rate": 8.653e-06, "loss": 0.4324, "step": 3350 }, { "epoch": 5.699267015706806, "grad_norm": 1.6972101926803589, "learning_rate": 8.603e-06, "loss": 0.4327, "step": 3400 }, { "epoch": 5.78303664921466, "grad_norm": 8.797286033630371, "learning_rate": 8.553000000000001e-06, "loss": 0.4347, "step": 3450 }, { "epoch": 5.866806282722513, "grad_norm": 1.5441597700119019, "learning_rate": 8.503e-06, "loss": 0.4342, "step": 3500 }, { "epoch": 5.866806282722513, "eval_loss": 0.39290204644203186, "eval_runtime": 272.1215, "eval_samples_per_second": 31.196, "eval_steps_per_second": 3.903, "step": 3500 }, { "epoch": 5.950575916230367, "grad_norm": 1.957065224647522, "learning_rate": 8.453000000000002e-06, "loss": 0.4288, "step": 3550 }, { "epoch": 6.0343455497382195, "grad_norm": 1.3891559839248657, "learning_rate": 8.403e-06, "loss": 0.429, "step": 3600 }, { "epoch": 6.118115183246073, "grad_norm": 1.6092897653579712, "learning_rate": 8.353000000000001e-06, "loss": 0.431, "step": 3650 }, { "epoch": 6.201884816753926, "grad_norm": 1.5566798448562622, "learning_rate": 8.303000000000002e-06, "loss": 0.428, "step": 3700 }, { "epoch": 6.28565445026178, "grad_norm": 2.70001220703125, "learning_rate": 8.253e-06, "loss": 0.433, "step": 3750 }, { "epoch": 6.369424083769633, "grad_norm": 2.6352884769439697, "learning_rate": 8.203000000000001e-06, "loss": 0.431, "step": 3800 }, { "epoch": 6.453193717277487, "grad_norm": 2.049609422683716, "learning_rate": 8.153000000000001e-06, "loss": 0.4279, "step": 3850 }, { "epoch": 6.53696335078534, "grad_norm": 1.422606110572815, "learning_rate": 8.103e-06, "loss": 0.4266, "step": 3900 }, { "epoch": 6.620732984293194, "grad_norm": 1.659340739250183, "learning_rate": 8.053e-06, "loss": 0.4293, "step": 3950 }, { "epoch": 6.704502617801047, "grad_norm": 1.6214194297790527, "learning_rate": 8.003000000000001e-06, "loss": 0.4258, "step": 4000 }, { "epoch": 6.704502617801047, "eval_loss": 0.385530561208725, "eval_runtime": 270.5177, "eval_samples_per_second": 31.381, "eval_steps_per_second": 3.926, "step": 4000 }, { "epoch": 6.788272251308901, "grad_norm": 2.0542335510253906, "learning_rate": 7.953e-06, "loss": 0.4232, "step": 4050 }, { "epoch": 6.872041884816754, "grad_norm": 2.0001602172851562, "learning_rate": 7.903e-06, "loss": 0.4201, "step": 4100 }, { "epoch": 6.955811518324607, "grad_norm": 1.5019524097442627, "learning_rate": 7.853000000000001e-06, "loss": 0.4202, "step": 4150 }, { "epoch": 7.0395811518324605, "grad_norm": 1.5901544094085693, "learning_rate": 7.803000000000001e-06, "loss": 0.4214, "step": 4200 }, { "epoch": 7.123350785340314, "grad_norm": 1.3859797716140747, "learning_rate": 7.753e-06, "loss": 0.4268, "step": 4250 }, { "epoch": 7.2071204188481675, "grad_norm": 1.6223664283752441, "learning_rate": 7.703e-06, "loss": 0.4163, "step": 4300 }, { "epoch": 7.290890052356021, "grad_norm": 1.9755197763442993, "learning_rate": 7.653000000000001e-06, "loss": 0.421, "step": 4350 }, { "epoch": 7.374659685863874, "grad_norm": 1.9882097244262695, "learning_rate": 7.603000000000001e-06, "loss": 0.4215, "step": 4400 }, { "epoch": 7.458429319371728, "grad_norm": 1.4274579286575317, "learning_rate": 7.553e-06, "loss": 0.424, "step": 4450 }, { "epoch": 7.542198952879581, "grad_norm": 1.5370174646377563, "learning_rate": 7.503e-06, "loss": 0.4175, "step": 4500 }, { "epoch": 7.542198952879581, "eval_loss": 0.3783535957336426, "eval_runtime": 264.9436, "eval_samples_per_second": 32.041, "eval_steps_per_second": 4.008, "step": 4500 }, { "epoch": 7.625968586387435, "grad_norm": 1.5206329822540283, "learning_rate": 7.4530000000000005e-06, "loss": 0.418, "step": 4550 }, { "epoch": 7.7097382198952875, "grad_norm": 1.5614436864852905, "learning_rate": 7.403e-06, "loss": 0.4206, "step": 4600 }, { "epoch": 7.793507853403141, "grad_norm": 1.9834551811218262, "learning_rate": 7.353e-06, "loss": 0.421, "step": 4650 }, { "epoch": 7.8772774869109945, "grad_norm": 2.0368449687957764, "learning_rate": 7.303e-06, "loss": 0.4206, "step": 4700 }, { "epoch": 7.961047120418848, "grad_norm": 1.4374061822891235, "learning_rate": 7.253e-06, "loss": 0.4127, "step": 4750 }, { "epoch": 8.044816753926701, "grad_norm": 1.9674043655395508, "learning_rate": 7.203000000000001e-06, "loss": 0.4111, "step": 4800 }, { "epoch": 8.128586387434556, "grad_norm": 1.4639081954956055, "learning_rate": 7.153000000000001e-06, "loss": 0.4153, "step": 4850 }, { "epoch": 8.212356020942408, "grad_norm": 1.189695954322815, "learning_rate": 7.103000000000001e-06, "loss": 0.4257, "step": 4900 }, { "epoch": 8.296125654450261, "grad_norm": 1.5426299571990967, "learning_rate": 7.053000000000001e-06, "loss": 0.4119, "step": 4950 }, { "epoch": 8.379895287958115, "grad_norm": 1.526410460472107, "learning_rate": 7.0030000000000005e-06, "loss": 0.4185, "step": 5000 }, { "epoch": 8.379895287958115, "eval_loss": 0.3761734962463379, "eval_runtime": 274.1558, "eval_samples_per_second": 30.964, "eval_steps_per_second": 3.874, "step": 5000 }, { "epoch": 8.463664921465968, "grad_norm": 1.3419328927993774, "learning_rate": 6.953000000000001e-06, "loss": 0.412, "step": 5050 }, { "epoch": 8.547434554973822, "grad_norm": 2.1306657791137695, "learning_rate": 6.903000000000001e-06, "loss": 0.4108, "step": 5100 }, { "epoch": 8.631204188481675, "grad_norm": 2.0914053916931152, "learning_rate": 6.853000000000001e-06, "loss": 0.4177, "step": 5150 }, { "epoch": 8.71497382198953, "grad_norm": 1.6214958429336548, "learning_rate": 6.803000000000001e-06, "loss": 0.4083, "step": 5200 }, { "epoch": 8.798743455497382, "grad_norm": 1.5650039911270142, "learning_rate": 6.753e-06, "loss": 0.4118, "step": 5250 }, { "epoch": 8.882513089005236, "grad_norm": 1.564424991607666, "learning_rate": 6.703000000000001e-06, "loss": 0.4093, "step": 5300 }, { "epoch": 8.966282722513089, "grad_norm": 1.623558759689331, "learning_rate": 6.6530000000000005e-06, "loss": 0.412, "step": 5350 }, { "epoch": 9.050052356020942, "grad_norm": 1.5212219953536987, "learning_rate": 6.603e-06, "loss": 0.4125, "step": 5400 }, { "epoch": 9.133821989528796, "grad_norm": 2.202746629714966, "learning_rate": 6.553000000000001e-06, "loss": 0.4049, "step": 5450 }, { "epoch": 9.217591623036649, "grad_norm": 1.6582872867584229, "learning_rate": 6.503e-06, "loss": 0.4093, "step": 5500 }, { "epoch": 9.217591623036649, "eval_loss": 0.373293936252594, "eval_runtime": 279.0862, "eval_samples_per_second": 30.417, "eval_steps_per_second": 3.805, "step": 5500 }, { "epoch": 9.301361256544503, "grad_norm": 2.3431801795959473, "learning_rate": 6.453000000000001e-06, "loss": 0.4099, "step": 5550 }, { "epoch": 9.385130890052356, "grad_norm": 1.7502645254135132, "learning_rate": 6.403e-06, "loss": 0.4097, "step": 5600 }, { "epoch": 9.46890052356021, "grad_norm": 1.638600468635559, "learning_rate": 6.353e-06, "loss": 0.4068, "step": 5650 }, { "epoch": 9.552670157068063, "grad_norm": 1.3677990436553955, "learning_rate": 6.3030000000000005e-06, "loss": 0.4086, "step": 5700 }, { "epoch": 9.636439790575917, "grad_norm": 1.4438238143920898, "learning_rate": 6.253e-06, "loss": 0.4067, "step": 5750 }, { "epoch": 9.72020942408377, "grad_norm": 1.7439764738082886, "learning_rate": 6.204e-06, "loss": 0.4085, "step": 5800 }, { "epoch": 9.803979057591622, "grad_norm": 1.257203459739685, "learning_rate": 6.154e-06, "loss": 0.4069, "step": 5850 }, { "epoch": 9.887748691099477, "grad_norm": 1.2076969146728516, "learning_rate": 6.104000000000001e-06, "loss": 0.4, "step": 5900 }, { "epoch": 9.97151832460733, "grad_norm": 1.1376831531524658, "learning_rate": 6.054000000000001e-06, "loss": 0.4088, "step": 5950 }, { "epoch": 10.055287958115183, "grad_norm": 1.4293948411941528, "learning_rate": 6.004000000000001e-06, "loss": 0.4012, "step": 6000 }, { "epoch": 10.055287958115183, "eval_loss": 0.3698088824748993, "eval_runtime": 276.0941, "eval_samples_per_second": 30.747, "eval_steps_per_second": 3.847, "step": 6000 }, { "epoch": 10.139057591623036, "grad_norm": 1.1719565391540527, "learning_rate": 5.954000000000001e-06, "loss": 0.4036, "step": 6050 }, { "epoch": 10.22282722513089, "grad_norm": 1.0520621538162231, "learning_rate": 5.9040000000000006e-06, "loss": 0.4074, "step": 6100 }, { "epoch": 10.306596858638743, "grad_norm": 1.9477139711380005, "learning_rate": 5.854000000000001e-06, "loss": 0.4049, "step": 6150 }, { "epoch": 10.390366492146597, "grad_norm": 1.147050142288208, "learning_rate": 5.804000000000001e-06, "loss": 0.4009, "step": 6200 }, { "epoch": 10.47413612565445, "grad_norm": 1.32716703414917, "learning_rate": 5.754e-06, "loss": 0.4037, "step": 6250 }, { "epoch": 10.557905759162304, "grad_norm": 1.6168224811553955, "learning_rate": 5.704000000000001e-06, "loss": 0.401, "step": 6300 }, { "epoch": 10.641675392670157, "grad_norm": 1.3314645290374756, "learning_rate": 5.654e-06, "loss": 0.4039, "step": 6350 }, { "epoch": 10.72544502617801, "grad_norm": 1.0647807121276855, "learning_rate": 5.604000000000001e-06, "loss": 0.4039, "step": 6400 }, { "epoch": 10.809214659685864, "grad_norm": 1.3919047117233276, "learning_rate": 5.5540000000000005e-06, "loss": 0.4014, "step": 6450 }, { "epoch": 10.892984293193717, "grad_norm": 1.3037397861480713, "learning_rate": 5.504e-06, "loss": 0.403, "step": 6500 }, { "epoch": 10.892984293193717, "eval_loss": 0.3670424520969391, "eval_runtime": 281.4305, "eval_samples_per_second": 30.164, "eval_steps_per_second": 3.774, "step": 6500 }, { "epoch": 10.976753926701571, "grad_norm": 0.9927046298980713, "learning_rate": 5.454000000000001e-06, "loss": 0.4002, "step": 6550 }, { "epoch": 11.060523560209424, "grad_norm": 2.82547926902771, "learning_rate": 5.404e-06, "loss": 0.4035, "step": 6600 }, { "epoch": 11.144293193717278, "grad_norm": 1.4662278890609741, "learning_rate": 5.354e-06, "loss": 0.4031, "step": 6650 }, { "epoch": 11.22806282722513, "grad_norm": 0.9803363084793091, "learning_rate": 5.304e-06, "loss": 0.3984, "step": 6700 }, { "epoch": 11.311832460732985, "grad_norm": 1.580372929573059, "learning_rate": 5.254e-06, "loss": 0.3987, "step": 6750 }, { "epoch": 11.395602094240838, "grad_norm": 1.5269379615783691, "learning_rate": 5.2040000000000005e-06, "loss": 0.3999, "step": 6800 }, { "epoch": 11.479371727748692, "grad_norm": 1.2878773212432861, "learning_rate": 5.154e-06, "loss": 0.3991, "step": 6850 }, { "epoch": 11.563141361256545, "grad_norm": 1.2725164890289307, "learning_rate": 5.104e-06, "loss": 0.3979, "step": 6900 }, { "epoch": 11.646910994764397, "grad_norm": 2.5339043140411377, "learning_rate": 5.054e-06, "loss": 0.3966, "step": 6950 }, { "epoch": 11.730680628272252, "grad_norm": 1.1876580715179443, "learning_rate": 5.004e-06, "loss": 0.4017, "step": 7000 }, { "epoch": 11.730680628272252, "eval_loss": 0.3627295196056366, "eval_runtime": 270.756, "eval_samples_per_second": 31.353, "eval_steps_per_second": 3.922, "step": 7000 }, { "epoch": 11.814450261780104, "grad_norm": 1.1595991849899292, "learning_rate": 4.954e-06, "loss": 0.4003, "step": 7050 }, { "epoch": 11.898219895287959, "grad_norm": 1.2361228466033936, "learning_rate": 4.904000000000001e-06, "loss": 0.4019, "step": 7100 }, { "epoch": 11.981989528795811, "grad_norm": 1.5332549810409546, "learning_rate": 4.8540000000000005e-06, "loss": 0.395, "step": 7150 }, { "epoch": 12.065759162303666, "grad_norm": 1.4963312149047852, "learning_rate": 4.804e-06, "loss": 0.406, "step": 7200 }, { "epoch": 12.149528795811518, "grad_norm": 1.5511187314987183, "learning_rate": 4.7540000000000006e-06, "loss": 0.3991, "step": 7250 }, { "epoch": 12.233298429319373, "grad_norm": 1.8427493572235107, "learning_rate": 4.704e-06, "loss": 0.3982, "step": 7300 }, { "epoch": 12.317068062827225, "grad_norm": 1.3739436864852905, "learning_rate": 4.654e-06, "loss": 0.3966, "step": 7350 }, { "epoch": 12.400837696335078, "grad_norm": 1.659621238708496, "learning_rate": 4.604e-06, "loss": 0.4019, "step": 7400 }, { "epoch": 12.484607329842932, "grad_norm": 1.8510388135910034, "learning_rate": 4.554000000000001e-06, "loss": 0.4013, "step": 7450 }, { "epoch": 12.568376963350785, "grad_norm": 1.100199580192566, "learning_rate": 4.504e-06, "loss": 0.3961, "step": 7500 }, { "epoch": 12.568376963350785, "eval_loss": 0.36272022128105164, "eval_runtime": 264.6514, "eval_samples_per_second": 32.076, "eval_steps_per_second": 4.013, "step": 7500 }, { "epoch": 12.652146596858639, "grad_norm": 1.439460039138794, "learning_rate": 4.454000000000001e-06, "loss": 0.3937, "step": 7550 }, { "epoch": 12.735916230366492, "grad_norm": 1.253373622894287, "learning_rate": 4.4040000000000005e-06, "loss": 0.4011, "step": 7600 }, { "epoch": 12.819685863874346, "grad_norm": 1.6404370069503784, "learning_rate": 4.354e-06, "loss": 0.4016, "step": 7650 }, { "epoch": 12.903455497382199, "grad_norm": 1.3606590032577515, "learning_rate": 4.304000000000001e-06, "loss": 0.3964, "step": 7700 }, { "epoch": 12.987225130890053, "grad_norm": 1.249391794204712, "learning_rate": 4.254e-06, "loss": 0.4013, "step": 7750 }, { "epoch": 13.070994764397906, "grad_norm": 1.1404308080673218, "learning_rate": 4.205e-06, "loss": 0.3939, "step": 7800 }, { "epoch": 13.15476439790576, "grad_norm": 0.9418945908546448, "learning_rate": 4.155e-06, "loss": 0.3968, "step": 7850 }, { "epoch": 13.238534031413613, "grad_norm": 1.3190369606018066, "learning_rate": 4.1050000000000005e-06, "loss": 0.3977, "step": 7900 }, { "epoch": 13.322303664921465, "grad_norm": 1.2260388135910034, "learning_rate": 4.055000000000001e-06, "loss": 0.3973, "step": 7950 }, { "epoch": 13.40607329842932, "grad_norm": 1.035477638244629, "learning_rate": 4.005000000000001e-06, "loss": 0.3957, "step": 8000 }, { "epoch": 13.40607329842932, "eval_loss": 0.3603293001651764, "eval_runtime": 270.7314, "eval_samples_per_second": 31.356, "eval_steps_per_second": 3.923, "step": 8000 }, { "epoch": 13.489842931937172, "grad_norm": 1.4767895936965942, "learning_rate": 3.955e-06, "loss": 0.3921, "step": 8050 }, { "epoch": 13.573612565445027, "grad_norm": 15.019492149353027, "learning_rate": 3.905000000000001e-06, "loss": 0.3947, "step": 8100 }, { "epoch": 13.65738219895288, "grad_norm": 1.3216609954833984, "learning_rate": 3.855e-06, "loss": 0.3969, "step": 8150 }, { "epoch": 13.741151832460734, "grad_norm": 1.0909982919692993, "learning_rate": 3.8050000000000004e-06, "loss": 0.3916, "step": 8200 }, { "epoch": 13.824921465968586, "grad_norm": 1.6264184713363647, "learning_rate": 3.7550000000000005e-06, "loss": 0.3951, "step": 8250 }, { "epoch": 13.90869109947644, "grad_norm": 1.1289631128311157, "learning_rate": 3.705e-06, "loss": 0.3926, "step": 8300 }, { "epoch": 13.992460732984293, "grad_norm": 1.1617991924285889, "learning_rate": 3.655e-06, "loss": 0.393, "step": 8350 }, { "epoch": 14.076230366492146, "grad_norm": 1.1278605461120605, "learning_rate": 3.6050000000000002e-06, "loss": 0.3969, "step": 8400 }, { "epoch": 14.16, "grad_norm": 0.9024301767349243, "learning_rate": 3.5550000000000003e-06, "loss": 0.3888, "step": 8450 }, { "epoch": 14.243769633507853, "grad_norm": 0.9253320097923279, "learning_rate": 3.505e-06, "loss": 0.3919, "step": 8500 }, { "epoch": 14.243769633507853, "eval_loss": 0.35886332392692566, "eval_runtime": 262.8233, "eval_samples_per_second": 32.299, "eval_steps_per_second": 4.041, "step": 8500 } ], "logging_steps": 50, "max_steps": 12000, "num_input_tokens_seen": 0, "num_train_epochs": 21, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5251375860469338e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }