{ "best_metric": 1.9382596015930176, "best_model_checkpoint": "./codegen-350M-mono-QLoRa-flytech/checkpoint-2000", "epoch": 0.4605907075824745, "eval_steps": 5, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "eval_loss": 2.743941068649292, "eval_runtime": 749.578, "eval_samples_per_second": 19.862, "eval_steps_per_second": 2.483, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.3329066038131714, "learning_rate": 0.000199, "loss": 2.9969, "step": 10 }, { "epoch": 0.0, "eval_loss": 2.6796510219573975, "eval_runtime": 751.2092, "eval_samples_per_second": 19.819, "eval_steps_per_second": 2.477, "step": 10 }, { "epoch": 0.0, "eval_loss": 2.5984177589416504, "eval_runtime": 764.9736, "eval_samples_per_second": 19.462, "eval_steps_per_second": 2.433, "step": 15 }, { "epoch": 0.0, "grad_norm": 0.283966600894928, "learning_rate": 0.00019800000000000002, "loss": 2.5613, "step": 20 }, { "epoch": 0.0, "eval_loss": 2.510923147201538, "eval_runtime": 775.4648, "eval_samples_per_second": 19.199, "eval_steps_per_second": 2.4, "step": 20 }, { "epoch": 0.01, "eval_loss": 2.4384284019470215, "eval_runtime": 807.2199, "eval_samples_per_second": 18.444, "eval_steps_per_second": 2.305, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.3212601840496063, "learning_rate": 0.00019700000000000002, "loss": 2.4609, "step": 30 }, { "epoch": 0.01, "eval_loss": 2.398897886276245, "eval_runtime": 806.9842, "eval_samples_per_second": 18.449, "eval_steps_per_second": 2.306, "step": 30 }, { "epoch": 0.01, "eval_loss": 2.3749544620513916, "eval_runtime": 807.3296, "eval_samples_per_second": 18.441, "eval_steps_per_second": 2.305, "step": 35 }, { "epoch": 0.01, "grad_norm": 0.5253788232803345, "learning_rate": 0.000196, "loss": 2.4028, "step": 40 }, { "epoch": 0.01, "eval_loss": 2.3531863689422607, "eval_runtime": 807.2812, "eval_samples_per_second": 18.442, "eval_steps_per_second": 2.305, "step": 40 }, { "epoch": 0.01, "eval_loss": 2.3311145305633545, "eval_runtime": 807.5901, "eval_samples_per_second": 18.435, "eval_steps_per_second": 2.304, "step": 45 }, { "epoch": 0.01, "grad_norm": 0.4847645163536072, "learning_rate": 0.000195, "loss": 2.4339, "step": 50 }, { "epoch": 0.01, "eval_loss": 2.31105637550354, "eval_runtime": 807.447, "eval_samples_per_second": 18.438, "eval_steps_per_second": 2.305, "step": 50 }, { "epoch": 0.01, "eval_loss": 2.294403314590454, "eval_runtime": 804.5301, "eval_samples_per_second": 18.505, "eval_steps_per_second": 2.313, "step": 55 }, { "epoch": 0.01, "grad_norm": 0.5184943675994873, "learning_rate": 0.000194, "loss": 2.3291, "step": 60 }, { "epoch": 0.01, "eval_loss": 2.28222393989563, "eval_runtime": 744.3103, "eval_samples_per_second": 20.002, "eval_steps_per_second": 2.5, "step": 60 }, { "epoch": 0.01, "eval_loss": 2.271087169647217, "eval_runtime": 744.2352, "eval_samples_per_second": 20.004, "eval_steps_per_second": 2.501, "step": 65 }, { "epoch": 0.02, "grad_norm": 0.42745935916900635, "learning_rate": 0.000193, "loss": 2.3128, "step": 70 }, { "epoch": 0.02, "eval_loss": 2.2600862979888916, "eval_runtime": 747.8797, "eval_samples_per_second": 19.907, "eval_steps_per_second": 2.488, "step": 70 }, { "epoch": 0.02, "eval_loss": 2.251253128051758, "eval_runtime": 755.1321, "eval_samples_per_second": 19.716, "eval_steps_per_second": 2.464, "step": 75 }, { "epoch": 0.02, "grad_norm": 0.5841810703277588, "learning_rate": 0.000192, "loss": 2.6201, "step": 80 }, { "epoch": 0.02, "eval_loss": 2.2421493530273438, "eval_runtime": 750.5937, "eval_samples_per_second": 19.835, "eval_steps_per_second": 2.479, "step": 80 }, { "epoch": 0.02, "eval_loss": 2.2358789443969727, "eval_runtime": 750.3335, "eval_samples_per_second": 19.842, "eval_steps_per_second": 2.48, "step": 85 }, { "epoch": 0.02, "grad_norm": 0.34617361426353455, "learning_rate": 0.000191, "loss": 2.2322, "step": 90 }, { "epoch": 0.02, "eval_loss": 2.2303481101989746, "eval_runtime": 748.12, "eval_samples_per_second": 19.901, "eval_steps_per_second": 2.488, "step": 90 }, { "epoch": 0.02, "eval_loss": 2.2245254516601562, "eval_runtime": 763.3784, "eval_samples_per_second": 19.503, "eval_steps_per_second": 2.438, "step": 95 }, { "epoch": 0.02, "grad_norm": 0.5058037638664246, "learning_rate": 0.00019, "loss": 2.3576, "step": 100 }, { "epoch": 0.02, "eval_loss": 2.217428207397461, "eval_runtime": 800.3219, "eval_samples_per_second": 18.603, "eval_steps_per_second": 2.325, "step": 100 }, { "epoch": 0.02, "eval_loss": 2.2102160453796387, "eval_runtime": 806.6841, "eval_samples_per_second": 18.456, "eval_steps_per_second": 2.307, "step": 105 }, { "epoch": 0.03, "grad_norm": 0.5872398018836975, "learning_rate": 0.00018899999999999999, "loss": 2.3798, "step": 110 }, { "epoch": 0.03, "eval_loss": 2.203975200653076, "eval_runtime": 778.7436, "eval_samples_per_second": 19.118, "eval_steps_per_second": 2.39, "step": 110 }, { "epoch": 0.03, "eval_loss": 2.198529005050659, "eval_runtime": 754.0178, "eval_samples_per_second": 19.745, "eval_steps_per_second": 2.468, "step": 115 }, { "epoch": 0.03, "grad_norm": 0.4760008752346039, "learning_rate": 0.000188, "loss": 2.2499, "step": 120 }, { "epoch": 0.03, "eval_loss": 2.1929078102111816, "eval_runtime": 770.8151, "eval_samples_per_second": 19.315, "eval_steps_per_second": 2.414, "step": 120 }, { "epoch": 0.03, "eval_loss": 2.1861350536346436, "eval_runtime": 749.444, "eval_samples_per_second": 19.865, "eval_steps_per_second": 2.483, "step": 125 }, { "epoch": 0.03, "grad_norm": 0.6719084978103638, "learning_rate": 0.00018700000000000002, "loss": 2.2002, "step": 130 }, { "epoch": 0.03, "eval_loss": 2.1807847023010254, "eval_runtime": 790.3227, "eval_samples_per_second": 18.838, "eval_steps_per_second": 2.355, "step": 130 }, { "epoch": 0.03, "eval_loss": 2.177309274673462, "eval_runtime": 767.7802, "eval_samples_per_second": 19.391, "eval_steps_per_second": 2.424, "step": 135 }, { "epoch": 0.03, "grad_norm": 0.6400887966156006, "learning_rate": 0.00018600000000000002, "loss": 2.3217, "step": 140 }, { "epoch": 0.03, "eval_loss": 2.1720025539398193, "eval_runtime": 794.6729, "eval_samples_per_second": 18.735, "eval_steps_per_second": 2.342, "step": 140 }, { "epoch": 0.03, "eval_loss": 2.1658318042755127, "eval_runtime": 774.1526, "eval_samples_per_second": 19.231, "eval_steps_per_second": 2.404, "step": 145 }, { "epoch": 0.03, "grad_norm": 0.48335766792297363, "learning_rate": 0.00018500000000000002, "loss": 2.1297, "step": 150 }, { "epoch": 0.03, "eval_loss": 2.1612164974212646, "eval_runtime": 778.6453, "eval_samples_per_second": 19.12, "eval_steps_per_second": 2.39, "step": 150 }, { "epoch": 0.04, "eval_loss": 2.1569318771362305, "eval_runtime": 761.275, "eval_samples_per_second": 19.557, "eval_steps_per_second": 2.445, "step": 155 }, { "epoch": 0.04, "grad_norm": 0.8584076166152954, "learning_rate": 0.00018400000000000003, "loss": 2.1718, "step": 160 }, { "epoch": 0.04, "eval_loss": 2.1521499156951904, "eval_runtime": 746.5904, "eval_samples_per_second": 19.941, "eval_steps_per_second": 2.493, "step": 160 }, { "epoch": 0.04, "eval_loss": 2.1496639251708984, "eval_runtime": 751.6198, "eval_samples_per_second": 19.808, "eval_steps_per_second": 2.476, "step": 165 }, { "epoch": 0.04, "grad_norm": 0.7087550163269043, "learning_rate": 0.000183, "loss": 2.3469, "step": 170 }, { "epoch": 0.04, "eval_loss": 2.146427631378174, "eval_runtime": 768.8707, "eval_samples_per_second": 19.363, "eval_steps_per_second": 2.42, "step": 170 }, { "epoch": 0.04, "eval_loss": 2.1417484283447266, "eval_runtime": 775.3535, "eval_samples_per_second": 19.202, "eval_steps_per_second": 2.4, "step": 175 }, { "epoch": 0.04, "grad_norm": 0.6968621611595154, "learning_rate": 0.000182, "loss": 2.2498, "step": 180 }, { "epoch": 0.04, "eval_loss": 2.138108015060425, "eval_runtime": 773.4912, "eval_samples_per_second": 19.248, "eval_steps_per_second": 2.406, "step": 180 }, { "epoch": 0.04, "eval_loss": 2.1360790729522705, "eval_runtime": 773.9293, "eval_samples_per_second": 19.237, "eval_steps_per_second": 2.405, "step": 185 }, { "epoch": 0.04, "grad_norm": 0.7121121287345886, "learning_rate": 0.000181, "loss": 2.3611, "step": 190 }, { "epoch": 0.04, "eval_loss": 2.1335365772247314, "eval_runtime": 763.0423, "eval_samples_per_second": 19.511, "eval_steps_per_second": 2.439, "step": 190 }, { "epoch": 0.04, "eval_loss": 2.1294620037078857, "eval_runtime": 755.8471, "eval_samples_per_second": 19.697, "eval_steps_per_second": 2.462, "step": 195 }, { "epoch": 0.05, "grad_norm": 0.6406265497207642, "learning_rate": 0.00018, "loss": 2.1718, "step": 200 }, { "epoch": 0.05, "eval_loss": 2.125330924987793, "eval_runtime": 764.112, "eval_samples_per_second": 19.484, "eval_steps_per_second": 2.436, "step": 200 }, { "epoch": 0.05, "eval_loss": 2.121659278869629, "eval_runtime": 784.0069, "eval_samples_per_second": 18.99, "eval_steps_per_second": 2.374, "step": 205 }, { "epoch": 0.05, "grad_norm": 0.94074547290802, "learning_rate": 0.00017900000000000001, "loss": 2.2531, "step": 210 }, { "epoch": 0.05, "eval_loss": 2.1184146404266357, "eval_runtime": 795.2056, "eval_samples_per_second": 18.722, "eval_steps_per_second": 2.34, "step": 210 }, { "epoch": 0.05, "eval_loss": 2.11686635017395, "eval_runtime": 782.4427, "eval_samples_per_second": 19.028, "eval_steps_per_second": 2.378, "step": 215 }, { "epoch": 0.05, "grad_norm": 0.4468848407268524, "learning_rate": 0.00017800000000000002, "loss": 2.276, "step": 220 }, { "epoch": 0.05, "eval_loss": 2.1175310611724854, "eval_runtime": 755.2557, "eval_samples_per_second": 19.713, "eval_steps_per_second": 2.464, "step": 220 }, { "epoch": 0.05, "eval_loss": 2.1144332885742188, "eval_runtime": 777.8858, "eval_samples_per_second": 19.139, "eval_steps_per_second": 2.392, "step": 225 }, { "epoch": 0.05, "grad_norm": 0.697229266166687, "learning_rate": 0.00017700000000000002, "loss": 2.2138, "step": 230 }, { "epoch": 0.05, "eval_loss": 2.111482620239258, "eval_runtime": 751.8011, "eval_samples_per_second": 19.803, "eval_steps_per_second": 2.475, "step": 230 }, { "epoch": 0.05, "eval_loss": 2.1105000972747803, "eval_runtime": 751.7683, "eval_samples_per_second": 19.804, "eval_steps_per_second": 2.475, "step": 235 }, { "epoch": 0.06, "grad_norm": 0.9181985855102539, "learning_rate": 0.00017600000000000002, "loss": 2.3692, "step": 240 }, { "epoch": 0.06, "eval_loss": 2.109123468399048, "eval_runtime": 749.086, "eval_samples_per_second": 19.875, "eval_steps_per_second": 2.484, "step": 240 }, { "epoch": 0.06, "eval_loss": 2.10528564453125, "eval_runtime": 750.8258, "eval_samples_per_second": 19.829, "eval_steps_per_second": 2.479, "step": 245 }, { "epoch": 0.06, "grad_norm": 0.48959431052207947, "learning_rate": 0.000175, "loss": 2.2648, "step": 250 }, { "epoch": 0.06, "eval_loss": 2.103628396987915, "eval_runtime": 785.1888, "eval_samples_per_second": 18.961, "eval_steps_per_second": 2.37, "step": 250 }, { "epoch": 0.06, "eval_loss": 2.1015634536743164, "eval_runtime": 771.9757, "eval_samples_per_second": 19.286, "eval_steps_per_second": 2.411, "step": 255 }, { "epoch": 0.06, "grad_norm": 0.6428173780441284, "learning_rate": 0.000174, "loss": 2.1139, "step": 260 }, { "epoch": 0.06, "eval_loss": 2.097482919692993, "eval_runtime": 763.409, "eval_samples_per_second": 19.502, "eval_steps_per_second": 2.438, "step": 260 }, { "epoch": 0.06, "eval_loss": 2.095787286758423, "eval_runtime": 760.3894, "eval_samples_per_second": 19.579, "eval_steps_per_second": 2.447, "step": 265 }, { "epoch": 0.06, "grad_norm": 0.5767454504966736, "learning_rate": 0.000173, "loss": 2.1961, "step": 270 }, { "epoch": 0.06, "eval_loss": 2.0945658683776855, "eval_runtime": 758.8729, "eval_samples_per_second": 19.619, "eval_steps_per_second": 2.452, "step": 270 }, { "epoch": 0.06, "eval_loss": 2.0915117263793945, "eval_runtime": 748.4004, "eval_samples_per_second": 19.893, "eval_steps_per_second": 2.487, "step": 275 }, { "epoch": 0.06, "grad_norm": 0.6312183737754822, "learning_rate": 0.000172, "loss": 2.2072, "step": 280 }, { "epoch": 0.06, "eval_loss": 2.0887210369110107, "eval_runtime": 762.6911, "eval_samples_per_second": 19.52, "eval_steps_per_second": 2.44, "step": 280 }, { "epoch": 0.07, "eval_loss": 2.086923360824585, "eval_runtime": 806.6105, "eval_samples_per_second": 18.457, "eval_steps_per_second": 2.307, "step": 285 }, { "epoch": 0.07, "grad_norm": 0.6048781275749207, "learning_rate": 0.000171, "loss": 2.0356, "step": 290 }, { "epoch": 0.07, "eval_loss": 2.085240125656128, "eval_runtime": 806.3511, "eval_samples_per_second": 18.463, "eval_steps_per_second": 2.308, "step": 290 }, { "epoch": 0.07, "eval_loss": 2.084153175354004, "eval_runtime": 806.3163, "eval_samples_per_second": 18.464, "eval_steps_per_second": 2.308, "step": 295 }, { "epoch": 0.07, "grad_norm": 0.7954710125923157, "learning_rate": 0.00017, "loss": 2.0189, "step": 300 }, { "epoch": 0.07, "eval_loss": 2.082386016845703, "eval_runtime": 806.423, "eval_samples_per_second": 18.462, "eval_steps_per_second": 2.308, "step": 300 }, { "epoch": 0.07, "eval_loss": 2.0789432525634766, "eval_runtime": 806.4464, "eval_samples_per_second": 18.461, "eval_steps_per_second": 2.308, "step": 305 }, { "epoch": 0.07, "grad_norm": 0.7538740038871765, "learning_rate": 0.00016900000000000002, "loss": 1.8978, "step": 310 }, { "epoch": 0.07, "eval_loss": 2.076179027557373, "eval_runtime": 806.5403, "eval_samples_per_second": 18.459, "eval_steps_per_second": 2.307, "step": 310 }, { "epoch": 0.07, "eval_loss": 2.075080156326294, "eval_runtime": 784.5471, "eval_samples_per_second": 18.977, "eval_steps_per_second": 2.372, "step": 315 }, { "epoch": 0.07, "grad_norm": 0.2843266427516937, "learning_rate": 0.000168, "loss": 2.1163, "step": 320 }, { "epoch": 0.07, "eval_loss": 2.074650764465332, "eval_runtime": 752.9254, "eval_samples_per_second": 19.774, "eval_steps_per_second": 2.472, "step": 320 }, { "epoch": 0.07, "eval_loss": 2.0736653804779053, "eval_runtime": 757.9611, "eval_samples_per_second": 19.642, "eval_steps_per_second": 2.455, "step": 325 }, { "epoch": 0.08, "grad_norm": 0.7260650396347046, "learning_rate": 0.000167, "loss": 2.2219, "step": 330 }, { "epoch": 0.08, "eval_loss": 2.073575496673584, "eval_runtime": 806.2254, "eval_samples_per_second": 18.466, "eval_steps_per_second": 2.308, "step": 330 }, { "epoch": 0.08, "eval_loss": 2.0729920864105225, "eval_runtime": 779.9527, "eval_samples_per_second": 19.088, "eval_steps_per_second": 2.386, "step": 335 }, { "epoch": 0.08, "grad_norm": 0.7835765480995178, "learning_rate": 0.000166, "loss": 2.2111, "step": 340 }, { "epoch": 0.08, "eval_loss": 2.070727825164795, "eval_runtime": 758.1738, "eval_samples_per_second": 19.637, "eval_steps_per_second": 2.455, "step": 340 }, { "epoch": 0.08, "eval_loss": 2.069014549255371, "eval_runtime": 747.7568, "eval_samples_per_second": 19.91, "eval_steps_per_second": 2.489, "step": 345 }, { "epoch": 0.08, "grad_norm": 0.5928293466567993, "learning_rate": 0.000165, "loss": 2.1936, "step": 350 }, { "epoch": 0.08, "eval_loss": 2.0676755905151367, "eval_runtime": 798.6576, "eval_samples_per_second": 18.641, "eval_steps_per_second": 2.33, "step": 350 }, { "epoch": 0.08, "eval_loss": 2.0652225017547607, "eval_runtime": 806.0571, "eval_samples_per_second": 18.47, "eval_steps_per_second": 2.309, "step": 355 }, { "epoch": 0.08, "grad_norm": 0.7964838147163391, "learning_rate": 0.000164, "loss": 2.1177, "step": 360 }, { "epoch": 0.08, "eval_loss": 2.0634725093841553, "eval_runtime": 805.9924, "eval_samples_per_second": 18.472, "eval_steps_per_second": 2.309, "step": 360 }, { "epoch": 0.08, "eval_loss": 2.063730478286743, "eval_runtime": 805.9942, "eval_samples_per_second": 18.472, "eval_steps_per_second": 2.309, "step": 365 }, { "epoch": 0.09, "grad_norm": 0.5622439980506897, "learning_rate": 0.000163, "loss": 2.1331, "step": 370 }, { "epoch": 0.09, "eval_loss": 2.06245756149292, "eval_runtime": 806.0779, "eval_samples_per_second": 18.47, "eval_steps_per_second": 2.309, "step": 370 }, { "epoch": 0.09, "eval_loss": 2.0626060962677, "eval_runtime": 806.2506, "eval_samples_per_second": 18.466, "eval_steps_per_second": 2.308, "step": 375 }, { "epoch": 0.09, "grad_norm": 0.7700399160385132, "learning_rate": 0.000162, "loss": 2.3266, "step": 380 }, { "epoch": 0.09, "eval_loss": 2.0606532096862793, "eval_runtime": 806.0725, "eval_samples_per_second": 18.47, "eval_steps_per_second": 2.309, "step": 380 }, { "epoch": 0.09, "eval_loss": 2.060309410095215, "eval_runtime": 806.3975, "eval_samples_per_second": 18.462, "eval_steps_per_second": 2.308, "step": 385 }, { "epoch": 0.09, "grad_norm": 0.7712746858596802, "learning_rate": 0.000161, "loss": 2.0627, "step": 390 }, { "epoch": 0.09, "eval_loss": 2.0599002838134766, "eval_runtime": 806.2192, "eval_samples_per_second": 18.466, "eval_steps_per_second": 2.308, "step": 390 }, { "epoch": 0.09, "eval_loss": 2.0586650371551514, "eval_runtime": 806.299, "eval_samples_per_second": 18.465, "eval_steps_per_second": 2.308, "step": 395 }, { "epoch": 0.09, "grad_norm": 0.5348338484764099, "learning_rate": 0.00016, "loss": 2.3373, "step": 400 }, { "epoch": 0.09, "eval_loss": 2.056227445602417, "eval_runtime": 806.1063, "eval_samples_per_second": 18.469, "eval_steps_per_second": 2.309, "step": 400 }, { "epoch": 0.09, "eval_loss": 2.054941177368164, "eval_runtime": 806.3058, "eval_samples_per_second": 18.464, "eval_steps_per_second": 2.308, "step": 405 }, { "epoch": 0.09, "grad_norm": 0.577366828918457, "learning_rate": 0.00015900000000000002, "loss": 2.0422, "step": 410 }, { "epoch": 0.09, "eval_loss": 2.0555078983306885, "eval_runtime": 806.2774, "eval_samples_per_second": 18.465, "eval_steps_per_second": 2.308, "step": 410 }, { "epoch": 0.1, "eval_loss": 2.054901361465454, "eval_runtime": 806.2591, "eval_samples_per_second": 18.466, "eval_steps_per_second": 2.308, "step": 415 }, { "epoch": 0.1, "grad_norm": 0.7189317345619202, "learning_rate": 0.00015800000000000002, "loss": 2.1189, "step": 420 }, { "epoch": 0.1, "eval_loss": 2.0532333850860596, "eval_runtime": 806.381, "eval_samples_per_second": 18.463, "eval_steps_per_second": 2.308, "step": 420 }, { "epoch": 0.1, "eval_loss": 2.0509543418884277, "eval_runtime": 806.2323, "eval_samples_per_second": 18.466, "eval_steps_per_second": 2.308, "step": 425 }, { "epoch": 0.1, "grad_norm": 0.548724353313446, "learning_rate": 0.00015700000000000002, "loss": 2.0847, "step": 430 }, { "epoch": 0.1, "eval_loss": 2.0487470626831055, "eval_runtime": 806.372, "eval_samples_per_second": 18.463, "eval_steps_per_second": 2.308, "step": 430 }, { "epoch": 0.1, "eval_loss": 2.0478439331054688, "eval_runtime": 806.31, "eval_samples_per_second": 18.464, "eval_steps_per_second": 2.308, "step": 435 }, { "epoch": 0.1, "grad_norm": 0.5101466774940491, "learning_rate": 0.00015600000000000002, "loss": 2.0727, "step": 440 }, { "epoch": 0.1, "eval_loss": 2.0454509258270264, "eval_runtime": 806.399, "eval_samples_per_second": 18.462, "eval_steps_per_second": 2.308, "step": 440 }, { "epoch": 0.1, "eval_loss": 2.043710708618164, "eval_runtime": 806.2814, "eval_samples_per_second": 18.465, "eval_steps_per_second": 2.308, "step": 445 }, { "epoch": 0.1, "grad_norm": 0.7228645086288452, "learning_rate": 0.000155, "loss": 2.0467, "step": 450 }, { "epoch": 0.1, "eval_loss": 2.042335033416748, "eval_runtime": 806.3613, "eval_samples_per_second": 18.463, "eval_steps_per_second": 2.308, "step": 450 }, { "epoch": 0.1, "eval_loss": 2.0407748222351074, "eval_runtime": 806.5248, "eval_samples_per_second": 18.459, "eval_steps_per_second": 2.307, "step": 455 }, { "epoch": 0.11, "grad_norm": 0.7004246115684509, "learning_rate": 0.000154, "loss": 2.2736, "step": 460 }, { "epoch": 0.11, "eval_loss": 2.0401110649108887, "eval_runtime": 807.0877, "eval_samples_per_second": 18.447, "eval_steps_per_second": 2.306, "step": 460 }, { "epoch": 0.11, "eval_loss": 2.039562702178955, "eval_runtime": 808.9767, "eval_samples_per_second": 18.403, "eval_steps_per_second": 2.3, "step": 465 }, { "epoch": 0.11, "grad_norm": 0.5844886898994446, "learning_rate": 0.000153, "loss": 2.0475, "step": 470 }, { "epoch": 0.11, "eval_loss": 2.039823055267334, "eval_runtime": 768.1785, "eval_samples_per_second": 19.381, "eval_steps_per_second": 2.423, "step": 470 }, { "epoch": 0.11, "eval_loss": 2.0399796962738037, "eval_runtime": 745.2193, "eval_samples_per_second": 19.978, "eval_steps_per_second": 2.497, "step": 475 }, { "epoch": 0.11, "grad_norm": 0.9306110143661499, "learning_rate": 0.000152, "loss": 2.1269, "step": 480 }, { "epoch": 0.11, "eval_loss": 2.0386431217193604, "eval_runtime": 744.6033, "eval_samples_per_second": 19.995, "eval_steps_per_second": 2.499, "step": 480 }, { "epoch": 0.11, "eval_loss": 2.038109302520752, "eval_runtime": 744.4904, "eval_samples_per_second": 19.998, "eval_steps_per_second": 2.5, "step": 485 }, { "epoch": 0.11, "grad_norm": 0.7265554666519165, "learning_rate": 0.000151, "loss": 2.1965, "step": 490 }, { "epoch": 0.11, "eval_loss": 2.039163589477539, "eval_runtime": 745.1648, "eval_samples_per_second": 19.979, "eval_steps_per_second": 2.497, "step": 490 }, { "epoch": 0.11, "eval_loss": 2.039876699447632, "eval_runtime": 745.0115, "eval_samples_per_second": 19.984, "eval_steps_per_second": 2.498, "step": 495 }, { "epoch": 0.12, "grad_norm": 0.7146623134613037, "learning_rate": 0.00015000000000000001, "loss": 1.9578, "step": 500 }, { "epoch": 0.12, "eval_loss": 2.0375914573669434, "eval_runtime": 744.9842, "eval_samples_per_second": 19.984, "eval_steps_per_second": 2.498, "step": 500 }, { "epoch": 0.12, "eval_loss": 2.0346121788024902, "eval_runtime": 746.8332, "eval_samples_per_second": 19.935, "eval_steps_per_second": 2.492, "step": 505 }, { "epoch": 0.12, "grad_norm": 1.0184926986694336, "learning_rate": 0.00014900000000000002, "loss": 2.1056, "step": 510 }, { "epoch": 0.12, "eval_loss": 2.032033920288086, "eval_runtime": 755.7533, "eval_samples_per_second": 19.7, "eval_steps_per_second": 2.462, "step": 510 }, { "epoch": 0.12, "eval_loss": 2.0300207138061523, "eval_runtime": 752.8834, "eval_samples_per_second": 19.775, "eval_steps_per_second": 2.472, "step": 515 }, { "epoch": 0.12, "grad_norm": 0.6594335436820984, "learning_rate": 0.000148, "loss": 2.3502, "step": 520 }, { "epoch": 0.12, "eval_loss": 2.028749465942383, "eval_runtime": 782.3884, "eval_samples_per_second": 19.029, "eval_steps_per_second": 2.379, "step": 520 }, { "epoch": 0.12, "eval_loss": 2.0278258323669434, "eval_runtime": 785.0029, "eval_samples_per_second": 18.966, "eval_steps_per_second": 2.371, "step": 525 }, { "epoch": 0.12, "grad_norm": 0.8392504453659058, "learning_rate": 0.000147, "loss": 2.203, "step": 530 }, { "epoch": 0.12, "eval_loss": 2.02721905708313, "eval_runtime": 744.3301, "eval_samples_per_second": 20.002, "eval_steps_per_second": 2.5, "step": 530 }, { "epoch": 0.12, "eval_loss": 2.0271682739257812, "eval_runtime": 744.0199, "eval_samples_per_second": 20.01, "eval_steps_per_second": 2.501, "step": 535 }, { "epoch": 0.12, "grad_norm": 0.7347747683525085, "learning_rate": 0.000146, "loss": 2.2609, "step": 540 }, { "epoch": 0.12, "eval_loss": 2.0258853435516357, "eval_runtime": 744.263, "eval_samples_per_second": 20.004, "eval_steps_per_second": 2.5, "step": 540 }, { "epoch": 0.13, "eval_loss": 2.026393413543701, "eval_runtime": 744.209, "eval_samples_per_second": 20.005, "eval_steps_per_second": 2.501, "step": 545 }, { "epoch": 0.13, "grad_norm": 0.4300149083137512, "learning_rate": 0.000145, "loss": 1.9083, "step": 550 }, { "epoch": 0.13, "eval_loss": 2.0268478393554688, "eval_runtime": 744.0279, "eval_samples_per_second": 20.01, "eval_steps_per_second": 2.501, "step": 550 }, { "epoch": 0.13, "eval_loss": 2.0266873836517334, "eval_runtime": 744.0752, "eval_samples_per_second": 20.009, "eval_steps_per_second": 2.501, "step": 555 }, { "epoch": 0.13, "grad_norm": 0.46820569038391113, "learning_rate": 0.000144, "loss": 2.1976, "step": 560 }, { "epoch": 0.13, "eval_loss": 2.0266172885894775, "eval_runtime": 744.0609, "eval_samples_per_second": 20.009, "eval_steps_per_second": 2.501, "step": 560 }, { "epoch": 0.13, "eval_loss": 2.0261571407318115, "eval_runtime": 744.0034, "eval_samples_per_second": 20.011, "eval_steps_per_second": 2.501, "step": 565 }, { "epoch": 0.13, "grad_norm": 0.7849207520484924, "learning_rate": 0.000143, "loss": 1.8726, "step": 570 }, { "epoch": 0.13, "eval_loss": 2.0230391025543213, "eval_runtime": 744.0221, "eval_samples_per_second": 20.01, "eval_steps_per_second": 2.501, "step": 570 }, { "epoch": 0.13, "eval_loss": 2.0199837684631348, "eval_runtime": 744.0989, "eval_samples_per_second": 20.008, "eval_steps_per_second": 2.501, "step": 575 }, { "epoch": 0.13, "grad_norm": 0.6005426645278931, "learning_rate": 0.000142, "loss": 2.0767, "step": 580 }, { "epoch": 0.13, "eval_loss": 2.0186009407043457, "eval_runtime": 744.1002, "eval_samples_per_second": 20.008, "eval_steps_per_second": 2.501, "step": 580 }, { "epoch": 0.13, "eval_loss": 2.0188121795654297, "eval_runtime": 768.6396, "eval_samples_per_second": 19.369, "eval_steps_per_second": 2.421, "step": 585 }, { "epoch": 0.14, "grad_norm": 0.4590970575809479, "learning_rate": 0.000141, "loss": 2.0767, "step": 590 }, { "epoch": 0.14, "eval_loss": 2.01908278465271, "eval_runtime": 767.2243, "eval_samples_per_second": 19.405, "eval_steps_per_second": 2.426, "step": 590 }, { "epoch": 0.14, "eval_loss": 2.0185952186584473, "eval_runtime": 744.1236, "eval_samples_per_second": 20.007, "eval_steps_per_second": 2.501, "step": 595 }, { "epoch": 0.14, "grad_norm": 0.8970805406570435, "learning_rate": 0.00014, "loss": 2.1128, "step": 600 }, { "epoch": 0.14, "eval_loss": 2.0173678398132324, "eval_runtime": 744.3957, "eval_samples_per_second": 20.0, "eval_steps_per_second": 2.5, "step": 600 }, { "epoch": 0.14, "eval_loss": 2.015655279159546, "eval_runtime": 744.2471, "eval_samples_per_second": 20.004, "eval_steps_per_second": 2.501, "step": 605 }, { "epoch": 0.14, "grad_norm": 0.6531006693840027, "learning_rate": 0.000139, "loss": 2.0945, "step": 610 }, { "epoch": 0.14, "eval_loss": 2.0149075984954834, "eval_runtime": 744.0697, "eval_samples_per_second": 20.009, "eval_steps_per_second": 2.501, "step": 610 }, { "epoch": 0.14, "eval_loss": 2.0139400959014893, "eval_runtime": 744.1056, "eval_samples_per_second": 20.008, "eval_steps_per_second": 2.501, "step": 615 }, { "epoch": 0.14, "grad_norm": 0.6212542653083801, "learning_rate": 0.000138, "loss": 2.0035, "step": 620 }, { "epoch": 0.14, "eval_loss": 2.0124478340148926, "eval_runtime": 747.8028, "eval_samples_per_second": 19.909, "eval_steps_per_second": 2.489, "step": 620 }, { "epoch": 0.14, "eval_loss": 2.0124478340148926, "eval_runtime": 763.4128, "eval_samples_per_second": 19.502, "eval_steps_per_second": 2.438, "step": 625 }, { "epoch": 0.15, "grad_norm": 0.6082640290260315, "learning_rate": 0.00013700000000000002, "loss": 2.1103, "step": 630 }, { "epoch": 0.15, "eval_loss": 2.0123772621154785, "eval_runtime": 745.1902, "eval_samples_per_second": 19.979, "eval_steps_per_second": 2.497, "step": 630 }, { "epoch": 0.15, "eval_loss": 2.011105537414551, "eval_runtime": 799.942, "eval_samples_per_second": 18.611, "eval_steps_per_second": 2.326, "step": 635 }, { "epoch": 0.15, "grad_norm": 0.7771211862564087, "learning_rate": 0.00013600000000000003, "loss": 2.1028, "step": 640 }, { "epoch": 0.15, "eval_loss": 2.009859323501587, "eval_runtime": 796.318, "eval_samples_per_second": 18.696, "eval_steps_per_second": 2.337, "step": 640 }, { "epoch": 0.15, "eval_loss": 2.009398937225342, "eval_runtime": 745.0813, "eval_samples_per_second": 19.982, "eval_steps_per_second": 2.498, "step": 645 }, { "epoch": 0.15, "grad_norm": 0.8797178268432617, "learning_rate": 0.00013500000000000003, "loss": 2.2711, "step": 650 }, { "epoch": 0.15, "eval_loss": 2.0092194080352783, "eval_runtime": 743.6761, "eval_samples_per_second": 20.019, "eval_steps_per_second": 2.502, "step": 650 }, { "epoch": 0.15, "eval_loss": 2.00980806350708, "eval_runtime": 742.9948, "eval_samples_per_second": 20.038, "eval_steps_per_second": 2.505, "step": 655 }, { "epoch": 0.15, "grad_norm": 1.1657720804214478, "learning_rate": 0.000134, "loss": 2.243, "step": 660 }, { "epoch": 0.15, "eval_loss": 2.0087344646453857, "eval_runtime": 742.7774, "eval_samples_per_second": 20.044, "eval_steps_per_second": 2.505, "step": 660 }, { "epoch": 0.15, "eval_loss": 2.0070958137512207, "eval_runtime": 748.9587, "eval_samples_per_second": 19.878, "eval_steps_per_second": 2.485, "step": 665 }, { "epoch": 0.15, "grad_norm": 0.6517012715339661, "learning_rate": 0.000133, "loss": 1.9059, "step": 670 }, { "epoch": 0.15, "eval_loss": 2.0066118240356445, "eval_runtime": 746.8276, "eval_samples_per_second": 19.935, "eval_steps_per_second": 2.492, "step": 670 }, { "epoch": 0.16, "eval_loss": 2.006216287612915, "eval_runtime": 746.1771, "eval_samples_per_second": 19.952, "eval_steps_per_second": 2.494, "step": 675 }, { "epoch": 0.16, "grad_norm": 0.6520020365715027, "learning_rate": 0.000132, "loss": 1.9953, "step": 680 }, { "epoch": 0.16, "eval_loss": 2.0057804584503174, "eval_runtime": 800.2909, "eval_samples_per_second": 18.603, "eval_steps_per_second": 2.325, "step": 680 }, { "epoch": 0.16, "eval_loss": 2.0061757564544678, "eval_runtime": 743.2104, "eval_samples_per_second": 20.032, "eval_steps_per_second": 2.504, "step": 685 }, { "epoch": 0.16, "grad_norm": 0.7605012059211731, "learning_rate": 0.000131, "loss": 2.0684, "step": 690 }, { "epoch": 0.16, "eval_loss": 2.0061566829681396, "eval_runtime": 743.1127, "eval_samples_per_second": 20.035, "eval_steps_per_second": 2.504, "step": 690 }, { "epoch": 0.16, "eval_loss": 2.005591869354248, "eval_runtime": 743.2532, "eval_samples_per_second": 20.031, "eval_steps_per_second": 2.504, "step": 695 }, { "epoch": 0.16, "grad_norm": 0.8598791360855103, "learning_rate": 0.00013000000000000002, "loss": 1.9624, "step": 700 }, { "epoch": 0.16, "eval_loss": 2.00502347946167, "eval_runtime": 743.0596, "eval_samples_per_second": 20.036, "eval_steps_per_second": 2.505, "step": 700 }, { "epoch": 0.16, "eval_loss": 2.0032622814178467, "eval_runtime": 743.2083, "eval_samples_per_second": 20.032, "eval_steps_per_second": 2.504, "step": 705 }, { "epoch": 0.16, "grad_norm": 0.5449765920639038, "learning_rate": 0.00012900000000000002, "loss": 2.0671, "step": 710 }, { "epoch": 0.16, "eval_loss": 2.0018348693847656, "eval_runtime": 742.4305, "eval_samples_per_second": 20.053, "eval_steps_per_second": 2.507, "step": 710 }, { "epoch": 0.16, "eval_loss": 2.001499652862549, "eval_runtime": 742.3408, "eval_samples_per_second": 20.055, "eval_steps_per_second": 2.507, "step": 715 }, { "epoch": 0.17, "grad_norm": 0.7031337022781372, "learning_rate": 0.00012800000000000002, "loss": 2.1012, "step": 720 }, { "epoch": 0.17, "eval_loss": 2.0012526512145996, "eval_runtime": 742.3685, "eval_samples_per_second": 20.055, "eval_steps_per_second": 2.507, "step": 720 }, { "epoch": 0.17, "eval_loss": 2.0008952617645264, "eval_runtime": 742.5083, "eval_samples_per_second": 20.051, "eval_steps_per_second": 2.506, "step": 725 }, { "epoch": 0.17, "grad_norm": 0.9199779033660889, "learning_rate": 0.000127, "loss": 2.1093, "step": 730 }, { "epoch": 0.17, "eval_loss": 2.0010104179382324, "eval_runtime": 742.3134, "eval_samples_per_second": 20.056, "eval_steps_per_second": 2.507, "step": 730 }, { "epoch": 0.17, "eval_loss": 2.000081777572632, "eval_runtime": 742.5728, "eval_samples_per_second": 20.049, "eval_steps_per_second": 2.506, "step": 735 }, { "epoch": 0.17, "grad_norm": 0.5495649576187134, "learning_rate": 0.000126, "loss": 2.0745, "step": 740 }, { "epoch": 0.17, "eval_loss": 1.9993902444839478, "eval_runtime": 742.2628, "eval_samples_per_second": 20.058, "eval_steps_per_second": 2.507, "step": 740 }, { "epoch": 0.17, "eval_loss": 1.9984277486801147, "eval_runtime": 742.413, "eval_samples_per_second": 20.054, "eval_steps_per_second": 2.507, "step": 745 }, { "epoch": 0.17, "grad_norm": 0.6500754952430725, "learning_rate": 0.000125, "loss": 2.1672, "step": 750 }, { "epoch": 0.17, "eval_loss": 1.9985421895980835, "eval_runtime": 742.5432, "eval_samples_per_second": 20.05, "eval_steps_per_second": 2.506, "step": 750 }, { "epoch": 0.17, "eval_loss": 1.9995174407958984, "eval_runtime": 742.7051, "eval_samples_per_second": 20.046, "eval_steps_per_second": 2.506, "step": 755 }, { "epoch": 0.18, "grad_norm": 0.6539570689201355, "learning_rate": 0.000124, "loss": 2.0329, "step": 760 }, { "epoch": 0.18, "eval_loss": 1.999580979347229, "eval_runtime": 742.8251, "eval_samples_per_second": 20.042, "eval_steps_per_second": 2.505, "step": 760 }, { "epoch": 0.18, "eval_loss": 1.9998208284378052, "eval_runtime": 742.7139, "eval_samples_per_second": 20.045, "eval_steps_per_second": 2.506, "step": 765 }, { "epoch": 0.18, "grad_norm": 0.7342298626899719, "learning_rate": 0.000123, "loss": 2.1127, "step": 770 }, { "epoch": 0.18, "eval_loss": 1.998628854751587, "eval_runtime": 742.6862, "eval_samples_per_second": 20.046, "eval_steps_per_second": 2.506, "step": 770 }, { "epoch": 0.18, "eval_loss": 1.9968189001083374, "eval_runtime": 742.6582, "eval_samples_per_second": 20.047, "eval_steps_per_second": 2.506, "step": 775 }, { "epoch": 0.18, "grad_norm": 0.5857991576194763, "learning_rate": 0.000122, "loss": 1.83, "step": 780 }, { "epoch": 0.18, "eval_loss": 1.9953583478927612, "eval_runtime": 742.6099, "eval_samples_per_second": 20.048, "eval_steps_per_second": 2.506, "step": 780 }, { "epoch": 0.18, "eval_loss": 1.9942070245742798, "eval_runtime": 742.6392, "eval_samples_per_second": 20.047, "eval_steps_per_second": 2.506, "step": 785 }, { "epoch": 0.18, "grad_norm": 0.7684934735298157, "learning_rate": 0.000121, "loss": 2.1953, "step": 790 }, { "epoch": 0.18, "eval_loss": 1.9930860996246338, "eval_runtime": 742.5366, "eval_samples_per_second": 20.05, "eval_steps_per_second": 2.506, "step": 790 }, { "epoch": 0.18, "eval_loss": 1.9929825067520142, "eval_runtime": 742.4635, "eval_samples_per_second": 20.052, "eval_steps_per_second": 2.507, "step": 795 }, { "epoch": 0.18, "grad_norm": 0.7895638346672058, "learning_rate": 0.00012, "loss": 2.0523, "step": 800 }, { "epoch": 0.18, "eval_loss": 1.993416428565979, "eval_runtime": 742.3749, "eval_samples_per_second": 20.055, "eval_steps_per_second": 2.507, "step": 800 }, { "epoch": 0.19, "eval_loss": 1.9942388534545898, "eval_runtime": 742.6099, "eval_samples_per_second": 20.048, "eval_steps_per_second": 2.506, "step": 805 }, { "epoch": 0.19, "grad_norm": 0.7561328411102295, "learning_rate": 0.000119, "loss": 1.9476, "step": 810 }, { "epoch": 0.19, "eval_loss": 1.9943628311157227, "eval_runtime": 742.6406, "eval_samples_per_second": 20.047, "eval_steps_per_second": 2.506, "step": 810 }, { "epoch": 0.19, "eval_loss": 1.993873953819275, "eval_runtime": 742.6764, "eval_samples_per_second": 20.046, "eval_steps_per_second": 2.506, "step": 815 }, { "epoch": 0.19, "grad_norm": 0.6752389073371887, "learning_rate": 0.000118, "loss": 2.2179, "step": 820 }, { "epoch": 0.19, "eval_loss": 1.993316411972046, "eval_runtime": 742.8432, "eval_samples_per_second": 20.042, "eval_steps_per_second": 2.505, "step": 820 }, { "epoch": 0.19, "eval_loss": 1.9925029277801514, "eval_runtime": 742.8122, "eval_samples_per_second": 20.043, "eval_steps_per_second": 2.505, "step": 825 }, { "epoch": 0.19, "grad_norm": 0.8891268372535706, "learning_rate": 0.000117, "loss": 2.0997, "step": 830 }, { "epoch": 0.19, "eval_loss": 1.9914040565490723, "eval_runtime": 742.5596, "eval_samples_per_second": 20.05, "eval_steps_per_second": 2.506, "step": 830 }, { "epoch": 0.19, "eval_loss": 1.989838719367981, "eval_runtime": 742.7205, "eval_samples_per_second": 20.045, "eval_steps_per_second": 2.506, "step": 835 }, { "epoch": 0.19, "grad_norm": 0.8138695359230042, "learning_rate": 0.000116, "loss": 2.0146, "step": 840 }, { "epoch": 0.19, "eval_loss": 1.9878920316696167, "eval_runtime": 746.0661, "eval_samples_per_second": 19.955, "eval_steps_per_second": 2.494, "step": 840 }, { "epoch": 0.19, "eval_loss": 1.986674427986145, "eval_runtime": 753.8888, "eval_samples_per_second": 19.748, "eval_steps_per_second": 2.469, "step": 845 }, { "epoch": 0.2, "grad_norm": 0.8682060241699219, "learning_rate": 0.00011499999999999999, "loss": 2.1124, "step": 850 }, { "epoch": 0.2, "eval_loss": 1.9865528345108032, "eval_runtime": 754.5751, "eval_samples_per_second": 19.73, "eval_steps_per_second": 2.466, "step": 850 }, { "epoch": 0.2, "eval_loss": 1.9874132871627808, "eval_runtime": 758.8328, "eval_samples_per_second": 19.62, "eval_steps_per_second": 2.452, "step": 855 }, { "epoch": 0.2, "grad_norm": 0.8115680813789368, "learning_rate": 0.00011399999999999999, "loss": 2.3409, "step": 860 }, { "epoch": 0.2, "eval_loss": 1.9879497289657593, "eval_runtime": 752.4654, "eval_samples_per_second": 19.786, "eval_steps_per_second": 2.473, "step": 860 }, { "epoch": 0.2, "eval_loss": 1.9877504110336304, "eval_runtime": 777.0301, "eval_samples_per_second": 19.16, "eval_steps_per_second": 2.395, "step": 865 }, { "epoch": 0.2, "grad_norm": 0.8121163845062256, "learning_rate": 0.000113, "loss": 2.1718, "step": 870 }, { "epoch": 0.2, "eval_loss": 1.9862942695617676, "eval_runtime": 789.4357, "eval_samples_per_second": 18.859, "eval_steps_per_second": 2.357, "step": 870 }, { "epoch": 0.2, "eval_loss": 1.9849509000778198, "eval_runtime": 766.4313, "eval_samples_per_second": 19.425, "eval_steps_per_second": 2.428, "step": 875 }, { "epoch": 0.2, "grad_norm": 0.6633480787277222, "learning_rate": 0.00011200000000000001, "loss": 2.0154, "step": 880 }, { "epoch": 0.2, "eval_loss": 1.9856353998184204, "eval_runtime": 797.5238, "eval_samples_per_second": 18.668, "eval_steps_per_second": 2.333, "step": 880 }, { "epoch": 0.2, "eval_loss": 1.9872477054595947, "eval_runtime": 796.5052, "eval_samples_per_second": 18.692, "eval_steps_per_second": 2.336, "step": 885 }, { "epoch": 0.2, "grad_norm": 0.6087526679039001, "learning_rate": 0.00011100000000000001, "loss": 2.03, "step": 890 }, { "epoch": 0.2, "eval_loss": 1.9866758584976196, "eval_runtime": 776.7422, "eval_samples_per_second": 19.167, "eval_steps_per_second": 2.396, "step": 890 }, { "epoch": 0.21, "eval_loss": 1.985320806503296, "eval_runtime": 801.3372, "eval_samples_per_second": 18.579, "eval_steps_per_second": 2.322, "step": 895 }, { "epoch": 0.21, "grad_norm": 0.8433945178985596, "learning_rate": 0.00011000000000000002, "loss": 2.0573, "step": 900 }, { "epoch": 0.21, "eval_loss": 1.9836574792861938, "eval_runtime": 767.1796, "eval_samples_per_second": 19.406, "eval_steps_per_second": 2.426, "step": 900 }, { "epoch": 0.21, "eval_loss": 1.9830906391143799, "eval_runtime": 755.8391, "eval_samples_per_second": 19.697, "eval_steps_per_second": 2.462, "step": 905 }, { "epoch": 0.21, "grad_norm": 0.9100953340530396, "learning_rate": 0.000109, "loss": 1.9915, "step": 910 }, { "epoch": 0.21, "eval_loss": 1.9824570417404175, "eval_runtime": 766.9525, "eval_samples_per_second": 19.412, "eval_steps_per_second": 2.426, "step": 910 }, { "epoch": 0.21, "eval_loss": 1.983730435371399, "eval_runtime": 761.0826, "eval_samples_per_second": 19.562, "eval_steps_per_second": 2.445, "step": 915 }, { "epoch": 0.21, "grad_norm": 0.5524155497550964, "learning_rate": 0.00010800000000000001, "loss": 2.4334, "step": 920 }, { "epoch": 0.21, "eval_loss": 1.9855434894561768, "eval_runtime": 806.2529, "eval_samples_per_second": 18.466, "eval_steps_per_second": 2.308, "step": 920 }, { "epoch": 0.21, "eval_loss": 1.9850364923477173, "eval_runtime": 759.49, "eval_samples_per_second": 19.603, "eval_steps_per_second": 2.45, "step": 925 }, { "epoch": 0.21, "grad_norm": 0.8319659233093262, "learning_rate": 0.00010700000000000001, "loss": 2.0895, "step": 930 }, { "epoch": 0.21, "eval_loss": 1.9837418794631958, "eval_runtime": 742.155, "eval_samples_per_second": 20.06, "eval_steps_per_second": 2.508, "step": 930 }, { "epoch": 0.22, "eval_loss": 1.9824495315551758, "eval_runtime": 742.1038, "eval_samples_per_second": 20.062, "eval_steps_per_second": 2.508, "step": 935 }, { "epoch": 0.22, "grad_norm": 0.5936623811721802, "learning_rate": 0.00010600000000000002, "loss": 2.018, "step": 940 }, { "epoch": 0.22, "eval_loss": 1.9815988540649414, "eval_runtime": 742.0378, "eval_samples_per_second": 20.064, "eval_steps_per_second": 2.508, "step": 940 }, { "epoch": 0.22, "eval_loss": 1.9809609651565552, "eval_runtime": 742.3078, "eval_samples_per_second": 20.056, "eval_steps_per_second": 2.507, "step": 945 }, { "epoch": 0.22, "grad_norm": 0.6863654255867004, "learning_rate": 0.000105, "loss": 2.1028, "step": 950 }, { "epoch": 0.22, "eval_loss": 1.9804768562316895, "eval_runtime": 742.0981, "eval_samples_per_second": 20.062, "eval_steps_per_second": 2.508, "step": 950 }, { "epoch": 0.22, "eval_loss": 1.979957103729248, "eval_runtime": 742.167, "eval_samples_per_second": 20.06, "eval_steps_per_second": 2.508, "step": 955 }, { "epoch": 0.22, "grad_norm": 0.8558996915817261, "learning_rate": 0.00010400000000000001, "loss": 2.0259, "step": 960 }, { "epoch": 0.22, "eval_loss": 1.9790544509887695, "eval_runtime": 742.0942, "eval_samples_per_second": 20.062, "eval_steps_per_second": 2.508, "step": 960 }, { "epoch": 0.22, "eval_loss": 1.9782884120941162, "eval_runtime": 742.2666, "eval_samples_per_second": 20.057, "eval_steps_per_second": 2.507, "step": 965 }, { "epoch": 0.22, "grad_norm": 1.0001534223556519, "learning_rate": 0.00010300000000000001, "loss": 1.9648, "step": 970 }, { "epoch": 0.22, "eval_loss": 1.9775859117507935, "eval_runtime": 742.263, "eval_samples_per_second": 20.058, "eval_steps_per_second": 2.507, "step": 970 }, { "epoch": 0.22, "eval_loss": 1.9767318964004517, "eval_runtime": 742.3891, "eval_samples_per_second": 20.054, "eval_steps_per_second": 2.507, "step": 975 }, { "epoch": 0.23, "grad_norm": 0.8744415640830994, "learning_rate": 0.00010200000000000001, "loss": 2.0649, "step": 980 }, { "epoch": 0.23, "eval_loss": 1.976646065711975, "eval_runtime": 742.1965, "eval_samples_per_second": 20.059, "eval_steps_per_second": 2.507, "step": 980 }, { "epoch": 0.23, "eval_loss": 1.9765182733535767, "eval_runtime": 742.453, "eval_samples_per_second": 20.052, "eval_steps_per_second": 2.507, "step": 985 }, { "epoch": 0.23, "grad_norm": 0.7335282564163208, "learning_rate": 0.000101, "loss": 2.0636, "step": 990 }, { "epoch": 0.23, "eval_loss": 1.9757829904556274, "eval_runtime": 742.3078, "eval_samples_per_second": 20.056, "eval_steps_per_second": 2.507, "step": 990 }, { "epoch": 0.23, "eval_loss": 1.9756759405136108, "eval_runtime": 742.3788, "eval_samples_per_second": 20.054, "eval_steps_per_second": 2.507, "step": 995 }, { "epoch": 0.23, "grad_norm": 0.9827843904495239, "learning_rate": 0.0001, "loss": 1.9848, "step": 1000 }, { "epoch": 0.23, "eval_loss": 1.9751605987548828, "eval_runtime": 742.3183, "eval_samples_per_second": 20.056, "eval_steps_per_second": 2.507, "step": 1000 }, { "epoch": 0.23, "eval_loss": 1.974611759185791, "eval_runtime": 741.7661, "eval_samples_per_second": 20.071, "eval_steps_per_second": 2.509, "step": 1005 }, { "epoch": 0.23, "grad_norm": 0.8337591290473938, "learning_rate": 9.900000000000001e-05, "loss": 1.8473, "step": 1010 }, { "epoch": 0.23, "eval_loss": 1.974379301071167, "eval_runtime": 741.9424, "eval_samples_per_second": 20.066, "eval_steps_per_second": 2.508, "step": 1010 }, { "epoch": 0.23, "eval_loss": 1.973697304725647, "eval_runtime": 741.9395, "eval_samples_per_second": 20.066, "eval_steps_per_second": 2.508, "step": 1015 }, { "epoch": 0.23, "grad_norm": 0.5600916147232056, "learning_rate": 9.8e-05, "loss": 1.9928, "step": 1020 }, { "epoch": 0.23, "eval_loss": 1.9728336334228516, "eval_runtime": 742.058, "eval_samples_per_second": 20.063, "eval_steps_per_second": 2.508, "step": 1020 }, { "epoch": 0.24, "eval_loss": 1.9720138311386108, "eval_runtime": 741.909, "eval_samples_per_second": 20.067, "eval_steps_per_second": 2.508, "step": 1025 }, { "epoch": 0.24, "grad_norm": 0.7983824014663696, "learning_rate": 9.7e-05, "loss": 1.9903, "step": 1030 }, { "epoch": 0.24, "eval_loss": 1.9715585708618164, "eval_runtime": 741.9989, "eval_samples_per_second": 20.065, "eval_steps_per_second": 2.508, "step": 1030 }, { "epoch": 0.24, "eval_loss": 1.971234917640686, "eval_runtime": 742.1566, "eval_samples_per_second": 20.06, "eval_steps_per_second": 2.508, "step": 1035 }, { "epoch": 0.24, "grad_norm": 1.0410112142562866, "learning_rate": 9.6e-05, "loss": 2.2528, "step": 1040 }, { "epoch": 0.24, "eval_loss": 1.9711899757385254, "eval_runtime": 742.4115, "eval_samples_per_second": 20.054, "eval_steps_per_second": 2.507, "step": 1040 }, { "epoch": 0.24, "eval_loss": 1.9705572128295898, "eval_runtime": 742.1151, "eval_samples_per_second": 20.062, "eval_steps_per_second": 2.508, "step": 1045 }, { "epoch": 0.24, "grad_norm": 0.9996401071548462, "learning_rate": 9.5e-05, "loss": 1.8971, "step": 1050 }, { "epoch": 0.24, "eval_loss": 1.9699151515960693, "eval_runtime": 742.02, "eval_samples_per_second": 20.064, "eval_steps_per_second": 2.508, "step": 1050 }, { "epoch": 0.24, "eval_loss": 1.9693446159362793, "eval_runtime": 742.2312, "eval_samples_per_second": 20.058, "eval_steps_per_second": 2.507, "step": 1055 }, { "epoch": 0.24, "grad_norm": 0.6173760294914246, "learning_rate": 9.4e-05, "loss": 1.9011, "step": 1060 }, { "epoch": 0.24, "eval_loss": 1.9691505432128906, "eval_runtime": 742.0593, "eval_samples_per_second": 20.063, "eval_steps_per_second": 2.508, "step": 1060 }, { "epoch": 0.25, "eval_loss": 1.9691288471221924, "eval_runtime": 741.9923, "eval_samples_per_second": 20.065, "eval_steps_per_second": 2.508, "step": 1065 }, { "epoch": 0.25, "grad_norm": 0.7394955158233643, "learning_rate": 9.300000000000001e-05, "loss": 2.0518, "step": 1070 }, { "epoch": 0.25, "eval_loss": 1.9698320627212524, "eval_runtime": 742.0516, "eval_samples_per_second": 20.063, "eval_steps_per_second": 2.508, "step": 1070 }, { "epoch": 0.25, "eval_loss": 1.9706062078475952, "eval_runtime": 746.4611, "eval_samples_per_second": 19.945, "eval_steps_per_second": 2.493, "step": 1075 }, { "epoch": 0.25, "grad_norm": 0.6462586522102356, "learning_rate": 9.200000000000001e-05, "loss": 2.04, "step": 1080 }, { "epoch": 0.25, "eval_loss": 1.9709925651550293, "eval_runtime": 793.086, "eval_samples_per_second": 18.772, "eval_steps_per_second": 2.347, "step": 1080 }, { "epoch": 0.25, "eval_loss": 1.9699233770370483, "eval_runtime": 772.4813, "eval_samples_per_second": 19.273, "eval_steps_per_second": 2.409, "step": 1085 }, { "epoch": 0.25, "grad_norm": 0.8134130835533142, "learning_rate": 9.1e-05, "loss": 1.9992, "step": 1090 }, { "epoch": 0.25, "eval_loss": 1.9690241813659668, "eval_runtime": 742.7771, "eval_samples_per_second": 20.044, "eval_steps_per_second": 2.505, "step": 1090 }, { "epoch": 0.25, "eval_loss": 1.9685667753219604, "eval_runtime": 742.8374, "eval_samples_per_second": 20.042, "eval_steps_per_second": 2.505, "step": 1095 }, { "epoch": 0.25, "grad_norm": 1.065739631652832, "learning_rate": 9e-05, "loss": 2.2447, "step": 1100 }, { "epoch": 0.25, "eval_loss": 1.9679232835769653, "eval_runtime": 743.058, "eval_samples_per_second": 20.036, "eval_steps_per_second": 2.505, "step": 1100 }, { "epoch": 0.25, "eval_loss": 1.967824101448059, "eval_runtime": 742.6615, "eval_samples_per_second": 20.047, "eval_steps_per_second": 2.506, "step": 1105 }, { "epoch": 0.26, "grad_norm": 0.8471609950065613, "learning_rate": 8.900000000000001e-05, "loss": 2.0276, "step": 1110 }, { "epoch": 0.26, "eval_loss": 1.9677886962890625, "eval_runtime": 742.686, "eval_samples_per_second": 20.046, "eval_steps_per_second": 2.506, "step": 1110 }, { "epoch": 0.26, "eval_loss": 1.9673609733581543, "eval_runtime": 742.7476, "eval_samples_per_second": 20.044, "eval_steps_per_second": 2.506, "step": 1115 }, { "epoch": 0.26, "grad_norm": 1.0944584608078003, "learning_rate": 8.800000000000001e-05, "loss": 1.9626, "step": 1120 }, { "epoch": 0.26, "eval_loss": 1.9674118757247925, "eval_runtime": 742.8656, "eval_samples_per_second": 20.041, "eval_steps_per_second": 2.505, "step": 1120 }, { "epoch": 0.26, "eval_loss": 1.966983675956726, "eval_runtime": 742.5557, "eval_samples_per_second": 20.05, "eval_steps_per_second": 2.506, "step": 1125 }, { "epoch": 0.26, "grad_norm": 1.0942668914794922, "learning_rate": 8.7e-05, "loss": 2.1137, "step": 1130 }, { "epoch": 0.26, "eval_loss": 1.966120719909668, "eval_runtime": 743.2494, "eval_samples_per_second": 20.031, "eval_steps_per_second": 2.504, "step": 1130 }, { "epoch": 0.26, "eval_loss": 1.9657062292099, "eval_runtime": 742.7716, "eval_samples_per_second": 20.044, "eval_steps_per_second": 2.505, "step": 1135 }, { "epoch": 0.26, "grad_norm": 0.9553364515304565, "learning_rate": 8.6e-05, "loss": 1.9172, "step": 1140 }, { "epoch": 0.26, "eval_loss": 1.9652293920516968, "eval_runtime": 742.8386, "eval_samples_per_second": 20.042, "eval_steps_per_second": 2.505, "step": 1140 }, { "epoch": 0.26, "eval_loss": 1.965266466140747, "eval_runtime": 742.7224, "eval_samples_per_second": 20.045, "eval_steps_per_second": 2.506, "step": 1145 }, { "epoch": 0.26, "grad_norm": 0.7700403332710266, "learning_rate": 8.5e-05, "loss": 1.969, "step": 1150 }, { "epoch": 0.26, "eval_loss": 1.9645556211471558, "eval_runtime": 742.7935, "eval_samples_per_second": 20.043, "eval_steps_per_second": 2.505, "step": 1150 }, { "epoch": 0.27, "eval_loss": 1.9638077020645142, "eval_runtime": 742.9212, "eval_samples_per_second": 20.04, "eval_steps_per_second": 2.505, "step": 1155 }, { "epoch": 0.27, "grad_norm": 0.6622421741485596, "learning_rate": 8.4e-05, "loss": 2.0767, "step": 1160 }, { "epoch": 0.27, "eval_loss": 1.9633898735046387, "eval_runtime": 743.0434, "eval_samples_per_second": 20.037, "eval_steps_per_second": 2.505, "step": 1160 }, { "epoch": 0.27, "eval_loss": 1.9631668329238892, "eval_runtime": 743.0892, "eval_samples_per_second": 20.035, "eval_steps_per_second": 2.504, "step": 1165 }, { "epoch": 0.27, "grad_norm": 0.902953565120697, "learning_rate": 8.3e-05, "loss": 1.9907, "step": 1170 }, { "epoch": 0.27, "eval_loss": 1.9637709856033325, "eval_runtime": 742.9531, "eval_samples_per_second": 20.039, "eval_steps_per_second": 2.505, "step": 1170 }, { "epoch": 0.27, "eval_loss": 1.964076042175293, "eval_runtime": 752.1865, "eval_samples_per_second": 19.793, "eval_steps_per_second": 2.474, "step": 1175 }, { "epoch": 0.27, "grad_norm": 0.6186763644218445, "learning_rate": 8.2e-05, "loss": 2.0733, "step": 1180 }, { "epoch": 0.27, "eval_loss": 1.9636982679367065, "eval_runtime": 806.7383, "eval_samples_per_second": 18.455, "eval_steps_per_second": 2.307, "step": 1180 }, { "epoch": 0.27, "eval_loss": 1.9626468420028687, "eval_runtime": 794.7659, "eval_samples_per_second": 18.733, "eval_steps_per_second": 2.342, "step": 1185 }, { "epoch": 0.27, "grad_norm": 2.3641421794891357, "learning_rate": 8.1e-05, "loss": 2.2387, "step": 1190 }, { "epoch": 0.27, "eval_loss": 1.962171196937561, "eval_runtime": 742.946, "eval_samples_per_second": 20.039, "eval_steps_per_second": 2.505, "step": 1190 }, { "epoch": 0.28, "eval_loss": 1.9623751640319824, "eval_runtime": 748.6093, "eval_samples_per_second": 19.888, "eval_steps_per_second": 2.486, "step": 1195 }, { "epoch": 0.28, "grad_norm": 0.8681169748306274, "learning_rate": 8e-05, "loss": 1.7697, "step": 1200 }, { "epoch": 0.28, "eval_loss": 1.9623966217041016, "eval_runtime": 747.3889, "eval_samples_per_second": 19.92, "eval_steps_per_second": 2.49, "step": 1200 }, { "epoch": 0.28, "eval_loss": 1.9618523120880127, "eval_runtime": 747.8306, "eval_samples_per_second": 19.908, "eval_steps_per_second": 2.489, "step": 1205 }, { "epoch": 0.28, "grad_norm": 0.8061439990997314, "learning_rate": 7.900000000000001e-05, "loss": 1.8372, "step": 1210 }, { "epoch": 0.28, "eval_loss": 1.9610604047775269, "eval_runtime": 754.4243, "eval_samples_per_second": 19.734, "eval_steps_per_second": 2.467, "step": 1210 }, { "epoch": 0.28, "eval_loss": 1.9600740671157837, "eval_runtime": 773.0575, "eval_samples_per_second": 19.259, "eval_steps_per_second": 2.407, "step": 1215 }, { "epoch": 0.28, "grad_norm": 0.6680261492729187, "learning_rate": 7.800000000000001e-05, "loss": 1.8579, "step": 1220 }, { "epoch": 0.28, "eval_loss": 1.9600106477737427, "eval_runtime": 758.3783, "eval_samples_per_second": 19.631, "eval_steps_per_second": 2.454, "step": 1220 }, { "epoch": 0.28, "eval_loss": 1.960121512413025, "eval_runtime": 742.6905, "eval_samples_per_second": 20.046, "eval_steps_per_second": 2.506, "step": 1225 }, { "epoch": 0.28, "grad_norm": 0.7453675866127014, "learning_rate": 7.7e-05, "loss": 2.0669, "step": 1230 }, { "epoch": 0.28, "eval_loss": 1.9599344730377197, "eval_runtime": 742.6985, "eval_samples_per_second": 20.046, "eval_steps_per_second": 2.506, "step": 1230 }, { "epoch": 0.28, "eval_loss": 1.9595812559127808, "eval_runtime": 743.459, "eval_samples_per_second": 20.025, "eval_steps_per_second": 2.503, "step": 1235 }, { "epoch": 0.29, "grad_norm": 0.8137094378471375, "learning_rate": 7.6e-05, "loss": 1.8052, "step": 1240 }, { "epoch": 0.29, "eval_loss": 1.959162950515747, "eval_runtime": 762.4324, "eval_samples_per_second": 19.527, "eval_steps_per_second": 2.441, "step": 1240 }, { "epoch": 0.29, "eval_loss": 1.958164930343628, "eval_runtime": 760.8758, "eval_samples_per_second": 19.567, "eval_steps_per_second": 2.446, "step": 1245 }, { "epoch": 0.29, "grad_norm": 0.9641257524490356, "learning_rate": 7.500000000000001e-05, "loss": 2.012, "step": 1250 }, { "epoch": 0.29, "eval_loss": 1.9580010175704956, "eval_runtime": 758.6208, "eval_samples_per_second": 19.625, "eval_steps_per_second": 2.453, "step": 1250 }, { "epoch": 0.29, "eval_loss": 1.9579390287399292, "eval_runtime": 756.8034, "eval_samples_per_second": 19.672, "eval_steps_per_second": 2.459, "step": 1255 }, { "epoch": 0.29, "grad_norm": 0.9015198349952698, "learning_rate": 7.4e-05, "loss": 1.7978, "step": 1260 }, { "epoch": 0.29, "eval_loss": 1.957844614982605, "eval_runtime": 762.8272, "eval_samples_per_second": 19.517, "eval_steps_per_second": 2.44, "step": 1260 }, { "epoch": 0.29, "eval_loss": 1.9587702751159668, "eval_runtime": 757.6095, "eval_samples_per_second": 19.651, "eval_steps_per_second": 2.456, "step": 1265 }, { "epoch": 0.29, "grad_norm": 0.6779741644859314, "learning_rate": 7.3e-05, "loss": 2.2184, "step": 1270 }, { "epoch": 0.29, "eval_loss": 1.9595640897750854, "eval_runtime": 752.2491, "eval_samples_per_second": 19.791, "eval_steps_per_second": 2.474, "step": 1270 }, { "epoch": 0.29, "eval_loss": 1.9594327211380005, "eval_runtime": 753.8257, "eval_samples_per_second": 19.75, "eval_steps_per_second": 2.469, "step": 1275 }, { "epoch": 0.29, "grad_norm": 1.2027363777160645, "learning_rate": 7.2e-05, "loss": 2.1702, "step": 1280 }, { "epoch": 0.29, "eval_loss": 1.9583239555358887, "eval_runtime": 762.8807, "eval_samples_per_second": 19.516, "eval_steps_per_second": 2.439, "step": 1280 }, { "epoch": 0.3, "eval_loss": 1.9568368196487427, "eval_runtime": 760.0025, "eval_samples_per_second": 19.589, "eval_steps_per_second": 2.449, "step": 1285 }, { "epoch": 0.3, "grad_norm": 0.690773069858551, "learning_rate": 7.1e-05, "loss": 2.1691, "step": 1290 }, { "epoch": 0.3, "eval_loss": 1.9561883211135864, "eval_runtime": 763.7974, "eval_samples_per_second": 19.492, "eval_steps_per_second": 2.437, "step": 1290 }, { "epoch": 0.3, "eval_loss": 1.9560072422027588, "eval_runtime": 750.1629, "eval_samples_per_second": 19.846, "eval_steps_per_second": 2.481, "step": 1295 }, { "epoch": 0.3, "grad_norm": 0.5568638443946838, "learning_rate": 7e-05, "loss": 2.0352, "step": 1300 }, { "epoch": 0.3, "eval_loss": 1.9558581113815308, "eval_runtime": 771.8238, "eval_samples_per_second": 19.289, "eval_steps_per_second": 2.411, "step": 1300 }, { "epoch": 0.3, "eval_loss": 1.9561656713485718, "eval_runtime": 761.3654, "eval_samples_per_second": 19.554, "eval_steps_per_second": 2.444, "step": 1305 }, { "epoch": 0.3, "grad_norm": 0.9243978261947632, "learning_rate": 6.9e-05, "loss": 2.1049, "step": 1310 }, { "epoch": 0.3, "eval_loss": 1.9566450119018555, "eval_runtime": 774.2138, "eval_samples_per_second": 19.23, "eval_steps_per_second": 2.404, "step": 1310 }, { "epoch": 0.3, "eval_loss": 1.956655502319336, "eval_runtime": 757.6655, "eval_samples_per_second": 19.65, "eval_steps_per_second": 2.456, "step": 1315 }, { "epoch": 0.3, "grad_norm": 0.799391508102417, "learning_rate": 6.800000000000001e-05, "loss": 1.9632, "step": 1320 }, { "epoch": 0.3, "eval_loss": 1.9563167095184326, "eval_runtime": 775.3068, "eval_samples_per_second": 19.203, "eval_steps_per_second": 2.4, "step": 1320 }, { "epoch": 0.31, "eval_loss": 1.955706000328064, "eval_runtime": 775.6046, "eval_samples_per_second": 19.195, "eval_steps_per_second": 2.399, "step": 1325 }, { "epoch": 0.31, "grad_norm": 0.5652315020561218, "learning_rate": 6.7e-05, "loss": 2.1231, "step": 1330 }, { "epoch": 0.31, "eval_loss": 1.9552385807037354, "eval_runtime": 774.2845, "eval_samples_per_second": 19.228, "eval_steps_per_second": 2.404, "step": 1330 }, { "epoch": 0.31, "eval_loss": 1.9548553228378296, "eval_runtime": 804.564, "eval_samples_per_second": 18.504, "eval_steps_per_second": 2.313, "step": 1335 }, { "epoch": 0.31, "grad_norm": 0.838487446308136, "learning_rate": 6.6e-05, "loss": 2.1477, "step": 1340 }, { "epoch": 0.31, "eval_loss": 1.9545986652374268, "eval_runtime": 804.7596, "eval_samples_per_second": 18.5, "eval_steps_per_second": 2.312, "step": 1340 }, { "epoch": 0.31, "eval_loss": 1.954168438911438, "eval_runtime": 804.8099, "eval_samples_per_second": 18.499, "eval_steps_per_second": 2.312, "step": 1345 }, { "epoch": 0.31, "grad_norm": 0.6252363920211792, "learning_rate": 6.500000000000001e-05, "loss": 1.9284, "step": 1350 }, { "epoch": 0.31, "eval_loss": 1.9540753364562988, "eval_runtime": 756.5934, "eval_samples_per_second": 19.678, "eval_steps_per_second": 2.46, "step": 1350 }, { "epoch": 0.31, "eval_loss": 1.9538133144378662, "eval_runtime": 753.786, "eval_samples_per_second": 19.751, "eval_steps_per_second": 2.469, "step": 1355 }, { "epoch": 0.31, "grad_norm": 0.7139986753463745, "learning_rate": 6.400000000000001e-05, "loss": 1.9112, "step": 1360 }, { "epoch": 0.31, "eval_loss": 1.9534461498260498, "eval_runtime": 752.6982, "eval_samples_per_second": 19.78, "eval_steps_per_second": 2.472, "step": 1360 }, { "epoch": 0.31, "eval_loss": 1.9531402587890625, "eval_runtime": 753.832, "eval_samples_per_second": 19.75, "eval_steps_per_second": 2.469, "step": 1365 }, { "epoch": 0.32, "grad_norm": 0.7954660058021545, "learning_rate": 6.3e-05, "loss": 1.879, "step": 1370 }, { "epoch": 0.32, "eval_loss": 1.9530184268951416, "eval_runtime": 749.4503, "eval_samples_per_second": 19.865, "eval_steps_per_second": 2.483, "step": 1370 }, { "epoch": 0.32, "eval_loss": 1.9531172513961792, "eval_runtime": 751.563, "eval_samples_per_second": 19.809, "eval_steps_per_second": 2.476, "step": 1375 }, { "epoch": 0.32, "grad_norm": 0.6721971035003662, "learning_rate": 6.2e-05, "loss": 1.793, "step": 1380 }, { "epoch": 0.32, "eval_loss": 1.9533251523971558, "eval_runtime": 760.6235, "eval_samples_per_second": 19.573, "eval_steps_per_second": 2.447, "step": 1380 }, { "epoch": 0.32, "eval_loss": 1.9533405303955078, "eval_runtime": 762.9017, "eval_samples_per_second": 19.515, "eval_steps_per_second": 2.439, "step": 1385 }, { "epoch": 0.32, "grad_norm": 0.9299044609069824, "learning_rate": 6.1e-05, "loss": 2.0359, "step": 1390 }, { "epoch": 0.32, "eval_loss": 1.9533050060272217, "eval_runtime": 770.8252, "eval_samples_per_second": 19.314, "eval_steps_per_second": 2.414, "step": 1390 }, { "epoch": 0.32, "eval_loss": 1.9530737400054932, "eval_runtime": 774.7678, "eval_samples_per_second": 19.216, "eval_steps_per_second": 2.402, "step": 1395 }, { "epoch": 0.32, "grad_norm": 0.8456938862800598, "learning_rate": 6e-05, "loss": 2.1071, "step": 1400 }, { "epoch": 0.32, "eval_loss": 1.9526972770690918, "eval_runtime": 785.1544, "eval_samples_per_second": 18.962, "eval_steps_per_second": 2.37, "step": 1400 }, { "epoch": 0.32, "eval_loss": 1.9520927667617798, "eval_runtime": 774.7517, "eval_samples_per_second": 19.216, "eval_steps_per_second": 2.402, "step": 1405 }, { "epoch": 0.32, "grad_norm": 0.8644620180130005, "learning_rate": 5.9e-05, "loss": 1.922, "step": 1410 }, { "epoch": 0.32, "eval_loss": 1.951416015625, "eval_runtime": 770.2102, "eval_samples_per_second": 19.33, "eval_steps_per_second": 2.416, "step": 1410 }, { "epoch": 0.33, "eval_loss": 1.951120138168335, "eval_runtime": 776.0343, "eval_samples_per_second": 19.185, "eval_steps_per_second": 2.398, "step": 1415 }, { "epoch": 0.33, "grad_norm": 0.7249743342399597, "learning_rate": 5.8e-05, "loss": 1.8578, "step": 1420 }, { "epoch": 0.33, "eval_loss": 1.9509921073913574, "eval_runtime": 786.6, "eval_samples_per_second": 18.927, "eval_steps_per_second": 2.366, "step": 1420 }, { "epoch": 0.33, "eval_loss": 1.9510488510131836, "eval_runtime": 776.8377, "eval_samples_per_second": 19.165, "eval_steps_per_second": 2.396, "step": 1425 }, { "epoch": 0.33, "grad_norm": 0.6929222941398621, "learning_rate": 5.6999999999999996e-05, "loss": 2.2087, "step": 1430 }, { "epoch": 0.33, "eval_loss": 1.9513132572174072, "eval_runtime": 759.5413, "eval_samples_per_second": 19.601, "eval_steps_per_second": 2.45, "step": 1430 }, { "epoch": 0.33, "eval_loss": 1.9508575201034546, "eval_runtime": 762.6392, "eval_samples_per_second": 19.522, "eval_steps_per_second": 2.44, "step": 1435 }, { "epoch": 0.33, "grad_norm": 0.7323941588401794, "learning_rate": 5.6000000000000006e-05, "loss": 2.1452, "step": 1440 }, { "epoch": 0.33, "eval_loss": 1.950052261352539, "eval_runtime": 744.7027, "eval_samples_per_second": 19.992, "eval_steps_per_second": 2.499, "step": 1440 }, { "epoch": 0.33, "eval_loss": 1.9493366479873657, "eval_runtime": 740.8795, "eval_samples_per_second": 20.095, "eval_steps_per_second": 2.512, "step": 1445 }, { "epoch": 0.33, "grad_norm": 0.9700380563735962, "learning_rate": 5.500000000000001e-05, "loss": 1.945, "step": 1450 }, { "epoch": 0.33, "eval_loss": 1.9489691257476807, "eval_runtime": 740.6194, "eval_samples_per_second": 20.102, "eval_steps_per_second": 2.513, "step": 1450 }, { "epoch": 0.34, "eval_loss": 1.9490820169448853, "eval_runtime": 740.5722, "eval_samples_per_second": 20.103, "eval_steps_per_second": 2.513, "step": 1455 }, { "epoch": 0.34, "grad_norm": 0.704833984375, "learning_rate": 5.4000000000000005e-05, "loss": 2.158, "step": 1460 }, { "epoch": 0.34, "eval_loss": 1.949270486831665, "eval_runtime": 743.446, "eval_samples_per_second": 20.026, "eval_steps_per_second": 2.503, "step": 1460 }, { "epoch": 0.34, "eval_loss": 1.9491441249847412, "eval_runtime": 768.2882, "eval_samples_per_second": 19.378, "eval_steps_per_second": 2.422, "step": 1465 }, { "epoch": 0.34, "grad_norm": 0.7452517747879028, "learning_rate": 5.300000000000001e-05, "loss": 2.1391, "step": 1470 }, { "epoch": 0.34, "eval_loss": 1.949285864830017, "eval_runtime": 784.4371, "eval_samples_per_second": 18.979, "eval_steps_per_second": 2.372, "step": 1470 }, { "epoch": 0.34, "eval_loss": 1.9491676092147827, "eval_runtime": 779.509, "eval_samples_per_second": 19.099, "eval_steps_per_second": 2.387, "step": 1475 }, { "epoch": 0.34, "grad_norm": 1.1882747411727905, "learning_rate": 5.2000000000000004e-05, "loss": 2.0945, "step": 1480 }, { "epoch": 0.34, "eval_loss": 1.9490594863891602, "eval_runtime": 773.5132, "eval_samples_per_second": 19.247, "eval_steps_per_second": 2.406, "step": 1480 }, { "epoch": 0.34, "eval_loss": 1.9491394758224487, "eval_runtime": 769.1156, "eval_samples_per_second": 19.357, "eval_steps_per_second": 2.42, "step": 1485 }, { "epoch": 0.34, "grad_norm": 0.636154055595398, "learning_rate": 5.1000000000000006e-05, "loss": 1.812, "step": 1490 }, { "epoch": 0.34, "eval_loss": 1.9490303993225098, "eval_runtime": 800.4965, "eval_samples_per_second": 18.598, "eval_steps_per_second": 2.325, "step": 1490 }, { "epoch": 0.34, "eval_loss": 1.9486448764801025, "eval_runtime": 800.2384, "eval_samples_per_second": 18.604, "eval_steps_per_second": 2.326, "step": 1495 }, { "epoch": 0.35, "grad_norm": 0.7470951676368713, "learning_rate": 5e-05, "loss": 2.0111, "step": 1500 }, { "epoch": 0.35, "eval_loss": 1.9481943845748901, "eval_runtime": 799.7192, "eval_samples_per_second": 18.617, "eval_steps_per_second": 2.327, "step": 1500 }, { "epoch": 0.35, "eval_loss": 1.9475739002227783, "eval_runtime": 799.7239, "eval_samples_per_second": 18.616, "eval_steps_per_second": 2.327, "step": 1505 }, { "epoch": 0.35, "grad_norm": 1.1213610172271729, "learning_rate": 4.9e-05, "loss": 2.2712, "step": 1510 }, { "epoch": 0.35, "eval_loss": 1.9474812746047974, "eval_runtime": 799.7984, "eval_samples_per_second": 18.615, "eval_steps_per_second": 2.327, "step": 1510 }, { "epoch": 0.35, "eval_loss": 1.947679877281189, "eval_runtime": 799.9014, "eval_samples_per_second": 18.612, "eval_steps_per_second": 2.327, "step": 1515 }, { "epoch": 0.35, "grad_norm": 1.1590468883514404, "learning_rate": 4.8e-05, "loss": 2.0296, "step": 1520 }, { "epoch": 0.35, "eval_loss": 1.947396159172058, "eval_runtime": 799.8052, "eval_samples_per_second": 18.615, "eval_steps_per_second": 2.327, "step": 1520 }, { "epoch": 0.35, "eval_loss": 1.9468810558319092, "eval_runtime": 799.9551, "eval_samples_per_second": 18.611, "eval_steps_per_second": 2.326, "step": 1525 }, { "epoch": 0.35, "grad_norm": 0.7615826725959778, "learning_rate": 4.7e-05, "loss": 1.8507, "step": 1530 }, { "epoch": 0.35, "eval_loss": 1.9464668035507202, "eval_runtime": 799.7366, "eval_samples_per_second": 18.616, "eval_steps_per_second": 2.327, "step": 1530 }, { "epoch": 0.35, "eval_loss": 1.9460117816925049, "eval_runtime": 799.5384, "eval_samples_per_second": 18.621, "eval_steps_per_second": 2.328, "step": 1535 }, { "epoch": 0.35, "grad_norm": 1.255834698677063, "learning_rate": 4.600000000000001e-05, "loss": 1.9103, "step": 1540 }, { "epoch": 0.35, "eval_loss": 1.9458439350128174, "eval_runtime": 799.7058, "eval_samples_per_second": 18.617, "eval_steps_per_second": 2.327, "step": 1540 }, { "epoch": 0.36, "eval_loss": 1.9456100463867188, "eval_runtime": 799.6359, "eval_samples_per_second": 18.618, "eval_steps_per_second": 2.327, "step": 1545 }, { "epoch": 0.36, "grad_norm": 0.8211413621902466, "learning_rate": 4.5e-05, "loss": 1.9487, "step": 1550 }, { "epoch": 0.36, "eval_loss": 1.94538152217865, "eval_runtime": 799.8615, "eval_samples_per_second": 18.613, "eval_steps_per_second": 2.327, "step": 1550 }, { "epoch": 0.36, "eval_loss": 1.9452861547470093, "eval_runtime": 799.6185, "eval_samples_per_second": 18.619, "eval_steps_per_second": 2.327, "step": 1555 }, { "epoch": 0.36, "grad_norm": 1.0679750442504883, "learning_rate": 4.4000000000000006e-05, "loss": 2.0504, "step": 1560 }, { "epoch": 0.36, "eval_loss": 1.9451079368591309, "eval_runtime": 799.6427, "eval_samples_per_second": 18.618, "eval_steps_per_second": 2.327, "step": 1560 }, { "epoch": 0.36, "eval_loss": 1.945074439048767, "eval_runtime": 799.6799, "eval_samples_per_second": 18.617, "eval_steps_per_second": 2.327, "step": 1565 }, { "epoch": 0.36, "grad_norm": 0.6025732159614563, "learning_rate": 4.3e-05, "loss": 2.1497, "step": 1570 }, { "epoch": 0.36, "eval_loss": 1.9450632333755493, "eval_runtime": 799.6958, "eval_samples_per_second": 18.617, "eval_steps_per_second": 2.327, "step": 1570 }, { "epoch": 0.36, "eval_loss": 1.9450970888137817, "eval_runtime": 799.7035, "eval_samples_per_second": 18.617, "eval_steps_per_second": 2.327, "step": 1575 }, { "epoch": 0.36, "grad_norm": 0.9095989465713501, "learning_rate": 4.2e-05, "loss": 1.9836, "step": 1580 }, { "epoch": 0.36, "eval_loss": 1.9450123310089111, "eval_runtime": 799.6334, "eval_samples_per_second": 18.619, "eval_steps_per_second": 2.327, "step": 1580 }, { "epoch": 0.37, "eval_loss": 1.9449422359466553, "eval_runtime": 799.7912, "eval_samples_per_second": 18.615, "eval_steps_per_second": 2.327, "step": 1585 }, { "epoch": 0.37, "grad_norm": 0.714409351348877, "learning_rate": 4.1e-05, "loss": 1.8096, "step": 1590 }, { "epoch": 0.37, "eval_loss": 1.944868803024292, "eval_runtime": 799.8495, "eval_samples_per_second": 18.614, "eval_steps_per_second": 2.327, "step": 1590 }, { "epoch": 0.37, "eval_loss": 1.9445998668670654, "eval_runtime": 799.4538, "eval_samples_per_second": 18.623, "eval_steps_per_second": 2.328, "step": 1595 }, { "epoch": 0.37, "grad_norm": 0.5383341908454895, "learning_rate": 4e-05, "loss": 2.0759, "step": 1600 }, { "epoch": 0.37, "eval_loss": 1.9442588090896606, "eval_runtime": 777.9784, "eval_samples_per_second": 19.137, "eval_steps_per_second": 2.392, "step": 1600 }, { "epoch": 0.37, "eval_loss": 1.9438750743865967, "eval_runtime": 738.7615, "eval_samples_per_second": 20.153, "eval_steps_per_second": 2.519, "step": 1605 }, { "epoch": 0.37, "grad_norm": 0.8301254510879517, "learning_rate": 3.9000000000000006e-05, "loss": 2.0214, "step": 1610 }, { "epoch": 0.37, "eval_loss": 1.9435011148452759, "eval_runtime": 739.0062, "eval_samples_per_second": 20.146, "eval_steps_per_second": 2.518, "step": 1610 }, { "epoch": 0.37, "eval_loss": 1.9431562423706055, "eval_runtime": 738.2469, "eval_samples_per_second": 20.167, "eval_steps_per_second": 2.521, "step": 1615 }, { "epoch": 0.37, "grad_norm": 0.9721934795379639, "learning_rate": 3.8e-05, "loss": 1.9382, "step": 1620 }, { "epoch": 0.37, "eval_loss": 1.9430102109909058, "eval_runtime": 738.2353, "eval_samples_per_second": 20.167, "eval_steps_per_second": 2.521, "step": 1620 }, { "epoch": 0.37, "eval_loss": 1.9429086446762085, "eval_runtime": 738.1793, "eval_samples_per_second": 20.169, "eval_steps_per_second": 2.521, "step": 1625 }, { "epoch": 0.38, "grad_norm": 0.9744455218315125, "learning_rate": 3.7e-05, "loss": 2.0086, "step": 1630 }, { "epoch": 0.38, "eval_loss": 1.942738652229309, "eval_runtime": 738.1789, "eval_samples_per_second": 20.169, "eval_steps_per_second": 2.521, "step": 1630 }, { "epoch": 0.38, "eval_loss": 1.9425232410430908, "eval_runtime": 738.3004, "eval_samples_per_second": 20.165, "eval_steps_per_second": 2.521, "step": 1635 }, { "epoch": 0.38, "grad_norm": 0.9008609652519226, "learning_rate": 3.6e-05, "loss": 2.1409, "step": 1640 }, { "epoch": 0.38, "eval_loss": 1.9424079656600952, "eval_runtime": 738.2426, "eval_samples_per_second": 20.167, "eval_steps_per_second": 2.521, "step": 1640 }, { "epoch": 0.38, "eval_loss": 1.9424738883972168, "eval_runtime": 738.2907, "eval_samples_per_second": 20.165, "eval_steps_per_second": 2.521, "step": 1645 }, { "epoch": 0.38, "grad_norm": 0.7677657604217529, "learning_rate": 3.5e-05, "loss": 2.067, "step": 1650 }, { "epoch": 0.38, "eval_loss": 1.9425007104873657, "eval_runtime": 742.0127, "eval_samples_per_second": 20.064, "eval_steps_per_second": 2.508, "step": 1650 }, { "epoch": 0.38, "eval_loss": 1.9425512552261353, "eval_runtime": 750.5258, "eval_samples_per_second": 19.837, "eval_steps_per_second": 2.48, "step": 1655 }, { "epoch": 0.38, "grad_norm": 0.6732711791992188, "learning_rate": 3.4000000000000007e-05, "loss": 1.7463, "step": 1660 }, { "epoch": 0.38, "eval_loss": 1.9426753520965576, "eval_runtime": 748.9114, "eval_samples_per_second": 19.88, "eval_steps_per_second": 2.485, "step": 1660 }, { "epoch": 0.38, "eval_loss": 1.942758560180664, "eval_runtime": 746.9293, "eval_samples_per_second": 19.932, "eval_steps_per_second": 2.492, "step": 1665 }, { "epoch": 0.38, "grad_norm": 0.9339891076087952, "learning_rate": 3.3e-05, "loss": 2.001, "step": 1670 }, { "epoch": 0.38, "eval_loss": 1.9427272081375122, "eval_runtime": 797.2689, "eval_samples_per_second": 18.674, "eval_steps_per_second": 2.334, "step": 1670 }, { "epoch": 0.39, "eval_loss": 1.9428359270095825, "eval_runtime": 800.5836, "eval_samples_per_second": 18.596, "eval_steps_per_second": 2.325, "step": 1675 }, { "epoch": 0.39, "grad_norm": 0.8810248374938965, "learning_rate": 3.2000000000000005e-05, "loss": 1.8671, "step": 1680 }, { "epoch": 0.39, "eval_loss": 1.9427249431610107, "eval_runtime": 799.8884, "eval_samples_per_second": 18.613, "eval_steps_per_second": 2.327, "step": 1680 }, { "epoch": 0.39, "eval_loss": 1.9424289464950562, "eval_runtime": 799.7098, "eval_samples_per_second": 18.617, "eval_steps_per_second": 2.327, "step": 1685 }, { "epoch": 0.39, "grad_norm": 0.9490606784820557, "learning_rate": 3.1e-05, "loss": 1.9094, "step": 1690 }, { "epoch": 0.39, "eval_loss": 1.9420220851898193, "eval_runtime": 752.2407, "eval_samples_per_second": 19.792, "eval_steps_per_second": 2.474, "step": 1690 }, { "epoch": 0.39, "eval_loss": 1.9419386386871338, "eval_runtime": 738.4421, "eval_samples_per_second": 20.161, "eval_steps_per_second": 2.52, "step": 1695 }, { "epoch": 0.39, "grad_norm": 0.7382946610450745, "learning_rate": 3e-05, "loss": 2.048, "step": 1700 }, { "epoch": 0.39, "eval_loss": 1.9419728517532349, "eval_runtime": 738.4148, "eval_samples_per_second": 20.162, "eval_steps_per_second": 2.52, "step": 1700 }, { "epoch": 0.39, "eval_loss": 1.9419692754745483, "eval_runtime": 744.3255, "eval_samples_per_second": 20.002, "eval_steps_per_second": 2.5, "step": 1705 }, { "epoch": 0.39, "grad_norm": 0.8462529182434082, "learning_rate": 2.9e-05, "loss": 1.9912, "step": 1710 }, { "epoch": 0.39, "eval_loss": 1.9418373107910156, "eval_runtime": 745.3964, "eval_samples_per_second": 19.973, "eval_steps_per_second": 2.497, "step": 1710 }, { "epoch": 0.39, "eval_loss": 1.9416935443878174, "eval_runtime": 745.4722, "eval_samples_per_second": 19.971, "eval_steps_per_second": 2.496, "step": 1715 }, { "epoch": 0.4, "grad_norm": 1.0404905080795288, "learning_rate": 2.8000000000000003e-05, "loss": 2.1166, "step": 1720 }, { "epoch": 0.4, "eval_loss": 1.9415353536605835, "eval_runtime": 744.4599, "eval_samples_per_second": 19.998, "eval_steps_per_second": 2.5, "step": 1720 }, { "epoch": 0.4, "eval_loss": 1.941475510597229, "eval_runtime": 747.4114, "eval_samples_per_second": 19.919, "eval_steps_per_second": 2.49, "step": 1725 }, { "epoch": 0.4, "grad_norm": 0.9904809594154358, "learning_rate": 2.7000000000000002e-05, "loss": 1.8904, "step": 1730 }, { "epoch": 0.4, "eval_loss": 1.9413694143295288, "eval_runtime": 745.7476, "eval_samples_per_second": 19.964, "eval_steps_per_second": 2.495, "step": 1730 }, { "epoch": 0.4, "eval_loss": 1.9412492513656616, "eval_runtime": 746.5069, "eval_samples_per_second": 19.944, "eval_steps_per_second": 2.493, "step": 1735 }, { "epoch": 0.4, "grad_norm": 0.8735015988349915, "learning_rate": 2.6000000000000002e-05, "loss": 1.926, "step": 1740 }, { "epoch": 0.4, "eval_loss": 1.9410345554351807, "eval_runtime": 745.8571, "eval_samples_per_second": 19.961, "eval_steps_per_second": 2.495, "step": 1740 }, { "epoch": 0.4, "eval_loss": 1.940790057182312, "eval_runtime": 745.3703, "eval_samples_per_second": 19.974, "eval_steps_per_second": 2.497, "step": 1745 }, { "epoch": 0.4, "grad_norm": 0.8582627177238464, "learning_rate": 2.5e-05, "loss": 2.0692, "step": 1750 }, { "epoch": 0.4, "eval_loss": 1.9406797885894775, "eval_runtime": 759.2984, "eval_samples_per_second": 19.608, "eval_steps_per_second": 2.451, "step": 1750 }, { "epoch": 0.4, "eval_loss": 1.9406994581222534, "eval_runtime": 799.6272, "eval_samples_per_second": 18.619, "eval_steps_per_second": 2.327, "step": 1755 }, { "epoch": 0.41, "grad_norm": 0.8529316782951355, "learning_rate": 2.4e-05, "loss": 2.0355, "step": 1760 }, { "epoch": 0.41, "eval_loss": 1.9407473802566528, "eval_runtime": 799.5818, "eval_samples_per_second": 18.62, "eval_steps_per_second": 2.327, "step": 1760 }, { "epoch": 0.41, "eval_loss": 1.9407432079315186, "eval_runtime": 799.2768, "eval_samples_per_second": 18.627, "eval_steps_per_second": 2.328, "step": 1765 }, { "epoch": 0.41, "grad_norm": 0.9113526940345764, "learning_rate": 2.3000000000000003e-05, "loss": 1.9542, "step": 1770 }, { "epoch": 0.41, "eval_loss": 1.9405388832092285, "eval_runtime": 799.4163, "eval_samples_per_second": 18.624, "eval_steps_per_second": 2.328, "step": 1770 }, { "epoch": 0.41, "eval_loss": 1.9403679370880127, "eval_runtime": 799.618, "eval_samples_per_second": 18.619, "eval_steps_per_second": 2.327, "step": 1775 }, { "epoch": 0.41, "grad_norm": 0.6997771859169006, "learning_rate": 2.2000000000000003e-05, "loss": 1.9599, "step": 1780 }, { "epoch": 0.41, "eval_loss": 1.9402695894241333, "eval_runtime": 799.4944, "eval_samples_per_second": 18.622, "eval_steps_per_second": 2.328, "step": 1780 }, { "epoch": 0.41, "eval_loss": 1.9402552843093872, "eval_runtime": 799.5449, "eval_samples_per_second": 18.621, "eval_steps_per_second": 2.328, "step": 1785 }, { "epoch": 0.41, "grad_norm": 0.6518684029579163, "learning_rate": 2.1e-05, "loss": 1.8072, "step": 1790 }, { "epoch": 0.41, "eval_loss": 1.9402391910552979, "eval_runtime": 799.5838, "eval_samples_per_second": 18.62, "eval_steps_per_second": 2.327, "step": 1790 }, { "epoch": 0.41, "eval_loss": 1.9402625560760498, "eval_runtime": 764.0189, "eval_samples_per_second": 19.486, "eval_steps_per_second": 2.436, "step": 1795 }, { "epoch": 0.41, "grad_norm": 1.0698702335357666, "learning_rate": 2e-05, "loss": 2.0761, "step": 1800 }, { "epoch": 0.41, "eval_loss": 1.9402350187301636, "eval_runtime": 766.71, "eval_samples_per_second": 19.418, "eval_steps_per_second": 2.427, "step": 1800 }, { "epoch": 0.42, "eval_loss": 1.94016695022583, "eval_runtime": 767.2932, "eval_samples_per_second": 19.403, "eval_steps_per_second": 2.425, "step": 1805 }, { "epoch": 0.42, "grad_norm": 1.5021382570266724, "learning_rate": 1.9e-05, "loss": 2.1248, "step": 1810 }, { "epoch": 0.42, "eval_loss": 1.9400701522827148, "eval_runtime": 751.0521, "eval_samples_per_second": 19.823, "eval_steps_per_second": 2.478, "step": 1810 }, { "epoch": 0.42, "eval_loss": 1.9399473667144775, "eval_runtime": 781.0617, "eval_samples_per_second": 19.061, "eval_steps_per_second": 2.383, "step": 1815 }, { "epoch": 0.42, "grad_norm": 0.5781351327896118, "learning_rate": 1.8e-05, "loss": 1.8231, "step": 1820 }, { "epoch": 0.42, "eval_loss": 1.9398096799850464, "eval_runtime": 786.8736, "eval_samples_per_second": 18.92, "eval_steps_per_second": 2.365, "step": 1820 }, { "epoch": 0.42, "eval_loss": 1.93966805934906, "eval_runtime": 768.2598, "eval_samples_per_second": 19.379, "eval_steps_per_second": 2.422, "step": 1825 }, { "epoch": 0.42, "grad_norm": 0.6509262919425964, "learning_rate": 1.7000000000000003e-05, "loss": 2.0489, "step": 1830 }, { "epoch": 0.42, "eval_loss": 1.9395979642868042, "eval_runtime": 751.1152, "eval_samples_per_second": 19.821, "eval_steps_per_second": 2.478, "step": 1830 }, { "epoch": 0.42, "eval_loss": 1.9395313262939453, "eval_runtime": 759.4106, "eval_samples_per_second": 19.605, "eval_steps_per_second": 2.451, "step": 1835 }, { "epoch": 0.42, "grad_norm": 1.2815340757369995, "learning_rate": 1.6000000000000003e-05, "loss": 2.0146, "step": 1840 }, { "epoch": 0.42, "eval_loss": 1.939477562904358, "eval_runtime": 749.6182, "eval_samples_per_second": 19.861, "eval_steps_per_second": 2.483, "step": 1840 }, { "epoch": 0.42, "eval_loss": 1.9394245147705078, "eval_runtime": 752.8584, "eval_samples_per_second": 19.775, "eval_steps_per_second": 2.472, "step": 1845 }, { "epoch": 0.43, "grad_norm": 0.43942534923553467, "learning_rate": 1.5e-05, "loss": 1.942, "step": 1850 }, { "epoch": 0.43, "eval_loss": 1.9393796920776367, "eval_runtime": 759.4255, "eval_samples_per_second": 19.604, "eval_steps_per_second": 2.451, "step": 1850 }, { "epoch": 0.43, "eval_loss": 1.9393460750579834, "eval_runtime": 804.7704, "eval_samples_per_second": 18.5, "eval_steps_per_second": 2.312, "step": 1855 }, { "epoch": 0.43, "grad_norm": 0.7411811947822571, "learning_rate": 1.4000000000000001e-05, "loss": 2.1642, "step": 1860 }, { "epoch": 0.43, "eval_loss": 1.9392962455749512, "eval_runtime": 804.5633, "eval_samples_per_second": 18.504, "eval_steps_per_second": 2.313, "step": 1860 }, { "epoch": 0.43, "eval_loss": 1.939192533493042, "eval_runtime": 795.592, "eval_samples_per_second": 18.713, "eval_steps_per_second": 2.339, "step": 1865 }, { "epoch": 0.43, "grad_norm": 0.844436764717102, "learning_rate": 1.3000000000000001e-05, "loss": 1.8236, "step": 1870 }, { "epoch": 0.43, "eval_loss": 1.939056634902954, "eval_runtime": 752.4134, "eval_samples_per_second": 19.787, "eval_steps_per_second": 2.473, "step": 1870 }, { "epoch": 0.43, "eval_loss": 1.9389359951019287, "eval_runtime": 748.7084, "eval_samples_per_second": 19.885, "eval_steps_per_second": 2.486, "step": 1875 }, { "epoch": 0.43, "grad_norm": 0.8722310662269592, "learning_rate": 1.2e-05, "loss": 2.134, "step": 1880 }, { "epoch": 0.43, "eval_loss": 1.9388750791549683, "eval_runtime": 751.6317, "eval_samples_per_second": 19.808, "eval_steps_per_second": 2.476, "step": 1880 }, { "epoch": 0.43, "eval_loss": 1.9388281106948853, "eval_runtime": 754.3186, "eval_samples_per_second": 19.737, "eval_steps_per_second": 2.467, "step": 1885 }, { "epoch": 0.44, "grad_norm": 0.8985447287559509, "learning_rate": 1.1000000000000001e-05, "loss": 1.9291, "step": 1890 }, { "epoch": 0.44, "eval_loss": 1.938772439956665, "eval_runtime": 769.6876, "eval_samples_per_second": 19.343, "eval_steps_per_second": 2.418, "step": 1890 }, { "epoch": 0.44, "eval_loss": 1.9387296438217163, "eval_runtime": 804.3454, "eval_samples_per_second": 18.509, "eval_steps_per_second": 2.314, "step": 1895 }, { "epoch": 0.44, "grad_norm": 1.002416968345642, "learning_rate": 1e-05, "loss": 1.9172, "step": 1900 }, { "epoch": 0.44, "eval_loss": 1.938694953918457, "eval_runtime": 804.4582, "eval_samples_per_second": 18.507, "eval_steps_per_second": 2.313, "step": 1900 }, { "epoch": 0.44, "eval_loss": 1.9386274814605713, "eval_runtime": 804.296, "eval_samples_per_second": 18.511, "eval_steps_per_second": 2.314, "step": 1905 }, { "epoch": 0.44, "grad_norm": 0.7875113487243652, "learning_rate": 9e-06, "loss": 2.0179, "step": 1910 }, { "epoch": 0.44, "eval_loss": 1.93858003616333, "eval_runtime": 767.3044, "eval_samples_per_second": 19.403, "eval_steps_per_second": 2.425, "step": 1910 }, { "epoch": 0.44, "eval_loss": 1.9385550022125244, "eval_runtime": 753.0534, "eval_samples_per_second": 19.77, "eval_steps_per_second": 2.471, "step": 1915 }, { "epoch": 0.44, "grad_norm": 0.8589398264884949, "learning_rate": 8.000000000000001e-06, "loss": 2.1198, "step": 1920 }, { "epoch": 0.44, "eval_loss": 1.9385381937026978, "eval_runtime": 752.7415, "eval_samples_per_second": 19.778, "eval_steps_per_second": 2.472, "step": 1920 }, { "epoch": 0.44, "eval_loss": 1.9385038614273071, "eval_runtime": 751.344, "eval_samples_per_second": 19.815, "eval_steps_per_second": 2.477, "step": 1925 }, { "epoch": 0.44, "grad_norm": 0.7445068359375, "learning_rate": 7.000000000000001e-06, "loss": 1.7865, "step": 1930 }, { "epoch": 0.44, "eval_loss": 1.938494086265564, "eval_runtime": 754.4106, "eval_samples_per_second": 19.735, "eval_steps_per_second": 2.467, "step": 1930 }, { "epoch": 0.45, "eval_loss": 1.9384897947311401, "eval_runtime": 751.5617, "eval_samples_per_second": 19.809, "eval_steps_per_second": 2.476, "step": 1935 }, { "epoch": 0.45, "grad_norm": 0.9787230491638184, "learning_rate": 6e-06, "loss": 1.8503, "step": 1940 }, { "epoch": 0.45, "eval_loss": 1.9384899139404297, "eval_runtime": 762.4984, "eval_samples_per_second": 19.525, "eval_steps_per_second": 2.441, "step": 1940 }, { "epoch": 0.45, "eval_loss": 1.9384738206863403, "eval_runtime": 759.6919, "eval_samples_per_second": 19.597, "eval_steps_per_second": 2.45, "step": 1945 }, { "epoch": 0.45, "grad_norm": 0.5382779240608215, "learning_rate": 5e-06, "loss": 1.9748, "step": 1950 }, { "epoch": 0.45, "eval_loss": 1.9384503364562988, "eval_runtime": 765.0189, "eval_samples_per_second": 19.461, "eval_steps_per_second": 2.433, "step": 1950 }, { "epoch": 0.45, "eval_loss": 1.9384208917617798, "eval_runtime": 759.892, "eval_samples_per_second": 19.592, "eval_steps_per_second": 2.449, "step": 1955 }, { "epoch": 0.45, "grad_norm": 0.9979357123374939, "learning_rate": 4.000000000000001e-06, "loss": 1.8969, "step": 1960 }, { "epoch": 0.45, "eval_loss": 1.938381314277649, "eval_runtime": 779.852, "eval_samples_per_second": 19.091, "eval_steps_per_second": 2.386, "step": 1960 }, { "epoch": 0.45, "eval_loss": 1.9383498430252075, "eval_runtime": 790.6207, "eval_samples_per_second": 18.831, "eval_steps_per_second": 2.354, "step": 1965 }, { "epoch": 0.45, "grad_norm": 0.6065915822982788, "learning_rate": 3e-06, "loss": 1.9022, "step": 1970 }, { "epoch": 0.45, "eval_loss": 1.9383246898651123, "eval_runtime": 768.8896, "eval_samples_per_second": 19.363, "eval_steps_per_second": 2.42, "step": 1970 }, { "epoch": 0.45, "eval_loss": 1.9383031129837036, "eval_runtime": 777.816, "eval_samples_per_second": 19.141, "eval_steps_per_second": 2.393, "step": 1975 }, { "epoch": 0.46, "grad_norm": 0.9139683246612549, "learning_rate": 2.0000000000000003e-06, "loss": 1.9378, "step": 1980 }, { "epoch": 0.46, "eval_loss": 1.938288927078247, "eval_runtime": 800.6894, "eval_samples_per_second": 18.594, "eval_steps_per_second": 2.324, "step": 1980 }, { "epoch": 0.46, "eval_loss": 1.9382776021957397, "eval_runtime": 796.1175, "eval_samples_per_second": 18.701, "eval_steps_per_second": 2.338, "step": 1985 }, { "epoch": 0.46, "grad_norm": 1.0108898878097534, "learning_rate": 1.0000000000000002e-06, "loss": 2.0676, "step": 1990 }, { "epoch": 0.46, "eval_loss": 1.938266396522522, "eval_runtime": 764.489, "eval_samples_per_second": 19.474, "eval_steps_per_second": 2.434, "step": 1990 }, { "epoch": 0.46, "eval_loss": 1.9382603168487549, "eval_runtime": 774.5865, "eval_samples_per_second": 19.221, "eval_steps_per_second": 2.403, "step": 1995 }, { "epoch": 0.46, "grad_norm": 0.8565602898597717, "learning_rate": 0.0, "loss": 1.9374, "step": 2000 }, { "epoch": 0.46, "eval_loss": 1.9382596015930176, "eval_runtime": 749.4454, "eval_samples_per_second": 19.865, "eval_steps_per_second": 2.483, "step": 2000 } ], "logging_steps": 10, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 1244221462020096.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }